Skip to content

Generate archive of historical analysis data #62

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Jul 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9,622 changes: 3,737 additions & 5,885 deletions package-lock.json

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions packages/core/src/problems.ts
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ export const problemKindInfo: Record<ProblemKind, ProblemKindInfo> = {
},
};

// Every known ProblemKind, derived from the info map so this list can never
// drift out of sync with it. Object.keys() loses literal key types, hence the cast.
export const allProblemKinds = Object.keys(problemKindInfo) as ProblemKind[];

export interface ProblemFilter {
kind?: ProblemKind;
entrypoint?: string;
Expand Down
5 changes: 5 additions & 0 deletions packages/history/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
tmp
data
.env
*.d.ts
scripts/lib
31 changes: 31 additions & 0 deletions packages/history/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# @arethetypeswrong/history

This package provides `@arethetypeswrong/core` analysis for every [npm-high-impact](https://github.com/wooorm/npm-high-impact) package at the latest version available on the first of every month since January 2022.

The analysis is saved as a 1.2 GB newline-delimited JSON file, cached in Azure Storage for incremental updates going forward, compressed down to 34 MB for shipping to npm, and accessible in Node as a JavaScript object via a small programmatic interface.

## Usage

```ts
import { getAllDataAsObject, getVersionsByDate } from "@arethetypeswrong/history";

const dates = await getVersionsByDate();
const data = await getAllDataAsObject();

function getPackagesWithFalseESMProblems(date) {
  const packages = dates[date];
  const result = [];
  for (const { packageName, packageVersion } of packages) {
    const analysis = data[`${packageName}@${packageVersion}`];
    // `analysis` is undefined if the package doesn't contain types
    if (analysis?.problems.some((p) => p.kind === "FalseESM")) {
      result.push(analysis);
    }
  }
  return result;
}

const mayFalseESMProblems = getPackagesWithFalseESMProblems("2023-05-01").length;
const juneFalseESMProblems = getPackagesWithFalseESMProblems("2023-06-01").length;
console.log({ mayFalseESMProblems, juneFalseESMProblems });
```
60 changes: 60 additions & 0 deletions packages/history/main.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import { createReadStream, createWriteStream } from "node:fs";
import { open, readFile, stat } from "node:fs/promises";
import { pipeline } from "node:stream/promises";
import { createGunzip } from "node:zlib";

/**
 * Memoized flag: has ./data/full.json.gz been decompressed to ./data/full.json?
 * `undefined` means "not yet checked"; unzip() resolves it by stat-ing the file.
 * @type {boolean | undefined}
 */
let decompressed;
// Data files ship alongside this module, so resolve them relative to
// import.meta.url rather than the process working directory.
const compressedFullJsonFileName = new URL("./data/full.json.gz", import.meta.url);
const fullJsonFileName = new URL("./data/full.json", import.meta.url);
const datesJsonFileName = new URL("./data/dates.json", import.meta.url);

/**
 * Ensures ./data/full.json exists by decompressing full.json.gz on first use.
 * The result is memoized in the module-level `decompressed` flag.
 *
 * Fix: the previous implementation attached its "error" handler only to the
 * final write stream. `.pipe()` does not forward errors, so a failure in the
 * read stream or in gunzip would leave the returned promise pending forever.
 * `stream.pipeline` propagates errors from every stream in the chain.
 *
 * NOTE(review): a partially written full.json from a crashed run would make
 * the stat() check succeed in a later process — pre-existing behavior, verify
 * whether a size/steady-state check is warranted.
 */
async function unzip() {
  if (decompressed === undefined) {
    decompressed = await stat(fullJsonFileName).then(() => true).catch(() => false);
  }
  if (!decompressed) {
    await pipeline(
      createReadStream(compressedFullJsonFileName),
      createGunzip(),
      createWriteStream(fullJsonFileName)
    );
    decompressed = true;
  }
}

/**
 * Gets a single object containing all Analysis results for sampled packages.
 * Keys are package specs in the format `package@version`.
 * Packages whose analysis found no types are omitted from the result.
 * @returns {Promise<Record<string, import("@arethetypeswrong/core").Analysis>>}
 */
export async function getAllDataAsObject() {
  await unzip();
  const fh = await open(fullJsonFileName, "r");
  try {
    /** @type {Record<string, import("@arethetypeswrong/core").Analysis>} */
    const result = {};
    for await (const line of fh.readLines()) {
      /** @type {import("./scripts/types.js").FullJsonLine} */
      const { packageSpec, analysis } = JSON.parse(line);
      // Untyped packages carry no useful analysis; skip them to keep the
      // object (and its ~GB-scale memory footprint) smaller.
      if (analysis.types) {
        result[packageSpec] = analysis;
      }
    }
    return result;
  } finally {
    // Fix: readLines() closes the descriptor when the stream completes, but a
    // JSON.parse throw mid-iteration would otherwise leak the handle.
    // FileHandle#close is idempotent, so closing again here is safe.
    await fh.close();
  }
}

/**
 * Gets the list of top packages processed for each sampled date.
 * Keys are dates in the format YYYY-MM-DD.
 * @returns {Promise<Record<string, { packageName: string, packageVersion: string, tarballUrl: string }[]>>}
 */
export async function getVersionsByDate() {
  const raw = await readFile(datesJsonFileName, "utf8");
  /** @type {import("./scripts/types.js").DatesJson} */
  const parsed = JSON.parse(raw);
  return parsed.dates;
}
34 changes: 34 additions & 0 deletions packages/history/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"name": "@arethetypeswrong/history",
"version": "0.0.0",
"type": "module",
"files": [
"*.js",
"*.d.ts",
"./data/full.json.gz",
"./data/dates.json"
],
"main": "./main.js",
"types": "./main.d.ts",
"scripts": {
"build": "tsc -b",
"build:scripts": "tsc -b scripts",
"pregenerate": "npm run build:scripts",
"generate": "tsx scripts/generateFull.ts",
"reduce": "tsx scripts/reduceFull.ts"
},
"devDependencies": {
"@arethetypeswrong/core": "file:../core",
"@azure/storage-blob": "^12.14.0",
"@types/cli-progress": "^3.11.0",
"@types/node": "^20.3.1",
"@types/pacote": "^11.1.5",
"@types/semver": "^7.5.0",
"cli-progress": "^3.12.0",
"dotenv": "^16.3.1",
"npm-high-impact": "^1.3.0",
"pacote": "^15.2.0",
"semver": "^7.5.3",
"tsx": "^3.12.7"
}
}
159 changes: 159 additions & 0 deletions packages/history/scripts/checkPackages.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import { checkPackage, createPackageFromTarballUrl } from "@arethetypeswrong/core";
import { appendFileSync } from "fs";
import { Worker, isMainThread, parentPort, workerData } from "node:worker_threads";
import type { Blob, FullJsonLine } from "./types.ts";
import { versions } from "@arethetypeswrong/core/versions";
import { npmHighImpact } from "npm-high-impact";

// Milliseconds between work-queue polls; also scales the retry backoff
// (delay * 100 * tries) in the worker message handler.
const delay = 10;

/** Resolves after roughly `ms` milliseconds. */
function sleep(ms: number) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

/**
 * Posts a result {@link Blob} from a worker thread back to the main thread.
 * Worker-only: the main thread has no parent port to post on.
 */
function postBlob(blob: Blob) {
  if (!isMainThread) {
    parentPort!.postMessage(blob);
    return;
  }
  throw new Error("This function must be called from a worker thread.");
}

// Worker-thread side: this same file is loaded as the worker script. Each
// message from the main thread is one package to analyze; the worker replies
// with an "analysis" blob, or an "error" blob after repeated failures.
if (!isMainThread && parentPort) {
  parentPort.on("message", async ({ packageName, packageVersion, tarballUrl, prevMessage }) => {
    let tries = 0;
    while (true) {
      try {
        const analysis = await checkPackage(await createPackageFromTarballUrl(tarballUrl));
        postBlob({
          kind: "analysis",
          workerId: workerData.workerId,
          data: analysis,
        });
        return;
      } catch (error) {
        // Linear backoff: 0ms before the first retry, then 1s, 2s, 3s, 4s
        // (delay * 100 * tries). Gives up after the 5th failed attempt.
        await sleep(delay * 100 * tries);
        if (tries++ > 3) {
          postBlob({
            kind: "error",
            workerId: workerData.workerId,
            packageName,
            packageVersion,
            tarballUrl,
            // Coerced to string; the main thread compares this against
            // `prevMessage` to detect a repeated identical failure.
            message: "" + (error as Error)?.message,
            prevMessage,
          });
          return;
        }
      }
    }
  });
}

/**
 * Analyzes `packages` in parallel across `workerCount` worker threads (this
 * same module is re-entered as the worker script), appending one
 * newline-delimited JSON line per successful analysis to `outFile`.
 * A package that fails is requeued once; failing twice with the identical
 * error message drops it permanently.
 * @returns whether at least one new line was appended to `outFile`.
 */
export default function checkPackages(
  packages: { packageName: string; packageVersion: string; tarballUrl: string }[],
  outFile: URL,
  workerCount: number
): Promise<boolean> {
  if (!packages.length) {
    return Promise.resolve(false);
  }

  if (!isMainThread) {
    throw new Error("This function must be called from the main thread.");
  }

  // Each worker runs this same file, distinguished by its workerData.workerId.
  const workers = Array.from({ length: workerCount }, (_, i) => {
    return new Worker(new URL(import.meta.url), { workerData: { workerId: i } });
  });

  // NOTE(review): an async promise executor swallows synchronous throws and
  // unawaited rejections inside it — consider refactoring to a plain executor.
  return new Promise<boolean>(async (resolve, reject) => {
    let wroteChanges = false;
    const packagesDonePerWorker = new Array(workerCount).fill(0);
    const workQueue: { packageName: string; packageVersion: string; tarballUrl: string; prevMessage?: string }[] = [
      ...packages,
    ];
    let finishedWorkers = 0;
    for (const worker of workers) {
      worker.on("message", async (blob: Blob) => {
        const workerIndex = workers.indexOf(worker);
        packagesDonePerWorker[workerIndex]++;
        if (blob.kind === "error") {
          console.error(`[${workerIndex}] ${blob.packageName}@${blob.packageVersion}: ${blob.message}`);
          if (blob.prevMessage === blob.message) {
            // Same failure message twice in a row: treat as permanent.
            console.error(`Package ${blob.packageName}@${blob.packageVersion} failed repeatedly; skipping.`);
          } else {
            // Requeue with the message attached so a repeat can be detected.
            workQueue.push({
              packageName: blob.packageName,
              packageVersion: blob.packageVersion,
              tarballUrl: blob.tarballUrl,
              prevMessage: blob.message,
            });
          }
        } else {
          // Sometimes the package version in the npm manifest is different from the package.json,
          // so we need to use the version we were asked for so we don't repeat this work.
          // NOTE(review): lookup is by name only — if one batch ever contains
          // the same package at multiple versions, this picks the first match;
          // verify batches never contain duplicate names.
          const originalPackage = packages.find((p) => p.packageName === blob.data.packageName)!;
          const packageSpec = `${blob.data.packageName}@${originalPackage.packageVersion}`;
          appendFileSync(
            outFile,
            JSON.stringify(
              {
                analysis: blob.data,
                coreVersion: versions.core,
                packageSpec,
                rank: npmHighImpact.indexOf(blob.data.packageName),
              } satisfies FullJsonLine,
              // Replacer: strip bulky resolution traces from the archived JSON.
              (key, value) => {
                if (key === "trace") {
                  return [];
                }
                return value;
              }
            ) + "\n"
          );
          // Progress is approximated as (total - still queued) / total.
          console.log(`[${workerIndex}] ${packages.length - workQueue.length}/${packages.length} ${packageSpec}`);
          wroteChanges = true;
        }

        await sleep(delay);
        if (workQueue.length > 0) {
          const next = workQueue.shift()!;
          worker.postMessage(next);
        } else {
          // No work left: retire this worker; resolve once all have retired.
          await worker.terminate();
          console.log(`[${workerIndex}] done`);
          finishedWorkers++;

          if (finishedWorkers === workers.length) {
            resolve(wroteChanges);
          }
        }
      });

      // Any worker crash aborts the whole batch.
      worker.once("error", async (error) => {
        await Promise.all(workers.map((worker) => worker.terminate()));
        reject(error);
      });

      // Prime each worker with its first package, staggered by `delay` ms.
      await sleep(delay);
      const nextPackage = workQueue.shift();
      if (nextPackage) {
        worker.postMessage({ ...nextPackage, index: packages.indexOf(nextPackage) });
      } else {
        await worker.terminate();
        console.log(`[${workers.indexOf(worker)}] done`);
        finishedWorkers++;

        if (finishedWorkers === workers.length) {
          resolve(wroteChanges);
        }
      }
    }

    // Best-effort cleanup on Ctrl+C, then fail the returned promise.
    process.on("SIGINT", async () => {
      await Promise.all(workers.map((worker) => worker.terminate()));
      reject(new Error("SIGINT"));
    });
  });
}
Loading