Skip to content

Commit 742758c

Browse files
authored
Merge pull request #62 from arethetypeswrong/history
Generate archive of historical analysis data
2 parents a8dece2 + b64a42a commit 742758c

12 files changed

+4329
-5885
lines changed

package-lock.json

Lines changed: 3737 additions & 5885 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/core/src/problems.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,8 @@ export const problemKindInfo: Record<ProblemKind, ProblemKindInfo> = {
109109
},
110110
};
111111

112+
export const allProblemKinds = Object.keys(problemKindInfo) as ProblemKind[];
113+
112114
export interface ProblemFilter {
113115
kind?: ProblemKind;
114116
entrypoint?: string;

packages/history/.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
tmp
2+
data
3+
.env
4+
*.d.ts
5+
scripts/lib

packages/history/README.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# @arethetypeswrong/history
2+
3+
This package provides `@arethetypeswrong/core` analysis for every [npm-high-impact](https://github.com/wooorm/npm-high-impact) package at the latest version available on the first of every month since January 2022.
4+
5+
The analysis is saved as a 1.2 GB newline-delimited JSON file, cached in Azure Storage for incremental updates going forward, compressed down to 34 MB for shipping to npm, and accessible in Node as a JavaScript object via a small programmatic interface.
6+
7+
## Usage
8+
9+
```ts
10+
import { getAllDataAsObject, getVersionsByDate } from "@arethetypeswrong/history";
11+
12+
const dates = await getVersionsByDate();
13+
const data = await getAllDataAsObject();
14+
15+
function getPackagesWithFalseESMProblems(date) {
16+
const packages = dates[date];
17+
const result = [];
18+
for (const { packageName, packageVersion } of packages) {
19+
const analysis = data[`${packageName}@${packageVersion}`];
20+
// `analysis` is undefined if the package doesn't contain types
21+
if (analysis?.problems.some((p) => p.kind === "FalseESM")) {
22+
result.push(analysis);
23+
}
24+
}
25+
return result;
26+
}
27+
28+
const mayFalseESMProblems = getPackagesWithFalseESMProblems("2023-05-01").length;
29+
const juneFalseESMProblems = getPackagesWithFalseESMProblems("2023-06-01").length;
30+
console.log({ mayFalseESMProblems, juneFalseESMProblems });
31+
```

packages/history/main.js

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import { createReadStream, createWriteStream } from "node:fs";
2+
import { open, readFile, stat } from "node:fs/promises";
3+
import { createGunzip } from "node:zlib";
4+
5+
/**
 * Whether `full.json` is known to exist on disk. Stays `undefined` until the
 * file system has been checked for the first time.
 * @type {boolean | undefined}
 */
let decompressed;
/**
 * The check-and-decompress operation currently in progress, if any. Concurrent
 * callers share it so that only one write to `full.json` is ever started.
 * @type {Promise<void> | undefined}
 */
let pendingUnzip;
const compressedFullJsonFileName = new URL("./data/full.json.gz", import.meta.url);
const fullJsonFileName = new URL("./data/full.json", import.meta.url);
const datesJsonFileName = new URL("./data/dates.json", import.meta.url);

/**
 * Streams `full.json.gz` through gunzip into `full.json`.
 *
 * `.pipe()` does not forward errors between streams, so each stage gets its
 * own `error` listener; otherwise a missing or corrupt archive would surface
 * as an unhandled `error` event instead of a rejection.
 * @returns {Promise<void>} Resolves once the write stream has finished.
 */
function decompressFullJson() {
  return /** @type {Promise<void>} */ (new Promise((resolve, reject) => {
    const source = createReadStream(compressedFullJsonFileName);
    const gunzip = createGunzip();
    const sink = createWriteStream(fullJsonFileName);
    /** @param {Error} error */
    const fail = (error) => {
      // Tear down every stage so no file descriptor is left open.
      source.destroy();
      gunzip.destroy();
      sink.destroy();
      reject(error);
    };
    source.on("error", fail);
    gunzip.on("error", fail);
    sink.on("error", fail);
    sink.on("finish", () => resolve());
    source.pipe(gunzip).pipe(sink);
  }));
}

/**
 * Ensures the decompressed `full.json` exists next to this module, inflating
 * `full.json.gz` on first use. Subsequent calls are no-ops, and calls made
 * while a decompression is running wait for that same operation.
 * @returns {Promise<void>}
 */
async function unzip() {
  if (decompressed) {
    return;
  }
  pendingUnzip ??= (async () => {
    if (decompressed === undefined) {
      decompressed = await stat(fullJsonFileName).then(() => true, () => false);
    }
    if (decompressed) {
      return;
    }
    try {
      await decompressFullJson();
    } catch (error) {
      // Best effort: remove the partial output so a later run does not
      // mistake a truncated file for a complete one.
      const { rm } = await import("node:fs/promises");
      await rm(fullJsonFileName, { force: true }).catch(() => {});
      throw error;
    }
    decompressed = true;
  })().finally(() => {
    pendingUnzip = undefined;
  });
  return pendingUnzip;
}
30+
31+
/**
 * Loads every sampled package's Analysis into one in-memory object, keyed by
 * package spec (`package@version`). Lines whose analysis has a falsy `types`
 * value are left out of the result.
 * @returns {Promise<Record<string, import("@arethetypeswrong/core").Analysis>>}
 */
export async function getAllDataAsObject() {
  await unzip();
  /** @type {Record<string, import("@arethetypeswrong/core").Analysis>} */
  const analysesBySpec = {};
  const handle = await open(fullJsonFileName, "r");
  // The file is newline-delimited JSON: one record per line.
  for await (const jsonLine of handle.readLines()) {
    /** @type {import("./scripts/types.js").FullJsonLine} */
    const entry = JSON.parse(jsonLine);
    if (!entry.analysis.types) {
      continue;
    }
    analysesBySpec[entry.packageSpec] = entry.analysis;
  }
  return analysesBySpec;
}
50+
51+
/**
 * Reads `dates.json` and returns its `dates` map: for each sampled date
 * (formatted YYYY-MM-DD), the list of top packages processed for that date.
 * @returns {Promise<Record<string, { packageName: string, packageVersion: string, tarballUrl: string }[]>>}
 */
export async function getVersionsByDate() {
  const rawJson = await readFile(datesJsonFileName, "utf8");
  /** @type {import("./scripts/types.js").DatesJson} */
  const parsed = JSON.parse(rawJson);
  return parsed.dates;
}

packages/history/package.json

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"name": "@arethetypeswrong/history",
3+
"version": "0.0.0",
4+
"type": "module",
5+
"files": [
6+
"*.js",
7+
"*.d.ts",
8+
"./data/full.json.gz",
9+
"./data/dates.json"
10+
],
11+
"main": "./main.js",
12+
"types": "./main.d.ts",
13+
"scripts": {
14+
"build": "tsc -b",
15+
"build:scripts": "tsc -b scripts",
16+
"pregenerate": "npm run build:scripts",
17+
"generate": "tsx scripts/generateFull.ts",
18+
"reduce": "tsx scripts/reduceFull.ts"
19+
},
20+
"devDependencies": {
21+
"@arethetypeswrong/core": "file:../core",
22+
"@azure/storage-blob": "^12.14.0",
23+
"@types/cli-progress": "^3.11.0",
24+
"@types/node": "^20.3.1",
25+
"@types/pacote": "^11.1.5",
26+
"@types/semver": "^7.5.0",
27+
"cli-progress": "^3.12.0",
28+
"dotenv": "^16.3.1",
29+
"npm-high-impact": "^1.3.0",
30+
"pacote": "^15.2.0",
31+
"semver": "^7.5.3",
32+
"tsx": "^3.12.7"
33+
}
34+
}
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
import { checkPackage, createPackageFromTarballUrl } from "@arethetypeswrong/core";
2+
import { appendFileSync } from "fs";
3+
import { Worker, isMainThread, parentPort, workerData } from "node:worker_threads";
4+
import type { Blob, FullJsonLine } from "./types.ts";
5+
import { versions } from "@arethetypeswrong/core/versions";
6+
import { npmHighImpact } from "npm-high-impact";
7+
8+
/** Base pause, in milliseconds, used between units of work and as the retry back-off unit. */
const delay = 10;

/**
 * Returns a promise that settles after roughly `ms` milliseconds.
 * @param ms How long to wait, in milliseconds.
 */
function sleep(ms: number) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
13+
14+
/**
 * Sends a result blob from a worker thread to the thread that spawned it.
 * @param blob The message to post to the parent port.
 * @throws {Error} When invoked on the main thread.
 */
function postBlob(blob: Blob) {
  if (!isMainThread) {
    parentPort!.postMessage(blob);
    return;
  }

  throw new Error("This function must be called from a worker thread.");
}
21+
22+
// Worker-thread entry point: analyze each package the parent sends and reply
// with exactly one blob per request, either the analysis or an error report.
if (!isMainThread && parentPort) {
  // Number of retries after the first failed attempt (five attempts in total).
  const maxRetries = 4;

  parentPort.on("message", async ({ packageName, packageVersion, tarballUrl, prevMessage }) => {
    for (let attempt = 0; ; attempt++) {
      try {
        const analysis = await checkPackage(await createPackageFromTarballUrl(tarballUrl));
        postBlob({
          kind: "analysis",
          workerId: workerData.workerId,
          data: analysis,
        });
        return;
      } catch (error) {
        // Give up before backing off: sleeping ahead of a report that will be
        // sent regardless only delays the parent.
        if (attempt >= maxRetries) {
          postBlob({
            kind: "error",
            workerId: workerData.workerId,
            packageName,
            packageVersion,
            tarballUrl,
            // Thrown values are not guaranteed to be Error instances.
            message: error instanceof Error ? error.message : String(error),
            // Echoed back so the parent can detect the same failure recurring.
            prevMessage,
          });
          return;
        }
        // Linear back-off: 0 ms, then delay * 100 ms more after each further failure.
        await sleep(delay * 100 * attempt);
      }
    }
  });
}
52+
53+
/**
 * Analyzes `packages` on a pool of worker threads (each running this same
 * module) and appends one newline-delimited JSON record per successful
 * analysis to `outFile`.
 *
 * A package whose worker reports an error is re-queued once per distinct error
 * message; when the same message comes back a second time it is skipped.
 *
 * @param packages Packages to analyze.
 * @param outFile File that receives one JSON line per analyzed package.
 * @param workerCount Number of worker threads to start; must be at least 1.
 * @returns Resolves to `true` if at least one record was written, `false`
 *   otherwise (including when `packages` is empty). Rejects if a worker emits
 *   an `error` event, if writing a record throws, if `workerCount` is below 1,
 *   or with `Error("SIGINT")` when the process receives SIGINT.
 * @throws {Error} Synchronously, when called from a worker thread.
 */
export default function checkPackages(
  packages: { packageName: string; packageVersion: string; tarballUrl: string }[],
  outFile: URL,
  workerCount: number
): Promise<boolean> {
  if (!packages.length) {
    return Promise.resolve(false);
  }

  if (!isMainThread) {
    throw new Error("This function must be called from the main thread.");
  }

  // With no workers nothing would ever settle the returned promise.
  if (!(workerCount >= 1)) {
    return Promise.reject(new RangeError(`workerCount must be at least 1, got ${workerCount}.`));
  }

  type WorkItem = { packageName: string; packageVersion: string; tarballUrl: string; prevMessage?: string };

  const workers = Array.from({ length: workerCount }, (_, i) => {
    return new Worker(new URL(import.meta.url), { workerData: { workerId: i } });
  });

  return new Promise<boolean>((resolve, reject) => {
    let wroteChanges = false;
    let finishedWorkers = 0;
    let settled = false;
    const workQueue: WorkItem[] = [...packages];
    // The item each worker is currently analyzing. Workers handle one request
    // at a time, so a reply always belongs to the item recorded here.
    const inFlight = new Map<Worker, WorkItem>();

    const onSigint = () => fail(new Error("SIGINT"));

    /** Resolves the result once and restores default SIGINT handling. */
    function succeed() {
      if (settled) return;
      settled = true;
      process.off("SIGINT", onSigint);
      resolve(wroteChanges);
    }

    /** Terminates every worker, then rejects the result once with `error`. */
    function fail(error: unknown) {
      if (settled) return;
      settled = true;
      process.off("SIGINT", onSigint);
      const rejectWithError = () => reject(error);
      // Report the original error even if terminating a worker fails.
      Promise.all(workers.map((worker) => worker.terminate())).then(rejectWithError, rejectWithError);
    }

    /** Hands `worker` the next queued item, or retires it when the queue is empty. */
    async function dispatchNext(worker: Worker) {
      await sleep(delay);
      if (settled) return;
      const next = workQueue.shift();
      if (next) {
        inFlight.set(worker, next);
        worker.postMessage(next);
        return;
      }
      await worker.terminate();
      console.log(`[${workers.indexOf(worker)}] done`);
      finishedWorkers++;
      if (finishedWorkers === workers.length) {
        succeed();
      }
    }

    /** Logs or persists a worker's reply and re-queues failed items when appropriate. */
    function recordBlob(worker: Worker, blob: Blob) {
      const workerIndex = workers.indexOf(worker);
      const requested = inFlight.get(worker);
      inFlight.delete(worker);

      if (blob.kind === "error") {
        console.error(`[${workerIndex}] ${blob.packageName}@${blob.packageVersion}: ${blob.message}`);
        if (blob.prevMessage === blob.message) {
          console.error(`Package ${blob.packageName}@${blob.packageVersion} failed repeatedly; skipping.`);
        } else {
          workQueue.push({
            packageName: blob.packageName,
            packageVersion: blob.packageVersion,
            tarballUrl: blob.tarballUrl,
            prevMessage: blob.message,
          });
        }
        return;
      }

      if (!requested) {
        throw new Error(`Worker ${workerIndex} reported a result without an assigned package.`);
      }
      // Sometimes the package version in the npm manifest is different from the package.json,
      // so we need to use the version we were asked for so we don't repeat this work.
      // Taking it from the in-flight item (rather than searching `packages` by name)
      // stays correct when the same package name is queued at several versions.
      const packageSpec = `${blob.data.packageName}@${requested.packageVersion}`;
      appendFileSync(
        outFile,
        JSON.stringify(
          {
            analysis: blob.data,
            coreVersion: versions.core,
            packageSpec,
            rank: npmHighImpact.indexOf(blob.data.packageName),
          } satisfies FullJsonLine,
          // Every property named "trace" is written out as an empty array.
          (key, value) => (key === "trace" ? [] : value)
        ) + "\n"
      );
      console.log(`[${workerIndex}] ${packages.length - workQueue.length}/${packages.length} ${packageSpec}`);
      wroteChanges = true;
    }

    for (const worker of workers) {
      worker.on("message", (blob: Blob) => {
        try {
          recordBlob(worker, blob);
        } catch (error) {
          // e.g. appendFileSync failing: surface it instead of leaving the promise pending.
          fail(error);
          return;
        }
        dispatchNext(worker).catch(fail);
      });
      worker.once("error", fail);
    }

    // A SIGINT listener suppresses Node's default exit, so it is installed only
    // while work is outstanding and removed again in succeed()/fail().
    process.on("SIGINT", onSigint);

    // Start workers one after another, `delay` ms apart.
    (async () => {
      for (const worker of workers) {
        await dispatchNext(worker);
      }
    })().catch(fail);
  });
}

0 commit comments

Comments
 (0)