requestcrawl.js
#!/usr/bin/env node

// ok look this code was definitely never meant to be published
// but i haven't made it into something better and it's been a
// while so
//
// hi
//
// this has my weird changes and half-baked bits, runs much
// slower than needed, and basically works.
//
// (initially based on futur's [script](https://gist.github.com/futurGH/2ee18d385eff3ba98f5b35b9dcac0aed#file-requestcrawl-ts))


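// fail fast if the relay connection details aren't configured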
for (const envVar of ["RELAY_ADDRESS", "RELAY_ADMIN_KEY"]) {
  if (!process.env[envVar]) throw new Error(`Missing env var ${envVar}`);
}


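// resume offset, plus a (currently disabled) cap on how many hosts to hit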
const start_at = 0;
let shrooms_found = 0;
const shrooms_limit = 30;

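// ask the relay to crawl a single PDS. returns true when the entry was
// skipped, so the caller knows it can move straight on without a delay.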
async function bluh(url, i) {
  if (i < start_at) {
    console.log(`skipping ${i} (before start)`);
    return true;
  }
  // else if (shrooms_found >= shrooms_limit) {
  //   console.log(`skipping ${i} (reached limit)`);
  //   return;
  // } else {
  //   shrooms_found += 1;
  // }

  try {
    const res = await fetch(`${process.env.RELAY_ADDRESS}/admin/pds/requestCrawl`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Basic ${process.env.RELAY_ADMIN_KEY}`,
      },
      body: JSON.stringify({
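        // note: url.hostname drops any port, so a PDS on a nonstandard
        // port would need url.host here instead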
        hostname: "https://" + url.hostname,
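        // generous rate-limit overrides (the hour/day values are an
        // events/sec rate multiplied out over the window); whether the
        // relay honors these extra fields on requestCrawl depends on its
        // implementation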
        per_second: 200,
        per_hour: 150 * 60 * 60,
        per_day: 120 * 60 * 60 * 24,
        crawl_rate: 50,
        repo_limit: 1_000_000,
      }),
    });
    if (res.ok) {
      console.log(`${i} got ${url.hostname}`);
    } else {
      // the error body may not be JSON (e.g. a proxy error page), so don't
      // let a parse failure mask the real status
      const ej = await res.json().catch(() => null);
      console.error(
        `${i} Error requesting crawl for ${url.hostname}: ${res.status} ${res.statusText} — ${ej ? JSON.stringify(ej) : "unknown error"}`,
      );
    }
  } catch (err) {
    console.error(`${i} Network error requesting crawl for ${url.hostname}: ${err}`);
  }
}

let i = 0;
async function main() {
  const pdses = (await fetchPdses()).map((url) => new URL(url));

  console.log("Requesting crawls...");

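  // walk the list one host at a time: the recursive setTimeout keeps the
  // requests sequential, with a 300ms gap unless the previous entry was a
  // skip (in which case bluh returned true and we go straight to the next)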
  async function get_next() {
    if (pdses.length === 0) {
      console.log("Done crawling!");

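      // optionally ping a healthcheck endpoint so an external monitor
      // knows the run finished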
      if (process.env["HEALTHCHECK_URL"]) {
        console.log("trying to ping healthcheck...");
        try {
          const res = await fetch(process.env["HEALTHCHECK_URL"]);
          console.log(`Pinged healthcheck endpoint! ok? ${res.ok}`);
        } catch (e) {
          console.error(`Failed to ping healthcheck: ${e}`);
          throw e;
        }
      }

      return;
    }
    const gofast = await bluh(pdses.shift(), i);
    i += 1;
    setTimeout(get_next, gofast ? 0 : 300);
  }
  get_next();
}

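// pull the known-PDS list from mary-ext's atproto-scraping state file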
async function fetchPdses() {
  const data = await fetch(
    "https://raw.githubusercontent.com/mary-ext/atproto-scraping/refs/heads/trunk/state.json",
  ).then((res) => (res.ok ? res.json() : null));

  if (!data?.pdses) throw new Error("Failed to fetch PDSes");

  const pdses = Object.keys(data.pdses)
    .filter((pds) => pds.startsWith("https://"));
  // .filter((pds) => pds.includes('bsky.network'));
  return pdses;
}

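// main() isn't awaited; on modern Node an unhandled rejection exits the
// process nonzero, so failures still surface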
main();