a kind of bad cron script for keeping relays connected to PDSes
requestcrawl.js
#!/usr/bin/env node

// ok look this code was definitely never meant to be published
// but i haven't made it into something better and it's been a
// while so
//
// hi
//
// this has my weird changes and half-baked bits, runs much
// slower than needed, and basically works.
//
// (initially based on futur's [script](https://gist.github.com/futurGH/2ee18d385eff3ba98f5b35b9dcac0aed#file-requestcrawl-ts))


for (const envVar of ["RELAY_ADDRESS", "RELAY_ADMIN_KEY"]) {
  if (!process.env[envVar]) throw new Error(`Missing env var ${envVar}`);
}


const start_at = 0;
// leftover early-exit counter; the check that used it is commented out below
let shrooms_found = 0;
const shrooms_limit = 30;

// ask the relay to (re)crawl a single PDS host via its admin API.
// returns true only when the entry is skipped, so the caller can move on without delay.
async function bluh(url, i) {
  if (i < start_at) {
    console.log(`skipping ${i} (before start)`);
    return true;
  }
  // else if (shrooms_found >= shrooms_limit) {
  //   console.log(`skipping ${i} (reached limit)`);
  //   return;
  // } else {
  //   shrooms_found += 1;
  // }

  try {
    const res = await fetch(`${process.env.RELAY_ADDRESS}/admin/pds/requestCrawl`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Basic ${process.env.RELAY_ADMIN_KEY}`,
      },
      body: JSON.stringify({
        hostname: "https://" + url.hostname,
        per_second: 200,
        per_hour: 150 * 60 * 60,
        per_day: 120 * 60 * 60 * 24,
        crawl_rate: 50,
        repo_limit: 1_000_000,
      }),
    });
    if (res.ok) {
      console.log(`${i} got ${url.hostname}`);
    } else {
      const ej = await res.json();
      // if (e?.message.includes('i/o')) {

      // }
      console.error(
        `${i} Error requesting crawl for ${url.hostname}: ${res.status} ${res.statusText}: ${ej ? JSON.stringify(ej) : "unknown error"}`,
      );
    }
  } catch (err) {
    console.error(`${i} Network error requesting crawl for ${url.hostname}: ${err}`);
  }
}

let i = 0;
async function main() {
  const pdses = (await fetchPdses()).map(url => new URL(url));

  console.log("Requesting crawls...");

  async function get_next() {
    if (pdses.length === 0) {
      console.log("Done crawling!");

      if (process.env["HEALTHCHECK_URL"]) {
        console.log('trying to ping healthcheck...');
        try {
          const res = await fetch(process.env["HEALTHCHECK_URL"]);
          console.log(`Pinged healthcheck endpoint! ok? ${res.ok}`);
        } catch (e) {
          console.error(`Failed to ping healthcheck: ${e}`);
          throw e;
        }
      }

      return;
    }
    // skipped hosts return true and we move on immediately; real requests wait 300ms
    let gofast = await bluh(pdses.shift(), i);
    i += 1;
    setTimeout(get_next, gofast ? 0 : 300);
  }
  get_next();
}

// pull the known-PDS list from mary's atproto-scraping state file
async function fetchPdses() {
  const data = await fetch(
    "https://raw.githubusercontent.com/mary-ext/atproto-scraping/refs/heads/trunk/state.json",
  ).then((res) => res.ok ? res.json() : null);

  if (!data?.pdses) throw new Error("Failed to fetch PDSes");

  const pdses = Object
    .keys(data.pdses)
    .filter((pds) => pds.startsWith("https://"));
    // .filter((pds) => pds.includes('bsky.network'));
  return pdses;
}

main();
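To run it, the two env vars checked at the top have to be set; RELAY_ADMIN_KEY goes straight into the Authorization header, so it should already be whatever base64 Basic credential your relay expects. HEALTHCHECK_URL is optional and only gets pinged when the list is exhausted. Something along these lines (the hostnames and key here are made up) works ad hoc or from a cron entry:

RELAY_ADDRESS=https://relay.example.com \
RELAY_ADMIN_KEY="$(printf 'admin:hunter2' | base64)" \
HEALTHCHECK_URL=https://hc-ping.example.com/your-uuid \
node requestcrawl.js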