import { AtpAgent } from '@atproto/api';
import type { Record as WispFsRecord, Directory, Entry, File } from '../lexicon/types/place/wisp/fs';
import { existsSync, mkdirSync, readFileSync, rmSync } from 'fs';
import { writeFile, readFile, rename } from 'fs/promises';
import { safeFetchJson, safeFetchBlob } from './safe-fetch';
import { CID } from 'multiformats';

const CACHE_DIR = './cache/sites';
const CACHE_TTL = 14 * 24 * 60 * 60 * 1000; // 14 days cache TTL

interface CacheMetadata {
  recordCid: string;
  cachedAt: number;
  did: string;
  rkey: string;
}

interface IpldLink {
  $link: string;
}

interface TypedBlobRef {
  ref: CID | IpldLink;
}

interface UntypedBlobRef {
  cid: string;
}

function isIpldLink(obj: unknown): obj is IpldLink {
  return typeof obj === 'object' && obj !== null && '$link' in obj && typeof (obj as IpldLink).$link === 'string';
}

function isTypedBlobRef(obj: unknown): obj is TypedBlobRef {
  return typeof obj === 'object' && obj !== null && 'ref' in obj;
}

function isUntypedBlobRef(obj: unknown): obj is UntypedBlobRef {
  return typeof obj === 'object' && obj !== null && 'cid' in obj && typeof (obj as UntypedBlobRef).cid === 'string';
}

export async function resolveDid(identifier: string): Promise<string | null> {
  try {
    // If it's already a DID, return it
    if (identifier.startsWith('did:')) {
      return identifier;
    }

    // Otherwise, resolve the handle using the agent's built-in method
    const agent = new AtpAgent({ service: 'https://public.api.bsky.app' });
    const response = await agent.resolveHandle({ handle: identifier });
    return response.data.did;
  } catch (err) {
    console.error('Failed to resolve identifier', identifier, err);
    return null;
  }
}

export async function getPdsForDid(did: string): Promise<string | null> {
  try {
    let doc;
    if (did.startsWith('did:plc:')) {
      doc = await safeFetchJson(`https://plc.directory/${encodeURIComponent(did)}`);
    } else if (did.startsWith('did:web:')) {
      const didUrl = didWebToHttps(did);
      doc = await safeFetchJson(didUrl);
    } else {
      console.error('Unsupported DID method', did);
      return null;
    }

    const services = doc.service || [];
    const pdsService = services.find((s: any) => s.id === '#atproto_pds');
    return pdsService?.serviceEndpoint || null;
  } catch (err) {
    console.error('Failed to get PDS for DID', did, err);
    return null;
  }
}

function didWebToHttps(did: string): string {
  const didParts = did.split(':');
  if (didParts.length < 3 || didParts[0] !== 'did' || didParts[1] !== 'web') {
    throw new Error('Invalid did:web format');
  }

  const domain = didParts[2];
  const pathParts = didParts.slice(3);

  if (pathParts.length === 0) {
    return `https://${domain}/.well-known/did.json`;
  } else {
    const path = pathParts.join('/');
    return `https://${domain}/${path}/did.json`;
  }
}

export async function fetchSiteRecord(did: string, rkey: string): Promise<{ record: WispFsRecord; cid: string } | null> {
  try {
    const pdsEndpoint = await getPdsForDid(did);
    if (!pdsEndpoint) return null;

    const url = `${pdsEndpoint}/xrpc/com.atproto.repo.getRecord?repo=${encodeURIComponent(did)}&collection=place.wisp.fs&rkey=${encodeURIComponent(rkey)}`;
    const data = await safeFetchJson(url);
    return { record: data.value as WispFsRecord, cid: data.cid || '' };
  } catch (err) {
    console.error('Failed to fetch site record', did, rkey, err);
    return null;
  }
}

export function extractBlobCid(blobRef: unknown): string | null {
  if (isIpldLink(blobRef)) {
    return blobRef.$link;
  }

  if (isTypedBlobRef(blobRef)) {
    const ref = blobRef.ref;
    const cid = CID.asCID(ref);
    if (cid) {
      return cid.toString();
    }
    if (isIpldLink(ref)) {
      return ref.$link;
    }
  }

  if (isUntypedBlobRef(blobRef)) {
    return blobRef.cid;
  }

  return null;
}

export async function downloadAndCacheSite(did: string, rkey: string, record: WispFsRecord, pdsEndpoint: string, recordCid: string): Promise<void> {
  console.log('Caching site', did, rkey);

  if (!record.root) {
    console.error('Record missing root directory:', JSON.stringify(record, null, 2));
    throw new Error('Invalid record structure: missing root directory');
  }

  if (!record.root.entries || !Array.isArray(record.root.entries)) {
    console.error('Record root missing entries array:', JSON.stringify(record.root, null, 2));
    throw new Error('Invalid record structure: root missing entries array');
  }

  // Use a temporary directory with timestamp to avoid collisions
  const tempSuffix = `.tmp-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`;
  const tempDir = `${CACHE_DIR}/${did}/${rkey}${tempSuffix}`;
  const finalDir = `${CACHE_DIR}/${did}/${rkey}`;

  try {
    // Download to temporary directory
    await cacheFiles(did, rkey, record.root.entries, pdsEndpoint, '', tempSuffix);
    await saveCacheMetadata(did, rkey, recordCid, tempSuffix);

    // Atomically replace old cache with new cache
    // On POSIX systems (Linux/macOS), rename is atomic
    if (existsSync(finalDir)) {
      // Rename old directory to backup
      const backupDir = `${finalDir}.old-${Date.now()}`;
      await rename(finalDir, backupDir);

      try {
        // Rename new directory to final location
        await rename(tempDir, finalDir);
        // Clean up old backup
        rmSync(backupDir, { recursive: true, force: true });
      } catch (err) {
        // If rename failed, restore backup
        if (existsSync(backupDir) && !existsSync(finalDir)) {
          await rename(backupDir, finalDir);
        }
        throw err;
      }
    } else {
      // No existing cache, just rename temp to final
      await rename(tempDir, finalDir);
    }

    console.log('Successfully cached site atomically', did, rkey);
  } catch (err) {
    // Clean up temp directory on failure
    if (existsSync(tempDir)) {
      rmSync(tempDir, { recursive: true, force: true });
    }
    throw err;
  }
}

async function cacheFiles(
  did: string,
  site: string,
  entries: Entry[],
  pdsEndpoint: string,
  pathPrefix: string,
  dirSuffix: string = ''
): Promise<void> {
  // Collect all file blob download tasks first
  const downloadTasks: Array<() => Promise<void>> = [];

  function collectFileTasks(
    entries: Entry[],
    currentPathPrefix: string
  ) {
    for (const entry of entries) {
      const currentPath = currentPathPrefix ? `${currentPathPrefix}/${entry.name}` : entry.name;
      const node = entry.node;

      if ('type' in node && node.type === 'directory' && 'entries' in node) {
        collectFileTasks(node.entries, currentPath);
      } else if ('type' in node && node.type === 'file' && 'blob' in node) {
        const fileNode = node as File;
        downloadTasks.push(() => cacheFileBlob(
          did,
          site,
          currentPath,
          fileNode.blob,
          pdsEndpoint,
          fileNode.encoding,
          fileNode.mimeType,
          fileNode.base64,
          dirSuffix
        ));
      }
    }
  }

  collectFileTasks(entries, pathPrefix);

  // Execute downloads concurrently with a limit of 3 at a time
  const concurrencyLimit = 3;
  for (let i = 0; i < downloadTasks.length; i += concurrencyLimit) {
    const batch = downloadTasks.slice(i, i + concurrencyLimit);
    await Promise.all(batch.map(task => task()));
  }
}

async function cacheFileBlob(
  did: string,
  site: string,
  filePath: string,
  blobRef: any,
  pdsEndpoint: string,
  encoding?: 'gzip',
  mimeType?: string,
  base64?: boolean,
  dirSuffix: string = ''
): Promise<void> {
  const cid = extractBlobCid(blobRef);
  if (!cid) {
    console.error('Could not extract CID from blob', blobRef);
    return;
  }

  const blobUrl = `${pdsEndpoint}/xrpc/com.atproto.sync.getBlob?did=${encodeURIComponent(did)}&cid=${encodeURIComponent(cid)}`;

  // Allow up to 100MB per file blob, with 2 minute timeout
  let content = await safeFetchBlob(blobUrl, { maxSize: 100 * 1024 * 1024, timeout: 120000 });
  console.log(`[DEBUG] ${filePath}: fetched ${content.length} bytes, base64=${base64}, encoding=${encoding}, mimeType=${mimeType}`);

  // If content is base64-encoded, decode it back to binary (gzipped or not)
  if (base64) {
    const originalSize = content.length;
    // The content from the blob is base64 text, decode it directly to binary
    const buffer = Buffer.from(content);
    const base64String = buffer.toString('ascii'); // Use ascii for base64 text, not utf-8
    console.log(`[DEBUG] ${filePath}: base64 string first 100 chars: ${base64String.substring(0, 100)}`);
    content = Buffer.from(base64String, 'base64');
    console.log(`[DEBUG] ${filePath}: decoded from ${originalSize} bytes to ${content.length} bytes`);

    // Check if it's actually gzipped by looking at magic bytes
    if (content.length >= 2) {
      const magic = content[0] === 0x1f && content[1] === 0x8b;
      const byte0 = content[0];
      const byte1 = content[1];
      console.log(`[DEBUG] ${filePath}: has gzip magic bytes: ${magic} (0x${byte0?.toString(16)}, 0x${byte1?.toString(16)})`);
    }
  }

  const cacheFile = `${CACHE_DIR}/${did}/${site}${dirSuffix}/${filePath}`;
  const fileDir = cacheFile.substring(0, cacheFile.lastIndexOf('/'));

  if (fileDir && !existsSync(fileDir)) {
    mkdirSync(fileDir, { recursive: true });
  }

  await writeFile(cacheFile, content);

  // Store metadata if file is compressed
  if (encoding === 'gzip' && mimeType) {
    const metaFile = `${cacheFile}.meta`;
    await writeFile(metaFile, JSON.stringify({ encoding, mimeType }));
    console.log('Cached file', filePath, content.length, 'bytes (gzipped,', mimeType + ')');
  } else {
    console.log('Cached file', filePath, content.length, 'bytes');
  }
}

/**
 * Sanitize a file path to prevent directory traversal attacks.
 * Removes any path segments that attempt to go up directories.
 */
export function sanitizePath(filePath: string): string {
  // Remove leading slashes
  const cleaned = filePath.replace(/^\/+/, '');

  // Split into segments and filter out dangerous ones
  const segments = cleaned.split('/').filter(segment => {
    // Remove empty segments
    if (!segment || segment === '.') return false;
    // Remove parent directory references
    if (segment === '..') return false;
    // Remove segments with null bytes
    if (segment.includes('\0')) return false;
    return true;
  });

  // Rejoin the safe segments
  return segments.join('/');
}

export function getCachedFilePath(did: string, site: string, filePath: string): string {
  const sanitizedPath = sanitizePath(filePath);
  return `${CACHE_DIR}/${did}/${site}/${sanitizedPath}`;
}

export function isCached(did: string, site: string): boolean {
  return existsSync(`${CACHE_DIR}/${did}/${site}`);
}

async function saveCacheMetadata(did: string, rkey: string, recordCid: string, dirSuffix: string = ''): Promise<void> {
  const metadata: CacheMetadata = {
    recordCid,
    cachedAt: Date.now(),
    did,
    rkey
  };

  const metadataPath = `${CACHE_DIR}/${did}/${rkey}${dirSuffix}/.metadata.json`;
  const metadataDir = metadataPath.substring(0, metadataPath.lastIndexOf('/'));

  if (!existsSync(metadataDir)) {
    mkdirSync(metadataDir, { recursive: true });
  }

  await writeFile(metadataPath, JSON.stringify(metadata, null, 2));
}

async function getCacheMetadata(did: string, rkey: string): Promise<CacheMetadata | null> {
  try {
    const metadataPath = `${CACHE_DIR}/${did}/${rkey}/.metadata.json`;
    if (!existsSync(metadataPath)) return null;

    const content = await readFile(metadataPath, 'utf-8');
    return JSON.parse(content) as CacheMetadata;
  } catch (err) {
    console.error('Failed to read cache metadata', err);
    return null;
  }
}

export async function isCacheValid(did: string, rkey: string, currentRecordCid?: string): Promise<boolean> {
  const metadata = await getCacheMetadata(did, rkey);
  if (!metadata) return false;

  // Check if cache has expired (14 days TTL)
  const cacheAge = Date.now() - metadata.cachedAt;
  if (cacheAge > CACHE_TTL) {
    console.log('[Cache] Cache expired for', did, rkey);
    return false;
  }

  // If current CID is provided, verify it matches
  if (currentRecordCid && metadata.recordCid !== currentRecordCid) {
    console.log('[Cache] CID mismatch for', did, rkey, 'cached:', metadata.recordCid, 'current:', currentRecordCid);
    return false;
  }

  return true;
}
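
/*
 * Example usage (a minimal sketch, not part of this module's exports): resolve an
 * identifier to a DID, locate its PDS, fetch the site record, and refresh the local
 * cache only when it is missing or stale. The handle 'example.com' and rkey 'self'
 * below are placeholder values.
 *
 *   const did = await resolveDid('example.com');
 *   if (!did) throw new Error('Could not resolve identifier');
 *
 *   const pdsEndpoint = await getPdsForDid(did);
 *   const site = await fetchSiteRecord(did, 'self');
 *   if (!pdsEndpoint || !site) throw new Error('Could not fetch site record');
 *
 *   if (!(await isCacheValid(did, 'self', site.cid))) {
 *     await downloadAndCacheSite(did, 'self', site.record, pdsEndpoint, site.cid);
 *   }
 *
 *   const indexPath = getCachedFilePath(did, 'self', 'index.html');
 */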