Monorepo for wisp.place, a static site hosting service built on top of the AT Protocol.
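The caching code below walks place.wisp.fs records whose root is a tree of named entries. A rough sketch of the record shape it expects is shown here; the example values are hypothetical and only the fields the code actually reads (name, node.type, blob, mimeType, encoding, base64, subject) are inferred from the imported lexicon types, not from the lexicon definition itself.

// Hypothetical place.wisp.fs record, sketched from how the cache code reads it.
// Real records come from the PDS via com.atproto.repo.getRecord.
const exampleRecord = {
  $type: 'place.wisp.fs',
  root: {
    type: 'directory',
    entries: [
      {
        name: 'index.html',
        node: {
          type: 'file',
          blob: { ref: { $link: 'bafkrei...' } }, // blob ref; $link / CID / cid forms are all handled
          mimeType: 'text/html',
          encoding: 'gzip', // present when the uploaded blob was gzipped
          base64: false,    // true when the blob bytes are base64 text
        },
      },
      {
        name: 'assets',
        node: {
          type: 'subfs',
          // points at a place.wisp.subfs record that gets expanded before caching
          subject: 'at://did:plc:example/place.wisp.subfs/rkey123',
        },
      },
    ],
  },
};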
import { AtpAgent } from '@atproto/api';
import type { Record as WispFsRecord, Directory, Entry, File } from '../lexicon/types/place/wisp/fs';
import type { Record as SubfsRecord } from '../lexicon/types/place/wisp/subfs';
import { existsSync, mkdirSync, readFileSync, rmSync } from 'fs';
import { writeFile, readFile, rename } from 'fs/promises';
import { safeFetchJson, safeFetchBlob } from './safe-fetch';
import { CID } from 'multiformats';

const CACHE_DIR = process.env.CACHE_DIR || './cache/sites';
const CACHE_TTL = 14 * 24 * 60 * 60 * 1000; // 14 days cache TTL

interface CacheMetadata {
  recordCid: string;
  cachedAt: number;
  did: string;
  rkey: string;
  // Map of file path to blob CID for incremental updates
  fileCids?: Record<string, string>;
}

/**
 * Determines if a MIME type should benefit from gzip compression.
 * Returns true for text-based web assets (HTML, CSS, JS, JSON, XML, SVG).
 * Returns false for already-compressed formats (images, video, audio, PDFs).
 */
export function shouldCompressMimeType(mimeType: string | undefined): boolean {
  if (!mimeType) return false;

  const mime = mimeType.toLowerCase();

  // Text-based web assets that benefit from compression
  const compressibleTypes = [
    'text/html',
    'text/css',
    'text/javascript',
    'application/javascript',
    'application/x-javascript',
    'text/xml',
    'application/xml',
    'application/json',
    'text/plain',
    'image/svg+xml',
  ];

  if (compressibleTypes.some(type => mime === type || mime.startsWith(type))) {
    return true;
  }

  // Already-compressed formats that should NOT be double-compressed
  const alreadyCompressedPrefixes = [
    'video/',
    'audio/',
    'image/',
    'application/pdf',
    'application/zip',
    'application/gzip',
  ];

  if (alreadyCompressedPrefixes.some(prefix => mime.startsWith(prefix))) {
    return false;
  }

  // Default to not compressing for unknown types
  return false;
}

interface IpldLink {
  $link: string;
}

interface TypedBlobRef {
  ref: CID | IpldLink;
}

interface UntypedBlobRef {
  cid: string;
}

function isIpldLink(obj: unknown): obj is IpldLink {
  return typeof obj === 'object' && obj !== null && '$link' in obj && typeof (obj as IpldLink).$link === 'string';
}

function isTypedBlobRef(obj: unknown): obj is TypedBlobRef {
  return typeof obj === 'object' && obj !== null && 'ref' in obj;
}

function isUntypedBlobRef(obj: unknown): obj is UntypedBlobRef {
  return typeof obj === 'object' && obj !== null && 'cid' in obj && typeof (obj as UntypedBlobRef).cid === 'string';
}

export async function resolveDid(identifier: string): Promise<string | null> {
  try {
    // If it's already a DID, return it
    if (identifier.startsWith('did:')) {
      return identifier;
    }

    // Otherwise, resolve the handle using agent's built-in method
    const agent = new AtpAgent({ service: 'https://public.api.bsky.app' });
    const response = await agent.resolveHandle({ handle: identifier });
    return response.data.did;
  } catch (err) {
    console.error('Failed to resolve identifier', identifier, err);
    return null;
  }
}

export async function getPdsForDid(did: string): Promise<string | null> {
  try {
    let doc;

    if (did.startsWith('did:plc:')) {
      doc = await safeFetchJson(`https://plc.directory/${encodeURIComponent(did)}`);
    } else if (did.startsWith('did:web:')) {
      const didUrl = didWebToHttps(did);
      doc = await safeFetchJson(didUrl);
    } else {
      console.error('Unsupported DID method', did);
      return null;
    }

    const services = doc.service || [];
    const pdsService = services.find((s: any) => s.id === '#atproto_pds');

    return pdsService?.serviceEndpoint || null;
  } catch (err) {
    console.error('Failed to get PDS for DID', did, err);
    return null;
  }
}

function didWebToHttps(did: string): string {
  const didParts = did.split(':');
  if (didParts.length < 3 || didParts[0] !== 'did' || didParts[1] !== 'web') {
    throw new Error('Invalid did:web format');
  }

  const domain = didParts[2];
  const pathParts = didParts.slice(3);

  if (pathParts.length === 0) {
    return `https://${domain}/.well-known/did.json`;
  } else {
    const path = pathParts.join('/');
    return `https://${domain}/${path}/did.json`;
  }
}

export async function fetchSiteRecord(did: string, rkey: string): Promise<{ record: WispFsRecord; cid: string } | null> {
  try {
    const pdsEndpoint = await getPdsForDid(did);
    if (!pdsEndpoint) return null;

    const url = `${pdsEndpoint}/xrpc/com.atproto.repo.getRecord?repo=${encodeURIComponent(did)}&collection=place.wisp.fs&rkey=${encodeURIComponent(rkey)}`;
    const data = await safeFetchJson(url);

    return {
      record: data.value as WispFsRecord,
      cid: data.cid || ''
    };
  } catch (err) {
    console.error('Failed to fetch site record', did, rkey, err);
    return null;
  }
}

export function extractBlobCid(blobRef: unknown): string | null {
  if (isIpldLink(blobRef)) {
    return blobRef.$link;
  }

  if (isTypedBlobRef(blobRef)) {
    const ref = blobRef.ref;

    const cid = CID.asCID(ref);
    if (cid) {
      return cid.toString();
    }

    if (isIpldLink(ref)) {
      return ref.$link;
    }
  }

  if (isUntypedBlobRef(blobRef)) {
    return blobRef.cid;
  }

  return null;
}

/**
 * Extract all subfs URIs from a directory tree with their mount paths
 */
function extractSubfsUris(directory: Directory, currentPath: string = ''): Array<{ uri: string; path: string }> {
  const uris: Array<{ uri: string; path: string }> = [];

  for (const entry of directory.entries) {
    const fullPath = currentPath ? `${currentPath}/${entry.name}` : entry.name;

    if ('type' in entry.node) {
      if (entry.node.type === 'subfs') {
        // Subfs node with subject URI
        const subfsNode = entry.node as any;
        if (subfsNode.subject) {
          uris.push({ uri: subfsNode.subject, path: fullPath });
        }
      } else if (entry.node.type === 'directory') {
        // Recursively search subdirectories
        const subUris = extractSubfsUris(entry.node as Directory, fullPath);
        uris.push(...subUris);
      }
    }
  }

  return uris;
}

/**
 * Fetch a subfs record from the PDS
 */
async function fetchSubfsRecord(uri: string, pdsEndpoint: string): Promise<SubfsRecord | null> {
  try {
    // Parse URI: at://did/collection/rkey
    const parts = uri.replace('at://', '').split('/');
    if (parts.length < 3) {
      console.error('Invalid subfs URI:', uri);
      return null;
    }

    const did = parts[0];
    const collection = parts[1];
    const rkey = parts[2];

    // Fetch the record from PDS
    const url = `${pdsEndpoint}/xrpc/com.atproto.repo.getRecord?repo=${encodeURIComponent(did)}&collection=${encodeURIComponent(collection)}&rkey=${encodeURIComponent(rkey)}`;
    const response = await safeFetchJson(url);

    if (!response || !response.value) {
      console.error('Subfs record not found:', uri);
      return null;
    }

    return response.value as SubfsRecord;
  } catch (err) {
    console.error('Failed to fetch subfs record:', uri, err);
    return null;
  }
}

/**
 * Replace subfs nodes in a directory tree with their actual content
 */
async function expandSubfsNodes(directory: Directory, pdsEndpoint: string): Promise<Directory> {
  // Extract all subfs URIs
  const subfsUris = extractSubfsUris(directory);

  if (subfsUris.length === 0) {
    // No subfs nodes, return as-is
    return directory;
  }

  console.log(`Found ${subfsUris.length} subfs records, fetching...`);

  // Fetch all subfs records in parallel
  const subfsRecords = await Promise.all(
    subfsUris.map(async ({ uri, path }) => {
      const record = await fetchSubfsRecord(uri, pdsEndpoint);
      return { record, path };
    })
  );

  // Build a map of path -> directory content
  const subfsMap = new Map<string, Directory>();
  for (const { record, path } of subfsRecords) {
    if (record && record.root) {
      subfsMap.set(path, record.root);
    }
  }

  // Replace subfs nodes with their actual content
  function replaceSubfsInEntries(entries: Entry[], currentPath: string = ''): Entry[] {
    return entries.map(entry => {
      const fullPath = currentPath ? `${currentPath}/${entry.name}` : entry.name;
      const node = entry.node;

      if ('type' in node && node.type === 'subfs') {
        // Replace with actual directory content
        const subfsDir = subfsMap.get(fullPath);
        if (subfsDir) {
          console.log(`Expanding subfs node at ${fullPath}`);
          return {
            ...entry,
            node: subfsDir
          };
        }
        // If fetch failed, keep the subfs node (will be skipped later)
        return entry;
      } else if ('type' in node && node.type === 'directory' && 'entries' in node) {
        // Recursively process subdirectories
        return {
          ...entry,
          node: {
            ...node,
            entries: replaceSubfsInEntries(node.entries, fullPath)
          }
        };
      }

      return entry;
    });
  }

  return {
    ...directory,
    entries: replaceSubfsInEntries(directory.entries)
  };
}

export async function downloadAndCacheSite(did: string, rkey: string, record: WispFsRecord, pdsEndpoint: string, recordCid: string): Promise<void> {
  console.log('Caching site', did, rkey);

  if (!record.root) {
    console.error('Record missing root directory:', JSON.stringify(record, null, 2));
    throw new Error('Invalid record structure: missing root directory');
  }

  if (!record.root.entries || !Array.isArray(record.root.entries)) {
    console.error('Record root missing entries array:', JSON.stringify(record.root, null, 2));
    throw new Error('Invalid record structure: root missing entries array');
  }

  // Expand subfs nodes before caching
  const expandedRoot = await expandSubfsNodes(record.root, pdsEndpoint);

  // Get existing cache metadata to check for incremental updates
  const existingMetadata = await getCacheMetadata(did, rkey);
  const existingFileCids = existingMetadata?.fileCids || {};

  // Use a temporary directory with timestamp to avoid collisions
  const tempSuffix = `.tmp-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`;
  const tempDir = `${CACHE_DIR}/${did}/${rkey}${tempSuffix}`;
  const finalDir = `${CACHE_DIR}/${did}/${rkey}`;

  try {
    // Collect file CIDs from the new record (using expanded root)
    const newFileCids: Record<string, string> = {};
    collectFileCidsFromEntries(expandedRoot.entries, '', newFileCids);

    // Download/copy files to temporary directory (with incremental logic, using expanded root)
    await cacheFiles(did, rkey, expandedRoot.entries, pdsEndpoint, '', tempSuffix, existingFileCids, finalDir);
    await saveCacheMetadata(did, rkey, recordCid, tempSuffix, newFileCids);

    // Atomically replace old cache with new cache
    // On POSIX systems (Linux/macOS), rename is atomic
    if (existsSync(finalDir)) {
      // Rename old directory to backup
      const backupDir = `${finalDir}.old-${Date.now()}`;
      await rename(finalDir, backupDir);

      try {
        // Rename new directory to final location
        await rename(tempDir, finalDir);

        // Clean up old backup
        rmSync(backupDir, { recursive: true, force: true });
      } catch (err) {
        // If rename failed, restore backup
        if (existsSync(backupDir) && !existsSync(finalDir)) {
          await rename(backupDir, finalDir);
        }
        throw err;
      }
    } else {
      // No existing cache, just rename temp to final
      await rename(tempDir, finalDir);
    }

    console.log('Successfully cached site atomically', did, rkey);
  } catch (err) {
    // Clean up temp directory on failure
    if (existsSync(tempDir)) {
      rmSync(tempDir, { recursive: true, force: true });
    }
    throw err;
  }
}

/**
 * Recursively collect file CIDs from entries for incremental update tracking
 */
function collectFileCidsFromEntries(entries: Entry[], pathPrefix: string, fileCids: Record<string, string>): void {
  for (const entry of entries) {
    const currentPath = pathPrefix ? `${pathPrefix}/${entry.name}` : entry.name;
    const node = entry.node;

    if ('type' in node && node.type === 'directory' && 'entries' in node) {
      collectFileCidsFromEntries(node.entries, currentPath, fileCids);
    } else if ('type' in node && node.type === 'file' && 'blob' in node) {
      const fileNode = node as File;
      const cid = extractBlobCid(fileNode.blob);
      if (cid) {
        fileCids[currentPath] = cid;
      }
    }
  }
}

async function cacheFiles(
  did: string,
  site: string,
  entries: Entry[],
  pdsEndpoint: string,
  pathPrefix: string,
  dirSuffix: string = '',
  existingFileCids: Record<string, string> = {},
  existingCacheDir?: string
): Promise<void> {
  // Collect file tasks, separating unchanged files from new/changed files
  const downloadTasks: Array<() => Promise<void>> = [];
  const copyTasks: Array<() => Promise<void>> = [];

  function collectFileTasks(
    entries: Entry[],
    currentPathPrefix: string
  ) {
    for (const entry of entries) {
      const currentPath = currentPathPrefix ? `${currentPathPrefix}/${entry.name}` : entry.name;
      const node = entry.node;

      if ('type' in node && node.type === 'directory' && 'entries' in node) {
        collectFileTasks(node.entries, currentPath);
      } else if ('type' in node && node.type === 'file' && 'blob' in node) {
        const fileNode = node as File;
        const cid = extractBlobCid(fileNode.blob);

        // Check if file is unchanged (same CID as existing cache)
        if (cid && existingFileCids[currentPath] === cid && existingCacheDir) {
          // File unchanged - copy from existing cache instead of downloading
          copyTasks.push(() => copyExistingFile(
            did,
            site,
            currentPath,
            dirSuffix,
            existingCacheDir
          ));
        } else {
          // File new or changed - download it
          downloadTasks.push(() => cacheFileBlob(
            did,
            site,
            currentPath,
            fileNode.blob,
            pdsEndpoint,
            fileNode.encoding,
            fileNode.mimeType,
            fileNode.base64,
            dirSuffix
          ));
        }
      }
    }
  }

  collectFileTasks(entries, pathPrefix);

  console.log(`[Incremental Update] Files to copy: ${copyTasks.length}, Files to download: ${downloadTasks.length}`);

  // Copy unchanged files in parallel (fast local operations)
  const copyLimit = 10;
  for (let i = 0; i < copyTasks.length; i += copyLimit) {
    const batch = copyTasks.slice(i, i + copyLimit);
    await Promise.all(batch.map(task => task()));
  }

  // Download new/changed files concurrently with a limit of 3 at a time
  const downloadLimit = 3;
  for (let i = 0; i < downloadTasks.length; i += downloadLimit) {
    const batch = downloadTasks.slice(i, i + downloadLimit);
    await Promise.all(batch.map(task => task()));
  }
}
/**
 * Copy an unchanged file from existing cache to new cache location
 */
async function copyExistingFile(
  did: string,
  site: string,
  filePath: string,
  dirSuffix: string,
  existingCacheDir: string
): Promise<void> {
  const { copyFile } = await import('fs/promises');

  const sourceFile = `${existingCacheDir}/${filePath}`;
  const destFile = `${CACHE_DIR}/${did}/${site}${dirSuffix}/${filePath}`;
  const destDir = destFile.substring(0, destFile.lastIndexOf('/'));

  // Create destination directory if needed
  if (destDir && !existsSync(destDir)) {
    mkdirSync(destDir, { recursive: true });
  }

  try {
    // Copy the file
    await copyFile(sourceFile, destFile);

    // Copy metadata file if it exists
    const sourceMetaFile = `${sourceFile}.meta`;
    const destMetaFile = `${destFile}.meta`;
    if (existsSync(sourceMetaFile)) {
      await copyFile(sourceMetaFile, destMetaFile);
    }

    console.log(`[Incremental] Copied unchanged file: ${filePath}`);
  } catch (err) {
    console.error(`[Incremental] Failed to copy file ${filePath}, will attempt download:`, err);
    throw err;
  }
}

async function cacheFileBlob(
  did: string,
  site: string,
  filePath: string,
  blobRef: any,
  pdsEndpoint: string,
  encoding?: 'gzip',
  mimeType?: string,
  base64?: boolean,
  dirSuffix: string = ''
): Promise<void> {
  const cid = extractBlobCid(blobRef);
  if (!cid) {
    console.error('Could not extract CID from blob', blobRef);
    return;
  }

  const blobUrl = `${pdsEndpoint}/xrpc/com.atproto.sync.getBlob?did=${encodeURIComponent(did)}&cid=${encodeURIComponent(cid)}`;

  // Allow up to 500MB per file blob, with 5 minute timeout
  let content = await safeFetchBlob(blobUrl, { maxSize: 500 * 1024 * 1024, timeout: 300000 });

  console.log(`[DEBUG] ${filePath}: fetched ${content.length} bytes, base64=${base64}, encoding=${encoding}, mimeType=${mimeType}`);

  // If content is base64-encoded, decode it back to raw binary (gzipped or not)
  if (base64) {
    const originalSize = content.length;
    // The blob contains base64 text stored as raw bytes: decode the bytes to a
    // string, then decode that base64 string back to binary
    const textDecoder = new TextDecoder();
    const base64String = textDecoder.decode(content);
    content = Buffer.from(base64String, 'base64');
    console.log(`[DEBUG] ${filePath}: decoded base64 from ${originalSize} bytes to ${content.length} bytes`);

    // Check if it's actually gzipped by looking at magic bytes
    if (content.length >= 2) {
      const hasGzipMagic = content[0] === 0x1f && content[1] === 0x8b;
      console.log(`[DEBUG] ${filePath}: has gzip magic bytes: ${hasGzipMagic}`);
    }
  }

  const cacheFile = `${CACHE_DIR}/${did}/${site}${dirSuffix}/${filePath}`;
  const fileDir = cacheFile.substring(0, cacheFile.lastIndexOf('/'));

  if (fileDir && !existsSync(fileDir)) {
    mkdirSync(fileDir, { recursive: true });
  }

  // Use the shared function to determine if this should remain compressed
  const shouldStayCompressed = shouldCompressMimeType(mimeType);

  // Decompress files that shouldn't be stored compressed
  if (encoding === 'gzip' && !shouldStayCompressed && content.length >= 2 &&
      content[0] === 0x1f && content[1] === 0x8b) {
    console.log(`[DEBUG] ${filePath}: decompressing non-compressible type (${mimeType}) before caching`);
    try {
      const { gunzipSync } = await import('zlib');
      const decompressed = gunzipSync(content);
      console.log(`[DEBUG] ${filePath}: decompressed from ${content.length} to ${decompressed.length} bytes`);
      content = decompressed;
      // Clear the encoding flag since we're storing decompressed
      encoding = undefined;
    } catch (error) {
      console.log(`[DEBUG] ${filePath}: failed to decompress, storing original gzipped content. Error:`, error);
    }
  }

  await writeFile(cacheFile, content);

  // Store metadata only if file is still compressed
  if (encoding === 'gzip' && mimeType) {
    const metaFile = `${cacheFile}.meta`;
    await writeFile(metaFile, JSON.stringify({ encoding, mimeType }));
    console.log('Cached file', filePath, content.length, 'bytes (gzipped,', mimeType + ')');
  } else {
    console.log('Cached file', filePath, content.length, 'bytes');
  }
}
/**
 * Sanitize a file path to prevent directory traversal attacks
 * Removes any path segments that attempt to go up directories
 */
export function sanitizePath(filePath: string): string {
  // Remove leading slashes
  let cleaned = filePath.replace(/^\/+/, '');

  // Split into segments and filter out dangerous ones
  const segments = cleaned.split('/').filter(segment => {
    // Remove empty segments
    if (!segment || segment === '.') return false;
    // Remove parent directory references
    if (segment === '..') return false;
    // Remove segments with null bytes
    if (segment.includes('\0')) return false;
    return true;
  });

  // Rejoin the safe segments
  return segments.join('/');
}

export function getCachedFilePath(did: string, site: string, filePath: string): string {
  const sanitizedPath = sanitizePath(filePath);
  return `${CACHE_DIR}/${did}/${site}/${sanitizedPath}`;
}

export function isCached(did: string, site: string): boolean {
  return existsSync(`${CACHE_DIR}/${did}/${site}`);
}

async function saveCacheMetadata(did: string, rkey: string, recordCid: string, dirSuffix: string = '', fileCids?: Record<string, string>): Promise<void> {
  const metadata: CacheMetadata = {
    recordCid,
    cachedAt: Date.now(),
    did,
    rkey,
    fileCids
  };

  const metadataPath = `${CACHE_DIR}/${did}/${rkey}${dirSuffix}/.metadata.json`;
  const metadataDir = metadataPath.substring(0, metadataPath.lastIndexOf('/'));

  if (!existsSync(metadataDir)) {
    mkdirSync(metadataDir, { recursive: true });
  }

  await writeFile(metadataPath, JSON.stringify(metadata, null, 2));
}

async function getCacheMetadata(did: string, rkey: string): Promise<CacheMetadata | null> {
  try {
    const metadataPath = `${CACHE_DIR}/${did}/${rkey}/.metadata.json`;
    if (!existsSync(metadataPath)) return null;

    const content = await readFile(metadataPath, 'utf-8');
    return JSON.parse(content) as CacheMetadata;
  } catch (err) {
    console.error('Failed to read cache metadata', err);
    return null;
  }
}

export async function isCacheValid(did: string, rkey: string, currentRecordCid?: string): Promise<boolean> {
  const metadata = await getCacheMetadata(did, rkey);
  if (!metadata) return false;

  // Check if cache has expired (14 days TTL)
  const cacheAge = Date.now() - metadata.cachedAt;
  if (cacheAge > CACHE_TTL) {
    console.log('[Cache] Cache expired for', did, rkey);
    return false;
  }

  // If current CID is provided, verify it matches
  if (currentRecordCid && metadata.recordCid !== currentRecordCid) {
    console.log('[Cache] CID mismatch for', did, rkey, 'cached:', metadata.recordCid, 'current:', currentRecordCid);
    return false;
  }

  return true;
}
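Taken together, a caller would typically resolve the identifier, find the PDS, fetch the place.wisp.fs record, and re-download only when isCacheValid reports the stored copy is stale. A minimal sketch of that flow follows; the module path './site-cache' and the driver function name are assumptions for illustration, while the imported functions and their signatures come from the file above.

import { resolveDid, getPdsForDid, fetchSiteRecord, isCacheValid, downloadAndCacheSite, getCachedFilePath } from './site-cache';

// Hypothetical driver showing the intended call order; error handling is minimal.
async function ensureSiteCached(identifier: string, rkey: string): Promise<string | null> {
  const did = await resolveDid(identifier);        // handle or DID -> DID
  if (!did) return null;

  const pdsEndpoint = await getPdsForDid(did);     // DID document -> #atproto_pds endpoint
  if (!pdsEndpoint) return null;

  const site = await fetchSiteRecord(did, rkey);   // place.wisp.fs record + record CID
  if (!site) return null;

  // Re-download only when the cache is missing, expired, or the record CID changed
  if (!(await isCacheValid(did, rkey, site.cid))) {
    await downloadAndCacheSite(did, rkey, site.record, pdsEndpoint, site.cid);
  }

  return getCachedFilePath(did, rkey, 'index.html'); // where the served file would live
}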