Monorepo for wisp.place, a static site hosting service built on top of the AT Protocol.
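The cache module below walks place.wisp.fs records whose root is a directory tree of named entries. Purely as an illustration (the field names are inferred from how the code reads the record, not taken from the lexicon, and every value is made up), a minimal record might look like this:

// Illustrative only: the shape of a place.wisp.fs record as the cache layer
// reads it. Field names are inferred from the code below, not from the lexicon,
// and the paths, sizes, and CIDs are invented.
const exampleRecord = {
  root: {
    type: 'directory',
    entries: [
      {
        name: 'index.html',
        node: {
          type: 'file',
          mimeType: 'text/html',
          encoding: 'gzip',   // set when the uploader stored the file gzipped
          base64: false,      // true when the blob bytes are base64-encoded text
          blob: { ref: { $link: 'bafkreiexampleindexcid' }, mimeType: 'text/html', size: 1234 },
        },
      },
      {
        name: 'assets',
        node: {
          type: 'directory',
          entries: [
            {
              name: 'logo.png',
              node: {
                type: 'file',
                mimeType: 'image/png',
                blob: { ref: { $link: 'bafkreiexamplelogocid' }, mimeType: 'image/png', size: 4567 },
              },
            },
          ],
        },
      },
    ],
  },
};

Each file node carries a blob reference; its CID is what the cache layer downloads via com.atproto.sync.getBlob.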
import { AtpAgent } from '@atproto/api';
import type { Record as WispFsRecord, Directory, Entry, File } from '../lexicon/types/place/wisp/fs';
import { existsSync, mkdirSync, readFileSync, rmSync } from 'fs';
import { writeFile, readFile, rename } from 'fs/promises';
import { safeFetchJson, safeFetchBlob } from './safe-fetch';
import { CID } from 'multiformats';

const CACHE_DIR = process.env.CACHE_DIR || './cache/sites';
const CACHE_TTL = 14 * 24 * 60 * 60 * 1000; // 14 days cache TTL

interface CacheMetadata {
  recordCid: string;
  cachedAt: number;
  did: string;
  rkey: string;
  // Map of file path to blob CID for incremental updates
  fileCids?: Record<string, string>;
}

/**
 * Determines if a MIME type should benefit from gzip compression.
 * Returns true for text-based web assets (HTML, CSS, JS, JSON, XML, SVG).
 * Returns false for already-compressed formats (images, video, audio, PDFs).
 */
export function shouldCompressMimeType(mimeType: string | undefined): boolean {
  if (!mimeType) return false;

  const mime = mimeType.toLowerCase();

  // Text-based web assets that benefit from compression
  const compressibleTypes = [
    'text/html',
    'text/css',
    'text/javascript',
    'application/javascript',
    'application/x-javascript',
    'text/xml',
    'application/xml',
    'application/json',
    'text/plain',
    'image/svg+xml',
  ];

  if (compressibleTypes.some(type => mime === type || mime.startsWith(type))) {
    return true;
  }

  // Already-compressed formats that should NOT be double-compressed
  const alreadyCompressedPrefixes = [
    'video/',
    'audio/',
    'image/',
    'application/pdf',
    'application/zip',
    'application/gzip',
  ];

  if (alreadyCompressedPrefixes.some(prefix => mime.startsWith(prefix))) {
    return false;
  }

  // Default to not compressing for unknown types
  return false;
}

interface IpldLink {
  $link: string;
}

interface TypedBlobRef {
  ref: CID | IpldLink;
}

interface UntypedBlobRef {
  cid: string;
}

function isIpldLink(obj: unknown): obj is IpldLink {
  return typeof obj === 'object' && obj !== null && '$link' in obj && typeof (obj as IpldLink).$link === 'string';
}

function isTypedBlobRef(obj: unknown): obj is TypedBlobRef {
  return typeof obj === 'object' && obj !== null && 'ref' in obj;
}

function isUntypedBlobRef(obj: unknown): obj is UntypedBlobRef {
  return typeof obj === 'object' && obj !== null && 'cid' in obj && typeof (obj as UntypedBlobRef).cid === 'string';
}

export async function resolveDid(identifier: string): Promise<string | null> {
  try {
    // If it's already a DID, return it
    if (identifier.startsWith('did:')) {
      return identifier;
    }

    // Otherwise, resolve the handle using the agent's built-in method
    const agent = new AtpAgent({ service: 'https://public.api.bsky.app' });
    const response = await agent.resolveHandle({ handle: identifier });
    return response.data.did;
  } catch (err) {
    console.error('Failed to resolve identifier', identifier, err);
    return null;
  }
}

export async function getPdsForDid(did: string): Promise<string | null> {
  try {
    let doc;

    if (did.startsWith('did:plc:')) {
      doc = await safeFetchJson(`https://plc.directory/${encodeURIComponent(did)}`);
    } else if (did.startsWith('did:web:')) {
      const didUrl = didWebToHttps(did);
      doc = await safeFetchJson(didUrl);
    } else {
      console.error('Unsupported DID method', did);
      return null;
    }

    const services = doc.service || [];
    const pdsService = services.find((s: any) => s.id === '#atproto_pds');

    return pdsService?.serviceEndpoint || null;
  } catch (err) {
    console.error('Failed to get PDS for DID', did, err);
    return null;
  }
}

function didWebToHttps(did: string): string {
  const didParts = did.split(':');
  if (didParts.length < 3 || didParts[0] !== 'did' || didParts[1] !== 'web') {
    throw new Error('Invalid did:web format');
  }

  const domain = didParts[2];
  const pathParts = didParts.slice(3);

  if (pathParts.length === 0) {
    return `https://${domain}/.well-known/did.json`;
  } else {
    const path = pathParts.join('/');
    return `https://${domain}/${path}/did.json`;
  }
}

export async function fetchSiteRecord(did: string, rkey: string): Promise<{ record: WispFsRecord; cid: string } | null> {
  try {
    const pdsEndpoint = await getPdsForDid(did);
    if (!pdsEndpoint) return null;

    const url = `${pdsEndpoint}/xrpc/com.atproto.repo.getRecord?repo=${encodeURIComponent(did)}&collection=place.wisp.fs&rkey=${encodeURIComponent(rkey)}`;
    const data = await safeFetchJson(url);

    return {
      record: data.value as WispFsRecord,
      cid: data.cid || ''
    };
  } catch (err) {
    console.error('Failed to fetch site record', did, rkey, err);
    return null;
  }
}

export function extractBlobCid(blobRef: unknown): string | null {
  if (isIpldLink(blobRef)) {
    return blobRef.$link;
  }

  if (isTypedBlobRef(blobRef)) {
    const ref = blobRef.ref;

    const cid = CID.asCID(ref);
    if (cid) {
      return cid.toString();
    }

    if (isIpldLink(ref)) {
      return ref.$link;
    }
  }

  if (isUntypedBlobRef(blobRef)) {
    return blobRef.cid;
  }

  return null;
}

export async function downloadAndCacheSite(did: string, rkey: string, record: WispFsRecord, pdsEndpoint: string, recordCid: string): Promise<void> {
  console.log('Caching site', did, rkey);

  if (!record.root) {
    console.error('Record missing root directory:', JSON.stringify(record, null, 2));
    throw new Error('Invalid record structure: missing root directory');
  }

  if (!record.root.entries || !Array.isArray(record.root.entries)) {
    console.error('Record root missing entries array:', JSON.stringify(record.root, null, 2));
    throw new Error('Invalid record structure: root missing entries array');
  }

  // Get existing cache metadata to check for incremental updates
  const existingMetadata = await getCacheMetadata(did, rkey);
  const existingFileCids = existingMetadata?.fileCids || {};

  // Use a temporary directory with timestamp to avoid collisions
  const tempSuffix = `.tmp-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`;
  const tempDir = `${CACHE_DIR}/${did}/${rkey}${tempSuffix}`;
  const finalDir = `${CACHE_DIR}/${did}/${rkey}`;

  try {
    // Collect file CIDs from the new record
    const newFileCids: Record<string, string> = {};
    collectFileCidsFromEntries(record.root.entries, '', newFileCids);

    // Download/copy files to temporary directory (with incremental logic)
    await cacheFiles(did, rkey, record.root.entries, pdsEndpoint, '', tempSuffix, existingFileCids, finalDir);
    await saveCacheMetadata(did, rkey, recordCid, tempSuffix, newFileCids);

    // Atomically replace old cache with new cache
    // On POSIX systems (Linux/macOS), rename is atomic
    if (existsSync(finalDir)) {
      // Rename old directory to backup
      const backupDir = `${finalDir}.old-${Date.now()}`;
      await rename(finalDir, backupDir);

      try {
        // Rename new directory to final location
        await rename(tempDir, finalDir);

        // Clean up old backup
        rmSync(backupDir, { recursive: true, force: true });
      } catch (err) {
        // If rename failed, restore backup
        if (existsSync(backupDir) && !existsSync(finalDir)) {
          await rename(backupDir, finalDir);
        }
        throw err;
      }
    } else {
      // No existing cache, just rename temp to final
      await rename(tempDir, finalDir);
    }

    console.log('Successfully cached site atomically', did, rkey);
  } catch (err) {
    // Clean up temp directory on failure
    if (existsSync(tempDir)) {
      rmSync(tempDir, { recursive: true, force: true });
    }
    throw err;
  }
}

/**
 * Recursively collect file CIDs from entries for incremental update tracking
 */
function collectFileCidsFromEntries(entries: Entry[], pathPrefix: string, fileCids: Record<string, string>): void {
  for (const entry of entries) {
    const currentPath = pathPrefix ? `${pathPrefix}/${entry.name}` : entry.name;
    const node = entry.node;

    if ('type' in node && node.type === 'directory' && 'entries' in node) {
      collectFileCidsFromEntries(node.entries, currentPath, fileCids);
    } else if ('type' in node && node.type === 'file' && 'blob' in node) {
      const fileNode = node as File;
      const cid = extractBlobCid(fileNode.blob);
      if (cid) {
        fileCids[currentPath] = cid;
      }
    }
  }
}

async function cacheFiles(
  did: string,
  site: string,
  entries: Entry[],
  pdsEndpoint: string,
  pathPrefix: string,
  dirSuffix: string = '',
  existingFileCids: Record<string, string> = {},
  existingCacheDir?: string
): Promise<void> {
  // Collect file tasks, separating unchanged files from new/changed files
  const downloadTasks: Array<() => Promise<void>> = [];
  const copyTasks: Array<() => Promise<void>> = [];

  function collectFileTasks(
    entries: Entry[],
    currentPathPrefix: string
  ) {
    for (const entry of entries) {
      const currentPath = currentPathPrefix ? `${currentPathPrefix}/${entry.name}` : entry.name;
      const node = entry.node;

      if ('type' in node && node.type === 'directory' && 'entries' in node) {
        collectFileTasks(node.entries, currentPath);
      } else if ('type' in node && node.type === 'file' && 'blob' in node) {
        const fileNode = node as File;
        const cid = extractBlobCid(fileNode.blob);

        // Check if file is unchanged (same CID as existing cache)
        if (cid && existingFileCids[currentPath] === cid && existingCacheDir) {
          // File unchanged - copy from existing cache instead of downloading
          copyTasks.push(() => copyExistingFile(
            did,
            site,
            currentPath,
            dirSuffix,
            existingCacheDir
          ));
        } else {
          // File new or changed - download it
          downloadTasks.push(() => cacheFileBlob(
            did,
            site,
            currentPath,
            fileNode.blob,
            pdsEndpoint,
            fileNode.encoding,
            fileNode.mimeType,
            fileNode.base64,
            dirSuffix
          ));
        }
      }
    }
  }

  collectFileTasks(entries, pathPrefix);

  console.log(`[Incremental Update] Files to copy: ${copyTasks.length}, Files to download: ${downloadTasks.length}`);

  // Copy unchanged files in parallel (fast local operations)
  const copyLimit = 10;
  for (let i = 0; i < copyTasks.length; i += copyLimit) {
    const batch = copyTasks.slice(i, i + copyLimit);
    await Promise.all(batch.map(task => task()));
  }

  // Download new/changed files concurrently with a limit of 3 at a time
  const downloadLimit = 3;
  for (let i = 0; i < downloadTasks.length; i += downloadLimit) {
    const batch = downloadTasks.slice(i, i + downloadLimit);
    await Promise.all(batch.map(task => task()));
  }
}

/**
 * Copy an unchanged file from existing cache to new cache location
 */
async function copyExistingFile(
  did: string,
  site: string,
  filePath: string,
  dirSuffix: string,
  existingCacheDir: string
): Promise<void> {
  const { copyFile } = await import('fs/promises');

  const sourceFile = `${existingCacheDir}/${filePath}`;
  const destFile = `${CACHE_DIR}/${did}/${site}${dirSuffix}/${filePath}`;
  const destDir = destFile.substring(0, destFile.lastIndexOf('/'));

  // Create destination directory if needed
  if (destDir && !existsSync(destDir)) {
    mkdirSync(destDir, { recursive: true });
  }

  try {
    // Copy the file
    await copyFile(sourceFile, destFile);

    // Copy metadata file if it exists
    const sourceMetaFile = `${sourceFile}.meta`;
    const destMetaFile = `${destFile}.meta`;
    if (existsSync(sourceMetaFile)) {
      await copyFile(sourceMetaFile, destMetaFile);
    }

    console.log(`[Incremental] Copied unchanged file: ${filePath}`);
  } catch (err) {
    console.error(`[Incremental] Failed to copy file ${filePath}, will attempt download:`, err);
    throw err;
  }
}

async function cacheFileBlob(
  did: string,
  site: string,
  filePath: string,
  blobRef: any,
  pdsEndpoint: string,
  encoding?: 'gzip',
  mimeType?: string,
  base64?: boolean,
  dirSuffix: string = ''
): Promise<void> {
  const cid = extractBlobCid(blobRef);
  if (!cid) {
    console.error('Could not extract CID from blob', blobRef);
    return;
  }

  const blobUrl = `${pdsEndpoint}/xrpc/com.atproto.sync.getBlob?did=${encodeURIComponent(did)}&cid=${encodeURIComponent(cid)}`;

  // Allow up to 500MB per file blob, with 5 minute timeout
  let content = await safeFetchBlob(blobUrl, { maxSize: 500 * 1024 * 1024, timeout: 300000 });
  console.log(`[DEBUG] ${filePath}: fetched ${content.length} bytes, base64=${base64}, encoding=${encoding}, mimeType=${mimeType}`);

  // If content is base64-encoded, decode it back to raw binary (gzipped or not)
  if (base64) {
    const originalSize = content.length;
    // The blob stores base64 text as raw bytes: decode it to a string, then back to binary
    const textDecoder = new TextDecoder();
    const base64String = textDecoder.decode(content);
    content = Buffer.from(base64String, 'base64');
    console.log(`[DEBUG] ${filePath}: decoded base64 from ${originalSize} bytes to ${content.length} bytes`);

    // Check if it's actually gzipped by looking at magic bytes
    if (content.length >= 2) {
      const hasGzipMagic = content[0] === 0x1f && content[1] === 0x8b;
      console.log(`[DEBUG] ${filePath}: has gzip magic bytes: ${hasGzipMagic}`);
    }
  }

  const cacheFile = `${CACHE_DIR}/${did}/${site}${dirSuffix}/${filePath}`;
  const fileDir = cacheFile.substring(0, cacheFile.lastIndexOf('/'));

  if (fileDir && !existsSync(fileDir)) {
    mkdirSync(fileDir, { recursive: true });
  }

  // Use the shared function to determine if this should remain compressed
  const shouldStayCompressed = shouldCompressMimeType(mimeType);

  // Decompress files that shouldn't be stored compressed
  if (encoding === 'gzip' && !shouldStayCompressed && content.length >= 2 &&
      content[0] === 0x1f && content[1] === 0x8b) {
    console.log(`[DEBUG] ${filePath}: decompressing non-compressible type (${mimeType}) before caching`);
    try {
      const { gunzipSync } = await import('zlib');
      const decompressed = gunzipSync(content);
      console.log(`[DEBUG] ${filePath}: decompressed from ${content.length} to ${decompressed.length} bytes`);
      content = decompressed;
      // Clear the encoding flag since we're storing decompressed
      encoding = undefined;
    } catch (error) {
      console.log(`[DEBUG] ${filePath}: failed to decompress, storing original gzipped content. Error:`, error);
    }
  }

  await writeFile(cacheFile, content);

  // Store metadata only if file is still compressed
  if (encoding === 'gzip' && mimeType) {
    const metaFile = `${cacheFile}.meta`;
    await writeFile(metaFile, JSON.stringify({ encoding, mimeType }));
    console.log('Cached file', filePath, content.length, 'bytes (gzipped,', mimeType + ')');
  } else {
    console.log('Cached file', filePath, content.length, 'bytes');
  }
}

/**
 * Sanitize a file path to prevent directory traversal attacks.
 * Removes any path segments that attempt to go up directories.
 */
export function sanitizePath(filePath: string): string {
  // Remove leading slashes
  let cleaned = filePath.replace(/^\/+/, '');

  // Split into segments and filter out dangerous ones
  const segments = cleaned.split('/').filter(segment => {
    // Remove empty segments
    if (!segment || segment === '.') return false;
    // Remove parent directory references
    if (segment === '..') return false;
    // Remove segments with null bytes
    if (segment.includes('\0')) return false;
    return true;
  });

  // Rejoin the safe segments
  return segments.join('/');
}

export function getCachedFilePath(did: string, site: string, filePath: string): string {
  const sanitizedPath = sanitizePath(filePath);
  return `${CACHE_DIR}/${did}/${site}/${sanitizedPath}`;
}

export function isCached(did: string, site: string): boolean {
  return existsSync(`${CACHE_DIR}/${did}/${site}`);
}

async function saveCacheMetadata(did: string, rkey: string, recordCid: string, dirSuffix: string = '', fileCids?: Record<string, string>): Promise<void> {
  const metadata: CacheMetadata = {
    recordCid,
    cachedAt: Date.now(),
    did,
    rkey,
    fileCids
  };

  const metadataPath = `${CACHE_DIR}/${did}/${rkey}${dirSuffix}/.metadata.json`;
  const metadataDir = metadataPath.substring(0, metadataPath.lastIndexOf('/'));

  if (!existsSync(metadataDir)) {
    mkdirSync(metadataDir, { recursive: true });
  }

  await writeFile(metadataPath, JSON.stringify(metadata, null, 2));
}

async function getCacheMetadata(did: string, rkey: string): Promise<CacheMetadata | null> {
  try {
    const metadataPath = `${CACHE_DIR}/${did}/${rkey}/.metadata.json`;
    if (!existsSync(metadataPath)) return null;

    const content = await readFile(metadataPath, 'utf-8');
    return JSON.parse(content) as CacheMetadata;
  } catch (err) {
    console.error('Failed to read cache metadata', err);
    return null;
  }
}

export async function isCacheValid(did: string, rkey: string, currentRecordCid?: string): Promise<boolean> {
  const metadata = await getCacheMetadata(did, rkey);
  if (!metadata) return false;

  // Check if cache has expired (14 days TTL)
  const cacheAge = Date.now() - metadata.cachedAt;
  if (cacheAge > CACHE_TTL) {
    console.log('[Cache] Cache expired for', did, rkey);
    return false;
  }

  // If current CID is provided, verify it matches
  if (currentRecordCid && metadata.recordCid !== currentRecordCid) {
    console.log('[Cache] CID mismatch for', did, rkey, 'cached:', metadata.recordCid, 'current:', currentRecordCid);
    return false;
  }

  return true;
}
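For context, here is a rough sketch of how these exports might be wired together by a serving layer. It is not code from the repo: the import path, handle, and rkey are placeholders, and error handling is kept minimal.

// Sketch only: resolve an identity, validate the cache, and (re)download a site.
// The './cache' import path, handle, and rkey below are placeholders.
import {
  resolveDid,
  getPdsForDid,
  fetchSiteRecord,
  isCacheValid,
  downloadAndCacheSite,
  getCachedFilePath,
} from './cache';

async function ensureSiteCached(identifier: string, rkey: string): Promise<string | null> {
  const did = await resolveDid(identifier);
  if (!did) return null;

  const pdsEndpoint = await getPdsForDid(did);
  if (!pdsEndpoint) return null;

  // Fetch the current record so its CID can be compared against the cached one
  const site = await fetchSiteRecord(did, rkey);
  if (!site) return null;

  if (!(await isCacheValid(did, rkey, site.cid))) {
    await downloadAndCacheSite(did, rkey, site.record, pdsEndpoint, site.cid);
  }

  // Callers can then map request paths onto the on-disk cache
  return getCachedFilePath(did, rkey, 'index.html');
}

// Example invocation (placeholder identifier and rkey):
// const indexPath = await ensureSiteCached('example.bsky.social', 'my-site');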