Monorepo for Wisp.place, a static site hosting service built on top of the AT Protocol.
import { AtpAgent } from '@atproto/api';
import type { Record as WispFsRecord, Directory, Entry, File } from '../lexicon/types/place/wisp/fs';
import { existsSync, mkdirSync, readFileSync, rmSync } from 'fs';
import { writeFile, readFile, rename } from 'fs/promises';
import { safeFetchJson, safeFetchBlob } from './safe-fetch';
import { CID } from 'multiformats';

const CACHE_DIR = process.env.CACHE_DIR || './cache/sites';
const CACHE_TTL = 14 * 24 * 60 * 60 * 1000; // 14 days cache TTL

interface CacheMetadata {
  recordCid: string;
  cachedAt: number;
  did: string;
  rkey: string;
}

/**
 * Determines if a MIME type should benefit from gzip compression.
 * Returns true for text-based web assets (HTML, CSS, JS, JSON, XML, SVG).
 * Returns false for already-compressed formats (images, video, audio, PDFs).
 */
export function shouldCompressMimeType(mimeType: string | undefined): boolean {
  if (!mimeType) return false;

  const mime = mimeType.toLowerCase();

  // Text-based web assets that benefit from compression
  const compressibleTypes = [
    'text/html',
    'text/css',
    'text/javascript',
    'application/javascript',
    'application/x-javascript',
    'text/xml',
    'application/xml',
    'application/json',
    'text/plain',
    'image/svg+xml',
  ];

  if (compressibleTypes.some(type => mime === type || mime.startsWith(type))) {
    return true;
  }

  // Already-compressed formats that should NOT be double-compressed
  const alreadyCompressedPrefixes = [
    'video/',
    'audio/',
    'image/',
    'application/pdf',
    'application/zip',
    'application/gzip',
  ];

  if (alreadyCompressedPrefixes.some(prefix => mime.startsWith(prefix))) {
    return false;
  }

  // Default to not compressing for unknown types
  return false;
}
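
// Illustrative sketch (not part of the original module): expected results of
// shouldCompressMimeType for a few common MIME types, shown as (input, result) pairs.
export function exampleCompressionDecisions(): Array<[string | undefined, boolean]> {
  return [
    ['text/html', shouldCompressMimeType('text/html')],               // true: text asset
    ['application/json', shouldCompressMimeType('application/json')], // true: text asset
    ['image/svg+xml', shouldCompressMimeType('image/svg+xml')],       // true: SVG is XML text
    ['image/png', shouldCompressMimeType('image/png')],               // false: already compressed
    ['font/woff2', shouldCompressMimeType('font/woff2')],             // false: unknown types default to false
    [undefined, shouldCompressMimeType(undefined)],                   // false: missing MIME type
  ];
}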

interface IpldLink {
  $link: string;
}

interface TypedBlobRef {
  ref: CID | IpldLink;
}

interface UntypedBlobRef {
  cid: string;
}

function isIpldLink(obj: unknown): obj is IpldLink {
  return typeof obj === 'object' && obj !== null && '$link' in obj && typeof (obj as IpldLink).$link === 'string';
}

function isTypedBlobRef(obj: unknown): obj is TypedBlobRef {
  return typeof obj === 'object' && obj !== null && 'ref' in obj;
}

function isUntypedBlobRef(obj: unknown): obj is UntypedBlobRef {
  return typeof obj === 'object' && obj !== null && 'cid' in obj && typeof (obj as UntypedBlobRef).cid === 'string';
}

export async function resolveDid(identifier: string): Promise<string | null> {
  try {
    // If it's already a DID, return it
    if (identifier.startsWith('did:')) {
      return identifier;
    }

    // Otherwise, resolve the handle using the agent's built-in method
    const agent = new AtpAgent({ service: 'https://public.api.bsky.app' });
    const response = await agent.resolveHandle({ handle: identifier });
    return response.data.did;
  } catch (err) {
    console.error('Failed to resolve identifier', identifier, err);
    return null;
  }
}

export async function getPdsForDid(did: string): Promise<string | null> {
  try {
    let doc;

    if (did.startsWith('did:plc:')) {
      doc = await safeFetchJson(`https://plc.directory/${encodeURIComponent(did)}`);
    } else if (did.startsWith('did:web:')) {
      const didUrl = didWebToHttps(did);
      doc = await safeFetchJson(didUrl);
    } else {
      console.error('Unsupported DID method', did);
      return null;
    }

    const services = doc.service || [];
    const pdsService = services.find((s: any) => s.id === '#atproto_pds');

    return pdsService?.serviceEndpoint || null;
  } catch (err) {
    console.error('Failed to get PDS for DID', did, err);
    return null;
  }
}
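
// Illustrative sketch (not part of the original module): resolving a handle or DID down
// to its PDS base URL by chaining resolveDid and getPdsForDid. The identifier passed by a
// caller (e.g. 'example.bsky.social') is a placeholder.
export async function examplePdsLookup(identifier: string): Promise<string | null> {
  const did = await resolveDid(identifier); // handle -> DID, or returned as-is if already a DID
  if (!did) return null;
  return getPdsForDid(did); // DID document -> '#atproto_pds' serviceEndpoint, or null
}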

function didWebToHttps(did: string): string {
  const didParts = did.split(':');
  if (didParts.length < 3 || didParts[0] !== 'did' || didParts[1] !== 'web') {
    throw new Error('Invalid did:web format');
  }

  const domain = didParts[2];
  const pathParts = didParts.slice(3);

  if (pathParts.length === 0) {
    return `https://${domain}/.well-known/did.json`;
  } else {
    const path = pathParts.join('/');
    return `https://${domain}/${path}/did.json`;
  }
}
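
// Illustrative sketch (not part of the original module): the two URL shapes didWebToHttps
// produces. The domain and path segments are placeholders.
export function exampleDidWebUrls(): string[] {
  return [
    didWebToHttps('did:web:example.com'),             // 'https://example.com/.well-known/did.json'
    didWebToHttps('did:web:example.com:users:alice'), // 'https://example.com/users/alice/did.json'
  ];
}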

export async function fetchSiteRecord(did: string, rkey: string): Promise<{ record: WispFsRecord; cid: string } | null> {
  try {
    const pdsEndpoint = await getPdsForDid(did);
    if (!pdsEndpoint) return null;

    const url = `${pdsEndpoint}/xrpc/com.atproto.repo.getRecord?repo=${encodeURIComponent(did)}&collection=place.wisp.fs&rkey=${encodeURIComponent(rkey)}`;
    const data = await safeFetchJson(url);

    return {
      record: data.value as WispFsRecord,
      cid: data.cid || ''
    };
  } catch (err) {
    console.error('Failed to fetch site record', did, rkey, err);
    return null;
  }
}

export function extractBlobCid(blobRef: unknown): string | null {
  if (isIpldLink(blobRef)) {
    return blobRef.$link;
  }

  if (isTypedBlobRef(blobRef)) {
    const ref = blobRef.ref;

    const cid = CID.asCID(ref);
    if (cid) {
      return cid.toString();
    }

    if (isIpldLink(ref)) {
      return ref.$link;
    }
  }

  if (isUntypedBlobRef(blobRef)) {
    return blobRef.cid;
  }

  return null;
}
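
// Illustrative sketch (not part of the original module): the blob-ref shapes extractBlobCid
// understands. The CID string is a placeholder example value.
export function exampleBlobRefShapes(): Array<string | null> {
  const placeholderCid = 'bafkreigh2akiscaildcqabsyg3dfr6chu3fgpregiymsck7e7aqa4s52zy';
  return [
    extractBlobCid({ $link: placeholderCid }),          // bare IPLD link object
    extractBlobCid({ ref: { $link: placeholderCid } }), // typed blob ref with a JSON-form ref
    extractBlobCid({ ref: CID.parse(placeholderCid) }), // typed blob ref with a parsed CID instance
    extractBlobCid({ cid: placeholderCid }),            // legacy untyped blob ref
    extractBlobCid('not a blob ref'),                   // null: unrecognised shape
  ];
}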

export async function downloadAndCacheSite(did: string, rkey: string, record: WispFsRecord, pdsEndpoint: string, recordCid: string): Promise<void> {
  console.log('Caching site', did, rkey);

  if (!record.root) {
    console.error('Record missing root directory:', JSON.stringify(record, null, 2));
    throw new Error('Invalid record structure: missing root directory');
  }

  if (!record.root.entries || !Array.isArray(record.root.entries)) {
    console.error('Record root missing entries array:', JSON.stringify(record.root, null, 2));
    throw new Error('Invalid record structure: root missing entries array');
  }

  // Use a temporary directory with timestamp to avoid collisions
  const tempSuffix = `.tmp-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`;
  const tempDir = `${CACHE_DIR}/${did}/${rkey}${tempSuffix}`;
  const finalDir = `${CACHE_DIR}/${did}/${rkey}`;

  try {
    // Download to temporary directory
    await cacheFiles(did, rkey, record.root.entries, pdsEndpoint, '', tempSuffix);
    await saveCacheMetadata(did, rkey, recordCid, tempSuffix);

    // Atomically replace old cache with new cache
    // On POSIX systems (Linux/macOS), rename is atomic
    if (existsSync(finalDir)) {
      // Rename old directory to backup
      const backupDir = `${finalDir}.old-${Date.now()}`;
      await rename(finalDir, backupDir);

      try {
        // Rename new directory to final location
        await rename(tempDir, finalDir);

        // Clean up old backup
        rmSync(backupDir, { recursive: true, force: true });
      } catch (err) {
        // If rename failed, restore backup
        if (existsSync(backupDir) && !existsSync(finalDir)) {
          await rename(backupDir, finalDir);
        }
        throw err;
      }
    } else {
      // No existing cache, just rename temp to final
      await rename(tempDir, finalDir);
    }

    console.log('Successfully cached site atomically', did, rkey);
  } catch (err) {
    // Clean up temp directory on failure
    if (existsSync(tempDir)) {
      rmSync(tempDir, { recursive: true, force: true });
    }
    throw err;
  }
}
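
// Illustrative sketch (not part of the original module): one way the functions above can be
// combined to (re)build the on-disk cache for a single site. Error handling is deliberately minimal.
export async function exampleCacheSite(did: string, rkey: string): Promise<boolean> {
  const pdsEndpoint = await getPdsForDid(did);
  if (!pdsEndpoint) return false;

  const fetched = await fetchSiteRecord(did, rkey);
  if (!fetched) return false;

  // Downloads every file blob into a temp directory, then swaps it into place atomically.
  await downloadAndCacheSite(did, rkey, fetched.record, pdsEndpoint, fetched.cid);
  return true;
}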

async function cacheFiles(
  did: string,
  site: string,
  entries: Entry[],
  pdsEndpoint: string,
  pathPrefix: string,
  dirSuffix: string = ''
): Promise<void> {
  // Collect all file blob download tasks first
  const downloadTasks: Array<() => Promise<void>> = [];

  function collectFileTasks(
    entries: Entry[],
    currentPathPrefix: string
  ) {
    for (const entry of entries) {
      const currentPath = currentPathPrefix ? `${currentPathPrefix}/${entry.name}` : entry.name;
      const node = entry.node;

      if ('type' in node && node.type === 'directory' && 'entries' in node) {
        collectFileTasks(node.entries, currentPath);
      } else if ('type' in node && node.type === 'file' && 'blob' in node) {
        const fileNode = node as File;
        downloadTasks.push(() => cacheFileBlob(
          did,
          site,
          currentPath,
          fileNode.blob,
          pdsEndpoint,
          fileNode.encoding,
          fileNode.mimeType,
          fileNode.base64,
          dirSuffix
        ));
      }
    }
  }

  collectFileTasks(entries, pathPrefix);

  // Execute downloads concurrently with a limit of 3 at a time
  const concurrencyLimit = 3;
  for (let i = 0; i < downloadTasks.length; i += concurrencyLimit) {
    const batch = downloadTasks.slice(i, i + concurrencyLimit);
    await Promise.all(batch.map(task => task()));
  }
}

async function cacheFileBlob(
  did: string,
  site: string,
  filePath: string,
  blobRef: any,
  pdsEndpoint: string,
  encoding?: 'gzip',
  mimeType?: string,
  base64?: boolean,
  dirSuffix: string = ''
): Promise<void> {
  const cid = extractBlobCid(blobRef);
  if (!cid) {
    console.error('Could not extract CID from blob', blobRef);
    return;
  }

  const blobUrl = `${pdsEndpoint}/xrpc/com.atproto.sync.getBlob?did=${encodeURIComponent(did)}&cid=${encodeURIComponent(cid)}`;

  // Allow up to 100MB per file blob, with 2 minute timeout
  let content = await safeFetchBlob(blobUrl, { maxSize: 100 * 1024 * 1024, timeout: 120000 });

  console.log(`[DEBUG] ${filePath}: fetched ${content.length} bytes, base64=${base64}, encoding=${encoding}, mimeType=${mimeType}`);

  // If content is base64-encoded, decode it back to raw binary (gzipped or not)
  if (base64) {
    const originalSize = content.length;
    // The blob stores base64 text as raw bytes: decode the bytes to a string,
    // then decode that base64 string back to binary (which may still be gzipped)
    const textDecoder = new TextDecoder();
    const base64String = textDecoder.decode(content);
    content = Buffer.from(base64String, 'base64');
    console.log(`[DEBUG] ${filePath}: decoded base64 from ${originalSize} bytes to ${content.length} bytes`);

    // Check if it's actually gzipped by looking at magic bytes
    if (content.length >= 2) {
      const hasGzipMagic = content[0] === 0x1f && content[1] === 0x8b;
      console.log(`[DEBUG] ${filePath}: has gzip magic bytes: ${hasGzipMagic}`);
    }
  }

  const cacheFile = `${CACHE_DIR}/${did}/${site}${dirSuffix}/${filePath}`;
  const fileDir = cacheFile.substring(0, cacheFile.lastIndexOf('/'));

  if (fileDir && !existsSync(fileDir)) {
    mkdirSync(fileDir, { recursive: true });
  }

  // Use the shared function to determine if this should remain compressed
  const shouldStayCompressed = shouldCompressMimeType(mimeType);

  // Decompress files that shouldn't be stored compressed
  if (encoding === 'gzip' && !shouldStayCompressed && content.length >= 2 &&
      content[0] === 0x1f && content[1] === 0x8b) {
    console.log(`[DEBUG] ${filePath}: decompressing non-compressible type (${mimeType}) before caching`);
    try {
      const { gunzipSync } = await import('zlib');
      const decompressed = gunzipSync(content);
      console.log(`[DEBUG] ${filePath}: decompressed from ${content.length} to ${decompressed.length} bytes`);
      content = decompressed;
      // Clear the encoding flag since we're storing decompressed
      encoding = undefined;
    } catch (error) {
      console.log(`[DEBUG] ${filePath}: failed to decompress, storing original gzipped content. Error:`, error);
    }
  }

  await writeFile(cacheFile, content);

  // Store metadata only if file is still compressed
  if (encoding === 'gzip' && mimeType) {
    const metaFile = `${cacheFile}.meta`;
    await writeFile(metaFile, JSON.stringify({ encoding, mimeType }));
    console.log('Cached file', filePath, content.length, 'bytes (gzipped,', mimeType + ')');
  } else {
    console.log('Cached file', filePath, content.length, 'bytes');
  }
}
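
// Illustrative sketch (not part of the original module): how a server might read a cached file
// back together with the '.meta' sidecar written by cacheFileBlob, so it can set Content-Type
// and Content-Encoding headers. The return shape here is an assumption, not part of this module.
export async function exampleReadCachedFile(
  did: string,
  site: string,
  filePath: string
): Promise<{ body: Buffer; encoding?: string; mimeType?: string } | null> {
  const cachedPath = getCachedFilePath(did, site, filePath);
  if (!existsSync(cachedPath)) return null;

  const body = await readFile(cachedPath);

  // The sidecar only exists for files that were stored gzipped (see cacheFileBlob above).
  const metaPath = `${cachedPath}.meta`;
  if (existsSync(metaPath)) {
    const meta = JSON.parse(await readFile(metaPath, 'utf-8')) as { encoding?: string; mimeType?: string };
    return { body, encoding: meta.encoding, mimeType: meta.mimeType };
  }
  return { body };
}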

/**
 * Sanitize a file path to prevent directory traversal attacks.
 * Removes any path segments that attempt to go up directories.
 */
export function sanitizePath(filePath: string): string {
  // Remove leading slashes
  let cleaned = filePath.replace(/^\/+/, '');

  // Split into segments and filter out dangerous ones
  const segments = cleaned.split('/').filter(segment => {
    // Remove empty segments and current-directory references
    if (!segment || segment === '.') return false;
    // Remove parent directory references
    if (segment === '..') return false;
    // Remove segments with null bytes
    if (segment.includes('\0')) return false;
    return true;
  });

  // Rejoin the safe segments
  return segments.join('/');
}
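
// Illustrative sketch (not part of the original module): what sanitizePath does to a few
// hostile or messy inputs. The paths are placeholders.
export function exampleSanitizedPaths(): string[] {
  return [
    sanitizePath('assets/style.css'), // 'assets/style.css' (unchanged)
    sanitizePath('/etc/passwd'),      // 'etc/passwd' (leading slash stripped)
    sanitizePath('../../secret.txt'), // 'secret.txt' ('..' segments dropped)
    sanitizePath('a/./b//c'),         // 'a/b/c' ('.' and empty segments dropped)
  ];
}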

export function getCachedFilePath(did: string, site: string, filePath: string): string {
  const sanitizedPath = sanitizePath(filePath);
  return `${CACHE_DIR}/${did}/${site}/${sanitizedPath}`;
}

export function isCached(did: string, site: string): boolean {
  return existsSync(`${CACHE_DIR}/${did}/${site}`);
}

async function saveCacheMetadata(did: string, rkey: string, recordCid: string, dirSuffix: string = ''): Promise<void> {
  const metadata: CacheMetadata = {
    recordCid,
    cachedAt: Date.now(),
    did,
    rkey
  };

  const metadataPath = `${CACHE_DIR}/${did}/${rkey}${dirSuffix}/.metadata.json`;
  const metadataDir = metadataPath.substring(0, metadataPath.lastIndexOf('/'));

  if (!existsSync(metadataDir)) {
    mkdirSync(metadataDir, { recursive: true });
  }

  await writeFile(metadataPath, JSON.stringify(metadata, null, 2));
}

async function getCacheMetadata(did: string, rkey: string): Promise<CacheMetadata | null> {
  try {
    const metadataPath = `${CACHE_DIR}/${did}/${rkey}/.metadata.json`;
    if (!existsSync(metadataPath)) return null;

    const content = await readFile(metadataPath, 'utf-8');
    return JSON.parse(content) as CacheMetadata;
  } catch (err) {
    console.error('Failed to read cache metadata', err);
    return null;
  }
}

export async function isCacheValid(did: string, rkey: string, currentRecordCid?: string): Promise<boolean> {
  const metadata = await getCacheMetadata(did, rkey);
  if (!metadata) return false;

  // Check if cache has expired (14 days TTL)
  const cacheAge = Date.now() - metadata.cachedAt;
  if (cacheAge > CACHE_TTL) {
    console.log('[Cache] Cache expired for', did, rkey);
    return false;
  }

  // If current CID is provided, verify it matches
  if (currentRecordCid && metadata.recordCid !== currentRecordCid) {
    console.log('[Cache] CID mismatch for', did, rkey, 'cached:', metadata.recordCid, 'current:', currentRecordCid);
    return false;
  }

  return true;
}
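
// Illustrative sketch (not part of the original module): deciding whether a cached copy can be
// served as-is or needs to be rebuilt. Passing a freshly fetched record CID also catches sites
// that were republished within the TTL window.
export async function exampleNeedsRefresh(did: string, rkey: string, currentRecordCid?: string): Promise<boolean> {
  if (!isCached(did, rkey)) return true;                              // nothing on disk yet
  const stillValid = await isCacheValid(did, rkey, currentRecordCid); // TTL + record CID check
  return !stillValid;                                                 // refresh when expired or CID changed
}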