/**
* Safely rewrites absolute paths in HTML to be relative to a base path
* Only processes common HTML attributes and preserves external URLs, data URIs, etc.
*/
const REWRITABLE_ATTRIBUTES = [
'src',
'href',
'action',
'data',
'poster',
'srcset',
] as const;
/**
* Check if a path should be rewritten
*/
function shouldRewritePath(path: string): boolean {
// Don't rewrite empty paths
if (!path) return false;
// Don't rewrite external URLs (http://, https://, //)
if (path.startsWith('http://') || path.startsWith('https://') || path.startsWith('//')) {
return false;
}
// Don't rewrite data URIs or other schemes (except file paths)
if (path.includes(':') && !path.startsWith('./') && !path.startsWith('../')) {
return false;
}
// Don't rewrite pure anchors or paths that start with /#
if (path.startsWith('#') || path.startsWith('/#')) return false;
// Don't rewrite relative paths (./ or ../)
if (path.startsWith('./') || path.startsWith('../')) return false;
// Rewrite absolute paths (/)
return true;
}
/**
* Rewrite a single path
*/
function rewritePath(path: string, basePath: string): string {
if (!shouldRewritePath(path)) {
return path;
}
// Handle absolute paths: /file.js -> /base/file.js
if (path.startsWith('/')) {
return basePath + path.slice(1);
}
// At this point, only plain filenames without ./ or ../ prefix should reach here
// But since we're filtering those in shouldRewritePath, this shouldn't happen
return path;
}
/**
* Rewrite srcset attribute (can contain multiple URLs)
* Format: "url1 1x, url2 2x" or "url1 100w, url2 200w"
*/
function rewriteSrcset(srcset: string, basePath: string): string {
return srcset
.split(',')
.map(part => {
const trimmed = part.trim();
const spaceIndex = trimmed.indexOf(' ');
if (spaceIndex === -1) {
// No descriptor, just URL
return rewritePath(trimmed, basePath);
}
const url = trimmed.substring(0, spaceIndex);
const descriptor = trimmed.substring(spaceIndex);
return rewritePath(url, basePath) + descriptor;
})
.join(', ');
}
/**
* Rewrite absolute paths in HTML content
* Uses simple regex matching for safety (no full HTML parsing)
*/
export function rewriteHtmlPaths(html: string, basePath: string): string {
// Ensure base path ends with /
const normalizedBase = basePath.endsWith('/') ? basePath : basePath + '/';
let rewritten = html;
// Rewrite each attribute type
// Use more specific patterns to prevent ReDoS attacks
for (const attr of REWRITABLE_ATTRIBUTES) {
if (attr === 'srcset') {
// Special handling for srcset - use possessive quantifiers via atomic grouping simulation
// Limit whitespace to reasonable amount (max 5 spaces) to prevent ReDoS
const srcsetRegex = new RegExp(
`\\b${attr}[ \\t]{0,5}=[ \\t]{0,5}"([^"]*)"`,
'gi'
);
rewritten = rewritten.replace(srcsetRegex, (match, value) => {
const rewrittenValue = rewriteSrcset(value, normalizedBase);
return `${attr}="${rewrittenValue}"`;
});
} else {
// Regular attributes with quoted values
// Limit whitespace to prevent catastrophic backtracking
const doubleQuoteRegex = new RegExp(
`\\b${attr}[ \\t]{0,5}=[ \\t]{0,5}"([^"]*)"`,
'gi'
);
const singleQuoteRegex = new RegExp(
`\\b${attr}[ \\t]{0,5}=[ \\t]{0,5}'([^']*)'`,
'gi'
);
rewritten = rewritten.replace(doubleQuoteRegex, (match, value) => {
const rewrittenValue = rewritePath(value, normalizedBase);
return `${attr}="${rewrittenValue}"`;
});
rewritten = rewritten.replace(singleQuoteRegex, (match, value) => {
const rewrittenValue = rewritePath(value, normalizedBase);
return `${attr}='${rewrittenValue}'`;
});
}
}
return rewritten;
}
/**
* Check if content is HTML based on content or filename
*/
export function isHtmlContent(
filepath: string,
contentType?: string
): boolean {
if (contentType && contentType.includes('text/html')) {
return true;
}
const ext = filepath.toLowerCase().split('.').pop();
return ext === 'html' || ext === 'htm';
}