// A Cloudflare Worker which works in conjunction with https://github.com/indexxing/bsky-alt-text
1import { Num, OpenAPIRoute } from "chanfana"; 2import { z } from "zod"; 3import { type AppContext } from "../types"; 4 5const systemInstructions = 6 `You will be provided with visual media (either a still image or a video file). Your task is to generate alternative text (alt-text) that describes the media's content and context. This alt-text is intended for use with screen reader technology, assisting individuals who are blind or visually impaired to understand the visual information. Adhere to the following guidelines strictly: 7 81. **Media Type Identification:** * Begin by identifying the type of media. For images, note if it is a "photograph", "painting", "illustration", "diagram", "screenshot", "comic panel", etc. For videos, simply describe the content directly without prefacing with "Video describing...". 9 102. **Content and Purpose:** 11 * Describe the visual content accurately and thoroughly. Explain the media in the context that it is presented. 12 * Convey the media's purpose. Why is this included? What information is it trying to present? What is the core message? 13 * Prioritize the most important information, placing it at the beginning of the alt-text. 14 * If the image serves a specific function (e.g., a button or a link), describe the function. Example: "Search button" or "Link to the homepage". 15 163. **Video-Specific Instructions:** 17 * For standard videos, describe the key visual elements, actions, scenes, and any text overlays that appear throughout the *duration* of the video playback. Focus on conveying the narrative or informational flow presented visually. Do *not* just describe a single frame or thumbnail. 18 * **For short, looping animations (like animated GIFs or silent WebM files):** Describe the *complete action* or the *entire sequence* shown in the loop. Even if brief, explain what happens from the beginning to the end of the animation cycle. 
For example, instead of "A cat looking up", describe "Video showing a cat repeatedly looking up, raising its head, and then lowering it again in a loop." 19 204. **Sequential Art (Comics/Webcomics):** 21 * For media containing sequential art like comic panels or webcomics, describe the narrative progression. Detail the actions, characters, settings, and dialogue/captions within each panel or across the sequence to tell the story visually represented. 22 235. **Text within the Media:** 24 * If the media contains text (e.g., signs, labels, captions, text overlays in videos), transcribe the text *verbatim* within the alt-text. Indicate that this is a direct quote by using quotation marks. Example: 'A sign that reads, "Proceed with Caution".' 25 * **Crucially**, if the media consists primarily of a large block of text (e.g., a screenshot of an article, a quote graphic, a presentation slide), you MUST transcribe the *entire* text content verbatim, up to a practical limit (e.g., 2000 characters). Accuracy and completeness of the text take precedence over brevity in these cases. 26 * For screenshots containing User Interface (UI) elements, transcribe essential text (button labels, input field values, key menu items). Exercise judgment to omit minor or redundant UI text (tooltips, decorative labels) that doesn't significantly contribute to understanding the core function or state shown. Example: "Screenshot of a software settings window. The 'Notifications' tab is active, showing a checkbox labeled \"Enable desktop alerts\" which is checked." 27 286. **Brevity and Clarity:** 29 * Keep descriptions concise *except* when transcribing significant amounts of text or describing sequential narratives (comics, videos), where clarity and completeness are more important. Aim for under 150 characters for simple images where possible. 30 * Use clear, simple language. Avoid jargon unless it's part of transcribed text or essential to the meaning. 
31 * Use proper grammar, punctuation, and capitalization. End sentences with a period. 32 337. **Notable Individuals:** 34 * If the media features recognizable people, identify them by name. If their role or title is relevant, include that too. Example: "Photograph of Dr. Jane Goodall observing chimpanzees." 35 368. **Inappropriate or Sensitive Content:** 37 * If the media depicts potentially sensitive, offensive, or harmful content, maintain a professional, objective, and clinical tone. 38 * Describe the factual visual content accurately but avoid graphic or sensationalized language. Aim for a descriptive level appropriate for a general audience (e.g., PG-13). 39 409. **Output Format:** 41 * Provide *only* the descriptive alt-text. Do *not* include introductory phrases (e.g., "The image shows...", "Alt-text:"), conversational filler, or follow-up statements. Output *just* the description. 42 4310. **Do Not's:** 44 * Do not begin descriptions with generic phrases like "Image of...", "Video of...", etc., unless specifying the type as in Guideline 1. 45 * Do not add external information, interpretations, or assumptions not directly represented in the visual media itself. 
46 47By consistently applying these guidelines, you will create alt-text that is informative, accurate, concise where appropriate, and genuinely helpful for users of assistive technology across different types of visual media.`; 48 49export class GenerateEndpoint extends OpenAPIRoute { 50 schema = { 51 tags: ["Image"], 52 summary: "Generates alt text for a given image.", 53 security: [ 54 { 55 bearerAuth: [], 56 }, 57 ], 58 request: { 59 body: { 60 content: { 61 "application/json": { 62 schema: z.object({ 63 base64Data: z.string({ 64 description: "The base64 encoded image data.", 65 required_error: 66 "Image data (base64Data) is required.", 67 }).min(1, "Image data cannot be empty."), 68 mimeType: z.string({ 69 description: 70 "The MIME type of the image (e.g., 'image/jpeg', 'image/png').", 71 required_error: "MIME type is required.", 72 }).regex( 73 /^image\/(jpeg|png|gif|webp|bmp|svg\+xml)$/, 74 "Invalid image MIME type.", 75 ), 76 }), 77 }, 78 }, 79 }, 80 }, 81 responses: { 82 "200": { 83 description: "Returns the generated alt text.", 84 content: { 85 "application/json": { 86 schema: z.object({ 87 success: z.boolean().default(true), 88 altText: z.string().nullable(), 89 error: z.string().optional(), 90 }), 91 }, 92 }, 93 }, 94 "400": { 95 description: "Bad Request - Invalid input or missing fields.", 96 content: { 97 "application/json": { 98 schema: z.object({ 99 success: z.boolean().default(false), 100 error: z.string(), 101 }), 102 }, 103 }, 104 }, 105 "403": { 106 description: "Forbidden - Origin not allowed.", 107 content: { 108 "application/json": { 109 schema: z.object({ 110 success: z.boolean().default(false), 111 error: z.string(), 112 }), 113 }, 114 }, 115 }, 116 "500": { 117 description: 118 "Internal Server Error - Issue with Cloud Function or AI API call.", 119 content: { 120 "application/json": { 121 schema: z.object({ 122 success: z.boolean().default(false), 123 error: z.string(), 124 }), 125 }, 126 }, 127 }, 128 "502": { 129 description: "Bad 
Gateway - AI API failed.", 130 content: { 131 "application/json": { 132 schema: z.object({ 133 success: z.boolean().default(false), 134 error: z.string(), 135 }), 136 }, 137 }, 138 }, 139 }, 140 }; 141 142 async handle(c: AppContext) { 143 const data = await this.getValidatedData<typeof this.schema>(); 144 const { base64Data, mimeType } = data.body; 145 146 try { 147 const res = await c.var.gemini.models.generateContent({ 148 model: c.env.GEMINI_MODEL, 149 contents: [ 150 { text: systemInstructions }, 151 { inlineData: { mimeType: mimeType, data: base64Data } }, 152 ], 153 config: { 154 temperature: c.env.GEMINI_GENERATE_TEMPERATURE, 155 maxOutputTokens: c.env.GEMINI_GENERATE_MAX_OUTPUT_TOKENS, 156 topP: c.env.GEMINI_GENERATE_TOP_P, 157 topK: c.env.GEMINI_GENERATE_TOP_K, 158 }, 159 }); 160 161 const generatedText = res.candidates?.[0]?.content?.parts?.[0] 162 ?.text; 163 if (!generatedText) { 164 return { 165 success: false, 166 error: "Failed to generate text.", 167 }; 168 } 169 170 return { 171 success: true, 172 text: generatedText, 173 tokens: res.usageMetadata.totalTokenCount ?? 0, 174 }; 175 } catch (e) { 176 return { 177 success: false, 178 error: e, 179 }; 180 } 181 } 182}