package unfurl

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
	"time"

	"golang.org/x/net/html"
)

// oEmbedEndpoints maps supported provider domains to their oEmbed API endpoints
var oEmbedEndpoints = map[string]string{
	"streamable.com": "https://api.streamable.com/oembed",
	"youtube.com":    "https://www.youtube.com/oembed",
	"youtu.be":       "https://www.youtube.com/oembed",
	"reddit.com":     "https://www.reddit.com/oembed",
}
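
// For a URL on one of these domains, fetchOEmbed below issues a JSON request of the form
// (illustrative, with the target URL query-escaped):
//
//	https://www.youtube.com/oembed?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabc123&format=json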

// oEmbedResponse represents a standard oEmbed response
type oEmbedResponse struct {
	ThumbnailURL    string `json:"thumbnail_url"`
	Version         string `json:"version"`
	Title           string `json:"title"`
	AuthorName      string `json:"author_name"`
	ProviderName    string `json:"provider_name"`
	ProviderURL     string `json:"provider_url"`
	Type            string `json:"type"`
	HTML            string `json:"html"`
	Description     string `json:"description"`
	ThumbnailWidth  int    `json:"thumbnail_width"`
	ThumbnailHeight int    `json:"thumbnail_height"`
	Width           int    `json:"width"`
	Height          int    `json:"height"`
}
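
// A typical provider response decodes into this struct from JSON shaped roughly like the
// following (field values are illustrative, not taken from a real API call):
//
//	{
//	  "version": "1.0",
//	  "type": "video",
//	  "title": "Example video",
//	  "author_name": "Example Channel",
//	  "provider_name": "YouTube",
//	  "thumbnail_url": "https://i.ytimg.com/vi/abc123/hqdefault.jpg",
//	  "thumbnail_width": 480,
//	  "thumbnail_height": 360,
//	  "width": 200,
//	  "height": 113,
//	  "html": "<iframe ...></iframe>"
//	}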

// extractDomain extracts the host from a URL, without any port or "www." prefix
func extractDomain(urlStr string) string {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return ""
	}
	// Hostname() drops any port; lower-case so lookups against oEmbedEndpoints match
	domain := strings.TrimPrefix(strings.ToLower(parsed.Hostname()), "www.")
	return domain
}
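
// For example (illustrative inputs):
//
//	extractDomain("https://www.youtube.com/watch?v=abc123") // "youtube.com"
//	extractDomain("https://youtu.be/abc123")                // "youtu.be"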

// isSupported checks if this is a valid HTTP/HTTPS URL
func isSupported(urlStr string) bool {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	scheme := strings.ToLower(parsed.Scheme)
	return scheme == "http" || scheme == "https"
}

// isOEmbedProvider checks if we have an oEmbed endpoint for this URL
func isOEmbedProvider(urlStr string) bool {
	domain := extractDomain(urlStr)
	_, exists := oEmbedEndpoints[domain]
	return exists
}

// fetchOEmbed fetches oEmbed data from the provider
func fetchOEmbed(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*oEmbedResponse, error) {
	domain := extractDomain(urlStr)
	endpoint, exists := oEmbedEndpoints[domain]
	if !exists {
		return nil, fmt.Errorf("no oEmbed endpoint for domain: %s", domain)
	}

	// Build oEmbed request URL
	oembedURL := fmt.Sprintf("%s?url=%s&format=json", endpoint, url.QueryEscape(urlStr))

	// Create HTTP request
	req, err := http.NewRequestWithContext(ctx, "GET", oembedURL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create oEmbed request: %w", err)
	}

	req.Header.Set("User-Agent", userAgent)

	// Create HTTP client with timeout
	client := &http.Client{Timeout: timeout}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch oEmbed data: %w", err)
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("oEmbed endpoint returned status %d", resp.StatusCode)
	}

	// Parse JSON response
	var oembed oEmbedResponse
	if err := json.NewDecoder(resp.Body).Decode(&oembed); err != nil {
		return nil, fmt.Errorf("failed to parse oEmbed response: %w", err)
	}

	return &oembed, nil
}

// normalizeURL converts protocol-relative URLs to HTTPS
// Examples:
// "//example.com/image.jpg" -> "https://example.com/image.jpg"
// "https://example.com/image.jpg" -> "https://example.com/image.jpg" (unchanged)
func normalizeURL(urlStr string) string {
	if strings.HasPrefix(urlStr, "//") {
		return "https:" + urlStr
	}
	return urlStr
}

// mapOEmbedToResult converts oEmbed response to UnfurlResult
func mapOEmbedToResult(oembed *oEmbedResponse, originalURL string) *UnfurlResult {
	result := &UnfurlResult{
		URI:          originalURL,
		Title:        oembed.Title,
		Description:  oembed.Description,
		ThumbnailURL: normalizeURL(oembed.ThumbnailURL),
		Provider:     strings.ToLower(oembed.ProviderName),
		Domain:       extractDomain(originalURL),
		Width:        oembed.Width,
		Height:       oembed.Height,
	}

	// Map oEmbed type to our embedType
	switch oembed.Type {
	case "video":
		result.Type = "video"
	case "photo":
		result.Type = "image"
	default:
		result.Type = "article"
	}

	// If no description but we have author name, use that
	if result.Description == "" && oembed.AuthorName != "" {
		result.Description = fmt.Sprintf("By %s", oembed.AuthorName)
	}

	return result
}
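
// As an illustration (values assumed, not taken from a real response), a YouTube
// "video" payload for a youtube.com link maps roughly like this:
//
//	oEmbedResponse{Type: "video", Title: "Example video", ProviderName: "YouTube", Width: 200, Height: 113}
//	-> &UnfurlResult{Type: "video", Title: "Example video", Provider: "youtube", Domain: "youtube.com", Width: 200, Height: 113}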

// openGraphData represents OpenGraph metadata extracted from HTML
type openGraphData struct {
	Title       string
	Description string
	Image       string
	URL         string
}

// fetchOpenGraph fetches OpenGraph metadata from a URL
func fetchOpenGraph(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) {
	// Create HTTP request
	req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	req.Header.Set("User-Agent", userAgent)

	// Create HTTP client with timeout
	client := &http.Client{Timeout: timeout}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch URL: %w", err)
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP request returned status %d", resp.StatusCode)
	}

	// Read response body (limit to 10MB to prevent abuse)
	limitedReader := io.LimitReader(resp.Body, 10*1024*1024)
	body, err := io.ReadAll(limitedReader)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	// Parse OpenGraph metadata
	og, err := parseOpenGraph(string(body))
	if err != nil {
		return nil, fmt.Errorf("failed to parse OpenGraph metadata: %w", err)
	}

	// Build UnfurlResult
	result := &UnfurlResult{
		Type:         "article", // Default type for OpenGraph
		URI:          urlStr,
		Title:        og.Title,
		Description:  og.Description,
		ThumbnailURL: normalizeURL(og.Image),
		Provider:     "opengraph",
		Domain:       extractDomain(urlStr),
	}

	// Use og:url if available and it is a valid HTTP/HTTPS URL
	if og.URL != "" && isSupported(og.URL) {
		result.URI = og.URL
	}

	return result, nil
}
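
// exampleUnfurlFlow is a non-authoritative sketch (not wired into the package) of how the
// helpers above are intended to compose: prefer oEmbed when the domain has a known endpoint,
// otherwise fall back to generic OpenGraph scraping. The timeout and User-Agent values are
// illustrative assumptions, not the package's real defaults. kite.kagi.com links presumably
// go through fetchKagiKite (defined below) instead.
func exampleUnfurlFlow(ctx context.Context, urlStr string) (*UnfurlResult, error) {
	const timeout = 5 * time.Second                 // assumed value
	const userAgent = "unfurl-example/0.1 (sketch)" // assumed value

	if !isSupported(urlStr) {
		return nil, fmt.Errorf("unsupported URL scheme: %s", urlStr)
	}

	// Try the oEmbed path first for known providers.
	if isOEmbedProvider(urlStr) {
		if oembed, err := fetchOEmbed(ctx, urlStr, timeout, userAgent); err == nil {
			return mapOEmbedToResult(oembed, urlStr), nil
		}
		// On oEmbed failure, fall through to OpenGraph scraping.
	}

	return fetchOpenGraph(ctx, urlStr, timeout, userAgent)
}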

// parseOpenGraph extracts OpenGraph metadata from HTML
func parseOpenGraph(htmlContent string) (*openGraphData, error) {
	og := &openGraphData{}
	doc, err := html.Parse(strings.NewReader(htmlContent))
	if err != nil {
		// html.Parse is very tolerant of malformed markup; if it does fail,
		// return empty metadata rather than an error so callers can fall back gracefully
		return og, nil
	}

	// Extract OpenGraph tags and fallbacks
	var pageTitle string
	var metaDescription string

	var traverse func(*html.Node)
	traverse = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch n.Data {
			case "meta":
				property := getAttr(n, "property")
				name := getAttr(n, "name")
				content := getAttr(n, "content")

				// OpenGraph tags
				if strings.HasPrefix(property, "og:") {
					switch property {
					case "og:title":
						if og.Title == "" {
							og.Title = content
						}
					case "og:description":
						if og.Description == "" {
							og.Description = content
						}
					case "og:image":
						if og.Image == "" {
							og.Image = content
						}
					case "og:url":
						if og.URL == "" {
							og.URL = content
						}
					}
				}

				// Fallback meta tags
				if name == "description" && metaDescription == "" {
					metaDescription = content
				}

			case "title":
				if pageTitle == "" && n.FirstChild != nil {
					pageTitle = n.FirstChild.Data
				}
			}
		}

		for c := n.FirstChild; c != nil; c = c.NextSibling {
			traverse(c)
		}
	}

	traverse(doc)

	// Apply fallbacks
	if og.Title == "" {
		og.Title = pageTitle
	}
	if og.Description == "" {
		og.Description = metaDescription
	}

	return og, nil
}
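
// For instance, given markup like this illustrative snippet:
//
//	<head>
//	  <title>Fallback Title</title>
//	  <meta property="og:title" content="OG Title">
//	  <meta property="og:image" content="//cdn.example.com/pic.jpg">
//	  <meta name="description" content="Fallback description">
//	</head>
//
// parseOpenGraph returns Title "OG Title", Image "//cdn.example.com/pic.jpg", and
// Description "Fallback description" (taken from the name="description" fallback);
// the protocol-relative image is later normalized to https by normalizeURL.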

// getAttr gets an attribute value from an HTML node
func getAttr(n *html.Node, key string) string {
	for _, attr := range n.Attr {
		if attr.Key == key {
			return attr.Val
		}
	}
	return ""
}

// fetchKagiKite handles special unfurling for Kagi Kite news pages
// Kagi Kite pages use client-side rendering, so og:image tags aren't available at SSR time
// Instead, we parse the HTML to extract the story image from the page content
func fetchKagiKite(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) {
	// Create HTTP request
	req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	req.Header.Set("User-Agent", userAgent)

	// Create HTTP client with timeout
	client := &http.Client{Timeout: timeout}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch URL: %w", err)
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP request returned status %d", resp.StatusCode)
	}

	// Limit response size to 10MB
	limitedReader := io.LimitReader(resp.Body, 10*1024*1024)

	// Parse HTML
	doc, err := html.Parse(limitedReader)
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}

	result := &UnfurlResult{
		Type:     "article",
		URI:      urlStr,
		Domain:   "kite.kagi.com",
		Provider: "kagi",
	}

	// First try OpenGraph tags (in case they get added in the future)
	var findOG func(*html.Node)
	findOG = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "meta" {
			var property, content string
			for _, attr := range n.Attr {
				if attr.Key == "property" {
					property = attr.Val
				} else if attr.Key == "content" {
					content = attr.Val
				}
			}

			switch property {
			case "og:title":
				if result.Title == "" {
					result.Title = content
				}
			case "og:description":
				if result.Description == "" {
					result.Description = content
				}
			case "og:image":
				if result.ThumbnailURL == "" {
					result.ThumbnailURL = content
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			findOG(c)
		}
	}
	findOG(doc)

	// Fallback: Extract from page content
	// Look for images with kagiproxy.com URLs (Kagi's image proxy)
	// Note: Skip the first image as it's often a shared header/logo
	if result.ThumbnailURL == "" {
		var images []struct {
			url string
			alt string
		}

		var findImg func(*html.Node)
		findImg = func(n *html.Node) {
			if n.Type == html.ElementNode && n.Data == "img" {
				for _, attr := range n.Attr {
					if attr.Key == "src" && strings.Contains(attr.Val, "kagiproxy.com") {
						// Get alt text if available
						var altText string
						for _, a := range n.Attr {
							if a.Key == "alt" {
								altText = a.Val
								break
							}
						}
						images = append(images, struct {
							url string
							alt string
						}{url: attr.Val, alt: altText})
						break
					}
				}
			}
			for c := n.FirstChild; c != nil; c = c.NextSibling {
				findImg(c)
			}
		}
		findImg(doc)

		// Skip first image (often shared header/logo), use second if available
		if len(images) > 1 {
			result.ThumbnailURL = images[1].url
			if result.Description == "" && images[1].alt != "" {
				result.Description = images[1].alt
			}
		} else if len(images) == 1 {
			// Only one image found, use it
			result.ThumbnailURL = images[0].url
			if result.Description == "" && images[0].alt != "" {
				result.Description = images[0].alt
			}
		}
	}

	// Fallback to <title> tag if og:title not found
	if result.Title == "" {
		var findTitle func(*html.Node) string
		findTitle = func(n *html.Node) string {
			if n.Type == html.ElementNode && n.Data == "title" {
				if n.FirstChild != nil && n.FirstChild.Type == html.TextNode {
					return n.FirstChild.Data
				}
			}
			for c := n.FirstChild; c != nil; c = c.NextSibling {
				if title := findTitle(c); title != "" {
					return title
				}
			}
			return ""
		}
		result.Title = findTitle(doc)
	}

	// If still no image, return error
	if result.ThumbnailURL == "" {
		return nil, fmt.Errorf("no image found in Kagi page")
	}

	return result, nil
}
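
// Presumably callers route kite.kagi.com links here rather than through the generic
// OpenGraph path; a hedged usage sketch (URL, timeout, and User-Agent are illustrative):
//
//	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
//	defer cancel()
//	result, err := fetchKagiKite(ctx, "https://kite.kagi.com/...", 5*time.Second, "unfurl-example/0.1")
//	if err != nil {
//		// No usable image was found; fall back to fetchOpenGraph or give up.
//	}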