A community-based topic aggregation platform built on atproto
1package unfurl
2
3import (
4 "context"
5 "encoding/json"
6 "fmt"
7 "io"
8 "net/http"
9 "net/url"
10 "strings"
11 "time"
12
13 "golang.org/x/net/html"
14)
15
// oEmbedEndpoints is the provider table: it maps a host name, as produced by
// extractDomain (no "www." prefix), to the oEmbed endpoint URL that should be
// queried for links on that host.
var oEmbedEndpoints = map[string]string{
	// YouTube's long and short link hosts share a single endpoint.
	"youtube.com": "https://www.youtube.com/oembed",
	"youtu.be":    "https://www.youtube.com/oembed",

	"streamable.com": "https://api.streamable.com/oembed",
	"reddit.com":     "https://www.reddit.com/oembed",
}
23
// oEmbedResponse holds the fields decoded from the JSON document returned by
// an oEmbed endpoint. Each field is bound to its snake_case JSON key through
// the struct tag.
type oEmbedResponse struct {
	// String-valued properties.
	ThumbnailURL string `json:"thumbnail_url"` // preview image location
	Version      string `json:"version"`       // value of the "version" key
	Title        string `json:"title"`         // title of the embedded resource
	AuthorName   string `json:"author_name"`   // author/owner of the resource
	ProviderName string `json:"provider_name"` // name of the serving provider
	ProviderURL  string `json:"provider_url"`  // URL of the serving provider
	Type         string `json:"type"`          // resource type, e.g. "video" or "photo"
	HTML         string `json:"html"`          // embed markup supplied by the provider
	Description  string `json:"description"`   // description text, when supplied

	// Integer-valued dimensions.
	ThumbnailWidth  int `json:"thumbnail_width"`
	ThumbnailHeight int `json:"thumbnail_height"`
	Width           int `json:"width"`
	Height          int `json:"height"`
}
40
// extractDomain returns the normalized host of urlStr: lower-cased, without a
// port, and without a leading "www.". The result is what the provider table
// is keyed by. It returns "" when urlStr cannot be parsed or has no host.
func extractDomain(urlStr string) string {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return ""
	}
	// Hostname (unlike Host) drops any ":port" suffix. Host names are
	// case-insensitive, so fold case before stripping the prefix; otherwise
	// "WWW.YouTube.com" or "youtu.be:443" would never match a provider.
	host := strings.ToLower(parsed.Hostname())
	return strings.TrimPrefix(host, "www.")
}
51
// isSupported reports whether urlStr parses as a URL whose scheme is http or
// https (compared case-insensitively). Unparseable input is not supported.
func isSupported(urlStr string) bool {
	u, parseErr := url.Parse(urlStr)
	if parseErr != nil {
		return false
	}
	switch strings.ToLower(u.Scheme) {
	case "http", "https":
		return true
	default:
		return false
	}
}
61
62// isOEmbedProvider checks if we have an oEmbed endpoint for this URL
63func isOEmbedProvider(urlStr string) bool {
64 domain := extractDomain(urlStr)
65 _, exists := oEmbedEndpoints[domain]
66 return exists
67}
68
69// fetchOEmbed fetches oEmbed data from the provider
70func fetchOEmbed(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*oEmbedResponse, error) {
71 domain := extractDomain(urlStr)
72 endpoint, exists := oEmbedEndpoints[domain]
73 if !exists {
74 return nil, fmt.Errorf("no oEmbed endpoint for domain: %s", domain)
75 }
76
77 // Build oEmbed request URL
78 oembedURL := fmt.Sprintf("%s?url=%s&format=json", endpoint, url.QueryEscape(urlStr))
79
80 // Create HTTP request
81 req, err := http.NewRequestWithContext(ctx, "GET", oembedURL, nil)
82 if err != nil {
83 return nil, fmt.Errorf("failed to create oEmbed request: %w", err)
84 }
85
86 req.Header.Set("User-Agent", userAgent)
87
88 // Create HTTP client with timeout
89 client := &http.Client{Timeout: timeout}
90 resp, err := client.Do(req)
91 if err != nil {
92 return nil, fmt.Errorf("failed to fetch oEmbed data: %w", err)
93 }
94 defer func() { _ = resp.Body.Close() }()
95
96 if resp.StatusCode != http.StatusOK {
97 return nil, fmt.Errorf("oEmbed endpoint returned status %d", resp.StatusCode)
98 }
99
100 // Parse JSON response
101 var oembed oEmbedResponse
102 if err := json.NewDecoder(resp.Body).Decode(&oembed); err != nil {
103 return nil, fmt.Errorf("failed to parse oEmbed response: %w", err)
104 }
105
106 return &oembed, nil
107}
108
// normalizeURL turns a protocol-relative URL (one starting with "//") into an
// absolute HTTPS URL. Every other input is returned untouched.
//
//	"//example.com/image.jpg"       -> "https://example.com/image.jpg"
//	"https://example.com/image.jpg" -> "https://example.com/image.jpg"
func normalizeURL(urlStr string) string {
	if !strings.HasPrefix(urlStr, "//") {
		return urlStr
	}
	return "https:" + urlStr
}
120
121// mapOEmbedToResult converts oEmbed response to UnfurlResult
122func mapOEmbedToResult(oembed *oEmbedResponse, originalURL string) *UnfurlResult {
123 result := &UnfurlResult{
124 URI: originalURL,
125 Title: oembed.Title,
126 Description: oembed.Description,
127 ThumbnailURL: normalizeURL(oembed.ThumbnailURL),
128 Provider: strings.ToLower(oembed.ProviderName),
129 Domain: extractDomain(originalURL),
130 Width: oembed.Width,
131 Height: oembed.Height,
132 }
133
134 // Map oEmbed type to our embedType
135 switch oembed.Type {
136 case "video":
137 result.Type = "video"
138 case "photo":
139 result.Type = "image"
140 default:
141 result.Type = "article"
142 }
143
144 // If no description but we have author name, use that
145 if result.Description == "" && oembed.AuthorName != "" {
146 result.Description = fmt.Sprintf("By %s", oembed.AuthorName)
147 }
148
149 return result
150}
151
// openGraphData collects the OpenGraph values pulled out of an HTML document
// by parseOpenGraph.
type openGraphData struct {
	Title       string // og:title, or the <title> text as a fallback
	Description string // og:description, or the description meta tag as a fallback
	Image       string // og:image
	URL         string // og:url
}
159
160// fetchOpenGraph fetches OpenGraph metadata from a URL
161func fetchOpenGraph(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) {
162 // Create HTTP request
163 req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
164 if err != nil {
165 return nil, fmt.Errorf("failed to create request: %w", err)
166 }
167
168 req.Header.Set("User-Agent", userAgent)
169
170 // Create HTTP client with timeout
171 client := &http.Client{Timeout: timeout}
172 resp, err := client.Do(req)
173 if err != nil {
174 return nil, fmt.Errorf("failed to fetch URL: %w", err)
175 }
176 defer func() { _ = resp.Body.Close() }()
177
178 if resp.StatusCode != http.StatusOK {
179 return nil, fmt.Errorf("HTTP request returned status %d", resp.StatusCode)
180 }
181
182 // Read response body (limit to 10MB to prevent abuse)
183 limitedReader := io.LimitReader(resp.Body, 10*1024*1024)
184 body, err := io.ReadAll(limitedReader)
185 if err != nil {
186 return nil, fmt.Errorf("failed to read response body: %w", err)
187 }
188
189 // Parse OpenGraph metadata
190 og, err := parseOpenGraph(string(body))
191 if err != nil {
192 return nil, fmt.Errorf("failed to parse OpenGraph metadata: %w", err)
193 }
194
195 // Build UnfurlResult
196 result := &UnfurlResult{
197 Type: "article", // Default type for OpenGraph
198 URI: urlStr,
199 Title: og.Title,
200 Description: og.Description,
201 ThumbnailURL: normalizeURL(og.Image),
202 Provider: "opengraph",
203 Domain: extractDomain(urlStr),
204 }
205
206 // Use og:url if available and valid
207 if og.URL != "" {
208 result.URI = og.URL
209 }
210
211 return result, nil
212}
213
214// parseOpenGraph extracts OpenGraph metadata from HTML
215func parseOpenGraph(htmlContent string) (*openGraphData, error) {
216 og := &openGraphData{}
217 doc, err := html.Parse(strings.NewReader(htmlContent))
218 if err != nil {
219 // Try best-effort parsing even with invalid HTML
220 return og, nil
221 }
222
223 // Extract OpenGraph tags and fallbacks
224 var pageTitle string
225 var metaDescription string
226
227 var traverse func(*html.Node)
228 traverse = func(n *html.Node) {
229 if n.Type == html.ElementNode {
230 switch n.Data {
231 case "meta":
232 property := getAttr(n, "property")
233 name := getAttr(n, "name")
234 content := getAttr(n, "content")
235
236 // OpenGraph tags
237 if strings.HasPrefix(property, "og:") {
238 switch property {
239 case "og:title":
240 if og.Title == "" {
241 og.Title = content
242 }
243 case "og:description":
244 if og.Description == "" {
245 og.Description = content
246 }
247 case "og:image":
248 if og.Image == "" {
249 og.Image = content
250 }
251 case "og:url":
252 if og.URL == "" {
253 og.URL = content
254 }
255 }
256 }
257
258 // Fallback meta tags
259 if name == "description" && metaDescription == "" {
260 metaDescription = content
261 }
262
263 case "title":
264 if pageTitle == "" && n.FirstChild != nil {
265 pageTitle = n.FirstChild.Data
266 }
267 }
268 }
269
270 for c := n.FirstChild; c != nil; c = c.NextSibling {
271 traverse(c)
272 }
273 }
274
275 traverse(doc)
276
277 // Apply fallbacks
278 if og.Title == "" {
279 og.Title = pageTitle
280 }
281 if og.Description == "" {
282 og.Description = metaDescription
283 }
284
285 return og, nil
286}
287
288// getAttr gets an attribute value from an HTML node
289func getAttr(n *html.Node, key string) string {
290 for _, attr := range n.Attr {
291 if attr.Key == key {
292 return attr.Val
293 }
294 }
295 return ""
296}
297
298// fetchKagiKite handles special unfurling for Kagi Kite news pages
299// Kagi Kite pages use client-side rendering, so og:image tags aren't available at SSR time
300// Instead, we parse the HTML to extract the story image from the page content
301func fetchKagiKite(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) {
302 // Create HTTP request
303 req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
304 if err != nil {
305 return nil, fmt.Errorf("failed to create request: %w", err)
306 }
307
308 req.Header.Set("User-Agent", userAgent)
309
310 // Create HTTP client with timeout
311 client := &http.Client{Timeout: timeout}
312 resp, err := client.Do(req)
313 if err != nil {
314 return nil, fmt.Errorf("failed to fetch URL: %w", err)
315 }
316 defer func() { _ = resp.Body.Close() }()
317
318 if resp.StatusCode != http.StatusOK {
319 return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status)
320 }
321
322 // Limit response size to 10MB
323 limitedReader := io.LimitReader(resp.Body, 10*1024*1024)
324
325 // Parse HTML
326 doc, err := html.Parse(limitedReader)
327 if err != nil {
328 return nil, fmt.Errorf("failed to parse HTML: %w", err)
329 }
330
331 result := &UnfurlResult{
332 Type: "article",
333 URI: urlStr,
334 Domain: "kite.kagi.com",
335 Provider: "kagi",
336 }
337
338 // First try OpenGraph tags (in case they get added in the future)
339 var findOG func(*html.Node)
340 findOG = func(n *html.Node) {
341 if n.Type == html.ElementNode && n.Data == "meta" {
342 var property, content string
343 for _, attr := range n.Attr {
344 if attr.Key == "property" {
345 property = attr.Val
346 } else if attr.Key == "content" {
347 content = attr.Val
348 }
349 }
350
351 switch property {
352 case "og:title":
353 if result.Title == "" {
354 result.Title = content
355 }
356 case "og:description":
357 if result.Description == "" {
358 result.Description = content
359 }
360 case "og:image":
361 if result.ThumbnailURL == "" {
362 result.ThumbnailURL = content
363 }
364 }
365 }
366 for c := n.FirstChild; c != nil; c = c.NextSibling {
367 findOG(c)
368 }
369 }
370 findOG(doc)
371
372 // Fallback: Extract from page content
373 // Look for images with kagiproxy.com URLs (Kagi's image proxy)
374 // Note: Skip the first image as it's often a shared header/logo
375 if result.ThumbnailURL == "" {
376 var images []struct {
377 url string
378 alt string
379 }
380
381 var findImg func(*html.Node)
382 findImg = func(n *html.Node) {
383 if n.Type == html.ElementNode && n.Data == "img" {
384 for _, attr := range n.Attr {
385 if attr.Key == "src" && strings.Contains(attr.Val, "kagiproxy.com") {
386 // Get alt text if available
387 var altText string
388 for _, a := range n.Attr {
389 if a.Key == "alt" {
390 altText = a.Val
391 break
392 }
393 }
394 images = append(images, struct {
395 url string
396 alt string
397 }{url: attr.Val, alt: altText})
398 break
399 }
400 }
401 }
402 for c := n.FirstChild; c != nil; c = c.NextSibling {
403 findImg(c)
404 }
405 }
406 findImg(doc)
407
408 // Skip first image (often shared header/logo), use second if available
409 if len(images) > 1 {
410 result.ThumbnailURL = images[1].url
411 if result.Description == "" && images[1].alt != "" {
412 result.Description = images[1].alt
413 }
414 } else if len(images) == 1 {
415 // Only one image found, use it
416 result.ThumbnailURL = images[0].url
417 if result.Description == "" && images[0].alt != "" {
418 result.Description = images[0].alt
419 }
420 }
421 }
422
423 // Fallback to <title> tag if og:title not found
424 if result.Title == "" {
425 var findTitle func(*html.Node) string
426 findTitle = func(n *html.Node) string {
427 if n.Type == html.ElementNode && n.Data == "title" {
428 if n.FirstChild != nil && n.FirstChild.Type == html.TextNode {
429 return n.FirstChild.Data
430 }
431 }
432 for c := n.FirstChild; c != nil; c = c.NextSibling {
433 if title := findTitle(c); title != "" {
434 return title
435 }
436 }
437 return ""
438 }
439 result.Title = findTitle(doc)
440 }
441
442 // If still no image, return error
443 if result.ThumbnailURL == "" {
444 return nil, fmt.Errorf("no image found in Kagi page")
445 }
446
447 return result, nil
448}