A community-based topic aggregation platform built on atproto
1package unfurl
2
3import (
4 "context"
5 "encoding/json"
6 "fmt"
7 "io"
8 "net/http"
9 "net/url"
10 "strings"
11 "time"
12
13 "golang.org/x/net/html"
14)
15
// oEmbedEndpoints maps a bare domain (as produced by extractDomain) to the
// oEmbed API endpoint that serves metadata for URLs on that domain.
var oEmbedEndpoints = map[string]string{
	// Reddit
	"reddit.com": "https://www.reddit.com/oembed",

	// Streamable
	"streamable.com": "https://api.streamable.com/oembed",

	// YouTube: the short-link domain shares the main endpoint.
	"youtu.be":    "https://www.youtube.com/oembed",
	"youtube.com": "https://www.youtube.com/oembed",
}
23
// oEmbedResponse is the subset of an oEmbed JSON response that this package
// decodes. Fields that are absent from the payload keep their zero value.
type oEmbedResponse struct {
	// ThumbnailURL is decoded from "thumbnail_url".
	ThumbnailURL string `json:"thumbnail_url"`
	// Version is decoded from "version".
	Version string `json:"version"`
	// Title is decoded from "title".
	Title string `json:"title"`
	// AuthorName is decoded from "author_name".
	AuthorName string `json:"author_name"`
	// ProviderName is decoded from "provider_name".
	ProviderName string `json:"provider_name"`
	// ProviderURL is decoded from "provider_url".
	ProviderURL string `json:"provider_url"`
	// Type is decoded from "type" (for example "video" or "photo").
	Type string `json:"type"`
	// HTML is decoded from "html".
	HTML string `json:"html"`
	// Description is decoded from "description".
	Description string `json:"description"`

	// ThumbnailWidth is decoded from "thumbnail_width".
	ThumbnailWidth int `json:"thumbnail_width"`
	// ThumbnailHeight is decoded from "thumbnail_height".
	ThumbnailHeight int `json:"thumbnail_height"`
	// Width is decoded from "width".
	Width int `json:"width"`
	// Height is decoded from "height".
	Height int `json:"height"`
}
40
// extractDomain returns the normalized host name of urlStr.
//
// The result is lowercased, has any explicit port removed (and the brackets
// of an IPv6 literal stripped), and has a single leading "www." trimmed, so
// that "https://WWW.YouTube.com:443/watch" yields "youtube.com".
// It returns "" when urlStr cannot be parsed or carries no host.
func extractDomain(urlStr string) string {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return ""
	}

	// Hostname (unlike Host) drops the ":port" suffix; host names are
	// case-insensitive, so normalize the case before comparing or trimming.
	host := strings.ToLower(parsed.Hostname())
	return strings.TrimPrefix(host, "www.")
}
51
// isSupported reports whether urlStr parses as a URL whose scheme is http or
// https (compared case-insensitively). Unparseable input yields false.
func isSupported(urlStr string) bool {
	u, err := url.Parse(urlStr)
	if err != nil {
		return false
	}

	switch strings.ToLower(u.Scheme) {
	case "http", "https":
		return true
	default:
		return false
	}
}
61
62// isOEmbedProvider checks if we have an oEmbed endpoint for this URL
63func isOEmbedProvider(urlStr string) bool {
64 domain := extractDomain(urlStr)
65 _, exists := oEmbedEndpoints[domain]
66 return exists
67}
68
69// fetchOEmbed fetches oEmbed data from the provider
70func fetchOEmbed(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*oEmbedResponse, error) {
71 domain := extractDomain(urlStr)
72 endpoint, exists := oEmbedEndpoints[domain]
73 if !exists {
74 return nil, fmt.Errorf("no oEmbed endpoint for domain: %s", domain)
75 }
76
77 // Build oEmbed request URL
78 oembedURL := fmt.Sprintf("%s?url=%s&format=json", endpoint, url.QueryEscape(urlStr))
79
80 // Create HTTP request
81 req, err := http.NewRequestWithContext(ctx, "GET", oembedURL, nil)
82 if err != nil {
83 return nil, fmt.Errorf("failed to create oEmbed request: %w", err)
84 }
85
86 req.Header.Set("User-Agent", userAgent)
87
88 // Create HTTP client with timeout
89 client := &http.Client{Timeout: timeout}
90 resp, err := client.Do(req)
91 if err != nil {
92 return nil, fmt.Errorf("failed to fetch oEmbed data: %w", err)
93 }
94 defer func() { _ = resp.Body.Close() }()
95
96 if resp.StatusCode != http.StatusOK {
97 return nil, fmt.Errorf("oEmbed endpoint returned status %d", resp.StatusCode)
98 }
99
100 // Parse JSON response
101 var oembed oEmbedResponse
102 if err := json.NewDecoder(resp.Body).Decode(&oembed); err != nil {
103 return nil, fmt.Errorf("failed to parse oEmbed response: %w", err)
104 }
105
106 return &oembed, nil
107}
108
109// mapOEmbedToResult converts oEmbed response to UnfurlResult
110func mapOEmbedToResult(oembed *oEmbedResponse, originalURL string) *UnfurlResult {
111 result := &UnfurlResult{
112 URI: originalURL,
113 Title: oembed.Title,
114 Description: oembed.Description,
115 ThumbnailURL: oembed.ThumbnailURL,
116 Provider: strings.ToLower(oembed.ProviderName),
117 Domain: extractDomain(originalURL),
118 Width: oembed.Width,
119 Height: oembed.Height,
120 }
121
122 // Map oEmbed type to our embedType
123 switch oembed.Type {
124 case "video":
125 result.Type = "video"
126 case "photo":
127 result.Type = "image"
128 default:
129 result.Type = "article"
130 }
131
132 // If no description but we have author name, use that
133 if result.Description == "" && oembed.AuthorName != "" {
134 result.Description = fmt.Sprintf("By %s", oembed.AuthorName)
135 }
136
137 return result
138}
139
// openGraphData holds the page metadata collected by parseOpenGraph.
type openGraphData struct {
	// Title is og:title, or the <title> text when og:title is absent.
	// Description is og:description, or the "description" meta tag when
	// og:description is absent.
	Title, Description string

	// Image is og:image and URL is og:url; neither has a fallback.
	Image, URL string
}
147
148// fetchOpenGraph fetches OpenGraph metadata from a URL
149func fetchOpenGraph(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) {
150 // Create HTTP request
151 req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
152 if err != nil {
153 return nil, fmt.Errorf("failed to create request: %w", err)
154 }
155
156 req.Header.Set("User-Agent", userAgent)
157
158 // Create HTTP client with timeout
159 client := &http.Client{Timeout: timeout}
160 resp, err := client.Do(req)
161 if err != nil {
162 return nil, fmt.Errorf("failed to fetch URL: %w", err)
163 }
164 defer func() { _ = resp.Body.Close() }()
165
166 if resp.StatusCode != http.StatusOK {
167 return nil, fmt.Errorf("HTTP request returned status %d", resp.StatusCode)
168 }
169
170 // Read response body (limit to 10MB to prevent abuse)
171 limitedReader := io.LimitReader(resp.Body, 10*1024*1024)
172 body, err := io.ReadAll(limitedReader)
173 if err != nil {
174 return nil, fmt.Errorf("failed to read response body: %w", err)
175 }
176
177 // Parse OpenGraph metadata
178 og, err := parseOpenGraph(string(body))
179 if err != nil {
180 return nil, fmt.Errorf("failed to parse OpenGraph metadata: %w", err)
181 }
182
183 // Build UnfurlResult
184 result := &UnfurlResult{
185 Type: "article", // Default type for OpenGraph
186 URI: urlStr,
187 Title: og.Title,
188 Description: og.Description,
189 ThumbnailURL: og.Image,
190 Provider: "opengraph",
191 Domain: extractDomain(urlStr),
192 }
193
194 // Use og:url if available and valid
195 if og.URL != "" {
196 result.URI = og.URL
197 }
198
199 return result, nil
200}
201
202// parseOpenGraph extracts OpenGraph metadata from HTML
203func parseOpenGraph(htmlContent string) (*openGraphData, error) {
204 og := &openGraphData{}
205 doc, err := html.Parse(strings.NewReader(htmlContent))
206 if err != nil {
207 // Try best-effort parsing even with invalid HTML
208 return og, nil
209 }
210
211 // Extract OpenGraph tags and fallbacks
212 var pageTitle string
213 var metaDescription string
214
215 var traverse func(*html.Node)
216 traverse = func(n *html.Node) {
217 if n.Type == html.ElementNode {
218 switch n.Data {
219 case "meta":
220 property := getAttr(n, "property")
221 name := getAttr(n, "name")
222 content := getAttr(n, "content")
223
224 // OpenGraph tags
225 if strings.HasPrefix(property, "og:") {
226 switch property {
227 case "og:title":
228 if og.Title == "" {
229 og.Title = content
230 }
231 case "og:description":
232 if og.Description == "" {
233 og.Description = content
234 }
235 case "og:image":
236 if og.Image == "" {
237 og.Image = content
238 }
239 case "og:url":
240 if og.URL == "" {
241 og.URL = content
242 }
243 }
244 }
245
246 // Fallback meta tags
247 if name == "description" && metaDescription == "" {
248 metaDescription = content
249 }
250
251 case "title":
252 if pageTitle == "" && n.FirstChild != nil {
253 pageTitle = n.FirstChild.Data
254 }
255 }
256 }
257
258 for c := n.FirstChild; c != nil; c = c.NextSibling {
259 traverse(c)
260 }
261 }
262
263 traverse(doc)
264
265 // Apply fallbacks
266 if og.Title == "" {
267 og.Title = pageTitle
268 }
269 if og.Description == "" {
270 og.Description = metaDescription
271 }
272
273 return og, nil
274}
275
276// getAttr gets an attribute value from an HTML node
277func getAttr(n *html.Node, key string) string {
278 for _, attr := range n.Attr {
279 if attr.Key == key {
280 return attr.Val
281 }
282 }
283 return ""
284}
285
286// fetchKagiKite handles special unfurling for Kagi Kite news pages
287// Kagi Kite pages use client-side rendering, so og:image tags aren't available at SSR time
288// Instead, we parse the HTML to extract the story image from the page content
289func fetchKagiKite(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) {
290 // Create HTTP request
291 req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
292 if err != nil {
293 return nil, fmt.Errorf("failed to create request: %w", err)
294 }
295
296 req.Header.Set("User-Agent", userAgent)
297
298 // Create HTTP client with timeout
299 client := &http.Client{Timeout: timeout}
300 resp, err := client.Do(req)
301 if err != nil {
302 return nil, fmt.Errorf("failed to fetch URL: %w", err)
303 }
304 defer func() { _ = resp.Body.Close() }()
305
306 if resp.StatusCode != http.StatusOK {
307 return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status)
308 }
309
310 // Limit response size to 10MB
311 limitedReader := io.LimitReader(resp.Body, 10*1024*1024)
312
313 // Parse HTML
314 doc, err := html.Parse(limitedReader)
315 if err != nil {
316 return nil, fmt.Errorf("failed to parse HTML: %w", err)
317 }
318
319 result := &UnfurlResult{
320 Type: "article",
321 URI: urlStr,
322 Domain: "kite.kagi.com",
323 Provider: "kagi",
324 }
325
326 // First try OpenGraph tags (in case they get added in the future)
327 var findOG func(*html.Node)
328 findOG = func(n *html.Node) {
329 if n.Type == html.ElementNode && n.Data == "meta" {
330 var property, content string
331 for _, attr := range n.Attr {
332 if attr.Key == "property" {
333 property = attr.Val
334 } else if attr.Key == "content" {
335 content = attr.Val
336 }
337 }
338
339 switch property {
340 case "og:title":
341 if result.Title == "" {
342 result.Title = content
343 }
344 case "og:description":
345 if result.Description == "" {
346 result.Description = content
347 }
348 case "og:image":
349 if result.ThumbnailURL == "" {
350 result.ThumbnailURL = content
351 }
352 }
353 }
354 for c := n.FirstChild; c != nil; c = c.NextSibling {
355 findOG(c)
356 }
357 }
358 findOG(doc)
359
360 // Fallback: Extract from page content
361 // Look for images with kagiproxy.com URLs (Kagi's image proxy)
362 // Note: Skip the first image as it's often a shared header/logo
363 if result.ThumbnailURL == "" {
364 var images []struct {
365 url string
366 alt string
367 }
368
369 var findImg func(*html.Node)
370 findImg = func(n *html.Node) {
371 if n.Type == html.ElementNode && n.Data == "img" {
372 for _, attr := range n.Attr {
373 if attr.Key == "src" && strings.Contains(attr.Val, "kagiproxy.com") {
374 // Get alt text if available
375 var altText string
376 for _, a := range n.Attr {
377 if a.Key == "alt" {
378 altText = a.Val
379 break
380 }
381 }
382 images = append(images, struct {
383 url string
384 alt string
385 }{url: attr.Val, alt: altText})
386 break
387 }
388 }
389 }
390 for c := n.FirstChild; c != nil; c = c.NextSibling {
391 findImg(c)
392 }
393 }
394 findImg(doc)
395
396 // Skip first image (often shared header/logo), use second if available
397 if len(images) > 1 {
398 result.ThumbnailURL = images[1].url
399 if result.Description == "" && images[1].alt != "" {
400 result.Description = images[1].alt
401 }
402 } else if len(images) == 1 {
403 // Only one image found, use it
404 result.ThumbnailURL = images[0].url
405 if result.Description == "" && images[0].alt != "" {
406 result.Description = images[0].alt
407 }
408 }
409 }
410
411 // Fallback to <title> tag if og:title not found
412 if result.Title == "" {
413 var findTitle func(*html.Node) string
414 findTitle = func(n *html.Node) string {
415 if n.Type == html.ElementNode && n.Data == "title" {
416 if n.FirstChild != nil && n.FirstChild.Type == html.TextNode {
417 return n.FirstChild.Data
418 }
419 }
420 for c := n.FirstChild; c != nil; c = c.NextSibling {
421 if title := findTitle(c); title != "" {
422 return title
423 }
424 }
425 return ""
426 }
427 result.Title = findTitle(doc)
428 }
429
430 // If still no image, return error
431 if result.ThumbnailURL == "" {
432 return nil, fmt.Errorf("no image found in Kagi page")
433 }
434
435 return result, nil
436}