A community-based topic aggregation platform built on atproto
1package unfurl 2 3import ( 4 "context" 5 "encoding/json" 6 "fmt" 7 "io" 8 "net/http" 9 "net/url" 10 "strings" 11 "time" 12 13 "golang.org/x/net/html" 14) 15 16// Provider configuration 17var oEmbedEndpoints = map[string]string{ 18 "streamable.com": "https://api.streamable.com/oembed", 19 "youtube.com": "https://www.youtube.com/oembed", 20 "youtu.be": "https://www.youtube.com/oembed", 21 "reddit.com": "https://www.reddit.com/oembed", 22} 23 24// oEmbedResponse represents a standard oEmbed response 25type oEmbedResponse struct { 26 ThumbnailURL string `json:"thumbnail_url"` 27 Version string `json:"version"` 28 Title string `json:"title"` 29 AuthorName string `json:"author_name"` 30 ProviderName string `json:"provider_name"` 31 ProviderURL string `json:"provider_url"` 32 Type string `json:"type"` 33 HTML string `json:"html"` 34 Description string `json:"description"` 35 ThumbnailWidth int `json:"thumbnail_width"` 36 ThumbnailHeight int `json:"thumbnail_height"` 37 Width int `json:"width"` 38 Height int `json:"height"` 39} 40 41// extractDomain extracts the domain from a URL 42func extractDomain(urlStr string) string { 43 parsed, err := url.Parse(urlStr) 44 if err != nil { 45 return "" 46 } 47 // Remove www. 
prefix 48 domain := strings.TrimPrefix(parsed.Host, "www.") 49 return domain 50} 51 52// isSupported checks if this is a valid HTTP/HTTPS URL 53func isSupported(urlStr string) bool { 54 parsed, err := url.Parse(urlStr) 55 if err != nil { 56 return false 57 } 58 scheme := strings.ToLower(parsed.Scheme) 59 return scheme == "http" || scheme == "https" 60} 61 62// isOEmbedProvider checks if we have an oEmbed endpoint for this URL 63func isOEmbedProvider(urlStr string) bool { 64 domain := extractDomain(urlStr) 65 _, exists := oEmbedEndpoints[domain] 66 return exists 67} 68 69// fetchOEmbed fetches oEmbed data from the provider 70func fetchOEmbed(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*oEmbedResponse, error) { 71 domain := extractDomain(urlStr) 72 endpoint, exists := oEmbedEndpoints[domain] 73 if !exists { 74 return nil, fmt.Errorf("no oEmbed endpoint for domain: %s", domain) 75 } 76 77 // Build oEmbed request URL 78 oembedURL := fmt.Sprintf("%s?url=%s&format=json", endpoint, url.QueryEscape(urlStr)) 79 80 // Create HTTP request 81 req, err := http.NewRequestWithContext(ctx, "GET", oembedURL, nil) 82 if err != nil { 83 return nil, fmt.Errorf("failed to create oEmbed request: %w", err) 84 } 85 86 req.Header.Set("User-Agent", userAgent) 87 88 // Create HTTP client with timeout 89 client := &http.Client{Timeout: timeout} 90 resp, err := client.Do(req) 91 if err != nil { 92 return nil, fmt.Errorf("failed to fetch oEmbed data: %w", err) 93 } 94 defer func() { _ = resp.Body.Close() }() 95 96 if resp.StatusCode != http.StatusOK { 97 return nil, fmt.Errorf("oEmbed endpoint returned status %d", resp.StatusCode) 98 } 99 100 // Parse JSON response 101 var oembed oEmbedResponse 102 if err := json.NewDecoder(resp.Body).Decode(&oembed); err != nil { 103 return nil, fmt.Errorf("failed to parse oEmbed response: %w", err) 104 } 105 106 return &oembed, nil 107} 108 109// normalizeURL converts protocol-relative URLs to HTTPS 110// Examples: 111// 
112// "//example.com/image.jpg" -> "https://example.com/image.jpg" 113// "https://example.com/image.jpg" -> "https://example.com/image.jpg" (unchanged) 114func normalizeURL(urlStr string) string { 115 if strings.HasPrefix(urlStr, "//") { 116 return "https:" + urlStr 117 } 118 return urlStr 119} 120 121// mapOEmbedToResult converts oEmbed response to UnfurlResult 122func mapOEmbedToResult(oembed *oEmbedResponse, originalURL string) *UnfurlResult { 123 result := &UnfurlResult{ 124 URI: originalURL, 125 Title: oembed.Title, 126 Description: oembed.Description, 127 ThumbnailURL: normalizeURL(oembed.ThumbnailURL), 128 Provider: strings.ToLower(oembed.ProviderName), 129 Domain: extractDomain(originalURL), 130 Width: oembed.Width, 131 Height: oembed.Height, 132 } 133 134 // Map oEmbed type to our embedType 135 switch oembed.Type { 136 case "video": 137 result.Type = "video" 138 case "photo": 139 result.Type = "image" 140 default: 141 result.Type = "article" 142 } 143 144 // If no description but we have author name, use that 145 if result.Description == "" && oembed.AuthorName != "" { 146 result.Description = fmt.Sprintf("By %s", oembed.AuthorName) 147 } 148 149 return result 150} 151 152// openGraphData represents OpenGraph metadata extracted from HTML 153type openGraphData struct { 154 Title string 155 Description string 156 Image string 157 URL string 158} 159 160// fetchOpenGraph fetches OpenGraph metadata from a URL 161func fetchOpenGraph(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) { 162 // Create HTTP request 163 req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil) 164 if err != nil { 165 return nil, fmt.Errorf("failed to create request: %w", err) 166 } 167 168 req.Header.Set("User-Agent", userAgent) 169 170 // Create HTTP client with timeout 171 client := &http.Client{Timeout: timeout} 172 resp, err := client.Do(req) 173 if err != nil { 174 return nil, fmt.Errorf("failed to fetch URL: %w", err) 175 
} 176 defer func() { _ = resp.Body.Close() }() 177 178 if resp.StatusCode != http.StatusOK { 179 return nil, fmt.Errorf("HTTP request returned status %d", resp.StatusCode) 180 } 181 182 // Read response body (limit to 10MB to prevent abuse) 183 limitedReader := io.LimitReader(resp.Body, 10*1024*1024) 184 body, err := io.ReadAll(limitedReader) 185 if err != nil { 186 return nil, fmt.Errorf("failed to read response body: %w", err) 187 } 188 189 // Parse OpenGraph metadata 190 og, err := parseOpenGraph(string(body)) 191 if err != nil { 192 return nil, fmt.Errorf("failed to parse OpenGraph metadata: %w", err) 193 } 194 195 // Build UnfurlResult 196 result := &UnfurlResult{ 197 Type: "article", // Default type for OpenGraph 198 URI: urlStr, 199 Title: og.Title, 200 Description: og.Description, 201 ThumbnailURL: normalizeURL(og.Image), 202 Provider: "opengraph", 203 Domain: extractDomain(urlStr), 204 } 205 206 // Use og:url if available and valid 207 if og.URL != "" { 208 result.URI = og.URL 209 } 210 211 return result, nil 212} 213 214// parseOpenGraph extracts OpenGraph metadata from HTML 215func parseOpenGraph(htmlContent string) (*openGraphData, error) { 216 og := &openGraphData{} 217 doc, err := html.Parse(strings.NewReader(htmlContent)) 218 if err != nil { 219 // Try best-effort parsing even with invalid HTML 220 return og, nil 221 } 222 223 // Extract OpenGraph tags and fallbacks 224 var pageTitle string 225 var metaDescription string 226 227 var traverse func(*html.Node) 228 traverse = func(n *html.Node) { 229 if n.Type == html.ElementNode { 230 switch n.Data { 231 case "meta": 232 property := getAttr(n, "property") 233 name := getAttr(n, "name") 234 content := getAttr(n, "content") 235 236 // OpenGraph tags 237 if strings.HasPrefix(property, "og:") { 238 switch property { 239 case "og:title": 240 if og.Title == "" { 241 og.Title = content 242 } 243 case "og:description": 244 if og.Description == "" { 245 og.Description = content 246 } 247 case "og:image": 248 if 
og.Image == "" { 249 og.Image = content 250 } 251 case "og:url": 252 if og.URL == "" { 253 og.URL = content 254 } 255 } 256 } 257 258 // Fallback meta tags 259 if name == "description" && metaDescription == "" { 260 metaDescription = content 261 } 262 263 case "title": 264 if pageTitle == "" && n.FirstChild != nil { 265 pageTitle = n.FirstChild.Data 266 } 267 } 268 } 269 270 for c := n.FirstChild; c != nil; c = c.NextSibling { 271 traverse(c) 272 } 273 } 274 275 traverse(doc) 276 277 // Apply fallbacks 278 if og.Title == "" { 279 og.Title = pageTitle 280 } 281 if og.Description == "" { 282 og.Description = metaDescription 283 } 284 285 return og, nil 286} 287 288// getAttr gets an attribute value from an HTML node 289func getAttr(n *html.Node, key string) string { 290 for _, attr := range n.Attr { 291 if attr.Key == key { 292 return attr.Val 293 } 294 } 295 return "" 296} 297 298// fetchKagiKite handles special unfurling for Kagi Kite news pages 299// Kagi Kite pages use client-side rendering, so og:image tags aren't available at SSR time 300// Instead, we parse the HTML to extract the story image from the page content 301func fetchKagiKite(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) { 302 // Create HTTP request 303 req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil) 304 if err != nil { 305 return nil, fmt.Errorf("failed to create request: %w", err) 306 } 307 308 req.Header.Set("User-Agent", userAgent) 309 310 // Create HTTP client with timeout 311 client := &http.Client{Timeout: timeout} 312 resp, err := client.Do(req) 313 if err != nil { 314 return nil, fmt.Errorf("failed to fetch URL: %w", err) 315 } 316 defer func() { _ = resp.Body.Close() }() 317 318 if resp.StatusCode != http.StatusOK { 319 return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status) 320 } 321 322 // Limit response size to 10MB 323 limitedReader := io.LimitReader(resp.Body, 10*1024*1024) 324 325 // Parse HTML 326 doc, 
err := html.Parse(limitedReader) 327 if err != nil { 328 return nil, fmt.Errorf("failed to parse HTML: %w", err) 329 } 330 331 result := &UnfurlResult{ 332 Type: "article", 333 URI: urlStr, 334 Domain: "kite.kagi.com", 335 Provider: "kagi", 336 } 337 338 // First try OpenGraph tags (in case they get added in the future) 339 var findOG func(*html.Node) 340 findOG = func(n *html.Node) { 341 if n.Type == html.ElementNode && n.Data == "meta" { 342 var property, content string 343 for _, attr := range n.Attr { 344 if attr.Key == "property" { 345 property = attr.Val 346 } else if attr.Key == "content" { 347 content = attr.Val 348 } 349 } 350 351 switch property { 352 case "og:title": 353 if result.Title == "" { 354 result.Title = content 355 } 356 case "og:description": 357 if result.Description == "" { 358 result.Description = content 359 } 360 case "og:image": 361 if result.ThumbnailURL == "" { 362 result.ThumbnailURL = content 363 } 364 } 365 } 366 for c := n.FirstChild; c != nil; c = c.NextSibling { 367 findOG(c) 368 } 369 } 370 findOG(doc) 371 372 // Fallback: Extract from page content 373 // Look for images with kagiproxy.com URLs (Kagi's image proxy) 374 // Note: Skip the first image as it's often a shared header/logo 375 if result.ThumbnailURL == "" { 376 var images []struct { 377 url string 378 alt string 379 } 380 381 var findImg func(*html.Node) 382 findImg = func(n *html.Node) { 383 if n.Type == html.ElementNode && n.Data == "img" { 384 for _, attr := range n.Attr { 385 if attr.Key == "src" && strings.Contains(attr.Val, "kagiproxy.com") { 386 // Get alt text if available 387 var altText string 388 for _, a := range n.Attr { 389 if a.Key == "alt" { 390 altText = a.Val 391 break 392 } 393 } 394 images = append(images, struct { 395 url string 396 alt string 397 }{url: attr.Val, alt: altText}) 398 break 399 } 400 } 401 } 402 for c := n.FirstChild; c != nil; c = c.NextSibling { 403 findImg(c) 404 } 405 } 406 findImg(doc) 407 408 // Skip first image (often shared 
header/logo), use second if available 409 if len(images) > 1 { 410 result.ThumbnailURL = images[1].url 411 if result.Description == "" && images[1].alt != "" { 412 result.Description = images[1].alt 413 } 414 } else if len(images) == 1 { 415 // Only one image found, use it 416 result.ThumbnailURL = images[0].url 417 if result.Description == "" && images[0].alt != "" { 418 result.Description = images[0].alt 419 } 420 } 421 } 422 423 // Fallback to <title> tag if og:title not found 424 if result.Title == "" { 425 var findTitle func(*html.Node) string 426 findTitle = func(n *html.Node) string { 427 if n.Type == html.ElementNode && n.Data == "title" { 428 if n.FirstChild != nil && n.FirstChild.Type == html.TextNode { 429 return n.FirstChild.Data 430 } 431 } 432 for c := n.FirstChild; c != nil; c = c.NextSibling { 433 if title := findTitle(c); title != "" { 434 return title 435 } 436 } 437 return "" 438 } 439 result.Title = findTitle(doc) 440 } 441 442 // If still no image, return error 443 if result.ThumbnailURL == "" { 444 return nil, fmt.Errorf("no image found in Kagi page") 445 } 446 447 return result, nil 448}