A community-based topic aggregation platform built on atproto
1package unfurl 2 3import ( 4 "context" 5 "encoding/json" 6 "fmt" 7 "io" 8 "net/http" 9 "net/url" 10 "strings" 11 "time" 12 13 "golang.org/x/net/html" 14) 15 16// Provider configuration 17var oEmbedEndpoints = map[string]string{ 18 "streamable.com": "https://api.streamable.com/oembed", 19 "youtube.com": "https://www.youtube.com/oembed", 20 "youtu.be": "https://www.youtube.com/oembed", 21 "reddit.com": "https://www.reddit.com/oembed", 22} 23 24// oEmbedResponse represents a standard oEmbed response 25type oEmbedResponse struct { 26 ThumbnailURL string `json:"thumbnail_url"` 27 Version string `json:"version"` 28 Title string `json:"title"` 29 AuthorName string `json:"author_name"` 30 ProviderName string `json:"provider_name"` 31 ProviderURL string `json:"provider_url"` 32 Type string `json:"type"` 33 HTML string `json:"html"` 34 Description string `json:"description"` 35 ThumbnailWidth int `json:"thumbnail_width"` 36 ThumbnailHeight int `json:"thumbnail_height"` 37 Width int `json:"width"` 38 Height int `json:"height"` 39} 40 41// extractDomain extracts the domain from a URL 42func extractDomain(urlStr string) string { 43 parsed, err := url.Parse(urlStr) 44 if err != nil { 45 return "" 46 } 47 // Remove www. 
prefix 48 domain := strings.TrimPrefix(parsed.Host, "www.") 49 return domain 50} 51 52// isSupported checks if this is a valid HTTP/HTTPS URL 53func isSupported(urlStr string) bool { 54 parsed, err := url.Parse(urlStr) 55 if err != nil { 56 return false 57 } 58 scheme := strings.ToLower(parsed.Scheme) 59 return scheme == "http" || scheme == "https" 60} 61 62// isOEmbedProvider checks if we have an oEmbed endpoint for this URL 63func isOEmbedProvider(urlStr string) bool { 64 domain := extractDomain(urlStr) 65 _, exists := oEmbedEndpoints[domain] 66 return exists 67} 68 69// fetchOEmbed fetches oEmbed data from the provider 70func fetchOEmbed(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*oEmbedResponse, error) { 71 domain := extractDomain(urlStr) 72 endpoint, exists := oEmbedEndpoints[domain] 73 if !exists { 74 return nil, fmt.Errorf("no oEmbed endpoint for domain: %s", domain) 75 } 76 77 // Build oEmbed request URL 78 oembedURL := fmt.Sprintf("%s?url=%s&format=json", endpoint, url.QueryEscape(urlStr)) 79 80 // Create HTTP request 81 req, err := http.NewRequestWithContext(ctx, "GET", oembedURL, nil) 82 if err != nil { 83 return nil, fmt.Errorf("failed to create oEmbed request: %w", err) 84 } 85 86 req.Header.Set("User-Agent", userAgent) 87 88 // Create HTTP client with timeout 89 client := &http.Client{Timeout: timeout} 90 resp, err := client.Do(req) 91 if err != nil { 92 return nil, fmt.Errorf("failed to fetch oEmbed data: %w", err) 93 } 94 defer func() { _ = resp.Body.Close() }() 95 96 if resp.StatusCode != http.StatusOK { 97 return nil, fmt.Errorf("oEmbed endpoint returned status %d", resp.StatusCode) 98 } 99 100 // Parse JSON response 101 var oembed oEmbedResponse 102 if err := json.NewDecoder(resp.Body).Decode(&oembed); err != nil { 103 return nil, fmt.Errorf("failed to parse oEmbed response: %w", err) 104 } 105 106 return &oembed, nil 107} 108 109// mapOEmbedToResult converts oEmbed response to UnfurlResult 110func 
mapOEmbedToResult(oembed *oEmbedResponse, originalURL string) *UnfurlResult { 111 result := &UnfurlResult{ 112 URI: originalURL, 113 Title: oembed.Title, 114 Description: oembed.Description, 115 ThumbnailURL: oembed.ThumbnailURL, 116 Provider: strings.ToLower(oembed.ProviderName), 117 Domain: extractDomain(originalURL), 118 Width: oembed.Width, 119 Height: oembed.Height, 120 } 121 122 // Map oEmbed type to our embedType 123 switch oembed.Type { 124 case "video": 125 result.Type = "video" 126 case "photo": 127 result.Type = "image" 128 default: 129 result.Type = "article" 130 } 131 132 // If no description but we have author name, use that 133 if result.Description == "" && oembed.AuthorName != "" { 134 result.Description = fmt.Sprintf("By %s", oembed.AuthorName) 135 } 136 137 return result 138} 139 140// openGraphData represents OpenGraph metadata extracted from HTML 141type openGraphData struct { 142 Title string 143 Description string 144 Image string 145 URL string 146} 147 148// fetchOpenGraph fetches OpenGraph metadata from a URL 149func fetchOpenGraph(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) { 150 // Create HTTP request 151 req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil) 152 if err != nil { 153 return nil, fmt.Errorf("failed to create request: %w", err) 154 } 155 156 req.Header.Set("User-Agent", userAgent) 157 158 // Create HTTP client with timeout 159 client := &http.Client{Timeout: timeout} 160 resp, err := client.Do(req) 161 if err != nil { 162 return nil, fmt.Errorf("failed to fetch URL: %w", err) 163 } 164 defer func() { _ = resp.Body.Close() }() 165 166 if resp.StatusCode != http.StatusOK { 167 return nil, fmt.Errorf("HTTP request returned status %d", resp.StatusCode) 168 } 169 170 // Read response body (limit to 10MB to prevent abuse) 171 limitedReader := io.LimitReader(resp.Body, 10*1024*1024) 172 body, err := io.ReadAll(limitedReader) 173 if err != nil { 174 return nil, 
fmt.Errorf("failed to read response body: %w", err) 175 } 176 177 // Parse OpenGraph metadata 178 og, err := parseOpenGraph(string(body)) 179 if err != nil { 180 return nil, fmt.Errorf("failed to parse OpenGraph metadata: %w", err) 181 } 182 183 // Build UnfurlResult 184 result := &UnfurlResult{ 185 Type: "article", // Default type for OpenGraph 186 URI: urlStr, 187 Title: og.Title, 188 Description: og.Description, 189 ThumbnailURL: og.Image, 190 Provider: "opengraph", 191 Domain: extractDomain(urlStr), 192 } 193 194 // Use og:url if available and valid 195 if og.URL != "" { 196 result.URI = og.URL 197 } 198 199 return result, nil 200} 201 202// parseOpenGraph extracts OpenGraph metadata from HTML 203func parseOpenGraph(htmlContent string) (*openGraphData, error) { 204 og := &openGraphData{} 205 doc, err := html.Parse(strings.NewReader(htmlContent)) 206 if err != nil { 207 // Try best-effort parsing even with invalid HTML 208 return og, nil 209 } 210 211 // Extract OpenGraph tags and fallbacks 212 var pageTitle string 213 var metaDescription string 214 215 var traverse func(*html.Node) 216 traverse = func(n *html.Node) { 217 if n.Type == html.ElementNode { 218 switch n.Data { 219 case "meta": 220 property := getAttr(n, "property") 221 name := getAttr(n, "name") 222 content := getAttr(n, "content") 223 224 // OpenGraph tags 225 if strings.HasPrefix(property, "og:") { 226 switch property { 227 case "og:title": 228 if og.Title == "" { 229 og.Title = content 230 } 231 case "og:description": 232 if og.Description == "" { 233 og.Description = content 234 } 235 case "og:image": 236 if og.Image == "" { 237 og.Image = content 238 } 239 case "og:url": 240 if og.URL == "" { 241 og.URL = content 242 } 243 } 244 } 245 246 // Fallback meta tags 247 if name == "description" && metaDescription == "" { 248 metaDescription = content 249 } 250 251 case "title": 252 if pageTitle == "" && n.FirstChild != nil { 253 pageTitle = n.FirstChild.Data 254 } 255 } 256 } 257 258 for c := 
n.FirstChild; c != nil; c = c.NextSibling { 259 traverse(c) 260 } 261 } 262 263 traverse(doc) 264 265 // Apply fallbacks 266 if og.Title == "" { 267 og.Title = pageTitle 268 } 269 if og.Description == "" { 270 og.Description = metaDescription 271 } 272 273 return og, nil 274} 275 276// getAttr gets an attribute value from an HTML node 277func getAttr(n *html.Node, key string) string { 278 for _, attr := range n.Attr { 279 if attr.Key == key { 280 return attr.Val 281 } 282 } 283 return "" 284} 285 286// fetchKagiKite handles special unfurling for Kagi Kite news pages 287// Kagi Kite pages use client-side rendering, so og:image tags aren't available at SSR time 288// Instead, we parse the HTML to extract the story image from the page content 289func fetchKagiKite(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) { 290 // Create HTTP request 291 req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil) 292 if err != nil { 293 return nil, fmt.Errorf("failed to create request: %w", err) 294 } 295 296 req.Header.Set("User-Agent", userAgent) 297 298 // Create HTTP client with timeout 299 client := &http.Client{Timeout: timeout} 300 resp, err := client.Do(req) 301 if err != nil { 302 return nil, fmt.Errorf("failed to fetch URL: %w", err) 303 } 304 defer func() { _ = resp.Body.Close() }() 305 306 if resp.StatusCode != http.StatusOK { 307 return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status) 308 } 309 310 // Limit response size to 10MB 311 limitedReader := io.LimitReader(resp.Body, 10*1024*1024) 312 313 // Parse HTML 314 doc, err := html.Parse(limitedReader) 315 if err != nil { 316 return nil, fmt.Errorf("failed to parse HTML: %w", err) 317 } 318 319 result := &UnfurlResult{ 320 Type: "article", 321 URI: urlStr, 322 Domain: "kite.kagi.com", 323 Provider: "kagi", 324 } 325 326 // First try OpenGraph tags (in case they get added in the future) 327 var findOG func(*html.Node) 328 findOG = func(n *html.Node) { 
329 if n.Type == html.ElementNode && n.Data == "meta" { 330 var property, content string 331 for _, attr := range n.Attr { 332 if attr.Key == "property" { 333 property = attr.Val 334 } else if attr.Key == "content" { 335 content = attr.Val 336 } 337 } 338 339 switch property { 340 case "og:title": 341 if result.Title == "" { 342 result.Title = content 343 } 344 case "og:description": 345 if result.Description == "" { 346 result.Description = content 347 } 348 case "og:image": 349 if result.ThumbnailURL == "" { 350 result.ThumbnailURL = content 351 } 352 } 353 } 354 for c := n.FirstChild; c != nil; c = c.NextSibling { 355 findOG(c) 356 } 357 } 358 findOG(doc) 359 360 // Fallback: Extract from page content 361 // Look for images with kagiproxy.com URLs (Kagi's image proxy) 362 // Note: Skip the first image as it's often a shared header/logo 363 if result.ThumbnailURL == "" { 364 var images []struct { 365 url string 366 alt string 367 } 368 369 var findImg func(*html.Node) 370 findImg = func(n *html.Node) { 371 if n.Type == html.ElementNode && n.Data == "img" { 372 for _, attr := range n.Attr { 373 if attr.Key == "src" && strings.Contains(attr.Val, "kagiproxy.com") { 374 // Get alt text if available 375 var altText string 376 for _, a := range n.Attr { 377 if a.Key == "alt" { 378 altText = a.Val 379 break 380 } 381 } 382 images = append(images, struct { 383 url string 384 alt string 385 }{url: attr.Val, alt: altText}) 386 break 387 } 388 } 389 } 390 for c := n.FirstChild; c != nil; c = c.NextSibling { 391 findImg(c) 392 } 393 } 394 findImg(doc) 395 396 // Skip first image (often shared header/logo), use second if available 397 if len(images) > 1 { 398 result.ThumbnailURL = images[1].url 399 if result.Description == "" && images[1].alt != "" { 400 result.Description = images[1].alt 401 } 402 } else if len(images) == 1 { 403 // Only one image found, use it 404 result.ThumbnailURL = images[0].url 405 if result.Description == "" && images[0].alt != "" { 406 
result.Description = images[0].alt 407 } 408 } 409 } 410 411 // Fallback to <title> tag if og:title not found 412 if result.Title == "" { 413 var findTitle func(*html.Node) string 414 findTitle = func(n *html.Node) string { 415 if n.Type == html.ElementNode && n.Data == "title" { 416 if n.FirstChild != nil && n.FirstChild.Type == html.TextNode { 417 return n.FirstChild.Data 418 } 419 } 420 for c := n.FirstChild; c != nil; c = c.NextSibling { 421 if title := findTitle(c); title != "" { 422 return title 423 } 424 } 425 return "" 426 } 427 result.Title = findTitle(doc) 428 } 429 430 // If still no image, return error 431 if result.ThumbnailURL == "" { 432 return nil, fmt.Errorf("no image found in Kagi page") 433 } 434 435 return result, nil 436}