A community-based topic aggregation platform built on atproto
package unfurl

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
	"time"

	"golang.org/x/net/html"
)

// Provider configuration
var oEmbedEndpoints = map[string]string{
	"streamable.com": "https://api.streamable.com/oembed",
	"youtube.com":    "https://www.youtube.com/oembed",
	"youtu.be":       "https://www.youtube.com/oembed",
	"reddit.com":     "https://www.reddit.com/oembed",
}

// oEmbedResponse represents a standard oEmbed response
type oEmbedResponse struct {
	ThumbnailURL    string `json:"thumbnail_url"`
	Version         string `json:"version"`
	Title           string `json:"title"`
	AuthorName      string `json:"author_name"`
	ProviderName    string `json:"provider_name"`
	ProviderURL     string `json:"provider_url"`
	Type            string `json:"type"`
	HTML            string `json:"html"`
	Description     string `json:"description"`
	ThumbnailWidth  int    `json:"thumbnail_width"`
	ThumbnailHeight int    `json:"thumbnail_height"`
	Width           int    `json:"width"`
	Height          int    `json:"height"`
}

// extractDomain extracts the domain from a URL
func extractDomain(urlStr string) string {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return ""
	}
	// Remove www. prefix
	domain := strings.TrimPrefix(parsed.Host, "www.")
	return domain
}

// isSupported checks if this is a valid HTTP/HTTPS URL
func isSupported(urlStr string) bool {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	scheme := strings.ToLower(parsed.Scheme)
	return scheme == "http" || scheme == "https"
}

// isOEmbedProvider checks if we have an oEmbed endpoint for this URL
func isOEmbedProvider(urlStr string) bool {
	domain := extractDomain(urlStr)
	_, exists := oEmbedEndpoints[domain]
	return exists
}

// fetchOEmbed fetches oEmbed data from the provider
func fetchOEmbed(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*oEmbedResponse, error) {
	domain := extractDomain(urlStr)
	endpoint, exists := oEmbedEndpoints[domain]
	if !exists {
		return nil, fmt.Errorf("no oEmbed endpoint for domain: %s", domain)
	}

	// Build oEmbed request URL
	oembedURL := fmt.Sprintf("%s?url=%s&format=json", endpoint, url.QueryEscape(urlStr))

	// Create HTTP request
	req, err := http.NewRequestWithContext(ctx, "GET", oembedURL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create oEmbed request: %w", err)
	}

	req.Header.Set("User-Agent", userAgent)

	// Create HTTP client with timeout
	client := &http.Client{Timeout: timeout}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch oEmbed data: %w", err)
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("oEmbed endpoint returned status %d", resp.StatusCode)
	}

	// Parse JSON response
	var oembed oEmbedResponse
	if err := json.NewDecoder(resp.Body).Decode(&oembed); err != nil {
		return nil, fmt.Errorf("failed to parse oEmbed response: %w", err)
	}

	return &oembed, nil
}

// normalizeURL converts protocol-relative URLs to HTTPS
// Examples:
// "//example.com/image.jpg" -> "https://example.com/image.jpg"
// "https://example.com/image.jpg" -> "https://example.com/image.jpg" (unchanged)
func normalizeURL(urlStr string) string {
	if strings.HasPrefix(urlStr, "//") {
		return "https:" + urlStr
	}
	return urlStr
}

// mapOEmbedToResult converts an oEmbed response to an UnfurlResult
func mapOEmbedToResult(oembed *oEmbedResponse, originalURL string) *UnfurlResult {
	result := &UnfurlResult{
		URI:          originalURL,
		Title:        oembed.Title,
		Description:  oembed.Description,
		ThumbnailURL: normalizeURL(oembed.ThumbnailURL),
		Provider:     strings.ToLower(oembed.ProviderName),
		Domain:       extractDomain(originalURL),
		Width:        oembed.Width,
		Height:       oembed.Height,
	}

	// Map oEmbed type to our embedType
	switch oembed.Type {
	case "video":
		result.Type = "video"
	case "photo":
		result.Type = "image"
	default:
		result.Type = "article"
	}

	// If no description but we have an author name, use that
	if result.Description == "" && oembed.AuthorName != "" {
		result.Description = fmt.Sprintf("By %s", oembed.AuthorName)
	}

	return result
}

// openGraphData represents OpenGraph metadata extracted from HTML
type openGraphData struct {
	Title       string
	Description string
	Image       string
	URL         string
}

// fetchOpenGraph fetches OpenGraph metadata from a URL
func fetchOpenGraph(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) {
	// Create HTTP request
	req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	req.Header.Set("User-Agent", userAgent)

	// Create HTTP client with timeout
	client := &http.Client{Timeout: timeout}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch URL: %w", err)
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP request returned status %d", resp.StatusCode)
	}

	// Read response body (limit to 10MB to prevent abuse)
	limitedReader := io.LimitReader(resp.Body, 10*1024*1024)
	body, err := io.ReadAll(limitedReader)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	// Parse OpenGraph metadata
	og, err := parseOpenGraph(string(body))
	if err != nil {
		return nil, fmt.Errorf("failed to parse OpenGraph metadata: %w", err)
	}

	// Build UnfurlResult
	result := &UnfurlResult{
		Type:         "article", // Default type for OpenGraph
		URI:          urlStr,
		Title:        og.Title,
		Description:  og.Description,
		ThumbnailURL: normalizeURL(og.Image),
		Provider:     "opengraph",
		Domain:       extractDomain(urlStr),
	}

	// Use og:url if available and valid
	if og.URL != "" && isSupported(og.URL) {
		result.URI = og.URL
	}

	return result, nil
}

// parseOpenGraph extracts OpenGraph metadata from HTML
func parseOpenGraph(htmlContent string) (*openGraphData, error) {
	og := &openGraphData{}
	doc, err := html.Parse(strings.NewReader(htmlContent))
	if err != nil {
		// html.Parse is tolerant of malformed HTML; if it still fails,
		// return empty metadata rather than an error
		return og, nil
	}

	// Extract OpenGraph tags and fallbacks
	var pageTitle string
	var metaDescription string

	var traverse func(*html.Node)
	traverse = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch n.Data {
			case "meta":
				property := getAttr(n, "property")
				name := getAttr(n, "name")
				content := getAttr(n, "content")

				// OpenGraph tags
				if strings.HasPrefix(property, "og:") {
					switch property {
					case "og:title":
						if og.Title == "" {
							og.Title = content
						}
					case "og:description":
						if og.Description == "" {
							og.Description = content
						}
					case "og:image":
						if og.Image == "" {
							og.Image = content
						}
					case "og:url":
						if og.URL == "" {
							og.URL = content
						}
					}
				}

				// Fallback meta tags
				if name == "description" && metaDescription == "" {
					metaDescription = content
				}

			case "title":
				if pageTitle == "" && n.FirstChild != nil {
					pageTitle = n.FirstChild.Data
				}
			}
		}

		for c := n.FirstChild; c != nil; c = c.NextSibling {
			traverse(c)
		}
	}

	traverse(doc)

	// Apply fallbacks
	if og.Title == "" {
		og.Title = pageTitle
	}
	if og.Description == "" {
		og.Description = metaDescription
	}

	return og, nil
}

// getAttr gets an attribute value from an HTML node
func getAttr(n *html.Node, key string) string {
	for _, attr := range n.Attr {
		if attr.Key == key {
			return attr.Val
		}
	}
	return ""
}

// fetchKagiKite handles special unfurling for Kagi Kite news pages.
// Kagi Kite pages use client-side rendering, so og:image tags aren't available at SSR time.
// Instead, we parse the HTML to extract the story image from the page content.
func fetchKagiKite(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) {
	// Create HTTP request
	req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	req.Header.Set("User-Agent", userAgent)

	// Create HTTP client with timeout
	client := &http.Client{Timeout: timeout}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch URL: %w", err)
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP request returned status %d", resp.StatusCode)
	}

	// Limit response size to 10MB
	limitedReader := io.LimitReader(resp.Body, 10*1024*1024)

	// Parse HTML
	doc, err := html.Parse(limitedReader)
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}

	result := &UnfurlResult{
		Type:     "article",
		URI:      urlStr,
		Domain:   "kite.kagi.com",
		Provider: "kagi",
	}

	// First try OpenGraph tags (in case they get added in the future)
	var findOG func(*html.Node)
	findOG = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "meta" {
			var property, content string
			for _, attr := range n.Attr {
				if attr.Key == "property" {
					property = attr.Val
				} else if attr.Key == "content" {
					content = attr.Val
				}
			}

			switch property {
			case "og:title":
				if result.Title == "" {
					result.Title = content
				}
			case "og:description":
				if result.Description == "" {
					result.Description = content
				}
			case "og:image":
				if result.ThumbnailURL == "" {
					result.ThumbnailURL = content
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			findOG(c)
		}
	}
	findOG(doc)

	// Fallback: extract from page content.
	// Look for images with kagiproxy.com URLs (Kagi's image proxy).
	// Note: skip the first image as it's often a shared header/logo.
	if result.ThumbnailURL == "" {
		var images []struct {
			url string
			alt string
		}

		var findImg func(*html.Node)
		findImg = func(n *html.Node) {
			if n.Type == html.ElementNode && n.Data == "img" {
				for _, attr := range n.Attr {
					if attr.Key == "src" && strings.Contains(attr.Val, "kagiproxy.com") {
						// Get alt text if available
						var altText string
						for _, a := range n.Attr {
							if a.Key == "alt" {
								altText = a.Val
								break
							}
						}
						images = append(images, struct {
							url string
							alt string
						}{url: attr.Val, alt: altText})
						break
					}
				}
			}
			for c := n.FirstChild; c != nil; c = c.NextSibling {
				findImg(c)
			}
		}
		findImg(doc)

		// Skip the first image (often a shared header/logo), use the second if available
		if len(images) > 1 {
			result.ThumbnailURL = images[1].url
			if result.Description == "" && images[1].alt != "" {
				result.Description = images[1].alt
			}
		} else if len(images) == 1 {
			// Only one image found, use it
			result.ThumbnailURL = images[0].url
			if result.Description == "" && images[0].alt != "" {
				result.Description = images[0].alt
			}
		}
	}

	// Fall back to the <title> tag if og:title was not found
	if result.Title == "" {
		var findTitle func(*html.Node) string
		findTitle = func(n *html.Node) string {
			if n.Type == html.ElementNode && n.Data == "title" {
				if n.FirstChild != nil && n.FirstChild.Type == html.TextNode {
					return n.FirstChild.Data
				}
			}
			for c := n.FirstChild; c != nil; c = c.NextSibling {
				if title := findTitle(c); title != "" {
					return title
				}
			}
			return ""
		}
		result.Title = findTitle(doc)
	}

	// If still no image, return error
	if result.ThumbnailURL == "" {
		return nil, fmt.Errorf("no image found in Kagi page")
	}

	return result, nil
}
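
For context, here is a minimal sketch of how these helpers might be wired together into a single entry point. The Unfurl dispatcher below is an illustration, not part of the file above: it assumes it lives in the same unfurl package (so UnfurlResult and the fetchers are in scope), and it assumes oEmbed failures should fall back to the generic scrapers.

// Unfurl is a hypothetical dispatcher (a sketch, not the package's actual API).
// It validates the URL, prefers a known oEmbed provider, special-cases Kagi
// Kite pages, and otherwise falls back to OpenGraph scraping.
func Unfurl(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) {
	// Reject anything that is not a plain http/https URL.
	if !isSupported(urlStr) {
		return nil, fmt.Errorf("unsupported URL: %s", urlStr)
	}

	// Known oEmbed providers (YouTube, Streamable, Reddit) return structured data.
	if isOEmbedProvider(urlStr) {
		if oembed, err := fetchOEmbed(ctx, urlStr, timeout, userAgent); err == nil {
			return mapOEmbedToResult(oembed, urlStr), nil
		}
		// Assumption: on oEmbed failure, fall through to the generic scrapers.
	}

	// Kagi Kite pages are client-side rendered, so they need the HTML heuristic.
	if extractDomain(urlStr) == "kite.kagi.com" {
		return fetchKagiKite(ctx, urlStr, timeout, userAgent)
	}

	// Everything else: scrape OpenGraph / meta tags.
	return fetchOpenGraph(ctx, urlStr, timeout, userAgent)
}

Keeping the domain routing in one place like this leaves the per-provider fetchers free of dispatch logic; whether the real package exposes exactly this shape (or this timeout/user-agent signature) is not shown here.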