internal/core/unfurl/providers.go at 301eae7bcb4683c20618addd14cde1b9c0cb60b7 · bretton.dev/coves

bretton.dev / coves
A community-based topic aggregation platform built on atproto
coves / internal / core / unfurl / providers.go
at 301eae7bcb4683c20618addd14cde1b9c0cb60b7 11 kB view raw
  1package unfurl
  2
  3import (
  4	"context"
  5	"encoding/json"
  6	"fmt"
  7	"io"
  8	"net/http"
  9	"net/url"
 10	"strings"
 11	"time"
 12
 13	"golang.org/x/net/html"
 14)
 15
 16// Provider configuration
 17var oEmbedEndpoints = map[string]string{
 18	"streamable.com": "https://api.streamable.com/oembed",
 19	"youtube.com":    "https://www.youtube.com/oembed",
 20	"youtu.be":       "https://www.youtube.com/oembed",
 21	"reddit.com":     "https://www.reddit.com/oembed",
 22}
 23
 24// oEmbedResponse represents a standard oEmbed response
 25type oEmbedResponse struct {
 26	ThumbnailURL    string `json:"thumbnail_url"`
 27	Version         string `json:"version"`
 28	Title           string `json:"title"`
 29	AuthorName      string `json:"author_name"`
 30	ProviderName    string `json:"provider_name"`
 31	ProviderURL     string `json:"provider_url"`
 32	Type            string `json:"type"`
 33	HTML            string `json:"html"`
 34	Description     string `json:"description"`
 35	ThumbnailWidth  int    `json:"thumbnail_width"`
 36	ThumbnailHeight int    `json:"thumbnail_height"`
 37	Width           int    `json:"width"`
 38	Height          int    `json:"height"`
 39}
 40
 41// extractDomain extracts the domain from a URL
 42func extractDomain(urlStr string) string {
 43	parsed, err := url.Parse(urlStr)
 44	if err != nil {
 45		return ""
 46	}
 47	// Remove www. prefix
 48	domain := strings.TrimPrefix(parsed.Host, "www.")
 49	return domain
 50}
 51
 52// isSupported checks if this is a valid HTTP/HTTPS URL
 53func isSupported(urlStr string) bool {
 54	parsed, err := url.Parse(urlStr)
 55	if err != nil {
 56		return false
 57	}
 58	scheme := strings.ToLower(parsed.Scheme)
 59	return scheme == "http" || scheme == "https"
 60}
 61
 62// isOEmbedProvider checks if we have an oEmbed endpoint for this URL
 63func isOEmbedProvider(urlStr string) bool {
 64	domain := extractDomain(urlStr)
 65	_, exists := oEmbedEndpoints[domain]
 66	return exists
 67}
 68
 69// fetchOEmbed fetches oEmbed data from the provider
 70func fetchOEmbed(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*oEmbedResponse, error) {
 71	domain := extractDomain(urlStr)
 72	endpoint, exists := oEmbedEndpoints[domain]
 73	if !exists {
 74		return nil, fmt.Errorf("no oEmbed endpoint for domain: %s", domain)
 75	}
 76
 77	// Build oEmbed request URL
 78	oembedURL := fmt.Sprintf("%s?url=%s&format=json", endpoint, url.QueryEscape(urlStr))
 79
 80	// Create HTTP request
 81	req, err := http.NewRequestWithContext(ctx, "GET", oembedURL, nil)
 82	if err != nil {
 83		return nil, fmt.Errorf("failed to create oEmbed request: %w", err)
 84	}
 85
 86	req.Header.Set("User-Agent", userAgent)
 87
 88	// Create HTTP client with timeout
 89	client := &http.Client{Timeout: timeout}
 90	resp, err := client.Do(req)
 91	if err != nil {
 92		return nil, fmt.Errorf("failed to fetch oEmbed data: %w", err)
 93	}
 94	defer func() { _ = resp.Body.Close() }()
 95
 96	if resp.StatusCode != http.StatusOK {
 97		return nil, fmt.Errorf("oEmbed endpoint returned status %d", resp.StatusCode)
 98	}
 99
100	// Parse JSON response
101	var oembed oEmbedResponse
102	if err := json.NewDecoder(resp.Body).Decode(&oembed); err != nil {
103		return nil, fmt.Errorf("failed to parse oEmbed response: %w", err)
104	}
105
106	return &oembed, nil
107}
108
109// mapOEmbedToResult converts oEmbed response to UnfurlResult
110func mapOEmbedToResult(oembed *oEmbedResponse, originalURL string) *UnfurlResult {
111	result := &UnfurlResult{
112		URI:          originalURL,
113		Title:        oembed.Title,
114		Description:  oembed.Description,
115		ThumbnailURL: oembed.ThumbnailURL,
116		Provider:     strings.ToLower(oembed.ProviderName),
117		Domain:       extractDomain(originalURL),
118		Width:        oembed.Width,
119		Height:       oembed.Height,
120	}
121
122	// Map oEmbed type to our embedType
123	switch oembed.Type {
124	case "video":
125		result.Type = "video"
126	case "photo":
127		result.Type = "image"
128	default:
129		result.Type = "article"
130	}
131
132	// If no description but we have author name, use that
133	if result.Description == "" && oembed.AuthorName != "" {
134		result.Description = fmt.Sprintf("By %s", oembed.AuthorName)
135	}
136
137	return result
138}
139
// openGraphData holds the page metadata collected by parseOpenGraph.
type openGraphData struct {
	// Text shown for the link: og:title / og:description, or the <title>
	// element / description meta tag when the OpenGraph tag is absent.
	Title, Description string

	// Links taken from og:image and og:url.
	Image, URL string
}
147
148// fetchOpenGraph fetches OpenGraph metadata from a URL
149func fetchOpenGraph(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) {
150	// Create HTTP request
151	req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
152	if err != nil {
153		return nil, fmt.Errorf("failed to create request: %w", err)
154	}
155
156	req.Header.Set("User-Agent", userAgent)
157
158	// Create HTTP client with timeout
159	client := &http.Client{Timeout: timeout}
160	resp, err := client.Do(req)
161	if err != nil {
162		return nil, fmt.Errorf("failed to fetch URL: %w", err)
163	}
164	defer func() { _ = resp.Body.Close() }()
165
166	if resp.StatusCode != http.StatusOK {
167		return nil, fmt.Errorf("HTTP request returned status %d", resp.StatusCode)
168	}
169
170	// Read response body (limit to 10MB to prevent abuse)
171	limitedReader := io.LimitReader(resp.Body, 10*1024*1024)
172	body, err := io.ReadAll(limitedReader)
173	if err != nil {
174		return nil, fmt.Errorf("failed to read response body: %w", err)
175	}
176
177	// Parse OpenGraph metadata
178	og, err := parseOpenGraph(string(body))
179	if err != nil {
180		return nil, fmt.Errorf("failed to parse OpenGraph metadata: %w", err)
181	}
182
183	// Build UnfurlResult
184	result := &UnfurlResult{
185		Type:         "article", // Default type for OpenGraph
186		URI:          urlStr,
187		Title:        og.Title,
188		Description:  og.Description,
189		ThumbnailURL: og.Image,
190		Provider:     "opengraph",
191		Domain:       extractDomain(urlStr),
192	}
193
194	// Use og:url if available and valid
195	if og.URL != "" {
196		result.URI = og.URL
197	}
198
199	return result, nil
200}
201
202// parseOpenGraph extracts OpenGraph metadata from HTML
203func parseOpenGraph(htmlContent string) (*openGraphData, error) {
204	og := &openGraphData{}
205	doc, err := html.Parse(strings.NewReader(htmlContent))
206	if err != nil {
207		// Try best-effort parsing even with invalid HTML
208		return og, nil
209	}
210
211	// Extract OpenGraph tags and fallbacks
212	var pageTitle string
213	var metaDescription string
214
215	var traverse func(*html.Node)
216	traverse = func(n *html.Node) {
217		if n.Type == html.ElementNode {
218			switch n.Data {
219			case "meta":
220				property := getAttr(n, "property")
221				name := getAttr(n, "name")
222				content := getAttr(n, "content")
223
224				// OpenGraph tags
225				if strings.HasPrefix(property, "og:") {
226					switch property {
227					case "og:title":
228						if og.Title == "" {
229							og.Title = content
230						}
231					case "og:description":
232						if og.Description == "" {
233							og.Description = content
234						}
235					case "og:image":
236						if og.Image == "" {
237							og.Image = content
238						}
239					case "og:url":
240						if og.URL == "" {
241							og.URL = content
242						}
243					}
244				}
245
246				// Fallback meta tags
247				if name == "description" && metaDescription == "" {
248					metaDescription = content
249				}
250
251			case "title":
252				if pageTitle == "" && n.FirstChild != nil {
253					pageTitle = n.FirstChild.Data
254				}
255			}
256		}
257
258		for c := n.FirstChild; c != nil; c = c.NextSibling {
259			traverse(c)
260		}
261	}
262
263	traverse(doc)
264
265	// Apply fallbacks
266	if og.Title == "" {
267		og.Title = pageTitle
268	}
269	if og.Description == "" {
270		og.Description = metaDescription
271	}
272
273	return og, nil
274}
275
276// getAttr gets an attribute value from an HTML node
277func getAttr(n *html.Node, key string) string {
278	for _, attr := range n.Attr {
279		if attr.Key == key {
280			return attr.Val
281		}
282	}
283	return ""
284}
285
286// fetchKagiKite handles special unfurling for Kagi Kite news pages
287// Kagi Kite pages use client-side rendering, so og:image tags aren't available at SSR time
288// Instead, we parse the HTML to extract the story image from the page content
289func fetchKagiKite(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) {
290	// Create HTTP request
291	req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
292	if err != nil {
293		return nil, fmt.Errorf("failed to create request: %w", err)
294	}
295
296	req.Header.Set("User-Agent", userAgent)
297
298	// Create HTTP client with timeout
299	client := &http.Client{Timeout: timeout}
300	resp, err := client.Do(req)
301	if err != nil {
302		return nil, fmt.Errorf("failed to fetch URL: %w", err)
303	}
304	defer func() { _ = resp.Body.Close() }()
305
306	if resp.StatusCode != http.StatusOK {
307		return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status)
308	}
309
310	// Limit response size to 10MB
311	limitedReader := io.LimitReader(resp.Body, 10*1024*1024)
312
313	// Parse HTML
314	doc, err := html.Parse(limitedReader)
315	if err != nil {
316		return nil, fmt.Errorf("failed to parse HTML: %w", err)
317	}
318
319	result := &UnfurlResult{
320		Type:     "article",
321		URI:      urlStr,
322		Domain:   "kite.kagi.com",
323		Provider: "kagi",
324	}
325
326	// First try OpenGraph tags (in case they get added in the future)
327	var findOG func(*html.Node)
328	findOG = func(n *html.Node) {
329		if n.Type == html.ElementNode && n.Data == "meta" {
330			var property, content string
331			for _, attr := range n.Attr {
332				if attr.Key == "property" {
333					property = attr.Val
334				} else if attr.Key == "content" {
335					content = attr.Val
336				}
337			}
338
339			switch property {
340			case "og:title":
341				if result.Title == "" {
342					result.Title = content
343				}
344			case "og:description":
345				if result.Description == "" {
346					result.Description = content
347				}
348			case "og:image":
349				if result.ThumbnailURL == "" {
350					result.ThumbnailURL = content
351				}
352			}
353		}
354		for c := n.FirstChild; c != nil; c = c.NextSibling {
355			findOG(c)
356		}
357	}
358	findOG(doc)
359
360	// Fallback: Extract from page content
361	// Look for images with kagiproxy.com URLs (Kagi's image proxy)
362	// Note: Skip the first image as it's often a shared header/logo
363	if result.ThumbnailURL == "" {
364		var images []struct {
365			url string
366			alt string
367		}
368
369		var findImg func(*html.Node)
370		findImg = func(n *html.Node) {
371			if n.Type == html.ElementNode && n.Data == "img" {
372				for _, attr := range n.Attr {
373					if attr.Key == "src" && strings.Contains(attr.Val, "kagiproxy.com") {
374						// Get alt text if available
375						var altText string
376						for _, a := range n.Attr {
377							if a.Key == "alt" {
378								altText = a.Val
379								break
380							}
381						}
382						images = append(images, struct {
383							url string
384							alt string
385						}{url: attr.Val, alt: altText})
386						break
387					}
388				}
389			}
390			for c := n.FirstChild; c != nil; c = c.NextSibling {
391				findImg(c)
392			}
393		}
394		findImg(doc)
395
396		// Skip first image (often shared header/logo), use second if available
397		if len(images) > 1 {
398			result.ThumbnailURL = images[1].url
399			if result.Description == "" && images[1].alt != "" {
400				result.Description = images[1].alt
401			}
402		} else if len(images) == 1 {
403			// Only one image found, use it
404			result.ThumbnailURL = images[0].url
405			if result.Description == "" && images[0].alt != "" {
406				result.Description = images[0].alt
407			}
408		}
409	}
410
411	// Fallback to <title> tag if og:title not found
412	if result.Title == "" {
413		var findTitle func(*html.Node) string
414		findTitle = func(n *html.Node) string {
415			if n.Type == html.ElementNode && n.Data == "title" {
416				if n.FirstChild != nil && n.FirstChild.Type == html.TextNode {
417					return n.FirstChild.Data
418				}
419			}
420			for c := n.FirstChild; c != nil; c = c.NextSibling {
421				if title := findTitle(c); title != "" {
422					return title
423				}
424			}
425			return ""
426		}
427		result.Title = findTitle(doc)
428	}
429
430	// If still no image, return error
431	if result.ThumbnailURL == "" {
432		return nil, fmt.Errorf("no image found in Kagi page")
433	}
434
435	return result, nil
436}