package unfurl

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
	"time"

	"golang.org/x/net/html"
)

// oEmbedEndpoints maps supported provider domains to their oEmbed API endpoints
var oEmbedEndpoints = map[string]string{
	"streamable.com": "https://api.streamable.com/oembed",
	"youtube.com":    "https://www.youtube.com/oembed",
	"youtu.be":       "https://www.youtube.com/oembed",
	"reddit.com":     "https://www.reddit.com/oembed",
}
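
// For a URL on one of these domains, fetchOEmbed below issues a JSON request of the form
// (illustrative, with the target URL query-escaped):
//
//	https://www.youtube.com/oembed?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabc123&format=json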

// oEmbedResponse represents a standard oEmbed response
type oEmbedResponse struct {
	ThumbnailURL    string `json:"thumbnail_url"`
	Version         string `json:"version"`
	Title           string `json:"title"`
	AuthorName      string `json:"author_name"`
	ProviderName    string `json:"provider_name"`
	ProviderURL     string `json:"provider_url"`
	Type            string `json:"type"`
	HTML            string `json:"html"`
	Description     string `json:"description"`
	ThumbnailWidth  int    `json:"thumbnail_width"`
	ThumbnailHeight int    `json:"thumbnail_height"`
	Width           int    `json:"width"`
	Height          int    `json:"height"`
}
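
// A typical provider response decodes into this struct from JSON shaped roughly like the
// following (field values are illustrative, not taken from a real API call):
//
//	{
//	  "version": "1.0",
//	  "type": "video",
//	  "title": "Example video",
//	  "author_name": "Example Channel",
//	  "provider_name": "YouTube",
//	  "thumbnail_url": "https://i.ytimg.com/vi/abc123/hqdefault.jpg",
//	  "thumbnail_width": 480,
//	  "thumbnail_height": 360,
//	  "width": 200,
//	  "height": 113,
//	  "html": "<iframe ...></iframe>"
//	}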

// extractDomain extracts the host from a URL, without any port or "www." prefix
func extractDomain(urlStr string) string {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return ""
	}
	// Hostname() drops any port; lower-case so lookups against oEmbedEndpoints match
	domain := strings.TrimPrefix(strings.ToLower(parsed.Hostname()), "www.")
	return domain
}
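
// For example (illustrative inputs):
//
//	extractDomain("https://www.youtube.com/watch?v=abc123") // "youtube.com"
//	extractDomain("https://youtu.be/abc123")                // "youtu.be"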

// isSupported checks if this is a valid HTTP/HTTPS URL
func isSupported(urlStr string) bool {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	scheme := strings.ToLower(parsed.Scheme)
	return scheme == "http" || scheme == "https"
}

// isOEmbedProvider checks if we have an oEmbed endpoint for this URL
func isOEmbedProvider(urlStr string) bool {
	domain := extractDomain(urlStr)
	_, exists := oEmbedEndpoints[domain]
	return exists
}

// fetchOEmbed fetches oEmbed data from the provider
func fetchOEmbed(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*oEmbedResponse, error) {
	domain := extractDomain(urlStr)
	endpoint, exists := oEmbedEndpoints[domain]
	if !exists {
		return nil, fmt.Errorf("no oEmbed endpoint for domain: %s", domain)
	}

	// Build oEmbed request URL
	oembedURL := fmt.Sprintf("%s?url=%s&format=json", endpoint, url.QueryEscape(urlStr))

	// Create HTTP request
	req, err := http.NewRequestWithContext(ctx, "GET", oembedURL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create oEmbed request: %w", err)
	}

	req.Header.Set("User-Agent", userAgent)

	// Create HTTP client with timeout
	client := &http.Client{Timeout: timeout}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch oEmbed data: %w", err)
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("oEmbed endpoint returned status %d", resp.StatusCode)
	}

	// Parse JSON response
	var oembed oEmbedResponse
	if err := json.NewDecoder(resp.Body).Decode(&oembed); err != nil {
		return nil, fmt.Errorf("failed to parse oEmbed response: %w", err)
	}

	return &oembed, nil
}

// normalizeURL converts protocol-relative URLs to HTTPS
// Examples:
// "//example.com/image.jpg" -> "https://example.com/image.jpg"
// "https://example.com/image.jpg" -> "https://example.com/image.jpg" (unchanged)
func normalizeURL(urlStr string) string {
	if strings.HasPrefix(urlStr, "//") {
		return "https:" + urlStr
	}
	return urlStr
}

// mapOEmbedToResult converts oEmbed response to UnfurlResult
func mapOEmbedToResult(oembed *oEmbedResponse, originalURL string) *UnfurlResult {
	result := &UnfurlResult{
		URI:          originalURL,
		Title:        oembed.Title,
		Description:  oembed.Description,
		ThumbnailURL: normalizeURL(oembed.ThumbnailURL),
		Provider:     strings.ToLower(oembed.ProviderName),
		Domain:       extractDomain(originalURL),
		Width:        oembed.Width,
		Height:       oembed.Height,
	}

	// Map oEmbed type to our embedType
	switch oembed.Type {
	case "video":
		result.Type = "video"
	case "photo":
		result.Type = "image"
	default:
		result.Type = "article"
	}

	// If no description but we have author name, use that
	if result.Description == "" && oembed.AuthorName != "" {
		result.Description = fmt.Sprintf("By %s", oembed.AuthorName)
	}

	return result
}
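
// As an illustration (values assumed, not taken from a real response), a YouTube
// "video" payload for a youtube.com link maps roughly like this:
//
//	oEmbedResponse{Type: "video", Title: "Example video", ProviderName: "YouTube", Width: 200, Height: 113}
//	-> &UnfurlResult{Type: "video", Title: "Example video", Provider: "youtube", Domain: "youtube.com", Width: 200, Height: 113}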

// openGraphData represents OpenGraph metadata extracted from HTML
type openGraphData struct {
	Title       string
	Description string
	Image       string
	URL         string
}

// fetchOpenGraph fetches OpenGraph metadata from a URL
func fetchOpenGraph(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) {
	// Create HTTP request
	req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	req.Header.Set("User-Agent", userAgent)

	// Create HTTP client with timeout
	client := &http.Client{Timeout: timeout}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch URL: %w", err)
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP request returned status %d", resp.StatusCode)
	}

	// Read response body (limit to 10MB to prevent abuse)
	limitedReader := io.LimitReader(resp.Body, 10*1024*1024)
	body, err := io.ReadAll(limitedReader)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	// Parse OpenGraph metadata
	og, err := parseOpenGraph(string(body))
	if err != nil {
		return nil, fmt.Errorf("failed to parse OpenGraph metadata: %w", err)
	}

	// Build UnfurlResult
	result := &UnfurlResult{
		Type:         "article", // Default type for OpenGraph
		URI:          urlStr,
		Title:        og.Title,
		Description:  og.Description,
		ThumbnailURL: normalizeURL(og.Image),
		Provider:     "opengraph",
		Domain:       extractDomain(urlStr),
	}

	// Use og:url if available and it is a valid HTTP/HTTPS URL
	if og.URL != "" && isSupported(og.URL) {
		result.URI = og.URL
	}

	return result, nil
}
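
// exampleUnfurlFlow is a non-authoritative sketch (not wired into the package) of how the
// helpers above are intended to compose: prefer oEmbed when the domain has a known endpoint,
// otherwise fall back to generic OpenGraph scraping. The timeout and User-Agent values are
// illustrative assumptions, not the package's real defaults. kite.kagi.com links presumably
// go through fetchKagiKite (defined below) instead.
func exampleUnfurlFlow(ctx context.Context, urlStr string) (*UnfurlResult, error) {
	const timeout = 5 * time.Second                 // assumed value
	const userAgent = "unfurl-example/0.1 (sketch)" // assumed value

	if !isSupported(urlStr) {
		return nil, fmt.Errorf("unsupported URL scheme: %s", urlStr)
	}

	// Try the oEmbed path first for known providers.
	if isOEmbedProvider(urlStr) {
		if oembed, err := fetchOEmbed(ctx, urlStr, timeout, userAgent); err == nil {
			return mapOEmbedToResult(oembed, urlStr), nil
		}
		// On oEmbed failure, fall through to OpenGraph scraping.
	}

	return fetchOpenGraph(ctx, urlStr, timeout, userAgent)
}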

// parseOpenGraph extracts OpenGraph metadata from HTML
func parseOpenGraph(htmlContent string) (*openGraphData, error) {
	og := &openGraphData{}
	doc, err := html.Parse(strings.NewReader(htmlContent))
	if err != nil {
		// html.Parse is very tolerant of malformed markup; if it does fail,
		// return empty metadata rather than an error so callers can fall back gracefully
		return og, nil
	}

	// Extract OpenGraph tags and fallbacks
	var pageTitle string
	var metaDescription string

	var traverse func(*html.Node)
	traverse = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch n.Data {
			case "meta":
				property := getAttr(n, "property")
				name := getAttr(n, "name")
				content := getAttr(n, "content")

				// OpenGraph tags
				if strings.HasPrefix(property, "og:") {
					switch property {
					case "og:title":
						if og.Title == "" {
							og.Title = content
						}
					case "og:description":
						if og.Description == "" {
							og.Description = content
						}
					case "og:image":
						if og.Image == "" {
							og.Image = content
						}
					case "og:url":
						if og.URL == "" {
							og.URL = content
						}
					}
				}

				// Fallback meta tags
				if name == "description" && metaDescription == "" {
					metaDescription = content
				}

			case "title":
				if pageTitle == "" && n.FirstChild != nil {
					pageTitle = n.FirstChild.Data
				}
			}
		}

		for c := n.FirstChild; c != nil; c = c.NextSibling {
			traverse(c)
		}
	}

	traverse(doc)

	// Apply fallbacks
	if og.Title == "" {
		og.Title = pageTitle
	}
	if og.Description == "" {
		og.Description = metaDescription
	}

	return og, nil
}
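
// For instance, given markup like this illustrative snippet:
//
//	<head>
//	  <title>Fallback Title</title>
//	  <meta property="og:title" content="OG Title">
//	  <meta property="og:image" content="//cdn.example.com/pic.jpg">
//	  <meta name="description" content="Fallback description">
//	</head>
//
// parseOpenGraph returns Title "OG Title", Image "//cdn.example.com/pic.jpg", and
// Description "Fallback description" (taken from the name="description" fallback);
// the protocol-relative image is later normalized to https by normalizeURL.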

// getAttr gets an attribute value from an HTML node
func getAttr(n *html.Node, key string) string {
	for _, attr := range n.Attr {
		if attr.Key == key {
			return attr.Val
		}
	}
	return ""
}

// fetchKagiKite handles special unfurling for Kagi Kite news pages
// Kagi Kite pages use client-side rendering, so og:image tags aren't available at SSR time
// Instead, we parse the HTML to extract the story image from the page content
func fetchKagiKite(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) {
	// Create HTTP request
	req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	req.Header.Set("User-Agent", userAgent)

	// Create HTTP client with timeout
	client := &http.Client{Timeout: timeout}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch URL: %w", err)
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP request returned status %d", resp.StatusCode)
	}

	// Limit response size to 10MB
	limitedReader := io.LimitReader(resp.Body, 10*1024*1024)

	// Parse HTML
	doc, err := html.Parse(limitedReader)
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}

	result := &UnfurlResult{
		Type:     "article",
		URI:      urlStr,
		Domain:   "kite.kagi.com",
		Provider: "kagi",
	}

	// First try OpenGraph tags (in case they get added in the future)
	var findOG func(*html.Node)
	findOG = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "meta" {
			var property, content string
			for _, attr := range n.Attr {
				if attr.Key == "property" {
					property = attr.Val
				} else if attr.Key == "content" {
					content = attr.Val
				}
			}

			switch property {
			case "og:title":
				if result.Title == "" {
					result.Title = content
				}
			case "og:description":
				if result.Description == "" {
					result.Description = content
				}
			case "og:image":
				if result.ThumbnailURL == "" {
					result.ThumbnailURL = content
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			findOG(c)
		}
	}
	findOG(doc)

	// Fallback: Extract from page content
	// Look for images with kagiproxy.com URLs (Kagi's image proxy)
	// Note: Skip the first image as it's often a shared header/logo
	if result.ThumbnailURL == "" {
		var images []struct {
			url string
			alt string
		}

		var findImg func(*html.Node)
		findImg = func(n *html.Node) {
			if n.Type == html.ElementNode && n.Data == "img" {
				for _, attr := range n.Attr {
					if attr.Key == "src" && strings.Contains(attr.Val, "kagiproxy.com") {
						// Get alt text if available
						var altText string
						for _, a := range n.Attr {
							if a.Key == "alt" {
								altText = a.Val
								break
							}
						}
						images = append(images, struct {
							url string
							alt string
						}{url: attr.Val, alt: altText})
						break
					}
				}
			}
			for c := n.FirstChild; c != nil; c = c.NextSibling {
				findImg(c)
			}
		}
		findImg(doc)

		// Skip first image (often shared header/logo), use second if available
		if len(images) > 1 {
			result.ThumbnailURL = images[1].url
			if result.Description == "" && images[1].alt != "" {
				result.Description = images[1].alt
			}
		} else if len(images) == 1 {
			// Only one image found, use it
			result.ThumbnailURL = images[0].url
			if result.Description == "" && images[0].alt != "" {
				result.Description = images[0].alt
			}
		}
	}

	// Fallback to <title> tag if og:title not found
	if result.Title == "" {
		var findTitle func(*html.Node) string
		findTitle = func(n *html.Node) string {
			if n.Type == html.ElementNode && n.Data == "title" {
				if n.FirstChild != nil && n.FirstChild.Type == html.TextNode {
					return n.FirstChild.Data
				}
			}
			for c := n.FirstChild; c != nil; c = c.NextSibling {
				if title := findTitle(c); title != "" {
					return title
				}
			}
			return ""
		}
		result.Title = findTitle(doc)
	}

	// If still no image, return error
	if result.ThumbnailURL == "" {
		return nil, fmt.Errorf("no image found in Kagi page")
	}

	return result, nil
}
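
// Presumably callers route kite.kagi.com links here rather than through the generic
// OpenGraph path; a hedged usage sketch (URL, timeout, and User-Agent are illustrative):
//
//	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
//	defer cancel()
//	result, err := fetchKagiKite(ctx, "https://kite.kagi.com/...", 5*time.Second, "unfurl-example/0.1")
//	if err != nil {
//		// No usable image was found; fall back to fetchOpenGraph or give up.
//	}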