internal/core/unfurl/providers.go at feat/vote-xrpc-endpoints · bretton.dev/coves

bretton.dev / coves
A community based topic aggregation platform built on atproto
coves / internal / core / unfurl / providers.go
at feat/vote-xrpc-endpoints 12 kB view raw
  1package unfurl
  2
  3import (
  4	"context"
  5	"encoding/json"
  6	"fmt"
  7	"io"
  8	"net/http"
  9	"net/url"
 10	"strings"
 11	"time"
 12
 13	"golang.org/x/net/html"
 14)
 15
 16// Provider configuration
 17var oEmbedEndpoints = map[string]string{
 18	"streamable.com": "https://api.streamable.com/oembed",
 19	"youtube.com":    "https://www.youtube.com/oembed",
 20	"youtu.be":       "https://www.youtube.com/oembed",
 21	"reddit.com":     "https://www.reddit.com/oembed",
 22}
 23
 24// oEmbedResponse represents a standard oEmbed response
 25type oEmbedResponse struct {
 26	ThumbnailURL    string `json:"thumbnail_url"`
 27	Version         string `json:"version"`
 28	Title           string `json:"title"`
 29	AuthorName      string `json:"author_name"`
 30	ProviderName    string `json:"provider_name"`
 31	ProviderURL     string `json:"provider_url"`
 32	Type            string `json:"type"`
 33	HTML            string `json:"html"`
 34	Description     string `json:"description"`
 35	ThumbnailWidth  int    `json:"thumbnail_width"`
 36	ThumbnailHeight int    `json:"thumbnail_height"`
 37	Width           int    `json:"width"`
 38	Height          int    `json:"height"`
 39}
 40
 41// extractDomain extracts the domain from a URL
 42func extractDomain(urlStr string) string {
 43	parsed, err := url.Parse(urlStr)
 44	if err != nil {
 45		return ""
 46	}
 47	// Remove www. prefix
 48	domain := strings.TrimPrefix(parsed.Host, "www.")
 49	return domain
 50}
 51
 52// isSupported checks if this is a valid HTTP/HTTPS URL
 53func isSupported(urlStr string) bool {
 54	parsed, err := url.Parse(urlStr)
 55	if err != nil {
 56		return false
 57	}
 58	scheme := strings.ToLower(parsed.Scheme)
 59	return scheme == "http" || scheme == "https"
 60}
 61
 62// isOEmbedProvider checks if we have an oEmbed endpoint for this URL
 63func isOEmbedProvider(urlStr string) bool {
 64	domain := extractDomain(urlStr)
 65	_, exists := oEmbedEndpoints[domain]
 66	return exists
 67}
 68
 69// fetchOEmbed fetches oEmbed data from the provider
 70func fetchOEmbed(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*oEmbedResponse, error) {
 71	domain := extractDomain(urlStr)
 72	endpoint, exists := oEmbedEndpoints[domain]
 73	if !exists {
 74		return nil, fmt.Errorf("no oEmbed endpoint for domain: %s", domain)
 75	}
 76
 77	// Build oEmbed request URL
 78	oembedURL := fmt.Sprintf("%s?url=%s&format=json", endpoint, url.QueryEscape(urlStr))
 79
 80	// Create HTTP request
 81	req, err := http.NewRequestWithContext(ctx, "GET", oembedURL, nil)
 82	if err != nil {
 83		return nil, fmt.Errorf("failed to create oEmbed request: %w", err)
 84	}
 85
 86	req.Header.Set("User-Agent", userAgent)
 87
 88	// Create HTTP client with timeout
 89	client := &http.Client{Timeout: timeout}
 90	resp, err := client.Do(req)
 91	if err != nil {
 92		return nil, fmt.Errorf("failed to fetch oEmbed data: %w", err)
 93	}
 94	defer func() { _ = resp.Body.Close() }()
 95
 96	if resp.StatusCode != http.StatusOK {
 97		return nil, fmt.Errorf("oEmbed endpoint returned status %d", resp.StatusCode)
 98	}
 99
100	// Parse JSON response
101	var oembed oEmbedResponse
102	if err := json.NewDecoder(resp.Body).Decode(&oembed); err != nil {
103		return nil, fmt.Errorf("failed to parse oEmbed response: %w", err)
104	}
105
106	return &oembed, nil
107}
108
// normalizeURL gives protocol-relative URLs an explicit https scheme and
// returns every other input untouched.
//
//	"//example.com/image.jpg"       -> "https://example.com/image.jpg"
//	"https://example.com/image.jpg" -> "https://example.com/image.jpg"
func normalizeURL(urlStr string) string {
	if !strings.HasPrefix(urlStr, "//") {
		return urlStr
	}
	return "https:" + urlStr
}
120
121// mapOEmbedToResult converts oEmbed response to UnfurlResult
122func mapOEmbedToResult(oembed *oEmbedResponse, originalURL string) *UnfurlResult {
123	result := &UnfurlResult{
124		URI:          originalURL,
125		Title:        oembed.Title,
126		Description:  oembed.Description,
127		ThumbnailURL: normalizeURL(oembed.ThumbnailURL),
128		Provider:     strings.ToLower(oembed.ProviderName),
129		Domain:       extractDomain(originalURL),
130		Width:        oembed.Width,
131		Height:       oembed.Height,
132	}
133
134	// Map oEmbed type to our embedType
135	switch oembed.Type {
136	case "video":
137		result.Type = "video"
138	case "photo":
139		result.Type = "image"
140	default:
141		result.Type = "article"
142	}
143
144	// If no description but we have author name, use that
145	if result.Description == "" && oembed.AuthorName != "" {
146		result.Description = fmt.Sprintf("By %s", oembed.AuthorName)
147	}
148
149	return result
150}
151
// openGraphData holds the page metadata gathered by parseOpenGraph.
type openGraphData struct {
	// URL is the content of the og:url tag.
	URL string
	// Title is the og:title content, or the <title> text as a fallback.
	Title string
	// Description is the og:description content, or the description meta
	// tag as a fallback.
	Description string
	// Image is the content of the og:image tag.
	Image string
}
159
160// fetchOpenGraph fetches OpenGraph metadata from a URL
161func fetchOpenGraph(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) {
162	// Create HTTP request
163	req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
164	if err != nil {
165		return nil, fmt.Errorf("failed to create request: %w", err)
166	}
167
168	req.Header.Set("User-Agent", userAgent)
169
170	// Create HTTP client with timeout
171	client := &http.Client{Timeout: timeout}
172	resp, err := client.Do(req)
173	if err != nil {
174		return nil, fmt.Errorf("failed to fetch URL: %w", err)
175	}
176	defer func() { _ = resp.Body.Close() }()
177
178	if resp.StatusCode != http.StatusOK {
179		return nil, fmt.Errorf("HTTP request returned status %d", resp.StatusCode)
180	}
181
182	// Read response body (limit to 10MB to prevent abuse)
183	limitedReader := io.LimitReader(resp.Body, 10*1024*1024)
184	body, err := io.ReadAll(limitedReader)
185	if err != nil {
186		return nil, fmt.Errorf("failed to read response body: %w", err)
187	}
188
189	// Parse OpenGraph metadata
190	og, err := parseOpenGraph(string(body))
191	if err != nil {
192		return nil, fmt.Errorf("failed to parse OpenGraph metadata: %w", err)
193	}
194
195	// Build UnfurlResult
196	result := &UnfurlResult{
197		Type:         "article", // Default type for OpenGraph
198		URI:          urlStr,
199		Title:        og.Title,
200		Description:  og.Description,
201		ThumbnailURL: normalizeURL(og.Image),
202		Provider:     "opengraph",
203		Domain:       extractDomain(urlStr),
204	}
205
206	// Use og:url if available and valid
207	if og.URL != "" {
208		result.URI = og.URL
209	}
210
211	return result, nil
212}
213
214// parseOpenGraph extracts OpenGraph metadata from HTML
215func parseOpenGraph(htmlContent string) (*openGraphData, error) {
216	og := &openGraphData{}
217	doc, err := html.Parse(strings.NewReader(htmlContent))
218	if err != nil {
219		// Try best-effort parsing even with invalid HTML
220		return og, nil
221	}
222
223	// Extract OpenGraph tags and fallbacks
224	var pageTitle string
225	var metaDescription string
226
227	var traverse func(*html.Node)
228	traverse = func(n *html.Node) {
229		if n.Type == html.ElementNode {
230			switch n.Data {
231			case "meta":
232				property := getAttr(n, "property")
233				name := getAttr(n, "name")
234				content := getAttr(n, "content")
235
236				// OpenGraph tags
237				if strings.HasPrefix(property, "og:") {
238					switch property {
239					case "og:title":
240						if og.Title == "" {
241							og.Title = content
242						}
243					case "og:description":
244						if og.Description == "" {
245							og.Description = content
246						}
247					case "og:image":
248						if og.Image == "" {
249							og.Image = content
250						}
251					case "og:url":
252						if og.URL == "" {
253							og.URL = content
254						}
255					}
256				}
257
258				// Fallback meta tags
259				if name == "description" && metaDescription == "" {
260					metaDescription = content
261				}
262
263			case "title":
264				if pageTitle == "" && n.FirstChild != nil {
265					pageTitle = n.FirstChild.Data
266				}
267			}
268		}
269
270		for c := n.FirstChild; c != nil; c = c.NextSibling {
271			traverse(c)
272		}
273	}
274
275	traverse(doc)
276
277	// Apply fallbacks
278	if og.Title == "" {
279		og.Title = pageTitle
280	}
281	if og.Description == "" {
282		og.Description = metaDescription
283	}
284
285	return og, nil
286}
287
288// getAttr gets an attribute value from an HTML node
289func getAttr(n *html.Node, key string) string {
290	for _, attr := range n.Attr {
291		if attr.Key == key {
292			return attr.Val
293		}
294	}
295	return ""
296}
297
298// fetchKagiKite handles special unfurling for Kagi Kite news pages
299// Kagi Kite pages use client-side rendering, so og:image tags aren't available at SSR time
300// Instead, we parse the HTML to extract the story image from the page content
301func fetchKagiKite(ctx context.Context, urlStr string, timeout time.Duration, userAgent string) (*UnfurlResult, error) {
302	// Create HTTP request
303	req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
304	if err != nil {
305		return nil, fmt.Errorf("failed to create request: %w", err)
306	}
307
308	req.Header.Set("User-Agent", userAgent)
309
310	// Create HTTP client with timeout
311	client := &http.Client{Timeout: timeout}
312	resp, err := client.Do(req)
313	if err != nil {
314		return nil, fmt.Errorf("failed to fetch URL: %w", err)
315	}
316	defer func() { _ = resp.Body.Close() }()
317
318	if resp.StatusCode != http.StatusOK {
319		return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status)
320	}
321
322	// Limit response size to 10MB
323	limitedReader := io.LimitReader(resp.Body, 10*1024*1024)
324
325	// Parse HTML
326	doc, err := html.Parse(limitedReader)
327	if err != nil {
328		return nil, fmt.Errorf("failed to parse HTML: %w", err)
329	}
330
331	result := &UnfurlResult{
332		Type:     "article",
333		URI:      urlStr,
334		Domain:   "kite.kagi.com",
335		Provider: "kagi",
336	}
337
338	// First try OpenGraph tags (in case they get added in the future)
339	var findOG func(*html.Node)
340	findOG = func(n *html.Node) {
341		if n.Type == html.ElementNode && n.Data == "meta" {
342			var property, content string
343			for _, attr := range n.Attr {
344				if attr.Key == "property" {
345					property = attr.Val
346				} else if attr.Key == "content" {
347					content = attr.Val
348				}
349			}
350
351			switch property {
352			case "og:title":
353				if result.Title == "" {
354					result.Title = content
355				}
356			case "og:description":
357				if result.Description == "" {
358					result.Description = content
359				}
360			case "og:image":
361				if result.ThumbnailURL == "" {
362					result.ThumbnailURL = content
363				}
364			}
365		}
366		for c := n.FirstChild; c != nil; c = c.NextSibling {
367			findOG(c)
368		}
369	}
370	findOG(doc)
371
372	// Fallback: Extract from page content
373	// Look for images with kagiproxy.com URLs (Kagi's image proxy)
374	// Note: Skip the first image as it's often a shared header/logo
375	if result.ThumbnailURL == "" {
376		var images []struct {
377			url string
378			alt string
379		}
380
381		var findImg func(*html.Node)
382		findImg = func(n *html.Node) {
383			if n.Type == html.ElementNode && n.Data == "img" {
384				for _, attr := range n.Attr {
385					if attr.Key == "src" && strings.Contains(attr.Val, "kagiproxy.com") {
386						// Get alt text if available
387						var altText string
388						for _, a := range n.Attr {
389							if a.Key == "alt" {
390								altText = a.Val
391								break
392							}
393						}
394						images = append(images, struct {
395							url string
396							alt string
397						}{url: attr.Val, alt: altText})
398						break
399					}
400				}
401			}
402			for c := n.FirstChild; c != nil; c = c.NextSibling {
403				findImg(c)
404			}
405		}
406		findImg(doc)
407
408		// Skip first image (often shared header/logo), use second if available
409		if len(images) > 1 {
410			result.ThumbnailURL = images[1].url
411			if result.Description == "" && images[1].alt != "" {
412				result.Description = images[1].alt
413			}
414		} else if len(images) == 1 {
415			// Only one image found, use it
416			result.ThumbnailURL = images[0].url
417			if result.Description == "" && images[0].alt != "" {
418				result.Description = images[0].alt
419			}
420		}
421	}
422
423	// Fallback to <title> tag if og:title not found
424	if result.Title == "" {
425		var findTitle func(*html.Node) string
426		findTitle = func(n *html.Node) string {
427			if n.Type == html.ElementNode && n.Data == "title" {
428				if n.FirstChild != nil && n.FirstChild.Type == html.TextNode {
429					return n.FirstChild.Data
430				}
431			}
432			for c := n.FirstChild; c != nil; c = c.NextSibling {
433				if title := findTitle(c); title != "" {
434					return title
435				}
436			}
437			return ""
438		}
439		result.Title = findTitle(doc)
440	}
441
442	// If still no image, return error
443	if result.ThumbnailURL == "" {
444		return nil, fmt.Errorf("no image found in Kagi page")
445	}
446
447	return result, nil
448}