""" Kagi News HTML description parser. Parses the HTML content from RSS feed item descriptions into structured data. """ import re import logging from typing import Dict, List, Optional from datetime import datetime from bs4 import BeautifulSoup from urllib.parse import urlparse from src.models import KagiStory, Perspective, Quote, Source logger = logging.getLogger(__name__) class KagiHTMLParser: """Parses Kagi News HTML descriptions into structured data.""" def parse(self, html_description: str) -> Dict: """ Parse HTML description into structured data. Args: html_description: HTML content from RSS item description Returns: Dictionary with extracted data: - summary: str - image_url: Optional[str] - image_alt: Optional[str] - highlights: List[str] - quote: Optional[Dict[str, str]] - perspectives: List[Dict] - sources: List[Dict] """ soup = BeautifulSoup(html_description, 'html.parser') return { 'summary': self._extract_summary(soup), 'image_url': self._extract_image_url(soup), 'image_alt': self._extract_image_alt(soup), 'highlights': self._extract_highlights(soup), 'quote': self._extract_quote(soup), 'perspectives': self._extract_perspectives(soup), 'sources': self._extract_sources(soup), } def parse_to_story( self, title: str, link: str, guid: str, pub_date: datetime, categories: List[str], html_description: str ) -> KagiStory: """ Parse HTML and create a KagiStory object. Args: title: Story title link: Story URL guid: Unique identifier pub_date: Publication date categories: List of categories html_description: HTML content from description Returns: KagiStory object """ parsed = self.parse(html_description) # Convert parsed data to model objects perspectives = [ Perspective( actor=p['actor'], description=p['description'], source_url=p['source_url'], source_name=p.get('source_name', '') ) for p in parsed['perspectives'] ] sources = [ Source( title=s['title'], url=s['url'], domain=s['domain'] ) for s in parsed['sources'] ] quote = None if parsed['quote']: quote = Quote( text=parsed['quote']['text'], attribution=parsed['quote']['attribution'] ) return KagiStory( title=title, link=link, guid=guid, pub_date=pub_date, categories=categories, summary=parsed['summary'], highlights=parsed['highlights'], perspectives=perspectives, quote=quote, sources=sources, image_url=parsed['image_url'], image_alt=parsed['image_alt'] ) def _extract_summary(self, soup: BeautifulSoup) -> str: """Extract summary from first

tag.""" p_tag = soup.find('p') if p_tag: return p_tag.get_text(strip=True) return "" def _extract_image_url(self, soup: BeautifulSoup) -> Optional[str]: """Extract image URL from tag.""" img_tag = soup.find('img') if img_tag and img_tag.get('src'): return img_tag['src'] return None def _extract_image_alt(self, soup: BeautifulSoup) -> Optional[str]: """Extract image alt text from tag.""" img_tag = soup.find('img') if img_tag and img_tag.get('alt'): return img_tag['alt'] return None def _extract_highlights(self, soup: BeautifulSoup) -> List[str]: """Extract highlights list from H3 section.""" highlights = [] # Find "Highlights:" h3 tag h3_tags = soup.find_all('h3') for h3 in h3_tags: if 'Highlights' in h3.get_text(): # Get the