A community-based topic aggregation platform built on atproto
"""
Kagi News HTML description parser.

Parses the HTML content from RSS feed item descriptions
into structured data.
"""
import re
import logging
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from src.models import KagiStory, Perspective, Quote, Source

logger = logging.getLogger(__name__)


class KagiHTMLParser:
    """Parses Kagi News HTML descriptions into structured data."""

    def parse(self, html_description: str) -> Dict:
        """
        Parse HTML description into structured data.

        Args:
            html_description: HTML content from RSS item description

        Returns:
            Dictionary with extracted data:
                - summary: str
                - image_url: Optional[str]
                - image_alt: Optional[str]
                - highlights: List[str]
                - quote: Optional[Dict[str, str]]
                - perspectives: List[Dict]
                - sources: List[Dict]
        """
        soup = BeautifulSoup(html_description, 'html.parser')

        return {
            'summary': self._extract_summary(soup),
            'image_url': self._extract_image_url(soup),
            'image_alt': self._extract_image_alt(soup),
            'highlights': self._extract_highlights(soup),
            'quote': self._extract_quote(soup),
            'perspectives': self._extract_perspectives(soup),
            'sources': self._extract_sources(soup),
        }

    def parse_to_story(
        self,
        title: str,
        link: str,
        guid: str,
        pub_date: datetime,
        categories: List[str],
        html_description: str
    ) -> KagiStory:
        """
        Parse HTML and create a KagiStory object.

        Args:
            title: Story title
            link: Story URL
            guid: Unique identifier
            pub_date: Publication date
            categories: List of categories
            html_description: HTML content from description

        Returns:
            KagiStory object
        """
        parsed = self.parse(html_description)

        # Convert parsed data to model objects
        perspectives = [
            Perspective(
                actor=p['actor'],
                description=p['description'],
                source_url=p['source_url']
            )
            for p in parsed['perspectives']
        ]

        sources = [
            Source(
                title=s['title'],
                url=s['url'],
                domain=s['domain']
            )
            for s in parsed['sources']
        ]

        quote = None
        if parsed['quote']:
            quote = Quote(
                text=parsed['quote']['text'],
                attribution=parsed['quote']['attribution']
            )

        return KagiStory(
            title=title,
            link=link,
            guid=guid,
            pub_date=pub_date,
            categories=categories,
            summary=parsed['summary'],
            highlights=parsed['highlights'],
            perspectives=perspectives,
            quote=quote,
            sources=sources,
            image_url=parsed['image_url'],
            image_alt=parsed['image_alt']
        )

    def _extract_summary(self, soup: BeautifulSoup) -> str:
        """Extract summary from first <p> tag."""
        p_tag = soup.find('p')
        if p_tag:
            return p_tag.get_text(strip=True)
        return ""

    def _extract_image_url(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract image URL from <img> tag."""
        img_tag = soup.find('img')
        if img_tag and img_tag.get('src'):
            return img_tag['src']
        return None

    def _extract_image_alt(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract image alt text from <img> tag."""
        img_tag = soup.find('img')
        if img_tag and img_tag.get('alt'):
            return img_tag['alt']
        return None
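
    # The section extractors below assume the description HTML roughly
    # follows this shape. It is reconstructed from the selectors used in
    # this class, not from documented Kagi feed markup, so treat it as an
    # assumption:
    #
    #   <p>Summary text...</p>
    #   <img src="..." alt="...">
    #   <h3>Highlights:</h3>
    #   <ul><li>...</li></ul>
    #   <blockquote>Quote text - Attribution</blockquote>
    #   <h3>Perspectives:</h3>
    #   <ul><li>Actor: Description. (<a href="...">Source</a>)</li></ul>
    #   <h3>Sources:</h3>
    #   <ul><li><a href="...">Title</a></li></ul>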

    def _extract_highlights(self, soup: BeautifulSoup) -> List[str]:
        """Extract highlights list from H3 section."""
        highlights = []

        # Find the "Highlights:" h3 tag
        h3_tags = soup.find_all('h3')
        for h3 in h3_tags:
            if 'Highlights' in h3.get_text():
                # Get the <ul> that follows this h3
                ul = h3.find_next_sibling('ul')
                if ul:
                    for li in ul.find_all('li'):
                        highlights.append(li.get_text(strip=True))
                break

        return highlights

    def _extract_quote(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]:
        """Extract quote from <blockquote> tag."""
        blockquote = soup.find('blockquote')
        if not blockquote:
            return None

        text = blockquote.get_text(strip=True)

        # Try to split on " - " to separate the quote from its attribution
        if ' - ' in text:
            quote_text, attribution = text.rsplit(' - ', 1)
            return {
                'text': quote_text.strip(),
                'attribution': attribution.strip()
            }

        # No explicit attribution: the entire text is the quote. Try to
        # infer the attribution from context (the speaker is often
        # mentioned in the highlights or perspectives).
        return {
            'text': text,
            'attribution': self._infer_quote_attribution(soup, text)
        }

    def _infer_quote_attribution(self, soup: BeautifulSoup, quote_text: str) -> str:
        """
        Try to infer quote attribution from context.

        This is a fallback for when the quote has no explicit attribution.
        """
        # For now, simply fall back to the actor of the first perspective
        # entry (quote_text is unused until keyword matching is implemented)
        perspectives_section = soup.find('h3', string=re.compile(r'Perspectives'))
        if perspectives_section:
            ul = perspectives_section.find_next_sibling('ul')
            if ul:
                for li in ul.find_all('li'):
                    li_text = li.get_text()
                    # Extract actor name (before first colon)
                    if ':' in li_text:
                        actor = li_text.split(':', 1)[0].strip()
                        return actor

        return "Unknown"

    def _extract_perspectives(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract perspectives from H3 section."""
        perspectives = []

        # Find the "Perspectives:" h3 tag
        h3_tags = soup.find_all('h3')
        for h3 in h3_tags:
            if 'Perspectives' in h3.get_text():
                # Get the <ul> that follows this h3
                ul = h3.find_next_sibling('ul')
                if ul:
                    for li in ul.find_all('li'):
                        perspective = self._parse_perspective_li(li)
                        if perspective:
                            perspectives.append(perspective)
                break

        return perspectives
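
    # Illustrative input/output for _parse_perspective_li below. The actor
    # and article URL are hypothetical; the source name comes from the
    # comment in the method body:
    #
    #   <li>China: Warns against interference.
    #       (<a href="https://example.com/article">The Straits Times</a>)</li>
    #   -> {'actor': 'China',
    #       'description': 'Warns against interference',
    #       'source_url': 'https://example.com/article'}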

    def _parse_perspective_li(self, li) -> Optional[Dict]:
        """
        Parse a single perspective <li> element.

        Format: "Actor: Description. (Source)"
        """
        # Get full text
        full_text = li.get_text()

        # Extract actor (before first colon)
        if ':' not in full_text:
            return None

        actor, rest = full_text.split(':', 1)
        actor = actor.strip()

        # Find the <a> tag for the source URL
        a_tag = li.find('a')
        source_url = a_tag['href'] if a_tag and a_tag.get('href') else ""

        # The description sits between the colon and the source link
        description = rest

        # Remove a source citation like "(The Straits Times)" from the description
        if a_tag:
            # Remove the link text and surrounding parentheses
            link_text = a_tag.get_text()
            description = description.replace(f"({link_text})", "").strip()

        # Clean up the trailing period and whitespace
        description = description.strip('. ')

        return {
            'actor': actor,
            'description': description,
            'source_url': source_url
        }

    def _extract_sources(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract sources list from H3 section."""
        sources = []

        # Find the "Sources:" h3 tag
        h3_tags = soup.find_all('h3')
        for h3 in h3_tags:
            if 'Sources' in h3.get_text():
                # Get the <ul> that follows this h3
                ul = h3.find_next_sibling('ul')
                if ul:
                    for li in ul.find_all('li'):
                        source = self._parse_source_li(li)
                        if source:
                            sources.append(source)
                break

        return sources

    def _parse_source_li(self, li) -> Optional[Dict]:
        """
        Parse a single source <li> element.

        Format: "<a href='...'>Title</a> - domain.com"
        """
        a_tag = li.find('a')
        if not a_tag or not a_tag.get('href'):
            return None

        title = a_tag.get_text(strip=True)
        url = a_tag['href']

        # Extract the domain from the URL
        parsed_url = urlparse(url)
        domain = parsed_url.netloc

        # Remove the "www." prefix if present
        if domain.startswith('www.'):
            domain = domain[4:]

        return {
            'title': title,
            'url': url,
            'domain': domain
        }
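

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the parser): run one hypothetical
# RSS item description through the parser. The sample HTML follows the
# structure the extractors above expect; all names and URLs are made up.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from pprint import pprint

    sample_description = """
    <p>Example summary of a developing story.</p>
    <h3>Highlights:</h3>
    <ul><li>First key point</li><li>Second key point</li></ul>
    <blockquote>We remain committed to the agreement - Example Official</blockquote>
    <h3>Perspectives:</h3>
    <ul><li>Analysts: See the move as defensive.
        (<a href="https://example.com/analysis">Example Times</a>)</li></ul>
    <h3>Sources:</h3>
    <ul><li><a href="https://www.example.com/story">Full story</a></li></ul>
    """

    pprint(KagiHTMLParser().parse(sample_description))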