A community-based topic aggregation platform built on atproto
1""" 2Kagi News HTML description parser. 3 4Parses the HTML content from RSS feed item descriptions 5into structured data. 6""" 7import re 8import logging 9from typing import Dict, List, Optional 10from datetime import datetime 11from bs4 import BeautifulSoup 12from urllib.parse import urlparse 13 14from src.models import KagiStory, Perspective, Quote, Source 15 16logger = logging.getLogger(__name__) 17 18 19class KagiHTMLParser: 20 """Parses Kagi News HTML descriptions into structured data.""" 21 22 def parse(self, html_description: str) -> Dict: 23 """ 24 Parse HTML description into structured data. 25 26 Args: 27 html_description: HTML content from RSS item description 28 29 Returns: 30 Dictionary with extracted data: 31 - summary: str 32 - image_url: Optional[str] 33 - image_alt: Optional[str] 34 - highlights: List[str] 35 - quote: Optional[Dict[str, str]] 36 - perspectives: List[Dict] 37 - sources: List[Dict] 38 """ 39 soup = BeautifulSoup(html_description, 'html.parser') 40 41 return { 42 'summary': self._extract_summary(soup), 43 'image_url': self._extract_image_url(soup), 44 'image_alt': self._extract_image_alt(soup), 45 'highlights': self._extract_highlights(soup), 46 'quote': self._extract_quote(soup), 47 'perspectives': self._extract_perspectives(soup), 48 'sources': self._extract_sources(soup), 49 } 50 51 def parse_to_story( 52 self, 53 title: str, 54 link: str, 55 guid: str, 56 pub_date: datetime, 57 categories: List[str], 58 html_description: str 59 ) -> KagiStory: 60 """ 61 Parse HTML and create a KagiStory object. 62 63 Args: 64 title: Story title 65 link: Story URL 66 guid: Unique identifier 67 pub_date: Publication date 68 categories: List of categories 69 html_description: HTML content from description 70 71 Returns: 72 KagiStory object 73 """ 74 parsed = self.parse(html_description) 75 76 # Convert parsed data to model objects 77 perspectives = [ 78 Perspective( 79 actor=p['actor'], 80 description=p['description'], 81 source_url=p['source_url'], 82 source_name=p.get('source_name', '') 83 ) 84 for p in parsed['perspectives'] 85 ] 86 87 sources = [ 88 Source( 89 title=s['title'], 90 url=s['url'], 91 domain=s['domain'] 92 ) 93 for s in parsed['sources'] 94 ] 95 96 quote = None 97 if parsed['quote']: 98 quote = Quote( 99 text=parsed['quote']['text'], 100 attribution=parsed['quote']['attribution'] 101 ) 102 103 return KagiStory( 104 title=title, 105 link=link, 106 guid=guid, 107 pub_date=pub_date, 108 categories=categories, 109 summary=parsed['summary'], 110 highlights=parsed['highlights'], 111 perspectives=perspectives, 112 quote=quote, 113 sources=sources, 114 image_url=parsed['image_url'], 115 image_alt=parsed['image_alt'] 116 ) 117 118 def _extract_summary(self, soup: BeautifulSoup) -> str: 119 """Extract summary from first <p> tag.""" 120 p_tag = soup.find('p') 121 if p_tag: 122 return p_tag.get_text(strip=True) 123 return "" 124 125 def _extract_image_url(self, soup: BeautifulSoup) -> Optional[str]: 126 """Extract image URL from <img> tag.""" 127 img_tag = soup.find('img') 128 if img_tag and img_tag.get('src'): 129 return img_tag['src'] 130 return None 131 132 def _extract_image_alt(self, soup: BeautifulSoup) -> Optional[str]: 133 """Extract image alt text from <img> tag.""" 134 img_tag = soup.find('img') 135 if img_tag and img_tag.get('alt'): 136 return img_tag['alt'] 137 return None 138 139 def _extract_highlights(self, soup: BeautifulSoup) -> List[str]: 140 """Extract highlights list from H3 section.""" 141 highlights = [] 142 143 # Find "Highlights:" h3 tag 144 
h3_tags = soup.find_all('h3') 145 for h3 in h3_tags: 146 if 'Highlights' in h3.get_text(): 147 # Get the <ul> that follows this h3 148 ul = h3.find_next_sibling('ul') 149 if ul: 150 for li in ul.find_all('li'): 151 highlights.append(li.get_text(strip=True)) 152 break 153 154 return highlights 155 156 def _extract_quote(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]: 157 """Extract quote from <blockquote> tag.""" 158 blockquote = soup.find('blockquote') 159 if not blockquote: 160 return None 161 162 text = blockquote.get_text(strip=True) 163 164 # Try to split on " - " to separate quote from attribution 165 if ' - ' in text: 166 quote_text, attribution = text.rsplit(' - ', 1) 167 return { 168 'text': quote_text.strip(), 169 'attribution': attribution.strip() 170 } 171 172 # If no attribution found, entire text is the quote 173 # Try to infer attribution from context (often mentioned in highlights/perspectives) 174 return { 175 'text': text, 176 'attribution': self._infer_quote_attribution(soup, text) 177 } 178 179 def _infer_quote_attribution(self, soup: BeautifulSoup, quote_text: str) -> str: 180 """ 181 Try to infer quote attribution from context. 182 183 This is a fallback when quote doesn't have explicit attribution. 184 """ 185 # For now, check if any perspective mentions similar keywords 186 perspectives_section = soup.find('h3', string=re.compile(r'Perspectives')) 187 if perspectives_section: 188 ul = perspectives_section.find_next_sibling('ul') 189 if ul: 190 for li in ul.find_all('li'): 191 li_text = li.get_text() 192 # Extract actor name (before first colon) 193 if ':' in li_text: 194 actor = li_text.split(':', 1)[0].strip() 195 return actor 196 197 return "Unknown" 198 199 def _extract_perspectives(self, soup: BeautifulSoup) -> List[Dict]: 200 """Extract perspectives from H3 section.""" 201 perspectives = [] 202 203 # Find "Perspectives:" h3 tag 204 h3_tags = soup.find_all('h3') 205 for h3 in h3_tags: 206 if 'Perspectives' in h3.get_text(): 207 # Get the <ul> that follows this h3 208 ul = h3.find_next_sibling('ul') 209 if ul: 210 for li in ul.find_all('li'): 211 perspective = self._parse_perspective_li(li) 212 if perspective: 213 perspectives.append(perspective) 214 break 215 216 return perspectives 217 218 def _parse_perspective_li(self, li) -> Optional[Dict]: 219 """ 220 Parse a single perspective <li> element. 221 222 Format: "Actor: Description. (Source)" 223 """ 224 # Get full text 225 full_text = li.get_text() 226 227 # Extract actor (before first colon) 228 if ':' not in full_text: 229 return None 230 231 actor, rest = full_text.split(':', 1) 232 actor = actor.strip() 233 234 # Find the <a> tag for source URL and name 235 a_tag = li.find('a') 236 source_url = a_tag['href'] if a_tag and a_tag.get('href') else "" 237 source_name = a_tag.get_text(strip=True) if a_tag else "" 238 239 # Extract description (between colon and source link) 240 # Remove the source citation part in parentheses 241 description = rest 242 243 # Remove source citation like "(The Straits Times)" from description 244 if a_tag: 245 # Remove the link text and surrounding parentheses 246 link_text = a_tag.get_text() 247 description = description.replace(f"({link_text})", "").strip() 248 249 # Clean up trailing period 250 description = description.strip('. 
') 251 252 return { 253 'actor': actor, 254 'description': description, 255 'source_url': source_url, 256 'source_name': source_name 257 } 258 259 def _extract_sources(self, soup: BeautifulSoup) -> List[Dict]: 260 """Extract sources list from H3 section.""" 261 sources = [] 262 263 # Find "Sources:" h3 tag 264 h3_tags = soup.find_all('h3') 265 for h3 in h3_tags: 266 if 'Sources' in h3.get_text(): 267 # Get the <ul> that follows this h3 268 ul = h3.find_next_sibling('ul') 269 if ul: 270 for li in ul.find_all('li'): 271 source = self._parse_source_li(li) 272 if source: 273 sources.append(source) 274 break 275 276 return sources 277 278 def _parse_source_li(self, li) -> Optional[Dict]: 279 """ 280 Parse a single source <li> element. 281 282 Format: "<a href='...'>Title</a> - domain.com" 283 """ 284 a_tag = li.find('a') 285 if not a_tag or not a_tag.get('href'): 286 return None 287 288 title = a_tag.get_text(strip=True) 289 url = a_tag['href'] 290 291 # Extract domain from URL 292 parsed_url = urlparse(url) 293 domain = parsed_url.netloc 294 295 # Remove "www." prefix if present 296 if domain.startswith('www.'): 297 domain = domain[4:] 298 299 return { 300 'title': title, 301 'url': url, 302 'domain': domain 303 }
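

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): the snippet below is a
# hypothetical description, hand-written to match the layout the extractors
# above expect -- a leading <p> summary, an <img>, "Highlights:",
# "Perspectives:" and "Sources:" <h3> headings each followed by a <ul>, and a
# <blockquote>. Running the module directly parses it and prints the result.
# (Running this requires src.models to be importable for the top-level import.)
if __name__ == '__main__':
    import json

    sample_description = """
    <p>Example summary of the story.</p>
    <img src="https://example.com/image.jpg" alt="Example alt text">
    <h3>Highlights:</h3>
    <ul><li>First key development.</li><li>Second key development.</li></ul>
    <blockquote>We are watching this closely. - Example Official</blockquote>
    <h3>Perspectives:</h3>
    <ul><li>Example Agency: Urged caution in interpreting the data.
    (<a href="https://news.example.com/article">Example News</a>)</li></ul>
    <h3>Sources:</h3>
    <ul><li><a href="https://www.example.com/article">Example article title</a>
    - example.com</li></ul>
    """

    print(json.dumps(KagiHTMLParser().parse(sample_description), indent=2))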