1"""
2Kagi News HTML description parser.
3
4Parses the HTML content from RSS feed item descriptions
5into structured data.
6"""
7import re
8import logging
9from typing import Dict, List, Optional
10from datetime import datetime
11from bs4 import BeautifulSoup
12from urllib.parse import urlparse
13
14from src.models import KagiStory, Perspective, Quote, Source
15
16logger = logging.getLogger(__name__)
17
18
19class KagiHTMLParser:
20 """Parses Kagi News HTML descriptions into structured data."""
21
22 def parse(self, html_description: str) -> Dict:
23 """
24 Parse HTML description into structured data.
25
26 Args:
27 html_description: HTML content from RSS item description
28
29 Returns:
30 Dictionary with extracted data:
31 - summary: str
32 - image_url: Optional[str]
33 - image_alt: Optional[str]
34 - highlights: List[str]
35 - quote: Optional[Dict[str, str]]
36 - perspectives: List[Dict]
37 - sources: List[Dict]
38 """
39 soup = BeautifulSoup(html_description, 'html.parser')
40
41 return {
42 'summary': self._extract_summary(soup),
43 'image_url': self._extract_image_url(soup),
44 'image_alt': self._extract_image_alt(soup),
45 'highlights': self._extract_highlights(soup),
46 'quote': self._extract_quote(soup),
47 'perspectives': self._extract_perspectives(soup),
48 'sources': self._extract_sources(soup),
49 }
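
    # Illustrative round trip (the HTML below is a hypothetical minimal
    # snippet, not a verbatim Kagi feed item):
    #
    #   KagiHTMLParser().parse("<p>Summary.</p>"
    #                          "<h3>Highlights:</h3><ul><li>One</li></ul>")
    #   -> {'summary': 'Summary.', 'image_url': None, 'image_alt': None,
    #       'highlights': ['One'], 'quote': None, 'perspectives': [],
    #       'sources': []}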

    def parse_to_story(
        self,
        title: str,
        link: str,
        guid: str,
        pub_date: datetime,
        categories: List[str],
        html_description: str
    ) -> KagiStory:
        """
        Parse HTML and create a KagiStory object.

        Args:
            title: Story title
            link: Story URL
            guid: Unique identifier
            pub_date: Publication date
            categories: List of categories
            html_description: HTML content from description

        Returns:
            KagiStory object
        """
        parsed = self.parse(html_description)

        # Convert parsed data to model objects
        perspectives = [
            Perspective(
                actor=p['actor'],
                description=p['description'],
                source_url=p['source_url']
            )
            for p in parsed['perspectives']
        ]

        sources = [
            Source(
                title=s['title'],
                url=s['url'],
                domain=s['domain']
            )
            for s in parsed['sources']
        ]

        quote = None
        if parsed['quote']:
            quote = Quote(
                text=parsed['quote']['text'],
                attribution=parsed['quote']['attribution']
            )

        return KagiStory(
            title=title,
            link=link,
            guid=guid,
            pub_date=pub_date,
            categories=categories,
            summary=parsed['summary'],
            highlights=parsed['highlights'],
            perspectives=perspectives,
            quote=quote,
            sources=sources,
            image_url=parsed['image_url'],
            image_alt=parsed['image_alt']
        )
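
    # Typical call site (all field values here are hypothetical):
    #
    #   story = parser.parse_to_story(
    #       title="Example headline",
    #       link="https://example.com/story",
    #       guid="example-guid-001",
    #       pub_date=datetime(2024, 1, 1),
    #       categories=["World"],
    #       html_description=item_html,
    #   )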

    def _extract_summary(self, soup: BeautifulSoup) -> str:
        """Extract summary from first <p> tag."""
        p_tag = soup.find('p')
        if p_tag:
            return p_tag.get_text(strip=True)
        return ""

    def _extract_image_url(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract image URL from <img> tag."""
        img_tag = soup.find('img')
        if img_tag and img_tag.get('src'):
            return img_tag['src']
        return None

    def _extract_image_alt(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract image alt text from <img> tag."""
        img_tag = soup.find('img')
        if img_tag and img_tag.get('alt'):
            return img_tag['alt']
        return None

    def _extract_highlights(self, soup: BeautifulSoup) -> List[str]:
        """Extract highlights list from H3 section."""
        highlights = []

        # Find "Highlights:" h3 tag
        h3_tags = soup.find_all('h3')
        for h3 in h3_tags:
            if 'Highlights' in h3.get_text():
                # Get the <ul> that follows this h3
                ul = h3.find_next_sibling('ul')
                if ul:
                    for li in ul.find_all('li'):
                        highlights.append(li.get_text(strip=True))
                break

        return highlights
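
    # Expected markup shape for the h3-section scanners above (derived from
    # the selectors, shown here for orientation):
    #
    #   <h3>Highlights:</h3>
    #   <ul>
    #     <li>First highlight</li>
    #     <li>Second highlight</li>
    #   </ul>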

    def _extract_quote(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]:
        """Extract quote from <blockquote> tag."""
        blockquote = soup.find('blockquote')
        if not blockquote:
            return None

        text = blockquote.get_text(strip=True)

        # Try to split on " - " to separate the quote from its attribution
        if ' - ' in text:
            quote_text, attribution = text.rsplit(' - ', 1)
            return {
                'text': quote_text.strip(),
                'attribution': attribution.strip()
            }

        # No explicit attribution found, so the entire text is the quote;
        # try to infer the attribution from the perspectives section
        return {
            'text': text,
            'attribution': self._infer_quote_attribution(soup, text)
        }
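
    # Attribution split example (illustrative):
    #   "We will act swiftly - Jane Doe"
    #   -> {'text': 'We will act swiftly', 'attribution': 'Jane Doe'}
    # rsplit(' - ', 1) splits on the last separator, so a " - " inside the
    # quote itself is less likely to be mistaken for the attribution.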

    def _infer_quote_attribution(self, soup: BeautifulSoup, quote_text: str) -> str:
        """
        Try to infer quote attribution from context.

        This is a fallback for when the quote has no explicit attribution.
        The quote_text argument is currently unused but kept for smarter
        matching later.
        """
        # For now, fall back to the first actor named in the perspectives
        # section as a best guess
        perspectives_section = soup.find('h3', string=re.compile(r'Perspectives'))
        if perspectives_section:
            ul = perspectives_section.find_next_sibling('ul')
            if ul:
                for li in ul.find_all('li'):
                    li_text = li.get_text()
                    # Extract actor name (before first colon)
                    if ':' in li_text:
                        actor = li_text.split(':', 1)[0].strip()
                        return actor

        return "Unknown"

    def _extract_perspectives(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract perspectives from H3 section."""
        perspectives = []

        # Find "Perspectives:" h3 tag
        h3_tags = soup.find_all('h3')
        for h3 in h3_tags:
            if 'Perspectives' in h3.get_text():
                # Get the <ul> that follows this h3
                ul = h3.find_next_sibling('ul')
                if ul:
                    for li in ul.find_all('li'):
                        perspective = self._parse_perspective_li(li)
                        if perspective:
                            perspectives.append(perspective)
                break

        return perspectives

    def _parse_perspective_li(self, li) -> Optional[Dict]:
        """
        Parse a single perspective <li> element.

        Format: "Actor: Description. (Source)"
        """
        # Get full text
        full_text = li.get_text()

        # Extract actor (before first colon)
        if ':' not in full_text:
            return None

        actor, rest = full_text.split(':', 1)
        actor = actor.strip()

        # Find the <a> tag for source URL
        a_tag = li.find('a')
        source_url = a_tag['href'] if a_tag and a_tag.get('href') else ""

        # The description is everything after the colon, minus the source
        # citation in parentheses, e.g. "(The Straits Times)"
        description = rest
        if a_tag:
            # Remove the link text and its surrounding parentheses
            link_text = a_tag.get_text()
            description = description.replace(f"({link_text})", "").strip()

        # Strip leading/trailing periods and whitespace
        description = description.strip('. ')

        return {
            'actor': actor,
            'description': description,
            'source_url': source_url
        }
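
    # Example (illustrative <li> markup):
    #   <li>Analysts: Markets may cool. (<a href="https://example.com/a">Example News</a>)</li>
    # parses to:
    #   {'actor': 'Analysts', 'description': 'Markets may cool',
    #    'source_url': 'https://example.com/a'}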

    def _extract_sources(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract sources list from H3 section."""
        sources = []

        # Find "Sources:" h3 tag
        h3_tags = soup.find_all('h3')
        for h3 in h3_tags:
            if 'Sources' in h3.get_text():
                # Get the <ul> that follows this h3
                ul = h3.find_next_sibling('ul')
                if ul:
                    for li in ul.find_all('li'):
                        source = self._parse_source_li(li)
                        if source:
                            sources.append(source)
                break

        return sources

    def _parse_source_li(self, li) -> Optional[Dict]:
        """
        Parse a single source <li> element.

        Format: "<a href='...'>Title</a> - domain.com"
        """
        a_tag = li.find('a')
        if not a_tag or not a_tag.get('href'):
            return None

        title = a_tag.get_text(strip=True)
        url = a_tag['href']

        # Extract domain from URL
        parsed_url = urlparse(url)
        domain = parsed_url.netloc

        # Remove "www." prefix if present
        if domain.startswith('www.'):
            domain = domain[4:]

        return {
            'title': title,
            'url': url,
            'domain': domain
        }
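

if __name__ == "__main__":
    # Minimal smoke test on a hypothetical description snippet; real Kagi
    # feed items may differ. Run from the repo root so src.models resolves.
    from pprint import pprint

    sample = (
        "<p>A summary paragraph.</p>"
        "<h3>Highlights:</h3><ul><li>Key point one</li></ul>"
        "<blockquote>Quoted words - Speaker Name</blockquote>"
        "<h3>Sources:</h3><ul>"
        "<li><a href='https://www.example.com/story'>Story title</a></li></ul>"
    )
    pprint(KagiHTMLParser().parse(sample))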