1"""
2Kagi News HTML description parser.
3
4Parses the HTML content from RSS feed item descriptions
5into structured data.
6"""
7import re
8import logging
9from typing import Dict, List, Optional
10from datetime import datetime
11from bs4 import BeautifulSoup
12from urllib.parse import urlparse
13
14from src.models import KagiStory, Perspective, Quote, Source
15
16logger = logging.getLogger(__name__)
17
18
class KagiHTMLParser:
    """Parses Kagi News HTML descriptions into structured data."""

    def parse(self, html_description: str) -> Dict:
        """
        Parse an HTML description into structured data.

        Args:
            html_description: HTML content from the RSS item description

        Returns:
            Dictionary with extracted data:
                - summary: str
                - image_url: Optional[str]
                - image_alt: Optional[str]
                - highlights: List[str]
                - quote: Optional[Dict[str, str]]
                - perspectives: List[Dict]
                - sources: List[Dict]
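
        Example (illustrative sketch; real Kagi markup may differ)::

            parser = KagiHTMLParser()
            data = parser.parse(
                "<p>Summary.</p><h3>Highlights:</h3><ul><li>One</li></ul>"
            )
            data['summary']     # -> "Summary."
            data['highlights']  # -> ["One"]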
38 """
39 soup = BeautifulSoup(html_description, 'html.parser')
40
41 return {
42 'summary': self._extract_summary(soup),
43 'image_url': self._extract_image_url(soup),
44 'image_alt': self._extract_image_alt(soup),
45 'highlights': self._extract_highlights(soup),
46 'quote': self._extract_quote(soup),
47 'perspectives': self._extract_perspectives(soup),
48 'sources': self._extract_sources(soup),
49 }
50
    def parse_to_story(
        self,
        title: str,
        link: str,
        guid: str,
        pub_date: datetime,
        categories: List[str],
        html_description: str
    ) -> KagiStory:
        """
        Parse HTML and create a KagiStory object.

        Args:
            title: Story title
            link: Story URL
            guid: Unique identifier
            pub_date: Publication date
            categories: List of categories
            html_description: HTML content from the description

        Returns:
            KagiStory object
        """
        parsed = self.parse(html_description)

        # Convert the parsed data to model objects
        perspectives = [
            Perspective(
                actor=p['actor'],
                description=p['description'],
                source_url=p['source_url'],
                source_name=p.get('source_name', '')
            )
            for p in parsed['perspectives']
        ]

        sources = [
            Source(
                title=s['title'],
                url=s['url'],
                domain=s['domain']
            )
            for s in parsed['sources']
        ]

        quote = None
        if parsed['quote']:
            quote = Quote(
                text=parsed['quote']['text'],
                attribution=parsed['quote']['attribution']
            )

        return KagiStory(
            title=title,
            link=link,
            guid=guid,
            pub_date=pub_date,
            categories=categories,
            summary=parsed['summary'],
            highlights=parsed['highlights'],
            perspectives=perspectives,
            quote=quote,
            sources=sources,
            image_url=parsed['image_url'],
            image_alt=parsed['image_alt']
        )

    def _extract_summary(self, soup: BeautifulSoup) -> str:
        """Extract the summary from the first <p> tag."""
        p_tag = soup.find('p')
        if p_tag:
            return p_tag.get_text(strip=True)
        return ""

    def _extract_image_url(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the image URL from the <img> tag."""
        img_tag = soup.find('img')
        if img_tag and img_tag.get('src'):
            return img_tag['src']
        return None

    def _extract_image_alt(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the image alt text from the <img> tag."""
        img_tag = soup.find('img')
        if img_tag and img_tag.get('alt'):
            return img_tag['alt']
        return None

    def _extract_highlights(self, soup: BeautifulSoup) -> List[str]:
        """Extract the highlights list from its H3 section."""
        highlights = []

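        # Expected markup (illustrative; based on observed feed items):
        #   <h3>Highlights:</h3>
        #   <ul><li>First point</li><li>Second point</li></ul>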
143 # Find "Highlights:" h3 tag
144 h3_tags = soup.find_all('h3')
145 for h3 in h3_tags:
146 if 'Highlights' in h3.get_text():
147 # Get the <ul> that follows this h3
148 ul = h3.find_next_sibling('ul')
149 if ul:
150 for li in ul.find_all('li'):
151 highlights.append(li.get_text(strip=True))
152 break
153
154 return highlights
155
    def _extract_quote(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]:
        """Extract the quote from the <blockquote> tag."""
        blockquote = soup.find('blockquote')
        if not blockquote:
            return None

        text = blockquote.get_text(strip=True)

        # Try to split on " - " to separate the quote from its attribution
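        # e.g. "We must act now - Jane Doe" -> ("We must act now", "Jane Doe")
        # (hypothetical text; rsplit keeps any earlier " - " inside the quote)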
        if ' - ' in text:
            quote_text, attribution = text.rsplit(' - ', 1)
            return {
                'text': quote_text.strip(),
                'attribution': attribution.strip()
            }

        # If no attribution is found, the entire text is the quote.
        # Try to infer the attribution from context (the speaker is often
        # named in the perspectives section).
        return {
            'text': text,
            'attribution': self._infer_quote_attribution(soup, text)
        }

    def _infer_quote_attribution(self, soup: BeautifulSoup, quote_text: str) -> str:
        """
        Try to infer the quote attribution from context.

        This is a fallback for when the quote has no explicit attribution.
        """
        # For now, return the first perspective actor as a best guess;
        # quote_text is unused but kept for a smarter future heuristic.
        perspectives_section = soup.find('h3', string=re.compile(r'Perspectives'))
        if perspectives_section:
            ul = perspectives_section.find_next_sibling('ul')
            if ul:
                for li in ul.find_all('li'):
                    li_text = li.get_text()
                    # Extract the actor name (before the first colon)
                    if ':' in li_text:
                        actor = li_text.split(':', 1)[0].strip()
                        return actor

        return "Unknown"

    def _extract_perspectives(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract perspectives from their H3 section."""
        perspectives = []

        # Find the "Perspectives:" h3 tag
        h3_tags = soup.find_all('h3')
        for h3 in h3_tags:
            if 'Perspectives' in h3.get_text():
                # Get the <ul> that follows this h3
                ul = h3.find_next_sibling('ul')
                if ul:
                    for li in ul.find_all('li'):
                        perspective = self._parse_perspective_li(li)
                        if perspective:
                            perspectives.append(perspective)
                break

        return perspectives

    def _parse_perspective_li(self, li) -> Optional[Dict]:
        """
        Parse a single perspective <li> element.

        Format: "Actor: Description. (Source)"
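
        An illustrative (hypothetical) item: "EU officials: Urged restraint.
        (Reuters)" yields actor "EU officials" and source name "Reuters".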
223 """
224 # Get full text
225 full_text = li.get_text()
226
227 # Extract actor (before first colon)
228 if ':' not in full_text:
229 return None
230
231 actor, rest = full_text.split(':', 1)
232 actor = actor.strip()
233
234 # Find the <a> tag for source URL and name
235 a_tag = li.find('a')
236 source_url = a_tag['href'] if a_tag and a_tag.get('href') else ""
237 source_name = a_tag.get_text(strip=True) if a_tag else ""
238
239 # Extract description (between colon and source link)
240 # Remove the source citation part in parentheses
241 description = rest
242
243 # Remove source citation like "(The Straits Times)" from description
244 if a_tag:
245 # Remove the link text and surrounding parentheses
246 link_text = a_tag.get_text()
247 description = description.replace(f"({link_text})", "").strip()
248
249 # Clean up trailing period
250 description = description.strip('. ')
251
252 return {
253 'actor': actor,
254 'description': description,
255 'source_url': source_url,
256 'source_name': source_name
257 }
258
    def _extract_sources(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract the sources list from its H3 section."""
        sources = []

        # Find the "Sources:" h3 tag
        h3_tags = soup.find_all('h3')
        for h3 in h3_tags:
            if 'Sources' in h3.get_text():
                # Get the <ul> that follows this h3
                ul = h3.find_next_sibling('ul')
                if ul:
                    for li in ul.find_all('li'):
                        source = self._parse_source_li(li)
                        if source:
                            sources.append(source)
                break

        return sources

    def _parse_source_li(self, li) -> Optional[Dict]:
        """
        Parse a single source <li> element.

        Format: "<a href='...'>Title</a> - domain.com"
        """
        a_tag = li.find('a')
        if not a_tag or not a_tag.get('href'):
            return None

        title = a_tag.get_text(strip=True)
        url = a_tag['href']

        # Extract the domain from the URL
        parsed_url = urlparse(url)
        domain = parsed_url.netloc
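        # e.g. "https://www.bbc.com/news/article" (hypothetical URL) gives
        # netloc "www.bbc.com", trimmed to "bbc.com" below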

        # Remove the "www." prefix if present
        if domain.startswith('www.'):
            domain = domain[4:]

        return {
            'title': title,
            'url': url,
            'domain': domain
        }
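

if __name__ == "__main__":
    # Minimal smoke test with a hypothetical Kagi-style description; real
    # feed markup may differ, so treat this as an illustrative sketch only.
    sample_html = (
        "<p>City council approves a new transit plan.</p>"
        "<h3>Highlights:</h3>"
        "<ul><li>Budget set at $2M</li><li>Construction starts in May</li></ul>"
        "<blockquote>This is a win for commuters - Mayor Smith</blockquote>"
        "<h3>Sources:</h3>"
        "<ul><li><a href='https://www.example.com/story'>Transit plan"
        " approved</a> - example.com</li></ul>"
    )
    parser = KagiHTMLParser()
    print(parser.parse(sample_html))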