"""
Kagi News HTML description parser.

Parses the HTML content from RSS feed item descriptions
into structured data.
"""

import logging
import re
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from src.models import KagiStory, Perspective, Quote, Source

logger = logging.getLogger(__name__)


class KagiHTMLParser:
    """Parses Kagi News HTML descriptions into structured data."""

    def parse(self, html_description: str) -> Dict:
        """
        Parse HTML description into structured data.

        Args:
            html_description: HTML content from RSS item description

        Returns:
            Dictionary with extracted data:
            - summary: str
            - image_url: Optional[str]
            - image_alt: Optional[str]
            - highlights: List[str]
            - quote: Optional[Dict[str, str]]
            - perspectives: List[Dict]
            - sources: List[Dict]
        """
        soup = BeautifulSoup(html_description, 'html.parser')

        return {
            'summary': self._extract_summary(soup),
            'image_url': self._extract_image_url(soup),
            'image_alt': self._extract_image_alt(soup),
            'highlights': self._extract_highlights(soup),
            'quote': self._extract_quote(soup),
            'perspectives': self._extract_perspectives(soup),
            'sources': self._extract_sources(soup),
        }
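
    # For reference, the markup shape the extractors below assume
    # (illustrative sketch, not verbatim Kagi output):
    #
    #   <p>Summary text.</p>
    #   <img src="https://..." alt="...">
    #   <h3>Highlights:</h3><ul><li>...</li></ul>
    #   <blockquote>"Quote text" - Attribution</blockquote>
    #   <h3>Perspectives:</h3>
    #   <ul><li>Actor: Description. (<a href="https://...">Source</a>)</li></ul>
    #   <h3>Sources:</h3>
    #   <ul><li><a href="https://...">Title</a> - domain.com</li></ul>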

    # NOTE: the original method name was truncated in the source;
    # "parse_to_story" is a reconstruction.
    def parse_to_story(
        self,
        guid: str,
        pub_date: datetime,
        categories: List[str],
        html_description: str
    ) -> KagiStory:
        """
        Parse HTML and create a KagiStory object.

        Args:
            guid: Unique identifier
            pub_date: Publication date
            categories: List of categories
            html_description: HTML content from description

        Returns:
            KagiStory object
        """
        parsed = self.parse(html_description)

        # Convert parsed data to model objects
        perspectives = [
            Perspective(
                actor=p['actor'],
                description=p['description'],
                source_url=p['source_url']
            )
            for p in parsed['perspectives']
        ]

        sources = [
            Source(
                title=s['title'],
                url=s['url'],
                domain=s['domain']
            )
            for s in parsed['sources']
        ]

        quote = None
        if parsed['quote']:
            quote = Quote(
                text=parsed['quote']['text'],
                attribution=parsed['quote']['attribution']
            )

        return KagiStory(
            guid=guid,
            pub_date=pub_date,
            categories=categories,
            summary=parsed['summary'],
            highlights=parsed['highlights'],
            perspectives=perspectives,
            sources=sources,
            quote=quote,
            image_url=parsed['image_url'],
            image_alt=parsed['image_alt']
        )

    def _extract_summary(self, soup: BeautifulSoup) -> str:
        """Extract summary from first <p> tag."""
        p_tag = soup.find('p')
        if p_tag:
            return p_tag.get_text(strip=True)
        return ""

    def _extract_image_url(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract image URL from <img> tag."""
        img_tag = soup.find('img')
        if img_tag and img_tag.get('src'):
            return img_tag['src']
        return None

    def _extract_image_alt(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract image alt text from <img> tag."""
        img_tag = soup.find('img')
        if img_tag and img_tag.get('alt'):
            return img_tag['alt']
        return None

    def _extract_highlights(self, soup: BeautifulSoup) -> List[str]:
        """Extract highlights list from H3 section."""
        highlights = []

        # Find "Highlights:" h3 tag
        h3_tags = soup.find_all('h3')
        for h3 in h3_tags:
            if 'Highlights' in h3.get_text():
                # Get the <ul> that follows this h3
                ul = h3.find_next_sibling('ul')
                if ul:
                    for li in ul.find_all('li'):
                        highlights.append(li.get_text(strip=True))
                break

        return highlights

    def _extract_quote(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]:
        """Extract quote from <blockquote> tag."""
        blockquote = soup.find('blockquote')
        if not blockquote:
            return None

        text = blockquote.get_text(strip=True)

        # Try to split on " - " to separate quote from attribution
        if ' - ' in text:
            quote_text, attribution = text.rsplit(' - ', 1)
            return {
                'text': quote_text.strip(),
                'attribution': attribution.strip()
            }

        # If no attribution found, entire text is the quote
        # Try to infer attribution from context (often mentioned in highlights/perspectives)
        return {
            'text': text,
            'attribution': self._infer_quote_attribution(soup, text)
        }
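
    # Example (illustrative): '"We will act" - Prime Minister' splits into
    # text '"We will act"' and attribution 'Prime Minister'.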

    def _infer_quote_attribution(self, soup: BeautifulSoup, quote_text: str) -> str:
        """
        Try to infer quote attribution from context.

        This is a fallback when quote doesn't have explicit attribution.
        """
        # For now, check if any perspective mentions similar keywords
        perspectives_section = soup.find('h3', string=re.compile(r'Perspectives'))
        if perspectives_section:
            ul = perspectives_section.find_next_sibling('ul')
            if ul:
                for li in ul.find_all('li'):
                    li_text = li.get_text()
                    # Extract actor name (before first colon)
                    if ':' in li_text:
                        actor = li_text.split(':', 1)[0].strip()
                        # Heuristic (reconstructed): attribute the quote to an
                        # actor who is named in the quote text itself
                        if actor and actor in quote_text:
                            return actor
        return ""

    def _extract_perspectives(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract perspectives from H3 section."""
        perspectives = []

        # Find "Perspectives:" h3 tag
        h3_tags = soup.find_all('h3')
        for h3 in h3_tags:
            if 'Perspectives' in h3.get_text():
                # Get the <ul> that follows this h3
                ul = h3.find_next_sibling('ul')
                if ul:
                    for li in ul.find_all('li'):
                        perspective = self._parse_perspective_li(li)
                        if perspective:
                            perspectives.append(perspective)
                break

        return perspectives

    def _parse_perspective_li(self, li) -> Optional[Dict]:
        """
        Parse a single perspective <li> element.

        Format: "Actor: Description. (Source)"
        """
        full_text = li.get_text()

        # Extract actor (before first colon)
        if ':' not in full_text:
            return None

        actor, rest = full_text.split(':', 1)
        actor = actor.strip()

        # Find the <a> tag for source URL
        a_tag = li.find('a')
        source_url = a_tag['href'] if a_tag and a_tag.get('href') else ""

        # Extract description (between colon and source link),
        # dropping the source citation part in parentheses
        description = rest.strip()

        # Remove source citation like "(The Straits Times)" from description
        if a_tag:
            # Remove the link text and surrounding parentheses
            link_text = a_tag.get_text()
            description = description.replace(f"({link_text})", "").strip()

        # Clean up trailing period
        description = description.strip('. ')

        return {
            'actor': actor,
            'description': description,
            'source_url': source_url
        }
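
    # Example (illustrative): given
    #   <li>UN: Calls for restraint. (<a href="https://un.org/x">UN News</a>)</li>
    # _parse_perspective_li returns
    #   {'actor': 'UN', 'description': 'Calls for restraint',
    #    'source_url': 'https://un.org/x'}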

    def _extract_sources(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract sources list from H3 section."""
        sources = []

        # Find "Sources:" h3 tag
        h3_tags = soup.find_all('h3')
        for h3 in h3_tags:
            if 'Sources' in h3.get_text():
                # Get the <ul> that follows this h3
                ul = h3.find_next_sibling('ul')
                if ul:
                    for li in ul.find_all('li'):
                        source = self._parse_source_li(li)
                        if source:
                            sources.append(source)
                break

        return sources

    def _parse_source_li(self, li) -> Optional[Dict]:
        """
        Parse a single source <li> element.

        Format: "<a href='...'>Title</a> - domain.com"
        """
        a_tag = li.find('a')
        if not a_tag or not a_tag.get('href'):
            return None

        title = a_tag.get_text(strip=True)
        url = a_tag['href']

        # Extract domain from URL
        parsed_url = urlparse(url)
        domain = parsed_url.netloc

        # Remove "www." prefix if present
        if domain.startswith('www.'):
            domain = domain[4:]

        return {
            'title': title,
            'url': url,
            'domain': domain
        }
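

if __name__ == "__main__":
    # Minimal usage sketch. The sample markup is illustrative, not real
    # Kagi feed output, and assumes src.models is importable.
    sample = (
        "<p>Summary of the story.</p>"
        "<h3>Highlights:</h3><ul><li>First highlight</li></ul>"
        "<h3>Sources:</h3><ul>"
        "<li><a href='https://www.example.com/a'>Title</a> - example.com</li>"
        "</ul>"
    )
    parser = KagiHTMLParser()
    result = parser.parse(sample)
    print(result['summary'])     # "Summary of the story."
    print(result['highlights'])  # ['First highlight']
    print(result['sources'])     # [{'title': 'Title', 'url': 'https://www.example.com/a', 'domain': 'example.com'}]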