"""Kagi News HTML description parser.

Parses the HTML content from RSS feed item descriptions
into structured data.
"""
+
import logging
import re
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from src.models import KagiStory, Perspective, Quote, Source

logger = logging.getLogger(__name__)
+
"""Parses Kagi News HTML descriptions into structured data."""
+
def parse(self, html_description: str) -> Dict:
    """Parse HTML description into structured data.

    Args:
        html_description: HTML content from RSS item description

    Returns:
        Dictionary with extracted data:
        - summary: str
        - image_url: Optional[str]
        - image_alt: Optional[str]
        - highlights: List[str]
        - quote: Optional[Dict[str, str]]
        - perspectives: List[Dict]
        - sources: List[Dict]
    """
    soup = BeautifulSoup(html_description, 'html.parser')
    # Each section has a dedicated extractor so failures stay localized.
    return {
        'summary': self._extract_summary(soup),
        'image_url': self._extract_image_url(soup),
        'image_alt': self._extract_image_alt(soup),
        'highlights': self._extract_highlights(soup),
        'quote': self._extract_quote(soup),
        'perspectives': self._extract_perspectives(soup),
        'sources': self._extract_sources(soup),
    }
+
def parse_to_story(self, guid: str, pub_date: datetime,
                   categories: List[str], html_description: str) -> KagiStory:
    """Parse HTML and create a KagiStory object.

    Args:
        guid: Unique identifier
        pub_date: Publication date
        categories: List of categories
        html_description: HTML content from description

    Returns:
        A populated KagiStory built from the parsed description.
    """
    parsed = self.parse(html_description)

    # Convert parsed data to model objects.
    # NOTE(review): keyword names for Perspective/Source/KagiStory are
    # inferred from the parsed dict keys — confirm against src.models.
    perspectives = [
        Perspective(
            actor=p['actor'],
            description=p['description'],
            source_url=p['source_url'],
        )
        for p in parsed['perspectives']
    ]
    sources = [
        Source(
            title=s['title'],
            url=s['url'],
            domain=s['domain'],
        )
        for s in parsed['sources']
    ]

    quote = None
    if parsed['quote']:
        quote = Quote(
            text=parsed['quote']['text'],
            attribution=parsed['quote']['attribution'],
        )

    return KagiStory(
        guid=guid,
        pub_date=pub_date,
        categories=categories,
        summary=parsed['summary'],
        highlights=parsed['highlights'],
        quote=quote,
        perspectives=perspectives,
        sources=sources,
        image_url=parsed['image_url'],
        image_alt=parsed['image_alt'],
    )
+
def _extract_summary(self, soup: BeautifulSoup) -> str:
    """Extract summary from first <p> tag.

    Returns an empty string when the description has no paragraph.
    """
    p_tag = soup.find('p')
    if p_tag:
        return p_tag.get_text(strip=True)
    return ""
+
def _extract_image_url(self, soup: BeautifulSoup) -> Optional[str]:
    """Extract image URL from <img> tag, or None if absent."""
    img_tag = soup.find('img')
    if img_tag and img_tag.get('src'):
        return img_tag['src']
    return None
def _extract_image_alt(self, soup: BeautifulSoup) -> Optional[str]:
    """Extract image alt text from <img> tag, or None if absent."""
    img_tag = soup.find('img')
    if img_tag and img_tag.get('alt'):
        return img_tag['alt']
    return None
def _extract_highlights(self, soup: BeautifulSoup) -> List[str]:
    """Extract highlights list from the "Highlights" H3 section.

    Returns an empty list when the section or its <ul> is missing.
    """
    highlights: List[str] = []
    # Find "Highlights:" h3 tag
    h3_tags = soup.find_all('h3')
    for h3 in h3_tags:
        if 'Highlights' in h3.get_text():
            # Get the <ul> that follows this h3
            ul = h3.find_next_sibling('ul')
            if ul:
                for li in ul.find_all('li'):
                    highlights.append(li.get_text(strip=True))
            break
    return highlights
def _extract_quote(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]:
    """Extract quote from <blockquote> tag.

    Returns:
        Dict with 'text' and 'attribution' keys, or None when the
        description contains no <blockquote>.
    """
    blockquote = soup.find('blockquote')
    if not blockquote:
        return None

    text = blockquote.get_text(strip=True)

    # Try to split on " - " to separate quote from attribution
    if ' - ' in text:
        # rsplit so a dash inside the quote body doesn't truncate it
        quote_text, attribution = text.rsplit(' - ', 1)
        return {
            'text': quote_text.strip(),
            'attribution': attribution.strip(),
        }

    # If no attribution found, entire text is the quote.
    # Try to infer attribution from context (often mentioned in highlights/perspectives)
    return {
        'text': text,
        'attribution': self._infer_quote_attribution(soup, text),
    }
def _infer_quote_attribution(self, soup: BeautifulSoup, quote_text: str) -> str:
    """Try to infer quote attribution from context.

    This is a fallback when the quote doesn't have explicit attribution.
    Returns an empty string when no plausible actor is found.
    """
    # For now, check if any perspective mentions similar keywords
    perspectives_section = soup.find('h3', string=re.compile(r'Perspectives'))
    if perspectives_section:
        ul = perspectives_section.find_next_sibling('ul')
        if ul:
            for li in ul.find_all('li'):
                li_text = li.get_text()
                if ':' not in li_text:
                    continue
                # Extract actor name (before first colon)
                actor = li_text.split(':', 1)[0].strip()
                # NOTE(review): heuristic — credit the first actor whose
                # name appears inside the quote text; confirm this matches
                # the original intent.
                if actor and actor.lower() in quote_text.lower():
                    return actor
    return ""
def _extract_perspectives(self, soup: BeautifulSoup) -> List[Dict]:
    """Extract perspectives from the "Perspectives" H3 section.

    Returns an empty list when the section or its <ul> is missing;
    skips <li> items that fail to parse.
    """
    perspectives: List[Dict] = []
    # Find "Perspectives:" h3 tag
    h3_tags = soup.find_all('h3')
    for h3 in h3_tags:
        if 'Perspectives' in h3.get_text():
            # Get the <ul> that follows this h3
            ul = h3.find_next_sibling('ul')
            if ul:
                for li in ul.find_all('li'):
                    perspective = self._parse_perspective_li(li)
                    if perspective:
                        perspectives.append(perspective)
            break
    return perspectives
+
def _parse_perspective_li(self, li) -> Optional[Dict]:
    """Parse a single perspective <li> element.

    Format: "Actor: Description. (Source)"

    Returns:
        Dict with 'actor', 'description', 'source_url' keys, or None
        when the item doesn't match the "Actor: ..." format.
    """
    full_text = li.get_text()

    # Extract actor (before first colon); bail out if the format is off
    if ':' not in full_text:
        return None
    actor, rest = full_text.split(':', 1)

    # Find the <a> tag for source URL
    a_tag = li.find('a')
    source_url = a_tag['href'] if a_tag and a_tag.get('href') else ""

    # Extract description (between colon and source link)
    description = rest.strip()
    # Remove source citation like "(The Straits Times)" from description:
    # drop the link text and its surrounding parentheses
    if a_tag:
        link_text = a_tag.get_text()
        description = description.replace(f"({link_text})", "").strip()

    # Clean up trailing period
    description = description.strip('. ')

    return {
        'actor': actor.strip(),
        'description': description,
        'source_url': source_url,
    }
+
def _extract_sources(self, soup: BeautifulSoup) -> List[Dict]:
    """Extract sources list from the "Sources" H3 section.

    Returns an empty list when the section or its <ul> is missing;
    skips <li> items that fail to parse.
    """
    sources: List[Dict] = []
    # Find "Sources:" h3 tag
    h3_tags = soup.find_all('h3')
    for h3 in h3_tags:
        if 'Sources' in h3.get_text():
            # Get the <ul> that follows this h3
            ul = h3.find_next_sibling('ul')
            if ul:
                for li in ul.find_all('li'):
                    source = self._parse_source_li(li)
                    if source:
                        sources.append(source)
            break
    return sources
def _parse_source_li(self, li) -> Optional[Dict]:
+
Parse a single source <li> element.
+
Format: "<a href='...'>Title</a> - domain.com"
+
if not a_tag or not a_tag.get('href'):
+
title = a_tag.get_text(strip=True)
+
# Extract domain from URL
+
parsed_url = urlparse(url)
+
domain = parsed_url.netloc
+
# Remove "www." prefix if present
+
if domain.startswith('www.'):