1"""
2Kagi News HTML description parser.
3
4Parses the HTML content from RSS feed item descriptions
5into structured data.
6"""
7import re
8import logging
9from typing import Dict, List, Optional
10from datetime import datetime
11from bs4 import BeautifulSoup
12from urllib.parse import urlparse
13
14from src.models import KagiStory, Perspective, Quote, Source
15
16logger = logging.getLogger(__name__)
17
18
19class KagiHTMLParser:
20 """Parses Kagi News HTML descriptions into structured data."""
21
22 def parse(self, html_description: str) -> Dict:
23 """
24 Parse HTML description into structured data.
25
26 Args:
27 html_description: HTML content from RSS item description
28
29 Returns:
30 Dictionary with extracted data:
31 - summary: str
32 - image_url: Optional[str]
33 - image_alt: Optional[str]
34 - highlights: List[str]
35 - quote: Optional[Dict[str, str]]
36 - perspectives: List[Dict]
37 - sources: List[Dict]
38 """
39 soup = BeautifulSoup(html_description, 'html.parser')
40
41 return {
42 'summary': self._extract_summary(soup),
43 'image_url': self._extract_image_url(soup),
44 'image_alt': self._extract_image_alt(soup),
45 'highlights': self._extract_highlights(soup),
46 'quote': self._extract_quote(soup),
47 'perspectives': self._extract_perspectives(soup),
48 'sources': self._extract_sources(soup),
49 }
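
    # Illustrative round trip (the HTML below is a hypothetical minimal
    # snippet, not a verbatim Kagi feed item):
    #
    #   KagiHTMLParser().parse("<p>Summary.</p>"
    #                          "<h3>Highlights:</h3><ul><li>One</li></ul>")
    #   -> {'summary': 'Summary.', 'image_url': None, 'image_alt': None,
    #       'highlights': ['One'], 'quote': None, 'perspectives': [],
    #       'sources': []}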

    def parse_to_story(
        self,
        title: str,
        link: str,
        guid: str,
        pub_date: datetime,
        categories: List[str],
        html_description: str
    ) -> KagiStory:
        """
        Parse HTML and create a KagiStory object.

        Args:
            title: Story title
            link: Story URL
            guid: Unique identifier
            pub_date: Publication date
            categories: List of categories
            html_description: HTML content from description

        Returns:
            KagiStory object
        """
        parsed = self.parse(html_description)

        # Convert parsed data to model objects
        perspectives = [
            Perspective(
                actor=p['actor'],
                description=p['description'],
                source_url=p['source_url']
            )
            for p in parsed['perspectives']
        ]

        sources = [
            Source(
                title=s['title'],
                url=s['url'],
                domain=s['domain']
            )
            for s in parsed['sources']
        ]

        quote = None
        if parsed['quote']:
            quote = Quote(
                text=parsed['quote']['text'],
                attribution=parsed['quote']['attribution']
            )

        return KagiStory(
            title=title,
            link=link,
            guid=guid,
            pub_date=pub_date,
            categories=categories,
            summary=parsed['summary'],
            highlights=parsed['highlights'],
            perspectives=perspectives,
            quote=quote,
            sources=sources,
            image_url=parsed['image_url'],
            image_alt=parsed['image_alt']
        )
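
    # Typical call site (all field values here are hypothetical):
    #
    #   story = parser.parse_to_story(
    #       title="Example headline",
    #       link="https://example.com/story",
    #       guid="example-guid-001",
    #       pub_date=datetime(2024, 1, 1),
    #       categories=["World"],
    #       html_description=item_html,
    #   )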

    def _extract_summary(self, soup: BeautifulSoup) -> str:
        """Extract summary from first <p> tag."""
        p_tag = soup.find('p')
        if p_tag:
            return p_tag.get_text(strip=True)
        return ""

    def _extract_image_url(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract image URL from <img> tag."""
        img_tag = soup.find('img')
        if img_tag and img_tag.get('src'):
            return img_tag['src']
        return None

    def _extract_image_alt(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract image alt text from <img> tag."""
        img_tag = soup.find('img')
        if img_tag and img_tag.get('alt'):
            return img_tag['alt']
        return None

    def _extract_highlights(self, soup: BeautifulSoup) -> List[str]:
        """Extract highlights list from H3 section."""
        highlights = []

        # Find "Highlights:" h3 tag
        h3_tags = soup.find_all('h3')
        for h3 in h3_tags:
            if 'Highlights' in h3.get_text():
                # Get the <ul> that follows this h3
                ul = h3.find_next_sibling('ul')
                if ul:
                    for li in ul.find_all('li'):
                        highlights.append(li.get_text(strip=True))
                break

        return highlights
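
    # Expected markup shape for the h3-section scanners above (derived from
    # the selectors, shown here for orientation):
    #
    #   <h3>Highlights:</h3>
    #   <ul>
    #     <li>First highlight</li>
    #     <li>Second highlight</li>
    #   </ul>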

    def _extract_quote(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]:
        """Extract quote from <blockquote> tag."""
        blockquote = soup.find('blockquote')
        if not blockquote:
            return None

        text = blockquote.get_text(strip=True)

        # Try to split on " - " to separate the quote from its attribution
        if ' - ' in text:
            quote_text, attribution = text.rsplit(' - ', 1)
            return {
                'text': quote_text.strip(),
                'attribution': attribution.strip()
            }

        # No explicit attribution found, so the entire text is the quote;
        # try to infer the attribution from the perspectives section
        return {
            'text': text,
            'attribution': self._infer_quote_attribution(soup, text)
        }
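
    # Attribution split example (illustrative):
    #   "We will act swiftly - Jane Doe"
    #   -> {'text': 'We will act swiftly', 'attribution': 'Jane Doe'}
    # rsplit(' - ', 1) splits on the last separator, so a " - " inside the
    # quote itself is less likely to be mistaken for the attribution.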

    def _infer_quote_attribution(self, soup: BeautifulSoup, quote_text: str) -> str:
        """
        Try to infer quote attribution from context.

        This is a fallback for when the quote has no explicit attribution.
        The quote_text argument is currently unused but kept for smarter
        matching later.
        """
        # For now, fall back to the first actor named in the perspectives
        # section as a best guess
        perspectives_section = soup.find('h3', string=re.compile(r'Perspectives'))
        if perspectives_section:
            ul = perspectives_section.find_next_sibling('ul')
            if ul:
                for li in ul.find_all('li'):
                    li_text = li.get_text()
                    # Extract actor name (before first colon)
                    if ':' in li_text:
                        actor = li_text.split(':', 1)[0].strip()
                        return actor

        return "Unknown"

    def _extract_perspectives(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract perspectives from H3 section."""
        perspectives = []

        # Find "Perspectives:" h3 tag
        h3_tags = soup.find_all('h3')
        for h3 in h3_tags:
            if 'Perspectives' in h3.get_text():
                # Get the <ul> that follows this h3
                ul = h3.find_next_sibling('ul')
                if ul:
                    for li in ul.find_all('li'):
                        perspective = self._parse_perspective_li(li)
                        if perspective:
                            perspectives.append(perspective)
                break

        return perspectives

    def _parse_perspective_li(self, li) -> Optional[Dict]:
        """
        Parse a single perspective <li> element.

        Format: "Actor: Description. (Source)"
        """
        # Get full text
        full_text = li.get_text()

        # Extract actor (before first colon)
        if ':' not in full_text:
            return None

        actor, rest = full_text.split(':', 1)
        actor = actor.strip()

        # Find the <a> tag for source URL
        a_tag = li.find('a')
        source_url = a_tag['href'] if a_tag and a_tag.get('href') else ""

        # The description is everything after the colon, minus the source
        # citation in parentheses, e.g. "(The Straits Times)"
        description = rest
        if a_tag:
            # Remove the link text and its surrounding parentheses
            link_text = a_tag.get_text()
            description = description.replace(f"({link_text})", "").strip()

        # Strip leading/trailing periods and whitespace
        description = description.strip('. ')

        return {
            'actor': actor,
            'description': description,
            'source_url': source_url
        }
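
    # Example (illustrative <li> markup):
    #   <li>Analysts: Markets may cool. (<a href="https://example.com/a">Example News</a>)</li>
    # parses to:
    #   {'actor': 'Analysts', 'description': 'Markets may cool',
    #    'source_url': 'https://example.com/a'}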

    def _extract_sources(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract sources list from H3 section."""
        sources = []

        # Find "Sources:" h3 tag
        h3_tags = soup.find_all('h3')
        for h3 in h3_tags:
            if 'Sources' in h3.get_text():
                # Get the <ul> that follows this h3
                ul = h3.find_next_sibling('ul')
                if ul:
                    for li in ul.find_all('li'):
                        source = self._parse_source_li(li)
                        if source:
                            sources.append(source)
                break

        return sources

    def _parse_source_li(self, li) -> Optional[Dict]:
        """
        Parse a single source <li> element.

        Format: "<a href='...'>Title</a> - domain.com"
        """
        a_tag = li.find('a')
        if not a_tag or not a_tag.get('href'):
            return None

        title = a_tag.get_text(strip=True)
        url = a_tag['href']

        # Extract domain from URL
        parsed_url = urlparse(url)
        domain = parsed_url.netloc

        # Remove "www." prefix if present
        if domain.startswith('www.'):
            domain = domain[4:]

        return {
            'title': title,
            'url': url,
            'domain': domain
        }
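

if __name__ == "__main__":
    # Minimal smoke test on a hypothetical description snippet; real Kagi
    # feed items may differ. Run from the repo root so src.models resolves.
    from pprint import pprint

    sample = (
        "<p>A summary paragraph.</p>"
        "<h3>Highlights:</h3><ul><li>Key point one</li></ul>"
        "<blockquote>Quoted words - Speaker Name</blockquote>"
        "<h3>Sources:</h3><ul>"
        "<li><a href='https://www.example.com/story'>Story title</a></li></ul>"
    )
    pprint(KagiHTMLParser().parse(sample))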