A community-based topic aggregation platform built on atproto
1""" 2Kagi News HTML description parser. 3 4Parses the HTML content from RSS feed item descriptions 5into structured data. 6""" 7import re 8import logging 9from typing import Dict, List, Optional 10from datetime import datetime 11from bs4 import BeautifulSoup 12from urllib.parse import urlparse 13 14from src.models import KagiStory, Perspective, Quote, Source 15 16logger = logging.getLogger(__name__) 17 18 19class KagiHTMLParser: 20 """Parses Kagi News HTML descriptions into structured data.""" 21 22 def parse(self, html_description: str) -> Dict: 23 """ 24 Parse HTML description into structured data. 25 26 Args: 27 html_description: HTML content from RSS item description 28 29 Returns: 30 Dictionary with extracted data: 31 - summary: str 32 - image_url: Optional[str] 33 - image_alt: Optional[str] 34 - highlights: List[str] 35 - quote: Optional[Dict[str, str]] 36 - perspectives: List[Dict] 37 - sources: List[Dict] 38 """ 39 soup = BeautifulSoup(html_description, 'html.parser') 40 41 return { 42 'summary': self._extract_summary(soup), 43 'image_url': self._extract_image_url(soup), 44 'image_alt': self._extract_image_alt(soup), 45 'highlights': self._extract_highlights(soup), 46 'quote': self._extract_quote(soup), 47 'perspectives': self._extract_perspectives(soup), 48 'sources': self._extract_sources(soup), 49 } 50 51 def parse_to_story( 52 self, 53 title: str, 54 link: str, 55 guid: str, 56 pub_date: datetime, 57 categories: List[str], 58 html_description: str 59 ) -> KagiStory: 60 """ 61 Parse HTML and create a KagiStory object. 62 63 Args: 64 title: Story title 65 link: Story URL 66 guid: Unique identifier 67 pub_date: Publication date 68 categories: List of categories 69 html_description: HTML content from description 70 71 Returns: 72 KagiStory object 73 """ 74 parsed = self.parse(html_description) 75 76 # Convert parsed data to model objects 77 perspectives = [ 78 Perspective( 79 actor=p['actor'], 80 description=p['description'], 81 source_url=p['source_url'], 82 source_name=p.get('source_name', '') 83 ) 84 for p in parsed['perspectives'] 85 ] 86 87 sources = [ 88 Source( 89 title=s['title'], 90 url=s['url'], 91 domain=s['domain'] 92 ) 93 for s in parsed['sources'] 94 ] 95 96 quote = None 97 if parsed['quote']: 98 quote = Quote( 99 text=parsed['quote']['text'], 100 attribution=parsed['quote']['attribution'] 101 ) 102 103 return KagiStory( 104 title=title, 105 link=link, 106 guid=guid, 107 pub_date=pub_date, 108 categories=categories, 109 summary=parsed['summary'], 110 highlights=parsed['highlights'], 111 perspectives=perspectives, 112 quote=quote, 113 sources=sources, 114 image_url=parsed['image_url'], 115 image_alt=parsed['image_alt'] 116 ) 117 118 def _extract_summary(self, soup: BeautifulSoup) -> str: 119 """Extract summary from first <p> tag.""" 120 p_tag = soup.find('p') 121 if p_tag: 122 return p_tag.get_text(strip=True) 123 return "" 124 125 def _extract_image_url(self, soup: BeautifulSoup) -> Optional[str]: 126 """Extract image URL from <img> tag.""" 127 img_tag = soup.find('img') 128 if img_tag and img_tag.get('src'): 129 return img_tag['src'] 130 return None 131 132 def _extract_image_alt(self, soup: BeautifulSoup) -> Optional[str]: 133 """Extract image alt text from <img> tag.""" 134 img_tag = soup.find('img') 135 if img_tag and img_tag.get('alt'): 136 return img_tag['alt'] 137 return None 138 139 def _extract_highlights(self, soup: BeautifulSoup) -> List[str]: 140 """Extract highlights list from H3 section.""" 141 highlights = [] 142 143 # Find "Highlights:" h3 tag 144 
h3_tags = soup.find_all('h3') 145 for h3 in h3_tags: 146 if 'Highlights' in h3.get_text(): 147 # Get the <ul> that follows this h3 148 ul = h3.find_next_sibling('ul') 149 if ul: 150 for li in ul.find_all('li'): 151 highlights.append(li.get_text(strip=True)) 152 break 153 154 return highlights 155 156 def _extract_quote(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]: 157 """Extract quote from <blockquote> tag.""" 158 blockquote = soup.find('blockquote') 159 if not blockquote: 160 return None 161 162 text = blockquote.get_text(strip=True) 163 164 # Try to split on " - " to separate quote from attribution 165 if ' - ' in text: 166 quote_text, attribution = text.rsplit(' - ', 1) 167 return { 168 'text': quote_text.strip(), 169 'attribution': attribution.strip() 170 } 171 172 # If no attribution found, entire text is the quote 173 # Try to infer attribution from context (often mentioned in highlights/perspectives) 174 return { 175 'text': text, 176 'attribution': self._infer_quote_attribution(soup, text) 177 } 178 179 def _infer_quote_attribution(self, soup: BeautifulSoup, quote_text: str) -> str: 180 """ 181 Try to infer quote attribution from context. 182 183 This is a fallback when quote doesn't have explicit attribution. 184 """ 185 # For now, check if any perspective mentions similar keywords 186 perspectives_section = soup.find('h3', string=re.compile(r'Perspectives')) 187 if perspectives_section: 188 ul = perspectives_section.find_next_sibling('ul') 189 if ul: 190 for li in ul.find_all('li'): 191 li_text = li.get_text() 192 # Extract actor name (before first colon) 193 if ':' in li_text: 194 actor = li_text.split(':', 1)[0].strip() 195 return actor 196 197 return "Unknown" 198 199 def _extract_perspectives(self, soup: BeautifulSoup) -> List[Dict]: 200 """Extract perspectives from H3 section.""" 201 perspectives = [] 202 203 # Find "Perspectives:" h3 tag 204 h3_tags = soup.find_all('h3') 205 for h3 in h3_tags: 206 if 'Perspectives' in h3.get_text(): 207 # Get the <ul> that follows this h3 208 ul = h3.find_next_sibling('ul') 209 if ul: 210 for li in ul.find_all('li'): 211 perspective = self._parse_perspective_li(li) 212 if perspective: 213 perspectives.append(perspective) 214 break 215 216 return perspectives 217 218 def _parse_perspective_li(self, li) -> Optional[Dict]: 219 """ 220 Parse a single perspective <li> element. 221 222 Format: "Actor: Description. (Source)" 223 """ 224 # Get full text 225 full_text = li.get_text() 226 227 # Extract actor (before first colon) 228 if ':' not in full_text: 229 return None 230 231 actor, rest = full_text.split(':', 1) 232 actor = actor.strip() 233 234 # Find the <a> tag for source URL and name 235 a_tag = li.find('a') 236 source_url = a_tag['href'] if a_tag and a_tag.get('href') else "" 237 source_name = a_tag.get_text(strip=True) if a_tag else "" 238 239 # Extract description (between colon and source link) 240 # Remove the source citation part in parentheses 241 description = rest 242 243 # Remove source citation like "(The Straits Times)" from description 244 if a_tag: 245 # Remove the link text and surrounding parentheses 246 link_text = a_tag.get_text() 247 description = description.replace(f"({link_text})", "").strip() 248 249 # Clean up trailing period 250 description = description.strip('. 
') 251 252 return { 253 'actor': actor, 254 'description': description, 255 'source_url': source_url, 256 'source_name': source_name 257 } 258 259 def _extract_sources(self, soup: BeautifulSoup) -> List[Dict]: 260 """Extract sources list from H3 section.""" 261 sources = [] 262 263 # Find "Sources:" h3 tag 264 h3_tags = soup.find_all('h3') 265 for h3 in h3_tags: 266 if 'Sources' in h3.get_text(): 267 # Get the <ul> that follows this h3 268 ul = h3.find_next_sibling('ul') 269 if ul: 270 for li in ul.find_all('li'): 271 source = self._parse_source_li(li) 272 if source: 273 sources.append(source) 274 break 275 276 return sources 277 278 def _parse_source_li(self, li) -> Optional[Dict]: 279 """ 280 Parse a single source <li> element. 281 282 Format: "<a href='...'>Title</a> - domain.com" 283 """ 284 a_tag = li.find('a') 285 if not a_tag or not a_tag.get('href'): 286 return None 287 288 title = a_tag.get_text(strip=True) 289 url = a_tag['href'] 290 291 # Extract domain from URL 292 parsed_url = urlparse(url) 293 domain = parsed_url.netloc 294 295 # Remove "www." prefix if present 296 if domain.startswith('www.'): 297 domain = domain[4:] 298 299 return { 300 'title': title, 301 'url': url, 302 'domain': domain 303 }
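

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): the snippet below is a
# hypothetical description, hand-written to match the layout the extractors
# above expect -- a leading <p> summary, an <img>, "Highlights:",
# "Perspectives:" and "Sources:" <h3> headings each followed by a <ul>, and a
# <blockquote>. Running the module directly parses it and prints the result.
# (Running this requires src.models to be importable for the top-level import.)
if __name__ == '__main__':
    import json

    sample_description = """
    <p>Example summary of the story.</p>
    <img src="https://example.com/image.jpg" alt="Example alt text">
    <h3>Highlights:</h3>
    <ul><li>First key development.</li><li>Second key development.</li></ul>
    <blockquote>We are watching this closely. - Example Official</blockquote>
    <h3>Perspectives:</h3>
    <ul><li>Example Agency: Urged caution in interpreting the data.
    (<a href="https://news.example.com/article">Example News</a>)</li></ul>
    <h3>Sources:</h3>
    <ul><li><a href="https://www.example.com/article">Example article title</a>
    - example.com</li></ul>
    """

    print(json.dumps(KagiHTMLParser().parse(sample_description), indent=2))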