"""Kagi News HTML description parser.

Parses the HTML content from RSS feed item descriptions
into structured data.
"""
+
import logging
import re
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from src.models import KagiStory, Perspective, Quote, Source

logger = logging.getLogger(__name__)
+
"""Parses Kagi News HTML descriptions into structured data."""
+
def parse(self, html_description: str) -> Dict:
    """Parse HTML description into structured data.

    Args:
        html_description: HTML content from RSS item description

    Returns:
        Dictionary with extracted data:
        - summary: str
        - image_url: Optional[str]
        - image_alt: Optional[str]
        - highlights: List[str]
        - quote: Optional[Dict[str, str]]
        - perspectives: List[Dict]
        - sources: List[Dict]
    """
    soup = BeautifulSoup(html_description, 'html.parser')
    # Each section has a dedicated extractor so failures stay localized.
    return {
        'summary': self._extract_summary(soup),
        'image_url': self._extract_image_url(soup),
        'image_alt': self._extract_image_alt(soup),
        'highlights': self._extract_highlights(soup),
        'quote': self._extract_quote(soup),
        'perspectives': self._extract_perspectives(soup),
        'sources': self._extract_sources(soup),
    }
+
def parse_to_story(self, guid: str, pub_date: datetime,
                   categories: List[str], html_description: str) -> KagiStory:
    """Parse HTML and create a KagiStory object.

    Args:
        guid: Unique identifier
        pub_date: Publication date
        categories: List of categories
        html_description: HTML content from description

    Returns:
        A populated KagiStory built from the parsed description.
    """
    parsed = self.parse(html_description)

    # Convert parsed data to model objects.
    # NOTE(review): keyword names for Perspective/Source/KagiStory are
    # inferred from the parsed dict keys — confirm against src.models.
    perspectives = [
        Perspective(
            actor=p['actor'],
            description=p['description'],
            source_url=p['source_url'],
        )
        for p in parsed['perspectives']
    ]
    sources = [
        Source(
            title=s['title'],
            url=s['url'],
            domain=s['domain'],
        )
        for s in parsed['sources']
    ]

    quote = None
    if parsed['quote']:
        quote = Quote(
            text=parsed['quote']['text'],
            attribution=parsed['quote']['attribution'],
        )

    return KagiStory(
        guid=guid,
        pub_date=pub_date,
        categories=categories,
        summary=parsed['summary'],
        highlights=parsed['highlights'],
        quote=quote,
        perspectives=perspectives,
        sources=sources,
        image_url=parsed['image_url'],
        image_alt=parsed['image_alt'],
    )
+
def _extract_summary(self, soup: BeautifulSoup) -> str:
    """Extract summary from first <p> tag.

    Returns an empty string when the description has no paragraph.
    """
    p_tag = soup.find('p')
    if p_tag:
        return p_tag.get_text(strip=True)
    return ""
+
def _extract_image_url(self, soup: BeautifulSoup) -> Optional[str]:
    """Extract image URL from <img> tag, or None if absent."""
    img_tag = soup.find('img')
    if img_tag and img_tag.get('src'):
        return img_tag['src']
    return None
def _extract_image_alt(self, soup: BeautifulSoup) -> Optional[str]:
    """Extract image alt text from <img> tag, or None if absent."""
    img_tag = soup.find('img')
    if img_tag and img_tag.get('alt'):
        return img_tag['alt']
    return None
def _extract_highlights(self, soup: BeautifulSoup) -> List[str]:
    """Extract highlights list from the "Highlights" H3 section.

    Returns an empty list when the section or its <ul> is missing.
    """
    highlights: List[str] = []
    # Find "Highlights:" h3 tag
    h3_tags = soup.find_all('h3')
    for h3 in h3_tags:
        if 'Highlights' in h3.get_text():
            # Get the <ul> that follows this h3
            ul = h3.find_next_sibling('ul')
            if ul:
                for li in ul.find_all('li'):
                    highlights.append(li.get_text(strip=True))
            break
    return highlights
def _extract_quote(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]:
    """Extract quote from <blockquote> tag.

    Returns:
        Dict with 'text' and 'attribution' keys, or None when the
        description contains no <blockquote>.
    """
    blockquote = soup.find('blockquote')
    if not blockquote:
        return None

    text = blockquote.get_text(strip=True)

    # Try to split on " - " to separate quote from attribution
    if ' - ' in text:
        # rsplit so a dash inside the quote body doesn't truncate it
        quote_text, attribution = text.rsplit(' - ', 1)
        return {
            'text': quote_text.strip(),
            'attribution': attribution.strip(),
        }

    # If no attribution found, entire text is the quote.
    # Try to infer attribution from context (often mentioned in highlights/perspectives)
    return {
        'text': text,
        'attribution': self._infer_quote_attribution(soup, text),
    }
def _infer_quote_attribution(self, soup: BeautifulSoup, quote_text: str) -> str:
    """Try to infer quote attribution from context.

    This is a fallback when the quote doesn't have explicit attribution.
    Returns an empty string when no plausible actor is found.
    """
    # For now, check if any perspective mentions similar keywords
    perspectives_section = soup.find('h3', string=re.compile(r'Perspectives'))
    if perspectives_section:
        ul = perspectives_section.find_next_sibling('ul')
        if ul:
            for li in ul.find_all('li'):
                li_text = li.get_text()
                if ':' not in li_text:
                    continue
                # Extract actor name (before first colon)
                actor = li_text.split(':', 1)[0].strip()
                # NOTE(review): heuristic — credit the first actor whose
                # name appears inside the quote text; confirm this matches
                # the original intent.
                if actor and actor.lower() in quote_text.lower():
                    return actor
    return ""
def _extract_perspectives(self, soup: BeautifulSoup) -> List[Dict]:
    """Extract perspectives from the "Perspectives" H3 section.

    Returns an empty list when the section or its <ul> is missing;
    skips <li> items that fail to parse.
    """
    perspectives: List[Dict] = []
    # Find "Perspectives:" h3 tag
    h3_tags = soup.find_all('h3')
    for h3 in h3_tags:
        if 'Perspectives' in h3.get_text():
            # Get the <ul> that follows this h3
            ul = h3.find_next_sibling('ul')
            if ul:
                for li in ul.find_all('li'):
                    perspective = self._parse_perspective_li(li)
                    if perspective:
                        perspectives.append(perspective)
            break
    return perspectives
+
def _parse_perspective_li(self, li) -> Optional[Dict]:
    """Parse a single perspective <li> element.

    Format: "Actor: Description. (Source)"

    Returns:
        Dict with 'actor', 'description', 'source_url' keys, or None
        when the item doesn't match the "Actor: ..." format.
    """
    full_text = li.get_text()

    # Extract actor (before first colon); bail out if the format is off
    if ':' not in full_text:
        return None
    actor, rest = full_text.split(':', 1)

    # Find the <a> tag for source URL
    a_tag = li.find('a')
    source_url = a_tag['href'] if a_tag and a_tag.get('href') else ""

    # Extract description (between colon and source link)
    description = rest.strip()
    # Remove source citation like "(The Straits Times)" from description:
    # drop the link text and its surrounding parentheses
    if a_tag:
        link_text = a_tag.get_text()
        description = description.replace(f"({link_text})", "").strip()

    # Clean up trailing period
    description = description.strip('. ')

    return {
        'actor': actor.strip(),
        'description': description,
        'source_url': source_url,
    }
+
def _extract_sources(self, soup: BeautifulSoup) -> List[Dict]:
    """Extract sources list from the "Sources" H3 section.

    Returns an empty list when the section or its <ul> is missing;
    skips <li> items that fail to parse.
    """
    sources: List[Dict] = []
    # Find "Sources:" h3 tag
    h3_tags = soup.find_all('h3')
    for h3 in h3_tags:
        if 'Sources' in h3.get_text():
            # Get the <ul> that follows this h3
            ul = h3.find_next_sibling('ul')
            if ul:
                for li in ul.find_all('li'):
                    source = self._parse_source_li(li)
                    if source:
                        sources.append(source)
            break
    return sources
def _parse_source_li(self, li) -> Optional[Dict]:
+
Parse a single source <li> element.
+
Format: "<a href='...'>Title</a> - domain.com"
+
if not a_tag or not a_tag.get('href'):
+
title = a_tag.get_text(strip=True)
+
# Extract domain from URL
+
parsed_url = urlparse(url)
+
domain = parsed_url.netloc
+
# Remove "www." prefix if present
+
if domain.startswith('www.'):