"""CLI command for extracting and categorizing all outbound links from blog entries."""

import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse

import typer
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
from rich.table import Table

from ...core.git_store import GitStore
from ..main import app
from ..utils import load_config, get_tsv_mode

console = Console()


class LinkData:
    """Represents a link found in a blog entry."""

    def __init__(self, url: str, entry_id: str, username: str):
        self.url = url
        self.entry_id = entry_id
        self.username = username

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""
        return {
            "url": self.url,
            "entry_id": self.entry_id,
            "username": self.username,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "LinkData":
        """Create from dictionary."""
        return cls(
            url=data["url"],
            entry_id=data["entry_id"],
            username=data["username"],
        )
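
# to_dict/from_dict are symmetric, so a LinkData instance survives a JSON
# round trip, e.g. LinkData.from_dict(json.loads(json.dumps(link.to_dict()))).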


class LinkCategorizer:
    """Categorizes links as internal, user, or unknown."""

    def __init__(self, user_domains: Dict[str, Set[str]]):
        self.user_domains = user_domains
        # Create reverse mapping of domain -> username
        self.domain_to_user = {}
        for username, domains in user_domains.items():
            for domain in domains:
                self.domain_to_user[domain] = username
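        # Note: if two users ever list the same domain, the user processed
        # last wins the reverse-mapping slot.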

    def categorize_url(self, url: str, source_username: str) -> tuple[str, Optional[str]]:
        """
        Categorize a URL as 'internal', 'user', or 'unknown'.
        Returns (category, target_username).
        """
        try:
            parsed = urlparse(url)
            domain = parsed.netloc.lower()

            # Check if it's a link to the same user's domain (internal)
            if domain in self.user_domains.get(source_username, set()):
                return "internal", source_username

            # Check if it's a link to another user's domain
            if domain in self.domain_to_user:
                return "user", self.domain_to_user[domain]

            # Everything else is unknown
            return "unknown", None
        except Exception:
            return "unknown", None


class LinkExtractor:
    """Extracts and resolves links from blog entries."""

    def __init__(self):
        # Pattern for extracting links from HTML
        self.link_pattern = re.compile(r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL)
        self.url_pattern = re.compile(r'https?://[^\s<>"]+')
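        # Best-effort extraction: link_pattern only matches double-quoted
        # href attributes, and url_pattern picks up bare http(s) URLs in
        # text; a full HTML parser would be more robust.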

    def extract_links_from_html(self, html_content: str, base_url: str) -> List[tuple[str, str]]:
        """Extract all links from HTML content and resolve them against base URL."""
        links = []

        # Extract links from <a> tags
        for match in self.link_pattern.finditer(html_content):
            url = match.group(1)
            text = re.sub(r'<[^>]+>', '', match.group(2)).strip()  # Remove HTML tags from link text

            # Resolve relative URLs against base URL
            resolved_url = urljoin(base_url, url)
            links.append((resolved_url, text))

        return links

    def extract_links_from_entry(self, entry, username: str, base_url: str) -> List[LinkData]:
        """Extract all links from a blog entry."""
        links = []

        # Combine all text content for analysis
        content_to_search = []
        if entry.content:
            content_to_search.append(entry.content)
        if entry.summary:
            content_to_search.append(entry.summary)

        for content in content_to_search:
            extracted_links = self.extract_links_from_html(content, base_url)

            for url, link_text in extracted_links:
                # Skip empty URLs and same-page fragment links
                if not url or url.startswith('#'):
                    continue

                link_data = LinkData(
                    url=url,
                    entry_id=entry.id,
                    username=username,
                )

                links.append(link_data)

        return links
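
# Illustrative usage (hypothetical entry object with content/summary/id):
#
#   extractor = LinkExtractor()
#   found = extractor.extract_links_from_entry(entry, "alice", "https://alice.example.org/atom.xml")
#   [link.url for link in found]  # absolute URLs; fragment-only links skipped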


@app.command()
def links(  # command name assumed from the module's purpose
    config_file: Optional[Path] = typer.Option(
        Path("thicket.yaml"),
        help="Path to configuration file",
    ),
    output_file: Optional[Path] = typer.Option(
        None,
        help="Path to output links file (default: links.json in git store)",
    ),
    mapping_file: Optional[Path] = typer.Option(
        None,
        help="Path to output URL <-> atom ID mapping file (default: url_mapping.json in git store)",
    ),
    verbose: bool = typer.Option(
        False,
        help="Show detailed progress information",
    ),
) -> None:
    """Extract and categorize all outbound links from blog entries.

    This command analyzes all blog entries to extract outbound links,
    resolve them properly with respect to the feed's base URL, and
    categorize them as internal, user, or unknown links.
    """
    try:
        # Load configuration
        config = load_config(config_file)

        # Initialize Git store
        git_store = GitStore(config.git_store)

        # Build user domain mapping
        console.print("Building user domain mapping...")

        index = git_store._load_index()

        user_domains = {}
        for username, user_metadata in index.users.items():
            domains = set()

            # Add domains from feeds
            for feed_url in user_metadata.feeds:
                domain = urlparse(feed_url).netloc.lower()
                if domain:
                    domains.add(domain)

            # Add domain from homepage
            if user_metadata.homepage:
                domain = urlparse(str(user_metadata.homepage)).netloc.lower()
                if domain:
                    domains.add(domain)

            user_domains[username] = domains

        console.print(f"Found {len(user_domains)} users with {sum(len(d) for d in user_domains.values())} total domains")

        # Initialize components
        link_extractor = LinkExtractor()
        categorizer = LinkCategorizer(user_domains)

        users = list(index.users.keys())
        if not users:
            console.print("[yellow]No users found in Git store[/yellow]")
            raise typer.Exit(0)

        # Process all entries
        all_links = []
        link_categories = {"internal": [], "user": [], "unknown": []}
        link_dict = {}  # Dictionary with link URL as key, maps to atom ID
        reverse_dict = {}  # Dictionary with atom ID as key, maps to list of URLs

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            console=console,
        ) as progress:
            # Count total entries first
            counting_task = progress.add_task("Counting entries...", total=len(users))

            total_entries = 0
            for username in users:
                entries = git_store.list_entries(username)
                total_entries += len(entries)
                progress.advance(counting_task)

            progress.remove_task(counting_task)

            processing_task = progress.add_task(
                f"Processing {total_entries} entries...",
                total=total_entries,
            )

            for username in users:
                entries = git_store.list_entries(username)
                user_metadata = index.users[username]

                # Get base URL for this user (use first feed URL)
                base_url = str(user_metadata.feeds[0]) if user_metadata.feeds else "https://example.com"
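
                # Caveat: entry HTML is resolved against the feed URL, which
                # may differ from the entry's own canonical base URL.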
                for entry in entries:
                    # Extract links from this entry
                    entry_links = link_extractor.extract_links_from_entry(entry, username, base_url)

                    # Track unique links per entry
                    entry_urls_seen = set()

                    # Categorize each link
                    for link_data in entry_links:
                        # Skip if we've already seen this URL in this entry
                        if link_data.url in entry_urls_seen:
                            continue
                        entry_urls_seen.add(link_data.url)

                        category, target_username = categorizer.categorize_url(link_data.url, username)

                        # Add to link dictionary (URL as key, maps to atom ID only)
                        if link_data.url not in link_dict:
                            link_dict[link_data.url] = link_data.entry_id

                        # Also add to reverse mapping (atom ID -> list of URLs)
                        if link_data.entry_id not in reverse_dict:
                            reverse_dict[link_data.entry_id] = []
                        reverse_dict[link_data.entry_id].append(link_data.url)
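                        # link_dict keeps only the first entry seen for each
                        # URL, while reverse_dict records the URL under every
                        # entry that links to it.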

                        # Add category info to link data for categories tracking
                        link_info = link_data.to_dict()
                        link_info["category"] = category
                        link_info["target_username"] = target_username

                        all_links.append(link_info)
                        link_categories[category].append(link_info)

                    progress.advance(processing_task)

                    if verbose and entry_links:
                        console.print(f"  Found {len(entry_links)} links in {username}:{entry.title[:50]}...")

        # Determine output paths
        output_path = output_file
        if output_path is None:
            output_path = config.git_store / "links.json"

        mapping_path = mapping_file
        if mapping_path is None:
            mapping_path = config.git_store / "url_mapping.json"

        # Save all extracted links (not just filtered ones)
        console.print("Preparing output data...")

        # Build a set of all URLs that correspond to posts in the git database
        registered_urls = set()

        # Get all entries from all users and build URL mappings
        for username in users:
            entries = git_store.list_entries(username)
            user_metadata = index.users[username]

            for entry in entries:
                # Try to match entry URLs with extracted links
                if hasattr(entry, 'link') and entry.link:
                    registered_urls.add(entry.link)

                # Also check entry alternate links if they exist
                if hasattr(entry, 'links') and entry.links:
                    for link in entry.links:
                        if hasattr(link, 'href') and link.href:
                            registered_urls.add(link.href)

        # Create filtered version for URL mapping (only links to registered posts)
        filtered_link_dict = {}
        filtered_reverse_dict = {}

        for url, entry_id in link_dict.items():
            if url in registered_urls:
                filtered_link_dict[url] = entry_id

                # Also update reverse mapping
                if entry_id not in filtered_reverse_dict:
                    filtered_reverse_dict[entry_id] = []
                filtered_reverse_dict[entry_id].append(url)

        # Use all links for main output, not filtered ones
        output_data = link_dict

        console.print(f"Found {len(link_dict)} total links, {len(filtered_link_dict)} links to registered posts")

        # Save links data (URL -> atom ID mapping, all links)
        with open(output_path, "w") as f:
            json.dump(output_data, f, indent=2, default=str)

        # Save bidirectional mapping file (filtered)
        mapping_data = {
            "url_to_atom": filtered_link_dict,
            "atom_to_urls": filtered_reverse_dict,
        }
        with open(mapping_path, "w") as f:
            json.dump(mapping_data, f, indent=2, default=str)
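
        # The mapping file ends up shaped like (illustrative values):
        #   {"url_to_atom": {"https://...": "tag:..."},
        #    "atom_to_urls": {"tag:...": ["https://..."]}}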

        if not get_tsv_mode():
            console.print("\n[green]✓ Links extraction completed successfully[/green]")

        # Create summary table or TSV output
        if get_tsv_mode():
            print("Category\tCount\tDescription")
            print(f"Internal\t{len(link_categories['internal'])}\tLinks to same user's domain")
            print(f"User\t{len(link_categories['user'])}\tLinks to other tracked users")
            print(f"Unknown\t{len(link_categories['unknown'])}\tLinks to external sites")
            print(f"Total Extracted\t{len(all_links)}\tAll extracted links")
            print(f"Saved to Output\t{len(output_data)}\tLinks saved to output file")
            print(f"Cross-references\t{len(filtered_link_dict)}\tLinks to registered posts only")
        else:
            table = Table(title="Links Summary")
            table.add_column("Category", style="cyan")
            table.add_column("Count", style="green")
            table.add_column("Description", style="white")

            table.add_row("Internal", str(len(link_categories["internal"])), "Links to same user's domain")
            table.add_row("User", str(len(link_categories["user"])), "Links to other tracked users")
            table.add_row("Unknown", str(len(link_categories["unknown"])), "Links to external sites")
            table.add_row("Total Extracted", str(len(all_links)), "All extracted links")
            table.add_row("Saved to Output", str(len(output_data)), "Links saved to output file")
            table.add_row("Cross-references", str(len(filtered_link_dict)), "Links to registered posts only")

            console.print(table)

        # Show user links if verbose
        if verbose and link_categories["user"]:
            if get_tsv_mode():
                print("User Link Source\tUser Link Target\tLink Count")
                user_link_counts = {}

                for link in link_categories["user"]:
                    key = f"{link['username']} -> {link['target_username']}"
                    user_link_counts[key] = user_link_counts.get(key, 0) + 1

                for link_pair, count in sorted(user_link_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
                    source, target = link_pair.split(" -> ")
                    print(f"{source}\t{target}\t{count}")
            else:
                console.print("\n[bold]User-to-user links:[/bold]")
                user_link_counts = {}

                for link in link_categories["user"]:
                    key = f"{link['username']} -> {link['target_username']}"
                    user_link_counts[key] = user_link_counts.get(key, 0) + 1

                for link_pair, count in sorted(user_link_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
                    console.print(f"  {link_pair}: {count} links")

        if not get_tsv_mode():
            console.print(f"\nLinks output saved to: {output_path}")
            console.print(f"URL mapping saved to: {mapping_path}")

    except Exception as e:
        console.print(f"[red]Error extracting links: {e}[/red]")
        if verbose:
            console.print_exception()
        raise typer.Exit(1)