Manage Atom feeds in a persistent git repository

Trim None values from serialized JSON

Changed files: +55 -30

src/thicket/cli/utils.py  +1 -1
···
"""Save thicket configuration to file."""
import yaml
- config_data = config.model_dump(mode="json")
# Convert Path objects to strings for YAML serialization
config_data["git_store"] = str(config_data["git_store"])
···
"""Save thicket configuration to file."""
import yaml
+ config_data = config.model_dump(mode="json", exclude_none=True)
# Convert Path objects to strings for YAML serialization
config_data["git_store"] = str(config_data["git_store"])
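For context, a minimal sketch of what exclude_none=True changes in Pydantic v2's model_dump (the ThicketConfig fields here are illustrative, not necessarily the real config model's):

from pathlib import Path
from typing import Optional

from pydantic import BaseModel

class ThicketConfig(BaseModel):  # illustrative stand-in
    git_store: Path
    editor: Optional[str] = None  # optional field left unset

config = ThicketConfig(git_store=Path("feeds"))

# Without exclude_none, unset optionals serialize as explicit nulls:
print(config.model_dump(mode="json"))  # {'git_store': 'feeds', 'editor': None}

# With exclude_none=True, they are dropped, trimming the output:
print(config.model_dump(mode="json", exclude_none=True))  # {'git_store': 'feeds'}

Note that with mode="json" the git_store value is already a plain string, so the str() conversion above acts as an extra safeguard before YAML serialization.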
src/thicket/core/git_store.py  +3 -3
···
"""Save the index to index.json."""
index_path = self.repo_path / "index.json"
with open(index_path, "w") as f:
- json.dump(index.model_dump(mode="json"), f, indent=2, default=str)
def _load_index(self) -> GitStoreIndex:
"""Load the index from index.json."""
···
"""Save duplicates map to duplicates.json."""
duplicates_path = self.repo_path / "duplicates.json"
with open(duplicates_path, "w") as f:
- json.dump(duplicates.model_dump(), f, indent=2)
def _load_duplicates(self) -> DuplicateMap:
"""Load duplicates map from duplicates.json."""
···
# Save entry
with open(entry_path, "w") as f:
- json.dump(entry.model_dump(mode="json"), f, indent=2, default=str)
# Update user metadata if new entry
if not entry_exists:
···
"""Save the index to index.json."""
index_path = self.repo_path / "index.json"
with open(index_path, "w") as f:
+ json.dump(index.model_dump(mode="json", exclude_none=True), f, indent=2, default=str)
def _load_index(self) -> GitStoreIndex:
"""Load the index from index.json."""
···
"""Save duplicates map to duplicates.json."""
duplicates_path = self.repo_path / "duplicates.json"
with open(duplicates_path, "w") as f:
+ json.dump(duplicates.model_dump(exclude_none=True), f, indent=2)
def _load_duplicates(self) -> DuplicateMap:
"""Load duplicates map from duplicates.json."""
···
# Save entry
with open(entry_path, "w") as f:
+ json.dump(entry.model_dump(mode="json", exclude_none=True), f, indent=2, default=str)
# Update user metadata if new entry
if not entry_exists:
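A rough sketch of the save path after this change, assuming an index model with a datetime and an optional field (field names invented for illustration; the real GitStoreIndex is defined in the package's models):

import json
from datetime import datetime, timezone
from typing import Optional

from pydantic import BaseModel

class Index(BaseModel):  # illustrative stand-in for GitStoreIndex
    version: int = 1
    updated: datetime
    comment: Optional[str] = None

index = Index(updated=datetime.now(timezone.utc))

with open("index.json", "w") as f:
    # mode="json" renders the datetime as an ISO string, exclude_none drops the
    # unset optional, and default=str stays as a fallback for any stray type
    json.dump(index.model_dump(mode="json", exclude_none=True), f, indent=2, default=str)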
src/thicket/core/reference_parser.py  +51 -26
···
class BlogReference:
"""Represents a reference from one blog entry to another."""
- def __init__(self, source_entry_id: str, source_username: str,
-              target_url: str, target_username: Optional[str] = None,
-              target_entry_id: Optional[str] = None):
self.source_entry_id = source_entry_id
self.source_username = source_username
self.target_url = target_url
···
def to_dict(self) -> dict:
"""Convert to dictionary for JSON serialization."""
- return {
"source_entry_id": self.source_entry_id,
"source_username": self.source_username,
"target_url": self.target_url,
- "target_username": self.target_username,
- "target_entry_id": self.target_entry_id
}
@classmethod
def from_dict(cls, data: dict) -> "BlogReference":
"""Create from dictionary."""
···
source_username=data["source_username"],
target_url=data["target_url"],
target_username=data.get("target_username"),
- target_entry_id=data.get("target_entry_id")
)
···
def __init__(self):
self.references: list[BlogReference] = []
- self.outbound_refs: dict[str, list[BlogReference]] = {} # entry_id -> outbound refs
- self.inbound_refs: dict[str, list[BlogReference]] = {} # entry_id -> inbound refs
self.user_domains: dict[str, set[str]] = {} # username -> set of domains
def add_reference(self, ref: BlogReference) -> None:
···
"""Convert to dictionary for JSON serialization."""
return {
"references": [ref.to_dict() for ref in self.references],
- "user_domains": {k: list(v) for k, v in self.user_domains.items()}
}
@classmethod
···
def __init__(self):
# Common blog platforms and patterns
self.blog_patterns = [
- r'https?://[^/]+\.(?:org|com|net|io|dev|me|co\.uk)/.*', # Common blog domains
- r'https?://[^/]+\.github\.io/.*', # GitHub Pages
- r'https?://[^/]+\.substack\.com/.*', # Substack
- r'https?://medium\.com/.*', # Medium
- r'https?://[^/]+\.wordpress\.com/.*', # WordPress.com
- r'https?://[^/]+\.blogspot\.com/.*', # Blogger
]
# Compile regex patterns
- self.link_pattern = re.compile(r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL)
self.url_pattern = re.compile(r'https?://[^\s<>"]+')
def extract_links_from_html(self, html_content: str) -> list[tuple[str, str]]:
···
# Extract links from <a> tags
for match in self.link_pattern.finditer(html_content):
url = match.group(1)
- text = re.sub(r'<[^>]+>', '', match.group(2)).strip() # Remove HTML tags from link text
links.append((url, text))
return links
···
return True
return False
-
- def resolve_target_user(self, url: str, user_domains: dict[str, set[str]]) -> Optional[str]:
"""Try to resolve a URL to a known user based on domain mapping."""
parsed_url = urlparse(url)
domain = parsed_url.netloc.lower()
···
return None
- def extract_references(self, entry: AtomEntry, username: str,
-                        user_domains: dict[str, set[str]]) -> list[BlogReference]:
"""Extract all blog references from an entry."""
references = []
···
for url, _link_text in links:
# Skip internal links (same domain as the entry)
- entry_domain = urlparse(str(entry.link)).netloc.lower() if entry.link else ""
link_domain = urlparse(url).netloc.lower()
if link_domain == entry_domain:
···
source_username=username,
target_url=url,
target_username=target_username,
- target_entry_id=None # Will be resolved later if possible
)
references.append(ref)
···
return user_domains
- def resolve_target_entry_ids(self, references: list[BlogReference], git_store: "GitStore") -> list[BlogReference]:
"""Resolve target_entry_id for references that have target_username but no target_entry_id."""
resolved_refs = []
···
source_username=ref.source_username,
target_url=ref.target_url,
target_username=ref.target_username,
- target_entry_id=resolved_entry_id
)
resolved_refs.append(resolved_ref)
···
class BlogReference:
"""Represents a reference from one blog entry to another."""
+ def __init__(
+     self,
+     source_entry_id: str,
+     source_username: str,
+     target_url: str,
+     target_username: Optional[str] = None,
+     target_entry_id: Optional[str] = None,
+ ):
self.source_entry_id = source_entry_id
self.source_username = source_username
self.target_url = target_url
···
def to_dict(self) -> dict:
"""Convert to dictionary for JSON serialization."""
+ result = {
"source_entry_id": self.source_entry_id,
"source_username": self.source_username,
"target_url": self.target_url,
}
+ # Only include optional fields if they are not None
+ if self.target_username is not None:
+     result["target_username"] = self.target_username
+ if self.target_entry_id is not None:
+     result["target_entry_id"] = self.target_entry_id
+
+ return result
+
@classmethod
def from_dict(cls, data: dict) -> "BlogReference":
"""Create from dictionary."""
···
source_username=data["source_username"],
target_url=data["target_url"],
target_username=data.get("target_username"),
+ target_entry_id=data.get("target_entry_id"),
)
···
def __init__(self):
self.references: list[BlogReference] = []
+ self.outbound_refs: dict[
+     str, list[BlogReference]
+ ] = {} # entry_id -> outbound refs
+ self.inbound_refs: dict[
+     str, list[BlogReference]
+ ] = {} # entry_id -> inbound refs
self.user_domains: dict[str, set[str]] = {} # username -> set of domains
def add_reference(self, ref: BlogReference) -> None:
···
"""Convert to dictionary for JSON serialization."""
return {
"references": [ref.to_dict() for ref in self.references],
+ "user_domains": {k: list(v) for k, v in self.user_domains.items()},
}
@classmethod
···
def __init__(self):
# Common blog platforms and patterns
self.blog_patterns = [
+ r"https?://[^/]+\.(?:org|com|net|io|dev|me|co\.uk)/.*", # Common blog domains
+ r"https?://[^/]+\.github\.io/.*", # GitHub Pages
+ r"https?://[^/]+\.substack\.com/.*", # Substack
+ r"https?://medium\.com/.*", # Medium
+ r"https?://[^/]+\.wordpress\.com/.*", # WordPress.com
+ r"https?://[^/]+\.blogspot\.com/.*", # Blogger
]
# Compile regex patterns
+ self.link_pattern = re.compile(
+     r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL
+ )
self.url_pattern = re.compile(r'https?://[^\s<>"]+')
def extract_links_from_html(self, html_content: str) -> list[tuple[str, str]]:
···
# Extract links from <a> tags
for match in self.link_pattern.finditer(html_content):
url = match.group(1)
+ text = re.sub(
+     r"<[^>]+>", "", match.group(2)
+ ).strip() # Remove HTML tags from link text
links.append((url, text))
return links
···
return True
return False
+ def resolve_target_user(
+     self, url: str, user_domains: dict[str, set[str]]
+ ) -> Optional[str]:
"""Try to resolve a URL to a known user based on domain mapping."""
parsed_url = urlparse(url)
domain = parsed_url.netloc.lower()
···
return None
+ def extract_references(
+     self, entry: AtomEntry, username: str, user_domains: dict[str, set[str]]
+ ) -> list[BlogReference]:
"""Extract all blog references from an entry."""
references = []
···
for url, _link_text in links:
# Skip internal links (same domain as the entry)
+ entry_domain = (
+     urlparse(str(entry.link)).netloc.lower() if entry.link else ""
+ )
link_domain = urlparse(url).netloc.lower()
if link_domain == entry_domain:
···
source_username=username,
target_url=url,
target_username=target_username,
+ target_entry_id=None, # Will be resolved later if possible
)
references.append(ref)
···
return user_domains
+ def resolve_target_entry_ids(
+     self, references: list[BlogReference], git_store: "GitStore"
+ ) -> list[BlogReference]:
"""Resolve target_entry_id for references that have target_username but no target_entry_id."""
resolved_refs = []
···
source_username=ref.source_username,
target_url=ref.target_url,
target_username=ref.target_username,
+ target_entry_id=resolved_entry_id,
)
resolved_refs.append(resolved_ref)
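The net effect of the to_dict change shows up on round-trips: None-valued optionals disappear from the serialized dict, and from_dict restores them as None through its .get calls. A quick illustration using the class as patched above (field values invented):

ref = BlogReference(
    source_entry_id="entry-1",
    source_username="alice",
    target_url="https://example.com/post",
)

data = ref.to_dict()
# {'source_entry_id': 'entry-1', 'source_username': 'alice',
#  'target_url': 'https://example.com/post'} -- no None-valued keys

restored = BlogReference.from_dict(data)
assert restored.target_username is None  # .get() defaults missing keys to None
assert restored.target_entry_id is None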