···
"""Represents a reference from one blog entry to another."""
13
-
def __init__(self, source_entry_id: str, source_username: str,
14
-
target_url: str, target_username: Optional[str] = None,
15
-
target_entry_id: Optional[str] = None):
15
+
source_entry_id: str,
16
+
source_username: str,
18
+
target_username: Optional[str] = None,
19
+
target_entry_id: Optional[str] = None,
self.source_entry_id = source_entry_id
self.source_username = source_username
self.target_url = target_url
···
def to_dict(self) -> dict:
"""Convert to dictionary for JSON serialization."""
"source_entry_id": self.source_entry_id,
"source_username": self.source_username,
"target_url": self.target_url,
28
-
"target_username": self.target_username,
29
-
"target_entry_id": self.target_entry_id
35
+
# Only include optional fields if they are not None
36
+
if self.target_username is not None:
37
+
result["target_username"] = self.target_username
38
+
if self.target_entry_id is not None:
39
+
result["target_entry_id"] = self.target_entry_id
def from_dict(cls, data: dict) -> "BlogReference":
"""Create from dictionary."""
···
source_username=data["source_username"],
target_url=data["target_url"],
target_username=data.get("target_username"),
40
-
target_entry_id=data.get("target_entry_id")
51
+
target_entry_id=data.get("target_entry_id"),
···
self.references: list[BlogReference] = []
49
-
self.outbound_refs: dict[str, list[BlogReference]] = {} # entry_id -> outbound refs
50
-
self.inbound_refs: dict[str, list[BlogReference]] = {} # entry_id -> inbound refs
60
+
self.outbound_refs: dict[
61
+
str, list[BlogReference]
62
+
] = {} # entry_id -> outbound refs
63
+
self.inbound_refs: dict[
64
+
str, list[BlogReference]
65
+
] = {} # entry_id -> inbound refs
self.user_domains: dict[str, set[str]] = {} # username -> set of domains
def add_reference(self, ref: BlogReference) -> None:
···
"""Convert to dictionary for JSON serialization."""
"references": [ref.to_dict() for ref in self.references],
109
-
"user_domains": {k: list(v) for k, v in self.user_domains.items()}
124
+
"user_domains": {k: list(v) for k, v in self.user_domains.items()},
···
# Common blog platforms and patterns
132
-
r'https?://[^/]+\.(?:org|com|net|io|dev|me|co\.uk)/.*', # Common blog domains
133
-
r'https?://[^/]+\.github\.io/.*', # GitHub Pages
134
-
r'https?://[^/]+\.substack\.com/.*', # Substack
135
-
r'https?://medium\.com/.*', # Medium
136
-
r'https?://[^/]+\.wordpress\.com/.*', # WordPress.com
137
-
r'https?://[^/]+\.blogspot\.com/.*', # Blogger
147
+
r"https?://[^/]+\.(?:org|com|net|io|dev|me|co\.uk)/.*", # Common blog domains
148
+
r"https?://[^/]+\.github\.io/.*", # GitHub Pages
149
+
r"https?://[^/]+\.substack\.com/.*", # Substack
150
+
r"https?://medium\.com/.*", # Medium
151
+
r"https?://[^/]+\.wordpress\.com/.*", # WordPress.com
152
+
r"https?://[^/]+\.blogspot\.com/.*", # Blogger
141
-
self.link_pattern = re.compile(r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL)
156
+
self.link_pattern = re.compile(
157
+
r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL
self.url_pattern = re.compile(r'https?://[^\s<>"]+')
def extract_links_from_html(self, html_content: str) -> list[tuple[str, str]]:
···
# Extract links from <a> tags
for match in self.link_pattern.finditer(html_content):
151
-
text = re.sub(r'<[^>]+>', '', match.group(2)).strip() # Remove HTML tags from link text
169
+
r"<[^>]+>", "", match.group(2)
170
+
).strip() # Remove HTML tags from link text
links.append((url, text))
···
164
-
def resolve_target_user(self, url: str, user_domains: dict[str, set[str]]) -> Optional[str]:
182
+
def resolve_target_user(
183
+
self, url: str, user_domains: dict[str, set[str]]
184
+
) -> Optional[str]:
"""Try to resolve a URL to a known user based on domain mapping."""
parsed_url = urlparse(url)
domain = parsed_url.netloc.lower()
···
175
-
def extract_references(self, entry: AtomEntry, username: str,
176
-
user_domains: dict[str, set[str]]) -> list[BlogReference]:
195
+
def extract_references(
196
+
self, entry: AtomEntry, username: str, user_domains: dict[str, set[str]]
197
+
) -> list[BlogReference]:
"""Extract all blog references from an entry."""
···
for url, _link_text in links:
# Skip internal links (same domain as the entry)
192
-
entry_domain = urlparse(str(entry.link)).netloc.lower() if entry.link else ""
214
+
urlparse(str(entry.link)).netloc.lower() if entry.link else ""
link_domain = urlparse(url).netloc.lower()
if link_domain == entry_domain:
···
source_username=username,
target_username=target_username,
210
-
target_entry_id=None # Will be resolved later if possible
233
+
target_entry_id=None, # Will be resolved later if possible
···
241
-
def resolve_target_entry_ids(self, references: list[BlogReference], git_store: "GitStore") -> list[BlogReference]:
264
+
def resolve_target_entry_ids(
265
+
self, references: list[BlogReference], git_store: "GitStore"
266
+
) -> list[BlogReference]:
"""Resolve target_entry_id for references that have target_username but no target_entry_id."""
···
source_username=ref.source_username,
target_url=ref.target_url,
target_username=ref.target_username,
272
-
target_entry_id=resolved_entry_id
297
+
target_entry_id=resolved_entry_id,
resolved_refs.append(resolved_ref)