Manage Atom feeds in a persistent git repository
1"""Tests for feed parser functionality."""
2
3from pydantic import HttpUrl
4
5from thicket.core.feed_parser import FeedParser
6from thicket.models import AtomEntry, FeedMetadata
7
8
class TestFeedParser:
    """Exercise FeedParser: construction, feed parsing, and sanitization."""

    def test_init(self):
        """A freshly built parser carries the expected defaults."""
        feed_parser = FeedParser()

        assert feed_parser.user_agent == "thicket/0.1.0"
        assert "a" in feed_parser.allowed_tags
        assert "href" in feed_parser.allowed_attributes["a"]

    def test_parse_atom_feed(self, sample_atom_feed):
        """Parse the Atom fixture and verify its metadata and single entry."""
        feed_meta, parsed_entries = FeedParser().parse_feed(sample_atom_feed)

        # Feed-level metadata
        assert isinstance(feed_meta, FeedMetadata)
        assert feed_meta.title == "Test Feed"
        assert feed_meta.author_name == "Test Author"
        assert feed_meta.author_email == "author@example.com"
        assert feed_meta.link == HttpUrl("https://example.com/")

        # The fixture is expected to yield exactly one entry
        assert len(parsed_entries) == 1
        first_entry = parsed_entries[0]
        assert isinstance(first_entry, AtomEntry)
        assert first_entry.title == "Test Entry"
        assert first_entry.id == "https://example.com/entry/1"
        assert first_entry.link == HttpUrl("https://example.com/entry/1")
        assert first_entry.summary == "This is a test entry."
        assert (
            "<p>This is the content of the test entry.</p>" in first_entry.content
        )

    def test_parse_rss_feed(self, sample_rss_feed):
        """Parse the RSS fixture and verify its metadata and single entry."""
        feed_meta, parsed_entries = FeedParser().parse_feed(sample_rss_feed)

        # Feed-level metadata
        assert isinstance(feed_meta, FeedMetadata)
        assert feed_meta.title == "Test RSS Feed"
        assert feed_meta.link == HttpUrl("https://example.com/")
        assert feed_meta.author_email == "editor@example.com"

        # The RSS item is expected to come back as an AtomEntry
        assert len(parsed_entries) == 1
        first_entry = parsed_entries[0]
        assert isinstance(first_entry, AtomEntry)
        assert first_entry.title == "Test RSS Entry"
        assert first_entry.id == "https://example.com/rss/entry/1"
        assert first_entry.summary == "This is a test RSS entry."

    def test_sanitize_entry_id(self):
        """Check sanitize_entry_id against URL, unsafe, empty and long IDs."""
        feed_parser = FeedParser()

        # URL-shaped ID
        from_url = feed_parser.sanitize_entry_id(
            "https://example.com/posts/2025/01/test-post"
        )
        assert from_url == "posts_2025_01_test-post"

        # ID containing slashes, colons, pipes and angle brackets
        from_unsafe = feed_parser.sanitize_entry_id("test/with\\bad:chars|and<more>")
        assert from_unsafe == "test_with_bad_chars_and_more_"

        # Empty ID is expected to come back as "entry"
        from_empty = feed_parser.sanitize_entry_id("")
        assert from_empty == "entry"

        # A 300-character ID is expected to come back 200 characters long
        from_long = feed_parser.sanitize_entry_id("a" * 300)
        assert len(from_long) == 200

    def test_sanitize_html(self):
        """Check _sanitize_html on safe markup, a script tag and an event attribute."""
        feed_parser = FeedParser()

        # Markup built only from permitted tags must pass through unchanged
        permitted_markup = "<p>This is <strong>safe</strong> HTML</p>"
        assert feed_parser._sanitize_html(permitted_markup) == permitted_markup

        # The script tag must disappear while the paragraph survives
        script_result = feed_parser._sanitize_html(
            "<script>alert('xss')</script><p>Safe content</p>"
        )
        assert "<script>" not in script_result
        assert "<p>Safe content</p>" in script_result

        # href is kept on the anchor; onclick is dropped
        anchor_result = feed_parser._sanitize_html(
            '<a href="https://example.com" onclick="alert()">Link</a>'
        )
        assert 'href="https://example.com"' in anchor_result
        assert "onclick" not in anchor_result

    def test_extract_feed_metadata(self):
        """Check _extract_feed_metadata on an inline Atom document."""
        feed_parser = FeedParser()

        # Build the input with feedparser itself so the extractor receives
        # the same structure it would get from a real parse.
        import feedparser

        atom_document = """<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
 <title>Test Feed</title>
 <link href="https://example.com/"/>
 <author>
 <name>Test Author</name>
 <email>author@example.com</email>
 <uri>https://example.com/about</uri>
 </author>
 <logo>https://example.com/logo.png</logo>
 <icon>https://example.com/icon.png</icon>
</feed>"""
        parsed_document = feedparser.parse(atom_document)

        feed_meta = feed_parser._extract_feed_metadata(parsed_document.feed)
        assert feed_meta.title == "Test Feed"
        assert feed_meta.author_name == "Test Author"
        assert feed_meta.author_email == "author@example.com"
        assert feed_meta.author_uri == HttpUrl("https://example.com/about")
        assert feed_meta.link == HttpUrl("https://example.com/")
        assert feed_meta.logo == HttpUrl("https://example.com/logo.png")
        assert feed_meta.icon == HttpUrl("https://example.com/icon.png")