Manage Atom feeds in a persistent git repository
1"""Tests for feed parser functionality."""
2
3import pytest
4from pydantic import HttpUrl
5
6from thicket.core.feed_parser import FeedParser
7from thicket.models import AtomEntry, FeedMetadata
8
9
class TestFeedParser:
    """Tests covering FeedParser construction, parsing, and sanitizing helpers."""

    def test_init(self):
        """A freshly built parser exposes its user agent and allowed HTML settings."""
        feed_parser = FeedParser()

        assert feed_parser.user_agent == "thicket/0.1.0"
        assert "a" in feed_parser.allowed_tags
        assert "href" in feed_parser.allowed_attributes["a"]

    def test_parse_atom_feed(self, sample_atom_feed):
        """Parse the ``sample_atom_feed`` fixture; verify metadata and the entry."""
        feed_meta, parsed_entries = FeedParser().parse_feed(sample_atom_feed)

        # Feed-level metadata
        assert isinstance(feed_meta, FeedMetadata)
        assert feed_meta.title == "Test Feed"
        assert feed_meta.author_name == "Test Author"
        assert feed_meta.author_email == "author@example.com"
        assert feed_meta.link == HttpUrl("https://example.com/")

        # Exactly one entry is expected from the fixture
        assert len(parsed_entries) == 1
        first = parsed_entries[0]
        assert isinstance(first, AtomEntry)
        assert first.title == "Test Entry"
        assert first.id == "https://example.com/entry/1"
        assert first.link == HttpUrl("https://example.com/entry/1")
        assert first.summary == "This is a test entry."
        assert "<p>This is the content of the test entry.</p>" in first.content

    def test_parse_rss_feed(self, sample_rss_feed):
        """Parse the ``sample_rss_feed`` fixture; verify metadata and the entry."""
        feed_meta, parsed_entries = FeedParser().parse_feed(sample_rss_feed)

        # Feed-level metadata
        assert isinstance(feed_meta, FeedMetadata)
        assert feed_meta.title == "Test RSS Feed"
        assert feed_meta.link == HttpUrl("https://example.com/")
        assert feed_meta.author_email == "editor@example.com"

        # Exactly one entry is expected from the fixture
        assert len(parsed_entries) == 1
        first = parsed_entries[0]
        assert isinstance(first, AtomEntry)
        assert first.title == "Test RSS Entry"
        assert first.id == "https://example.com/rss/entry/1"
        assert first.summary == "This is a test RSS entry."

    def test_sanitize_entry_id(self):
        """Check ``sanitize_entry_id`` on URL, unsafe, empty, and over-long IDs."""
        feed_parser = FeedParser()

        # A URL-shaped ID
        from_url = feed_parser.sanitize_entry_id(
            "https://example.com/posts/2025/01/test-post"
        )
        assert from_url == "posts_2025_01_test-post"

        # An ID containing problematic characters
        from_unsafe = feed_parser.sanitize_entry_id("test/with\\bad:chars|and<more>")
        assert from_unsafe == "test_with_bad_chars_and_more_"

        # An empty ID
        from_empty = feed_parser.sanitize_entry_id("")
        assert from_empty == "entry"

        # A 300-character ID
        from_long = feed_parser.sanitize_entry_id("a" * 300)
        assert len(from_long) == 200

    def test_sanitize_html(self):
        """Check ``_sanitize_html`` on safe markup, a script tag, and attributes."""
        feed_parser = FeedParser()

        # Allowed tags come back unchanged
        harmless = "<p>This is <strong>safe</strong> HTML</p>"
        assert feed_parser._sanitize_html(harmless) == harmless

        # A script tag must not appear in the output; the paragraph must
        cleaned_script = feed_parser._sanitize_html(
            "<script>alert('xss')</script><p>Safe content</p>"
        )
        assert "<script>" not in cleaned_script
        assert "<p>Safe content</p>" in cleaned_script

        # href is kept on the anchor, onclick is not
        cleaned_link = feed_parser._sanitize_html(
            '<a href="https://example.com" onclick="alert()">Link</a>'
        )
        assert 'href="https://example.com"' in cleaned_link
        assert "onclick" not in cleaned_link

    def test_extract_feed_metadata(self):
        """Check ``_extract_feed_metadata`` against an inline Atom document."""
        import feedparser

        feed_parser = FeedParser()

        # Build the input by running feedparser over a literal Atom document
        atom_document = """<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
 <title>Test Feed</title>
 <link href="https://example.com/"/>
 <author>
 <name>Test Author</name>
 <email>author@example.com</email>
 <uri>https://example.com/about</uri>
 </author>
 <logo>https://example.com/logo.png</logo>
 <icon>https://example.com/icon.png</icon>
</feed>"""
        parsed_feed = feedparser.parse(atom_document).feed

        feed_meta = feed_parser._extract_feed_metadata(parsed_feed)
        assert feed_meta.title == "Test Feed"
        assert feed_meta.author_name == "Test Author"
        assert feed_meta.author_email == "author@example.com"
        assert feed_meta.author_uri == HttpUrl("https://example.com/about")
        assert feed_meta.link == HttpUrl("https://example.com/")
        assert feed_meta.logo == HttpUrl("https://example.com/logo.png")
        assert feed_meta.icon == HttpUrl("https://example.com/icon.png")