Manage Atom feeds in a persistent git repository
"""Tests for feed parser functionality."""

import feedparser
from pydantic import HttpUrl

from thicket.core.feed_parser import FeedParser
from thicket.models import AtomEntry, FeedMetadata


class TestFeedParser:
    """Test the FeedParser class.

    Covers construction defaults, end-to-end parsing of Atom and RSS
    documents, entry-ID sanitization, HTML sanitization, and extraction of
    feed-level metadata from a ``feedparser`` result.
    """

    def test_init(self) -> None:
        """Test parser initialization.

        A freshly constructed parser must expose its user agent string and
        an HTML allow-list that permits ``<a href=...>``.
        """
        parser = FeedParser()

        assert parser.user_agent == "thicket/0.1.0"
        assert "a" in parser.allowed_tags
        assert "href" in parser.allowed_attributes["a"]

    def test_parse_atom_feed(self, sample_atom_feed) -> None:
        """Test parsing an Atom feed.

        Args:
            sample_atom_feed: Fixture supplying the Atom document to parse
                (defined outside this module -- presumably in conftest.py).
        """
        parser = FeedParser()
        metadata, entries = parser.parse_feed(sample_atom_feed)

        # Check metadata
        assert isinstance(metadata, FeedMetadata)
        assert metadata.title == "Test Feed"
        assert metadata.author_name == "Test Author"
        assert metadata.author_email == "author@example.com"
        assert metadata.link == HttpUrl("https://example.com/")

        # Check entries
        assert len(entries) == 1
        entry = entries[0]
        assert isinstance(entry, AtomEntry)
        assert entry.title == "Test Entry"
        assert entry.id == "https://example.com/entry/1"
        assert entry.link == HttpUrl("https://example.com/entry/1")
        assert entry.summary == "This is a test entry."
        assert "<p>This is the content of the test entry.</p>" in entry.content

    def test_parse_rss_feed(self, sample_rss_feed) -> None:
        """Test parsing an RSS feed.

        RSS items are expected to come back as ``AtomEntry`` instances, the
        same model used for Atom entries.

        Args:
            sample_rss_feed: Fixture supplying the RSS document to parse
                (defined outside this module -- presumably in conftest.py).
        """
        parser = FeedParser()
        metadata, entries = parser.parse_feed(sample_rss_feed)

        # Check metadata
        assert isinstance(metadata, FeedMetadata)
        assert metadata.title == "Test RSS Feed"
        assert metadata.link == HttpUrl("https://example.com/")
        assert metadata.author_email == "editor@example.com"

        # Check entries
        assert len(entries) == 1
        entry = entries[0]
        assert isinstance(entry, AtomEntry)
        assert entry.title == "Test RSS Entry"
        assert entry.id == "https://example.com/rss/entry/1"
        assert entry.summary == "This is a test RSS entry."

    def test_sanitize_entry_id(self) -> None:
        """Test entry ID sanitization.

        Exercises four inputs: a URL-shaped ID, an ID containing characters
        that are awkward in file names, the empty string, and an over-long
        ID.
        """
        parser = FeedParser()

        # Test URL ID: only the path survives, with "/" replaced by "_"
        url_id = "https://example.com/posts/2025/01/test-post"
        sanitized = parser.sanitize_entry_id(url_id)
        assert sanitized == "posts_2025_01_test-post"

        # Test problematic characters: each one is replaced by "_"
        bad_id = "test/with\\bad:chars|and<more>"
        sanitized = parser.sanitize_entry_id(bad_id)
        assert sanitized == "test_with_bad_chars_and_more_"

        # Test empty ID: falls back to a fixed placeholder
        empty_id = ""
        sanitized = parser.sanitize_entry_id(empty_id)
        assert sanitized == "entry"

        # Test very long ID: result is capped at 200 characters
        long_id = "a" * 300
        sanitized = parser.sanitize_entry_id(long_id)
        assert len(sanitized) == 200

    def test_sanitize_html(self) -> None:
        """Test HTML sanitization.

        Allowed markup must pass through unchanged, ``<script>`` tags must
        be removed, and event-handler attributes must be dropped while
        ``href`` is kept.
        """
        parser = FeedParser()

        # Test allowed tags
        safe_html = "<p>This is <strong>safe</strong> HTML</p>"
        sanitized = parser._sanitize_html(safe_html)
        assert sanitized == safe_html

        # Test dangerous tags
        dangerous_html = "<script>alert('xss')</script><p>Safe content</p>"
        sanitized = parser._sanitize_html(dangerous_html)
        assert "<script>" not in sanitized
        assert "<p>Safe content</p>" in sanitized

        # Test attributes
        html_with_attrs = '<a href="https://example.com" onclick="alert()">Link</a>'
        sanitized = parser._sanitize_html(html_with_attrs)
        assert 'href="https://example.com"' in sanitized
        assert "onclick" not in sanitized

    def test_extract_feed_metadata(self) -> None:
        """Test feed metadata extraction.

        Parses an inline Atom document with ``feedparser`` and checks that
        title, author details, link, logo and icon are all carried over
        into the extracted metadata.
        """
        parser = FeedParser()

        # Test with feedparser parsed data. The XML declaration has to be
        # the very first thing in the string, so the literal starts right
        # after the opening quotes and its lines are not indented.
        parsed = feedparser.parse("""<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
    <title>Test Feed</title>
    <link href="https://example.com/"/>
    <author>
        <name>Test Author</name>
        <email>author@example.com</email>
        <uri>https://example.com/about</uri>
    </author>
    <logo>https://example.com/logo.png</logo>
    <icon>https://example.com/icon.png</icon>
</feed>""")

        metadata = parser._extract_feed_metadata(parsed.feed)
        assert metadata.title == "Test Feed"
        assert metadata.author_name == "Test Author"
        assert metadata.author_email == "author@example.com"
        assert metadata.author_uri == HttpUrl("https://example.com/about")
        assert metadata.link == HttpUrl("https://example.com/")
        assert metadata.logo == HttpUrl("https://example.com/logo.png")
        assert metadata.icon == HttpUrl("https://example.com/icon.png")