Manage Atom feeds in a persistent git repository
"""Tests for feed parser functionality."""

import pytest
from pydantic import HttpUrl

from thicket.core.feed_parser import FeedParser
from thicket.models import AtomEntry, FeedMetadata


class TestFeedParser:
    """Exercise FeedParser construction, feed parsing, and sanitization helpers."""

    def test_init(self):
        """A parser built with no arguments exposes the expected defaults."""
        feed_parser = FeedParser()
        assert feed_parser.user_agent == "thicket/0.1.0"
        assert "a" in feed_parser.allowed_tags
        assert "href" in feed_parser.allowed_attributes["a"]

    def test_parse_atom_feed(self, sample_atom_feed):
        """The sample Atom fixture parses into the expected metadata and entry."""
        feed_meta, parsed_entries = FeedParser().parse_feed(sample_atom_feed)

        # Feed-level fields
        assert isinstance(feed_meta, FeedMetadata)
        assert feed_meta.title == "Test Feed"
        assert feed_meta.author_name == "Test Author"
        assert feed_meta.author_email == "author@example.com"
        assert feed_meta.link == HttpUrl("https://example.com/")

        # Entry-level fields: the fixture is expected to yield a single entry
        assert len(parsed_entries) == 1
        first = parsed_entries[0]
        assert isinstance(first, AtomEntry)
        assert first.title == "Test Entry"
        assert first.id == "https://example.com/entry/1"
        assert first.link == HttpUrl("https://example.com/entry/1")
        assert first.summary == "This is a test entry."
        assert "<p>This is the content of the test entry.</p>" in first.content

    def test_parse_rss_feed(self, sample_rss_feed):
        """The sample RSS fixture parses into the expected metadata and entry."""
        feed_meta, parsed_entries = FeedParser().parse_feed(sample_rss_feed)

        # Feed-level fields
        assert isinstance(feed_meta, FeedMetadata)
        assert feed_meta.title == "Test RSS Feed"
        assert feed_meta.link == HttpUrl("https://example.com/")
        assert feed_meta.author_email == "editor@example.com"

        # Entry-level fields: RSS items come back as AtomEntry objects too
        assert len(parsed_entries) == 1
        first = parsed_entries[0]
        assert isinstance(first, AtomEntry)
        assert first.title == "Test RSS Entry"
        assert first.id == "https://example.com/rss/entry/1"
        assert first.summary == "This is a test RSS entry."

    def test_sanitize_entry_id(self):
        """Entry IDs are reduced to the expected sanitized strings."""
        feed_parser = FeedParser()

        # URL-shaped ID
        from_url = feed_parser.sanitize_entry_id(
            "https://example.com/posts/2025/01/test-post"
        )
        assert from_url == "posts_2025_01_test-post"

        # ID containing problematic characters
        from_bad_chars = feed_parser.sanitize_entry_id(
            "test/with\\bad:chars|and<more>"
        )
        assert from_bad_chars == "test_with_bad_chars_and_more_"

        # Empty ID
        from_empty = feed_parser.sanitize_entry_id("")
        assert from_empty == "entry"

        # Very long ID (300 characters in, 200 expected out)
        from_long = feed_parser.sanitize_entry_id("a" * 300)
        assert len(from_long) == 200

    def test_sanitize_html(self):
        """HTML sanitization keeps safe markup and drops scripts and handlers."""
        feed_parser = FeedParser()

        # Allowed tags pass through unchanged
        harmless = "<p>This is <strong>safe</strong> HTML</p>"
        assert feed_parser._sanitize_html(harmless) == harmless

        # Script tags are removed while sibling content is kept
        cleaned = feed_parser._sanitize_html(
            "<script>alert('xss')</script><p>Safe content</p>"
        )
        assert "<script>" not in cleaned
        assert "<p>Safe content</p>" in cleaned

        # href is kept on the link, onclick is stripped
        cleaned = feed_parser._sanitize_html(
            '<a href="https://example.com" onclick="alert()">Link</a>'
        )
        assert 'href="https://example.com"' in cleaned
        assert 'onclick' not in cleaned

    def test_extract_feed_metadata(self):
        """Metadata extraction maps feedparser's feed fields onto FeedMetadata."""
        # Imported here, as in the original, so a missing feedparser only
        # fails this test rather than collection of the whole module.
        import feedparser

        # The XML declaration must stay at the very start of the string.
        atom_document = """<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title>Test Feed</title>
  <link href="https://example.com/"/>
  <author>
    <name>Test Author</name>
    <email>author@example.com</email>
    <uri>https://example.com/about</uri>
  </author>
  <logo>https://example.com/logo.png</logo>
  <icon>https://example.com/icon.png</icon>
</feed>"""
        parsed_document = feedparser.parse(atom_document)

        feed_meta = FeedParser()._extract_feed_metadata(parsed_document.feed)
        assert feed_meta.title == "Test Feed"
        assert feed_meta.author_name == "Test Author"
        assert feed_meta.author_email == "author@example.com"
        assert feed_meta.author_uri == HttpUrl("https://example.com/about")
        assert feed_meta.link == HttpUrl("https://example.com/")
        assert feed_meta.logo == HttpUrl("https://example.com/logo.png")
        assert feed_meta.icon == HttpUrl("https://example.com/icon.png")