A community-based topic aggregation platform built on atproto
at main 7.6 kB view raw
"""
Tests for Kagi HTML description parser.
"""
# NOTE(review): Path, html, Perspective, Quote and Source are not referenced
# by the tests below; they are retained in case other code relies on them.
import html
from datetime import datetime
from pathlib import Path

import pytest

from src.html_parser import KagiHTMLParser
from src.models import KagiStory, Perspective, Quote, Source


# Sample story description in the form found in an RSS item's description
# field. The adjacent literals concatenate into one single-line string; no
# separator is inserted between the pieces.
_SAMPLE_HTML = (
    "<p>The White House confirmed President Trump will hold a bilateral meeting with Chinese President Xi Jinping in South Korea on October 30, at the end of an Asia trip that includes Malaysia and Japan . "
    "The administration said the meeting will take place Thursday morning local time, and Mr Trump indicated his first question to Xi would concern fentanyl and other bilateral issues . "
    "The talks come amid heightened trade tensions after Beijing expanded export curbs on rare-earth minerals and following Mr Trump's recent threat of additional tariffs on Chinese goods, making the meeting a focal point for discussions on trade, technology supply chains and energy .</p>"
    "<img src='https://kagiproxy.com/img/Q2SRXQtwTYBIiQeI0FG-X6taF_wHSJaXDiFUzju2kbCWGuOYIFUX--8L0BqE4VKxpbOJY3ylFPJkDpfSnyQYZ1qdOLXbphHTnsOK4jb7gqC4KCn5nf3ANbWCuaFD5ZUSijiK0k7wOLP2fyX6tynu2mPtXlCbotLo2lTrEswZl4-No2AI4mI4lkResfnRdp-YjpoEfCOHkNfbN1-0cNcHt9T2dmgBSXrQ2w' alt='News image associated with coverage of President Trump&#x27;s Asia trip and planned meeting with President Xi' /><br />"
    "<h3>Highlights:</h3><ul>"
    "<li>Itinerary details: The Asia swing begins in Malaysia, continues to Japan and ends with the bilateral meeting in South Korea on Thursday morning local time, White House press secretary Karoline Leavitt said at a briefing .</li>"
    "<li>APEC context: US officials indicated the leaders will meet on the sidelines of the Asia-Pacific Economic Cooperation gathering, shaping expectations for short, high-level talks rather than a lengthy summit .</li>"
    "</ul>"
    "<blockquote>Work out a lot of our doubts and questions - President Trump</blockquote>"
    "<h3>Perspectives:</h3><ul>"
    "<li>President Trump: He said his first question to President Xi would be about fentanyl and indicated he hoped to resolve bilateral doubts and questions in the talks. (<a href='https://www.straitstimes.com/world/united-states/trump-to-meet-xi-in-south-korea-on-oct-30-as-part-of-asia-swing'>The Straits Times</a>)</li>"
    "<li>White House (press secretary): Karoline Leavitt confirmed the bilateral meeting will occur Thursday morning local time during a White House briefing. (<a href='https://www.scmp.com/news/us/diplomacy/article/3330131/donald-trump-meet-chinas-xi-jinping-next-thursday-south-korea-crunch-talks'>South China Morning Post</a>)</li>"
    "</ul>"
    "<h3>Sources:</h3><ul>"
    "<li><a href='https://www.straitstimes.com/world/united-states/trump-to-meet-xi-in-south-korea-on-oct-30-as-part-of-asia-swing'>Trump to meet Xi in South Korea on Oct 30 as part of Asia swing</a> - straitstimes.com</li>"
    "<li><a href='https://www.scmp.com/news/us/diplomacy/article/3330131/donald-trump-meet-chinas-xi-jinping-next-thursday-south-korea-crunch-talks'>Trump to meet Xi in South Korea next Thursday as part of key Asia trip</a> - scmp.com</li>"
    "</ul>"
)


@pytest.fixture
def sample_html_description() -> str:
    """Provide the sample story description markup shared by the tests."""
    return _SAMPLE_HTML


def _parse(html_description: str):
    """Parse ``html_description`` with a newly created ``KagiHTMLParser``."""
    return KagiHTMLParser().parse(html_description)


class TestKagiHTMLParser:
    """Test suite for the Kagi HTML parser, one test per extracted section."""

    def test_parse_summary(self, sample_html_description):
        """The summary paragraph is extracted."""
        summary = _parse(sample_html_description)["summary"]

        assert summary.startswith("The White House confirmed President Trump")
        assert "bilateral meeting with Chinese President Xi Jinping" in summary

    def test_parse_image_url(self, sample_html_description):
        """The image URL and its alt text are extracted."""
        parsed = _parse(sample_html_description)

        image_url = parsed["image_url"]
        assert image_url is not None
        assert image_url.startswith("https://kagiproxy.com/img/")

        image_alt = parsed["image_alt"]
        assert image_alt is not None
        assert "Trump" in image_alt

    def test_parse_highlights(self, sample_html_description):
        """Both highlight entries are extracted, in document order."""
        highlights = _parse(sample_html_description)["highlights"]

        assert len(highlights) == 2
        assert "Itinerary details" in highlights[0]
        assert "APEC context" in highlights[1]

    def test_parse_quote(self, sample_html_description):
        """The blockquote yields separate quote text and attribution."""
        quote = _parse(sample_html_description)["quote"]

        assert quote is not None
        assert quote["text"] == "Work out a lot of our doubts and questions"
        assert quote["attribution"] == "President Trump"

    def test_parse_perspectives(self, sample_html_description):
        """Both perspectives are extracted; the first is checked in detail."""
        perspectives = _parse(sample_html_description)["perspectives"]

        assert len(perspectives) == 2

        # First perspective: actor, description and linked source URL.
        first = perspectives[0]
        assert first["actor"] == "President Trump"
        assert "fentanyl" in first["description"]
        assert first["source_url"] == "https://www.straitstimes.com/world/united-states/trump-to-meet-xi-in-south-korea-on-oct-30-as-part-of-asia-swing"

        # Second perspective: only the actor is checked.
        second = perspectives[1]
        assert "White House" in second["actor"]

    def test_parse_sources(self, sample_html_description):
        """The sources list is extracted; the first entry is checked in detail."""
        sources = _parse(sample_html_description)["sources"]

        assert len(sources) >= 2

        # Title, URL prefix and domain of the first listed source.
        lead = sources[0]
        assert lead["title"] == "Trump to meet Xi in South Korea on Oct 30 as part of Asia swing"
        assert lead["url"].startswith("https://www.straitstimes.com")
        assert lead["domain"] == "straitstimes.com"

    def test_parse_missing_sections(self):
        """Markup holding only a summary gives empty lists and None elsewhere."""
        parsed = _parse("<p>Just a summary, no other sections.</p>")

        assert parsed["summary"] == "Just a summary, no other sections."
        for list_key in ("highlights", "perspectives", "sources"):
            assert parsed[list_key] == []
        for optional_key in ("quote", "image_url"):
            assert parsed[optional_key] is None

    def test_parse_to_kagi_story(self, sample_html_description):
        """Parsed HTML plus RSS item fields are converted to a KagiStory."""
        headline = "Trump to meet Xi in South Korea on Oct 30"
        item_url = "https://kite.kagi.com/test/world/10"

        # Simulate the full set of RSS item data; link and guid share one URL.
        built = KagiHTMLParser().parse_to_story(
            title=headline,
            link=item_url,
            guid=item_url,
            pub_date=datetime(2025, 10, 23, 20, 56, 0),
            categories=["World", "World/Diplomacy"],
            html_description=sample_html_description,
        )

        assert isinstance(built, KagiStory)
        assert built.title == headline
        assert built.link == item_url
        assert len(built.highlights) == 2
        assert len(built.perspectives) == 2
        assert len(built.sources) >= 2
        assert built.quote is not None
        assert built.image_url is not None