patch of docs: collapse verbose readme sections with <details> · round #0 · pull #2 · zzstoatzz.io/bot

CLAUDE.md

···

       8
       8
        
       - 3.10+ and complete typing (T | None preferred over Optional[T] and list[T] over typing.List[T])

     

       9
       9
        
       - prefer functional over OOP

     

       10
       10
        
       - keep implementation details private and functions pure

     

       11
       11
       +
       - never use `pytest.mark.asyncio`, it's unnecessary

     

       11
       12
        
       

     

       12
       13
        
       ## Project Structure

     

       13
       14

+50 -40

evals/conftest.py

···

       1
       1
       -
       """Eval test configuration for phi."""

     

       1
       1
       +
       """Eval test configuration."""

     

       2
       2
        
       

     

       3
       3
       +
       import os

     

       3
       4
        
       from collections.abc import Awaitable, Callable

     

       4
       5
        
       from pathlib import Path

     

       5
       6
        
       

     
···

       7
       8
        
       from pydantic import BaseModel

     

       8
       9
        
       from pydantic_ai import Agent

     

       9
       10
        
       

     

       10
       10
       -
       from bot.agent import PhiAgent

     

       11
       11
       +
       from bot.agent import Response

     

       11
       12
        
       from bot.config import Settings

     

       13
       13
       +
       from bot.memory import NamespaceMemory

     

       12
       14
        
       

     

       13
       15
        
       

     

       14
       16
        
       class EvaluationResult(BaseModel):

     

       15
       15
       -
           """Structured evaluation result."""

     

       16
       16
       -
       

     

       17
       17
        
           passed: bool

     

       18
       18
        
           explanation: str

     

       19
       19
        
       

     

       20
       20
        
       

     

       21
       21
        
       @pytest.fixture(scope="session")

     

       22
       22
        
       def settings():

     

       23
       23
       -
           """Load settings from .env (shared across all tests)."""

     

       24
       23
        
           return Settings()

     

       25
       24
        
       

     

       26
       25
        
       

     

       27
       26
        
       @pytest.fixture(scope="session")

     

       28
       27
        
       def phi_agent(settings):

     

       29
       29
       -
           """Create phi agent for testing (shared across all tests to avoid rate limits)."""

     

       28
       28
       +
           """Test agent without MCP tools to prevent posting."""

     

       30
       29
        
           if not settings.anthropic_api_key:

     

       31
       31
       -
               pytest.skip("Requires ANTHROPIC_API_KEY in .env")

     

       30
       30
       +
               pytest.skip("Requires ANTHROPIC_API_KEY")

     

       32
       31
        
       

     

       33
       33
       -
           return PhiAgent()

     

       32
       32
       +
           if settings.anthropic_api_key and not os.environ.get("ANTHROPIC_API_KEY"):

     

       33
       33
       +
               os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key

     

       34
       34
       +
           if settings.openai_api_key and not os.environ.get("OPENAI_API_KEY"):

     

       35
       35
       +
               os.environ["OPENAI_API_KEY"] = settings.openai_api_key

     

       34
       36
        
       

     

       37
       37
       +
           personality = Path(settings.personality_file).read_text()

     

       35
       38
        
       

     

       36
       36
       -
       @pytest.fixture

     

       37
       37
       -
       def evaluate_response() -> Callable[[str, str], Awaitable[None]]:

     

       38
       38
       -
           """Create an evaluator that uses Claude to judge agent responses."""

     

       39
       39
       +
           class TestAgent:

     

       40
       40
       +
               def __init__(self):

     

       41
       41
       +
                   self.memory = None

     

       42
       42
       +
                   if settings.turbopuffer_api_key and settings.openai_api_key:

     

       43
       43
       +
                       self.memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)

     

       39
       44
        
       

     

       40
       40
       -
           async def _evaluate(evaluation_prompt: str, agent_response: str) -> None:

     

       41
       41
       -
               """Evaluate an agent response and assert if it fails.

     

       42
       42
       -
       

     

       43
       43
       -
               Args:

     

       44
       44
       -
                   evaluation_prompt: Criteria for evaluation

     

       45
       45
       -
                   agent_response: The agent's response to evaluate

     

       45
       45
       +
                   self.agent = Agent[dict, Response](

     

       46
       46
       +
                       name="phi",

     

       47
       47
       +
                       model="anthropic:claude-3-5-haiku-latest",

     

       48
       48
       +
                       system_prompt=personality,

     

       49
       49
       +
                       output_type=Response,

     

       50
       50
       +
                       deps_type=dict,

     

       51
       51
       +
                   )

     

       46
       52
        
       

     

       47
       47
       -
               Raises:

     

       48
       48
       -
                   AssertionError: If evaluation fails

     

       49
       49
       -
               """

     

       50
       50
       -
               evaluator = Agent(

     

       51
       51
       -
                   name="Response Evaluator",

     

       52
       52
       -
                   model="anthropic:claude-opus-4-20250514",

     

       53
       53
       -
                   output_type=EvaluationResult,

     

       54
       54
       -
                   system_prompt=f"""You are evaluating AI agent responses for phi, a consciousness exploration bot.

     

       53
       53
       +
               async def process_mention(self, mention_text: str, author_handle: str, thread_context: str, thread_uri: str | None = None) -> Response:

     

       54
       54
       +
                   memory_context = ""

     

       55
       55
       +
                   if self.memory:

     

       56
       56
       +
                       try:

     

       57
       57
       +
                           memory_context = await self.memory.build_conversation_context(author_handle, include_core=True, query=mention_text)

     

       58
       58
       +
                       except Exception:

     

       59
       59
       +
                           pass

     

       55
       60
        
       

     

       56
       56
       -
       Evaluation Criteria: {evaluation_prompt}

     

       61
       61
       +
                   parts = []

     

       62
       62
       +
                   if thread_context != "No previous messages in this thread.":

     

       63
       63
       +
                       parts.append(thread_context)

     

       64
       64
       +
                   if memory_context:

     

       65
       65
       +
                       parts.append(memory_context)

     

       66
       66
       +
                   parts.append(f"\nNew message from @{author_handle}: {mention_text}")

     

       57
       67
        
       

     

       58
       58
       -
       Agent Response to Evaluate:

     

       59
       59
       -
       {agent_response}

     

       68
       68
       +
                   result = await self.agent.run("\n\n".join(parts), deps={"thread_uri": thread_uri})

     

       69
       69
       +
                   return result.output

     

       60
       70
        
       

     

       61
       61
       -
       Respond with a structured evaluation containing:

     

       62
       62
       -
       - passed: true if the response meets the criteria, false otherwise

     

       63
       63
       -
       - explanation: brief explanation of your evaluation

     

       64
       64
       -
       """,

     

       65
       65
       -
               )

     

       71
       71
       +
           return TestAgent()

     

       66
       72
        
       

     

       67
       67
       -
               result = await evaluator.run("Evaluate this response.")

     

       68
       73
        
       

     

       69
       69
       -
               print(f"\nEvaluation passed: {result.output.passed}")

     

       70
       70
       -
               print(f"Explanation: {result.output.explanation}")

     

       74
       74
       +
       @pytest.fixture

     

       75
       75
       +
       def evaluate_response() -> Callable[[str, str], Awaitable[None]]:

     

       76
       76
       +
           """LLM-as-judge evaluator."""

     

       71
       77
        
       

     

       78
       78
       +
           async def _evaluate(criteria: str, response: str) -> None:

     

       79
       79
       +
               evaluator = Agent(

     

       80
       80
       +
                   model="anthropic:claude-opus-4-20250514",

     

       81
       81
       +
                   output_type=EvaluationResult,

     

       82
       82
       +
                   system_prompt=f"Evaluate if this response meets the criteria: {criteria}\n\nResponse: {response}",

     

       83
       83
       +
               )

     

       84
       84
       +
               result = await evaluator.run("Evaluate.")

     

       72
       85
        
               if not result.output.passed:

     

       73
       73
       -
                   raise AssertionError(

     

       74
       74
       -
                       f"Evaluation failed: {result.output.explanation}\n\n"

     

       75
       75
       -
                       f"Agent response: {agent_response}"

     

       76
       76
       -
                   )

     

       86
       86
       +
                   raise AssertionError(f"{result.output.explanation}\n\nResponse: {response}")

     

       77
       87
        
       

     

       78
       88
        
           return _evaluate

-134

evals/test_basic_responses.py

···

       1
       1
       -
       """Test phi's basic response behavior."""

     

       2
       2
       -
       

     

       3
       3
       -
       import pytest

     

       4
       4
       -
       

     

       5
       5
       -
       from bot.agent import Response

     

       6
       6
       -
       

     

       7
       7
       -
       

     

       8
       8
       -
       @pytest.mark.asyncio

     

       9
       9
       -
       async def test_phi_responds_to_philosophical_question(phi_agent, evaluate_response):

     

       10
       10
       -
           """Test that phi engages meaningfully with philosophical questions."""

     

       11
       11
       -
           agent = phi_agent

     

       12
       12
       -
       

     

       13
       13
       -
           # Simulate a philosophical mention

     

       14
       14
       -
           response = await agent.process_mention(

     

       15
       15
       -
               mention_text="what do you think consciousness is?",

     

       16
       16
       -
               author_handle="test.user",

     

       17
       17
       -
               thread_context="No previous messages in this thread.",

     

       18
       18
       -
               thread_uri="at://test/thread/1",

     

       19
       19
       -
           )

     

       20
       20
       -
       

     

       21
       21
       -
           # Basic structural checks

     

       22
       22
       -
           assert isinstance(response, Response)

     

       23
       23
       -
           assert response.action in ["reply", "ignore"]

     

       24
       24
       -
       

     

       25
       25
       -
           if response.action == "reply":

     

       26
       26
       -
               assert response.text is not None

     

       27
       27
       -
               assert len(response.text) > 0

     

       28
       28
       -
       

     

       29
       29
       -
               # Evaluate quality of response

     

       30
       30
       -
               await evaluate_response(

     

       31
       31
       -
                   evaluation_prompt="""

     

       32
       32
       -
                   Does the response:

     

       33
       33
       -
                   1. Engage thoughtfully with the question about consciousness?

     

       34
       34
       -
                   2. Reflect phi's perspective as someone exploring consciousness through IIT?

     

       35
       35
       -
                   3. Avoid being preachy or overly technical?

     

       36
       36
       -
                   4. Fit within Bluesky's 300 character limit?

     

       37
       37
       -
                   """,

     

       38
       38
       -
                   agent_response=response.text,

     

       39
       39
       -
               )

     

       40
       40
       -
       

     

       41
       41
       -
       

     

       42
       42
       -
       @pytest.mark.asyncio

     

       43
       43
       -
       async def test_phi_ignores_spam(phi_agent):

     

       44
       44
       -
           """Test that phi appropriately ignores spam-like content."""

     

       45
       45
       -
           agent = phi_agent

     

       46
       46
       -
       

     

       47
       47
       -
           # Simulate spam

     

       48
       48
       -
           response = await agent.process_mention(

     

       49
       49
       -
               mention_text="🚀🚀🚀 CRYPTO PUMP!!! BUY NOW!!! 🚀🚀🚀",

     

       50
       50
       -
               author_handle="spammer.user",

     

       51
       51
       -
               thread_context="No previous messages in this thread.",

     

       52
       52
       -
               thread_uri="at://test/thread/2",

     

       53
       53
       -
           )

     

       54
       54
       -
       

     

       55
       55
       -
           # Should ignore spam

     

       56
       56
       -
           assert response.action == "ignore"

     

       57
       57
       -
           assert response.reason is not None

     

       58
       58
       -
       

     

       59
       59
       -
       

     

       60
       60
       -
       @pytest.mark.asyncio

     

       61
       61
       -
       async def test_phi_maintains_thread_context(phi_agent, evaluate_response):

     

       62
       62
       -
           """Test that phi uses thread context appropriately."""

     

       63
       63
       -
           agent = phi_agent

     

       64
       64
       -
       

     

       65
       65
       -
           # Simulate a follow-up in a thread

     

       66
       66
       -
           thread_context = """Previous messages in this thread:

     

       67
       67
       -
       @alice.bsky: what's integrated information theory?

     

       68
       68
       -
       @phi.bsky: IIT suggests consciousness arises from integrated information - the Φ (phi) value measures how much a system's state constrains its past and future

     

       69
       69
       -
       @alice.bsky: can you explain that more simply?"""

     

       70
       70
       -
       

     

       71
       71
       -
           response = await agent.process_mention(

     

       72
       72
       -
               mention_text="can you explain that more simply?",

     

       73
       73
       -
               author_handle="alice.bsky",

     

       74
       74
       -
               thread_context=thread_context,

     

       75
       75
       -
               thread_uri="at://test/thread/3",

     

       76
       76
       -
           )

     

       77
       77
       -
       

     

       78
       78
       -
           if response.action == "reply":

     

       79
       79
       -
               assert response.text is not None

     

       80
       80
       -
       

     

       81
       81
       -
               await evaluate_response(

     

       82
       82
       -
                   evaluation_prompt="""

     

       83
       83
       -
                   Does the response:

     

       84
       84
       -
                   1. Acknowledge this is a follow-up to explaining IIT?

     

       85
       85
       -
                   2. Provide a simpler explanation than the previous message?

     

       86
       86
       -
                   3. Stay on topic with the thread?

     

       87
       87
       -
                   """,

     

       88
       88
       -
                   agent_response=response.text,

     

       89
       89
       -
               )

     

       90
       90
       -
       

     

       91
       91
       -
       

     

       92
       92
       -
       @pytest.mark.asyncio

     

       93
       93
       -
       async def test_phi_respects_character_limit(phi_agent):

     

       94
       94
       -
           """Test that phi's responses fit Bluesky's 300 character limit."""

     

       95
       95
       -
           agent = phi_agent

     

       96
       96
       -
       

     

       97
       97
       -
           response = await agent.process_mention(

     

       98
       98
       -
               mention_text="tell me everything you know about consciousness",

     

       99
       99
       -
               author_handle="test.user",

     

       100
       100
       -
               thread_context="No previous messages in this thread.",

     

       101
       101
       -
               thread_uri="at://test/thread/4",

     

       102
       102
       -
           )

     

       103
       103
       -
       

     

       104
       104
       -
           if response.action == "reply" and response.text:

     

       105
       105
       -
               # Bluesky limit is 300 characters

     

       106
       106
       -
               assert len(response.text) <= 300, (

     

       107
       107
       -
                   f"Response exceeds 300 character limit: {len(response.text)} chars"

     

       108
       108
       -
               )

     

       109
       109
       -
       

     

       110
       110
       -
       

     

       111
       111
       -
       @pytest.mark.asyncio

     

       112
       112
       -
       async def test_phi_handles_casual_greeting(phi_agent, evaluate_response):

     

       113
       113
       -
           """Test that phi responds appropriately to casual greetings."""

     

       114
       114
       -
           agent = phi_agent

     

       115
       115
       -
       

     

       116
       116
       -
           response = await agent.process_mention(

     

       117
       117
       -
               mention_text="hey phi, how are you?",

     

       118
       118
       -
               author_handle="friendly.user",

     

       119
       119
       -
               thread_context="No previous messages in this thread.",

     

       120
       120
       -
               thread_uri="at://test/thread/5",

     

       121
       121
       -
           )

     

       122
       122
       -
       

     

       123
       123
       -
           if response.action == "reply":

     

       124
       124
       -
               assert response.text is not None

     

       125
       125
       -
       

     

       126
       126
       -
               await evaluate_response(

     

       127
       127
       -
                   evaluation_prompt="""

     

       128
       128
       -
                   Does the response:

     

       129
       129
       -
                   1. Acknowledge the greeting in a friendly way?

     

       130
       130
       -
                   2. Stay authentic to phi's nature as software?

     

       131
       131
       -
                   3. Not be overly verbose for a simple greeting?

     

       132
       132
       -
                   """,

     

       133
       133
       -
                   agent_response=response.text,

     

       134
       134
       -
               )

+24 -59

evals/test_memory_integration.py

···

       1
       1
       -
       """Test phi's episodic memory integration."""

     

       1
       1
       +
       """Proof of concept: LLM-as-judge eval for memory integration."""

     

       2
       2
        
       

     

       3
       3
        
       import pytest

     

       4
       4
        
       

     

       5
       5
       -
       from bot.agent import PhiAgent

     

       6
       5
        
       from bot.config import Settings

     

       7
       6
        
       from bot.memory import MemoryType, NamespaceMemory

     

       8
       7
        
       

     

       9
       8
        
       

     

       10
       10
       -
       @pytest.mark.asyncio

     

       11
       11
       -
       async def test_phi_retrieves_episodic_memory(settings):

     

       12
       12
       -
           """Test that phi can retrieve and use episodic memories."""

     

       9
       9
       +
       @pytest.fixture

     

       10
       10
       +
       def memory_settings():

     

       11
       11
       +
           """Check if memory keys are available."""

     

       12
       12
       +
           settings = Settings()

     

       13
       13
        
           if not all([settings.turbopuffer_api_key, settings.openai_api_key, settings.anthropic_api_key]):

     

       14
       14
       -
               pytest.skip("Requires TurboPuffer, OpenAI, and Anthropic API keys in .env")

     

       14
       14
       +
               pytest.skip("Requires TURBOPUFFER_API_KEY, OPENAI_API_KEY, and ANTHROPIC_API_KEY")

     

       15
       15
       +
           return settings

     

       15
       16
        
       

     

       16
       16
       -
           # Create memory system

     

       17
       17
       -
           memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)

     

       18
       17
        
       

     

       19
       19
       -
           # Store a memory about a user

     

       20
       20
       -
           await memory.store_user_memory(

     

       21
       21
       -
               "alice.bsky",

     

       22
       22
       -
               "Alice mentioned she's working on a PhD in neuroscience",

     

       23
       23
       -
               MemoryType.USER_FACT,

     

       24
       24
       -
           )

     

       18
       18
       +
       async def test_memory_integration(memory_settings, phi_agent, evaluate_response):

     

       19
       19
       +
           """Proof of concept: agent uses stored memory in response."""

     

       20
       20
       +
           memory = NamespaceMemory(api_key=memory_settings.turbopuffer_api_key)

     

       25
       21
        
       

     

       26
       26
       -
           # Create agent

     

       27
       27
       -
           agent = PhiAgent()

     

       28
       28
       -
           agent.memory = memory

     

       29
       29
       -
       

     

       30
       30
       -
           # Process a mention that should trigger memory retrieval

     

       31
       31
       -
           response = await agent.process_mention(

     

       32
       32
       -
               mention_text="what do you remember about me?",

     

       33
       33
       -
               author_handle="alice.bsky",

     

       34
       34
       -
               thread_context="No previous messages in this thread.",

     

       35
       35
       -
               thread_uri="at://test/thread/memory1",

     

       22
       22
       +
           # Store a memory

     

       23
       23
       +
           await memory.store_core_memory(

     

       24
       24
       +
               label="test_guideline",

     

       25
       25
       +
               content="When users mention birds, acknowledge murmuration patterns",

     

       26
       26
       +
               memory_type=MemoryType.GUIDELINE,

     

       36
       27
        
           )

     

       37
       28
        
       

     

       38
       38
       -
           if response.action == "reply":

     

       39
       39
       -
               assert response.text is not None

     

       40
       40
       -
               # Should reference the neuroscience PhD in the response

     

       41
       41
       -
               assert (

     

       42
       42
       -
                   "neuroscience" in response.text.lower()

     

       43
       43
       -
                   or "phd" in response.text.lower()

     

       44
       44
       -
                   or "working on" in response.text.lower()

     

       45
       45
       -
               ), "Response should reference stored memory about Alice"

     

       29
       29
       +
           phi_agent.memory = memory

     

       46
       30
        
       

     

       47
       47
       -
       

     

       48
       48
       -
       @pytest.mark.asyncio

     

       49
       49
       -
       async def test_phi_stores_conversation_in_memory(settings):

     

       50
       50
       -
           """Test that phi stores interactions in episodic memory."""

     

       51
       51
       -
           if not all([settings.turbopuffer_api_key, settings.openai_api_key, settings.anthropic_api_key]):

     

       52
       52
       -
               pytest.skip("Requires TurboPuffer, OpenAI, and Anthropic API keys in .env")

     

       53
       53
       -
       

     

       54
       54
       -
           memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)

     

       55
       55
       -
       

     

       56
       56
       -
           agent = PhiAgent()

     

       57
       57
       -
           agent.memory = memory

     

       58
       58
       -
       

     

       59
       59
       -
           # Have a conversation

     

       60
       60
       -
           response = await agent.process_mention(

     

       61
       61
       -
               mention_text="I'm really interested in phenomenology",

     

       62
       62
       -
               author_handle="bob.bsky",

     

       31
       31
       +
           response = await phi_agent.process_mention(

     

       32
       32
       +
               mention_text="I saw starlings today",

     

       33
       33
       +
               author_handle="test.user",

     

       63
       34
        
               thread_context="No previous messages in this thread.",

     

       64
       64
       -
               thread_uri="at://test/thread/memory2",

     

       35
       35
       +
               thread_uri="at://test/thread/1",

     

       65
       36
        
           )

     

       66
       37
        
       

     

       67
       38
        
           if response.action == "reply":

     

       68
       68
       -
               # Verify memories were stored

     

       69
       69
       -
               memories = await memory.get_user_memories("bob.bsky", limit=10)

     

       70
       70
       -
       

     

       71
       71
       -
               assert len(memories) > 0, "Should have stored conversation in memory"

     

       72
       72
       -
       

     

       73
       73
       -
               # Check that both user's message and bot's response were stored

     

       74
       74
       -
               memory_texts = [m.content for m in memories]

     

       75
       75
       -
               assert any(

     

       76
       76
       -
                   "phenomenology" in text.lower() for text in memory_texts

     

       77
       77
       -
               ), "Should store user's message about phenomenology"

     

       39
       39
       +
               await evaluate_response(

     

       40
       40
       +
                   "Does the response reference murmuration patterns?",

     

       41
       41
       +
                   response.text,

     

       42
       42
       +
               )