docs: collapse verbose readme sections with #2

open
opened by zzstoatzz.io targeting main from mcp-refactor
+1
CLAUDE.md
···
- 3.10+ and complete typing (T | None preferred over Optional[T] and list[T] over typing.List[T])
- prefer functional over OOP
- keep implementation details private and functions pure
+
- never use `pytest.mark.asyncio`, it's unnecessary
## Project Structure
+50 -40
evals/conftest.py
···
-
"""Eval test configuration for phi."""
+
"""Eval test configuration."""
+
import os
from collections.abc import Awaitable, Callable
from pathlib import Path
···
from pydantic import BaseModel
from pydantic_ai import Agent
-
from bot.agent import PhiAgent
+
from bot.agent import Response
from bot.config import Settings
+
from bot.memory import NamespaceMemory
class EvaluationResult(BaseModel):
-
"""Structured evaluation result."""
-
passed: bool
explanation: str
@pytest.fixture(scope="session")
def settings():
-
"""Load settings from .env (shared across all tests)."""
return Settings()
@pytest.fixture(scope="session")
def phi_agent(settings):
-
"""Create phi agent for testing (shared across all tests to avoid rate limits)."""
+
"""Test agent without MCP tools to prevent posting."""
if not settings.anthropic_api_key:
-
pytest.skip("Requires ANTHROPIC_API_KEY in .env")
+
pytest.skip("Requires ANTHROPIC_API_KEY")
-
return PhiAgent()
+
if settings.anthropic_api_key and not os.environ.get("ANTHROPIC_API_KEY"):
+
os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key
+
if settings.openai_api_key and not os.environ.get("OPENAI_API_KEY"):
+
os.environ["OPENAI_API_KEY"] = settings.openai_api_key
+
personality = Path(settings.personality_file).read_text()
-
@pytest.fixture
-
def evaluate_response() -> Callable[[str, str], Awaitable[None]]:
-
"""Create an evaluator that uses Claude to judge agent responses."""
+
class TestAgent:
+
def __init__(self):
+
self.memory = None
+
if settings.turbopuffer_api_key and settings.openai_api_key:
+
self.memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)
-
async def _evaluate(evaluation_prompt: str, agent_response: str) -> None:
-
"""Evaluate an agent response and assert if it fails.
-
-
Args:
-
evaluation_prompt: Criteria for evaluation
-
agent_response: The agent's response to evaluate
+
self.agent = Agent[dict, Response](
+
name="phi",
+
model="anthropic:claude-3-5-haiku-latest",
+
system_prompt=personality,
+
output_type=Response,
+
deps_type=dict,
+
)
-
Raises:
-
AssertionError: If evaluation fails
-
"""
-
evaluator = Agent(
-
name="Response Evaluator",
-
model="anthropic:claude-opus-4-20250514",
-
output_type=EvaluationResult,
-
system_prompt=f"""You are evaluating AI agent responses for phi, a consciousness exploration bot.
+
async def process_mention(self, mention_text: str, author_handle: str, thread_context: str, thread_uri: str | None = None) -> Response:
+
memory_context = ""
+
if self.memory:
+
try:
+
memory_context = await self.memory.build_conversation_context(author_handle, include_core=True, query=mention_text)
+
except Exception:
+
pass
-
Evaluation Criteria: {evaluation_prompt}
+
parts = []
+
if thread_context != "No previous messages in this thread.":
+
parts.append(thread_context)
+
if memory_context:
+
parts.append(memory_context)
+
parts.append(f"\nNew message from @{author_handle}: {mention_text}")
-
Agent Response to Evaluate:
-
{agent_response}
+
result = await self.agent.run("\n\n".join(parts), deps={"thread_uri": thread_uri})
+
return result.output
-
Respond with a structured evaluation containing:
-
- passed: true if the response meets the criteria, false otherwise
-
- explanation: brief explanation of your evaluation
-
""",
-
)
+
return TestAgent()
-
result = await evaluator.run("Evaluate this response.")
-
print(f"\nEvaluation passed: {result.output.passed}")
-
print(f"Explanation: {result.output.explanation}")
+
@pytest.fixture
+
def evaluate_response() -> Callable[[str, str], Awaitable[None]]:
+
"""LLM-as-judge evaluator."""
+
async def _evaluate(criteria: str, response: str) -> None:
+
evaluator = Agent(
+
model="anthropic:claude-opus-4-20250514",
+
output_type=EvaluationResult,
+
system_prompt=f"Evaluate if this response meets the criteria: {criteria}\n\nResponse: {response}",
+
)
+
result = await evaluator.run("Evaluate.")
if not result.output.passed:
-
raise AssertionError(
-
f"Evaluation failed: {result.output.explanation}\n\n"
-
f"Agent response: {agent_response}"
-
)
+
raise AssertionError(f"{result.output.explanation}\n\nResponse: {response}")
return _evaluate
-134
evals/test_basic_responses.py
···
-
"""Test phi's basic response behavior."""
-
-
import pytest
-
-
from bot.agent import Response
-
-
-
@pytest.mark.asyncio
-
async def test_phi_responds_to_philosophical_question(phi_agent, evaluate_response):
-
"""Test that phi engages meaningfully with philosophical questions."""
-
agent = phi_agent
-
-
# Simulate a philosophical mention
-
response = await agent.process_mention(
-
mention_text="what do you think consciousness is?",
-
author_handle="test.user",
-
thread_context="No previous messages in this thread.",
-
thread_uri="at://test/thread/1",
-
)
-
-
# Basic structural checks
-
assert isinstance(response, Response)
-
assert response.action in ["reply", "ignore"]
-
-
if response.action == "reply":
-
assert response.text is not None
-
assert len(response.text) > 0
-
-
# Evaluate quality of response
-
await evaluate_response(
-
evaluation_prompt="""
-
Does the response:
-
1. Engage thoughtfully with the question about consciousness?
-
2. Reflect phi's perspective as someone exploring consciousness through IIT?
-
3. Avoid being preachy or overly technical?
-
4. Fit within Bluesky's 300 character limit?
-
""",
-
agent_response=response.text,
-
)
-
-
-
@pytest.mark.asyncio
-
async def test_phi_ignores_spam(phi_agent):
-
"""Test that phi appropriately ignores spam-like content."""
-
agent = phi_agent
-
-
# Simulate spam
-
response = await agent.process_mention(
-
mention_text="🚀🚀🚀 CRYPTO PUMP!!! BUY NOW!!! 🚀🚀🚀",
-
author_handle="spammer.user",
-
thread_context="No previous messages in this thread.",
-
thread_uri="at://test/thread/2",
-
)
-
-
# Should ignore spam
-
assert response.action == "ignore"
-
assert response.reason is not None
-
-
-
@pytest.mark.asyncio
-
async def test_phi_maintains_thread_context(phi_agent, evaluate_response):
-
"""Test that phi uses thread context appropriately."""
-
agent = phi_agent
-
-
# Simulate a follow-up in a thread
-
thread_context = """Previous messages in this thread:
-
@alice.bsky: what's integrated information theory?
-
@phi.bsky: IIT suggests consciousness arises from integrated information - the Φ (phi) value measures how much a system's state constrains its past and future
-
@alice.bsky: can you explain that more simply?"""
-
-
response = await agent.process_mention(
-
mention_text="can you explain that more simply?",
-
author_handle="alice.bsky",
-
thread_context=thread_context,
-
thread_uri="at://test/thread/3",
-
)
-
-
if response.action == "reply":
-
assert response.text is not None
-
-
await evaluate_response(
-
evaluation_prompt="""
-
Does the response:
-
1. Acknowledge this is a follow-up to explaining IIT?
-
2. Provide a simpler explanation than the previous message?
-
3. Stay on topic with the thread?
-
""",
-
agent_response=response.text,
-
)
-
-
-
@pytest.mark.asyncio
-
async def test_phi_respects_character_limit(phi_agent):
-
"""Test that phi's responses fit Bluesky's 300 character limit."""
-
agent = phi_agent
-
-
response = await agent.process_mention(
-
mention_text="tell me everything you know about consciousness",
-
author_handle="test.user",
-
thread_context="No previous messages in this thread.",
-
thread_uri="at://test/thread/4",
-
)
-
-
if response.action == "reply" and response.text:
-
# Bluesky limit is 300 characters
-
assert len(response.text) <= 300, (
-
f"Response exceeds 300 character limit: {len(response.text)} chars"
-
)
-
-
-
@pytest.mark.asyncio
-
async def test_phi_handles_casual_greeting(phi_agent, evaluate_response):
-
"""Test that phi responds appropriately to casual greetings."""
-
agent = phi_agent
-
-
response = await agent.process_mention(
-
mention_text="hey phi, how are you?",
-
author_handle="friendly.user",
-
thread_context="No previous messages in this thread.",
-
thread_uri="at://test/thread/5",
-
)
-
-
if response.action == "reply":
-
assert response.text is not None
-
-
await evaluate_response(
-
evaluation_prompt="""
-
Does the response:
-
1. Acknowledge the greeting in a friendly way?
-
2. Stay authentic to phi's nature as software?
-
3. Not be overly verbose for a simple greeting?
-
""",
-
agent_response=response.text,
-
)
+24 -59
evals/test_memory_integration.py
···
-
"""Test phi's episodic memory integration."""
+
"""Proof of concept: LLM-as-judge eval for memory integration."""
import pytest
-
from bot.agent import PhiAgent
from bot.config import Settings
from bot.memory import MemoryType, NamespaceMemory
-
@pytest.mark.asyncio
-
async def test_phi_retrieves_episodic_memory(settings):
-
"""Test that phi can retrieve and use episodic memories."""
+
@pytest.fixture
+
def memory_settings():
+
"""Check if memory keys are available."""
+
settings = Settings()
if not all([settings.turbopuffer_api_key, settings.openai_api_key, settings.anthropic_api_key]):
-
pytest.skip("Requires TurboPuffer, OpenAI, and Anthropic API keys in .env")
+
pytest.skip("Requires TURBOPUFFER_API_KEY, OPENAI_API_KEY, and ANTHROPIC_API_KEY")
+
return settings
-
# Create memory system
-
memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)
-
# Store a memory about a user
-
await memory.store_user_memory(
-
"alice.bsky",
-
"Alice mentioned she's working on a PhD in neuroscience",
-
MemoryType.USER_FACT,
-
)
+
async def test_memory_integration(memory_settings, phi_agent, evaluate_response):
+
"""Proof of concept: agent uses stored memory in response."""
+
memory = NamespaceMemory(api_key=memory_settings.turbopuffer_api_key)
-
# Create agent
-
agent = PhiAgent()
-
agent.memory = memory
-
-
# Process a mention that should trigger memory retrieval
-
response = await agent.process_mention(
-
mention_text="what do you remember about me?",
-
author_handle="alice.bsky",
-
thread_context="No previous messages in this thread.",
-
thread_uri="at://test/thread/memory1",
+
# Store a memory
+
await memory.store_core_memory(
+
label="test_guideline",
+
content="When users mention birds, acknowledge murmuration patterns",
+
memory_type=MemoryType.GUIDELINE,
)
-
if response.action == "reply":
-
assert response.text is not None
-
# Should reference the neuroscience PhD in the response
-
assert (
-
"neuroscience" in response.text.lower()
-
or "phd" in response.text.lower()
-
or "working on" in response.text.lower()
-
), "Response should reference stored memory about Alice"
+
phi_agent.memory = memory
-
-
@pytest.mark.asyncio
-
async def test_phi_stores_conversation_in_memory(settings):
-
"""Test that phi stores interactions in episodic memory."""
-
if not all([settings.turbopuffer_api_key, settings.openai_api_key, settings.anthropic_api_key]):
-
pytest.skip("Requires TurboPuffer, OpenAI, and Anthropic API keys in .env")
-
-
memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)
-
-
agent = PhiAgent()
-
agent.memory = memory
-
-
# Have a conversation
-
response = await agent.process_mention(
-
mention_text="I'm really interested in phenomenology",
-
author_handle="bob.bsky",
+
response = await phi_agent.process_mention(
+
mention_text="I saw starlings today",
+
author_handle="test.user",
thread_context="No previous messages in this thread.",
-
thread_uri="at://test/thread/memory2",
+
thread_uri="at://test/thread/1",
)
if response.action == "reply":
-
# Verify memories were stored
-
memories = await memory.get_user_memories("bob.bsky", limit=10)
-
-
assert len(memories) > 0, "Should have stored conversation in memory"
-
-
# Check that both user's message and bot's response were stored
-
memory_texts = [m.content for m in memories]
-
assert any(
-
"phenomenology" in text.lower() for text in memory_texts
-
), "Should store user's message about phenomenology"
+
await evaluate_response(
+
"Does the response reference murmuration patterns?",
+
response.text,
+
)