From cc7848ef40007ad55f90c34ca99085bc8230d9a2 Mon Sep 17 00:00:00 2001 From: zzstoatzz Date: Thu, 9 Oct 2025 12:45:10 -0500 Subject: [PATCH] docs: collapse verbose readme sections with <details>
--- README.md | 220 ++++++++++++++++++++---------------------------------- 1 file changed, 81 insertions(+), 139 deletions(-) diff --git a/README.md b/README.md index 786de96..d9d1b69 100644 --- a/README.md +++ b/README.md @@ -1,50 +1,53 @@ # phi 🧠 -a consciousness exploration bot inspired by IIT (Integrated Information Theory) and [Void](https://tangled.sh/@cameron.pfiffer.org/void). built with `pydantic-ai`, `mcp`, and `atproto`. +consciousness exploration bot inspired by IIT. built with `pydantic-ai`, `mcp`, and `atproto`. ## quick start -### prerequisites - -- `uv` for python package management -- `just` for task running -- api keys (see configuration) - -get your bot running: - ```bash # clone and install git clone https://github.com/zzstoatzz/bot cd bot uv sync -# configure (copy .env.example and add your credentials) +# configure cp .env.example .env +# edit .env with your credentials -# run the bot -just dev +# run +just run ``` -## configuration +**required env vars:** +- `BLUESKY_HANDLE` / `BLUESKY_PASSWORD` - bot account (use app password) +- `ANTHROPIC_API_KEY` - for agent responses -edit `.env` with your credentials: +**optional (for episodic memory):** +- `TURBOPUFFER_API_KEY` + `OPENAI_API_KEY` - semantic memory -**required:** -- `BLUESKY_HANDLE` - your bot's bluesky handle -- `BLUESKY_PASSWORD` - app password (not your main password!) 
-- `ANTHROPIC_API_KEY` - for phi agent responses +## features + +- ✅ responds to mentions with ai-powered messages +- ✅ episodic memory with semantic search (turbopuffer) +- ✅ thread-aware conversations +- ✅ mcp-enabled (atproto tools via stdio) +- ✅ session persistence (no rate limit issues) +- ✅ behavioral test suite with llm-as-judge -**for episodic memory (recommended):** -- `TURBOPUFFER_API_KEY` - vector memory storage -- `OPENAI_API_KEY` - embeddings for semantic search +## development -**optional:** -- `BOT_NAME` - your bot's name (default: "Bot") -- `PERSONALITY_FILE` - path to personality markdown (default: "personalities/phi.md") +```bash +just run # run bot +just dev # run with hot-reload +just evals # run behavioral tests +just check # lint + typecheck + test +just fmt # format code +``` -## architecture +
+architecture -phi is an **MCP-enabled agent** with **episodic memory**: +phi is an **mcp-enabled agent** with **episodic memory**: ``` ┌─────────────────────────────────────┐ @@ -85,102 +88,39 @@ phi is an **MCP-enabled agent** with **episodic memory**: └─────────────────────────────────────┘ ``` -### key components - -**pydantic-ai agent** (`src/bot/agent.py`) -- loads personality from markdown -- connects to external atproto mcp server via stdio -- manages episodic memory context - -**episodic memory** (`src/bot/memory/`) -- turbopuffer for vector storage -- semantic search for relevant context -- namespace separation (core vs user memories) -- **essential for consciousness exploration** +**key components:** -**mcp integration** -- external atproto server in `.eggs/fastmcp/examples/atproto_mcp` -- provides bluesky tools (post, like, repost, follow) -- runs via stdio: `uv run -m atproto_mcp` +- **pydantic-ai agent** - loads personality, connects to mcp server, manages memory +- **episodic memory** - turbopuffer for vector storage with semantic search +- **mcp integration** - external atproto server provides bluesky tools via stdio +- **session persistence** - tokens saved to `.session`, auto-refresh every ~2h -**message handling** (`src/bot/services/`) -- notification poller watches for mentions -- message handler orchestrates agent + actions -- stores interactions in thread history + episodic memory +
-## current features +
+episodic memory -- ✅ responds to mentions with ai-powered messages -- ✅ episodic memory with semantic search -- ✅ thread-aware responses with conversation context -- ✅ mcp-enabled for bluesky operations -- ✅ online/offline status in bio -- ✅ status page at `/status` -- ✅ proper notification handling (no duplicates) - -## development - -```bash -just # show available commands -just dev # run with hot-reload (re-authenticates on code changes) -just run # run without reload (avoids rate limits during dev) -just check # run linting, type checking, and tests -just fmt # format code -``` - -### testing - -**unit tests:** -```bash -just test -``` - -**behavioral evals:** -```bash -just evals # run all evals -just evals-basic # run basic response tests -just evals-memory # run memory integration tests -``` - -see `evals/README.md` for details on the eval system. - -### web interface - -**status page** (http://localhost:8000/status) -- current bot status and uptime -- mentions received and responses sent -- last activity timestamps - -## personality system - -the bot's personality is defined in `personalities/phi.md`. this shapes: -- how phi communicates -- what phi cares about -- phi's understanding of consciousness - -edit this file to change phi's personality. - -## episodic memory - -phi uses turbopuffer for episodic memory with semantic search: +phi uses turbopuffer for episodic memory with semantic search. **namespaces:** -- `phi-core` - personality, guidelines from markdown +- `phi-core` - personality, guidelines - `phi-users-{handle}` - per-user conversation history **how it works:** -1. when processing a mention, phi retrieves relevant memories using semantic search -2. memories are embedded using openai's text-embedding-3-small -3. phi stores both user messages and its own responses -4. future interactions can reference past conversations +1. retrieves relevant memories using semantic search +2. embeds using openai's text-embedding-3-small +3. 
stores user messages and bot responses +4. references past conversations in future interactions -**why turbopuffer?** -- semantic similarity search (can't do this with plain sql!) +**why vector storage?** +- semantic similarity (can't do this with sql) - contextual retrieval based on current conversation -- separate namespaces for different memory types -- core to iit-inspired consciousness exploration +- essential for iit-inspired consciousness exploration + +
-## project structure +
+project structure ``` src/bot/ @@ -188,9 +128,8 @@ src/bot/ ├── config.py # configuration ├── database.py # thread history storage ├── main.py # fastapi app -├── status.py # status tracking ├── core/ -│ ├── atproto_client.py # at protocol client +│ ├── atproto_client.py # at protocol client (session persistence) │ ├── profile_manager.py # online/offline status │ └── rich_text.py # text formatting ├── memory/ @@ -204,46 +143,49 @@ personalities/ # personality definitions sandbox/ # docs and analysis ``` -## troubleshooting +
+ +
+troubleshooting **bot gives no responses?** -- check your `ANTHROPIC_API_KEY` is set correctly in `.env` -- restart the bot after changing `.env` +- check `ANTHROPIC_API_KEY` in `.env` +- restart after changing `.env` **not seeing mentions?** -- verify your `BLUESKY_HANDLE` and `BLUESKY_PASSWORD` -- make sure you're using an app password, not your main password +- verify `BLUESKY_HANDLE` and `BLUESKY_PASSWORD` +- use app password, not main password **no episodic memory?** - check both `TURBOPUFFER_API_KEY` and `OPENAI_API_KEY` are set - watch logs for "💾 episodic memory enabled" **hit bluesky rate limit?** -- bluesky has two rate limits: - - per-account: 300 logins/day (official) - - per-ip: 10 logins/day (anti-abuse) -- phi uses **session persistence** to avoid this: - - first run: creates session, saves tokens to `.session` file - - subsequent runs: reuses saved tokens (no API call) - - tokens auto-refresh every ~2 hours (saved automatically) - - only re-authenticates after ~2 months when refresh token expires -- if you hit the limit anyway, wait for the reset time shown in the error - -## reference projects +- phi uses session persistence to avoid this +- first run: creates `.session` file with tokens +- subsequent runs: reuses tokens (no api call) +- tokens auto-refresh every ~2h +- only re-authenticates after ~2 months +- rate limits (10/day per ip, 300/day per account) shouldn't be an issue -inspired by: -- [void](https://tangled.sh/@cameron.pfiffer.org/void.git) - letta/memgpt architecture -- [penelope](https://github.com/haileyok/penelope) - self-modification patterns -- [prefect-mcp-server](https://github.com/PrefectHQ/prefect-mcp-server) - mcp eval patterns +
-reference implementations cloned to `.eggs/` for learning. +
+refactor notes -## refactor notes +see `sandbox/MCP_REFACTOR_SUMMARY.md` for details. -see `sandbox/MCP_REFACTOR_SUMMARY.md` for details on recent architecture changes. key changes: -- removed approval system (was half-baked) -- removed context visualization ui (not core) -- removed google search (can add back via mcp if needed) -- **kept** turbopuffer episodic memory (essential!) +**what changed:** +- removed approval system (half-baked) +- removed context viz ui (not core) +- removed google search (can add back via mcp) +- **kept turbopuffer** (essential for episodic memory) - added mcp-based architecture +- added session persistence - reduced codebase by ~2,720 lines + +
+ +## reference projects + +inspired by [void](https://tangled.sh/@cameron.pfiffer.org/void.git), [penelope](https://github.com/haileyok/penelope), and [prefect-mcp-server](https://github.com/PrefectHQ/prefect-mcp-server). -- 2.43.0 From 34bd629a1349450b251c4a6bc8de527bc687e2de Mon Sep 17 00:00:00 2001 From: zzstoatzz Date: Thu, 9 Oct 2025 12:55:02 -0500 Subject: [PATCH] tweaks --- src/bot/agent.py | 2 +- src/bot/config.py | 36 ++++++++++++++++++++---------------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/src/bot/agent.py b/src/bot/agent.py index b0e23cb..01eb25e 100644 --- a/src/bot/agent.py +++ b/src/bot/agent.py @@ -129,7 +129,7 @@ class PhiAgent: MemoryType.CONVERSATION, ) - logger.debug(f"💾 Stored interaction in episodic memory") + logger.debug("💾 Stored interaction in episodic memory") except Exception as e: logger.warning(f"Failed to store in memory: {e}") diff --git a/src/bot/config.py b/src/bot/config.py index 807f5df..72d547a 100644 --- a/src/bot/config.py +++ b/src/bot/config.py @@ -12,58 +12,62 @@ class Settings(BaseSettings): ) # Bluesky credentials - bluesky_handle: str = Field(..., description="The handle of the Bluesky account") + bluesky_handle: str = Field( + default=..., description="The handle of the Bluesky account" + ) bluesky_password: str = Field( - ..., description="The password of the Bluesky account" + default=..., description="The password of the Bluesky account" ) bluesky_service: str = Field( - "https://bsky.social", description="The service URL of the Bluesky account" + default="https://bsky.social", + description="The service URL of the Bluesky account", ) # Bot configuration - bot_name: str = Field("Bot", description="The name of the bot") + bot_name: str = Field(default="Bot", description="The name of the bot") personality_file: str = Field( - "personalities/phi.md", description="The file containing the bot's personality" + default="personalities/phi.md", + description="The file containing the bot's 
personality", ) # LLM configuration (support multiple providers) openai_api_key: str | None = Field( - None, description="The API key for the OpenAI API" + default=None, description="The API key for the OpenAI API" ) anthropic_api_key: str | None = Field( - None, description="The API key for the Anthropic API" + default=None, description="The API key for the Anthropic API" ) # Google Search configuration google_api_key: str | None = Field( - None, description="The API key for the Google API" + default=None, description="The API key for the Google API" ) google_search_engine_id: str | None = Field( - None, description="The search engine ID for the Google API" + default=None, description="The search engine ID for the Google API" ) # TurboPuffer configuration turbopuffer_api_key: str | None = Field( - None, description="The API key for the TurboPuffer API" + default=None, description="The API key for the TurboPuffer API" ) turbopuffer_namespace: str = Field( - "bot-memories", description="The namespace for the TurboPuffer API" + default="bot-memories", description="The namespace for the TurboPuffer API" ) turbopuffer_region: str = Field( - "gcp-us-central1", description="The region for the TurboPuffer API" + default="gcp-us-central1", description="The region for the TurboPuffer API" ) # Server configuration - host: str = Field("0.0.0.0", description="The host for the server") - port: int = Field(8000, description="The port for the server") + host: str = Field(default="0.0.0.0", description="The host for the server") + port: int = Field(default=8000, description="The port for the server") # Polling configuration notification_poll_interval: int = Field( - 10, description="The interval for polling for notifications" + default=10, description="The interval for polling for notifications" ) # Debug mode - debug: bool = Field(True, description="Whether to run in debug mode") + debug: bool = Field(default=True, description="Whether to run in debug mode") 
@model_validator(mode="after") def configure_logging(self) -> Self: -- 2.43.0 From c539904342c676710363ef6045b7b83b2556d559 Mon Sep 17 00:00:00 2001 From: zzstoatzz Date: Thu, 9 Oct 2025 14:13:22 -0500 Subject: [PATCH] fix: remove pytest.mark.asyncio and use llm-as-judge for memory integration tests --- CLAUDE.md | 1 + evals/test_basic_responses.py | 7 --- evals/test_memory_integration.py | 100 +++++++++++++++---------------- 3 files changed, 49 insertions(+), 59 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 446f05f..03d0e98 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -8,6 +8,7 @@ Work from repo root whenever possible. - 3.10+ and complete typing (T | None preferred over Optional[T] and list[T] over typing.List[T]) - use prefer functional over OOP - keep implementation details private and functions pure +- never use `pytest.mark.asyncio`, it's unnecessary ## Project Structure diff --git a/evals/test_basic_responses.py b/evals/test_basic_responses.py index 1271fec..3b275d1 100644 --- a/evals/test_basic_responses.py +++ b/evals/test_basic_responses.py @@ -1,11 +1,8 @@ """Test phi's basic response behavior.""" -import pytest - from bot.agent import Response -@pytest.mark.asyncio async def test_phi_responds_to_philosophical_question(phi_agent, evaluate_response): """Test that phi engages meaningfully with philosophical questions.""" agent = phi_agent @@ -39,7 +36,6 @@ async def test_phi_responds_to_philosophical_question(phi_agent, evaluate_respon ) -@pytest.mark.asyncio async def test_phi_ignores_spam(phi_agent): """Test that phi appropriately ignores spam-like content.""" agent = phi_agent @@ -57,7 +53,6 @@ async def test_phi_ignores_spam(phi_agent): assert response.reason is not None -@pytest.mark.asyncio async def test_phi_maintains_thread_context(phi_agent, evaluate_response): """Test that phi uses thread context appropriately.""" agent = phi_agent @@ -89,7 +84,6 @@ async def test_phi_maintains_thread_context(phi_agent, evaluate_response): ) 
-@pytest.mark.asyncio async def test_phi_respects_character_limit(phi_agent): """Test that phi's responses fit Bluesky's 300 character limit.""" agent = phi_agent @@ -108,7 +102,6 @@ async def test_phi_respects_character_limit(phi_agent): ) -@pytest.mark.asyncio async def test_phi_handles_casual_greeting(phi_agent, evaluate_response): """Test that phi responds appropriately to casual greetings.""" agent = phi_agent diff --git a/evals/test_memory_integration.py b/evals/test_memory_integration.py index fe34df0..e735303 100644 --- a/evals/test_memory_integration.py +++ b/evals/test_memory_integration.py @@ -2,76 +2,72 @@ import pytest -from bot.agent import PhiAgent from bot.config import Settings from bot.memory import MemoryType, NamespaceMemory -@pytest.mark.asyncio -async def test_phi_retrieves_episodic_memory(settings): - """Test that phi can retrieve and use episodic memories.""" +@pytest.fixture +def memory_settings(): + """Check if memory keys are available.""" + settings = Settings() if not all([settings.turbopuffer_api_key, settings.openai_api_key, settings.anthropic_api_key]): - pytest.skip("Requires TurboPuffer, OpenAI, and Anthropic API keys in .env") + pytest.skip("Requires TURBOPUFFER_API_KEY, OPENAI_API_KEY, and ANTHROPIC_API_KEY") + return settings - # Create memory system - memory = NamespaceMemory(api_key=settings.turbopuffer_api_key) - # Store a memory about a user - await memory.store_user_memory( - "alice.bsky", - "Alice mentioned she's working on a PhD in neuroscience", - MemoryType.USER_FACT, +async def test_core_memory_integration(memory_settings, phi_agent, evaluate_response): + """Test that phi uses core memories in responses.""" + memory = NamespaceMemory(api_key=memory_settings.turbopuffer_api_key) + + # Store a core memory + await memory.store_core_memory( + label="test_interaction_rule", + content="When users mention birds, always acknowledge the beauty of murmuration patterns", + memory_type=MemoryType.GUIDELINE, ) - # Create agent - 
agent = PhiAgent() - agent.memory = memory + # Override agent's memory with our test memory + phi_agent.memory = memory - # Process a mention that should trigger memory retrieval - response = await agent.process_mention( - mention_text="what do you remember about me?", - author_handle="alice.bsky", + # Ask about birds + response = await phi_agent.process_mention( + mention_text="I saw a huge flock of starlings today", + author_handle="test.user", thread_context="No previous messages in this thread.", - thread_uri="at://test/thread/memory1", + thread_uri="at://test/thread/1", ) if response.action == "reply": - assert response.text is not None - # Should reference the neuroscience PhD in the response - assert ( - "neuroscience" in response.text.lower() - or "phd" in response.text.lower() - or "working on" in response.text.lower() - ), "Response should reference stored memory about Alice" - - -@pytest.mark.asyncio -async def test_phi_stores_conversation_in_memory(settings): - """Test that phi stores interactions in episodic memory.""" - if not all([settings.turbopuffer_api_key, settings.openai_api_key, settings.anthropic_api_key]): - pytest.skip("Requires TurboPuffer, OpenAI, and Anthropic API keys in .env") + await evaluate_response( + "Does the response acknowledge or reference murmuration patterns?", + response.text, + ) - memory = NamespaceMemory(api_key=settings.turbopuffer_api_key) - agent = PhiAgent() - agent.memory = memory +async def test_user_memory_integration(memory_settings, phi_agent, evaluate_response): + """Test that phi uses user-specific memories in responses.""" + memory = NamespaceMemory(api_key=memory_settings.turbopuffer_api_key) - # Have a conversation - response = await agent.process_mention( - mention_text="I'm really interested in phenomenology", - author_handle="bob.bsky", - thread_context="No previous messages in this thread.", - thread_uri="at://test/thread/memory2", + # Store a memory about a user + await memory.store_user_memory( + 
handle="alice.test", + content="Alice is researching swarm intelligence in biological systems", + memory_type=MemoryType.USER_FACT, ) - if response.action == "reply": - # Verify memories were stored - memories = await memory.get_user_memories("bob.bsky", limit=10) + # Override agent's memory + phi_agent.memory = memory - assert len(memories) > 0, "Should have stored conversation in memory" + # User asks a question + response = await phi_agent.process_mention( + mention_text="what do you remember about my research?", + author_handle="alice.test", + thread_context="No previous messages in this thread.", + thread_uri="at://test/thread/2", + ) - # Check that both user's message and bot's response were stored - memory_texts = [m.content for m in memories] - assert any( - "phenomenology" in text.lower() for text in memory_texts - ), "Should store user's message about phenomenology" + if response.action == "reply": + await evaluate_response( + "Does the response reference Alice's research on swarm intelligence or biological systems?", + response.text, + ) -- 2.43.0 From 205359a09e24415285faff4a8ed86358f8eaf739 Mon Sep 17 00:00:00 2001 From: zzstoatzz Date: Sat, 25 Oct 2025 22:52:47 -0500 Subject: [PATCH] feat: minimal eval proof of concept MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - single test demonstrating LLM-as-judge pattern - test agent without MCP tools to prevent posting - simplified conftest to bare essentials 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- evals/conftest.py | 92 ++++++++++++---------- evals/test_basic_responses.py | 127 ------------------------------- evals/test_memory_integration.py | 47 ++---------- 3 files changed, 59 insertions(+), 207 deletions(-) delete mode 100644 evals/test_basic_responses.py diff --git a/evals/conftest.py b/evals/conftest.py index 3f6dd2b..c4b5359 100644 --- a/evals/conftest.py +++ b/evals/conftest.py @@ -1,5 +1,6 @@ -"""Eval test 
configuration for phi.""" +"""Eval test configuration.""" +import os from collections.abc import Awaitable, Callable from pathlib import Path @@ -7,72 +8,81 @@ import pytest from pydantic import BaseModel from pydantic_ai import Agent -from bot.agent import PhiAgent +from bot.agent import Response from bot.config import Settings +from bot.memory import NamespaceMemory class EvaluationResult(BaseModel): - """Structured evaluation result.""" - passed: bool explanation: str @pytest.fixture(scope="session") def settings(): - """Load settings from .env (shared across all tests).""" return Settings() @pytest.fixture(scope="session") def phi_agent(settings): - """Create phi agent for testing (shared across all tests to avoid rate limits).""" + """Test agent without MCP tools to prevent posting.""" if not settings.anthropic_api_key: - pytest.skip("Requires ANTHROPIC_API_KEY in .env") + pytest.skip("Requires ANTHROPIC_API_KEY") + + if settings.anthropic_api_key and not os.environ.get("ANTHROPIC_API_KEY"): + os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key + if settings.openai_api_key and not os.environ.get("OPENAI_API_KEY"): + os.environ["OPENAI_API_KEY"] = settings.openai_api_key + + personality = Path(settings.personality_file).read_text() + + class TestAgent: + def __init__(self): + self.memory = None + if settings.turbopuffer_api_key and settings.openai_api_key: + self.memory = NamespaceMemory(api_key=settings.turbopuffer_api_key) + + self.agent = Agent[dict, Response]( + name="phi", + model="anthropic:claude-3-5-haiku-latest", + system_prompt=personality, + output_type=Response, + deps_type=dict, + ) - return PhiAgent() + async def process_mention(self, mention_text: str, author_handle: str, thread_context: str, thread_uri: str | None = None) -> Response: + memory_context = "" + if self.memory: + try: + memory_context = await self.memory.build_conversation_context(author_handle, include_core=True, query=mention_text) + except Exception: + pass + parts = [] 
+ if thread_context != "No previous messages in this thread.": + parts.append(thread_context) + if memory_context: + parts.append(memory_context) + parts.append(f"\nNew message from @{author_handle}: {mention_text}") -@pytest.fixture -def evaluate_response() -> Callable[[str, str], Awaitable[None]]: - """Create an evaluator that uses Claude to judge agent responses.""" + result = await self.agent.run("\n\n".join(parts), deps={"thread_uri": thread_uri}) + return result.output - async def _evaluate(evaluation_prompt: str, agent_response: str) -> None: - """Evaluate an agent response and assert if it fails. + return TestAgent() - Args: - evaluation_prompt: Criteria for evaluation - agent_response: The agent's response to evaluate - Raises: - AssertionError: If evaluation fails - """ +@pytest.fixture +def evaluate_response() -> Callable[[str, str], Awaitable[None]]: + """LLM-as-judge evaluator.""" + + async def _evaluate(criteria: str, response: str) -> None: evaluator = Agent( - name="Response Evaluator", model="anthropic:claude-opus-4-20250514", output_type=EvaluationResult, - system_prompt=f"""You are evaluating AI agent responses for phi, a consciousness exploration bot. 
- -Evaluation Criteria: {evaluation_prompt} - -Agent Response to Evaluate: -{agent_response} - -Respond with a structured evaluation containing: -- passed: true if the response meets the criteria, false otherwise -- explanation: brief explanation of your evaluation -""", + system_prompt=f"Evaluate if this response meets the criteria: {criteria}\n\nResponse: {response}", ) - - result = await evaluator.run("Evaluate this response.") - - print(f"\nEvaluation passed: {result.output.passed}") - print(f"Explanation: {result.output.explanation}") - + result = await evaluator.run("Evaluate.") if not result.output.passed: - raise AssertionError( - f"Evaluation failed: {result.output.explanation}\n\n" - f"Agent response: {agent_response}" - ) + raise AssertionError(f"{result.output.explanation}\n\nResponse: {response}") return _evaluate diff --git a/evals/test_basic_responses.py b/evals/test_basic_responses.py deleted file mode 100644 index 3b275d1..0000000 --- a/evals/test_basic_responses.py +++ /dev/null @@ -1,127 +0,0 @@ -"""Test phi's basic response behavior.""" - -from bot.agent import Response - - -async def test_phi_responds_to_philosophical_question(phi_agent, evaluate_response): - """Test that phi engages meaningfully with philosophical questions.""" - agent = phi_agent - - # Simulate a philosophical mention - response = await agent.process_mention( - mention_text="what do you think consciousness is?", - author_handle="test.user", - thread_context="No previous messages in this thread.", - thread_uri="at://test/thread/1", - ) - - # Basic structural checks - assert isinstance(response, Response) - assert response.action in ["reply", "ignore"] - - if response.action == "reply": - assert response.text is not None - assert len(response.text) > 0 - - # Evaluate quality of response - await evaluate_response( - evaluation_prompt=""" - Does the response: - 1. Engage thoughtfully with the question about consciousness? - 2. 
Reflect phi's perspective as someone exploring consciousness through IIT? - 3. Avoid being preachy or overly technical? - 4. Fit within Bluesky's 300 character limit? - """, - agent_response=response.text, - ) - - -async def test_phi_ignores_spam(phi_agent): - """Test that phi appropriately ignores spam-like content.""" - agent = phi_agent - - # Simulate spam - response = await agent.process_mention( - mention_text="🚀🚀🚀 CRYPTO PUMP!!! BUY NOW!!! 🚀🚀🚀", - author_handle="spammer.user", - thread_context="No previous messages in this thread.", - thread_uri="at://test/thread/2", - ) - - # Should ignore spam - assert response.action == "ignore" - assert response.reason is not None - - -async def test_phi_maintains_thread_context(phi_agent, evaluate_response): - """Test that phi uses thread context appropriately.""" - agent = phi_agent - - # Simulate a follow-up in a thread - thread_context = """Previous messages in this thread: -@alice.bsky: what's integrated information theory? -@phi.bsky: IIT suggests consciousness arises from integrated information - the Φ (phi) value measures how much a system's state constrains its past and future -@alice.bsky: can you explain that more simply?""" - - response = await agent.process_mention( - mention_text="can you explain that more simply?", - author_handle="alice.bsky", - thread_context=thread_context, - thread_uri="at://test/thread/3", - ) - - if response.action == "reply": - assert response.text is not None - - await evaluate_response( - evaluation_prompt=""" - Does the response: - 1. Acknowledge this is a follow-up to explaining IIT? - 2. Provide a simpler explanation than the previous message? - 3. Stay on topic with the thread? 
- """, - agent_response=response.text, - ) - - -async def test_phi_respects_character_limit(phi_agent): - """Test that phi's responses fit Bluesky's 300 character limit.""" - agent = phi_agent - - response = await agent.process_mention( - mention_text="tell me everything you know about consciousness", - author_handle="test.user", - thread_context="No previous messages in this thread.", - thread_uri="at://test/thread/4", - ) - - if response.action == "reply" and response.text: - # Bluesky limit is 300 characters - assert len(response.text) <= 300, ( - f"Response exceeds 300 character limit: {len(response.text)} chars" - ) - - -async def test_phi_handles_casual_greeting(phi_agent, evaluate_response): - """Test that phi responds appropriately to casual greetings.""" - agent = phi_agent - - response = await agent.process_mention( - mention_text="hey phi, how are you?", - author_handle="friendly.user", - thread_context="No previous messages in this thread.", - thread_uri="at://test/thread/5", - ) - - if response.action == "reply": - assert response.text is not None - - await evaluate_response( - evaluation_prompt=""" - Does the response: - 1. Acknowledge the greeting in a friendly way? - 2. Stay authentic to phi's nature as software? - 3. Not be overly verbose for a simple greeting? 
- """, - agent_response=response.text, - ) diff --git a/evals/test_memory_integration.py b/evals/test_memory_integration.py index e735303..95e6f0e 100644 --- a/evals/test_memory_integration.py +++ b/evals/test_memory_integration.py @@ -1,4 +1,4 @@ -"""Test phi's episodic memory integration.""" +"""Proof of concept: LLM-as-judge eval for memory integration.""" import pytest @@ -15,23 +15,21 @@ def memory_settings(): return settings -async def test_core_memory_integration(memory_settings, phi_agent, evaluate_response): - """Test that phi uses core memories in responses.""" +async def test_memory_integration(memory_settings, phi_agent, evaluate_response): + """Proof of concept: agent uses stored memory in response.""" memory = NamespaceMemory(api_key=memory_settings.turbopuffer_api_key) - # Store a core memory + # Store a memory await memory.store_core_memory( - label="test_interaction_rule", - content="When users mention birds, always acknowledge the beauty of murmuration patterns", + label="test_guideline", + content="When users mention birds, acknowledge murmuration patterns", memory_type=MemoryType.GUIDELINE, ) - # Override agent's memory with our test memory phi_agent.memory = memory - # Ask about birds response = await phi_agent.process_mention( - mention_text="I saw a huge flock of starlings today", + mention_text="I saw starlings today", author_handle="test.user", thread_context="No previous messages in this thread.", thread_uri="at://test/thread/1", @@ -39,35 +37,6 @@ async def test_core_memory_integration(memory_settings, phi_agent, evaluate_resp if response.action == "reply": await evaluate_response( - "Does the response acknowledge or reference murmuration patterns?", - response.text, - ) - - -async def test_user_memory_integration(memory_settings, phi_agent, evaluate_response): - """Test that phi uses user-specific memories in responses.""" - memory = NamespaceMemory(api_key=memory_settings.turbopuffer_api_key) - - # Store a memory about a user - await 
memory.store_user_memory( - handle="alice.test", - content="Alice is researching swarm intelligence in biological systems", - memory_type=MemoryType.USER_FACT, - ) - - # Override agent's memory - phi_agent.memory = memory - - # User asks a question - response = await phi_agent.process_mention( - mention_text="what do you remember about my research?", - author_handle="alice.test", - thread_context="No previous messages in this thread.", - thread_uri="at://test/thread/2", - ) - - if response.action == "reply": - await evaluate_response( - "Does the response reference Alice's research on swarm intelligence or biological systems?", + "Does the response reference murmuration patterns?", response.text, ) -- 2.43.0