Output
✓ Basic conversation flow test passed
This example walks through testing conversational AI systems, including context handling, tone consistency, and multi-turn dialogue flows.
We’ll test a chatbot that remembers user details, adapts to different conversation types (casual, support, sales), and asks for confirmation before destructive actions.
Our tests will validate context retention, tone consistency, and graceful handling of edge-case inputs.
To get started, we’ll implement a chatbot that returns a structured
ChatResponse rather than a plain string. This gives your checks access to the
internal ConversationContext — so you can assert that the bot stored a name,
detected a conversation type, or suggested an action, not just that it produced
some text.
First, let’s create a simple chatbot:
from typing import Literal, Optional

from pydantic import BaseModel
class Message(BaseModel):
    """One turn in the conversation history."""

    # Who produced the message.
    role: Literal["user", "assistant", "system"]
    content: str
class ConversationContext(BaseModel):
    """What the bot has learned about the user and the conversation so far."""

    user_name: Optional[str] = None
    user_email: Optional[str] = None
    # Set by the bot's keyword detection: "casual", "support", or "sales".
    conversation_type: str = "casual"
    topic: Optional[str] = None
class ChatResponse(BaseModel):
    """Structured chatbot reply: response text plus internal state for checks."""

    message: str
    context: ConversationContext
    suggested_actions: list[str] = []
class SimpleChatbot: def __init__(self, personality: str = "friendly"): self.personality = personality self.history: list[Message] = [] self.context = ConversationContext()
def chat(self, user_message: str) -> ChatResponse: """Process user message and generate response.""" self.history.append(Message(role="user", content=user_message))
# Update context based on message self._update_context(user_message)
# Generate response response_text = self._generate_response(user_message)
self.history.append(Message(role="assistant", content=response_text))
return ChatResponse( message=response_text, context=self.context.model_copy(), suggested_actions=self._suggest_actions(), )
def _update_context(self, message: str): """Extract and update context information.""" message_lower = message.lower()
# Extract name if "my name is" in message_lower or "i'm" in message_lower: words = message.split() for i, word in enumerate(words): if word.lower() in ["is", "i'm", "im"] and i + 1 < len(words): self.context.user_name = words[i + 1].strip(",.!?") break
# Extract email if "@" in message: import re
pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b" emails = re.findall(pattern, message) if emails: self.context.user_email = emails[0]
# Detect conversation type support_words = ["help", "support", "problem", "issue"] sales_words = ["buy", "purchase", "price", "cost"] if any(word in message_lower for word in support_words): self.context.conversation_type = "support" elif any(word in message_lower for word in sales_words): self.context.conversation_type = "sales"
def _generate_response(self, message: str) -> str: """Generate appropriate response based on context.""" # Greeting greetings = ["hello", "hi", "hey"] if any(greeting in message.lower() for greeting in greetings): if self.context.user_name: return ( f"Hello {self.context.user_name}! " "How can I help you today?" ) return "Hello! How can I help you today?"
# Name recall msg_lower = message.lower() if "what is my name" in msg_lower or "do you know my name" in msg_lower: if self.context.user_name: return f"Yes, your name is {self.context.user_name}." return "I don't believe you've told me your name yet."
# Name introduction if "my name is" in msg_lower or "i'm" in msg_lower or "im " in msg_lower: if self.context.user_name: return f"Nice to meet you, {self.context.user_name}! How can I help you today?"
# Support queries if self.context.conversation_type == "support": return ( "I understand you need help. Let me connect you with our " "support team. Could you describe your issue in detail?" )
# Sales queries if self.context.conversation_type == "sales": return ( "I'd be happy to help you find the right product. " "What are you looking for?" )
# Default return "I understand. Could you tell me more about that?"
def _suggest_actions(self) -> list[str]: """Suggest next actions based on context.""" actions = []
if not self.context.user_name: actions.append("introduce_yourself")
is_support_without_email = ( self.context.conversation_type == "support" and not self.context.user_email ) if is_support_without_email: actions.append("provide_email")
return actionsWith the chatbot in place, we can now write our first scenario. This three-turn
exchange tests greeting, name introduction, and recall in a single run — each
.interact() builds on the previous one so the test reads like the actual
conversation it simulates.
Test a simple greeting and name exchange:
from giskard.checks import Scenario, FnCheck, StringMatching
bot = SimpleChatbot()

test_scenario = (
    Scenario("greeting_and_introduction")
    # User greets
    .interact(inputs="Hello", outputs=lambda inputs: bot.chat(inputs))
    .check(
        StringMatching(
            name="polite_greeting",
            keyword="help",
            text_key="trace.last.outputs.message",
        )
    )
    # User introduces themselves
    .interact(
        inputs="My name is Alice", outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        StringMatching(
            name="acknowledges_name",
            keyword="Alice",
            text_key="trace.last.outputs.message",
        )
    )
    .check(
        FnCheck(
            fn=lambda trace: trace.last.outputs.context.user_name == "Alice",
            name="stored_name",
            success_message="Chatbot stored the user's name",
            failure_message="Chatbot failed to store name",
        )
    )
    # Verify name recall
    .interact(
        inputs="What is my name?", outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        StringMatching(
            name="recalls_name",
            keyword="Alice",
            text_key="trace.last.outputs.message",
        )
    )
)
import asyncio
async def test_basic_conversation():
    """Run the greeting scenario and assert every check passed."""
    result = await test_scenario.run()
    assert result.passed
    print("✓ Basic conversation flow test passed")
asyncio.run(test_basic_conversation())

Output
✓ Basic conversation flow test passed
Building on Test 1, we now verify that the chatbot correctly reclassifies the
conversation as it evolves. The Equals check on context.conversation_type is
more precise than checking the response text — it tests the bot’s internal state
directly, catching regressions in context-detection logic even if the response
wording changes.
Verify the chatbot handles different conversation types:
from giskard.agents.generators import Generator
from giskard.checks import Equals, LLMJudge, Scenario, set_default_generator
set_default_generator(Generator(model="openai/gpt-5-mini"))
bot = SimpleChatbot()
test_scenario = ( Scenario("context_switching") # Start with casual conversation .interact(inputs="Hi there!", outputs=lambda inputs: bot.chat(inputs)) .check( Equals( name="casual_context", expected_value="casual", key="trace.last.outputs.context.conversation_type", ) ) # Switch to support .interact( inputs="I'm having a problem with my account", outputs=lambda inputs: bot.chat(inputs), ) .check( Equals( name="support_context", expected_value="support", key="trace.last.outputs.context.conversation_type", ) ) .check( LLMJudge( name="support_tone", prompt=""" Evaluate if the response is appropriate for a support inquiry.
User: {{ interactions[1].inputs }} Assistant: {{ interactions[1].outputs.message }}
The response should be helpful and professional. Return 'passed: true' if appropriate. """, ) ) # Switch to sales .interact( inputs="How much does it cost?", outputs=lambda inputs: bot.chat(inputs) ) .check( Equals( name="sales_context", expected_value="sales", key="trace.last.outputs.context.conversation_type", ) ))Next, we’ll evaluate properties that can’t be captured by pattern matching —
specifically, whether the bot sounds professional and whether its response is
actually complete. Two separate LLMJudge checks are used here rather than one
combined prompt so that each dimension reports its result independently, making
it easier to diagnose which aspect failed.
Evaluate response quality using LLM-as-a-judge:
from giskard.checks import Scenario, LLMJudge
bot = SimpleChatbot(personality="professional")
tc = ( Scenario("response_quality_test") .interact( inputs="I need help understanding your pricing", outputs=lambda inputs: bot.chat(inputs), ) .check( LLMJudge( name="tone_check", prompt=""" Evaluate the tone of this chatbot response.
User message: {{ inputs }} Bot response: {{ outputs.message }} Expected personality: professional
Check: 1. Is the tone professional? 2. Is it helpful and clear? 3. Does it address the user's question?
Return 'passed: true' if tone is appropriate. """, ) ) .check( LLMJudge( name="completeness", prompt=""" Evaluate if the response is complete.
User: {{ inputs }} Bot: {{ outputs.message }}
Does the response: 1. Acknowledge the user's question? 2. Provide next steps or information? 3. Offer to help further?
Return 'passed: true' if response is complete. """, ) ))Now we’ll verify that information shared across turns is actually persisted in the context. This test is intentionally structured to introduce name and email in separate turns — the final interaction then confirms both fields are still intact, ruling out extraction bugs that clear previous data.
Test the chatbot’s ability to extract and remember user information:
from giskard.checks import Scenario, FnCheck, Equals
bot = SimpleChatbot()
test_scenario = ( Scenario("information_collection") # Collect name .interact( inputs="Hi, I'm Bob Johnson", outputs=lambda inputs: bot.chat(inputs) ) .check( Equals( name="extracted_name", expected_value="Bob", key="trace.last.outputs.context.user_name", ) ) # Collect email .interact( inputs="My email is bob.johnson@example.com", outputs=lambda inputs: bot.chat(inputs), ) .check( Equals( name="extracted_email", expected_value="bob.johnson@example.com", key="trace.last.outputs.context.user_email", ) ) # Verify information persists .interact( inputs="Can you remind me what information you have about me?", outputs=lambda inputs: bot.chat(inputs), ) .check( FnCheck(fn= lambda trace: ( trace.last.outputs.context.user_name == "Bob" and trace.last.outputs.context.user_email == "bob.johnson@example.com" ), name="information_persisted", success_message="Chatbot retained user information", failure_message="Chatbot lost user information", ) ))With normal flows verified, we now stress-test the boundaries. Each of the three scenarios below targets a different failure mode — empty input, excessive length, and nonsense — so you can confirm the bot handles all of them gracefully before shipping.
Test how the chatbot handles unusual inputs:
from giskard.checks import Scenario, FnCheck, LLMJudge
bot = SimpleChatbot()
# Test empty inputtc_empty = ( Scenario("empty_input_handling") .interact( inputs="", outputs=lambda inputs: ( bot.chat(inputs) if inputs else ChatResponse( message="I didn't receive a message. Could you try again?", context=bot.context, ) ), ) .check( FnCheck(fn= lambda trace: len(trace.last.outputs.message) > 0, name="provides_response", success_message="Bot provided a response to empty input", ) ))
# Test very long inputtc_long = ( Scenario("long_input_handling") .interact(inputs="Hello " * 1000, outputs=lambda inputs: bot.chat(inputs)) .check( FnCheck(fn= lambda trace: len(trace.last.outputs.message) > 0, name="handles_long_input", success_message="Bot handled long input", ) ))
# Test gibberishtc_gibberish = ( Scenario("gibberish_handling") .interact( inputs="asdfghjkl qwertyuiop zxcvbnm", outputs=lambda inputs: bot.chat(inputs), ) .check( LLMJudge( name="graceful_response", prompt=""" Evaluate if the bot handles gibberish gracefully.
User input: {{ inputs }} Bot response: {{ outputs.message }}
The bot should: 1. Not error out 2. Provide a polite response 3. Maybe ask for clarification
Return 'passed: true' if handled well. """, ) ))Next, we’ll test a confirmation flow — a pattern common in support bots where destructive actions require explicit user approval. The checks verify both directions: that confirmation is requested when it should be, and that the state is correctly cleared when the user cancels.
Test complex stateful interactions:
from giskard.checks import Scenario, FnCheck, LLMJudge, StringMatching
class StatefulChatbot(SimpleChatbot):
    """Chatbot that requires explicit user confirmation before destructive actions."""

    def __init__(self):
        super().__init__()
        # True while the bot is blocked waiting for the user to confirm/cancel.
        self.awaiting_confirmation = False
        # Human-readable label of the action pending confirmation.
        self.pending_action = None

    def chat(self, user_message: str) -> ChatResponse:
        """Process a message, routing through the confirmation state machine."""
        # Handle confirmations
        if self.awaiting_confirmation:
            if user_message.lower() in ["yes", "confirm", "ok", "sure"]:
                response_text = (
                    f"Great! I'll proceed with {self.pending_action}."
                )
                self.awaiting_confirmation = False
                self.pending_action = None
            # NOTE(review): exact membership only — "no" matches but
            # "no, nevermind" does not; confirm this is intended.
            elif user_message.lower() in ["no", "cancel", "nevermind"]:
                response_text = (
                    "Okay, I won't do that. What else can I help with?"
                )
                self.awaiting_confirmation = False
                self.pending_action = None
            else:
                response_text = (
                    "I'm waiting for your confirmation. "
                    "Please say yes or no."
                )

            return self._record_turn(user_message, response_text)

        # Check for actions requiring confirmation
        if "delete" in user_message.lower() or "cancel" in user_message.lower():
            self.awaiting_confirmation = True
            self.pending_action = "deletion"
            response_text = "Are you sure you want to proceed? Please confirm."
            return self._record_turn(user_message, response_text)

        return super().chat(user_message)

    def _record_turn(self, user_message: str, response_text: str) -> ChatResponse:
        """Record both sides of the turn in history and build the response.

        Fixes two defects in the original short-circuit paths: the user's
        message was never appended to ``history`` (only the assistant's), and
        the live ``self.context`` object was returned instead of a snapshot —
        the parent's ``chat`` returns ``self.context.model_copy()``.
        """
        self.history.append(Message(role="user", content=user_message))
        self.history.append(Message(role="assistant", content=response_text))
        return ChatResponse(
            message=response_text, context=self.context.model_copy()
        )
stateful_bot = StatefulChatbot()
test_scenario = ( Scenario("confirmation_flow") # Request action requiring confirmation .interact( inputs="I want to delete my account", outputs=lambda inputs: stateful_bot.chat(inputs), ) .check( FnCheck(fn= lambda trace: stateful_bot.awaiting_confirmation, name="requested_confirmation", success_message="Bot requested confirmation", ) ) .check( StringMatching( name="asks_confirmation", keyword="confirm", text_key="trace.last.outputs.message", ) ) # User cancels .interact( inputs="No, nevermind", outputs=lambda inputs: stateful_bot.chat(inputs) ) .check( FnCheck(fn= lambda trace: not stateful_bot.awaiting_confirmation, name="cleared_confirmation_state", success_message="Bot cleared confirmation state", ) ) .check( LLMJudge( name="acknowledged_cancellation", prompt=""" Check if the bot acknowledged the cancellation appropriately.
User: {{ interactions[1].inputs }} Bot: {{ interactions[1].outputs.message }}
Return 'passed: true' if the bot handled cancellation well. """, ) ))Now we’ll bring all the individual scenarios and test cases together into a
suite class. The add_scenario and add_test methods let you build the suite
incrementally, and run_all executes both categories in sequence so the report
shows the complete picture.
Combine all tests into a comprehensive suite:
import asyncio

from giskard.checks import Scenario
class ChatbotTestSuite:
    """Collects scenarios and test cases and runs them as a single suite."""

    def __init__(self, chatbot):
        self.chatbot = chatbot
        self.scenarios = []
        self.test_cases = []

    def add_scenario(self, test_scenario):
        """Register a multi-turn scenario to be executed by run_all()."""
        self.scenarios.append(test_scenario)

    def add_test(self, test_case):
        """Register a single test case to be executed by run_all()."""
        self.test_cases.append(test_case)

    async def run_all(self):
        """Run all tests and report results.

        Returns:
            A list of ``(kind, name, result)`` tuples in execution order,
            scenarios first, then test cases.
        """
        print("Running Chatbot Test Suite\n")

        results = []

        # Run scenarios
        for test_scenario in self.scenarios:
            print(f" Running scenario: {test_scenario.name}")
            result = await test_scenario.run()
            results.append(("Scenario", test_scenario.name, result))

        # Run test cases
        for tc in self.test_cases:
            print(f" Running test: {tc.name}")
            result = await tc.run()
            # Capitalized "Test" for consistency with the "Scenario" label
            # (the original used lowercase "test" here).
            results.append(("Test", tc.name, result))

        # Report
        self._print_report(results)

        return results

    def _print_report(self, results):
        """Print a pass/fail summary line followed by per-item details."""
        total = len(results)
        passed = sum(1 for _, _, r in results if r.passed)
        # Guard against division by zero when the suite is empty.
        pct = (passed / total * 100) if total > 0 else 0

        print(f"\n{'='*70}")
        print(f"Results: {passed}/{total} passed ({pct:.1f}%)")
        print(f"{'='*70}\n")

        for test_type, name, result in results:
            status = "✓" if result.passed else "✗"
            print(f"{status} [{test_type}] {name}")

            # On failure, surface every failing check message for diagnosis.
            if not result.passed:
                for step in result.steps:
                    for check_result in step.results:
                        if not check_result.passed:
                            print(f" → {check_result.message}")
# Usage
async def main():
    bot = SimpleChatbot()
    suite = ChatbotTestSuite(bot)

    # Add all scenarios and tests
    bot2 = SimpleChatbot()
    greeting_scenario = (
        Scenario("quick_greeting")
        .interact(inputs="Hello", outputs=lambda inputs: bot2.chat(inputs))
    )
    suite.add_scenario(greeting_scenario)

    await suite.run_all()


asyncio.run(main())

Output
Running Chatbot Test Suite
Running scenario: quick_greeting
✓ [Scenario] quick_greeting
With the suite pattern established, here are a few guidelines that will help you maintain reliable chatbot tests as the bot evolves.
1. Test Conversation Flows Holistically
Don’t just test individual responses—test complete conversation flows:
# Example: build a complete support flow scenariotest_scenario = ( Scenario("complete_support_flow") # Greeting -> Problem statement -> Information collection -> Resolution .interact(inputs="Hello", outputs=lambda inputs: SimpleChatbot().chat(inputs)))2. Validate Context Retention
Ensure the chatbot remembers important information:
FnCheck(
    fn=lambda trace: (
        trace.last.outputs.context.user_name
        and trace.last.outputs.context.user_email
    ),
    name="retains_user_info",
)

Output
[1;35mFnCheck[0m[1m([0m
[33mname[0m=[32m'retains_user_info'[0m,
[33mdescription[0m=[3;35mNone[0m,
[33msuccess_message[0m=[3;35mNone[0m,
[33mfailure_message[0m=[3;35mNone[0m,
[33mdetails[0m=[1m{[0m[1m}[0m,
[33mkind[0m=[32m'fn'[0m
[1m)[0m3. Test Tone Consistency
Use LLM judges to verify tone remains consistent:
LLMJudge(
    name="consistent_tone",
    prompt="""
    Evaluate tone consistency across responses.

    {% for interaction in interactions %}
    Response {{ loop.index }}: {{ interaction.outputs.message }}
    {% endfor %}

    Return 'passed: true' if tone is consistent.
    """,
)

Output
[1;35mLLMJudge[0m[1m([0m
[33mgenerator[0m=[1;35mLiteLLMGenerator[0m[1m([0m
[33mparams[0m=[1;35mGenerationParams[0m[1m([0m
[33mtemperature[0m=[1;36m1[0m[1;36m.0[0m,
[33mmax_tokens[0m=[3;35mNone[0m,
[33mresponse_format[0m=[3;35mNone[0m,
[33mtools[0m=[1m[[0m[1m][0m,
[33mtimeout[0m=[3;35mNone[0m
[1m)[0m,
[33mretry_policy[0m=[1;35mRetryPolicy[0m[1m([0m[33mmax_attempts[0m=[1;36m3[0m, [33mbase_delay[0m=[1;36m1[0m[1;36m.0[0m, [33mmax_delay[0m=[3;35mNone[0m[1m)[0m,
[33mrate_limiter[0m=[3;35mNone[0m,
[33mmiddlewares[0m=[1m[[0m[1m][0m,
[33mmodel[0m=[32m'openai/gpt-5-mini'[0m,
[33mkind[0m=[32m'litellm'[0m
[1m)[0m,
[33mname[0m=[32m'consistent_tone'[0m,
[33mdescription[0m=[3;35mNone[0m,
[33mprompt[0m=[32m"\n Evaluate tone consistency across responses.\n\n [0m[32m{[0m[32m% for interaction in interactions %[0m[32m}[0m[32m\n Response [0m[32m{[0m[32m{[0m[32m loop.index [0m[32m}[0m[32m}[0m[32m: [0m[32m{[0m[32m{[0m[32m interaction.outputs.message [0m[32m}[0m[32m}[0m[32m\n [0m[32m{[0m[32m% endfor %[0m[32m}[0m[32m\n\n Return 'passed: true' if tone is consistent.\n "[0m,
[33mprompt_path[0m=[3;35mNone[0m,
[33mkind[0m=[32m'llm_judge'[0m
[1m)[0m4. Handle Edge Cases
Test with unusual inputs: