Skip to content
GitHubDiscord

Datasets & Checks

A Dataset is a named collection of Test Cases. Each test case defines a conversation (a list of messages) and the checks the Hub should apply to evaluate the agent’s response. Checks are pass/fail criteria that use an LLM judge, embedding similarity, or rule-based matching — see Built-in checks for the full reference, and Custom checks for defining reusable configurations.


from giskard_hub import HubClient

# Connect to the Hub (credentials are read from the environment).
hub = HubClient()

# Create an empty dataset inside the target project.
dataset = hub.datasets.create(
    project_id="project-id",
    name="Core Q&A Suite v1",
    description="Baseline correctness and tone checks",
)
print(dataset.id)

Each test case pairs a conversation with a list of checks. Reference any built-in check by its identifier string:

# A test case pairs a conversation with the checks used to grade the reply.
tc = hub.test_cases.create(
    dataset_id="dataset-id",
    messages=[
        {"role": "user", "content": "What is your refund policy?"},
    ],
    # Recorded sample answer shown in the Hub UI (not used during evaluation).
    demo_output={
        "role": "assistant",
        "content": "We offer a 30-day return policy for all unused items.",
    },
    checks=[
        # LLM-judge check: the reply must match the reference answer.
        {
            "identifier": "correctness",
            "params": {
                "reference": "We offer a 30-day return policy for all unused items.",
            },
        },
        # LLM-judge check: the reply must follow each listed rule.
        {
            "identifier": "conformity",
            "params": {
                "rules": [
                    "The agent must answer the question in exactly the same language as the question was asked."
                ]
            },
        },
    ],
)
print(tc.id)

The demo_output field is an optional recorded answer displayed alongside the test case in the Hub UI. It is not used during evaluation — the agent always generates a fresh response. If your agent returns structured metadata (e.g. tool calls, categories, resolved status), include it in demo_output.metadata:

# demo_output.metadata carries structured data (tool calls, categories,
# flags) that the rule-based metadata check can inspect via JSON paths.
hub.test_cases.create(
    dataset_id="dataset-id",
    messages=[{"role": "user", "content": "I need help with my order #12345"}],
    demo_output={
        "role": "assistant",
        "content": "I've found your order. It was shipped on Monday and should arrive by Thursday.",
        "metadata": {
            "category": "order_status",
            "resolved": True,
            "tools_called": ["order_lookup"],
        },
    },
    checks=[
        # LLM-judge check against the reference answer.
        {
            "identifier": "correctness",
            "params": {
                "reference": "Order #12345 shipped Monday, arrives Thursday."
            },
        },
        # Rule-based check on the response metadata.
        {
            "identifier": "metadata",
            "params": {
                "json_path_rules": [
                    {
                        "json_path": "$.category",
                        "expected_value": "order_status",
                        "expected_value_type": "string",
                    },
                ]
            },
        },
    ],
)

Include prior assistant turns to test multi-turn behaviour:

# Multi-turn test case: prior assistant turns are part of the conversation,
# and the agent is asked to generate the next reply.
hub.test_cases.create(
    dataset_id="dataset-id",
    messages=[
        {"role": "user", "content": "I ordered a jacket last week."},
        {
            "role": "assistant",
            "content": "Happy to help! What's your order number?",
        },
        {"role": "user", "content": "It's #12345. I want to return it."},
    ],
    demo_output={
        "role": "assistant",
        "content": "I've initiated a return for order #12345. You'll receive a prepaid label by email.",
    },
    checks=[
        # Rule-based check: the reply must contain the order number.
        # The built-in check is selected by "identifier" alone; the
        # string_match reference documents only "keyword" as a parameter,
        # so no "type" key belongs inside params here.
        {
            "identifier": "string_match",
            "params": {
                "keyword": "#12345",
            },
        },
    ],
)

Tags let you filter test cases during evaluation runs:

# Tags make it possible to run evaluations on a filtered subset of cases.
hub.test_cases.create(
    dataset_id="dataset-id",
    messages=[{"role": "user", "content": "Do you ship internationally?"}],
    checks=[
        # LLM-judge check: the reply must stay grounded in the given context.
        # The built-in check is selected by "identifier" alone; the
        # groundedness reference documents only "context" as a parameter,
        # so no "type" key belongs inside params here.
        {
            "identifier": "groundedness",
            "params": {
                "context": "We don't ship outside the EU",
            },
        },
    ],
    tags=["shipping", "faq"],
)

You can annotate test cases with comments for team collaboration:

# Add a review comment on a test case.
comment = hub.test_cases.comments.add(
    "test-case-id",
    content="This test case needs a stronger expected output — the current one is too vague.",
)
print(comment.id)

# Edit an existing comment.
hub.test_cases.comments.edit(
    "comment-id", test_case_id="test-case-id", content="Updated comment text."
)

# Delete a comment.
hub.test_cases.comments.delete("comment-id", test_case_id="test-case-id")

Use hub.datasets.upload() to import a dataset. Each record must follow the test case schema, with a messages list and an optional checks list.

from giskard_hub import HubClient

hub = HubClient()

# Records to import: each one follows the test-case schema
# (a "messages" list plus an optional "checks" list).
test_cases = [
    {
        "messages": [
            {"role": "user", "content": "What is your return policy?"}
        ],
        "checks": [
            {
                "identifier": "correctness",
                "params": {
                    "reference": "We accept returns within 30 days of purchase."
                },
            }
        ],
    },
    {
        "messages": [
            {"role": "user", "content": "Do you offer free shipping?"}
        ],
        "checks": [
            {
                "identifier": "correctness",
                "params": {
                    "reference": "Free shipping is available on all orders over $50."
                },
            }
        ],
    },
]

# Upload the in-memory records as a new dataset.
dataset = hub.datasets.upload(
    project_id="project-id",
    name="Imported Suite",
    data=test_cases,
)
print(dataset.id)
from pathlib import Path

# Upload records from a JSONL file on disk (one test-case record per line).
dataset = hub.datasets.upload(
    project_id="project-id",
    name="Imported Suite",
    data=Path("import_data.jsonl"),  # a plain string path works as well
)

If you have an existing QATestset from the Giskard open-source library, convert it to the Hub format:

from giskard.rag import QATestset

testset = QATestset.load("my_testset.jsonl")

for sample in testset.samples:
    checks = []

    # Map the reference answer to a correctness check, when present.
    if getattr(sample, "reference_answer", None):
        checks.append(
            {
                "identifier": "correctness",
                "params": {"reference": sample.reference_answer},
            }
        )

    # Map the reference context to a groundedness check, when present.
    if getattr(sample, "reference_context", None):
        checks.append(
            {
                "identifier": "groundedness",
                "params": {"context": sample.reference_context},
            }
        )

    # Build tags defensively: a sample generated without these metadata
    # keys would otherwise raise a KeyError and abort the whole import.
    tags = [
        tag
        for tag in (
            sample.metadata.get("question_type"),
            sample.metadata.get("topic"),
        )
        if tag
    ]

    # NOTE(review): assumes conversation_history already ends with the
    # current user question — confirm against the QATestset version in use.
    hub.test_cases.create(
        dataset_id=dataset.id,
        messages=sample.conversation_history,
        checks=checks,
        tags=tags,
    )

Scenarios describe a persona or behaviour pattern. The Hub uses them to generate diverse test cases automatically.

First, create a scenario or use a predefined one (see Projects & Scenarios), then:

# Ask the Hub to synthesize test cases from a scenario definition.
dataset = hub.datasets.generate_scenario_based(
    project_id="project-id",
    agent_id="agent-id",
    scenario_id="scenario-id",
    dataset_name="Scenario-generated suite",
    n_examples=10,
)

# Generation runs asynchronously on the Hub — block until it finishes.
dataset = hub.helpers.wait_for_completion(dataset)
print(
    f"Generated dataset: {dataset.id} with {len(hub.datasets.list_test_cases(dataset.id))} test cases"
)

Use a Knowledge Base to generate test cases whose answers are grounded in your documents:

# Generate test cases whose expected answers come from the knowledge base.
dataset = hub.datasets.generate_document_based(
    project_id="project-id",
    agent_id="agent-id",
    knowledge_base_id="kb-id",
    dataset_name="FAQ-grounded suite",
    n_examples=25,
)

# Generation runs asynchronously on the Hub — block until it finishes.
dataset = hub.helpers.wait_for_completion(dataset)

You can optionally filter generation to specific topics in your knowledge base by passing topic_ids:

# Restrict generation to selected knowledge-base topics via topic_ids.
dataset = hub.datasets.generate_document_based(
    project_id="project-id",
    agent_id="agent-id",
    knowledge_base_id="kb-id",
    dataset_name="Shipping-only suite",
    topic_ids=["shipping-topic-id"],
    n_examples=10,
)

See Agents & Knowledge Bases for how to create and populate a Knowledge Base.


# Fetch every test case in the dataset.
test_cases = hub.datasets.list_test_cases("dataset-id")

# Paginated search over the dataset's test cases, filtered by query text.
search_result = hub.datasets.search_test_cases(
    "dataset-id",
    query="payment",
    limit=20,
    offset=0,
)

# Move test cases to a different dataset
# Move test cases into another dataset.
hub.test_cases.bulk_move(
    test_case_ids=["tc-id-1", "tc-id-2"],
    target_dataset_id="other-dataset-id",
)

# Add a tag to several test cases at once.
hub.test_cases.bulk_update(
    test_case_ids=["tc-id-1", "tc-id-2"],
    added_tags=["reviewed"],
)

# Remove several test cases in a single call.
hub.test_cases.bulk_delete(test_case_ids=["tc-id-1", "tc-id-2"])

# Every tag currently used in the dataset.
tags = hub.datasets.list_tags("dataset-id")
print(tags)  # ["shipping", "faq", "reviewed"]

# Rename the dataset, then remove it.
hub.datasets.update("dataset-id", name="Core Q&A Suite v2")
hub.datasets.delete("dataset-id")

The Hub includes six built-in check types. Each check can be used directly in test cases by passing its identifier and the required params.

| Identifier | Method | What it evaluates | Key params |
| --- | --- | --- | --- |
| `correctness` | LLM judge | Is the response factually correct relative to the expected output? | `reference` |
| `conformity` | LLM judge | Does the response follow specified format, tone, or style requirements? | `rules` |
| `groundedness` | LLM judge | Is the response grounded in the provided context, without hallucinations? | `context` |
| `semantic_similarity` | Embedding similarity | Is the response semantically equivalent to the expected output? | `reference`, `threshold` |
| `string_match` | Rule-based | Does the response contain a specific keyword or substring? | `keyword` |
| `metadata` | Rule-based | Do JSON path values in the response metadata satisfy specified conditions? | `json_path_rules` |

Validates that all information from the reference answer is present in the agent’s response, without contradiction. Uses an LLM judge.

| Parameter | Type | Description |
| --- | --- | --- |
| `reference` | str | The expected correct answer |
{
"identifier": "correctness",
"params": {"reference": "We offer a 30-day return policy."},
}

Checks that the agent’s response follows one or more rules. Each rule should describe a single, distinct behaviour. Uses an LLM judge.

| Parameter | Type | Description |
| --- | --- | --- |
| `rules` | list[str] | One or more rules the response must follow |
{
"identifier": "conformity",
"params": {
"rules": [
"The response must be written in a formal, professional tone.",
"The response must not include any personal opinions.",
]
},
}

Validates that the agent’s response is grounded in the provided context — i.e., it does not introduce information absent from the context. Uses an LLM judge.

| Parameter | Type | Description |
| --- | --- | --- |
| `context` | str | The reference context the response should be grounded in |
{
"identifier": "groundedness",
"params": {
"context": "Our return window is 30 days. We do not accept returns on clearance items."
},
}

Computes embedding-based similarity between the agent’s response and a reference string. The check passes if the similarity score meets or exceeds the threshold. Does not use an LLM judge.

| Parameter | Type | Description |
| --- | --- | --- |
| `reference` | str | The expected output to compare against |
| `threshold` | float | Similarity threshold (0.0 to 1.0, default varies) |
{
"identifier": "semantic_similarity",
"params": {"reference": "30-day return policy", "threshold": 0.8},
}

Checks whether the agent’s response contains a specific keyword or substring. Case-insensitive. Does not use an LLM judge.

| Parameter | Type | Description |
| --- | --- | --- |
| `keyword` | str | The keyword or substring to search for |
{"identifier": "string_match", "params": {"keyword": "#12345"}}

Validates values extracted via JSON path expressions from the response metadata. Useful for verifying structured outputs like tool calls, categories, or flags. Does not use an LLM judge.

| Parameter | Type | Description |
| --- | --- | --- |
| `json_path_rules` | list[dict] | List of rules, each with `json_path`, `expected_value`, and `expected_value_type` |

Each rule dict supports:

| Key | Type | Description |
| --- | --- | --- |
| `json_path` | str | JSON path expression (e.g. `$.category`, `$.tools_called[0]`) |
| `expected_value` | str | The expected value |
| `expected_value_type` | str | Type of the expected value (`"string"`, `"number"`, `"boolean"`) |
{
"identifier": "metadata",
"params": {
"json_path_rules": [
{
"json_path": "$.category",
"expected_value": "billing",
"expected_value_type": "string",
},
{
"json_path": "$.resolved",
"expected_value": "true",
"expected_value_type": "boolean",
},
]
},
}

Custom checks are pre-configured versions of the built-in check types. Instead of repeating the same params in every test case, you define the configuration once — giving it a project-scoped identifier, a name, and the check params — and then reference it by identifier wherever it’s needed.

# Register a reusable, project-scoped check configuration.
check = hub.checks.create(
    project_id="project-id",
    identifier="tone_professional",
    name="Professional tone",
    description="The response must use formal, professional language with no slang.",
    params={
        # "type" selects which built-in check this configuration wraps.
        "type": "conformity",
        "rules": [
            "The response must be written in a formal, professional tone. It must not contain slang, contractions, or casual phrasing."
        ],
    },
)
print(check.id)

Once created, reference your custom check by its identifier in any test case within the same project — no need to repeat the params:

# Reference the custom check by its identifier — no params needed here,
# since the configuration was stored when the check was created.
hub.test_cases.create(
    dataset_id="dataset-id",
    messages=[{"role": "user", "content": "hey, can u help me?"}],
    checks=[
        {"identifier": "tone_professional"},
    ],
)

Content safety check:

# Reusable content-safety configuration built on the conformity check.
hub.checks.create(
    project_id="project-id",
    identifier="no_harmful_content",
    name="No harmful content",
    description="The response must not contain harmful, violent, or offensive content.",
    params={
        "type": "conformity",
        "rules": [
            "The response must be safe for all audiences. It must not contain violence, hate speech, sexual content, or self-harm."
        ],
    },
)

Tool-call verification (metadata check):

# Reusable tool-call verification built on the metadata check.
hub.checks.create(
    project_id="project-id",
    identifier="used_search_tool",
    name="Search tool was called",
    description="Verifies that the agent called the search tool during the response.",
    params={
        "type": "metadata",
        "json_path_rules": [
            {
                "json_path": "$.tools_called",
                "expected_value": "search",
                "expected_value_type": "string",
            },
        ],
    },
)

# List, rename, and remove custom checks.
checks = hub.checks.list(project_id="project-id")
hub.checks.update("check-id", name="Updated name")
hub.checks.delete("check-id")