TrustTest
Red team GenAI apps for safety and reliability
TrustTest lets you save and load `EvaluationScenario`, `TestSet`, and `EvaluationTestResult` objects through a `trusttest.client`. Start by creating a client; the `neuraltrust` client type persists data to the NeuralTrust platform and authenticates with your API token:
```python
import os

import trusttest

client = trusttest.client(
    type="neuraltrust",
    token=os.getenv("NEURALTRUST_TOKEN"),
)
```
Next, build a scenario to persist. This quickstart example probes a dummy target with a one-item dataset and scores the response with a BLEU evaluator:

```python
from trusttest.evaluation_contexts import ExpectedResponseContext
from trusttest.evaluation_scenarios import EvaluationScenario
from trusttest.evaluator_suite import EvaluatorSuite
from trusttest.evaluators import BleuEvaluator
from trusttest.targets.testing import DummyTarget
from trusttest.probes.dataset import DatasetProbe
from trusttest.dataset_builder import Dataset, DatasetItem

# Probe a dummy target with a single question/expected-response pair.
target = DummyTarget()
probe = DatasetProbe(
    target=target,
    dataset=Dataset([
        [
            DatasetItem(
                question="What is Python?",
                context=ExpectedResponseContext(
                    expected_response="Python is a high-level, interpreted programming language."
                ),
            )
        ]
    ]),
)
test_set = probe.get_test_set()

# Score responses with BLEU; "any_fail" fails the suite if any evaluator fails.
scenario = EvaluationScenario(
    name="Quickstart Functional Test",
    description="Functional test example.",
    evaluator_suite=EvaluatorSuite(
        evaluators=[BleuEvaluator(threshold=0.3)],
        criteria="any_fail",
    ),
)
results = scenario.evaluate(test_set)
```
Save the scenario, its test set, and the evaluation run:

```python
client.save_evaluation_scenario(scenario)
client.save_evaluation_scenario_test_set(scenario.id, test_set)
client.save_evaluation_scenario_run(results)
```
Alternatively, use the `file-system` client type, which stores everything in a local `trusttest_db` directory instead of the NeuralTrust platform.
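A minimal sketch of that local setup, assuming the client factory accepts `type="file-system"` with no token (the `file-system` type and the `trusttest_db` directory come from this page; the exact constructor signature is an assumption):

```python
import trusttest

# Assumption: a "file-system" client needs no API token and persists
# scenarios, test sets, and runs under a local `trusttest_db` directory.
client = trusttest.client(type="file-system")

# The same save/load calls from above should then read and write local files.
client.save_evaluation_scenario(scenario)
loaded = client.get_evaluation_scenario(scenario.id)
```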
Later, load the scenario, its run, and its test set back, then re-run the evaluation:

```python
loaded_scenario = client.get_evaluation_scenario(scenario.id)
loaded_scenario_result = client.get_evaluation_scenario_run(scenario.id)
loaded_test_set = client.get_evaluation_scenario_test_set(scenario.id)

result = loaded_scenario.evaluate(loaded_test_set)
result.display()
```
Complete example:

```python
import os

import trusttest
from trusttest.evaluation_contexts import ExpectedResponseContext
from trusttest.evaluation_scenarios import EvaluationScenario
from trusttest.evaluator_suite import EvaluatorSuite
from trusttest.evaluators import BleuEvaluator
from trusttest.targets.testing import DummyTarget
from trusttest.probes.dataset import DatasetProbe
from trusttest.dataset_builder import Dataset, DatasetItem

# Build the test set from a one-item dataset probe.
target = DummyTarget()
probe = DatasetProbe(
    target=target,
    dataset=Dataset([
        [
            DatasetItem(
                question="What is Python?",
                context=ExpectedResponseContext(
                    expected_response="Python is a high-level, interpreted programming language."
                ),
            )
        ]
    ]),
)
test_set = probe.get_test_set()

# Define and run the scenario.
scenario = EvaluationScenario(
    name="Quickstart Functional Test",
    description="Functional test example.",
    evaluator_suite=EvaluatorSuite(
        evaluators=[BleuEvaluator(threshold=0.3)],
        criteria="any_fail",
    ),
)
results = scenario.evaluate(test_set)

# Save the scenario, test set, and run.
client = trusttest.client(type="neuraltrust", token=os.getenv("NEURALTRUST_TOKEN"))
client.save_evaluation_scenario(scenario)
client.save_evaluation_scenario_test_set(scenario.id, test_set)
client.save_evaluation_scenario_run(results)

# Load everything back and re-evaluate.
loaded_scenario = client.get_evaluation_scenario(scenario.id)
loaded_scenario_result = client.get_evaluation_scenario_run(scenario.id)
loaded_test_set = client.get_evaluation_scenario_test_set(scenario.id)

result = loaded_scenario.evaluate(loaded_test_set)
result.display()
```