From 958b9d281cff52bbde5c9d6e8e8ff8e13504dc85 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Mon, 15 Jan 2024 18:05:42 +0100 Subject: [PATCH 01/88] Remove unused arguments in _make_evaluate_function --- giskard/llm/evaluators/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/giskard/llm/evaluators/base.py b/giskard/llm/evaluators/base.py index 0a6e19dec9..925964a6a8 100644 --- a/giskard/llm/evaluators/base.py +++ b/giskard/llm/evaluators/base.py @@ -78,7 +78,7 @@ def _make_evaluate_prompt(self, model: BaseModel, input_vars, model_output, row_ model_output=model_output, ) - def _make_evaluate_functions(self, model: BaseModel, input_vars, model_output): + def _make_evaluate_functions(self): return EVALUATE_MODEL_FUNCTIONS def evaluate(self, model: BaseModel, dataset: Dataset): From b7cae6ad052c3c8d658b210ec120fe4c838f920e Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Mon, 15 Jan 2024 18:06:38 +0100 Subject: [PATCH 02/88] Add correctness evaluator --- giskard/llm/evaluators/correctness.py | 118 ++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 giskard/llm/evaluators/correctness.py diff --git a/giskard/llm/evaluators/correctness.py b/giskard/llm/evaluators/correctness.py new file mode 100644 index 0000000000..cda90399a9 --- /dev/null +++ b/giskard/llm/evaluators/correctness.py @@ -0,0 +1,118 @@ +from typing import Sequence + +from enum import Enum + +from giskard.datasets import Dataset +from giskard.llm.errors import LLMGenerationError +from giskard.llm.evaluators.base import EvaluationResult, LLMBasedEvaluator +from giskard.models.base.model import BaseModel + +CORRECTNESS_EVALUATION_PROMPT = """Your role is to test AI models. Your task consists in assessing whether a model output correctly answers a question. +You are provided with the ground truth answer to the question. Your task is then to evaluate if the model answer is close to the ground thruth answer. + +You are auditing the following model: + +Model name: {model_name} +Model description: {model_description} + +Here is the question that was asked to the model and its output, followed by the expected ground truth answer: + +QUESTION: +### +{question} +### + +MODEL OUTPUT: +### +{model_output} +### + +GROUND TRUTH: +### +{ground_truth} +### + +Think step by step and consider the model output in its entirety. Remember: you need to have a strong and sound reason to support your evaluation. +Call the `evaluate_model` function with the result of your evaluation. 
+""" + + +class EvaluationFeatures(Enum): + QUESTION = 0 + REFERENCE_ANSWER = 1 + REFERENCE_CONTEXT = 2 + + +class CorrectnessEvaluator(LLMBasedEvaluator): + _default_eval_prompt = CORRECTNESS_EVALUATION_PROMPT + _required_features = ["question", "reference_answer"] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def _make_evaluate_prompt(self, model_name, model_description, question, model_output, ground_truth): + return self.eval_prompt.format( + model_name=model_name, + model_description=model_description, + question=question, + model_output=model_output, + ground_truth=ground_truth, + ) + + def evaluate(self, model: BaseModel, dataset: Dataset, feature_names: Sequence = None): + feature_names = self._required_features if feature_names is None else feature_names + + if any([name not in dataset.df for name in feature_names]): + raise ValueError(f"Missing at least one required feature in the evaluation dataset among: {feature_names}.") + + if any([name not in model.feature_names for name in feature_names]): + raise ValueError(f"Missing at least one required feature in the evaluated model among: {feature_names}.") + + model_outputs = model.predict(dataset).prediction + succeeded = [] + failed = [] + errored = [] + for evaluation_question, model_output in zip(dataset.df.to_dict("records"), model_outputs): + try: + passed, reason = self._evaluate_single( + model, evaluation_question[feature_names[0]], evaluation_question[feature_names[1]], model_output + ) + sample = { + **evaluation_question, + "reason": reason, + "model_output": model_output, + "model_evaluation": passed, + } + if passed: + succeeded.append(sample) + else: + failed.append(sample) + except LLMGenerationError as err: + errored.append({"message": str(err), "sample": {**evaluation_question, "model_output": model_output}}) + + return EvaluationResult( + failure_examples=failed, + success_examples=succeeded, + errors=errored, + ) + + def _evaluate_single(self, model: BaseModel, question, reference_answer, model_output): + prompt = self._make_evaluate_prompt( + model.meta.name, + model.meta.description, + question, + model_output, + reference_answer, + ) + + out = self.llm_client.complete( + [{"role": "system", "content": prompt}], + functions=self._make_evaluate_functions(), + function_call={"name": "evaluate_model"}, + temperature=self.llm_temperature, + caller_id=self.__class__.__name__, + ) + if out.function_call is None or "passed_test" not in out.function_call.args: + raise LLMGenerationError("Invalid function call arguments received") + + return out.function_call.args["passed_test"], out.function_call.args.get("reason") From 43cea8d96602574fdba0c126d87eb7764de28e52 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Mon, 15 Jan 2024 18:08:00 +0100 Subject: [PATCH 03/88] Add tests for CorrectnessEvaluator --- .../evaluators/test_correctness_evaluator.py | 141 ++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 tests/llm/evaluators/test_correctness_evaluator.py diff --git a/tests/llm/evaluators/test_correctness_evaluator.py b/tests/llm/evaluators/test_correctness_evaluator.py new file mode 100644 index 0000000000..ee62285ebe --- /dev/null +++ b/tests/llm/evaluators/test_correctness_evaluator.py @@ -0,0 +1,141 @@ +from unittest.mock import Mock + +import pandas as pd +import pytest + +from giskard.datasets.base import Dataset +from giskard.llm.client import LLMFunctionCall, LLMOutput +from giskard.llm.evaluators.correctness import CorrectnessEvaluator +from 
giskard.models.base.model_prediction import ModelPredictionResults + + +def _make_eval_dataset(): + ds = Dataset( + pd.DataFrame( + { + "question": ["What is the capital of France?", "What is the capital of Italy?"], + "reference_answer": ["Paris is the capital of France", "Rome is the capital of Italy"], + "reference_context": [ + "France is a unitary semi-presidential republic with its capital in Paris, the country's largest city and main cultural and commercial centre.", + "Italy covers an area of 301,340 km2 is the third-most populous member state of the European Union. Its capital and largest city is Rome.", + ], + "difficulty": [0, 1], + "answerable": [True, True], + } + ) + ) + return ds + + +def _make_mock_model(feature_names=None): + model = Mock() + model.predict.return_value = ModelPredictionResults( + prediction=["The capital of France is Paris", "The capital of Italy is Paris"] + ) + model.feature_names = feature_names if feature_names else ["question", "reference_answer", "reference_context"] + model.name = "Mock model for test" + model.description = "This is a model for testing purposes" + return model + + +def test_correctness_evaluator_correctly_flags_examples(): + dataset = _make_eval_dataset() + model = _make_mock_model() + + client = Mock() + client.complete.side_effect = [ + LLMOutput( + function_call=LLMFunctionCall( + function="evaluate_model", + args={"passed_test": True, "reason": ""}, + ) + ), + LLMOutput( + function_call=LLMFunctionCall( + function="evaluate_model", + args={ + "passed_test": False, + "reason": "The model output does not agree with the ground truth: Rome is the capital of Italy", + }, + ) + ), + ] + + evaluator = CorrectnessEvaluator(llm_client=client) + + result = evaluator.evaluate(model, dataset) + + assert len(result.success_examples) == 1 + assert len(result.failure_examples) == 1 + + assert ( + result.failure_examples[0]["reason"] + == "The model output does not agree with the ground truth: Rome is the capital of Italy" + ) + assert result.failure_examples[0]["question"] == "What is the capital of Italy?" + assert result.failure_examples[0]["reference_answer"] == "Rome is the capital of Italy" + assert ( + result.failure_examples[0]["reference_context"] + == "Italy covers an area of 301,340 km2 is the third-most populous member state of the European Union. Its capital and largest city is Rome." 
+ ) + assert result.failure_examples[0]["model_output"] == "The capital of Italy is Paris" + assert not result.failure_examples[0]["model_evaluation"] + + # Check LLM client calls arguments + args = client.complete.call_args_list[0] + assert "Your role is to test AI models" in args[0][0][0]["content"] + assert args[1]["functions"][0]["name"] == "evaluate_model" + + +def test_correctness_evaluator_handles_generation_errors(): + dataset = _make_eval_dataset() + model = _make_mock_model() + + client = Mock() + client.complete.side_effect = [ + LLMOutput( + function_call=LLMFunctionCall( + function="evaluate_model", + args={"passed_test": True, "reason": ""}, + ) + ), + LLMOutput( + function_call=LLMFunctionCall( + function="evaluate_model", + args={ + "pass": False, + "reason": "The model output does not agree with the ground truth: Rome is the capital of Italy", + }, + ) + ), + ] + + evaluator = CorrectnessEvaluator(llm_client=client) + + result = evaluator.evaluate(model, dataset) + + assert len(result.success_examples) == 1 + assert len(result.errors) == 1 + + assert result.errors[0]["message"] == "Invalid function call arguments received" + + +def test_raises_error_if_missing_feature_in_dataset(): + dataset = _make_eval_dataset() + dataset.df = dataset.df.drop("question", axis=1) + + model = _make_mock_model() + + evaluator = CorrectnessEvaluator(llm_client=Mock()) + with pytest.raises(ValueError, match="Missing at least one required feature in the evaluation dataset among"): + evaluator.evaluate(model, dataset) + + +def test_raises_error_if_missing_feature_in_model(): + dataset = _make_eval_dataset() + + model = _make_mock_model(feature_names=["question"]) + + evaluator = CorrectnessEvaluator(llm_client=Mock()) + with pytest.raises(ValueError, match="Missing at least one required feature in the evaluated model among"): + evaluator.evaluate(model, dataset) From bb8902b9dde56afc1b52e787bd0b8927eb583b19 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Tue, 16 Jan 2024 15:39:51 +0100 Subject: [PATCH 04/88] Minor fix following Matteo's comments --- giskard/llm/evaluators/base.py | 2 +- giskard/llm/evaluators/correctness.py | 18 +++++++----------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/giskard/llm/evaluators/base.py b/giskard/llm/evaluators/base.py index 671c78cb0e..16d394bb5a 100644 --- a/giskard/llm/evaluators/base.py +++ b/giskard/llm/evaluators/base.py @@ -79,7 +79,7 @@ def _make_evaluate_prompt(self, model: BaseModel, input_vars, model_output, row_ model_output=model_output, ) - def _make_evaluate_functions(self): + def _make_evaluate_functions(self, model: BaseModel, input_vars, model_output): return EVALUATE_MODEL_FUNCTIONS def evaluate(self, model: BaseModel, dataset: Dataset): diff --git a/giskard/llm/evaluators/correctness.py b/giskard/llm/evaluators/correctness.py index cda90399a9..4fe6c66e51 100644 --- a/giskard/llm/evaluators/correctness.py +++ b/giskard/llm/evaluators/correctness.py @@ -1,10 +1,12 @@ from typing import Sequence -from enum import Enum - from giskard.datasets import Dataset from giskard.llm.errors import LLMGenerationError -from giskard.llm.evaluators.base import EvaluationResult, LLMBasedEvaluator +from giskard.llm.evaluators.base import ( + EVALUATE_MODEL_FUNCTIONS, + EvaluationResult, + LLMBasedEvaluator, +) from giskard.models.base.model import BaseModel CORRECTNESS_EVALUATION_PROMPT = """Your role is to test AI models. Your task consists in assessing whether a model output correctly answers a question. 
@@ -37,18 +39,12 @@ """ -class EvaluationFeatures(Enum): - QUESTION = 0 - REFERENCE_ANSWER = 1 - REFERENCE_CONTEXT = 2 - - class CorrectnessEvaluator(LLMBasedEvaluator): _default_eval_prompt = CORRECTNESS_EVALUATION_PROMPT _required_features = ["question", "reference_answer"] - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def _make_evaluate_functions(self): + return EVALUATE_MODEL_FUNCTIONS def _make_evaluate_prompt(self, model_name, model_description, question, model_output, ground_truth): return self.eval_prompt.format( From b140cca1c3b448ee2250ebc0c3236961585db9af Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 17 Jan 2024 10:23:27 +0100 Subject: [PATCH 05/88] Add basic vector store and embedding model --- giskard/rag/embeddings.py | 26 ++++++++++++++++ giskard/rag/vector_store.py | 59 +++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 giskard/rag/embeddings.py create mode 100644 giskard/rag/vector_store.py diff --git a/giskard/rag/embeddings.py b/giskard/rag/embeddings.py new file mode 100644 index 0000000000..f321c94b9f --- /dev/null +++ b/giskard/rag/embeddings.py @@ -0,0 +1,26 @@ +from abc import ABC, abstractmethod + +import numpy as np +from openai import OpenAI + + +class EmbeddingsBase(ABC): + @abstractmethod + def embed_text(self, text: str) -> str: + ... + + +class OpenAIEmbeddings(EmbeddingsBase): + def __init__(self, model: str = "text-embedding-ada-002", client=None): + self.model = model + self._client = client if client is not None else OpenAI() + + def embed_text(self, text: str) -> str: + print(text) + text = text.replace("\n", " ") + try: + out = self._client.embeddings.create(input=[text], model=self.model) + embeddings = out.data[0].embedding + except Exception as err: + raise ValueError(f"Embedding creation failed for text: {text}.") from err + return np.array(embeddings) diff --git a/giskard/rag/vector_store.py b/giskard/rag/vector_store.py new file mode 100644 index 0000000000..73295addf5 --- /dev/null +++ b/giskard/rag/vector_store.py @@ -0,0 +1,59 @@ +from typing import Optional, Sequence + +import numpy as np +import pandas as pd +from faiss import IndexFlatL2 + +from .embeddings import EmbeddingsBase + + +class Document: + def __init__(self, document: dict, features: Optional[Sequence] = None): + if len(document) == 1: + self.page_content = list(document.values())[0] + elif features is not None and any([feat in document for feat in features]): + if len(features) == 1: + self.page_content = document[features[0]] + else: + self.page_content = "\n".join([f"{feat}: {document[feat]}" for feat in features]) + else: + self.page_content = "\n".join([f"{key}: {value}" for key, value in document.items()]) + + self.metadata = document + + +class VectorStore: + def __init__(self, documents: Sequence[Document], embeddings: np.array, embedding_model: EmbeddingsBase): + if len(embeddings) == 0 or len(documents) == 0: + raise ValueError("Documents and embeddings must contains at least one element.") + if len(embeddings) != len(documents): + raise ValueError("Documents and embeddings must have the same length.") + + self.embeddings = embeddings + self.documents = documents + self.embedding_model = embedding_model + + self.dimension = self.embeddings[0].shape[0] + self.index = IndexFlatL2(self.dimension) + self.index.add(self.embeddings) + + @classmethod + def from_df(cls, df: pd.DataFrame, embedding_model: EmbeddingsBase, features: Sequence[str] = None): + if len(df) > 0: + documents = 
[Document(knowledge_chunk, features=features) for knowledge_chunk in df.to_dict("records")] + try: + embeddings = np.stack( + [embedding_model.embed_text(document.page_content) for document in documents] + ).astype("float32") + except Exception as err: + raise ValueError("Failed to embed the list of documents.") from err + + return cls(documents, embeddings, embedding_model) + else: + raise ValueError("Cannot generate a vector store from empty DataFrame.") + + def similarity_search_with_score(self, query, k): + query_emb = self.embedding_model.embed_text(query) + print(query_emb) + distances, indices = self.index.search(query_emb[None, :], k=k) + return [(self.documents[i], d) for d, i in zip(distances[0], indices[0])] From caaef7248d81c07d99cce6f4132a249a6d6d54c7 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 17 Jan 2024 10:24:16 +0100 Subject: [PATCH 06/88] Add testset generator --- giskard/rag/__init__.py | 0 .../rag/knowledge_base_testset_generator.py | 139 ++++++++++++++++++ giskard/rag/prompts.py | 33 +++++ 3 files changed, 172 insertions(+) create mode 100644 giskard/rag/__init__.py create mode 100644 giskard/rag/knowledge_base_testset_generator.py create mode 100644 giskard/rag/prompts.py diff --git a/giskard/rag/__init__.py b/giskard/rag/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py new file mode 100644 index 0000000000..496547ac18 --- /dev/null +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -0,0 +1,139 @@ +import numpy as np +import pandas as pd + +from ..datasets import Dataset +from ..llm.errors import LLMGenerationError +from ..llm.generators import BaseDataGenerator +from .embeddings import EmbeddingsBase, OpenAIEmbeddings +from .prompts import ANSWER_GENERATION_PROMPT, QUESTION_GENERATION_PROMPT +from .vector_store import VectorStore + + +class KnowledgeBaseTestsetGenerator(BaseDataGenerator): + _question_generation_prompt = QUESTION_GENERATION_PROMPT + _answer_generation_prompt = ANSWER_GENERATION_PROMPT + _difficulty_level = 1 + + def __init__( + self, + knowledge_df, + model_name: str, + model_description: str, + context_neighbors: int = 4, + context_similarity_threshold: float = 0.2, + context_window_length: int = 8192, + embedding_model: EmbeddingsBase = None, + language: str = "english", + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.model_name = model_name + self.model_description = model_description + self.context_neighbors = context_neighbors + self.context_similarity_threshold = context_similarity_threshold + + # ideally should be moved into llm_client object but OpenAI has no API to retrieve + # model context length + self.context_window_length = context_window_length + self.embedding_model = embedding_model if embedding_model is not None else OpenAIEmbeddings() + self.language = language + + self.knowledge_base = VectorStore.from_df(knowledge_df, self.embedding_model) + + def _make_generate_input_functions(self, return_attribute_name): + return [ + { + "name": "generate_inputs", + "description": "generates inputs for model audit", + "parameters": { + "type": "object", + "properties": { + "inputs": { + "type": "array", + "items": { + "type": "object", + "properties": {return_attribute_name: {"type": "string"}}, + }, + } + }, + "required": ["inputs"], + }, + } + ] + + def _generate_question_from_context(self, context): + prompt = self._question_generation_prompt.format( + context=context, + 
model_name=self.model_name, + model_description=self.model_description, + language=self.language, + ) + + prompt = self._prevent_context_window_overflow(prompt) + return self._llm_complete(prompt, self._make_generate_input_functions("question")) + + def _generate_answer_from_context(self, question, context): + prompt = self._answer_generation_prompt.format(question=question, context=context) + prompt = self._prevent_context_window_overflow(prompt) + return self._llm_complete(prompt, self._make_generate_input_functions("answer")) + + def _extract_seed_context(self): + seed_context = np.random.choice(self.knowledge_base.documents) + relevant_contexts = [ + context + for (context, score) in self.knowledge_base.similarity_search_with_score( + seed_context.page_content, k=self.context_neighbors + ) + if score < self.context_similarity_threshold # should we keep it or not ? + ] + + print(f"Retrieved {len(relevant_contexts)} relevant contexts.") + return relevant_contexts + + def _format_context(self, contexts): + context_string = "\n\n".join( + ["### Context {} ###\n{}\n######".format(idx + 1, c.page_content) for idx, c in enumerate(contexts)] + ) + return context_string + + def _prevent_context_window_overflow(self, prompt): + # Prevent context overflow + # general rule of thumbs to count tokens: 1 token ~ 4 characters + # https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them + return prompt[: self.context_window_length // 4] + + def _llm_complete(self, prompt, functions): + try: + out = self.llm_client.complete( + messages=[{"role": "system", "content": prompt}], + functions=functions, + function_call={"name": "generate_inputs"}, + temperature=self.llm_temperature, + caller_id=self.__class__.__name__, + ) + generated = out.function_call.args["inputs"] + except (AttributeError, KeyError) as err: + raise LLMGenerationError("Could not parse generated inputs") from err + + return generated + + def generate_testset(self, num_samples: int = 10) -> Dataset: + generated_questions = [] + for idx in range(num_samples): + seed_contexts = self._extract_seed_context() + context = self._format_context(seed_contexts) + + question = self._generate_question_from_context(context)[0] + answer = self._generate_answer_from_context(question["question"], context)[0] + + generated_questions.append( + { + "question": question["question"], + "reference_answer": answer["answer"], + "reference_context": context, + "difficulty_level": self._difficulty_level, + } + ) + + return pd.DataFrame(generated_questions) diff --git a/giskard/rag/prompts.py b/giskard/rag/prompts.py new file mode 100644 index 0000000000..3c0d9228c8 --- /dev/null +++ b/giskard/rag/prompts.py @@ -0,0 +1,33 @@ +QUESTION_GENERATION_PROMPT = """You are a client from an online shop called {model_name}. Shop description: {model_description} +You are looking for information about specific products that are sold on by this shop and about the shop's activities. + +Your task is to generate questions about the products, the ordering process and the shop's activities in general. Your question must be related to a provided context. +Please respect the following rules to generate the question: +- The answer to the question should be found, at least partially, inside the provided context. +- The question must be self-contained and understandable by humans. +- The question must be in {language}. 
+ +Here is the context: + +{context} + + +Remember you are a client of {model_name}, you are looking for information to help you with your shopping. +Please call the `generate_inputs` function with the generated inputs. +""" + +ANSWER_GENERATION_PROMPT = """Your task is to answer a question based on a provided context. +The answer should be clear and concise. Think step by step and answer the question thoroughly. +Your answer must only contain information provided by the context. + +Here is the context and the question: + +{context} + + + +{question} + + +Please call the `generate_inputs` function with the generated inputs. +""" From c453b328347b340ae2f0bb5b5d70014c2e02eb2b Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 17 Jan 2024 10:24:51 +0100 Subject: [PATCH 07/88] Add unit tests for the rag module --- tests/rag/test_document_creation.py | 60 +++++++++++ tests/rag/test_embedding_model.py | 22 ++++ .../test_knowledge_base_testset_generator.py | 101 ++++++++++++++++++ tests/rag/test_vector_store.py | 69 ++++++++++++ 4 files changed, 252 insertions(+) create mode 100644 tests/rag/test_document_creation.py create mode 100644 tests/rag/test_embedding_model.py create mode 100644 tests/rag/test_knowledge_base_testset_generator.py create mode 100644 tests/rag/test_vector_store.py diff --git a/tests/rag/test_document_creation.py b/tests/rag/test_document_creation.py new file mode 100644 index 0000000000..f6719479cd --- /dev/null +++ b/tests/rag/test_document_creation.py @@ -0,0 +1,60 @@ +from giskard.rag import Document + + +def test_single_feature_document_creation(): + doc = Document({"feature": "This a test value for a feature"}) + + assert doc.page_content == "This a test value for a feature" + assert doc.metadata == {"feature": "This a test value for a feature"} + + +def test_multiple_features_document_creation(): + doc = Document( + { + "feat1": "This a test value for a feature 1", + "feat2": "This a test value for a feature 2", + "feat3": "This a test value for a feature 3", + } + ) + assert ( + doc.page_content + == "feat1: This a test value for a feature 1\nfeat2: This a test value for a feature 2\nfeat3: This a test value for a feature 3" + ) + assert doc.metadata == { + "feat1": "This a test value for a feature 1", + "feat2": "This a test value for a feature 2", + "feat3": "This a test value for a feature 3", + } + + doc = Document( + { + "feat1": "This a test value for a feature 1", + "feat2": "This a test value for a feature 2", + "feat3": "This a test value for a feature 3", + }, + features=["feat1"], + ) + assert doc.page_content == "This a test value for a feature 1" + + doc = Document( + { + "feat1": "This a test value for a feature 1", + "feat2": "This a test value for a feature 2", + "feat3": "This a test value for a feature 3", + }, + features=["feat1", "feat2"], + ) + assert doc.page_content == "feat1: This a test value for a feature 1\nfeat2: This a test value for a feature 2" + + doc = Document( + { + "feat1": "This a test value for a feature 1", + "feat2": "This a test value for a feature 2", + "feat3": "This a test value for a feature 3", + }, + features=["feat4"], + ) + assert ( + doc.page_content + == "feat1: This a test value for a feature 1\nfeat2: This a test value for a feature 2\nfeat3: This a test value for a feature 3" + ) diff --git a/tests/rag/test_embedding_model.py b/tests/rag/test_embedding_model.py new file mode 100644 index 0000000000..8bbd562e73 --- /dev/null +++ b/tests/rag/test_embedding_model.py @@ -0,0 +1,22 @@ +from unittest.mock import 
Mock + +import numpy as np + +from giskard.rag import OpenAIEmbeddings + + +def test_openai_embeddings_model(): + embedding_mock = Mock() + embedding_mock.embedding = np.ones(8) + + embedding_call = Mock() + embedding_call.data = [embedding_mock] + + client = Mock() + client.embeddings.create.side_effect = [embedding_call] + + embedding_model = OpenAIEmbeddings(client=client) + + embedded_text = embedding_model.embed_text("This a test string") + assert len(embedded_text) == 8 + assert np.allclose(embedded_text, np.ones(8)) diff --git a/tests/rag/test_knowledge_base_testset_generator.py b/tests/rag/test_knowledge_base_testset_generator.py new file mode 100644 index 0000000000..dedbe59cc4 --- /dev/null +++ b/tests/rag/test_knowledge_base_testset_generator.py @@ -0,0 +1,101 @@ +from unittest.mock import Mock + +import numpy as np +import pandas as pd + +from giskard.llm.client import LLMFunctionCall, LLMOutput +from giskard.rag import KnowledgeBaseTestsetGenerator + + +def make_knowledge_base_df(): + knowledge_base_df = pd.DataFrame( + [ + {"context": "Camembert is a moist, soft, creamy, surface-ripened cow's milk cheese."}, + { + "context": "Bleu d'Auvergne is a French blue cheese, named for its place of origin in the Auvergne region." + }, + {"context": "Scamorza is a Southern Italian cow's milk cheese."}, + { + "context": "Freeriding is a style of snowboarding or skiing performed on natural, un-groomed terrain, without a set course, goals or rules." + }, + ] + ) + return knowledge_base_df + + +CONTEXT_STRING = """### Context 1 ### +Scamorza is a Southern Italian cow's milk cheese. +###### + +### Context 2 ### +Bleu d'Auvergne is a French blue cheese, named for its place of origin in the Auvergne region. +###### + +### Context 3 ### +Freeriding is a style of snowboarding or skiing performed on natural, un-groomed terrain, without a set course, goals or rules. +######""" + + +def test_testset_generation(): + llm_client = Mock() + llm_client.complete.side_effect = [ + LLMOutput( + None, + LLMFunctionCall( + "generate_inputs", + { + "inputs": [ + {"question": "Where is Camembert from?"}, + ] + }, + ), + ), + LLMOutput( + None, + LLMFunctionCall( + "generate_inputs", + { + "inputs": [ + {"answer": "Camembert was created in Normandy, in the northwest of France."}, + ] + }, + ), + ), + ] * 2 + + embedding_dimension = 8 + + embedding_model = Mock() + # evenly spaced embeddings for the knowledge base elements and specifically chosen embeddings for + # each mock embedding calls. + embedding_model.embed_text.side_effect = [np.ones(embedding_dimension) * idx / 100 for idx in range(4)] + [ + np.ones(8) * 0.02, + np.ones(8) * 10, + ] + + knowledge_base_df = make_knowledge_base_df() + testset_generator = KnowledgeBaseTestsetGenerator( + knowledge_base_df, + model_name="Test model", + model_description="This is a model for testing purpose.", + llm_client=llm_client, + embedding_model=embedding_model, + context_neighbors=3, + ) + + assert testset_generator.knowledge_base.index.d == 8 + assert testset_generator.knowledge_base.embeddings.shape == (4, 8) + assert len(testset_generator.knowledge_base.documents) == 4 + assert testset_generator.knowledge_base.documents[2].page_content.startswith( + "Scamorza is a Southern Italian cow's milk cheese." + ) + + test_set = testset_generator.generate_testset(num_samples=2) + assert len(test_set) == 2 + assert test_set.loc[0, "question"] == "Where is Camembert from?" 
+ assert test_set.loc[0, "reference_answer"] == "Camembert was created in Normandy, in the northwest of France." + assert test_set.loc[0, "reference_context"] == CONTEXT_STRING + assert test_set.loc[0, "difficulty_level"] == 1 + + assert test_set.loc[1, "question"] == "Where is Camembert from?" + assert test_set.loc[1, "reference_context"] == "" diff --git a/tests/rag/test_vector_store.py b/tests/rag/test_vector_store.py new file mode 100644 index 0000000000..5f5803f08a --- /dev/null +++ b/tests/rag/test_vector_store.py @@ -0,0 +1,69 @@ +from unittest.mock import Mock + +import numpy as np +import pandas as pd +import pytest + +from giskard.rag import Document, VectorStore + + +def test_vector_store_creation(): + dimension = 8 + embeddings = np.repeat(np.arange(5)[:, None], 8, axis=1) + documents = [Document({"feature": "This is a test string"})] * 5 + + embedding_model = Mock() + + store = VectorStore(documents, embeddings, embedding_model) + assert store.embeddings.shape == (5, 8) + assert len(store.documents) == 5 + assert store.index.d == dimension + assert store.index.ntotal == 5 + + with pytest.raises(ValueError, match="Documents and embeddings must have the same length."): + store = VectorStore(documents, np.repeat(np.arange(4)[:, None], 8, axis=1), embedding_model) + + with pytest.raises(ValueError, match="Documents and embeddings must contains at least one element."): + store = VectorStore(documents, [], embedding_model) + + with pytest.raises(ValueError, match="Documents and embeddings must contains at least one element."): + store = VectorStore([], [], embedding_model) + + +def test_vector_store_creation_from_df(): + dimension = 8 + df = pd.DataFrame(["This is a test string"] * 5) + + embedding_model = Mock() + random_embedding = np.random.rand(dimension) + embedding_model.embed_text.side_effect = [random_embedding] * 5 + + store = VectorStore.from_df(df, embedding_model) + assert store.index.d == dimension + assert store.embeddings.shape == (5, 8) + assert len(store.documents) == 5 + assert store.index.ntotal == 5 + + assert np.allclose(store.embeddings[0], random_embedding) + + +def test_vector_store_similarity_search_with_score(): + dimension = 8 + embeddings = np.repeat(np.arange(100)[:, None], 8, axis=1) + documents = [Document({"feature": f"This is test string {idx + 1}"}) for idx in range(100)] + + embedding_model = Mock() + embedding_model.embed_text.side_effect = [np.ones(dimension) * 49] + + store = VectorStore(documents, embeddings, embedding_model) + + query = "This is test string 50" + retrieved_elements = store.similarity_search_with_score(query, k=3) + print([(ret.page_content, score) for (ret, score) in retrieved_elements]) + assert len(retrieved_elements) == 3 + assert retrieved_elements[0][0].page_content == "This is test string 50" + assert retrieved_elements[0][1] == 0.0 + assert retrieved_elements[1][0].page_content == "This is test string 49" + assert retrieved_elements[1][1] == 8.0 + assert retrieved_elements[2][0].page_content == "This is test string 51" + assert retrieved_elements[2][1] == 8.0 From 37eaf8ac2eb819f871a273d96f042a0c8771e431 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 17 Jan 2024 10:27:11 +0100 Subject: [PATCH 08/88] Add faiss-cpu as a dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 92208f7028..b4003b867c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -174,6 +174,7 @@ dependencies = [ "markdown", # needed for display of scan results in notebook 
"colorama", # needed for the scan "griffe>=0.36.9", + "faiss-cpu>=1.7.4", ] [project.optional-dependencies] From 9b91a9a7537557c7b4b80c5d649a3420b7d3ff4c Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 17 Jan 2024 10:30:59 +0100 Subject: [PATCH 09/88] Fix imports of rag module in tests --- tests/rag/test_document_creation.py | 2 +- tests/rag/test_embedding_model.py | 2 +- tests/rag/test_vector_store.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/rag/test_document_creation.py b/tests/rag/test_document_creation.py index f6719479cd..28b262b2fc 100644 --- a/tests/rag/test_document_creation.py +++ b/tests/rag/test_document_creation.py @@ -1,4 +1,4 @@ -from giskard.rag import Document +from giskard.rag.vector_store import Document def test_single_feature_document_creation(): diff --git a/tests/rag/test_embedding_model.py b/tests/rag/test_embedding_model.py index 8bbd562e73..a8d15f2975 100644 --- a/tests/rag/test_embedding_model.py +++ b/tests/rag/test_embedding_model.py @@ -2,7 +2,7 @@ import numpy as np -from giskard.rag import OpenAIEmbeddings +from giskard.rag.embeddings import OpenAIEmbeddings def test_openai_embeddings_model(): diff --git a/tests/rag/test_vector_store.py b/tests/rag/test_vector_store.py index 5f5803f08a..86f4c53c3d 100644 --- a/tests/rag/test_vector_store.py +++ b/tests/rag/test_vector_store.py @@ -4,7 +4,7 @@ import pandas as pd import pytest -from giskard.rag import Document, VectorStore +from giskard.rag.vector_store import Document, VectorStore def test_vector_store_creation(): From fb8a3147ec4c3675c276f014423987dc90b4a4ec Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 17 Jan 2024 10:38:29 +0100 Subject: [PATCH 10/88] Add import in __init__ file --- giskard/rag/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/giskard/rag/__init__.py b/giskard/rag/__init__.py index e69de29bb2..8cf4f4881c 100644 --- a/giskard/rag/__init__.py +++ b/giskard/rag/__init__.py @@ -0,0 +1,3 @@ +from .knowledge_base_testset_generator import KnowledgeBaseTestsetGenerator + +__all__ = [KnowledgeBaseTestsetGenerator] From 3c3a14e2accd0b9764b66e066e5edfa724d3eb4f Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 17 Jan 2024 10:50:08 +0100 Subject: [PATCH 11/88] Minor cleaning --- giskard/rag/embeddings.py | 1 - giskard/rag/vector_store.py | 1 - 2 files changed, 2 deletions(-) diff --git a/giskard/rag/embeddings.py b/giskard/rag/embeddings.py index f321c94b9f..4ef0b86978 100644 --- a/giskard/rag/embeddings.py +++ b/giskard/rag/embeddings.py @@ -16,7 +16,6 @@ def __init__(self, model: str = "text-embedding-ada-002", client=None): self._client = client if client is not None else OpenAI() def embed_text(self, text: str) -> str: - print(text) text = text.replace("\n", " ") try: out = self._client.embeddings.create(input=[text], model=self.model) diff --git a/giskard/rag/vector_store.py b/giskard/rag/vector_store.py index 73295addf5..afcea4f634 100644 --- a/giskard/rag/vector_store.py +++ b/giskard/rag/vector_store.py @@ -54,6 +54,5 @@ def from_df(cls, df: pd.DataFrame, embedding_model: EmbeddingsBase, features: Se def similarity_search_with_score(self, query, k): query_emb = self.embedding_model.embed_text(query) - print(query_emb) distances, indices = self.index.search(query_emb[None, :], k=k) return [(self.documents[i], d) for d, i in zip(distances[0], indices[0])] From 6a1da207e3b4cdaf61294290f9ed3b4bff485d66 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 17 Jan 2024 14:50:26 +0100 Subject: [PATCH 12/88] Add batch 
embedding to get much faster KB creation --- giskard/rag/embeddings.py | 13 ++++++++++++- giskard/rag/knowledge_base_testset_generator.py | 14 ++++++-------- giskard/rag/vector_store.py | 12 +++--------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/giskard/rag/embeddings.py b/giskard/rag/embeddings.py index 4ef0b86978..50e904327d 100644 --- a/giskard/rag/embeddings.py +++ b/giskard/rag/embeddings.py @@ -1,3 +1,5 @@ +from typing import Sequence + from abc import ABC, abstractmethod import numpy as np @@ -15,7 +17,7 @@ def __init__(self, model: str = "text-embedding-ada-002", client=None): self.model = model self._client = client if client is not None else OpenAI() - def embed_text(self, text: str) -> str: + def embed_text(self, text: str) -> np.array: text = text.replace("\n", " ") try: out = self._client.embeddings.create(input=[text], model=self.model) @@ -23,3 +25,12 @@ def embed_text(self, text: str) -> str: except Exception as err: raise ValueError(f"Embedding creation failed for text: {text}.") from err return np.array(embeddings) + + def embed_documents(self, documents: Sequence) -> np.array: + text_batch = [doc.page_content.replace("\n", " ") for doc in documents] + try: + out = self._client.embeddings.create(input=text_batch, model=self.model) + embeddings = [element.embedding for element in out.data] + except Exception as err: + raise ValueError("Batched embedding creation failed.") from err + return np.stack(embeddings) diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index 496547ac18..f6cfab54b0 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -1,3 +1,5 @@ +from typing import Sequence + import numpy as np import pandas as pd @@ -24,6 +26,7 @@ def __init__( context_window_length: int = 8192, embedding_model: EmbeddingsBase = None, language: str = "english", + knowledge_base_features: Sequence[str] = None, *args, **kwargs, ): @@ -33,13 +36,11 @@ def __init__( self.context_neighbors = context_neighbors self.context_similarity_threshold = context_similarity_threshold - # ideally should be moved into llm_client object but OpenAI has no API to retrieve - # model context length self.context_window_length = context_window_length self.embedding_model = embedding_model if embedding_model is not None else OpenAIEmbeddings() self.language = language - self.knowledge_base = VectorStore.from_df(knowledge_df, self.embedding_model) + self.knowledge_base = VectorStore.from_df(knowledge_df, self.embedding_model, features=knowledge_base_features) def _make_generate_input_functions(self, return_attribute_name): return [ @@ -69,7 +70,6 @@ def _generate_question_from_context(self, context): model_description=self.model_description, language=self.language, ) - prompt = self._prevent_context_window_overflow(prompt) return self._llm_complete(prompt, self._make_generate_input_functions("question")) @@ -87,8 +87,6 @@ def _extract_seed_context(self): ) if score < self.context_similarity_threshold # should we keep it or not ? 
] - - print(f"Retrieved {len(relevant_contexts)} relevant contexts.") return relevant_contexts def _format_context(self, contexts): @@ -101,7 +99,7 @@ def _prevent_context_window_overflow(self, prompt): # Prevent context overflow # general rule of thumbs to count tokens: 1 token ~ 4 characters # https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them - return prompt[: self.context_window_length // 4] + return prompt[: self.context_window_length * 4] def _llm_complete(self, prompt, functions): try: @@ -118,7 +116,7 @@ def _llm_complete(self, prompt, functions): return generated - def generate_testset(self, num_samples: int = 10) -> Dataset: + def generate_dataset(self, num_samples: int = 10) -> Dataset: generated_questions = [] for idx in range(num_samples): seed_contexts = self._extract_seed_context() diff --git a/giskard/rag/vector_store.py b/giskard/rag/vector_store.py index afcea4f634..6e80c0a3b7 100644 --- a/giskard/rag/vector_store.py +++ b/giskard/rag/vector_store.py @@ -41,18 +41,12 @@ def __init__(self, documents: Sequence[Document], embeddings: np.array, embeddin def from_df(cls, df: pd.DataFrame, embedding_model: EmbeddingsBase, features: Sequence[str] = None): if len(df) > 0: documents = [Document(knowledge_chunk, features=features) for knowledge_chunk in df.to_dict("records")] - try: - embeddings = np.stack( - [embedding_model.embed_text(document.page_content) for document in documents] - ).astype("float32") - except Exception as err: - raise ValueError("Failed to embed the list of documents.") from err - + embeddings = embedding_model.embed_documents(documents).astype("float32") return cls(documents, embeddings, embedding_model) else: raise ValueError("Cannot generate a vector store from empty DataFrame.") def similarity_search_with_score(self, query, k): - query_emb = self.embedding_model.embed_text(query) - distances, indices = self.index.search(query_emb[None, :], k=k) + query_emb = self.embedding_model.embed_text(query).astype("float32") + distances, indices = self.index.search(query_emb[None, :], k) return [(self.documents[i], d) for d, i in zip(distances[0], indices[0])] From baa17ce6cf7b61dc09d6bb247ccf6c9070b41e57 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 17 Jan 2024 14:51:02 +0100 Subject: [PATCH 13/88] Update question prompt to enfore generation of only one question --- giskard/rag/prompts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/giskard/rag/prompts.py b/giskard/rag/prompts.py index 3c0d9228c8..2561d61032 100644 --- a/giskard/rag/prompts.py +++ b/giskard/rag/prompts.py @@ -13,7 +13,7 @@ Remember you are a client of {model_name}, you are looking for information to help you with your shopping. -Please call the `generate_inputs` function with the generated inputs. +Please call the `generate_inputs` function with the generated inputs. You must generate 1 input. """ ANSWER_GENERATION_PROMPT = """Your task is to answer a question based on a provided context. 
From 1991fa2e011b2a8d4d3d2edfdcaf7a44604622e7 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 17 Jan 2024 14:51:21 +0100 Subject: [PATCH 14/88] Update tests --- tests/rag/test_knowledge_base_testset_generator.py | 10 +++++----- tests/rag/test_vector_store.py | 7 +++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/rag/test_knowledge_base_testset_generator.py b/tests/rag/test_knowledge_base_testset_generator.py index dedbe59cc4..96d7553b3b 100644 --- a/tests/rag/test_knowledge_base_testset_generator.py +++ b/tests/rag/test_knowledge_base_testset_generator.py @@ -68,10 +68,10 @@ def test_testset_generation(): embedding_model = Mock() # evenly spaced embeddings for the knowledge base elements and specifically chosen embeddings for # each mock embedding calls. - embedding_model.embed_text.side_effect = [np.ones(embedding_dimension) * idx / 100 for idx in range(4)] + [ - np.ones(8) * 0.02, - np.ones(8) * 10, - ] + kb_embeddings = np.ones((4, embedding_dimension)) * np.arange(4)[:, None] / 100 + query_embeddings = np.ones((2, embedding_dimension)) * np.array([0.02, 10])[:, None] + embedding_model.embed_documents.side_effect = [kb_embeddings] + embedding_model.embed_text.side_effect = list(query_embeddings) knowledge_base_df = make_knowledge_base_df() testset_generator = KnowledgeBaseTestsetGenerator( @@ -90,7 +90,7 @@ def test_testset_generation(): "Scamorza is a Southern Italian cow's milk cheese." ) - test_set = testset_generator.generate_testset(num_samples=2) + test_set = testset_generator.generate_dataset(num_samples=2) assert len(test_set) == 2 assert test_set.loc[0, "question"] == "Where is Camembert from?" assert test_set.loc[0, "reference_answer"] == "Camembert was created in Normandy, in the northwest of France." 
diff --git a/tests/rag/test_vector_store.py b/tests/rag/test_vector_store.py index 86f4c53c3d..6ce88746c8 100644 --- a/tests/rag/test_vector_store.py +++ b/tests/rag/test_vector_store.py @@ -35,8 +35,8 @@ def test_vector_store_creation_from_df(): df = pd.DataFrame(["This is a test string"] * 5) embedding_model = Mock() - random_embedding = np.random.rand(dimension) - embedding_model.embed_text.side_effect = [random_embedding] * 5 + random_embedding = np.random.rand(5, dimension) + embedding_model.embed_documents.side_effect = [random_embedding] store = VectorStore.from_df(df, embedding_model) assert store.index.d == dimension @@ -44,7 +44,7 @@ def test_vector_store_creation_from_df(): assert len(store.documents) == 5 assert store.index.ntotal == 5 - assert np.allclose(store.embeddings[0], random_embedding) + assert np.allclose(store.embeddings, random_embedding) def test_vector_store_similarity_search_with_score(): @@ -59,7 +59,6 @@ def test_vector_store_similarity_search_with_score(): query = "This is test string 50" retrieved_elements = store.similarity_search_with_score(query, k=3) - print([(ret.page_content, score) for (ret, score) in retrieved_elements]) assert len(retrieved_elements) == 3 assert retrieved_elements[0][0].page_content == "This is test string 50" assert retrieved_elements[0][1] == 0.0 From ed6e608c6d2457aad310443fe7f72d1a59282c16 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Thu, 11 Jan 2024 11:29:24 +0100 Subject: [PATCH 15/88] Add a flag to handle control characters inside LLM response decoding --- giskard/llm/client/openai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/giskard/llm/client/openai.py b/giskard/llm/client/openai.py index 23482c1fd3..68af5282b7 100644 --- a/giskard/llm/client/openai.py +++ b/giskard/llm/client/openai.py @@ -65,7 +65,7 @@ def complete( try: function_call = LLMFunctionCall( function=fc["name"], - args=json.loads(fc["arguments"]), + args=json.loads(fc["arguments"], strict=False), ) except (json.JSONDecodeError, KeyError) as err: raise LLMGenerationError("Could not parse function call") from err From 41631a1246dda0b5d9aa9e84e3afdf68c6d0f854 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Thu, 18 Jan 2024 13:53:45 +0100 Subject: [PATCH 16/88] Fix string in __all__ --- giskard/rag/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/giskard/rag/__init__.py b/giskard/rag/__init__.py index 8cf4f4881c..50c3c750bb 100644 --- a/giskard/rag/__init__.py +++ b/giskard/rag/__init__.py @@ -1,3 +1,3 @@ from .knowledge_base_testset_generator import KnowledgeBaseTestsetGenerator -__all__ = [KnowledgeBaseTestsetGenerator] +__all__ = ["KnowledgeBaseTestsetGenerator"] From 61eaa6105188c173372ebd83598df4e56c129e54 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Thu, 18 Jan 2024 18:50:09 +0100 Subject: [PATCH 17/88] Add LLM correctness test --- giskard/testing/tests/llm/__init__.py | 2 ++ giskard/testing/tests/llm/correctness.py | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 giskard/testing/tests/llm/correctness.py diff --git a/giskard/testing/tests/llm/__init__.py b/giskard/testing/tests/llm/__init__.py index b9330301c9..a660816c23 100644 --- a/giskard/testing/tests/llm/__init__.py +++ b/giskard/testing/tests/llm/__init__.py @@ -1,3 +1,4 @@ +from .correctness import test_llm_correctness from .ground_truth import test_llm_ground_truth, test_llm_ground_truth_similarity from .hallucination import test_llm_output_coherency, test_llm_output_plausibility from 
.injections import ( @@ -24,4 +25,5 @@ "test_llm_output_against_strings", "test_llm_ground_truth_similarity", "test_llm_ground_truth", + "test_llm_correctness", ] diff --git a/giskard/testing/tests/llm/correctness.py b/giskard/testing/tests/llm/correctness.py new file mode 100644 index 0000000000..1008bdde82 --- /dev/null +++ b/giskard/testing/tests/llm/correctness.py @@ -0,0 +1,24 @@ +from ....core.test_result import TestResult +from ....datasets.base import Dataset +from ....llm.evaluators import CorrectnessEvaluator +from ....models.base import BaseModel +from ....registry.decorators import test +from .. import debug_description_prefix + + +@test( + name="LLM Correctness from knowledge base", + tags=["llm", "llm-as-a-judge"], + debug_description=debug_description_prefix + "that are failing the evaluation criteria.", +) +def test_llm_correctness(model: BaseModel, dataset: Dataset): + correctness_evaluator = CorrectnessEvaluator() + eval_result = correctness_evaluator.evaluate(model, dataset) + + return TestResult( + passed=eval_result.passed, + metric=len(eval_result.failure_examples), + metric_name="Failing examples", + is_error=eval_result.has_errors, + details=eval_result.details, + ) From 948a2bd659150586f6cc573a5b00dab9391c42a4 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Thu, 18 Jan 2024 18:53:21 +0100 Subject: [PATCH 18/88] Add Testset dataset wrapper to build test suite --- giskard/rag/__init__.py | 3 ++- giskard/rag/knowledge_base_testset_generator.py | 12 +++++++----- giskard/rag/testset.py | 10 ++++++++++ 3 files changed, 19 insertions(+), 6 deletions(-) create mode 100644 giskard/rag/testset.py diff --git a/giskard/rag/__init__.py b/giskard/rag/__init__.py index 50c3c750bb..52f240a99e 100644 --- a/giskard/rag/__init__.py +++ b/giskard/rag/__init__.py @@ -1,3 +1,4 @@ from .knowledge_base_testset_generator import KnowledgeBaseTestsetGenerator +from .testset import TestSet -__all__ = ["KnowledgeBaseTestsetGenerator"] +__all__ = ["KnowledgeBaseTestsetGenerator", "TestSet"] diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index f6cfab54b0..28a90c196a 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -3,11 +3,11 @@ import numpy as np import pandas as pd -from ..datasets import Dataset from ..llm.errors import LLMGenerationError from ..llm.generators import BaseDataGenerator from .embeddings import EmbeddingsBase, OpenAIEmbeddings from .prompts import ANSWER_GENERATION_PROMPT, QUESTION_GENERATION_PROMPT +from .testset import TestSet from .vector_store import VectorStore @@ -25,8 +25,9 @@ def __init__( context_similarity_threshold: float = 0.2, context_window_length: int = 8192, embedding_model: EmbeddingsBase = None, - language: str = "english", + language: str = "en", knowledge_base_features: Sequence[str] = None, + seed: int = None, *args, **kwargs, ): @@ -39,6 +40,7 @@ def __init__( self.context_window_length = context_window_length self.embedding_model = embedding_model if embedding_model is not None else OpenAIEmbeddings() self.language = language + self.rng = np.random.default_rng(seed=seed) self.knowledge_base = VectorStore.from_df(knowledge_df, self.embedding_model, features=knowledge_base_features) @@ -79,7 +81,7 @@ def _generate_answer_from_context(self, question, context): return self._llm_complete(prompt, self._make_generate_input_functions("answer")) def _extract_seed_context(self): - seed_context = 
np.random.choice(self.knowledge_base.documents) + seed_context = self.rng.choice(self.knowledge_base.documents) relevant_contexts = [ context for (context, score) in self.knowledge_base.similarity_search_with_score( @@ -116,7 +118,7 @@ def _llm_complete(self, prompt, functions): return generated - def generate_dataset(self, num_samples: int = 10) -> Dataset: + def generate_dataset(self, num_samples: int = 10) -> TestSet: generated_questions = [] for idx in range(num_samples): seed_contexts = self._extract_seed_context() @@ -134,4 +136,4 @@ def generate_dataset(self, num_samples: int = 10) -> Dataset: } ) - return pd.DataFrame(generated_questions) + return TestSet(df=pd.DataFrame(generated_questions)) diff --git a/giskard/rag/testset.py b/giskard/rag/testset.py new file mode 100644 index 0000000000..01d8537bcc --- /dev/null +++ b/giskard/rag/testset.py @@ -0,0 +1,10 @@ +from .. import Dataset, Suite +from ..testing.tests.llm import test_llm_correctness + + +class TestSet(Dataset): + def to_test_suite(self): + suite_default_params = {"dataset": self} + suite = Suite(name="Test suite generated from testset", default_params=suite_default_params) + suite.add_test(test_llm_correctness, "TestsetCorrectnessTest", "TestsetCorrectnessTest") + return suite From 56758cbb4d08f899f18dad03a0dbd0f518b76999 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Thu, 18 Jan 2024 18:54:30 +0100 Subject: [PATCH 19/88] Fix validation of feature name between testset, model and evaluator --- giskard/llm/evaluators/__init__.py | 9 +++++- giskard/llm/evaluators/correctness.py | 40 ++++++++++++++++++++------- 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/giskard/llm/evaluators/__init__.py b/giskard/llm/evaluators/__init__.py index 3606c0161b..c12d57e56b 100644 --- a/giskard/llm/evaluators/__init__.py +++ b/giskard/llm/evaluators/__init__.py @@ -1,5 +1,12 @@ from .coherency import CoherencyEvaluator +from .correctness import CorrectnessEvaluator from .plausibility import PlausibilityEvaluator from .requirements import PerRowRequirementEvaluator, RequirementEvaluator -__all__ = ["CoherencyEvaluator", "RequirementEvaluator", "PerRowRequirementEvaluator", "PlausibilityEvaluator"] +__all__ = [ + "CoherencyEvaluator", + "RequirementEvaluator", + "PerRowRequirementEvaluator", + "PlausibilityEvaluator", + "CorrectnessEvaluator", +] diff --git a/giskard/llm/evaluators/correctness.py b/giskard/llm/evaluators/correctness.py index 4fe6c66e51..548287a69e 100644 --- a/giskard/llm/evaluators/correctness.py +++ b/giskard/llm/evaluators/correctness.py @@ -1,5 +1,3 @@ -from typing import Sequence - from giskard.datasets import Dataset from giskard.llm.errors import LLMGenerationError from giskard.llm.evaluators.base import ( @@ -41,7 +39,8 @@ class CorrectnessEvaluator(LLMBasedEvaluator): _default_eval_prompt = CORRECTNESS_EVALUATION_PROMPT - _required_features = ["question", "reference_answer"] + _question_feature_name = "question" + _reference_answer_feature_name = "reference_answer" def _make_evaluate_functions(self): return EVALUATE_MODEL_FUNCTIONS @@ -55,14 +54,32 @@ def _make_evaluate_prompt(self, model_name, model_description, question, model_o ground_truth=ground_truth, ) - def evaluate(self, model: BaseModel, dataset: Dataset, feature_names: Sequence = None): - feature_names = self._required_features if feature_names is None else feature_names + def evaluate( + self, + model: BaseModel, + dataset: Dataset, + question_feature_name: str = None, + reference_answer_feature_name: str = None, + ): + 
question_feature_name = ( + question_feature_name if question_feature_name is not None else self._question_feature_name + ) + reference_answer_feature_name = ( + reference_answer_feature_name + if reference_answer_feature_name is not None + else self._reference_answer_feature_name + ) + qa_feature_names = [question_feature_name, reference_answer_feature_name] - if any([name not in dataset.df for name in feature_names]): - raise ValueError(f"Missing at least one required feature in the evaluation dataset among: {feature_names}.") + # question and reference_answer feature names must be present in the dataset + if not (question_feature_name in dataset.df and reference_answer_feature_name in dataset.df): + raise ValueError( + f"Missing at least one required feature in the evaluation dataset among: {qa_feature_names}." + ) - if any([name not in model.feature_names for name in feature_names]): - raise ValueError(f"Missing at least one required feature in the evaluated model among: {feature_names}.") + # question feature name must be inside model's features + if question_feature_name not in model.feature_names: + raise ValueError(f"Missing question feature: {question_feature_name} inside model's features.") model_outputs = model.predict(dataset).prediction succeeded = [] @@ -71,7 +88,10 @@ def evaluate(self, model: BaseModel, dataset: Dataset, feature_names: Sequence = for evaluation_question, model_output in zip(dataset.df.to_dict("records"), model_outputs): try: passed, reason = self._evaluate_single( - model, evaluation_question[feature_names[0]], evaluation_question[feature_names[1]], model_output + model, + evaluation_question[question_feature_name], + evaluation_question[reference_answer_feature_name], + model_output, ) sample = { **evaluation_question, From b811dbd4e977b12e3a0f695f448fda42222ed8bb Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Fri, 19 Jan 2024 18:26:05 +0100 Subject: [PATCH 20/88] Add threshold and failure examples in correctness test output --- giskard/testing/tests/llm/correctness.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/giskard/testing/tests/llm/correctness.py b/giskard/testing/tests/llm/correctness.py index 1008bdde82..a0460b56bb 100644 --- a/giskard/testing/tests/llm/correctness.py +++ b/giskard/testing/tests/llm/correctness.py @@ -11,14 +11,20 @@ tags=["llm", "llm-as-a-judge"], debug_description=debug_description_prefix + "that are failing the evaluation criteria.", ) -def test_llm_correctness(model: BaseModel, dataset: Dataset): +def test_llm_correctness(model: BaseModel, dataset: Dataset, threshold: float = 0.5): correctness_evaluator = CorrectnessEvaluator() - eval_result = correctness_evaluator.evaluate(model, dataset) + eval_result, failed_idx = correctness_evaluator.evaluate(model, dataset) + output_ds = list() + if not eval_result.passed: + output_ds.append(dataset.slice(lambda df: df.loc[failed_idx], row_level=False)) + + passed = bool(eval_result.passed_ratio > threshold) return TestResult( - passed=eval_result.passed, - metric=len(eval_result.failure_examples), - metric_name="Failing examples", + passed=passed, + metric=eval_result.passed_ratio, + metric_name="Failing examples ratio", is_error=eval_result.has_errors, details=eval_result.details, + output_ds=output_ds, ) From b579e9004b6508ba8801a8014a4280562bf22f14 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Fri, 19 Jan 2024 18:31:56 +0100 Subject: [PATCH 21/88] Add testset to test suite convertion test + minor fix to generator test --- 
.../test_knowledge_base_testset_generator.py | 12 +++---- tests/rag/test_testset_suite_conversion.py | 35 +++++++++++++++++++ 2 files changed, 41 insertions(+), 6 deletions(-) create mode 100644 tests/rag/test_testset_suite_conversion.py diff --git a/tests/rag/test_knowledge_base_testset_generator.py b/tests/rag/test_knowledge_base_testset_generator.py index 96d7553b3b..1caad22a6b 100644 --- a/tests/rag/test_knowledge_base_testset_generator.py +++ b/tests/rag/test_knowledge_base_testset_generator.py @@ -92,10 +92,10 @@ def test_testset_generation(): test_set = testset_generator.generate_dataset(num_samples=2) assert len(test_set) == 2 - assert test_set.loc[0, "question"] == "Where is Camembert from?" - assert test_set.loc[0, "reference_answer"] == "Camembert was created in Normandy, in the northwest of France." - assert test_set.loc[0, "reference_context"] == CONTEXT_STRING - assert test_set.loc[0, "difficulty_level"] == 1 + assert test_set.df.loc[0, "question"] == "Where is Camembert from?" + assert test_set.df.loc[0, "reference_answer"] == "Camembert was created in Normandy, in the northwest of France." + assert test_set.df.loc[0, "reference_context"] == CONTEXT_STRING + assert test_set.df.loc[0, "difficulty_level"] == 1 - assert test_set.loc[1, "question"] == "Where is Camembert from?" - assert test_set.loc[1, "reference_context"] == "" + assert test_set.df.loc[1, "question"] == "Where is Camembert from?" + assert test_set.df.loc[1, "reference_context"] == "" diff --git a/tests/rag/test_testset_suite_conversion.py b/tests/rag/test_testset_suite_conversion.py new file mode 100644 index 0000000000..9b1526e87f --- /dev/null +++ b/tests/rag/test_testset_suite_conversion.py @@ -0,0 +1,35 @@ +import pandas as pd + +from giskard.rag import TestSet + + +def make_testset_df(): + return pd.DataFrame( + [ + { + "question": "Which milk is used to make Camembert?", + "reference_answer": "Cow's milk is used to make Camembert.", + "reference_context": "Camembert is a moist, soft, creamy, surface-ripened cow's milk cheese.", + }, + { + "question": "Where is Scarmorza from?", + "reference_answer": "Scarmorza is from Southern Italy.", + "reference_context": "Scamorza is a Southern Italian cow's milk cheese.", + }, + ] + ) + + +def test_testset_suite_conversion(): + testset = TestSet(df=make_testset_df()) + suite = testset.to_test_suite() + + assert "dataset" in suite.default_params + assert suite.default_params["dataset"].df.loc[0, "question"] == "Which milk is used to make Camembert?" + assert ( + suite.default_params["dataset"].df.loc[1, "reference_context"] + == "Scamorza is a Southern Italian cow's milk cheese." 
+ ) + + assert len(suite.tests) == 1 + assert suite.tests[0].display_name == "TestsetCorrectnessTest" From 43395a4a8f4275eaae2fa79d94f05822d38c6b59 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Fri, 19 Jan 2024 18:33:03 +0100 Subject: [PATCH 22/88] Add failed indices inside evaluator's outputs --- giskard/llm/evaluators/correctness.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/giskard/llm/evaluators/correctness.py b/giskard/llm/evaluators/correctness.py index 548287a69e..7b046885d3 100644 --- a/giskard/llm/evaluators/correctness.py +++ b/giskard/llm/evaluators/correctness.py @@ -85,7 +85,8 @@ def evaluate( succeeded = [] failed = [] errored = [] - for evaluation_question, model_output in zip(dataset.df.to_dict("records"), model_outputs): + failed_index = [] + for idx, (evaluation_question, model_output) in enumerate(zip(dataset.df.to_dict("records"), model_outputs)): try: passed, reason = self._evaluate_single( model, @@ -102,14 +103,18 @@ def evaluate( if passed: succeeded.append(sample) else: + failed_index.append(idx) failed.append(sample) except LLMGenerationError as err: errored.append({"message": str(err), "sample": {**evaluation_question, "model_output": model_output}}) - return EvaluationResult( - failure_examples=failed, - success_examples=succeeded, - errors=errored, + return ( + EvaluationResult( + failure_examples=failed, + success_examples=succeeded, + errors=errored, + ), + failed_index, ) def _evaluate_single(self, model: BaseModel, question, reference_answer, model_output): From 071307aa6a8ce99932f9c1a9ba391af6900a3442 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Fri, 19 Jan 2024 19:11:02 +0100 Subject: [PATCH 23/88] Add some documentation --- giskard/llm/evaluators/correctness.py | 4 ++ giskard/rag/embeddings.py | 12 +++- .../rag/knowledge_base_testset_generator.py | 59 ++++++++++++++++++- giskard/rag/testset.py | 4 ++ giskard/testing/tests/llm/correctness.py | 19 ++++++ 5 files changed, 96 insertions(+), 2 deletions(-) diff --git a/giskard/llm/evaluators/correctness.py b/giskard/llm/evaluators/correctness.py index 7b046885d3..1bc2768dda 100644 --- a/giskard/llm/evaluators/correctness.py +++ b/giskard/llm/evaluators/correctness.py @@ -38,6 +38,10 @@ class CorrectnessEvaluator(LLMBasedEvaluator): + """Correctness evaluator class: assess the correctness of a model answers + given questions and associated reference answers. + """ + _default_eval_prompt = CORRECTNESS_EVALUATION_PROMPT _question_feature_name = "question" _reference_answer_feature_name = "reference_answer" diff --git a/giskard/rag/embeddings.py b/giskard/rag/embeddings.py index 50e904327d..dad52ae597 100644 --- a/giskard/rag/embeddings.py +++ b/giskard/rag/embeddings.py @@ -5,14 +5,24 @@ import numpy as np from openai import OpenAI +from .vector_store import Document + class EmbeddingsBase(ABC): + """Base class to build custom embedding models.""" + @abstractmethod def embed_text(self, text: str) -> str: ... + @abstractmethod + def embed_documents(self, documents: Sequence[Document]) -> str: + ... 
+ class OpenAIEmbeddings(EmbeddingsBase): + """Simple wrapper around the OpenAI embeddings API.""" + def __init__(self, model: str = "text-embedding-ada-002", client=None): self.model = model self._client = client if client is not None else OpenAI() @@ -26,7 +36,7 @@ def embed_text(self, text: str) -> np.array: raise ValueError(f"Embedding creation failed for text: {text}.") from err return np.array(embeddings) - def embed_documents(self, documents: Sequence) -> np.array: + def embed_documents(self, documents: Sequence[Document]) -> np.array: text_batch = [doc.page_content.replace("\n", " ") for doc in documents] try: out = self._client.embeddings.create(input=text_batch, model=self.model) diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index 28a90c196a..376f7ec6db 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -12,13 +12,52 @@ class KnowledgeBaseTestsetGenerator(BaseDataGenerator): + """Testset generator for testing RAG models. + + Explore a given knowledge base and generate question/answer pairs to test the model. + + Each generated item contains the following field + - question: a question about a part of the knowledge base + - reference_answer: the expected answer according to the knowledge base + - reference_context: relevant elements directly extracted from the knowledge base + - difficulty_level: an indicator of how difficult the question is + + Parameters + ---------- + knowledge_df: pd.DataFrame + a dataframe containing the whole knowledge base + model_name: str + name of the model to be tested + model_description: str + a description of the model to be tested, to get more fitting questions + context_neighbors: int + the maximum number of extracted element from the knowledge base to get a relevant context for question generation + context_similarity_threshold: float = 0.2 + a similarity threshold to filter irrelevant element from the knowledge base during context creation + context_window_length: int = 8192 + context window length of the llm used in the `llm_client` of the generator + embedding_model: EmbeddingsBase = None + an embedding model to build the knowledge base index + language: str = "en" + the language in which question are generated (following ISO 639-1) + knowledge_base_features: Sequence[str] = None + a list of columns from the `knowledge_df` to include inside the knowledge base. If the + `knowledge_df` only has one column, it will be used by default has the content of + the knowledge base elements. If `knowledge_df` has multiple columns they will be + concatenated into a single column with the name of the column before the respective content. + If `knowledge_base_features` is specified, only the columns from it are considered. + + Example: "col_1: content column 1, col_2: content column 2" + seed: int = None + """ + _question_generation_prompt = QUESTION_GENERATION_PROMPT _answer_generation_prompt = ANSWER_GENERATION_PROMPT _difficulty_level = 1 def __init__( self, - knowledge_df, + knowledge_df: pd.DataFrame, model_name: str, model_description: str, context_neighbors: int = 4, @@ -119,6 +158,24 @@ def _llm_complete(self, prompt, functions): return generated def generate_dataset(self, num_samples: int = 10) -> TestSet: + """Generates a testset from the knowledge base. + + Parameters + ---------- + num_samples : int + The number of question to generate, by default 10. + + Returns + ------- + TestSet + The generated test set. 
+ + Each generated question has the following field: + - question: a question about a part of the knowledge base + - reference_answer: the expected answer according to the knowledge base + - reference_context: relevant elements directly extracted from the knowledge base + - difficulty_level: an indicator of how difficult the question is + """ generated_questions = [] for idx in range(num_samples): seed_contexts = self._extract_seed_context() diff --git a/giskard/rag/testset.py b/giskard/rag/testset.py index 01d8537bcc..52188e2f85 100644 --- a/giskard/rag/testset.py +++ b/giskard/rag/testset.py @@ -3,6 +3,10 @@ class TestSet(Dataset): + """A wrapper class around `Dataset` to allow automatic creation + of a `Suite` based on the question/answer pairs inside the `TestSet`. + """ + def to_test_suite(self): suite_default_params = {"dataset": self} suite = Suite(name="Test suite generated from testset", default_params=suite_default_params) diff --git a/giskard/testing/tests/llm/correctness.py b/giskard/testing/tests/llm/correctness.py index a0460b56bb..27c08c7827 100644 --- a/giskard/testing/tests/llm/correctness.py +++ b/giskard/testing/tests/llm/correctness.py @@ -12,6 +12,25 @@ debug_description=debug_description_prefix + "that are failing the evaluation criteria.", ) def test_llm_correctness(model: BaseModel, dataset: Dataset, threshold: float = 0.5): + """Tests if LLM answers are correct with respect to a known reference answers. + + The test is passed when the ratio of correct answers is higher than the + threshold. + + Parameters + ---------- + model : BaseModel + Model used to compute the test + dataset : Dataset + Dataset used to compute the test + threshold : float + The threshold value for the ratio of invariant rows. + + Returns + ------- + TestResult + A TestResult object containing the test result. + """ correctness_evaluator = CorrectnessEvaluator() eval_result, failed_idx = correctness_evaluator.evaluate(model, dataset) output_ds = list() From e2c0c65e8cb3523b31b7eb8449e01b3bf605dc72 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Mon, 22 Jan 2024 14:56:51 +0100 Subject: [PATCH 24/88] Move faiss dependency inside llm module --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a95adbd1f5..ab2ff7d1fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -174,7 +174,7 @@ dependencies = [ "markdown", # needed for display of scan results in notebook "colorama", # needed for the scan "griffe>=0.36.9", - "faiss-cpu>=1.7.4", + "uvloop>=0.19.0", ] [project.optional-dependencies] @@ -182,6 +182,8 @@ llm = [ "openai", "evaluate>=0.4.1", "bert-score>=0.3.13", + "faiss-cpu>=1.7.4", + ] hub = [ From 2d00ca6590f4af1493421d3dba99420ec13c7360 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Mon, 22 Jan 2024 14:57:45 +0100 Subject: [PATCH 25/88] Fix circular import and minor typing issue --- giskard/rag/embeddings.py | 10 ++++------ giskard/rag/vector_store.py | 8 +++++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/giskard/rag/embeddings.py b/giskard/rag/embeddings.py index dad52ae597..9d5e5da415 100644 --- a/giskard/rag/embeddings.py +++ b/giskard/rag/embeddings.py @@ -5,18 +5,16 @@ import numpy as np from openai import OpenAI -from .vector_store import Document - class EmbeddingsBase(ABC): """Base class to build custom embedding models.""" @abstractmethod - def embed_text(self, text: str) -> str: + def embed_text(self, text: str) -> np.ndarray: ... 
@abstractmethod - def embed_documents(self, documents: Sequence[Document]) -> str: + def embed_documents(self, documents: Sequence) -> np.ndarray: ... @@ -27,7 +25,7 @@ def __init__(self, model: str = "text-embedding-ada-002", client=None): self.model = model self._client = client if client is not None else OpenAI() - def embed_text(self, text: str) -> np.array: + def embed_text(self, text: str) -> np.ndarray: text = text.replace("\n", " ") try: out = self._client.embeddings.create(input=[text], model=self.model) @@ -36,7 +34,7 @@ def embed_text(self, text: str) -> np.array: raise ValueError(f"Embedding creation failed for text: {text}.") from err return np.array(embeddings) - def embed_documents(self, documents: Sequence[Document]) -> np.array: + def embed_documents(self, documents: Sequence) -> np.ndarray: text_batch = [doc.page_content.replace("\n", " ") for doc in documents] try: out = self._client.embeddings.create(input=text_batch, model=self.model) diff --git a/giskard/rag/vector_store.py b/giskard/rag/vector_store.py index 6e80c0a3b7..35254663f8 100644 --- a/giskard/rag/vector_store.py +++ b/giskard/rag/vector_store.py @@ -8,6 +8,8 @@ class Document: + """A class to wrap the elements of the knowledge base into a unified format.""" + def __init__(self, document: dict, features: Optional[Sequence] = None): if len(document) == 1: self.page_content = list(document.values())[0] @@ -23,6 +25,10 @@ def __init__(self, document: dict, features: Optional[Sequence] = None): class VectorStore: + """Stores all embedded Document of the knowledge base. + Relies on `FlatIndexL2` class from FAISS. + """ + def __init__(self, documents: Sequence[Document], embeddings: np.array, embedding_model: EmbeddingsBase): if len(embeddings) == 0 or len(documents) == 0: raise ValueError("Documents and embeddings must contains at least one element.") @@ -46,7 +52,7 @@ def from_df(cls, df: pd.DataFrame, embedding_model: EmbeddingsBase, features: Se else: raise ValueError("Cannot generate a vector store from empty DataFrame.") - def similarity_search_with_score(self, query, k): + def similarity_search_with_score(self, query: str, k: int) -> Sequence: query_emb = self.embedding_model.embed_text(query).astype("float32") distances, indices = self.index.search(query_emb[None, :], k) return [(self.documents[i], d) for d, i in zip(distances[0], indices[0])] From 68c7a68cd2687f24c3b15bd73b9c1ed9130221ad Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Mon, 22 Jan 2024 15:08:54 +0100 Subject: [PATCH 26/88] Add safe import of faiss and openai modules --- giskard/rag/embeddings.py | 9 ++++++++- giskard/rag/vector_store.py | 7 ++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/giskard/rag/embeddings.py b/giskard/rag/embeddings.py index 9d5e5da415..9fb0d65dd9 100644 --- a/giskard/rag/embeddings.py +++ b/giskard/rag/embeddings.py @@ -3,7 +3,8 @@ from abc import ABC, abstractmethod import numpy as np -from openai import OpenAI + +from ..core.errors import GiskardInstallationError class EmbeddingsBase(ABC): @@ -23,6 +24,12 @@ class OpenAIEmbeddings(EmbeddingsBase): def __init__(self, model: str = "text-embedding-ada-002", client=None): self.model = model + + try: + from openai import OpenAI + except ImportError as err: + raise GiskardInstallationError(flavor="llm") from err + self._client = client if client is not None else OpenAI() def embed_text(self, text: str) -> np.ndarray: diff --git a/giskard/rag/vector_store.py b/giskard/rag/vector_store.py index 35254663f8..b4d3eff0cf 100644 --- 
a/giskard/rag/vector_store.py +++ b/giskard/rag/vector_store.py @@ -2,8 +2,8 @@ import numpy as np import pandas as pd -from faiss import IndexFlatL2 +from ..core.errors import GiskardInstallationError from .embeddings import EmbeddingsBase @@ -35,6 +35,11 @@ def __init__(self, documents: Sequence[Document], embeddings: np.array, embeddin if len(embeddings) != len(documents): raise ValueError("Documents and embeddings must have the same length.") + try: + from faiss import IndexFlatL2 + except ImportError as err: + raise GiskardInstallationError(flavor="llm") from err + self.embeddings = embeddings self.documents = documents self.embedding_model = embedding_model From 39c7ea50db27d25040e5eb551bb494e67bd05c0b Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Mon, 22 Jan 2024 17:27:32 +0100 Subject: [PATCH 27/88] Fix broken test --- giskard/llm/evaluators/correctness.py | 2 +- tests/llm/evaluators/test_correctness_evaluator.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/giskard/llm/evaluators/correctness.py b/giskard/llm/evaluators/correctness.py index 1bc2768dda..91bdf86427 100644 --- a/giskard/llm/evaluators/correctness.py +++ b/giskard/llm/evaluators/correctness.py @@ -83,7 +83,7 @@ def evaluate( # question feature name must be inside model's features if question_feature_name not in model.feature_names: - raise ValueError(f"Missing question feature: {question_feature_name} inside model's features.") + raise ValueError(f"Missing question feature: '{question_feature_name}' inside model's features.") model_outputs = model.predict(dataset).prediction succeeded = [] diff --git a/tests/llm/evaluators/test_correctness_evaluator.py b/tests/llm/evaluators/test_correctness_evaluator.py index ee62285ebe..58e15820a2 100644 --- a/tests/llm/evaluators/test_correctness_evaluator.py +++ b/tests/llm/evaluators/test_correctness_evaluator.py @@ -63,7 +63,7 @@ def test_correctness_evaluator_correctly_flags_examples(): evaluator = CorrectnessEvaluator(llm_client=client) - result = evaluator.evaluate(model, dataset) + result, failed_indices = evaluator.evaluate(model, dataset) assert len(result.success_examples) == 1 assert len(result.failure_examples) == 1 @@ -112,7 +112,7 @@ def test_correctness_evaluator_handles_generation_errors(): evaluator = CorrectnessEvaluator(llm_client=client) - result = evaluator.evaluate(model, dataset) + result, failed_indices = evaluator.evaluate(model, dataset) assert len(result.success_examples) == 1 assert len(result.errors) == 1 @@ -134,8 +134,8 @@ def test_raises_error_if_missing_feature_in_dataset(): def test_raises_error_if_missing_feature_in_model(): dataset = _make_eval_dataset() - model = _make_mock_model(feature_names=["question"]) + model = _make_mock_model(feature_names=["reference_answer"]) evaluator = CorrectnessEvaluator(llm_client=Mock()) - with pytest.raises(ValueError, match="Missing at least one required feature in the evaluated model among"): + with pytest.raises(ValueError, match="Missing question feature: 'question' inside model's features."): evaluator.evaluate(model, dataset) From b8496c7fc761cc345eb752b9ae01bbea1388e43f Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Tue, 23 Jan 2024 12:41:40 +0100 Subject: [PATCH 28/88] Merge question and answer generation prompt and separate system instruction from context --- .../rag/knowledge_base_testset_generator.py | 62 ++++++++++--------- giskard/rag/prompts.py | 46 +++++++------- 2 files changed, 56 insertions(+), 52 deletions(-) diff --git 
a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index 376f7ec6db..41379687c0 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -1,12 +1,18 @@ from typing import Sequence +import json + import numpy as np import pandas as pd from ..llm.errors import LLMGenerationError from ..llm.generators import BaseDataGenerator from .embeddings import EmbeddingsBase, OpenAIEmbeddings -from .prompts import ANSWER_GENERATION_PROMPT, QUESTION_GENERATION_PROMPT +from .prompts import ( + QA_GENERATION_ASSISTANT_EXAMPLE, + QA_GENERATION_CONTEXT_EXAMPLE, + QA_GENERATION_SYSTEM_PROMPT, +) from .testset import TestSet from .vector_store import VectorStore @@ -51,8 +57,10 @@ class KnowledgeBaseTestsetGenerator(BaseDataGenerator): seed: int = None """ - _question_generation_prompt = QUESTION_GENERATION_PROMPT - _answer_generation_prompt = ANSWER_GENERATION_PROMPT + _qa_generation_system_prompt = QA_GENERATION_SYSTEM_PROMPT + _qa_generation_context_example = QA_GENERATION_CONTEXT_EXAMPLE + _qa_generation_assistant_example = QA_GENERATION_ASSISTANT_EXAMPLE + _difficulty_level = 1 def __init__( @@ -67,6 +75,7 @@ def __init__( language: str = "en", knowledge_base_features: Sequence[str] = None, seed: int = None, + include_examples: bool = True, *args, **kwargs, ): @@ -80,6 +89,7 @@ def __init__( self.embedding_model = embedding_model if embedding_model is not None else OpenAIEmbeddings() self.language = language self.rng = np.random.default_rng(seed=seed) + self.include_examples = include_examples self.knowledge_base = VectorStore.from_df(knowledge_df, self.embedding_model, features=knowledge_base_features) @@ -104,20 +114,19 @@ def _make_generate_input_functions(self, return_attribute_name): } ] - def _generate_question_from_context(self, context): - prompt = self._question_generation_prompt.format( - context=context, - model_name=self.model_name, - model_description=self.model_description, - language=self.language, - ) - prompt = self._prevent_context_window_overflow(prompt) - return self._llm_complete(prompt, self._make_generate_input_functions("question")) - - def _generate_answer_from_context(self, question, context): - prompt = self._answer_generation_prompt.format(question=question, context=context) - prompt = self._prevent_context_window_overflow(prompt) - return self._llm_complete(prompt, self._make_generate_input_functions("answer")) + def _generate_question_answer_from_context(self, context): + messages = [{"role": "system", "content": self._qa_generation_system_prompt}] + if self.include_examples: + messages.extend( + [ + {"role": "user", "content": self._qa_generation_context_example}, + {"role": "assistant", "content": self._qa_generation_assistant_example}, + ] + ) + messages.append({"role": "user", "content": context}) + + generated_qa = self._llm_complete(messages=messages) + return generated_qa["question"], generated_qa["answer"] def _extract_seed_context(self): seed_context = self.rng.choice(self.knowledge_base.documents) @@ -131,9 +140,7 @@ def _extract_seed_context(self): return relevant_contexts def _format_context(self, contexts): - context_string = "\n\n".join( - ["### Context {} ###\n{}\n######".format(idx + 1, c.page_content) for idx, c in enumerate(contexts)] - ) + context_string = "\n------\n".join(["", *[doc.page_content for doc in contexts], ""]) return context_string def _prevent_context_window_overflow(self, prompt): @@ -142,16 +149,14 @@ def 
_prevent_context_window_overflow(self, prompt): # https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them return prompt[: self.context_window_length * 4] - def _llm_complete(self, prompt, functions): + def _llm_complete(self, messages): try: out = self.llm_client.complete( - messages=[{"role": "system", "content": prompt}], - functions=functions, - function_call={"name": "generate_inputs"}, + messages=messages, temperature=self.llm_temperature, caller_id=self.__class__.__name__, ) - generated = out.function_call.args["inputs"] + generated = json.loads(out.message, strict=False) except (AttributeError, KeyError) as err: raise LLMGenerationError("Could not parse generated inputs") from err @@ -181,13 +186,12 @@ def generate_dataset(self, num_samples: int = 10) -> TestSet: seed_contexts = self._extract_seed_context() context = self._format_context(seed_contexts) - question = self._generate_question_from_context(context)[0] - answer = self._generate_answer_from_context(question["question"], context)[0] + question, answer = self._generate_question_answer_from_context(context) generated_questions.append( { - "question": question["question"], - "reference_answer": answer["answer"], + "question": question, + "reference_answer": answer, "reference_context": context, "difficulty_level": self._difficulty_level, } diff --git a/giskard/rag/prompts.py b/giskard/rag/prompts.py index 2561d61032..3464b7656c 100644 --- a/giskard/rag/prompts.py +++ b/giskard/rag/prompts.py @@ -1,33 +1,33 @@ -QUESTION_GENERATION_PROMPT = """You are a client from an online shop called {model_name}. Shop description: {model_description} -You are looking for information about specific products that are sold on by this shop and about the shop's activities. +QA_GENERATION_SYSTEM_PROMPT = """You are a powerful auditing AI, your role is to generate question answer pair from a given list of context paragraph to audit a model specialized on these knowledge. + +The model you are auditing is the following: +- Model name: {model_name} +- Model description: {model_description} Your task is to generate questions about the products, the ordering process and the shop's activities in general. Your question must be related to a provided context. Please respect the following rules to generate the question: -- The answer to the question should be found, at least partially, inside the provided context. -- The question must be self-contained and understandable by humans. -- The question must be in {language}. +- The answer to the question should be found inside the provided context +- The question must be self-contained +- The question must be in English -Here is the context: - -{context} - +You will be provided the context, consisting in multiple paragraphs delimited by dashes "------". +You will return the question and the precise answer to the question based exclusively on the provided context. +Your output should be a single JSON object, with keys 'question' and 'answer'.""" -Remember you are a client of {model_name}, you are looking for information to help you with your shopping. -Please call the `generate_inputs` function with the generated inputs. You must generate 1 input. -""" +QA_GENERATION_ASSISTANT_EXAMPLE = """{ + "question": "For which countries can I track my shipping?", + "answer": "We ship to all 50 states in the US, as well as to Canada and Mexico. We offer tracking for all our shippings." +}""" -ANSWER_GENERATION_PROMPT = """Your task is to answer a question based on a provided context. 
-The answer should be clear and concise. Think step by step and answer the question thoroughly. -Your answer must only contain information provided by the context. +QA_GENERATION_CONTEXT_EXAMPLE = """What payment methods do you accept? -Here is the context and the question: - -{context} - +We accept a variety of payment methods to provide our customers with a convenient and secure shopping experience. You can make a purchase using major credit and debit cards, including Visa, Mastercard, American Express, and Discover. We also offer the option to pay with popular digital wallets such as PayPal and Google Pay. For added flexibility, you can choose to complete your order using bank transfers or wire transfers. Rest assured that we prioritize the security of your personal information and go the extra mile to ensure your transactions are processed safely. +------ +What is your shipping policy? - -{question} - +We offer free shipping on all orders over $50. For orders below $50, we charge a flat rate of $5.99. We offer shipping services to customers residing in all 50 states of the US, in addition to providing delivery options to Canada and Mexico. +------ +How can I track my order? -Please call the `generate_inputs` function with the generated inputs. +Tracking your order is a breeze! Once your purchase has been successfully confirmed and shipped, you will receive a confirmation email containing your tracking number. You can simply click on the link provided in the email or visit our website's order tracking page. Enter your tracking number, and you will be able to monitor the progress of your shipment in real-time. This way, you can stay updated on the estimated delivery date and ensure you're available to receive your package. """ From ce702051467900d0e84f5b9b582b57fe071f3efd Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Tue, 23 Jan 2024 12:49:55 +0100 Subject: [PATCH 29/88] Update tests for testset generator --- .../test_knowledge_base_testset_generator.py | 51 +++++++------------ 1 file changed, 17 insertions(+), 34 deletions(-) diff --git a/tests/rag/test_knowledge_base_testset_generator.py b/tests/rag/test_knowledge_base_testset_generator.py index 1caad22a6b..eee4c64154 100644 --- a/tests/rag/test_knowledge_base_testset_generator.py +++ b/tests/rag/test_knowledge_base_testset_generator.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd -from giskard.llm.client import LLMFunctionCall, LLMOutput +from giskard.llm.client import LLMOutput from giskard.rag import KnowledgeBaseTestsetGenerator @@ -23,45 +23,28 @@ def make_knowledge_base_df(): return knowledge_base_df -CONTEXT_STRING = """### Context 1 ### +CONTEXT_STRING = """ +------ Scamorza is a Southern Italian cow's milk cheese. -###### - -### Context 2 ### +------ Bleu d'Auvergne is a French blue cheese, named for its place of origin in the Auvergne region. -###### - -### Context 3 ### +------ Freeriding is a style of snowboarding or skiing performed on natural, un-groomed terrain, without a set course, goals or rules. 
-######""" +------ +""" def test_testset_generation(): llm_client = Mock() - llm_client.complete.side_effect = [ - LLMOutput( - None, - LLMFunctionCall( - "generate_inputs", - { - "inputs": [ - {"question": "Where is Camembert from?"}, - ] - }, - ), - ), - LLMOutput( - None, - LLMFunctionCall( - "generate_inputs", - { - "inputs": [ - {"answer": "Camembert was created in Normandy, in the northwest of France."}, - ] - }, - ), - ), - ] * 2 + llm_client.complete.side_effect = ( + [ + LLMOutput( + """{"question": "Where is Camembert from?", +"answer": "Camembert was created in Normandy, in the northwest of France."}""" + ) + ] + * 2 + ) embedding_dimension = 8 @@ -98,4 +81,4 @@ def test_testset_generation(): assert test_set.df.loc[0, "difficulty_level"] == 1 assert test_set.df.loc[1, "question"] == "Where is Camembert from?" - assert test_set.df.loc[1, "reference_context"] == "" + assert test_set.df.loc[1, "reference_context"] == "\n------\n" From c196c047d27d468e0f453a6dd5d08d3275a296c8 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Tue, 23 Jan 2024 14:23:03 +0100 Subject: [PATCH 30/88] Change the prompt to fix the number of output of the model --- giskard/rag/knowledge_base_testset_generator.py | 2 ++ giskard/rag/prompts.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index 41379687c0..b58d082115 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -60,6 +60,7 @@ class KnowledgeBaseTestsetGenerator(BaseDataGenerator): _qa_generation_system_prompt = QA_GENERATION_SYSTEM_PROMPT _qa_generation_context_example = QA_GENERATION_CONTEXT_EXAMPLE _qa_generation_assistant_example = QA_GENERATION_ASSISTANT_EXAMPLE + _one_output_requirement = "\n\nRemember you should only generate one question and answer pair." _difficulty_level = 1 @@ -141,6 +142,7 @@ def _extract_seed_context(self): def _format_context(self, contexts): context_string = "\n------\n".join(["", *[doc.page_content for doc in contexts], ""]) + context_string = context_string + self._one_output_requirement return context_string def _prevent_context_window_overflow(self, prompt): diff --git a/giskard/rag/prompts.py b/giskard/rag/prompts.py index 3464b7656c..e0945a5dad 100644 --- a/giskard/rag/prompts.py +++ b/giskard/rag/prompts.py @@ -8,11 +8,11 @@ Please respect the following rules to generate the question: - The answer to the question should be found inside the provided context - The question must be self-contained -- The question must be in English +- The question and answer must be in {language} You will be provided the context, consisting in multiple paragraphs delimited by dashes "------". You will return the question and the precise answer to the question based exclusively on the provided context. 
-Your output should be a single JSON object, with keys 'question' and 'answer'.""" +Your output should be a unique JSON object, with keys 'question' and 'answer'.""" QA_GENERATION_ASSISTANT_EXAMPLE = """{ "question": "For which countries can I track my shipping?", From 543d6fdbc9937a39af8ef28455beff38dedd73ca Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Tue, 23 Jan 2024 14:34:26 +0100 Subject: [PATCH 31/88] Improve handling of JSONDecoderErrors --- giskard/rag/knowledge_base_testset_generator.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index b58d082115..f34ce9e5f7 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -161,6 +161,11 @@ def _llm_complete(self, messages): generated = json.loads(out.message, strict=False) except (AttributeError, KeyError) as err: raise LLMGenerationError("Could not parse generated inputs") from err + except json.decoder.JSONDecodeError as err: + if "Extra data:" in str(err): + raise LLMGenerationError("Generator model output more than one question/answer pair.") from err + else: + raise err return generated From 6583b4426891f5074b1803707ba0fd4669eb4d63 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Tue, 23 Jan 2024 15:47:46 +0100 Subject: [PATCH 32/88] Minor refactor --- .../rag/knowledge_base_testset_generator.py | 24 +------------------ 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index f34ce9e5f7..d713fadd86 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -94,27 +94,6 @@ def __init__( self.knowledge_base = VectorStore.from_df(knowledge_df, self.embedding_model, features=knowledge_base_features) - def _make_generate_input_functions(self, return_attribute_name): - return [ - { - "name": "generate_inputs", - "description": "generates inputs for model audit", - "parameters": { - "type": "object", - "properties": { - "inputs": { - "type": "array", - "items": { - "type": "object", - "properties": {return_attribute_name: {"type": "string"}}, - }, - } - }, - "required": ["inputs"], - }, - } - ] - def _generate_question_answer_from_context(self, context): messages = [{"role": "system", "content": self._qa_generation_system_prompt}] if self.include_examples: @@ -124,7 +103,7 @@ def _generate_question_answer_from_context(self, context): {"role": "assistant", "content": self._qa_generation_assistant_example}, ] ) - messages.append({"role": "user", "content": context}) + messages.append({"role": "user", "content": context + self._one_output_requirement}) generated_qa = self._llm_complete(messages=messages) return generated_qa["question"], generated_qa["answer"] @@ -142,7 +121,6 @@ def _extract_seed_context(self): def _format_context(self, contexts): context_string = "\n------\n".join(["", *[doc.page_content for doc in contexts], ""]) - context_string = context_string + self._one_output_requirement return context_string def _prevent_context_window_overflow(self, prompt): From 42e7c4686844e69e7e1ada60a1d21d99252ca852 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Tue, 23 Jan 2024 16:16:15 +0100 Subject: [PATCH 33/88] Remove unnecessary uvloop dependency --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ab2ff7d1fb..20331af912 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -174,7 +174,6 @@ dependencies = [ "markdown", # needed for display of scan results in notebook "colorama", # needed for the scan "griffe>=0.36.9", - "uvloop>=0.19.0", ] [project.optional-dependencies] From b91bee8458cac451680eb33d1e7611fe65a84045 Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Thu, 25 Jan 2024 17:34:16 +0100 Subject: [PATCH 34/88] Enforce JSON format via prompt --- giskard/rag/knowledge_base_testset_generator.py | 4 ++-- giskard/rag/prompts.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index d713fadd86..f412b2451c 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -60,7 +60,6 @@ class KnowledgeBaseTestsetGenerator(BaseDataGenerator): _qa_generation_system_prompt = QA_GENERATION_SYSTEM_PROMPT _qa_generation_context_example = QA_GENERATION_CONTEXT_EXAMPLE _qa_generation_assistant_example = QA_GENERATION_ASSISTANT_EXAMPLE - _one_output_requirement = "\n\nRemember you should only generate one question and answer pair." _difficulty_level = 1 @@ -103,7 +102,7 @@ def _generate_question_answer_from_context(self, context): {"role": "assistant", "content": self._qa_generation_assistant_example}, ] ) - messages.append({"role": "user", "content": context + self._one_output_requirement}) + messages.append({"role": "user", "content": context}) generated_qa = self._llm_complete(messages=messages) return generated_qa["question"], generated_qa["answer"] @@ -140,6 +139,7 @@ def _llm_complete(self, messages): except (AttributeError, KeyError) as err: raise LLMGenerationError("Could not parse generated inputs") from err except json.decoder.JSONDecodeError as err: + print("ERROR RES", out) if "Extra data:" in str(err): raise LLMGenerationError("Generator model output more than one question/answer pair.") from err else: diff --git a/giskard/rag/prompts.py b/giskard/rag/prompts.py index e0945a5dad..a5ce8be216 100644 --- a/giskard/rag/prompts.py +++ b/giskard/rag/prompts.py @@ -12,7 +12,7 @@ You will be provided the context, consisting in multiple paragraphs delimited by dashes "------". You will return the question and the precise answer to the question based exclusively on the provided context. -Your output should be a unique JSON object, with keys 'question' and 'answer'.""" +Your output should be a single JSON object, with keys 'question' and 'answer'. 
Make sure you return a valid JSON object.""" QA_GENERATION_ASSISTANT_EXAMPLE = """{ "question": "For which countries can I track my shipping?", From acade50204435390deac8f2dbe5a8025270b63c1 Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Thu, 25 Jan 2024 17:50:23 +0100 Subject: [PATCH 35/88] Remove unwanted code --- giskard/rag/knowledge_base_testset_generator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index f412b2451c..3351083dd0 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -139,7 +139,6 @@ def _llm_complete(self, messages): except (AttributeError, KeyError) as err: raise LLMGenerationError("Could not parse generated inputs") from err except json.decoder.JSONDecodeError as err: - print("ERROR RES", out) if "Extra data:" in str(err): raise LLMGenerationError("Generator model output more than one question/answer pair.") from err else: From da384f6444f30e6f921752e5e5f0890687fa09fc Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Thu, 25 Jan 2024 17:51:03 +0100 Subject: [PATCH 36/88] Make the language selection work --- giskard/rag/knowledge_base_testset_generator.py | 11 ++++++++++- giskard/rag/prompts.py | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index 3351083dd0..5968e9ddc6 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -94,7 +94,16 @@ def __init__( self.knowledge_base = VectorStore.from_df(knowledge_df, self.embedding_model, features=knowledge_base_features) def _generate_question_answer_from_context(self, context): - messages = [{"role": "system", "content": self._qa_generation_system_prompt}] + messages = [ + { + "role": "system", + "content": self._qa_generation_system_prompt.format( + model_name=self.model_name, + model_description=self.model_description, + language=self.language, + ), + } + ] if self.include_examples: messages.extend( [ diff --git a/giskard/rag/prompts.py b/giskard/rag/prompts.py index a5ce8be216..e12a832d0f 100644 --- a/giskard/rag/prompts.py +++ b/giskard/rag/prompts.py @@ -8,7 +8,7 @@ Please respect the following rules to generate the question: - The answer to the question should be found inside the provided context - The question must be self-contained -- The question and answer must be in {language} +- The question and answer must be in this language: {language} You will be provided the context, consisting in multiple paragraphs delimited by dashes "------". You will return the question and the precise answer to the question based exclusively on the provided context. 
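The patches above tighten the generation contract to a single JSON object with `question` and `answer` keys, which the generator then parses with `json.loads`. Below is a minimal, illustrative sketch of that parsing step, with a made-up completion string standing in for the LLM response; it is not part of the patch, and the real code raises `LLMGenerationError` rather than `ValueError`.

```python
import json

# Illustrative completion only -- the prompt contract is a single JSON object
# with the keys 'question' and 'answer'.
raw_completion = (
    '{"question": "Where is Camembert from?", '
    '"answer": "Camembert was created in Normandy, in the northwest of France."}'
)

try:
    generated_qa = json.loads(raw_completion, strict=False)
    question, answer = generated_qa["question"], generated_qa["answer"]
except json.JSONDecodeError as err:
    # An "Extra data" decode error means the model emitted more than one JSON
    # object, i.e. more than one question/answer pair.
    raise ValueError("Generator model output more than one question/answer pair.") from err

print(question)
print(answer)
```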
From e3071ca6594419e0b25a71c2a7d453336d616f9b Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Mon, 22 Jan 2024 17:13:22 +0100 Subject: [PATCH 37/88] Add documentation for RAG toolset --- docs/open_source/customize_tests/index.md | 5 + .../testset_generation/index.md | 144 ++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 docs/open_source/customize_tests/testset_generation/index.md diff --git a/docs/open_source/customize_tests/index.md b/docs/open_source/customize_tests/index.md index 1f01300a91..1596cc802e 100644 --- a/docs/open_source/customize_tests/index.md +++ b/docs/open_source/customize_tests/index.md @@ -28,4 +28,9 @@ data_transformations/index :link: data_transformations/index.html :::: +::::{grid-item-card}
🧰 RAG toolset
+:text-align: center +:link: testset_generation/index.html +:::: + ::::: diff --git a/docs/open_source/customize_tests/testset_generation/index.md b/docs/open_source/customize_tests/testset_generation/index.md new file mode 100644 index 0000000000..59c24ccc40 --- /dev/null +++ b/docs/open_source/customize_tests/testset_generation/index.md @@ -0,0 +1,144 @@ +# 🧰 RAG toolset + +Retrieval Augmented Generative models (RAGs) combine LLM models and data sources to produce domain-specific language models able to +answer precise questions whose answer are available inside a knowledge base. These models are often extremely specialized to a use-case +defined by the information present inside the knowledge base. The specialization of the model makes generic evaluations irrelevant to verify +the model's behavior (e.g. hallucinations, trustworthiness, etc.). To this end, the Giskard python library provides a toolset dedicated to RAG models +that generates question/answer pairs from the knowledge base of the model. + +## How does it work? + +The automatic testset generation explores the Knowledge Base (KB) of your model and generate questions and answers related to specific topics +available inside the KB. Specifically, a topic from the KB is selected at random, then the related excerpts from the KB are extracted to create +a `reference_context`. Then we generate a `question` along with a `reference_answer` using an LLM (specifically, we use **OpenAI GPT-4**). + +The generated testset contains a list of questions specific to the model's knowledge base. The model should theoretically answer all these +questions correctly. Yet, hallucination or imprecise answers can be generated by the model. This testset allows to evaluate how frequent +these undesired behaviors happen. + +### What data are being sent to OpenAI/Azure OpenAI + +In order to perform LLM-assisted detectors, we will be sending the following information to OpenAI/Azure OpenAI: + +- Data provided in your Dataset +- Text generated by your model +- Model name and description + +### Will the testset generation work in any language? + +The testset quality depends on GPT-4 capabilities regarding your model's language. + +## Before starting + +Before starting, make sure you have installed the LLM flavor of Giskard: + +```bash +pip install "giskard[llm]" +``` + +For the LLM-assisted detectors to work, you need to have an OpenAI API key. You can set it in your notebook +like this: + +:::::::{tab-set} +::::::{tab-item} OpenAI + +```python +import os + +os.environ["OPENAI_API_KEY"] = "sk-…" +``` + +:::::: +::::::{tab-item} Azure OpenAI + +Require `openai>=1.0.0` + +```python +import os +from giskard.llm import set_llm_model + +os.environ['AZURE_OPENAI_API_KEY'] = '...' +os.environ['AZURE_OPENAI_ENDPOINT'] = 'https://xxx.openai.azure.com' +os.environ['OPENAI_API_VERSION'] = '2023-07-01-preview' + + +# You'll need to provide the name of the model that you've deployed +# Beware, the model provided must be capable of using function calls +set_llm_model('my-gpt-4-model') +``` + +:::::: +::::::: + +We are now ready to start. + + +## Step 1: Format and load your Knowledge Base +The RAG toolset currently only handles knowledge bases as pandas `DataFrame`. If the DataFrame has multiple columns, +they will be concatenated automatically. If only some of the columns contains the knowledge, you can specify it when building +the generator by passing a list of column names to the `knowledge_base_features` argument. 
+ + +```python +knowledge_base_df = pd.read_*("path/to/your/knowledge_base") +feature_names = ["col1", "col2"] +knowledge_base_df["page_content"] = knowledge_base_df[feature_names].apply(" ".join, axis=1) +``` + +## Step 2: Generate the testset +Once the knowledge base is loaded as a pandas `DataFrame`, you can generate the testset with the +`KnowledgeBaseTestsetGenerator`. + + +```python +from giskard.rag import KnowledgeBaseTestsetGenerator + +generator = KnowledgeBaseTestsetGenerator(knowledge_base_df, + model_name="Model name", + model_description="Description of the model", + knowledge_base_features=["page_content"]) + +testset = generator.generate_dataset(num_samples=10) +``` + +## Step 3: Evaluate your model +```python +from giskard.testing.tests.llm import test_llm_correctness + +test_llm_correctness(model, testset, threshold=0.8).execute() +``` + +## What's next? + +Your scan results may have highlighted important vulnerabilities. There are 2 important actions you can take next: + +### 1. Generate a test suite from the testset: + +Turn the generated testset into an actionable test that you can save and reuse in further iterations. + +```python +test_suite = scan_results.generate_test_suite("My first test suite") + +# You can run the test suite locally to verify that it reproduces the issues +test_suite.run() +``` + +Jump to the [test customization](https://docs.giskard.ai/en/latest/open_source/customize_tests/index.html) and [test integration](https://docs.giskard.ai/en/latest/open_source/integrate_tests/index.html) sections to find out everything you can do with test suites. + +### 2. Upload your test suite to the Giskard Hub to: +* Compare the quality of different models and prompts to decide which one to promote +* Create more tests relevant to your use case, combining input prompts that make your model fail and custome evaluation criteria +* Share results, and collaborate with your team to integrate business feedback + +To upload your test suite, you must have created a project on Giskard Hub and instantiated a Giskard Python client. If you haven't done this yet, follow the first steps of [upload your object](https://docs.giskard.ai/en/latest/giskard_hub/upload/index.html#upload-your-object) guide. + +Then, upload your test suite like this: +```python +test_suite.upload(giskard_client, project_key) +``` + +[Here's a demo](https://huggingface.co/spaces/giskardai/giskard) of the Giskard Hub in action. + +## Troubleshooting + +If you encounter any issues, join our [Discord community](https://discord.gg/fkv7CAr3FE) and ask questions in our #support channel. 
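The Step 3 example in the documentation above assumes a `model` that has already been wrapped for Giskard. As a minimal sketch (not part of the patch), a RAG pipeline could be wrapped with `giskard.Model` so that the correctness evaluator can query it; `answer_question` and the model metadata below are placeholders.

```python
import pandas as pd

import giskard


def answer_question(question: str) -> str:
    # Placeholder for the actual RAG pipeline (retrieval + generation).
    return "The answer produced by the RAG pipeline."


def predict(df: pd.DataFrame) -> list:
    # Giskard passes a DataFrame with the feature columns; the correctness
    # evaluator only requires the "question" column to be a model feature.
    return [answer_question(question) for question in df["question"]]


model = giskard.Model(
    model=predict,
    model_type="text_generation",
    name="Shop assistant",
    description="Answers questions about the shop's products and ordering process.",
    feature_names=["question"],
)
```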
From a98c749537822882543ea3e9e33d607791b34b63 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 24 Jan 2024 16:38:35 +0100 Subject: [PATCH 38/88] Update documentation and add API reference for RAG toolset --- docs/index.md | 2 + docs/open_source/customize_tests/index.md | 5 -- docs/open_source/index.md | 6 +++ .../testset_generation/index.md | 47 ++++++++++--------- docs/reference/index.rst | 1 + .../rag-toolset/correctness_evaluator.rst | 5 ++ docs/reference/rag-toolset/index.rst | 10 ++++ .../rag-toolset/testset_generation.rst | 8 ++++ docs/reference/rag-toolset/vector_store.rst | 11 +++++ docs/reference/tests/llm.rst | 1 + 10 files changed, 70 insertions(+), 26 deletions(-) rename docs/open_source/{customize_tests => }/testset_generation/index.md (56%) create mode 100644 docs/reference/rag-toolset/correctness_evaluator.rst create mode 100644 docs/reference/rag-toolset/index.rst create mode 100644 docs/reference/rag-toolset/testset_generation.rst create mode 100644 docs/reference/rag-toolset/vector_store.rst diff --git a/docs/index.md b/docs/index.md index d8bd2dd6ea..e985163b52 100644 --- a/docs/index.md +++ b/docs/index.md @@ -16,6 +16,7 @@ open_source/installation_library/index open_source/scan/index open_source/customize_tests/index open_source/integrate_tests/index +open_source/testset_generation/index ``` ```{toctree} @@ -75,6 +76,7 @@ reference/datasets/index reference/scan/index reference/tests/index reference/slicing-functions/index +reference/rag-toolset/index reference/transformation-functions/index reference/push/index reference/suite/index diff --git a/docs/open_source/customize_tests/index.md b/docs/open_source/customize_tests/index.md index 1596cc802e..1f01300a91 100644 --- a/docs/open_source/customize_tests/index.md +++ b/docs/open_source/customize_tests/index.md @@ -28,9 +28,4 @@ data_transformations/index :link: data_transformations/index.html :::: -::::{grid-item-card}
🧰 RAG toolset
-:text-align: center -:link: testset_generation/index.html -:::: - ::::: diff --git a/docs/open_source/index.md b/docs/open_source/index.md index ee9302d019..8b348df2a6 100644 --- a/docs/open_source/index.md +++ b/docs/open_source/index.md @@ -8,6 +8,7 @@ installation_library/index scan/index customize_tests/index integrate_tests/index +testset_generation/index ``` ::::::{grid} 1 1 2 2 @@ -31,4 +32,9 @@ integrate_tests/index ::::{grid-item-card}
🔁 Integrate your tests
:text-align: center :link: integrate_tests/index.html +:::: + +::::{grid-item-card} 🧰 RAG Toolset
+:text-align: center +:link: testset_generation/index.html :::: \ No newline at end of file diff --git a/docs/open_source/customize_tests/testset_generation/index.md b/docs/open_source/testset_generation/index.md similarity index 56% rename from docs/open_source/customize_tests/testset_generation/index.md rename to docs/open_source/testset_generation/index.md index 59c24ccc40..ee008155a8 100644 --- a/docs/open_source/customize_tests/testset_generation/index.md +++ b/docs/open_source/testset_generation/index.md @@ -1,32 +1,22 @@ # 🧰 RAG toolset - -Retrieval Augmented Generative models (RAGs) combine LLM models and data sources to produce domain-specific language models able to -answer precise questions whose answer are available inside a knowledge base. These models are often extremely specialized to a use-case -defined by the information present inside the knowledge base. The specialization of the model makes generic evaluations irrelevant to verify -the model's behavior (e.g. hallucinations, trustworthiness, etc.). To this end, the Giskard python library provides a toolset dedicated to RAG models -that generates question/answer pairs from the knowledge base of the model. +Retrieval Augmented Generative models (RAGs) combine LLM models and data sources to produce domain-specific language models able to answer precise questions whose answer are available inside a knowledge base. These models are often extremely specialized to a use-case defined by the information present inside the knowledge base. The specialization of the model makes generic evaluations irrelevant to verify the model's behavior (e.g. hallucinations, trustworthiness, etc.). To this end, the Giskard python library provides a toolset dedicated to RAG models that generates question/answer pairs from the knowledge base of the model. ## How does it work? +The automatic testset generation explores the Knowledge Base (KB) of your model and generate questions and answers related to specific topics available inside the KB. Specifically, we randomly select a topic from the KB, then we extract the related excerpts from the KB to build a `reference_context`. Then we generate a `question` along with a `reference_answer` using an LLM (specifically, we use **OpenAI GPT-4**). -The automatic testset generation explores the Knowledge Base (KB) of your model and generate questions and answers related to specific topics -available inside the KB. Specifically, a topic from the KB is selected at random, then the related excerpts from the KB are extracted to create -a `reference_context`. Then we generate a `question` along with a `reference_answer` using an LLM (specifically, we use **OpenAI GPT-4**). - -The generated testset contains a list of questions specific to the model's knowledge base. The model should theoretically answer all these -questions correctly. Yet, hallucination or imprecise answers can be generated by the model. This testset allows to evaluate how frequent -these undesired behaviors happen. +The generated testset contains a list of questions specific to the model's knowledge base. The model should theoretically answer all these questions correctly. Yet, hallucination or imprecise answers can be generated by the model. This testset allows to quantify how frequent these undesired behaviors happen. 
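To make the generated format concrete, a single record of the testset has roughly the following shape (the values below are invented for illustration; only the four field names come from the generator):

```python
sample = {
    "question": "For which countries can I track my shipping?",
    "reference_answer": "Shipping is available to all 50 US states, Canada and Mexico, and every shipment can be tracked.",
    "reference_context": "What is your shipping policy?\n[...]\n------\nHow can I track my order?\n[...]",
    "difficulty_level": 1,
}
```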
### What data are being sent to OpenAI/Azure OpenAI In order to perform LLM-assisted detectors, we will be sending the following information to OpenAI/Azure OpenAI: -- Data provided in your Dataset +- Data provided in your knowledge base - Text generated by your model - Model name and description ### Will the testset generation work in any language? -The testset quality depends on GPT-4 capabilities regarding your model's language. +The testset quality depends on GPT-4 capabilities regarding your model's language. ## Before starting @@ -36,7 +26,7 @@ Before starting, make sure you have installed the LLM flavor of Giskard: pip install "giskard[llm]" ``` -For the LLM-assisted detectors to work, you need to have an OpenAI API key. You can set it in your notebook +To use the RAG testset generation and evaluation tools, you need to have an OpenAI API key. You can set it in your notebook like this: :::::::{tab-set} @@ -75,8 +65,7 @@ We are now ready to start. ## Step 1: Format and load your Knowledge Base The RAG toolset currently only handles knowledge bases as pandas `DataFrame`. If the DataFrame has multiple columns, -they will be concatenated automatically. If only some of the columns contains the knowledge, you can specify it when building -the generator by passing a list of column names to the `knowledge_base_features` argument. +they are concatenated automatically. If only some of the columns contains relevant information, you can specify it when building the generator by passing a list of column names to the `knowledge_base_features` argument (see [API Reference](https://docs.giskard.ai/en/latest/reference/rag-toolset/testset_generation.html#giskard.rag.KnowledgeBaseTestsetGenerator)). ```python @@ -102,19 +91,35 @@ testset = generator.generate_dataset(num_samples=10) ``` ## Step 3: Evaluate your model +Once your testset is ready, you can evaluate your model using the `CorrectnessEvaluator`. This can be done directly or through a Giskard test which wraps the evaluator. The `CorrectnessEvaluator` asks a question to the given model and compares the model answer with the reference answer from the testset. Specifically, we use GPT-4 to assess whether the model answer is acceptable given the reference answer. +:::::::{tab-set} +::::::{tab-item} Direct Evaluation + +The `CorrectnessEvaluator` asks all the questions from the testset to your model and generate a `EvaluationResult` object with all samples from the testset split as pass or fail, and the indices of failed samples in the original testset. +```python +from giskard.llm.evaluators import CorrectnessEvaluator + +correctness_evaluator = CorrectnessEvaluator() +eval_result, failed_indices = correctness_evaluator.evaluate(model, dataset) +``` +:::::: +::::::{tab-item} Giskard test +You can also evaluate your model with the `test_llm_correctness` function, which wraps the `CorrectnessEvaluator` and produce a `TestResult` object as all Giskard test functions. The model passes the test if the ratio of correct answer is above the specified threshold. ```python from giskard.testing.tests.llm import test_llm_correctness -test_llm_correctness(model, testset, threshold=0.8).execute() +test_result = test_llm_correctness(model, testset, threshold=0.8).execute() ``` +:::::: +::::::: ## What's next? -Your scan results may have highlighted important vulnerabilities. There are 2 important actions you can take next: +The questions generated in the testset may have highlighted some vulnerabilities of your model. 
There are 2 important actions you can take next: ### 1. Generate a test suite from the testset: -Turn the generated testset into an actionable test that you can save and reuse in further iterations. +Turn the generated testset into an actionable test suite that you can save and reuse in further iterations. ```python test_suite = scan_results.generate_test_suite("My first test suite") diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 94afac716d..dfa15be8b3 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -13,3 +13,4 @@ API Reference transformation-functions/index push/index suite/index + rag-toolset/index diff --git a/docs/reference/rag-toolset/correctness_evaluator.rst b/docs/reference/rag-toolset/correctness_evaluator.rst new file mode 100644 index 0000000000..c47cb39ab9 --- /dev/null +++ b/docs/reference/rag-toolset/correctness_evaluator.rst @@ -0,0 +1,5 @@ +Correctness Evaluator +====== + +.. autoclass:: giskard.llm.evaluators.CorrectnessEvaluator + :members: diff --git a/docs/reference/rag-toolset/index.rst b/docs/reference/rag-toolset/index.rst new file mode 100644 index 0000000000..efc35eccff --- /dev/null +++ b/docs/reference/rag-toolset/index.rst @@ -0,0 +1,10 @@ +RAG Toolset +============= + +.. toctree:: + :maxdepth: 2 + + testset_generation + vector_store + correctness_evaluator + diff --git a/docs/reference/rag-toolset/testset_generation.rst b/docs/reference/rag-toolset/testset_generation.rst new file mode 100644 index 0000000000..2cb429fe82 --- /dev/null +++ b/docs/reference/rag-toolset/testset_generation.rst @@ -0,0 +1,8 @@ +Testset Generation +====== + +.. autoclass:: giskard.rag.KnowledgeBaseTestsetGenerator + :members: + +.. autoclass:: giskard.rag.TestSet + :members: \ No newline at end of file diff --git a/docs/reference/rag-toolset/vector_store.rst b/docs/reference/rag-toolset/vector_store.rst new file mode 100644 index 0000000000..5707244113 --- /dev/null +++ b/docs/reference/rag-toolset/vector_store.rst @@ -0,0 +1,11 @@ +Vector Store +====== + +.. autoclass:: giskard.rag.vector_store.VectorStore + :members: + +.. autoclass:: giskard.rag.vector_store.Document + :members: + +.. autoclass:: giskard.rag.embeddings.OpenAIEmbeddings + :members: \ No newline at end of file diff --git a/docs/reference/tests/llm.rst b/docs/reference/tests/llm.rst index 436a104830..1e3592d64c 100644 --- a/docs/reference/tests/llm.rst +++ b/docs/reference/tests/llm.rst @@ -14,6 +14,7 @@ LLM-as-a-judge .. autofunction:: giskard.testing.tests.llm.test_llm_output_against_requirement_per_row .. autofunction:: giskard.testing.tests.llm.test_llm_single_output_against_requirement .. autofunction:: giskard.testing.tests.llm.test_llm_output_against_requirement +.. 
autofunction:: giskard.testing.tests.llm.test_llm_correctness Ground Truth -------------- From 8c0f560a45613bb10ecd079c7450e44b53e8ff36 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Fri, 26 Jan 2024 16:01:07 +0100 Subject: [PATCH 39/88] Add embeddings function inside base LLM client --- giskard/llm/client/base.py | 6 ++ giskard/llm/client/openai.py | 11 +++ .../rag/knowledge_base_testset_generator.py | 71 +++++++++++-------- giskard/rag/prompts.py | 10 +-- 4 files changed, 66 insertions(+), 32 deletions(-) diff --git a/giskard/llm/client/base.py b/giskard/llm/client/base.py index 2aa19358a5..3030a43beb 100644 --- a/giskard/llm/client/base.py +++ b/giskard/llm/client/base.py @@ -3,6 +3,8 @@ from abc import ABC, abstractmethod from dataclasses import dataclass +import numpy as np + from .logger import LLMLogger @@ -35,3 +37,7 @@ def complete( caller_id: Optional[str] = None, ) -> LLMOutput: ... + + @abstractmethod + def embeddings(self, text) -> np.ndarray: + ... diff --git a/giskard/llm/client/openai.py b/giskard/llm/client/openai.py index 68af5282b7..5c67d0d522 100644 --- a/giskard/llm/client/openai.py +++ b/giskard/llm/client/openai.py @@ -3,6 +3,7 @@ import json from abc import ABC, abstractmethod +import numpy as np from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential from ..config import LLMConfigurationError @@ -72,6 +73,16 @@ def complete( return LLMOutput(message=cc["content"], function_call=function_call) + def embeddings(self, texts: Sequence[str], model: str = "text-embedding-ada-002") -> np.ndarray: + texts = [t.replace("\n", " ") for t in texts] + try: + out = self._client.embeddings.create(input=texts, model=model) + embeddings = [element.embedding for element in out.data] + except Exception as err: + print(err) + raise ValueError("Batched embedding creation failed.") from err + return np.stack(embeddings) + class LegacyOpenAIClient(BaseOpenAIClient): """OpenAI client for versions <= 0.28.1""" diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index 5968e9ddc6..ccf4c5fc7f 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -1,21 +1,23 @@ from typing import Sequence import json +import logging import numpy as np import pandas as pd -from ..llm.errors import LLMGenerationError from ..llm.generators import BaseDataGenerator -from .embeddings import EmbeddingsBase, OpenAIEmbeddings from .prompts import ( + FIX_JSON_FORMAT_PROMPT, QA_GENERATION_ASSISTANT_EXAMPLE, QA_GENERATION_CONTEXT_EXAMPLE, QA_GENERATION_SYSTEM_PROMPT, ) -from .testset import TestSet +from .testset import QATestset from .vector_store import VectorStore +logger = logging.getLogger(__name__) + class KnowledgeBaseTestsetGenerator(BaseDataGenerator): """Testset generator for testing RAG models. 
@@ -60,6 +62,7 @@ class KnowledgeBaseTestsetGenerator(BaseDataGenerator): _qa_generation_system_prompt = QA_GENERATION_SYSTEM_PROMPT _qa_generation_context_example = QA_GENERATION_CONTEXT_EXAMPLE _qa_generation_assistant_example = QA_GENERATION_ASSISTANT_EXAMPLE + _fix_json_prompt = FIX_JSON_FORMAT_PROMPT _difficulty_level = 1 @@ -71,7 +74,6 @@ def __init__( context_neighbors: int = 4, context_similarity_threshold: float = 0.2, context_window_length: int = 8192, - embedding_model: EmbeddingsBase = None, language: str = "en", knowledge_base_features: Sequence[str] = None, seed: int = None, @@ -86,12 +88,13 @@ def __init__( self.context_similarity_threshold = context_similarity_threshold self.context_window_length = context_window_length - self.embedding_model = embedding_model if embedding_model is not None else OpenAIEmbeddings() self.language = language self.rng = np.random.default_rng(seed=seed) self.include_examples = include_examples - self.knowledge_base = VectorStore.from_df(knowledge_df, self.embedding_model, features=knowledge_base_features) + self.knowledge_base = VectorStore.from_df( + knowledge_df, self.llm_client.embeddings, features=knowledge_base_features + ) def _generate_question_answer_from_context(self, context): messages = [ @@ -113,15 +116,14 @@ def _generate_question_answer_from_context(self, context): ) messages.append({"role": "user", "content": context}) - generated_qa = self._llm_complete(messages=messages) - return generated_qa["question"], generated_qa["answer"] + return self._llm_complete(messages=messages) def _extract_seed_context(self): seed_context = self.rng.choice(self.knowledge_base.documents) relevant_contexts = [ context for (context, score) in self.knowledge_base.similarity_search_with_score( - seed_context.page_content, k=self.context_neighbors + [seed_context.page_content], k=self.context_neighbors ) if score < self.context_similarity_threshold # should we keep it or not ? ] @@ -144,18 +146,30 @@ def _llm_complete(self, messages): temperature=self.llm_temperature, caller_id=self.__class__.__name__, ) - generated = json.loads(out.message, strict=False) - except (AttributeError, KeyError) as err: - raise LLMGenerationError("Could not parse generated inputs") from err - except json.decoder.JSONDecodeError as err: - if "Extra data:" in str(err): - raise LLMGenerationError("Generator model output more than one question/answer pair.") from err - else: - raise err + generated = json.loads(out.message, strict=False) + except json.decoder.JSONDecodeError: + logger.warning("JSON decoding error, trying to fix the JSON string.") + generated = self._try_fix_json_message(out.message) return generated - def generate_dataset(self, num_samples: int = 10) -> TestSet: + def _try_fix_json_message(self, incorrect_json): + try: + out = self.llm_client.complete( + messages=[ + {"role": "system", "content": self._fix_json_prompt}, + {"role": "user", "content": incorrect_json}, + ], + temperature=0, + caller_id=self.__class__.__name__, + ) + corrected_message = json.loads(out.message) + except Exception: + logger.warning("Fixing JSON format failed, question generation skipped.") + return None + return corrected_message + + def generate_dataset(self, num_samples: int = 10) -> QATestset: """Generates a testset from the knowledge base. 
Parameters @@ -179,15 +193,16 @@ def generate_dataset(self, num_samples: int = 10) -> TestSet: seed_contexts = self._extract_seed_context() context = self._format_context(seed_contexts) - question, answer = self._generate_question_answer_from_context(context) + generated_qa = self._generate_question_answer_from_context(context) - generated_questions.append( - { - "question": question, - "reference_answer": answer, - "reference_context": context, - "difficulty_level": self._difficulty_level, - } - ) + if generated_qa is not None: + generated_questions.append( + { + "question": generated_qa["question"], + "reference_answer": generated_qa["answer"], + "reference_context": context, + "difficulty_level": self._difficulty_level, + } + ) - return TestSet(df=pd.DataFrame(generated_questions)) + return QATestset(df=pd.DataFrame(generated_questions)) diff --git a/giskard/rag/prompts.py b/giskard/rag/prompts.py index e12a832d0f..e6d7880285 100644 --- a/giskard/rag/prompts.py +++ b/giskard/rag/prompts.py @@ -21,13 +21,15 @@ QA_GENERATION_CONTEXT_EXAMPLE = """What payment methods do you accept? -We accept a variety of payment methods to provide our customers with a convenient and secure shopping experience. You can make a purchase using major credit and debit cards, including Visa, Mastercard, American Express, and Discover. We also offer the option to pay with popular digital wallets such as PayPal and Google Pay. For added flexibility, you can choose to complete your order using bank transfers or wire transfers. Rest assured that we prioritize the security of your personal information and go the extra mile to ensure your transactions are processed safely. +\tWe accept a variety of payment methods to provide our customers with a convenient and secure shopping experience. You can make a purchase using major credit and debit cards, including Visa, Mastercard, American Express, and Discover. We also offer the option to pay with popular digital wallets such as PayPal and Google Pay. For added flexibility, you can choose to complete your order using bank transfers or wire transfers. Rest assured that we prioritize the security of your personal information and go the extra mile to ensure your transactions are processed safely. ------ -What is your shipping policy? +\tWhat is your shipping policy? -We offer free shipping on all orders over $50. For orders below $50, we charge a flat rate of $5.99. We offer shipping services to customers residing in all 50 states of the US, in addition to providing delivery options to Canada and Mexico. +We offer free shipping on all orders over $50. For orders below $50, we charge a flat rate of $5.99. We offer shipping services to customers residing in all 50\n states of the US, in addition to providing delivery options to Canada and Mexico. ------ -How can I track my order? +\tHow can I track my order? Tracking your order is a breeze! Once your purchase has been successfully confirmed and shipped, you will receive a confirmation email containing your tracking number. You can simply click on the link provided in the email or visit our website's order tracking page. Enter your tracking number, and you will be able to monitor the progress of your shipment in real-time. This way, you can stay updated on the estimated delivery date and ensure you're available to receive your package. """ + +FIX_JSON_FORMAT_PROMPT = """Fix the following json string so it contains a single valid json. 
Make sure to start and end with curly brackets.""" From a7132bcc6aa559b5417eb64ea1b610925b5d8b6c Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Fri, 26 Jan 2024 16:02:11 +0100 Subject: [PATCH 40/88] Remove embedding model and replace it with llm client embeddings --- giskard/rag/__init__.py | 4 +-- giskard/rag/embeddings.py | 51 ------------------------------------- giskard/rag/testset.py | 2 +- giskard/rag/vector_store.py | 20 +++++++-------- 4 files changed, 13 insertions(+), 64 deletions(-) delete mode 100644 giskard/rag/embeddings.py diff --git a/giskard/rag/__init__.py b/giskard/rag/__init__.py index 52f240a99e..f01a6d5ccc 100644 --- a/giskard/rag/__init__.py +++ b/giskard/rag/__init__.py @@ -1,4 +1,4 @@ from .knowledge_base_testset_generator import KnowledgeBaseTestsetGenerator -from .testset import TestSet +from .testset import QATestset -__all__ = ["KnowledgeBaseTestsetGenerator", "TestSet"] +__all__ = ["KnowledgeBaseTestsetGenerator", "QATestset"] diff --git a/giskard/rag/embeddings.py b/giskard/rag/embeddings.py deleted file mode 100644 index 9fb0d65dd9..0000000000 --- a/giskard/rag/embeddings.py +++ /dev/null @@ -1,51 +0,0 @@ -from typing import Sequence - -from abc import ABC, abstractmethod - -import numpy as np - -from ..core.errors import GiskardInstallationError - - -class EmbeddingsBase(ABC): - """Base class to build custom embedding models.""" - - @abstractmethod - def embed_text(self, text: str) -> np.ndarray: - ... - - @abstractmethod - def embed_documents(self, documents: Sequence) -> np.ndarray: - ... - - -class OpenAIEmbeddings(EmbeddingsBase): - """Simple wrapper around the OpenAI embeddings API.""" - - def __init__(self, model: str = "text-embedding-ada-002", client=None): - self.model = model - - try: - from openai import OpenAI - except ImportError as err: - raise GiskardInstallationError(flavor="llm") from err - - self._client = client if client is not None else OpenAI() - - def embed_text(self, text: str) -> np.ndarray: - text = text.replace("\n", " ") - try: - out = self._client.embeddings.create(input=[text], model=self.model) - embeddings = out.data[0].embedding - except Exception as err: - raise ValueError(f"Embedding creation failed for text: {text}.") from err - return np.array(embeddings) - - def embed_documents(self, documents: Sequence) -> np.ndarray: - text_batch = [doc.page_content.replace("\n", " ") for doc in documents] - try: - out = self._client.embeddings.create(input=text_batch, model=self.model) - embeddings = [element.embedding for element in out.data] - except Exception as err: - raise ValueError("Batched embedding creation failed.") from err - return np.stack(embeddings) diff --git a/giskard/rag/testset.py b/giskard/rag/testset.py index 52188e2f85..13450776d7 100644 --- a/giskard/rag/testset.py +++ b/giskard/rag/testset.py @@ -2,7 +2,7 @@ from ..testing.tests.llm import test_llm_correctness -class TestSet(Dataset): +class QATestset(Dataset): """A wrapper class around `Dataset` to allow automatic creation of a `Suite` based on the question/answer pairs inside the `TestSet`. 
""" diff --git a/giskard/rag/vector_store.py b/giskard/rag/vector_store.py index b4d3eff0cf..3821f15f60 100644 --- a/giskard/rag/vector_store.py +++ b/giskard/rag/vector_store.py @@ -1,10 +1,9 @@ -from typing import Optional, Sequence +from typing import Callable, Optional, Sequence import numpy as np import pandas as pd from ..core.errors import GiskardInstallationError -from .embeddings import EmbeddingsBase class Document: @@ -29,7 +28,7 @@ class VectorStore: Relies on `FlatIndexL2` class from FAISS. """ - def __init__(self, documents: Sequence[Document], embeddings: np.array, embedding_model: EmbeddingsBase): + def __init__(self, documents: Sequence[Document], embeddings: np.array, embedding_fn: Callable): if len(embeddings) == 0 or len(documents) == 0: raise ValueError("Documents and embeddings must contains at least one element.") if len(embeddings) != len(documents): @@ -42,22 +41,23 @@ def __init__(self, documents: Sequence[Document], embeddings: np.array, embeddin self.embeddings = embeddings self.documents = documents - self.embedding_model = embedding_model + self.embedding_fn = embedding_fn self.dimension = self.embeddings[0].shape[0] self.index = IndexFlatL2(self.dimension) self.index.add(self.embeddings) @classmethod - def from_df(cls, df: pd.DataFrame, embedding_model: EmbeddingsBase, features: Sequence[str] = None): + def from_df(cls, df: pd.DataFrame, embedding_fn: Callable, features: Sequence[str] = None): if len(df) > 0: documents = [Document(knowledge_chunk, features=features) for knowledge_chunk in df.to_dict("records")] - embeddings = embedding_model.embed_documents(documents).astype("float32") - return cls(documents, embeddings, embedding_model) + raw_texts = [d.page_content for d in documents] + embeddings = embedding_fn(raw_texts).astype("float32") + return cls(documents, embeddings, embedding_fn) else: raise ValueError("Cannot generate a vector store from empty DataFrame.") - def similarity_search_with_score(self, query: str, k: int) -> Sequence: - query_emb = self.embedding_model.embed_text(query).astype("float32") - distances, indices = self.index.search(query_emb[None, :], k) + def similarity_search_with_score(self, query: Sequence[str], k: int) -> Sequence: + query_emb = self.embedding_fn(query).astype("float32") + distances, indices = self.index.search(query_emb, k) return [(self.documents[i], d) for d, i in zip(distances[0], indices[0])] From a4f85099cb0e9345e5db3c0bfc9c48a5b5c40b9c Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Fri, 26 Jan 2024 16:22:36 +0100 Subject: [PATCH 41/88] Update tests for rag module --- tests/rag/test_embedding_model.py | 22 ----------------- .../test_knowledge_base_testset_generator.py | 6 ++--- tests/rag/test_testset_suite_conversion.py | 4 ++-- tests/rag/test_vector_store.py | 24 +++++++++---------- 4 files changed, 16 insertions(+), 40 deletions(-) delete mode 100644 tests/rag/test_embedding_model.py diff --git a/tests/rag/test_embedding_model.py b/tests/rag/test_embedding_model.py deleted file mode 100644 index a8d15f2975..0000000000 --- a/tests/rag/test_embedding_model.py +++ /dev/null @@ -1,22 +0,0 @@ -from unittest.mock import Mock - -import numpy as np - -from giskard.rag.embeddings import OpenAIEmbeddings - - -def test_openai_embeddings_model(): - embedding_mock = Mock() - embedding_mock.embedding = np.ones(8) - - embedding_call = Mock() - embedding_call.data = [embedding_mock] - - client = Mock() - client.embeddings.create.side_effect = [embedding_call] - - embedding_model = OpenAIEmbeddings(client=client) - - 
embedded_text = embedding_model.embed_text("This a test string") - assert len(embedded_text) == 8 - assert np.allclose(embedded_text, np.ones(8)) diff --git a/tests/rag/test_knowledge_base_testset_generator.py b/tests/rag/test_knowledge_base_testset_generator.py index eee4c64154..b42b81bd3f 100644 --- a/tests/rag/test_knowledge_base_testset_generator.py +++ b/tests/rag/test_knowledge_base_testset_generator.py @@ -48,13 +48,12 @@ def test_testset_generation(): embedding_dimension = 8 - embedding_model = Mock() + llm_client.embeddings = Mock() # evenly spaced embeddings for the knowledge base elements and specifically chosen embeddings for # each mock embedding calls. kb_embeddings = np.ones((4, embedding_dimension)) * np.arange(4)[:, None] / 100 query_embeddings = np.ones((2, embedding_dimension)) * np.array([0.02, 10])[:, None] - embedding_model.embed_documents.side_effect = [kb_embeddings] - embedding_model.embed_text.side_effect = list(query_embeddings) + llm_client.embeddings.side_effect = [kb_embeddings] + list(query_embeddings[:, None, :]) knowledge_base_df = make_knowledge_base_df() testset_generator = KnowledgeBaseTestsetGenerator( @@ -62,7 +61,6 @@ def test_testset_generation(): model_name="Test model", model_description="This is a model for testing purpose.", llm_client=llm_client, - embedding_model=embedding_model, context_neighbors=3, ) diff --git a/tests/rag/test_testset_suite_conversion.py b/tests/rag/test_testset_suite_conversion.py index 9b1526e87f..fa99ece0af 100644 --- a/tests/rag/test_testset_suite_conversion.py +++ b/tests/rag/test_testset_suite_conversion.py @@ -1,6 +1,6 @@ import pandas as pd -from giskard.rag import TestSet +from giskard.rag import QATestset def make_testset_df(): @@ -21,7 +21,7 @@ def make_testset_df(): def test_testset_suite_conversion(): - testset = TestSet(df=make_testset_df()) + testset = QATestset(df=make_testset_df()) suite = testset.to_test_suite() assert "dataset" in suite.default_params diff --git a/tests/rag/test_vector_store.py b/tests/rag/test_vector_store.py index 6ce88746c8..7ce6847714 100644 --- a/tests/rag/test_vector_store.py +++ b/tests/rag/test_vector_store.py @@ -12,33 +12,33 @@ def test_vector_store_creation(): embeddings = np.repeat(np.arange(5)[:, None], 8, axis=1) documents = [Document({"feature": "This is a test string"})] * 5 - embedding_model = Mock() + embedding_fn = Mock() - store = VectorStore(documents, embeddings, embedding_model) + store = VectorStore(documents, embeddings, embedding_fn) assert store.embeddings.shape == (5, 8) assert len(store.documents) == 5 assert store.index.d == dimension assert store.index.ntotal == 5 with pytest.raises(ValueError, match="Documents and embeddings must have the same length."): - store = VectorStore(documents, np.repeat(np.arange(4)[:, None], 8, axis=1), embedding_model) + store = VectorStore(documents, np.repeat(np.arange(4)[:, None], 8, axis=1), embedding_fn) with pytest.raises(ValueError, match="Documents and embeddings must contains at least one element."): - store = VectorStore(documents, [], embedding_model) + store = VectorStore(documents, [], embedding_fn) with pytest.raises(ValueError, match="Documents and embeddings must contains at least one element."): - store = VectorStore([], [], embedding_model) + store = VectorStore([], [], embedding_fn) def test_vector_store_creation_from_df(): dimension = 8 df = pd.DataFrame(["This is a test string"] * 5) - embedding_model = Mock() + embedding_fn = Mock() random_embedding = np.random.rand(5, dimension) - 
embedding_model.embed_documents.side_effect = [random_embedding] + embedding_fn.side_effect = [random_embedding] - store = VectorStore.from_df(df, embedding_model) + store = VectorStore.from_df(df, embedding_fn) assert store.index.d == dimension assert store.embeddings.shape == (5, 8) assert len(store.documents) == 5 @@ -52,12 +52,12 @@ def test_vector_store_similarity_search_with_score(): embeddings = np.repeat(np.arange(100)[:, None], 8, axis=1) documents = [Document({"feature": f"This is test string {idx + 1}"}) for idx in range(100)] - embedding_model = Mock() - embedding_model.embed_text.side_effect = [np.ones(dimension) * 49] + embedding_fn = Mock() + embedding_fn.side_effect = [np.ones((1, dimension)) * 49] - store = VectorStore(documents, embeddings, embedding_model) + store = VectorStore(documents, embeddings, embedding_fn) - query = "This is test string 50" + query = ["This is test string 50"] retrieved_elements = store.similarity_search_with_score(query, k=3) assert len(retrieved_elements) == 3 assert retrieved_elements[0][0].page_content == "This is test string 50" From 91d4a29432d7ceff9d17f5d111e915f7348fde78 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Fri, 26 Jan 2024 16:29:07 +0100 Subject: [PATCH 42/88] Make model name and description optional --- giskard/rag/knowledge_base_testset_generator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index ccf4c5fc7f..56f034521b 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -44,8 +44,8 @@ class KnowledgeBaseTestsetGenerator(BaseDataGenerator): a similarity threshold to filter irrelevant element from the knowledge base during context creation context_window_length: int = 8192 context window length of the llm used in the `llm_client` of the generator - embedding_model: EmbeddingsBase = None - an embedding model to build the knowledge base index + embedding_fn: Callable = None + an embedding function to build the knowledge base index language: str = "en" the language in which question are generated (following ISO 639-1) knowledge_base_features: Sequence[str] = None @@ -69,8 +69,8 @@ class KnowledgeBaseTestsetGenerator(BaseDataGenerator): def __init__( self, knowledge_df: pd.DataFrame, - model_name: str, - model_description: str, + model_name: str = "", + model_description: str = "", context_neighbors: int = 4, context_similarity_threshold: float = 0.2, context_window_length: int = 8192, From 08669e1ab96b7bf69204e43c0030b5777e1863a7 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Fri, 26 Jan 2024 17:01:29 +0100 Subject: [PATCH 43/88] Update documentation --- docs/open_source/testset_generation/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/open_source/testset_generation/index.md b/docs/open_source/testset_generation/index.md index ee008155a8..73f7a8b2f4 100644 --- a/docs/open_source/testset_generation/index.md +++ b/docs/open_source/testset_generation/index.md @@ -42,6 +42,7 @@ os.environ["OPENAI_API_KEY"] = "sk-…" ::::::{tab-item} Azure OpenAI Require `openai>=1.0.0` +Make sure that both the LLM and Embeddings models are both deployed on the Azure endpoint. The default embedding model used by the Giskard client is `text-embedding-ada-002`. 
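If your Azure deployment exposes the embedding model under a different name, note that the testset generator shown later in this guide accepts an `embedding_model` argument. A minimal sketch, assuming `knowledge_base_df` is the pandas DataFrame prepared in Step 1 and using a placeholder deployment name:

```python
from giskard.rag import KnowledgeBaseTestsetGenerator

# Sketch only: "my-embedding-deployment" is a hypothetical Azure deployment
# name; the generator defaults to "text-embedding-ada-002".
generator = KnowledgeBaseTestsetGenerator(
    knowledge_base_df,
    embedding_model="my-embedding-deployment",
)
```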
```python import os From 4e5957e8ea6d592d54aa36dc93b1a069f49de03f Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Fri, 26 Jan 2024 18:12:55 +0100 Subject: [PATCH 44/88] Improve handling embedding generation in llm client --- giskard/llm/client/openai.py | 40 ++++++++++++++++--- .../rag/knowledge_base_testset_generator.py | 19 +++++---- 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/giskard/llm/client/openai.py b/giskard/llm/client/openai.py index 5c67d0d522..b9932f4c9f 100644 --- a/giskard/llm/client/openai.py +++ b/giskard/llm/client/openai.py @@ -73,14 +73,13 @@ def complete( return LLMOutput(message=cc["content"], function_call=function_call) + @abstractmethod + def _embeddings_generation(self, texts: Sequence[str], model: str): + ... + def embeddings(self, texts: Sequence[str], model: str = "text-embedding-ada-002") -> np.ndarray: texts = [t.replace("\n", " ") for t in texts] - try: - out = self._client.embeddings.create(input=texts, model=model) - embeddings = [element.embedding for element in out.data] - except Exception as err: - print(err) - raise ValueError("Batched embedding creation failed.") from err + embeddings = self._embeddings_generation(texts, model) return np.stack(embeddings) @@ -131,6 +130,20 @@ def _completion( return completion["choices"][0]["message"] + def _embeddings_generation(self, texts: Sequence[str], model: str): + try: + out = openai.Embedding.create(input=texts, engine=model) + embeddings = [element["embedding"] for element in out["data"]] + + except openai.error.InvalidRequestError as err: + raise ValueError( + f"The embedding model: '{model}' was not found," + "make sure the model is correctly deployed on your endpoint." + ) from err + except Exception as err: + raise ValueError("Embedding creation failed.") from err + return embeddings + class OpenAIClient(BaseOpenAIClient): def __init__(self, model: str, client=None): @@ -173,3 +186,18 @@ def _completion( ) return completion.choices[0].message.model_dump() + + def _embeddings_generation(self, texts: Sequence[str], model: str): + try: + out = self._client.embeddings.create(input=texts, model=model) + embeddings = [element.embedding for element in out.data] + except openai.NotFoundError as err: + raise ValueError( + f"The embedding model: '{model}' was not found," + "make sure the model is correctly deployed on " + f"the specified endpoint: {self._client._base_url}." 
+ ) from err + except Exception as err: + raise ValueError("Embedding creation failed.") from err + + return embeddings diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index 56f034521b..2ab5184ebb 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -78,6 +78,7 @@ def __init__( knowledge_base_features: Sequence[str] = None, seed: int = None, include_examples: bool = True, + embedding_model: str = "text-embedding-ada-002", *args, **kwargs, ): @@ -86,14 +87,16 @@ def __init__( self.model_description = model_description self.context_neighbors = context_neighbors self.context_similarity_threshold = context_similarity_threshold - + self.embedding_model = embedding_model self.context_window_length = context_window_length self.language = language self.rng = np.random.default_rng(seed=seed) self.include_examples = include_examples self.knowledge_base = VectorStore.from_df( - knowledge_df, self.llm_client.embeddings, features=knowledge_base_features + knowledge_df, + lambda query: self.llm_client.embeddings(query, model=self.embedding_model), + features=knowledge_base_features, ) def _generate_question_answer_from_context(self, context): @@ -179,14 +182,14 @@ def generate_dataset(self, num_samples: int = 10) -> QATestset: Returns ------- - TestSet + QATestset The generated test set. + Each generated question has the following field: + - *question*: a question about a part of the knowledge base + - *reference_answer*: the expected answer according to the knowledge base + - *reference_context*: relevant elements directly extracted from the knowledge base + - *difficulty_level*: an indicator of how difficult the question is - Each generated question has the following field: - - question: a question about a part of the knowledge base - - reference_answer: the expected answer according to the knowledge base - - reference_context: relevant elements directly extracted from the knowledge base - - difficulty_level: an indicator of how difficult the question is """ generated_questions = [] for idx in range(num_samples): From 6313ced18b07faf4ab6b6e238f342c8e4aa10a96 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Mon, 29 Jan 2024 11:37:08 +0100 Subject: [PATCH 45/88] Reorder RAG in the doc --- docs/index.md | 4 ++-- docs/open_source/index.md | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/index.md b/docs/index.md index e985163b52..91c136c3f8 100644 --- a/docs/index.md +++ b/docs/index.md @@ -14,9 +14,9 @@ getting_started/quickstart/index open_source/installation_library/index open_source/scan/index +open_source/testset_generation/index open_source/customize_tests/index open_source/integrate_tests/index -open_source/testset_generation/index ``` ```{toctree} @@ -74,9 +74,9 @@ cli/index reference/models/index reference/datasets/index reference/scan/index +reference/rag-toolset/index reference/tests/index reference/slicing-functions/index -reference/rag-toolset/index reference/transformation-functions/index reference/push/index reference/suite/index diff --git a/docs/open_source/index.md b/docs/open_source/index.md index 8b348df2a6..5fb96bad77 100644 --- a/docs/open_source/index.md +++ b/docs/open_source/index.md @@ -6,9 +6,9 @@ installation_library/index scan/index +testset_generation/index customize_tests/index integrate_tests/index -testset_generation/index ``` ::::::{grid} 1 1 2 2 @@ -24,6 +24,11 @@ testset_generation/index :link: 
 scan/index.html
 ::::
+::::{grid-item-card} 🧰 RAG Toolset
+:text-align: center
+:link: testset_generation/index.html
+::::
+
 ::::{grid-item-card} 🧪 Customize your tests
 :text-align: center
 :link: customize_tests/index.html
@@ -32,9 +37,4 @@ testset_generation/index
 ::::{grid-item-card} 🔁 Integrate your tests
 :text-align: center
 :link: integrate_tests/index.html
-::::
-
-::::{grid-item-card} 🧰 RAG Toolset
-:text-align: center -:link: testset_generation/index.html :::: \ No newline at end of file From 2a6cef947e4670ddb36f6285c43445f8482aaae1 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Tue, 30 Jan 2024 15:59:55 +0100 Subject: [PATCH 46/88] Update docs with Rabah's comments --- docs/open_source/scan/scan_llm/index.md | 2 +- docs/open_source/testset_generation/index.md | 24 ++++++++++++++------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/docs/open_source/scan/scan_llm/index.md b/docs/open_source/scan/scan_llm/index.md index 84e4382682..cfca37ac4c 100644 --- a/docs/open_source/scan/scan_llm/index.md +++ b/docs/open_source/scan/scan_llm/index.md @@ -72,7 +72,7 @@ set_llm_model('my-gpt-4-model') We are now ready to start. - +(model-wrapping)= ## Step 1: Wrap your model Start by **wrapping your model**. This step is necessary to ensure a common format for your model and its metadata. diff --git a/docs/open_source/testset_generation/index.md b/docs/open_source/testset_generation/index.md index 73f7a8b2f4..2598d3c66b 100644 --- a/docs/open_source/testset_generation/index.md +++ b/docs/open_source/testset_generation/index.md @@ -84,15 +84,25 @@ Once the knowledge base is loaded as a pandas `DataFrame`, you can generate the from giskard.rag import KnowledgeBaseTestsetGenerator generator = KnowledgeBaseTestsetGenerator(knowledge_base_df, - model_name="Model name", - model_description="Description of the model", + model_name="Model name", # Optional, provide a name to your model to get better fitting questions + model_description="Description of the model", # Optional, briefly describe the task done by your model knowledge_base_features=["page_content"]) testset = generator.generate_dataset(num_samples=10) ``` -## Step 3: Evaluate your model -Once your testset is ready, you can evaluate your model using the `CorrectnessEvaluator`. This can be done directly or through a Giskard test which wraps the evaluator. The `CorrectnessEvaluator` asks a question to the given model and compares the model answer with the reference answer from the testset. Specifically, we use GPT-4 to assess whether the model answer is acceptable given the reference answer. +## Step 3: Wrap your model +To evaluate your model, you must wrap it as a `giskard.Model`. This step is necessary to ensure a common format for your model and its metadata.You can wrap anything as long as you can represent it in a Python function (for example an API call call to Azure or OpenAI). We also have pre-built wrappers for LangChain objects, or you can create your own wrapper by extending the `giskard.Model` class if you need to wrap a complex object such as a custom-made RAG communicating with a vectorstore. + +To do so, you can follow the instructions from the [LLM Scan feature](../scan/scan_llm/index.md#step-1-wrap-your-model) or from the {doc}`Reference API `. + +Detailed examples can also be found on our {doc}`LLM tutorials section `. + + +## Step 4: Evaluate your model +Once your `testset` is ready, you can evaluate your wrapped model using the `CorrectnessEvaluator`. This can be done directly or through a Giskard test which wraps the evaluator. The `CorrectnessEvaluator` asks a question to the given model and compares the model answer with the reference answer from the testset. Specifically, we use GPT-4 to assess whether the model answer is acceptable given the reference answer. 
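Whichever entry point you pick below, the evaluation yields an `EvaluationResult`. As a rough sketch (assuming the field names follow the samples built by `CorrectnessEvaluator` in this patch series, i.e. each failure example carries the testset row plus the model output and the judge's reason), the failing questions can then be inspected like this:

```python
# Sketch: iterate over the samples flagged as failures by the evaluator.
# Each sample is a dict with the original testset fields ("question", ...)
# plus "model_output" and the "reason" returned by the GPT-4 judge.
for sample in eval_result.failure_examples:
    print(sample["question"])
    print(sample["model_output"])
    print(sample["reason"])
```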
+ + :::::::{tab-set} ::::::{tab-item} Direct Evaluation @@ -101,7 +111,7 @@ The `CorrectnessEvaluator` asks all the questions from the testset to your model from giskard.llm.evaluators import CorrectnessEvaluator correctness_evaluator = CorrectnessEvaluator() -eval_result, failed_indices = correctness_evaluator.evaluate(model, dataset) +eval_result, failed_indices = correctness_evaluator.evaluate(giskard_model, testset) ``` :::::: ::::::{tab-item} Giskard test @@ -109,7 +119,7 @@ You can also evaluate your model with the `test_llm_correctness` function, which ```python from giskard.testing.tests.llm import test_llm_correctness -test_result = test_llm_correctness(model, testset, threshold=0.8).execute() +test_result = test_llm_correctness(giskard_model, testset, threshold=0.8).execute() ``` :::::: ::::::: @@ -123,7 +133,7 @@ The questions generated in the testset may have highlighted some vulnerabilities Turn the generated testset into an actionable test suite that you can save and reuse in further iterations. ```python -test_suite = scan_results.generate_test_suite("My first test suite") +test_suite = testset.to_test_suite("My first test suite") # You can run the test suite locally to verify that it reproduces the issues test_suite.run() From 48936019e4f125409465a0afcc001fbdcffb4b01 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Tue, 30 Jan 2024 16:00:47 +0100 Subject: [PATCH 47/88] Remove model name and description from prompt if both are not specified --- giskard/rag/prompts.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/giskard/rag/prompts.py b/giskard/rag/prompts.py index e6d7880285..d9e2e1328e 100644 --- a/giskard/rag/prompts.py +++ b/giskard/rag/prompts.py @@ -1,4 +1,4 @@ -QA_GENERATION_SYSTEM_PROMPT = """You are a powerful auditing AI, your role is to generate question answer pair from a given list of context paragraph to audit a model specialized on these knowledge. +QA_GENERATION_SYSTEM_PROMPT_MODEL = """You are a powerful auditing AI, your role is to generate question answer pair from a given list of context paragraph to audit a model specialized on these knowledge. The model you are auditing is the following: - Model name: {model_name} @@ -14,6 +14,18 @@ You will return the question and the precise answer to the question based exclusively on the provided context. Your output should be a single JSON object, with keys 'question' and 'answer'. Make sure you return a valid JSON object.""" +QA_GENERATION_SYSTEM_PROMPT = """You are a powerful auditing AI, your role is to generate question answer pair from a given list of context paragraph to audit a model specialized on these knowledge. + +Your task is to generate questions about the products, the ordering process and the shop's activities in general. Your question must be related to a provided context. +Please respect the following rules to generate the question: +- The answer to the question should be found inside the provided context +- The question must be self-contained +- The question and answer must be in this language: {language} + +You will be provided the context, consisting in multiple paragraphs delimited by dashes "------". +You will return the question and the precise answer to the question based exclusively on the provided context. +Your output should be a single JSON object, with keys 'question' and 'answer'. 
Make sure you return a valid JSON object.""" + QA_GENERATION_ASSISTANT_EXAMPLE = """{ "question": "For which countries can I track my shipping?", "answer": "We ship to all 50 states in the US, as well as to Canada and Mexico. We offer tracking for all our shippings." From f255c56e2aadc0c30bfa26dd4a117e5c38efd5c0 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Tue, 30 Jan 2024 16:03:38 +0100 Subject: [PATCH 48/88] Minor change + add some logging --- .../rag/knowledge_base_testset_generator.py | 23 ++++++++++++++----- giskard/rag/testset.py | 5 ++-- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index 2ab5184ebb..538e02c074 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -12,11 +12,13 @@ QA_GENERATION_ASSISTANT_EXAMPLE, QA_GENERATION_CONTEXT_EXAMPLE, QA_GENERATION_SYSTEM_PROMPT, + QA_GENERATION_SYSTEM_PROMPT_MODEL, ) from .testset import QATestset from .vector_store import VectorStore logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) class KnowledgeBaseTestsetGenerator(BaseDataGenerator): @@ -60,6 +62,7 @@ class KnowledgeBaseTestsetGenerator(BaseDataGenerator): """ _qa_generation_system_prompt = QA_GENERATION_SYSTEM_PROMPT + _qa_generation_system_prompt_model = QA_GENERATION_SYSTEM_PROMPT_MODEL _qa_generation_context_example = QA_GENERATION_CONTEXT_EXAMPLE _qa_generation_assistant_example = QA_GENERATION_ASSISTANT_EXAMPLE _fix_json_prompt = FIX_JSON_FORMAT_PROMPT @@ -100,14 +103,21 @@ def __init__( ) def _generate_question_answer_from_context(self, context): + if self.model_name is not None or self.model_description is not None: + system_prompt = self._qa_generation_system_prompt_model.format( + model_name=self.model_name, + model_description=self.model_description, + language=self.language, + ) + else: + system_prompt = self._qa_generation_system_prompt.format( + language=self.language, + ) + messages = [ { "role": "system", - "content": self._qa_generation_system_prompt.format( - model_name=self.model_name, - model_description=self.model_description, - language=self.language, - ), + "content": system_prompt, } ] if self.include_examples: @@ -193,6 +203,7 @@ def generate_dataset(self, num_samples: int = 10) -> QATestset: """ generated_questions = [] for idx in range(num_samples): + logger.info(f"Generating question {idx + 1}/{num_samples}") seed_contexts = self._extract_seed_context() context = self._format_context(seed_contexts) @@ -208,4 +219,4 @@ def generate_dataset(self, num_samples: int = 10) -> QATestset: } ) - return QATestset(df=pd.DataFrame(generated_questions)) + return QATestset(df=pd.DataFrame(generated_questions), target=None) diff --git a/giskard/rag/testset.py b/giskard/rag/testset.py index 13450776d7..d410f11e14 100644 --- a/giskard/rag/testset.py +++ b/giskard/rag/testset.py @@ -7,8 +7,9 @@ class QATestset(Dataset): of a `Suite` based on the question/answer pairs inside the `TestSet`. 
""" - def to_test_suite(self): + def to_test_suite(self, name=None): suite_default_params = {"dataset": self} - suite = Suite(name="Test suite generated from testset", default_params=suite_default_params) + name = name or "Test suite generated from testset" + suite = Suite(name=name, default_params=suite_default_params) suite.add_test(test_llm_correctness, "TestsetCorrectnessTest", "TestsetCorrectnessTest") return suite From 6d0f111e62f802b574de18e6f0f9d9f5b0c1bd26 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Tue, 30 Jan 2024 16:04:22 +0100 Subject: [PATCH 49/88] Compute embedding in chunks to respect OpenAI API limits --- giskard/llm/client/openai.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/giskard/llm/client/openai.py b/giskard/llm/client/openai.py index b9932f4c9f..cb5b23cb49 100644 --- a/giskard/llm/client/openai.py +++ b/giskard/llm/client/openai.py @@ -22,6 +22,8 @@ class BaseOpenAIClient(LLMClient, ABC): + _max_embedding_chunk_size = 2048 + def __init__(self, model: str): self._logger = LLMLogger() self.model = model @@ -77,10 +79,17 @@ def complete( def _embeddings_generation(self, texts: Sequence[str], model: str): ... - def embeddings(self, texts: Sequence[str], model: str = "text-embedding-ada-002") -> np.ndarray: + def embeddings( + self, texts: Sequence[str], model: str = "text-embedding-ada-002", chunk_size: int = 2048 + ) -> np.ndarray: texts = [t.replace("\n", " ") for t in texts] - embeddings = self._embeddings_generation(texts, model) - return np.stack(embeddings) + if not isinstance(chunk_size, int) or chunk_size > self._max_embedding_chunk_size or chunk_size < 1: + raise ValueError(f"Chunk size must be an integer between 0 and {self._max_embedding_chunk_size}.") + + chunks_indices = list(range(chunk_size, len(texts), chunk_size)) + chunks = np.split(texts, chunks_indices) + embedded_chunks = [self._embeddings_generation(list(chunk), model) for chunk in chunks] + return np.stack([emb for embeddings in embedded_chunks for emb in embeddings]) class LegacyOpenAIClient(BaseOpenAIClient): From 0ec5ab38fd60540599fcdfd7a89af7178fa918d6 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Tue, 30 Jan 2024 16:06:01 +0100 Subject: [PATCH 50/88] Remove failed_indices from Correctness evaluator and add TestResultDetails --- giskard/llm/evaluators/correctness.py | 34 +++++++++---------- giskard/testing/tests/llm/correctness.py | 9 +++-- .../evaluators/test_correctness_evaluator.py | 4 +-- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/giskard/llm/evaluators/correctness.py b/giskard/llm/evaluators/correctness.py index 91bdf86427..4c3cd0727f 100644 --- a/giskard/llm/evaluators/correctness.py +++ b/giskard/llm/evaluators/correctness.py @@ -1,11 +1,8 @@ -from giskard.datasets import Dataset -from giskard.llm.errors import LLMGenerationError -from giskard.llm.evaluators.base import ( - EVALUATE_MODEL_FUNCTIONS, - EvaluationResult, - LLMBasedEvaluator, -) -from giskard.models.base.model import BaseModel +from ...core.test_result import TestResultStatus, create_test_result_details +from ...datasets import Dataset +from ...models.base.model import BaseModel +from ..errors import LLMGenerationError +from .base import EVALUATE_MODEL_FUNCTIONS, EvaluationResult, LLMBasedEvaluator CORRECTNESS_EVALUATION_PROMPT = """Your role is to test AI models. Your task consists in assessing whether a model output correctly answers a question. You are provided with the ground truth answer to the question. 
Your task is then to evaluate if the model answer is close to the ground thruth answer. @@ -89,7 +86,8 @@ def evaluate( succeeded = [] failed = [] errored = [] - failed_index = [] + status = [] + reasons = [] for idx, (evaluation_question, model_output) in enumerate(zip(dataset.df.to_dict("records"), model_outputs)): try: passed, reason = self._evaluate_single( @@ -98,6 +96,7 @@ def evaluate( evaluation_question[reference_answer_feature_name], model_output, ) + reasons.append(reason) sample = { **evaluation_question, "reason": reason, @@ -106,19 +105,20 @@ def evaluate( } if passed: succeeded.append(sample) + status.append(TestResultStatus.PASSED) else: - failed_index.append(idx) + status.append(TestResultStatus.FAILED) failed.append(sample) except LLMGenerationError as err: + status.append(TestResultStatus.ERROR) + reasons.append(str(err)) errored.append({"message": str(err), "sample": {**evaluation_question, "model_output": model_output}}) - return ( - EvaluationResult( - failure_examples=failed, - success_examples=succeeded, - errors=errored, - ), - failed_index, + return EvaluationResult( + failure_examples=failed, + success_examples=succeeded, + errors=errored, + details=create_test_result_details(dataset, model, model_outputs, status, {"reason": reasons}), ) def _evaluate_single(self, model: BaseModel, question, reference_answer, model_output): diff --git a/giskard/testing/tests/llm/correctness.py b/giskard/testing/tests/llm/correctness.py index 27c08c7827..69764b894b 100644 --- a/giskard/testing/tests/llm/correctness.py +++ b/giskard/testing/tests/llm/correctness.py @@ -1,4 +1,4 @@ -from ....core.test_result import TestResult +from ....core.test_result import TestResult, TestResultStatus from ....datasets.base import Dataset from ....llm.evaluators import CorrectnessEvaluator from ....models.base import BaseModel @@ -32,10 +32,13 @@ def test_llm_correctness(model: BaseModel, dataset: Dataset, threshold: float = A TestResult object containing the test result. 
""" correctness_evaluator = CorrectnessEvaluator() - eval_result, failed_idx = correctness_evaluator.evaluate(model, dataset) + eval_result = correctness_evaluator.evaluate(model, dataset) output_ds = list() if not eval_result.passed: - output_ds.append(dataset.slice(lambda df: df.loc[failed_idx], row_level=False)) + failed_indices = [ + idx for idx, status in enumerate(eval_result.details.results) if status == TestResultStatus.FAILED + ] + output_ds.append(dataset.slice(lambda df: df.loc[failed_indices], row_level=False)) passed = bool(eval_result.passed_ratio > threshold) diff --git a/tests/llm/evaluators/test_correctness_evaluator.py b/tests/llm/evaluators/test_correctness_evaluator.py index 58e15820a2..368947c4c3 100644 --- a/tests/llm/evaluators/test_correctness_evaluator.py +++ b/tests/llm/evaluators/test_correctness_evaluator.py @@ -63,7 +63,7 @@ def test_correctness_evaluator_correctly_flags_examples(): evaluator = CorrectnessEvaluator(llm_client=client) - result, failed_indices = evaluator.evaluate(model, dataset) + result = evaluator.evaluate(model, dataset) assert len(result.success_examples) == 1 assert len(result.failure_examples) == 1 @@ -112,7 +112,7 @@ def test_correctness_evaluator_handles_generation_errors(): evaluator = CorrectnessEvaluator(llm_client=client) - result, failed_indices = evaluator.evaluate(model, dataset) + result = evaluator.evaluate(model, dataset) assert len(result.success_examples) == 1 assert len(result.errors) == 1 From f7f557a98ac534ce7149ae747b7d891775b94e9b Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 31 Jan 2024 10:22:07 +0100 Subject: [PATCH 51/88] Add prompt template for easier prompt formatting --- giskard/rag/prompts.py | 129 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 126 insertions(+), 3 deletions(-) diff --git a/giskard/rag/prompts.py b/giskard/rag/prompts.py index d9e2e1328e..7a690041bc 100644 --- a/giskard/rag/prompts.py +++ b/giskard/rag/prompts.py @@ -1,10 +1,10 @@ -QA_GENERATION_SYSTEM_PROMPT_MODEL = """You are a powerful auditing AI, your role is to generate question answer pair from a given list of context paragraph to audit a model specialized on these knowledge. +QA_GENERATION_SYSTEM_PROMPT_WITH_DESCRIPTION = """You are a powerful auditing AI, your role is to generate question answer pair from a given list of context paragraph to audit a model specialized on these knowledge. The model you are auditing is the following: - Model name: {model_name} - Model description: {model_description} -Your task is to generate questions about the products, the ordering process and the shop's activities in general. Your question must be related to a provided context. +Your question must be related to a provided context. Please respect the following rules to generate the question: - The answer to the question should be found inside the provided context - The question must be self-contained @@ -16,7 +16,7 @@ QA_GENERATION_SYSTEM_PROMPT = """You are a powerful auditing AI, your role is to generate question answer pair from a given list of context paragraph to audit a model specialized on these knowledge. -Your task is to generate questions about the products, the ordering process and the shop's activities in general. Your question must be related to a provided context. +Your question must be related to a provided context. 
Please respect the following rules to generate the question: - The answer to the question should be found inside the provided context - The question must be self-contained @@ -45,3 +45,126 @@ """ FIX_JSON_FORMAT_PROMPT = """Fix the following json string so it contains a single valid json. Make sure to start and end with curly brackets.""" + + +class QAGenerationPrompt: + system_prompt_with_description = QA_GENERATION_SYSTEM_PROMPT_WITH_DESCRIPTION + system_prompt_raw = QA_GENERATION_SYSTEM_PROMPT + example_prompt = QA_GENERATION_CONTEXT_EXAMPLE + example_answer = QA_GENERATION_ASSISTANT_EXAMPLE + + @classmethod + def format_system_prompt(cls, model_name, model_description, language): + language = language or "en" + if model_name is not None or model_description is not None: + system_prompt = cls.system_prompt_with_description.format( + model_name=model_name, + model_description=model_description, + language=language, + ) + else: + system_prompt = cls.system_prompt_raw.format( + language=language, + ) + system_message = { + "role": "system", + "content": system_prompt, + } + return system_message + + @classmethod + def format_examples(cls, examples): + if examples is not None: + return examples + elif cls.example_prompt is not None: + examples = [] + if cls.example_prompt is not None: + examples.append({"role": "user", "content": cls.example_prompt}) + if cls.example_prompt is not None: + examples.append({"role": "assistant", "content": cls.example_answer}) + return examples + return [] + + @classmethod + def create_messages( + cls, + model_name=None, + model_description=None, + language=None, + add_examples=False, + examples=None, + user_content=None, + ): + messages = list() + + messages.append(cls.format_system_prompt(model_name, model_description, language)) + if add_examples: + messages.extend(cls.format_examples(examples)) + + if user_content is not None: + messages.append({"role": "user", "content": user_content}) + + return messages + + +COMPLEXIFICATION_SYSTEM_PROMPT_WITH_DESCRIPTION = """You are an expert at writing questions. +Your task is to re-write questions that will be used to evaluate the following model: +- Model name: {model_name} +- Model description: {model_description} + +Respect the following rules to reformulate the question: +- The re-written question should not be longer than the original question by up to 10 to 15 words. +- The re-written question should be more elaborated than the original, use elements from the context to enrich the questions. +- The re-written question should be more difficult to handle for AI models but it must be understood and answerable by humans. +- Add one or more constraints / conditions to the question. +- The re-written question must be in {language}. + +You will be provided the question delimited with tags. +You will also be provided a relevant context which contain the answer to the question, delimited with tags. It consists in multiple paragraphs delimited by dashes "------". +You will return the reformulated question as a single JSON object, with the key 'question'. Make sure you return a valid JSON object. +""" + +COMPLEXIFICATION_SYSTEM_PROMPT = """You are an expert at writing questions. +Your task is to re-write questions that will be used to evaluate a language model. + +Respect the following rules to reformulate the question: +- The re-written question should not be longer than the original question by up to 10 to 15 words. 
+- The re-written question should be more elaborated than the original, use elements from the context to enrich the questions. +- The re-written question should be more difficult to handle for AI models but it must be understood and answerable by humans. +- Add one or more constraints / conditions to the question. +- The re-written question must be in {language}. + +You will be provided the question delimited with tags. +You will also be provided a relevant context which contain the answer to the question, delimited with tags. It consists in multiple paragraphs delimited by dashes "------". +You will return the reformulated question as a single JSON object, with the key 'question'. Make sure you return a valid JSON object. +""" + +COMPLEXIFICATION_PROMPT_EXAMPLE = """ +For which countries can I track my shipping? + + + +What payment methods do you accept? + +\tWe accept a variety of payment methods to provide our customers with a convenient and secure shopping experience. You can make a purchase using major credit and debit cards, including Visa, Mastercard, American Express, and Discover. We also offer the option to pay with popular digital wallets such as PayPal and Google Pay. For added flexibility, you can choose to complete your order using bank transfers or wire transfers. Rest assured that we prioritize the security of your personal information and go the extra mile to ensure your transactions are processed safely. +------ +\tWhat is your shipping policy? + +We offer free shipping on all orders over $50. For orders below $50, we charge a flat rate of $5.99. We offer shipping services to customers residing in all 50\n states of the US, in addition to providing delivery options to Canada and Mexico. +------ +\tHow can I track my order? + +Tracking your order is a breeze! Once your purchase has been successfully confirmed and shipped, you will receive a confirmation email containing your tracking number. You can simply click on the link provided in the email or visit our website's order tracking page. Enter your tracking number, and you will be able to monitor the progress of your shipment in real-time. This way, you can stay updated on the estimated delivery date and ensure you're available to receive your package. + +""" + +COMPLEXIFICATION_ANSWER_EXAMPLE = """{ + "question": "Can you provide my a list of the countries from which I can follow the advancement of the delivery of my shipping?" 
+}""" + + +class QuestionComplexificationPrompt(QAGenerationPrompt): + system_prompt_with_description = COMPLEXIFICATION_SYSTEM_PROMPT_WITH_DESCRIPTION + system_prompt_raw = COMPLEXIFICATION_SYSTEM_PROMPT + example_prompt = COMPLEXIFICATION_PROMPT_EXAMPLE + example_answer = COMPLEXIFICATION_ANSWER_EXAMPLE From 70f0ed4f9ef8754dfb1a5337d2f5dd0dbd0432f2 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 31 Jan 2024 10:22:57 +0100 Subject: [PATCH 52/88] Add difficulty level in question generation --- giskard/rag/__init__.py | 4 +- .../rag/knowledge_base_testset_generator.py | 133 ++++++++++-------- giskard/rag/vector_store.py | 3 + .../test_knowledge_base_testset_generator.py | 6 +- 4 files changed, 83 insertions(+), 63 deletions(-) diff --git a/giskard/rag/__init__.py b/giskard/rag/__init__.py index f01a6d5ccc..1149de1ade 100644 --- a/giskard/rag/__init__.py +++ b/giskard/rag/__init__.py @@ -1,4 +1,4 @@ -from .knowledge_base_testset_generator import KnowledgeBaseTestsetGenerator +from .knowledge_base_testset_generator import DifficultyLevel, KnowledgeBaseTestsetGenerator from .testset import QATestset -__all__ = ["KnowledgeBaseTestsetGenerator", "QATestset"] +__all__ = ["KnowledgeBaseTestsetGenerator", "QATestset", "DifficultyLevel"] diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index 538e02c074..82f0ec6a4e 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -2,18 +2,13 @@ import json import logging +from enum import Enum import numpy as np import pandas as pd from ..llm.generators import BaseDataGenerator -from .prompts import ( - FIX_JSON_FORMAT_PROMPT, - QA_GENERATION_ASSISTANT_EXAMPLE, - QA_GENERATION_CONTEXT_EXAMPLE, - QA_GENERATION_SYSTEM_PROMPT, - QA_GENERATION_SYSTEM_PROMPT_MODEL, -) +from .prompts import FIX_JSON_FORMAT_PROMPT, QAGenerationPrompt, QuestionComplexificationPrompt from .testset import QATestset from .vector_store import VectorStore @@ -21,6 +16,11 @@ logger.setLevel(logging.INFO) +class DifficultyLevel(int, Enum): + DIFF_1 = 1 + DIFF_2 = 2 + + class KnowledgeBaseTestsetGenerator(BaseDataGenerator): """Testset generator for testing RAG models. 
@@ -61,19 +61,17 @@ class KnowledgeBaseTestsetGenerator(BaseDataGenerator): seed: int = None """ - _qa_generation_system_prompt = QA_GENERATION_SYSTEM_PROMPT - _qa_generation_system_prompt_model = QA_GENERATION_SYSTEM_PROMPT_MODEL - _qa_generation_context_example = QA_GENERATION_CONTEXT_EXAMPLE - _qa_generation_assistant_example = QA_GENERATION_ASSISTANT_EXAMPLE + # _qa_generation_system_prompt = QA_GENERATION_SYSTEM_PROMPT + # _qa_generation_system_prompt_model = QA_GENERATION_SYSTEM_PROMPT_MODEL + # _qa_generation_context_example = QA_GENERATION_CONTEXT_EXAMPLE + # _qa_generation_assistant_example = QA_GENERATION_ASSISTANT_EXAMPLE _fix_json_prompt = FIX_JSON_FORMAT_PROMPT - _difficulty_level = 1 - def __init__( self, knowledge_df: pd.DataFrame, - model_name: str = "", - model_description: str = "", + model_name: str = None, + model_description: str = None, context_neighbors: int = 4, context_similarity_threshold: float = 0.2, context_window_length: int = 8192, @@ -102,41 +100,47 @@ def __init__( features=knowledge_base_features, ) + def _difficulty_level_mapping(self, level: DifficultyLevel): + match level: + case DifficultyLevel.DIFF_1: + return self._generate_question_answer_from_context + case DifficultyLevel.DIFF_2: + return self._generate_complex_questions_from_context + case _: + raise NotImplementedError(f"Missing case for difficulty level {level}.") + def _generate_question_answer_from_context(self, context): - if self.model_name is not None or self.model_description is not None: - system_prompt = self._qa_generation_system_prompt_model.format( - model_name=self.model_name, - model_description=self.model_description, - language=self.language, - ) - else: - system_prompt = self._qa_generation_system_prompt.format( - language=self.language, - ) + messages = QAGenerationPrompt.create_messages( + model_name=self.model_name, + model_description=self.model_description, + language=self.language, + user_content=context, + ) - messages = [ - { - "role": "system", - "content": system_prompt, - } - ] - if self.include_examples: - messages.extend( - [ - {"role": "user", "content": self._qa_generation_context_example}, - {"role": "assistant", "content": self._qa_generation_assistant_example}, - ] - ) - messages.append({"role": "user", "content": context}) + generated_qa = self._llm_complete(messages=messages) + generated_qa["difficulty"] = DifficultyLevel.DIFF_1 + return generated_qa + + def _generate_complex_questions_from_context(self, context): + generated_qa = self._generate_question_answer_from_context(context) - return self._llm_complete(messages=messages) + messages = QuestionComplexificationPrompt.create_messages( + model_name=self.model_name, + model_description=self.model_description, + language=self.language, + user_content=self._format_question_context_for_complexification(generated_qa["question"], context), + ) + generated_qa["difficulty"] = DifficultyLevel.DIFF_2 + out = self._llm_complete(messages=messages) + generated_qa["question"] = out["question"] + return generated_qa def _extract_seed_context(self): - seed_context = self.rng.choice(self.knowledge_base.documents) + seed_embedding = self.rng.choice(self.knowledge_base.embeddings) relevant_contexts = [ context - for (context, score) in self.knowledge_base.similarity_search_with_score( - [seed_context.page_content], k=self.context_neighbors + for (context, score) in self.knowledge_base.vector_similarity_search_with_score( + seed_embedding[None], k=self.context_neighbors ) if score < self.context_similarity_threshold # should 
we keep it or not ? ] @@ -146,6 +150,10 @@ def _format_context(self, contexts): context_string = "\n------\n".join(["", *[doc.page_content for doc in contexts], ""]) return context_string + def _format_question_context_for_complexification(self, question, context): + context_string = f"\n{question}\n\n\n{context}\n" + return context_string + def _prevent_context_window_overflow(self, prompt): # Prevent context overflow # general rule of thumbs to count tokens: 1 token ~ 4 characters @@ -182,7 +190,7 @@ def _try_fix_json_message(self, incorrect_json): return None return corrected_message - def generate_dataset(self, num_samples: int = 10) -> QATestset: + def generate_dataset(self, num_samples: int = 10, difficulty_levels: Sequence[DifficultyLevel] = None) -> QATestset: """Generates a testset from the knowledge base. Parameters @@ -201,22 +209,27 @@ def generate_dataset(self, num_samples: int = 10) -> QATestset: - *difficulty_level*: an indicator of how difficult the question is """ + difficulty_levels = difficulty_levels or [DifficultyLevel.DIFF_1] generated_questions = [] - for idx in range(num_samples): - logger.info(f"Generating question {idx + 1}/{num_samples}") - seed_contexts = self._extract_seed_context() - context = self._format_context(seed_contexts) - - generated_qa = self._generate_question_answer_from_context(context) - - if generated_qa is not None: - generated_questions.append( - { - "question": generated_qa["question"], - "reference_answer": generated_qa["answer"], - "reference_context": context, - "difficulty_level": self._difficulty_level, - } - ) + for level in difficulty_levels: + for idx in range(num_samples): + logger.info(f"Generating question {idx + 1}/{num_samples} for difficulty level {level}.") + seed_contexts = self._extract_seed_context() + context = self._format_context(seed_contexts) + + generation_fn = self._difficulty_level_mapping(level) + generated_qa = generation_fn(context) + + if generated_qa is not None: + generated_questions.append( + { + "question": generated_qa["question"], + "reference_answer": generated_qa["answer"], + "reference_context": context, + "difficulty_level": generated_qa["difficulty"], + } + ) + else: + logger.warning("Error in question generation, skipping it.") return QATestset(df=pd.DataFrame(generated_questions), target=None) diff --git a/giskard/rag/vector_store.py b/giskard/rag/vector_store.py index 3821f15f60..1e8c65eb40 100644 --- a/giskard/rag/vector_store.py +++ b/giskard/rag/vector_store.py @@ -59,5 +59,8 @@ def from_df(cls, df: pd.DataFrame, embedding_fn: Callable, features: Sequence[st def similarity_search_with_score(self, query: Sequence[str], k: int) -> Sequence: query_emb = self.embedding_fn(query).astype("float32") + return self.vector_similarity_search_with_score(query_emb, k) + + def vector_similarity_search_with_score(self, query_emb: np.ndarray, k: int) -> Sequence: distances, indices = self.index.search(query_emb, k) return [(self.documents[i], d) for d, i in zip(distances[0], indices[0])] diff --git a/tests/rag/test_knowledge_base_testset_generator.py b/tests/rag/test_knowledge_base_testset_generator.py index b42b81bd3f..2b6bc77505 100644 --- a/tests/rag/test_knowledge_base_testset_generator.py +++ b/tests/rag/test_knowledge_base_testset_generator.py @@ -53,7 +53,8 @@ def test_testset_generation(): # each mock embedding calls. 
kb_embeddings = np.ones((4, embedding_dimension)) * np.arange(4)[:, None] / 100 query_embeddings = np.ones((2, embedding_dimension)) * np.array([0.02, 10])[:, None] - llm_client.embeddings.side_effect = [kb_embeddings] + list(query_embeddings[:, None, :]) + + llm_client.embeddings.side_effect = [kb_embeddings] knowledge_base_df = make_knowledge_base_df() testset_generator = KnowledgeBaseTestsetGenerator( @@ -63,6 +64,9 @@ def test_testset_generation(): llm_client=llm_client, context_neighbors=3, ) + testset_generator.rng = Mock() + testset_generator.rng.choice = Mock() + testset_generator.rng.choice.side_effect = list(query_embeddings) assert testset_generator.knowledge_base.index.d == 8 assert testset_generator.knowledge_base.embeddings.shape == (4, 8) From e64b0f29828e97063f88192f0f490e9c70f1a57e Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 31 Jan 2024 10:29:24 +0100 Subject: [PATCH 53/88] Add missing llm dependency --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1511b4ed78..e24d9b29e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -181,8 +181,8 @@ llm = [ "openai", "evaluate>=0.4.1", "bert-score>=0.3.13", + "tenacity>=4.11.0", "faiss-cpu>=1.7.4", - ] hub = [ From 9acee2ab6054b13b0046947831b28f7b8f91c1c9 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 31 Jan 2024 11:11:10 +0100 Subject: [PATCH 54/88] Update RAG docs --- docs/open_source/testset_generation/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/open_source/testset_generation/index.md b/docs/open_source/testset_generation/index.md index 2598d3c66b..402cecaf22 100644 --- a/docs/open_source/testset_generation/index.md +++ b/docs/open_source/testset_generation/index.md @@ -94,7 +94,7 @@ testset = generator.generate_dataset(num_samples=10) ## Step 3: Wrap your model To evaluate your model, you must wrap it as a `giskard.Model`. This step is necessary to ensure a common format for your model and its metadata.You can wrap anything as long as you can represent it in a Python function (for example an API call call to Azure or OpenAI). We also have pre-built wrappers for LangChain objects, or you can create your own wrapper by extending the `giskard.Model` class if you need to wrap a complex object such as a custom-made RAG communicating with a vectorstore. -To do so, you can follow the instructions from the [LLM Scan feature](../scan/scan_llm/index.md#step-1-wrap-your-model) or from the {doc}`Reference API `. +To do so, you can follow the instructions from the [LLM Scan feature](../scan/scan_llm/index.md#step-1-wrap-your-model) or from the {doc}`Reference API `. Make sure that you pass `feature_names = "question"` when wrapping your model, so that it matches the question column of the testset. Detailed examples can also be found on our {doc}`LLM tutorials section `. 
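The docs change above only states the requirement on `feature_names`; the following is a minimal sketch, not part of any patch, of what such a wrapper could look like. The function body, the model name, and the description are placeholders, and the only assumptions are the public `giskard.Model` wrapper and the convention that the wrapped callable receives a DataFrame whose `question` column matches the generated testset.

```python
import pandas as pd

import giskard


def answer_question(df: pd.DataFrame) -> list:
    # Placeholder for a real RAG pipeline: replace this body with a call to
    # your chain or API. The callable receives a DataFrame with one row per question.
    return [f"(answer to: {question})" for question in df["question"]]


giskard_model = giskard.Model(
    model=answer_question,
    model_type="text_generation",
    name="Shop assistant",  # placeholder name
    description="Answers questions about shipping and payment policies.",  # placeholder description
    feature_names=["question"],  # must match the question column of the generated testset
)
```

Once wrapped this way, `giskard_model` can be passed to the scan or to the test suite generated from the testset, as described in the docs above.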
From ca05d216173adeb8c22b4c244a51411542322bda Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 31 Jan 2024 18:51:29 +0100 Subject: [PATCH 55/88] Add minor fixes from last test session with Rabah --- docs/open_source/testset_generation/index.md | 4 ++-- giskard/rag/testset.py | 12 ++++++++++++ giskard/testing/tests/llm/correctness.py | 5 ++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/docs/open_source/testset_generation/index.md b/docs/open_source/testset_generation/index.md index 402cecaf22..be33f5e5bc 100644 --- a/docs/open_source/testset_generation/index.md +++ b/docs/open_source/testset_generation/index.md @@ -130,13 +130,13 @@ The questions generated in the testset may have highlighted some vulnerabilities ### 1. Generate a test suite from the testset: -Turn the generated testset into an actionable test suite that you can save and reuse in further iterations. +Turn the generated testset into an actionable test suite that you can save and reuse in further iterations. Note that you need to pass your wrapped model when executing the suite, since the suite was generated only from the testset. ```python test_suite = testset.to_test_suite("My first test suite") # You can run the test suite locally to verify that it reproduces the issues -test_suite.run() +test_suite.run(giskard_model) ``` Jump to the [test customization](https://docs.giskard.ai/en/latest/open_source/customize_tests/index.html) and [test integration](https://docs.giskard.ai/en/latest/open_source/integrate_tests/index.html) sections to find out everything you can do with test suites. diff --git a/giskard/rag/testset.py b/giskard/rag/testset.py index d410f11e14..ad01267572 100644 --- a/giskard/rag/testset.py +++ b/giskard/rag/testset.py @@ -13,3 +13,15 @@ def to_test_suite(self, name=None): suite = Suite(name=name, default_params=suite_default_params) suite.add_test(test_llm_correctness, "TestsetCorrectnessTest", "TestsetCorrectnessTest") return suite + + def copy(self): + testset = QATestset( + df=self.df.copy(), + target=self.target, + column_types=self.column_types.copy(), + validation=False, + ) + + if hasattr(self, "column_meta"): + testset.load_metadata_from_instance(self.column_meta) + return testset diff --git a/giskard/testing/tests/llm/correctness.py b/giskard/testing/tests/llm/correctness.py index 69764b894b..6b9b558136 100644 --- a/giskard/testing/tests/llm/correctness.py +++ b/giskard/testing/tests/llm/correctness.py @@ -36,8 +36,11 @@ def test_llm_correctness(model: BaseModel, dataset: Dataset, threshold: float = output_ds = list() if not eval_result.passed: failed_indices = [ - idx for idx, status in enumerate(eval_result.details.results) if status == TestResultStatus.FAILED + idx + for idx, status in zip(dataset.df.index, eval_result.details.results) + if status == TestResultStatus.FAILED ] + output_ds.append(dataset.slice(lambda df: df.loc[failed_indices], row_level=False)) passed = bool(eval_result.passed_ratio > threshold) From f14ce82260d0a8e36fa11c17645bc71fabd78a8e Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 31 Jan 2024 10:29:24 +0100 Subject: [PATCH 56/88] Add missing llm dependency --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1511b4ed78..e24d9b29e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -181,8 +181,8 @@ llm = [ "openai", "evaluate>=0.4.1", "bert-score>=0.3.13", + "tenacity>=4.11.0", "faiss-cpu>=1.7.4", - ] hub = [ From 009041699dff1c9b4ae7c0b10eb46d2267e03950 Mon Sep 17 
00:00:00 2001 From: Pierre Le Jeune Date: Wed, 31 Jan 2024 11:11:10 +0100 Subject: [PATCH 57/88] Update RAG docs --- docs/open_source/testset_generation/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/open_source/testset_generation/index.md b/docs/open_source/testset_generation/index.md index 2598d3c66b..402cecaf22 100644 --- a/docs/open_source/testset_generation/index.md +++ b/docs/open_source/testset_generation/index.md @@ -94,7 +94,7 @@ testset = generator.generate_dataset(num_samples=10) ## Step 3: Wrap your model To evaluate your model, you must wrap it as a `giskard.Model`. This step is necessary to ensure a common format for your model and its metadata.You can wrap anything as long as you can represent it in a Python function (for example an API call call to Azure or OpenAI). We also have pre-built wrappers for LangChain objects, or you can create your own wrapper by extending the `giskard.Model` class if you need to wrap a complex object such as a custom-made RAG communicating with a vectorstore. -To do so, you can follow the instructions from the [LLM Scan feature](../scan/scan_llm/index.md#step-1-wrap-your-model) or from the {doc}`Reference API `. +To do so, you can follow the instructions from the [LLM Scan feature](../scan/scan_llm/index.md#step-1-wrap-your-model) or from the {doc}`Reference API `. Make sure that you pass `feature_names = "question"` when wrapping your model, so that it matches the question column of the testset. Detailed examples can also be found on our {doc}`LLM tutorials section `. From a756cea990a17009b0c89cc2a829d38c2bbba02d Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 31 Jan 2024 18:51:29 +0100 Subject: [PATCH 58/88] Add minor fixes from last test session with Rabah --- docs/open_source/testset_generation/index.md | 4 ++-- giskard/rag/testset.py | 12 ++++++++++++ giskard/testing/tests/llm/correctness.py | 5 ++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/docs/open_source/testset_generation/index.md b/docs/open_source/testset_generation/index.md index 402cecaf22..be33f5e5bc 100644 --- a/docs/open_source/testset_generation/index.md +++ b/docs/open_source/testset_generation/index.md @@ -130,13 +130,13 @@ The questions generated in the testset may have highlighted some vulnerabilities ### 1. Generate a test suite from the testset: -Turn the generated testset into an actionable test suite that you can save and reuse in further iterations. +Turn the generated testset into an actionable test suite that you can save and reuse in further iterations. Note that you need to pass your wrapped model when executing the suite, since the suite was generated only from the testset. ```python test_suite = testset.to_test_suite("My first test suite") # You can run the test suite locally to verify that it reproduces the issues -test_suite.run() +test_suite.run(giskard_model) ``` Jump to the [test customization](https://docs.giskard.ai/en/latest/open_source/customize_tests/index.html) and [test integration](https://docs.giskard.ai/en/latest/open_source/integrate_tests/index.html) sections to find out everything you can do with test suites. 
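The docs hunks above show the workflow in fragments; the sketch below, which is hypothetical and not part of the patch, ties together the pieces introduced so far (testset generation with difficulty levels, suite creation, and supplying the wrapped model only at run time). It assumes an LLM client is configured, that `giskard_model` is a wrapped model as in the sketch after patch 54, and uses a toy two-row knowledge base.

```python
import pandas as pd

from giskard.rag import DifficultyLevel, KnowledgeBaseTestsetGenerator

# Toy knowledge base with the "page_content" column expected by the generator.
knowledge_base_df = pd.DataFrame(
    {
        "page_content": [
            "We ship to all 50 US states, Canada and Mexico.",
            "Orders over $50 ship for free; otherwise a $5.99 flat rate applies.",
        ]
    }
)

generator = KnowledgeBaseTestsetGenerator(
    knowledge_base_df,
    model_name="Shop assistant",  # placeholder
    model_description="Answers questions about shipping and payment policies.",  # placeholder
    knowledge_base_features=["page_content"],
)

# Generates num_samples questions per requested difficulty level (LLM calls happen here).
testset = generator.generate_dataset(
    num_samples=10,
    difficulty_levels=[DifficultyLevel.DIFF_1, DifficultyLevel.DIFF_2],
)

# Build the suite from the testset and pass the wrapped model only when running it.
test_suite = testset.to_test_suite("My first test suite")
results = test_suite.run(model=giskard_model)
```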
diff --git a/giskard/rag/testset.py b/giskard/rag/testset.py index d410f11e14..ad01267572 100644 --- a/giskard/rag/testset.py +++ b/giskard/rag/testset.py @@ -13,3 +13,15 @@ def to_test_suite(self, name=None): suite = Suite(name=name, default_params=suite_default_params) suite.add_test(test_llm_correctness, "TestsetCorrectnessTest", "TestsetCorrectnessTest") return suite + + def copy(self): + testset = QATestset( + df=self.df.copy(), + target=self.target, + column_types=self.column_types.copy(), + validation=False, + ) + + if hasattr(self, "column_meta"): + testset.load_metadata_from_instance(self.column_meta) + return testset diff --git a/giskard/testing/tests/llm/correctness.py b/giskard/testing/tests/llm/correctness.py index 69764b894b..6b9b558136 100644 --- a/giskard/testing/tests/llm/correctness.py +++ b/giskard/testing/tests/llm/correctness.py @@ -36,8 +36,11 @@ def test_llm_correctness(model: BaseModel, dataset: Dataset, threshold: float = output_ds = list() if not eval_result.passed: failed_indices = [ - idx for idx, status in enumerate(eval_result.details.results) if status == TestResultStatus.FAILED + idx + for idx, status in zip(dataset.df.index, eval_result.details.results) + if status == TestResultStatus.FAILED ] + output_ds.append(dataset.slice(lambda df: df.loc[failed_indices], row_level=False)) passed = bool(eval_result.passed_ratio > threshold) From d366e4b09e28cdd6a05e6bf4f03ac824fb6e7cd9 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Wed, 31 Jan 2024 15:59:05 +0100 Subject: [PATCH 59/88] Add difficulty level 3 questions --- .../rag/knowledge_base_testset_generator.py | 39 ++++++--- giskard/rag/prompts.py | 87 ++++++++++++++++++- 2 files changed, 110 insertions(+), 16 deletions(-) diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index 82f0ec6a4e..62154622c1 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -8,7 +8,12 @@ import pandas as pd from ..llm.generators import BaseDataGenerator -from .prompts import FIX_JSON_FORMAT_PROMPT, QAGenerationPrompt, QuestionComplexificationPrompt +from .prompts import ( + FIX_JSON_FORMAT_PROMPT, + DistractingQuestionPrompt, + QAGenerationPrompt, + QuestionComplexificationPrompt, +) from .testset import QATestset from .vector_store import VectorStore @@ -19,6 +24,7 @@ class DifficultyLevel(int, Enum): DIFF_1 = 1 DIFF_2 = 2 + DIFF_3 = 3 class KnowledgeBaseTestsetGenerator(BaseDataGenerator): @@ -106,6 +112,8 @@ def _difficulty_level_mapping(self, level: DifficultyLevel): return self._generate_question_answer_from_context case DifficultyLevel.DIFF_2: return self._generate_complex_questions_from_context + case DifficultyLevel.DIFF_3: + return self._generate_distraction_questions_from_context case _: raise NotImplementedError(f"Missing case for difficulty level {level}.") @@ -128,13 +136,28 @@ def _generate_complex_questions_from_context(self, context): model_name=self.model_name, model_description=self.model_description, language=self.language, - user_content=self._format_question_context_for_complexification(generated_qa["question"], context), + user_content=(generated_qa["question"], context), ) generated_qa["difficulty"] = DifficultyLevel.DIFF_2 out = self._llm_complete(messages=messages) generated_qa["question"] = out["question"] return generated_qa + def _generate_distraction_questions_from_context(self, context): + generated_qa = self._generate_question_answer_from_context(context) + + 
distracting_context = self.rng.choice(self.knowledge_base.documents).page_content + messages = DistractingQuestionPrompt.create_messages( + model_name=self.model_name, + model_description=self.model_description, + language=self.language, + user_content=(generated_qa["question"], generated_qa["answer"], distracting_context), + ) + generated_qa["difficulty"] = DifficultyLevel.DIFF_3 + out = self._llm_complete(messages=messages) + generated_qa["question"] = out["question"] + return generated_qa + def _extract_seed_context(self): seed_embedding = self.rng.choice(self.knowledge_base.embeddings) relevant_contexts = [ @@ -146,14 +169,6 @@ def _extract_seed_context(self): ] return relevant_contexts - def _format_context(self, contexts): - context_string = "\n------\n".join(["", *[doc.page_content for doc in contexts], ""]) - return context_string - - def _format_question_context_for_complexification(self, question, context): - context_string = f"\n{question}\n\n\n{context}\n" - return context_string - def _prevent_context_window_overflow(self, prompt): # Prevent context overflow # general rule of thumbs to count tokens: 1 token ~ 4 characters @@ -213,9 +228,9 @@ def generate_dataset(self, num_samples: int = 10, difficulty_levels: Sequence[Di generated_questions = [] for level in difficulty_levels: for idx in range(num_samples): - logger.info(f"Generating question {idx + 1}/{num_samples} for difficulty level {level}.") + logger.info(f"Generating question {idx + 1}/{num_samples} for difficulty level {str(level.value)}.") seed_contexts = self._extract_seed_context() - context = self._format_context(seed_contexts) + context = QAGenerationPrompt.format_context(seed_contexts) generation_fn = self._difficulty_level_mapping(level) generated_qa = generation_fn(context) diff --git a/giskard/rag/prompts.py b/giskard/rag/prompts.py index 7a690041bc..167bc89b48 100644 --- a/giskard/rag/prompts.py +++ b/giskard/rag/prompts.py @@ -54,7 +54,7 @@ class QAGenerationPrompt: example_answer = QA_GENERATION_ASSISTANT_EXAMPLE @classmethod - def format_system_prompt(cls, model_name, model_description, language): + def _format_system_prompt(cls, model_name, model_description, language): language = language or "en" if model_name is not None or model_description is not None: system_prompt = cls.system_prompt_with_description.format( @@ -73,7 +73,7 @@ def format_system_prompt(cls, model_name, model_description, language): return system_message @classmethod - def format_examples(cls, examples): + def _format_example_prompt(cls, examples): if examples is not None: return examples elif cls.example_prompt is not None: @@ -85,6 +85,11 @@ def format_examples(cls, examples): return examples return [] + @classmethod + def format_context(cls, contexts): + context_string = "\n------\n".join(["", *[doc.page_content for doc in contexts], ""]) + return context_string + @classmethod def create_messages( cls, @@ -97,9 +102,9 @@ def create_messages( ): messages = list() - messages.append(cls.format_system_prompt(model_name, model_description, language)) + messages.append(cls._format_system_prompt(model_name, model_description, language)) if add_examples: - messages.extend(cls.format_examples(examples)) + messages.extend(cls._format_example_prompt(examples)) if user_content is not None: messages.append({"role": "user", "content": user_content}) @@ -168,3 +173,77 @@ class QuestionComplexificationPrompt(QAGenerationPrompt): system_prompt_raw = COMPLEXIFICATION_SYSTEM_PROMPT example_prompt = COMPLEXIFICATION_PROMPT_EXAMPLE example_answer = 
COMPLEXIFICATION_ANSWER_EXAMPLE
+
+    @classmethod
+    def format_user_content(cls, question, context):
+        context_string = f"\n{question}\n\n\n{context}\n"
+        return context_string
+
+    @classmethod
+    def create_messages(cls, **kwargs):
+        kwargs["user_content"] = cls.format_user_content(*kwargs["user_content"])
+        return super().create_messages(**kwargs)
+
+
+DISTRACTING_QUESTION_SYSTEM_PROMPT = """You are an expert at rewriting questions.
+Your task is to re-write questions that will be used to evaluate a language model.
+
+Your task is to complexify questions given a provided context.
+Please respect the following rules to generate the question:
+- The new question must include a condition or constraint based on the provided context.
+- The new question must have the same answer as the original question.
+- The question must be plausible according to the context and the model description.
+- The question must be self-contained and understandable by humans.
+- The question must be in {language}.
+
+You will be provided the question and its answer delimited with and tags.
+You will also be provided a context paragraph delimited with tags.
+You will return the reformulated question as a single JSON object, with the key 'question'. Make sure you return a valid JSON object.
+"""
+
+DISTRACTING_QUESTION_SYSTEM_PROMPT_WITH_DESCRIPTION = """You are an expert at rewriting questions.
+Your task is to re-write questions that will be used to evaluate the following model:
+- Model name: {model_name}
+- Model description: {model_description}
+
+Your task is to complexify questions given a provided context.
+Please respect the following rules to generate the question:
+- The new question must include a condition or constraint based on the provided context.
+- The original question direction should be preserved.
+- The question must be plausible according to the context and the model description.
+- The question must be self-contained and understandable by humans.
+- The question must be in {language}.
+
+You will be provided the question delimited with tags.
+You will also be provided a context paragraph delimited with tags.
+You will return the reformulated question as a single JSON object, with the key 'question'. Make sure you return a valid JSON object.
+"""
+
+DISCTRACTING_QUESTION_PROMPT_EXAMPLE = """
+What job offers do you have for engineering students?
+
+
+We have plenty of different jobs for engineering students depending on your speciality: mechanical engineer, data scientist, electronic designer and many more.
+
+
+Sometimes employers assume being accessible and inclusive only means providing physical access like ramps, accessible bathrooms and automatic opening doors. However, there are many other important ways to demonstrate that you welcome and want to attract a diverse workforce including people with disability.
+
+"""
+
+DISCTRACTING_QUESTION_ANSWER_EXAMPLE = """{
+    "question": "Do you have any job openings suitable for disabled engineering students?"
+}"""
+
+
+class DistractingQuestionPrompt(QuestionComplexificationPrompt):
+    system_prompt_with_description = DISTRACTING_QUESTION_SYSTEM_PROMPT_WITH_DESCRIPTION
+    system_prompt_raw = DISTRACTING_QUESTION_SYSTEM_PROMPT
+    example_prompt = DISCTRACTING_QUESTION_PROMPT_EXAMPLE
+    example_answer = DISCTRACTING_QUESTION_ANSWER_EXAMPLE
+
+    @classmethod
+    def format_user_content(cls, question, answer, context):
+        context_string = (
+            f"\n{question}\n\n\n{answer}\n\n\n{context}\n"
+        )
+        return context_string

From 605d6921f547a9e4635a34e3fe725791dff4065e Mon Sep 17 00:00:00 2001
From: Pierre Le Jeune
Date: Thu, 1 Feb 2024 18:23:01 +0100
Subject: [PATCH 60/88] Update docs with difficulty levels

---
 docs/open_source/testset_generation/index.md | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/docs/open_source/testset_generation/index.md b/docs/open_source/testset_generation/index.md
index be33f5e5bc..9070a51dd6 100644
--- a/docs/open_source/testset_generation/index.md
+++ b/docs/open_source/testset_generation/index.md
@@ -81,16 +81,23 @@ Once the knowledge base is loaded as a pandas `DataFrame`, you can generate the
 
 ```python
-from giskard.rag import KnowledgeBaseTestsetGenerator
+from giskard.rag import KnowledgeBaseTestsetGenerator, DifficultyLevel
 
 generator = KnowledgeBaseTestsetGenerator(knowledge_base_df,
                                           model_name="Model name", # Optional, provide a name to your model to get better fitting questions
                                           model_description="Description of the model", # Optional, briefly describe the task done by your model
                                           knowledge_base_features=["page_content"])
 
-testset = generator.generate_dataset(num_samples=10)
+testset = generator.generate_dataset(num_samples=10, difficulty_levels=[DifficultyLevel.DIFF_1, DifficultyLevel.DIFF_2])
 ```
+You can select the difficulty level of the generated questions. There are three distinct difficulty levels available:
+- Level 1: basic questions generated from a piece of the knowledge base
+- Level 2: questions made more complex by paraphrasing
+- Level 3: questions that include a distracting element
+
+The generator creates `num_samples` questions per difficulty level. In the example above, 10 *level 1* questions and 10 *level 2* questions are generated.
+
 ## Step 3: Wrap your model
 
 To evaluate your model, you must wrap it as a `giskard.Model`. This step is necessary to ensure a common format for your model and its metadata.You can wrap anything as long as you can represent it in a Python function (for example an API call call to Azure or OpenAI). We also have pre-built wrappers for LangChain objects, or you can create your own wrapper by extending the `giskard.Model` class if you need to wrap a complex object such as a custom-made RAG communicating with a vectorstore.
From 95e84c65b0f086b1f44a026bf2d5274f13aa7418 Mon Sep 17 00:00:00 2001 From: BotLocker Date: Fri, 2 Feb 2024 09:52:25 +0000 Subject: [PATCH 61/88] Regenerating pdm.lock --- pdm.lock | 261 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 143 insertions(+), 118 deletions(-) diff --git a/pdm.lock b/pdm.lock index ef50d2e22a..efba7b7242 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "dev", "doc", "hub", "llm", "ml_runtime", "test"] strategy = ["cross_platform", "inherit_metadata"] lock_version = "4.4.1" -content_hash = "sha256:1028b19bca6bf198cf4ed8e9a99dcdb80ae4f23fe8dd67402cda270d57acb5d2" +content_hash = "sha256:8f100fd058c86295057d05712917e343673fe0b8bd8db5afa76e4fdecbcdd988" [[package]] name = "absl-py" @@ -540,13 +540,13 @@ files = [ [[package]] name = "certifi" -version = "2023.11.17" +version = "2024.2.2" requires_python = ">=3.6" summary = "Python package for providing Mozilla's CA Bundle." groups = ["default", "dev", "doc", "hub", "llm", "ml_runtime", "test"] files = [ - {file = "certifi-2023.11.17-py3-none-any.whl", hash = "sha256:e036ab49d5b79556f99cfc2d9320b34cfbe5be05c5871b51de9329f0603b0474"}, - {file = "certifi-2023.11.17.tar.gz", hash = "sha256:9b469f3a900bf28dc19b8cfbf8019bf47f7fdd1a65a1d4ffb98fc14166beb4d1"}, + {file = "certifi-2024.2.2-py3-none-any.whl", hash = "sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1"}, + {file = "certifi-2024.2.2.tar.gz", hash = "sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f"}, ] [[package]] @@ -955,7 +955,7 @@ files = [ [[package]] name = "dataclasses-json" -version = "0.6.3" +version = "0.6.4" requires_python = ">=3.7,<4.0" summary = "Easily serialize dataclasses to and from JSON." groups = ["ml_runtime"] @@ -964,8 +964,8 @@ dependencies = [ "typing-inspect<1,>=0.4.0", ] files = [ - {file = "dataclasses_json-0.6.3-py3-none-any.whl", hash = "sha256:4aeb343357997396f6bca1acae64e486c3a723d8f5c76301888abeccf0c45176"}, - {file = "dataclasses_json-0.6.3.tar.gz", hash = "sha256:35cb40aae824736fdf959801356641836365219cfe14caeb115c39136f775d2a"}, + {file = "dataclasses_json-0.6.4-py3-none-any.whl", hash = "sha256:f90578b8a3177f7552f4e1a6e535e84293cd5da421fcce0642d49c0d7bdf8df2"}, + {file = "dataclasses_json-0.6.4.tar.gz", hash = "sha256:73696ebf24936560cca79a2430cbc4f3dd23ac7bf46ed17f38e5e5e7657a6377"}, ] [[package]] @@ -1226,6 +1226,30 @@ files = [ {file = "executing-2.0.1.tar.gz", hash = "sha256:35afe2ce3affba8ee97f2d69927fa823b08b472b7b994e36a52a964b93d16147"}, ] +[[package]] +name = "faiss-cpu" +version = "1.7.4" +summary = "A library for efficient similarity search and clustering of dense vectors." 
+groups = ["llm"] +files = [ + {file = "faiss-cpu-1.7.4.tar.gz", hash = "sha256:265dc31b0c079bf4433303bf6010f73922490adff9188b915e2d3f5e9c82dd0a"}, + {file = "faiss_cpu-1.7.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:50d4ebe7f1869483751c558558504f818980292a9b55be36f9a1ee1009d9a686"}, + {file = "faiss_cpu-1.7.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7b1db7fae7bd8312aeedd0c41536bcd19a6e297229e1dce526bde3a73ab8c0b5"}, + {file = "faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17b7fa7194a228a84929d9e6619d0e7dbf00cc0f717e3462253766f5e3d07de8"}, + {file = "faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dca531952a2e3eac56f479ff22951af4715ee44788a3fe991d208d766d3f95f3"}, + {file = "faiss_cpu-1.7.4-cp310-cp310-win_amd64.whl", hash = "sha256:7173081d605e74766f950f2e3d6568a6f00c53f32fd9318063e96728c6c62821"}, + {file = "faiss_cpu-1.7.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d0bbd6f55d7940cc0692f79e32a58c66106c3c950cee2341b05722de9da23ea3"}, + {file = "faiss_cpu-1.7.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e13c14280376100f143767d0efe47dcb32618f69e62bbd3ea5cd38c2e1755926"}, + {file = "faiss_cpu-1.7.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c521cb8462f3b00c0c7dfb11caff492bb67816528b947be28a3b76373952c41d"}, + {file = "faiss_cpu-1.7.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afdd9fe1141117fed85961fd36ee627c83fc3b9fd47bafb52d3c849cc2f088b7"}, + {file = "faiss_cpu-1.7.4-cp311-cp311-win_amd64.whl", hash = "sha256:2ff7f57889ea31d945e3b87275be3cad5d55b6261a4e3f51c7aba304d76b81fb"}, + {file = "faiss_cpu-1.7.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:559a0133f5ed44422acb09ee1ac0acffd90c6666d1bc0d671c18f6e93ad603e2"}, + {file = "faiss_cpu-1.7.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ea1d71539fe3dc0f1bed41ef954ca701678776f231046bf0ca22ccea5cf5bef6"}, + {file = "faiss_cpu-1.7.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:12d45e0157024eb3249842163162983a1ac8b458f1a8b17bbf86f01be4585a99"}, + {file = "faiss_cpu-1.7.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f0eab359e066d32c874f51a7d4bf6440edeec068b7fe47e6d803c73605a8b4c"}, + {file = "faiss_cpu-1.7.4-cp39-cp39-win_amd64.whl", hash = "sha256:98459ceeeb735b9df1a5b94572106ffe0a6ce740eb7e4626715dd218657bb4dc"}, +] + [[package]] name = "faker" version = "22.6.0" @@ -1586,39 +1610,39 @@ files = [ [[package]] name = "grpcio" -version = "1.60.0" +version = "1.60.1" requires_python = ">=3.7" summary = "HTTP/2-based RPC framework" groups = ["ml_runtime"] files = [ - {file = "grpcio-1.60.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:d020cfa595d1f8f5c6b343530cd3ca16ae5aefdd1e832b777f9f0eb105f5b139"}, - {file = "grpcio-1.60.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:b98f43fcdb16172dec5f4b49f2fece4b16a99fd284d81c6bbac1b3b69fcbe0ff"}, - {file = "grpcio-1.60.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:20e7a4f7ded59097c84059d28230907cd97130fa74f4a8bfd1d8e5ba18c81491"}, - {file = "grpcio-1.60.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:452ca5b4afed30e7274445dd9b441a35ece656ec1600b77fff8c216fdf07df43"}, - {file = "grpcio-1.60.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43e636dc2ce9ece583b3e2ca41df5c983f4302eabc6d5f9cd04f0562ee8ec1ae"}, - {file = "grpcio-1.60.0-cp310-cp310-musllinux_1_1_i686.whl", hash = 
"sha256:6e306b97966369b889985a562ede9d99180def39ad42c8014628dd3cc343f508"}, - {file = "grpcio-1.60.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f897c3b127532e6befdcf961c415c97f320d45614daf84deba0a54e64ea2457b"}, - {file = "grpcio-1.60.0-cp310-cp310-win32.whl", hash = "sha256:b87efe4a380887425bb15f220079aa8336276398dc33fce38c64d278164f963d"}, - {file = "grpcio-1.60.0-cp310-cp310-win_amd64.whl", hash = "sha256:a9c7b71211f066908e518a2ef7a5e211670761651039f0d6a80d8d40054047df"}, - {file = "grpcio-1.60.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:fb464479934778d7cc5baf463d959d361954d6533ad34c3a4f1d267e86ee25fd"}, - {file = "grpcio-1.60.0-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:4b44d7e39964e808b071714666a812049765b26b3ea48c4434a3b317bac82f14"}, - {file = "grpcio-1.60.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:90bdd76b3f04bdb21de5398b8a7c629676c81dfac290f5f19883857e9371d28c"}, - {file = "grpcio-1.60.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91229d7203f1ef0ab420c9b53fe2ca5c1fbeb34f69b3bc1b5089466237a4a134"}, - {file = "grpcio-1.60.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b36a2c6d4920ba88fa98075fdd58ff94ebeb8acc1215ae07d01a418af4c0253"}, - {file = "grpcio-1.60.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:297eef542156d6b15174a1231c2493ea9ea54af8d016b8ca7d5d9cc65cfcc444"}, - {file = "grpcio-1.60.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:87c9224acba0ad8bacddf427a1c2772e17ce50b3042a789547af27099c5f751d"}, - {file = "grpcio-1.60.0-cp311-cp311-win32.whl", hash = "sha256:95ae3e8e2c1b9bf671817f86f155c5da7d49a2289c5cf27a319458c3e025c320"}, - {file = "grpcio-1.60.0-cp311-cp311-win_amd64.whl", hash = "sha256:467a7d31554892eed2aa6c2d47ded1079fc40ea0b9601d9f79204afa8902274b"}, - {file = "grpcio-1.60.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:c193109ca4070cdcaa6eff00fdb5a56233dc7610216d58fb81638f89f02e4968"}, - {file = "grpcio-1.60.0-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:676e4a44e740deaba0f4d95ba1d8c5c89a2fcc43d02c39f69450b1fa19d39590"}, - {file = "grpcio-1.60.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:5ff21e000ff2f658430bde5288cb1ac440ff15c0d7d18b5fb222f941b46cb0d2"}, - {file = "grpcio-1.60.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c86343cf9ff7b2514dd229bdd88ebba760bd8973dac192ae687ff75e39ebfab"}, - {file = "grpcio-1.60.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fd3b3968ffe7643144580f260f04d39d869fcc2cddb745deef078b09fd2b328"}, - {file = "grpcio-1.60.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:30943b9530fe3620e3b195c03130396cd0ee3a0d10a66c1bee715d1819001eaf"}, - {file = "grpcio-1.60.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b10241250cb77657ab315270b064a6c7f1add58af94befa20687e7c8d8603ae6"}, - {file = "grpcio-1.60.0-cp39-cp39-win32.whl", hash = "sha256:79a050889eb8d57a93ed21d9585bb63fca881666fc709f5d9f7f9372f5e7fd03"}, - {file = "grpcio-1.60.0-cp39-cp39-win_amd64.whl", hash = "sha256:8a97a681e82bc11a42d4372fe57898d270a2707f36c45c6676e49ce0d5c41353"}, - {file = "grpcio-1.60.0.tar.gz", hash = "sha256:2199165a1affb666aa24adf0c97436686d0a61bc5fc113c037701fb7c7fceb96"}, + {file = "grpcio-1.60.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:14e8f2c84c0832773fb3958240c69def72357bc11392571f87b2d7b91e0bb092"}, + {file = "grpcio-1.60.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:33aed0a431f5befeffd9d346b0fa44b2c01aa4aeae5ea5b2c03d3e25e0071216"}, + {file = 
"grpcio-1.60.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:fead980fbc68512dfd4e0c7b1f5754c2a8e5015a04dea454b9cada54a8423525"}, + {file = "grpcio-1.60.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:082081e6a36b6eb5cf0fd9a897fe777dbb3802176ffd08e3ec6567edd85bc104"}, + {file = "grpcio-1.60.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55ccb7db5a665079d68b5c7c86359ebd5ebf31a19bc1a91c982fd622f1e31ff2"}, + {file = "grpcio-1.60.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:9b54577032d4f235452f77a83169b6527bf4b77d73aeada97d45b2aaf1bf5ce0"}, + {file = "grpcio-1.60.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7d142bcd604166417929b071cd396aa13c565749a4c840d6c702727a59d835eb"}, + {file = "grpcio-1.60.1-cp310-cp310-win32.whl", hash = "sha256:2a6087f234cb570008a6041c8ffd1b7d657b397fdd6d26e83d72283dae3527b1"}, + {file = "grpcio-1.60.1-cp310-cp310-win_amd64.whl", hash = "sha256:f2212796593ad1d0235068c79836861f2201fc7137a99aa2fea7beeb3b101177"}, + {file = "grpcio-1.60.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:79ae0dc785504cb1e1788758c588c711f4e4a0195d70dff53db203c95a0bd303"}, + {file = "grpcio-1.60.1-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:4eec8b8c1c2c9b7125508ff7c89d5701bf933c99d3910e446ed531cd16ad5d87"}, + {file = "grpcio-1.60.1-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:8c9554ca8e26241dabe7951aa1fa03a1ba0856688ecd7e7bdbdd286ebc272e4c"}, + {file = "grpcio-1.60.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91422ba785a8e7a18725b1dc40fbd88f08a5bb4c7f1b3e8739cab24b04fa8a03"}, + {file = "grpcio-1.60.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cba6209c96828711cb7c8fcb45ecef8c8859238baf15119daa1bef0f6c84bfe7"}, + {file = "grpcio-1.60.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c71be3f86d67d8d1311c6076a4ba3b75ba5703c0b856b4e691c9097f9b1e8bd2"}, + {file = "grpcio-1.60.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:af5ef6cfaf0d023c00002ba25d0751e5995fa0e4c9eec6cd263c30352662cbce"}, + {file = "grpcio-1.60.1-cp311-cp311-win32.whl", hash = "sha256:a09506eb48fa5493c58f946c46754ef22f3ec0df64f2b5149373ff31fb67f3dd"}, + {file = "grpcio-1.60.1-cp311-cp311-win_amd64.whl", hash = "sha256:49c9b6a510e3ed8df5f6f4f3c34d7fbf2d2cae048ee90a45cd7415abab72912c"}, + {file = "grpcio-1.60.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:0250a7a70b14000fa311de04b169cc7480be6c1a769b190769d347939d3232a8"}, + {file = "grpcio-1.60.1-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:660fc6b9c2a9ea3bb2a7e64ba878c98339abaf1811edca904ac85e9e662f1d73"}, + {file = "grpcio-1.60.1-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:76eaaba891083fcbe167aa0f03363311a9f12da975b025d30e94b93ac7a765fc"}, + {file = "grpcio-1.60.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5d97c65ea7e097056f3d1ead77040ebc236feaf7f71489383d20f3b4c28412a"}, + {file = "grpcio-1.60.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb2a2911b028f01c8c64d126f6b632fcd8a9ac975aa1b3855766c94e4107180"}, + {file = "grpcio-1.60.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:5a1ebbae7e2214f51b1f23b57bf98eeed2cf1ba84e4d523c48c36d5b2f8829ff"}, + {file = "grpcio-1.60.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9a66f4d2a005bc78e61d805ed95dedfcb35efa84b7bba0403c6d60d13a3de2d6"}, + {file = "grpcio-1.60.1-cp39-cp39-win32.whl", hash = "sha256:8d488fbdbf04283f0d20742b64968d44825617aa6717b07c006168ed16488804"}, + {file = 
"grpcio-1.60.1-cp39-cp39-win_amd64.whl", hash = "sha256:61b7199cd2a55e62e45bfb629a35b71fc2c0cb88f686a047f25b1112d3810904"}, + {file = "grpcio-1.60.1.tar.gz", hash = "sha256:dd1d3a8d1d2e50ad9b59e10aa7f07c7d1be2b367f3f2d33c5fade96ed5460962"}, ] [[package]] @@ -2413,7 +2437,7 @@ files = [ [[package]] name = "langchain" -version = "0.1.4" +version = "0.1.5" requires_python = ">=3.8.1,<4.0" summary = "Building applications with LLMs through composability" groups = ["ml_runtime"] @@ -2424,7 +2448,7 @@ dependencies = [ "async-timeout<5.0.0,>=4.0.0; python_version < \"3.11\"", "dataclasses-json<0.7,>=0.5.7", "jsonpatch<2.0,>=1.33", - "langchain-community<0.1,>=0.0.14", + "langchain-community<0.1,>=0.0.17", "langchain-core<0.2,>=0.1.16", "langsmith<0.1,>=0.0.83", "numpy<2,>=1", @@ -2433,13 +2457,13 @@ dependencies = [ "tenacity<9.0.0,>=8.1.0", ] files = [ - {file = "langchain-0.1.4-py3-none-any.whl", hash = "sha256:6befdd6221f5f326092e31a3c19efdc7ce3d7d1f2e2cab065141071451730ed7"}, - {file = "langchain-0.1.4.tar.gz", hash = "sha256:8767a9461e2b717ce9a35b1fa20659de89ea86ba9c2a4ff516e05d47ab2d195d"}, + {file = "langchain-0.1.5-py3-none-any.whl", hash = "sha256:4614118d4a95b2e7ba3611a0b6b21707a259a21652a04fbe3c31205bcf3fcd50"}, + {file = "langchain-0.1.5.tar.gz", hash = "sha256:69603a5bb21b044ddea69d38131dbbf47475afdf79728644faa67d1ad325d652"}, ] [[package]] name = "langchain-community" -version = "0.0.16" +version = "0.0.17" requires_python = ">=3.8.1,<4.0" summary = "Community contributed LangChain integrations." groups = ["ml_runtime"] @@ -2455,13 +2479,13 @@ dependencies = [ "tenacity<9.0.0,>=8.1.0", ] files = [ - {file = "langchain_community-0.0.16-py3-none-any.whl", hash = "sha256:0f1dfc1a6205ce8d39931d3515974a208a9f69c16157c649f83490a7cc830b73"}, - {file = "langchain_community-0.0.16.tar.gz", hash = "sha256:c06512a93013a06fba7679cd5a1254ff8b927cddd2d1fbe0cc444bf7bbdf0b8c"}, + {file = "langchain_community-0.0.17-py3-none-any.whl", hash = "sha256:d503491bbfb691d1b3d10d74f7a69840cee3caf9b58a9a76f053ff925ea76733"}, + {file = "langchain_community-0.0.17.tar.gz", hash = "sha256:ab957b34a562e0199b2ecf050bdc987c4fe889b2ac9f22b75a9fac8b9e30f53a"}, ] [[package]] name = "langchain-core" -version = "0.1.17" +version = "0.1.18" requires_python = ">=3.8.1,<4.0" summary = "Building applications with LLMs through composability" groups = ["ml_runtime"] @@ -2476,8 +2500,8 @@ dependencies = [ "tenacity<9.0.0,>=8.1.0", ] files = [ - {file = "langchain_core-0.1.17-py3-none-any.whl", hash = "sha256:026155cf97867bde410ab1834799ab4c5ba64c39380f2a4328bcf9c78623ca64"}, - {file = "langchain_core-0.1.17.tar.gz", hash = "sha256:59016e457cd6a1708d83a3a454acc97cf02c2a2c3af95626d13f83894fd4e777"}, + {file = "langchain_core-0.1.18-py3-none-any.whl", hash = "sha256:5a60dc3c391b33834fb9c8b072abd7a0df4cbba8ce88eb1bcb288844000ab759"}, + {file = "langchain_core-0.1.18.tar.gz", hash = "sha256:ad470b21cdfdc75e829cd91c8d8eb7e0438ab8ddb5b50828125ff7ada121ee7b"}, ] [[package]] @@ -2494,7 +2518,7 @@ files = [ [[package]] name = "langsmith" -version = "0.0.85" +version = "0.0.86" requires_python = ">=3.8.1,<4.0" summary = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." 
groups = ["ml_runtime"] @@ -2503,8 +2527,8 @@ dependencies = [ "requests<3,>=2", ] files = [ - {file = "langsmith-0.0.85-py3-none-any.whl", hash = "sha256:9d0ccbcda7b69c83828060603a51bb4319e43b8dc807fbd90b6355f8ec709500"}, - {file = "langsmith-0.0.85.tar.gz", hash = "sha256:fefc631fc30d836b54d4e3f99961c41aea497633898b8f09e305b6c7216c2c54"}, + {file = "langsmith-0.0.86-py3-none-any.whl", hash = "sha256:7af15c36edb8c9fd9ae5c6d4fb940eb1da668b630a703d63c90c91e9be53aefb"}, + {file = "langsmith-0.0.86.tar.gz", hash = "sha256:c1572824664810c4425b17f2d1e9a59d53992e6898df22a37236c62d3c80f59e"}, ] [[package]] @@ -3023,57 +3047,58 @@ files = [ [[package]] name = "multidict" -version = "6.0.4" +version = "6.0.5" requires_python = ">=3.7" summary = "multidict implementation" groups = ["dev", "llm", "ml_runtime"] files = [ - {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"}, - {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"}, - {file = "multidict-6.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5"}, - {file = "multidict-6.0.4-cp310-cp310-win32.whl", hash = "sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8"}, - {file = "multidict-6.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461"}, - {file = "multidict-6.0.4-cp311-cp311-win32.whl", hash = "sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636"}, - {file = "multidict-6.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_i686.whl", hash = 
"sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95"}, - {file = "multidict-6.0.4-cp39-cp39-win32.whl", hash = "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313"}, - {file = "multidict-6.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2"}, - {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:411bf8515f3be9813d06004cac41ccf7d1cd46dfe233705933dd163b60e37600"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d147090048129ce3c453f0292e7697d333db95e52616b3793922945804a433c"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:215ed703caf15f578dca76ee6f6b21b7603791ae090fbf1ef9d865571039ade5"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c6390cf87ff6234643428991b7359b5f59cc15155695deb4eda5c777d2b880f"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fd81c4ebdb4f214161be351eb5bcf385426bf023041da2fd9e60681f3cebae"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3cc2ad10255f903656017363cd59436f2111443a76f996584d1077e43ee51182"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6939c95381e003f54cd4c5516740faba40cf5ad3eeff460c3ad1d3e0ea2549bf"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:220dd781e3f7af2c2c1053da9fa96d9cf3072ca58f057f4c5adaaa1cab8fc442"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:766c8f7511df26d9f11cd3a8be623e59cca73d44643abab3f8c8c07620524e4a"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c1c1496e73051918fcd4f58ff2e0f2f3066d1c76a0c6aeffd9b45d53243702cc"}, + {file = "multidict-6.0.5-cp310-cp310-win32.whl", hash = "sha256:7afcdd1fc07befad18ec4523a782cde4e93e0a2bf71239894b8d61ee578c1319"}, + {file = "multidict-6.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:99f60d34c048c5c2fabc766108c103612344c46e35d4ed9ae0673d33c8fb26e8"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f285e862d2f153a70586579c15c44656f888806ed0e5b56b64489afe4a2dbfba"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:53689bb4e102200a4fafa9de9c7c3c212ab40a7ab2c8e474491914d2305f187e"}, + 
{file = "multidict-6.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:612d1156111ae11d14afaf3a0669ebf6c170dbb735e510a7438ffe2369a847fd"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7be7047bd08accdb7487737631d25735c9a04327911de89ff1b26b81745bd4e3"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de170c7b4fe6859beb8926e84f7d7d6c693dfe8e27372ce3b76f01c46e489fcf"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04bde7a7b3de05732a4eb39c94574db1ec99abb56162d6c520ad26f83267de29"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85f67aed7bb647f93e7520633d8f51d3cbc6ab96957c71272b286b2f30dc70ed"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425bf820055005bfc8aa9a0b99ccb52cc2f4070153e34b701acc98d201693733"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d3eb1ceec286eba8220c26f3b0096cf189aea7057b6e7b7a2e60ed36b373b77f"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7901c05ead4b3fb75113fb1dd33eb1253c6d3ee37ce93305acd9d38e0b5f21a4"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e0e79d91e71b9867c73323a3444724d496c037e578a0e1755ae159ba14f4f3d1"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:29bfeb0dff5cb5fdab2023a7a9947b3b4af63e9c47cae2a10ad58394b517fddc"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e030047e85cbcedbfc073f71836d62dd5dadfbe7531cae27789ff66bc551bd5e"}, + {file = "multidict-6.0.5-cp311-cp311-win32.whl", hash = "sha256:2f4848aa3baa109e6ab81fe2006c77ed4d3cd1e0ac2c1fbddb7b1277c168788c"}, + {file = "multidict-6.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:2faa5ae9376faba05f630d7e5e6be05be22913782b927b19d12b8145968a85ea"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e7be68734bd8c9a513f2b0cfd508802d6609da068f40dc57d4e3494cefc92929"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1d9ea7a7e779d7a3561aade7d596649fbecfa5c08a7674b11b423783217933f9"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ea1456df2a27c73ce51120fa2f519f1bea2f4a03a917f4a43c8707cf4cbbae1a"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf590b134eb70629e350691ecca88eac3e3b8b3c86992042fb82e3cb1830d5e1"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5c0631926c4f58e9a5ccce555ad7747d9a9f8b10619621f22f9635f069f6233e"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dce1c6912ab9ff5f179eaf6efe7365c1f425ed690b03341911bf4939ef2f3046"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0868d64af83169e4d4152ec612637a543f7a336e4a307b119e98042e852ad9c"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:141b43360bfd3bdd75f15ed811850763555a251e38b2405967f8e25fb43f7d40"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7df704ca8cf4a073334e0427ae2345323613e4df18cc224f647f251e5e75a527"}, + {file = 
"multidict-6.0.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6214c5a5571802c33f80e6c84713b2c79e024995b9c5897f794b43e714daeec9"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:cd6c8fca38178e12c00418de737aef1261576bd1b6e8c6134d3e729a4e858b38"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e02021f87a5b6932fa6ce916ca004c4d441509d33bbdbeca70d05dff5e9d2479"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ebd8d160f91a764652d3e51ce0d2956b38efe37c9231cd82cfc0bed2e40b581c"}, + {file = "multidict-6.0.5-cp39-cp39-win32.whl", hash = "sha256:04da1bb8c8dbadf2a18a452639771951c662c5ad03aefe4884775454be322c9b"}, + {file = "multidict-6.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:d6f6d4f185481c9669b9447bf9d9cf3b95a0e9df9d169bbc17e363b7d5487755"}, + {file = "multidict-6.0.5-py3-none-any.whl", hash = "sha256:0d63c74e3d7ab26de115c49bffc92cc77ed23395303d496eae515d4204a625e7"}, + {file = "multidict-6.0.5.tar.gz", hash = "sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da"}, ] [[package]] @@ -4932,28 +4957,28 @@ files = [ [[package]] name = "ruff" -version = "0.1.15" +version = "0.2.0" requires_python = ">=3.7" summary = "An extremely fast Python linter and code formatter, written in Rust." groups = ["dev"] files = [ - {file = "ruff-0.1.15-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:5fe8d54df166ecc24106db7dd6a68d44852d14eb0729ea4672bb4d96c320b7df"}, - {file = "ruff-0.1.15-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6f0bfbb53c4b4de117ac4d6ddfd33aa5fc31beeaa21d23c45c6dd249faf9126f"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e0d432aec35bfc0d800d4f70eba26e23a352386be3a6cf157083d18f6f5881c8"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9405fa9ac0e97f35aaddf185a1be194a589424b8713e3b97b762336ec79ff807"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c66ec24fe36841636e814b8f90f572a8c0cb0e54d8b5c2d0e300d28a0d7bffec"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:6f8ad828f01e8dd32cc58bc28375150171d198491fc901f6f98d2a39ba8e3ff5"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86811954eec63e9ea162af0ffa9f8d09088bab51b7438e8b6488b9401863c25e"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fd4025ac5e87d9b80e1f300207eb2fd099ff8200fa2320d7dc066a3f4622dc6b"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b17b93c02cdb6aeb696effecea1095ac93f3884a49a554a9afa76bb125c114c1"}, - {file = "ruff-0.1.15-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ddb87643be40f034e97e97f5bc2ef7ce39de20e34608f3f829db727a93fb82c5"}, - {file = "ruff-0.1.15-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:abf4822129ed3a5ce54383d5f0e964e7fef74a41e48eb1dfad404151efc130a2"}, - {file = "ruff-0.1.15-py3-none-musllinux_1_2_i686.whl", hash = "sha256:6c629cf64bacfd136c07c78ac10a54578ec9d1bd2a9d395efbee0935868bf852"}, - {file = "ruff-0.1.15-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1bab866aafb53da39c2cadfb8e1c4550ac5340bb40300083eb8967ba25481447"}, - {file = "ruff-0.1.15-py3-none-win32.whl", hash = "sha256:2417e1cb6e2068389b07e6fa74c306b2810fe3ee3476d5b8a96616633f40d14f"}, - {file = "ruff-0.1.15-py3-none-win_amd64.whl", hash 
= "sha256:3837ac73d869efc4182d9036b1405ef4c73d9b1f88da2413875e34e0d6919587"}, - {file = "ruff-0.1.15-py3-none-win_arm64.whl", hash = "sha256:9a933dfb1c14ec7a33cceb1e49ec4a16b51ce3c20fd42663198746efc0427360"}, - {file = "ruff-0.1.15.tar.gz", hash = "sha256:f6dfa8c1b21c913c326919056c390966648b680966febcb796cc9d1aaab8564e"}, + {file = "ruff-0.2.0-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:638ea3294f800d18bae84a492cb5a245c8d29c90d19a91d8e338937a4c27fca0"}, + {file = "ruff-0.2.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:3ff35433fcf4dff6d610738712152df6b7d92351a1bde8e00bd405b08b3d5759"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9faafbdcf4f53917019f2c230766da437d4fd5caecd12ddb68bb6a17d74399"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8153a3e4128ed770871c47545f1ae7b055023e0c222ff72a759f5a341ee06483"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e8a75a98ae989a27090e9c51f763990ad5bbc92d20626d54e9701c7fe597f399"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:87057dd2fdde297130ff99553be8549ca38a2965871462a97394c22ed2dfc19d"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6d232f99d3ab00094ebaf88e0fb7a8ccacaa54cc7fa3b8993d9627a11e6aed7a"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d3c641f95f435fc6754b05591774a17df41648f0daf3de0d75ad3d9f099ab92"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3826fb34c144ef1e171b323ed6ae9146ab76d109960addca730756dc19dc7b22"}, + {file = "ruff-0.2.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:eceab7d85d09321b4de18b62d38710cf296cb49e98979960a59c6b9307c18cfe"}, + {file = "ruff-0.2.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:30ad74687e1f4a9ff8e513b20b82ccadb6bd796fe5697f1e417189c5cde6be3e"}, + {file = "ruff-0.2.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:a7e3818698f8460bd0f8d4322bbe99db8327e9bc2c93c789d3159f5b335f47da"}, + {file = "ruff-0.2.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:edf23041242c48b0d8295214783ef543847ef29e8226d9f69bf96592dba82a83"}, + {file = "ruff-0.2.0-py3-none-win32.whl", hash = "sha256:e155147199c2714ff52385b760fe242bb99ea64b240a9ffbd6a5918eb1268843"}, + {file = "ruff-0.2.0-py3-none-win_amd64.whl", hash = "sha256:ba918e01cdd21e81b07555564f40d307b0caafa9a7a65742e98ff244f5035c59"}, + {file = "ruff-0.2.0-py3-none-win_arm64.whl", hash = "sha256:3fbaff1ba9564a2c5943f8f38bc221f04bac687cc7485e45237579fee7ccda79"}, + {file = "ruff-0.2.0.tar.gz", hash = "sha256:63856b91837606c673537d2889989733d7dffde553828d3b0f0bacfa6def54be"}, ] [[package]] @@ -5732,7 +5757,7 @@ name = "tenacity" version = "8.2.3" requires_python = ">=3.7" summary = "Retry code until it succeeds" -groups = ["hub", "ml_runtime"] +groups = ["hub", "llm", "ml_runtime"] files = [ {file = "tenacity-8.2.3-py3-none-any.whl", hash = "sha256:ce510e327a630c9e1beaf17d42e6ffacc88185044ad85cf74c0a8887c6a0f88c"}, {file = "tenacity-8.2.3.tar.gz", hash = "sha256:5398ef0d78e63f40007c1fb4c0bff96e1911394d2fa8d194f77619c05ff6cc8a"}, @@ -6029,7 +6054,7 @@ files = [ [[package]] name = "textual" -version = "0.48.0" +version = "0.48.1" requires_python = ">=3.8,<4.0" summary = "Modern Text User Interface framework" groups = ["test"] @@ -6040,8 +6065,8 @@ dependencies = [ 
"typing-extensions<5.0.0,>=4.4.0", ] files = [ - {file = "textual-0.48.0-py3-none-any.whl", hash = "sha256:4a64cfafe0fed0b2f55d012053621e2681ae91385010114c85a861f6b5a7c097"}, - {file = "textual-0.48.0.tar.gz", hash = "sha256:de270ee8448bb9ee02e8705da9268605504a8d347c9c9d26d6b7a86baf6fd6a8"}, + {file = "textual-0.48.1-py3-none-any.whl", hash = "sha256:caa12b0e2171c50b78171059cb8dd56df72e7e4fd3fd760215343b6c30e975d8"}, + {file = "textual-0.48.1.tar.gz", hash = "sha256:df39371a0404a41dbb45ea0bc0c3e853ec5bc33236d71c9226a94192db0a637e"}, ] [[package]] From b0dafcff8e740ce8f9e8149f160176298652fd13 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Fri, 2 Feb 2024 16:06:03 +0100 Subject: [PATCH 62/88] Remove case matching --- giskard/rag/knowledge_base_testset_generator.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index 62154622c1..776aec8a0e 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -107,15 +107,14 @@ def __init__( ) def _difficulty_level_mapping(self, level: DifficultyLevel): - match level: - case DifficultyLevel.DIFF_1: - return self._generate_question_answer_from_context - case DifficultyLevel.DIFF_2: - return self._generate_complex_questions_from_context - case DifficultyLevel.DIFF_3: - return self._generate_distraction_questions_from_context - case _: - raise NotImplementedError(f"Missing case for difficulty level {level}.") + if level == DifficultyLevel.DIFF_1: + return self._generate_question_answer_from_context + elif level == DifficultyLevel.DIFF_2: + return self._generate_complex_questions_from_context + elif level == DifficultyLevel.DIFF_3: + return self._generate_distraction_questions_from_context + else: + raise NotImplementedError(f"Missing case for difficulty level {level}.") def _generate_question_answer_from_context(self, context): messages = QAGenerationPrompt.create_messages( From c16d8b40aba2a89a99559b7308f6e55bb945bd41 Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Fri, 9 Feb 2024 11:00:14 +0100 Subject: [PATCH 63/88] Fix minor issues after merge --- .../rag/knowledge_base_testset_generator.py | 43 +++++++++++-------- giskard/rag/prompts.py | 26 +++++------ 2 files changed, 37 insertions(+), 32 deletions(-) diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index 776aec8a0e..1d7cdc4e4a 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -1,4 +1,4 @@ -from typing import Sequence +from typing import Sequence, Union import json import logging @@ -18,13 +18,12 @@ from .vector_store import VectorStore logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) class DifficultyLevel(int, Enum): - DIFF_1 = 1 - DIFF_2 = 2 - DIFF_3 = 3 + EASY = 1 + COMPLEX = 2 + DISTRACTING_ELEMENT = 3 class KnowledgeBaseTestsetGenerator(BaseDataGenerator): @@ -107,11 +106,11 @@ def __init__( ) def _difficulty_level_mapping(self, level: DifficultyLevel): - if level == DifficultyLevel.DIFF_1: + if level == DifficultyLevel.EASY: return self._generate_question_answer_from_context - elif level == DifficultyLevel.DIFF_2: + elif level == DifficultyLevel.COMPLEX: return self._generate_complex_questions_from_context - elif level == DifficultyLevel.DIFF_3: + elif level == DifficultyLevel.DISTRACTING_ELEMENT: return 
self._generate_distraction_questions_from_context else: raise NotImplementedError(f"Missing case for difficulty level {level}.") @@ -125,7 +124,7 @@ def _generate_question_answer_from_context(self, context): ) generated_qa = self._llm_complete(messages=messages) - generated_qa["difficulty"] = DifficultyLevel.DIFF_1 + generated_qa["difficulty"] = DifficultyLevel.EASY return generated_qa def _generate_complex_questions_from_context(self, context): @@ -137,7 +136,7 @@ def _generate_complex_questions_from_context(self, context): language=self.language, user_content=(generated_qa["question"], context), ) - generated_qa["difficulty"] = DifficultyLevel.DIFF_2 + generated_qa["difficulty"] = DifficultyLevel.COMPLEX out = self._llm_complete(messages=messages) generated_qa["question"] = out["question"] return generated_qa @@ -152,7 +151,7 @@ def _generate_distraction_questions_from_context(self, context): language=self.language, user_content=(generated_qa["question"], generated_qa["answer"], distracting_context), ) - generated_qa["difficulty"] = DifficultyLevel.DIFF_3 + generated_qa["difficulty"] = DifficultyLevel.DISTRACTING_ELEMENT out = self._llm_complete(messages=messages) generated_qa["question"] = out["question"] return generated_qa @@ -174,7 +173,7 @@ def _prevent_context_window_overflow(self, prompt): # https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them return prompt[: self.context_window_length * 4] - def _llm_complete(self, messages): + def _llm_complete(self, messages: Sequence[dict]): try: out = self.llm_client.complete( messages=messages, @@ -182,10 +181,10 @@ def _llm_complete(self, messages): caller_id=self.__class__.__name__, ) - generated = json.loads(out.message, strict=False) + generated = json.loads(out.content, strict=False) except json.decoder.JSONDecodeError: logger.warning("JSON decoding error, trying to fix the JSON string.") - generated = self._try_fix_json_message(out.message) + generated = self._try_fix_json_message(out.content) return generated def _try_fix_json_message(self, incorrect_json): @@ -198,13 +197,17 @@ def _try_fix_json_message(self, incorrect_json): temperature=0, caller_id=self.__class__.__name__, ) - corrected_message = json.loads(out.message) + corrected_message = json.loads(out.content) except Exception: logger.warning("Fixing JSON format failed, question generation skipped.") return None return corrected_message - def generate_dataset(self, num_samples: int = 10, difficulty_levels: Sequence[DifficultyLevel] = None) -> QATestset: + def generate_dataset( + self, + num_samples: int = 10, + difficulty: Union[DifficultyLevel, Sequence[DifficultyLevel]] = DifficultyLevel.EASY, + ) -> QATestset: """Generates a testset from the knowledge base. 
Parameters @@ -223,11 +226,13 @@ def generate_dataset(self, num_samples: int = 10, difficulty_levels: Sequence[Di - *difficulty_level*: an indicator of how difficult the question is """ - difficulty_levels = difficulty_levels or [DifficultyLevel.DIFF_1] + if not isinstance(difficulty, Sequence): + difficulty = [difficulty] + generated_questions = [] - for level in difficulty_levels: + for level in difficulty: for idx in range(num_samples): - logger.info(f"Generating question {idx + 1}/{num_samples} for difficulty level {str(level.value)}.") + logger.info(f"Generating question {idx + 1}/{num_samples} for difficulty level {str(level)}.") seed_contexts = self._extract_seed_context() context = QAGenerationPrompt.format_context(seed_contexts) diff --git a/giskard/rag/prompts.py b/giskard/rag/prompts.py index 167bc89b48..379544ddc0 100644 --- a/giskard/rag/prompts.py +++ b/giskard/rag/prompts.py @@ -1,8 +1,8 @@ -QA_GENERATION_SYSTEM_PROMPT_WITH_DESCRIPTION = """You are a powerful auditing AI, your role is to generate question answer pair from a given list of context paragraph to audit a model specialized on these knowledge. +QA_GENERATION_SYSTEM_PROMPT_WITH_DESCRIPTION = """You are a powerful auditor, your role is to generate question & answer pair from a given list of context paragraphs. The model you are auditing is the following: - Model name: {model_name} -- Model description: {model_description} +- Model description: {model_description} Your question must be related to a provided context. Please respect the following rules to generate the question: @@ -10,11 +10,11 @@ - The question must be self-contained - The question and answer must be in this language: {language} -You will be provided the context, consisting in multiple paragraphs delimited by dashes "------". +The user will provide the context, consisting in multiple paragraphs delimited by dashes "------". You will return the question and the precise answer to the question based exclusively on the provided context. -Your output should be a single JSON object, with keys 'question' and 'answer'. Make sure you return a valid JSON object.""" +You must output a single JSON object with keys 'question' and 'answer'. Make sure you return a valid JSON object.""" -QA_GENERATION_SYSTEM_PROMPT = """You are a powerful auditing AI, your role is to generate question answer pair from a given list of context paragraph to audit a model specialized on these knowledge. +QA_GENERATION_SYSTEM_PROMPT = """You are a powerful auditor, your role is to generate a question & answer pair from a given list of context paragraphs. Your question must be related to a provided context. Please respect the following rules to generate the question: @@ -33,7 +33,7 @@ QA_GENERATION_CONTEXT_EXAMPLE = """What payment methods do you accept? -\tWe accept a variety of payment methods to provide our customers with a convenient and secure shopping experience. You can make a purchase using major credit and debit cards, including Visa, Mastercard, American Express, and Discover. We also offer the option to pay with popular digital wallets such as PayPal and Google Pay. For added flexibility, you can choose to complete your order using bank transfers or wire transfers. Rest assured that we prioritize the security of your personal information and go the extra mile to ensure your transactions are processed safely. +We accept a variety of payment methods to provide our customers with a convenient and secure shopping experience. 
You can make a purchase using major credit and debit cards, including Visa, Mastercard, American Express, and Discover. We also offer the option to pay with popular digital wallets such as PayPal and Google Pay. For added flexibility, you can choose to complete your order using bank transfers or wire transfers. Rest assured that we prioritize the security of your personal information and go the extra mile to ensure your transactions are processed safely. ------ \tWhat is your shipping policy? @@ -122,10 +122,10 @@ def create_messages( - The re-written question should be more elaborated than the original, use elements from the context to enrich the questions. - The re-written question should be more difficult to handle for AI models but it must be understood and answerable by humans. - Add one or more constraints / conditions to the question. -- The re-written question must be in {language}. +- The re-written question must be in this language: {language} -You will be provided the question delimited with tags. -You will also be provided a relevant context which contain the answer to the question, delimited with tags. It consists in multiple paragraphs delimited by dashes "------". +You will be provided the question delimited by tags. +You will also be provided a relevant context which contain the answer to the question, delimited by tags. It consists in multiple paragraphs delimited by dashes "------". You will return the reformulated question as a single JSON object, with the key 'question'. Make sure you return a valid JSON object. """ @@ -137,7 +137,7 @@ def create_messages( - The re-written question should be more elaborated than the original, use elements from the context to enrich the questions. - The re-written question should be more difficult to handle for AI models but it must be understood and answerable by humans. - Add one or more constraints / conditions to the question. -- The re-written question must be in {language}. +- The re-written question must be in this language: {language} You will be provided the question delimited with tags. You will also be provided a relevant context which contain the answer to the question, delimited with tags. It consists in multiple paragraphs delimited by dashes "------". @@ -194,7 +194,7 @@ def create_messages(cls, **kwargs): - The new question must have the same answer as the original question. - The question must be plausible according to the context and the model description. - The question must be self-contained and understandable by humans. -- The question must be in french. +- The question must be in this language: {language} You will be provided the question and its answer delimited with and tags. You will also be provided a context paragraph delimited with tags. @@ -212,7 +212,7 @@ def create_messages(cls, **kwargs): - The original question direction should be preserved. - The question must be plausible according to the context and the model description. - The question must be self-contained and understandable by humans. -- The question must be in french. +- The question must be in this language: {language} You will be provided the question delimited with tags. You will also be provided a context paragraph delimited with tags. @@ -231,7 +231,7 @@ def create_messages(cls, **kwargs): """ DISCTRACTING_QUESTION_ANSWER_EXAMPLE = """{ - "question": "Do you have any job opening suitable for disabled engineering students? " + "question": "Do you have any job opening suitable for engineering students with a disability? 
" }""" From 938dd093487270f2e2b2691f142bf52ab8ea3344 Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Fri, 9 Feb 2024 11:21:20 +0100 Subject: [PATCH 64/88] Prompt fixes --- giskard/rag/prompts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/giskard/rag/prompts.py b/giskard/rag/prompts.py index 379544ddc0..06f05c95f8 100644 --- a/giskard/rag/prompts.py +++ b/giskard/rag/prompts.py @@ -185,7 +185,7 @@ def create_messages(cls, **kwargs): return super().create_messages(**kwargs) -DISTRACTING_QUESTION_SYSTEM_PROMPT = """You are an expert at rewritting question. +DISTRACTING_QUESTION_SYSTEM_PROMPT = """You are an expert at rewriting question. Your task is to re-write questions that will be used to evaluate a language model. Your task is to complexify questions given a provided context. @@ -201,7 +201,7 @@ def create_messages(cls, **kwargs): You will return the reformulated question as a single JSON object, with the key 'question'. Make sure you return a valid JSON object. """ -DISTRACTING_QUESTION_SYSTEM_PROMPT_WITH_DESCRIPTION = """You are an expert at rewritting question. +DISTRACTING_QUESTION_SYSTEM_PROMPT_WITH_DESCRIPTION = """You are an expert at rewriting questions. Your task is to re-write questions that will be used to evaluate the following model: - Model name: {model_name} - Model description: {model_description} From 8dabe830fdc4b48bd9dd4fb968b050fb4201e73c Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Fri, 9 Feb 2024 11:21:34 +0100 Subject: [PATCH 65/88] Docs update --- docs/open_source/testset_generation/index.md | 31 +++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/docs/open_source/testset_generation/index.md b/docs/open_source/testset_generation/index.md index 9070a51dd6..9f019f6aed 100644 --- a/docs/open_source/testset_generation/index.md +++ b/docs/open_source/testset_generation/index.md @@ -70,7 +70,7 @@ they are concatenated automatically. If only some of the columns contains releva ```python -knowledge_base_df = pd.read_*("path/to/your/knowledge_base") +knowledge_base_df = pd.read_csv("path/to/your/knowledge_base.csv") feature_names = ["col1", "col2"] knowledge_base_df["page_content"] = knowledge_base_df[feature_names].apply(" ".join, axis=1) ``` @@ -83,20 +83,29 @@ Once the knowledge base is loaded as a pandas `DataFrame`, you can generate the ```python from giskard.rag import KnowledgeBaseTestsetGenerator, DifficultyLevel -generator = KnowledgeBaseTestsetGenerator(knowledge_base_df, - model_name="Model name", # Optional, provide a name to your model to get better fitting questions - model_description="Description of the model", # Optional, briefly describe the task done by your model - knowledge_base_features=["page_content"]) +generator = KnowledgeBaseTestsetGenerator( + knowledge_base_df, + model_name="Model name", # Optional, provide a name to your model to get better fitting questions + model_description="Description of the model", # Optional, briefly describe the task done by your model + knowledge_base_features=["page_content"] +) -testset = generator.generate_dataset(num_samples=10, difficulty_level=[DifficultyLevel.DIFF_1, DifficultyLevel.DIFF_2]) +# Generate a testset with 10 questions & answers for each difficulty level +testset = generator.generate_dataset(num_samples=10, difficulty_level=[1, 2]) ``` -You can select the difficulty level of the generated questions. 
There are three distinct difficulty levels available: -- Level 1: basic questions generated from a piece of the knowledge base -- Level 2: question made more complex by paraphrasing -- Level 3: questions with distracting element +The test set will be a subclass of {ref}`giskard.Dataset`. You can also get it as a pandas DataFrame by accessing `testset.df`. + +Here's an example of the generated test set: -The generators creates `num_samples` questions per by difficulty level. In the above examples 10 *level 1* questions and 10 *level 2* questions. +| question | reference_context | reference_answer | difficulty_level | +|----------|-------------------|------------------|------------------| +| For which countries can I track my shipping? | What is your shipping policy? We offer free shipping on all orders over \$50. For orders below \$50, we charge a flat rate of \$5.99. We offer shipping services to customers residing in all 50 states of the US, in addition to providing delivery options to Canada and Mexico. ------ How can I track my order? Once your purchase has been successfully confirmed and shipped, you will receive a confirmation email containing your tracking number. You can simply click on the link provided in the email or visit our website's order tracking page. | We ship to all 50 states in the US, as well as to Canada and Mexico. We offer tracking for all our shippings. | 1 | + +You can select the difficulty level of the generated questions. We currently support three difficulty levels: +- Level 1: simple questions generated from a excerpt of the knowledge base +- Level 2: question made more complex by paraphrasing +- Level 3: questions made more difficult by adding a distracting element which is related to the knowledge base but irrelevant to the question ## Step 3: Wrap your model To evaluate your model, you must wrap it as a `giskard.Model`. This step is necessary to ensure a common format for your model and its metadata.You can wrap anything as long as you can represent it in a Python function (for example an API call call to Azure or OpenAI). We also have pre-built wrappers for LangChain objects, or you can create your own wrapper by extending the `giskard.Model` class if you need to wrap a complex object such as a custom-made RAG communicating with a vectorstore. 
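To make the generator API concrete as it stands after the patches above (enum values renamed in PATCH 63, docs updated in PATCH 65), here is a minimal sketch. The knowledge base rows, model name and model description are made-up placeholders; `KnowledgeBaseTestsetGenerator`, `knowledge_base_features`, `generate_dataset`, `num_samples`, `difficulty` and the `DifficultyLevel` values are taken from the diffs, and the import path for `DifficultyLevel` is the module shown in PATCH 63. Running it requires an OpenAI key configured as described earlier in the docs.

```python
import pandas as pd

from giskard.rag import KnowledgeBaseTestsetGenerator
from giskard.rag.knowledge_base_testset_generator import DifficultyLevel

# Placeholder knowledge base with two text columns used as context sources.
knowledge_base_df = pd.DataFrame(
    {
        "title": ["Shipping policy", "Order tracking"],
        "content": [
            "We ship to all 50 US states, as well as to Canada and Mexico.",
            "A tracking number is emailed once the order has shipped.",
        ],
    }
)

generator = KnowledgeBaseTestsetGenerator(
    knowledge_base_df,
    model_name="Shop assistant",                      # placeholder
    model_description="Answers customer questions.",  # placeholder
    knowledge_base_features=["title", "content"],
)

# One batch of `num_samples` questions is generated per difficulty level;
# DifficultyLevel subclasses int, so plain 1 / 2 / 3 work as well.
testset = generator.generate_dataset(
    num_samples=5,
    difficulty=[DifficultyLevel.EASY, DifficultyLevel.COMPLEX],
)
```

At this point in the series the returned `QATestset` still behaves like a `Dataset` (columns `question`, `reference_context`, `reference_answer`, `difficulty_level`); PATCH 70 below reworks it into a standalone class with `to_pandas()` and `to_test_suite()`.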
From 41273210ad21389aff6e9bd5e96e9c6d3401c72c Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Fri, 9 Feb 2024 13:14:19 +0100 Subject: [PATCH 66/88] Fixing LLM client --- giskard/llm/client/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/giskard/llm/client/base.py b/giskard/llm/client/base.py index 88e5ec2c0d..b9dcd5d13e 100644 --- a/giskard/llm/client/base.py +++ b/giskard/llm/client/base.py @@ -25,8 +25,8 @@ class LLMToolCall: class LLMMessage: role: str content: Optional[str] - function_call: Optional[LLMFunctionCall] - tool_calls: Optional[List[LLMToolCall]] + function_call: Optional[LLMFunctionCall] = None + tool_calls: Optional[List[LLMToolCall]] = None @staticmethod def create_message(role: str, content: str): From a3f28111be4ef22d1166973b40d7ba4b4de1a13b Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Fri, 9 Feb 2024 13:19:28 +0100 Subject: [PATCH 67/88] Make content optional in LLMMessage --- giskard/llm/client/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/giskard/llm/client/base.py b/giskard/llm/client/base.py index b9dcd5d13e..904258025a 100644 --- a/giskard/llm/client/base.py +++ b/giskard/llm/client/base.py @@ -24,7 +24,7 @@ class LLMToolCall: @dataclass class LLMMessage: role: str - content: Optional[str] + content: Optional[str] = None function_call: Optional[LLMFunctionCall] = None tool_calls: Optional[List[LLMToolCall]] = None From fc647f89bb747eb33e87e22d9be1d95be7434384 Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Fri, 9 Feb 2024 13:20:10 +0100 Subject: [PATCH 68/88] Fix evaluator and tests --- giskard/llm/evaluators/correctness.py | 4 +- .../evaluators/test_correctness_evaluator.py | 40 ++++++++++--------- .../test_knowledge_base_testset_generator.py | 9 +++-- 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/giskard/llm/evaluators/correctness.py b/giskard/llm/evaluators/correctness.py index 4c3cd0727f..09027e0559 100644 --- a/giskard/llm/evaluators/correctness.py +++ b/giskard/llm/evaluators/correctness.py @@ -137,7 +137,7 @@ def _evaluate_single(self, model: BaseModel, question, reference_answer, model_o temperature=self.llm_temperature, caller_id=self.__class__.__name__, ) - if out.function_call is None or "passed_test" not in out.function_call.args: + if out.function_call is None or "passed_test" not in out.function_call.arguments: raise LLMGenerationError("Invalid function call arguments received") - return out.function_call.args["passed_test"], out.function_call.args.get("reason") + return out.function_call.arguments["passed_test"], out.function_call.arguments.get("reason") diff --git a/tests/llm/evaluators/test_correctness_evaluator.py b/tests/llm/evaluators/test_correctness_evaluator.py index 368947c4c3..4327ef4b46 100644 --- a/tests/llm/evaluators/test_correctness_evaluator.py +++ b/tests/llm/evaluators/test_correctness_evaluator.py @@ -4,7 +4,7 @@ import pytest from giskard.datasets.base import Dataset -from giskard.llm.client import LLMFunctionCall, LLMOutput +from giskard.llm.client import LLMFunctionCall, LLMMessage from giskard.llm.evaluators.correctness import CorrectnessEvaluator from giskard.models.base.model_prediction import ModelPredictionResults @@ -44,20 +44,22 @@ def test_correctness_evaluator_correctly_flags_examples(): client = Mock() client.complete.side_effect = [ - LLMOutput( + LLMMessage( + role="assistant", function_call=LLMFunctionCall( - function="evaluate_model", - args={"passed_test": True, "reason": ""}, - ) + name="evaluate_model", + arguments={"passed_test": 
True, "reason": ""}, + ), ), - LLMOutput( + LLMMessage( + role="assistant", function_call=LLMFunctionCall( - function="evaluate_model", - args={ + name="evaluate_model", + arguments={ "passed_test": False, "reason": "The model output does not agree with the ground truth: Rome is the capital of Italy", }, - ) + ), ), ] @@ -84,7 +86,7 @@ def test_correctness_evaluator_correctly_flags_examples(): # Check LLM client calls arguments args = client.complete.call_args_list[0] assert "Your role is to test AI models" in args[0][0][0]["content"] - assert args[1]["functions"][0]["name"] == "evaluate_model" + assert args[1]["functions"][0]["function"]["name"] == "evaluate_model" def test_correctness_evaluator_handles_generation_errors(): @@ -93,20 +95,22 @@ def test_correctness_evaluator_handles_generation_errors(): client = Mock() client.complete.side_effect = [ - LLMOutput( + LLMMessage( + role="assistant", function_call=LLMFunctionCall( - function="evaluate_model", - args={"passed_test": True, "reason": ""}, - ) + name="evaluate_model", + arguments={"passed_test": True, "reason": ""}, + ), ), - LLMOutput( + LLMMessage( + role="assistant", function_call=LLMFunctionCall( - function="evaluate_model", - args={ + name="evaluate_model", + arguments={ "pass": False, "reason": "The model output does not agree with the ground truth: Rome is the capital of Italy", }, - ) + ), ), ] diff --git a/tests/rag/test_knowledge_base_testset_generator.py b/tests/rag/test_knowledge_base_testset_generator.py index 2b6bc77505..297eb6d7d3 100644 --- a/tests/rag/test_knowledge_base_testset_generator.py +++ b/tests/rag/test_knowledge_base_testset_generator.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd -from giskard.llm.client import LLMOutput +from giskard.llm.client import LLMMessage from giskard.rag import KnowledgeBaseTestsetGenerator @@ -38,9 +38,10 @@ def test_testset_generation(): llm_client = Mock() llm_client.complete.side_effect = ( [ - LLMOutput( - """{"question": "Where is Camembert from?", -"answer": "Camembert was created in Normandy, in the northwest of France."}""" + LLMMessage( + role="assistant", + content="""{"question": "Where is Camembert from?", +"answer": "Camembert was created in Normandy, in the northwest of France."}""", ) ] * 2 From 853f31f687d7369ac94aad704593ec46529358b4 Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Mon, 12 Feb 2024 15:40:48 +0100 Subject: [PATCH 69/88] Fix docs --- docs/open_source/testset_generation/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/open_source/testset_generation/index.md b/docs/open_source/testset_generation/index.md index 9f019f6aed..c775c52896 100644 --- a/docs/open_source/testset_generation/index.md +++ b/docs/open_source/testset_generation/index.md @@ -81,7 +81,7 @@ Once the knowledge base is loaded as a pandas `DataFrame`, you can generate the ```python -from giskard.rag import KnowledgeBaseTestsetGenerator, DifficultyLevel +from giskard.rag import KnowledgeBaseTestsetGenerator generator = KnowledgeBaseTestsetGenerator( knowledge_base_df, @@ -91,7 +91,7 @@ generator = KnowledgeBaseTestsetGenerator( ) # Generate a testset with 10 questions & answers for each difficulty level -testset = generator.generate_dataset(num_samples=10, difficulty_level=[1, 2]) +testset = generator.generate_dataset(num_samples=10, difficulty=[1, 2]) ``` The test set will be a subclass of {ref}`giskard.Dataset`. You can also get it as a pandas DataFrame by accessing `testset.df`. 
From 3b3417359f646726ed041e728aefbf86ca7dff19 Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Mon, 12 Feb 2024 15:41:40 +0100 Subject: [PATCH 70/88] Start refactoring of QATestset --- .../rag/knowledge_base_testset_generator.py | 2 +- giskard/rag/testset.py | 33 ++++++++++--------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py index 1d7cdc4e4a..9415b3df8c 100644 --- a/giskard/rag/knowledge_base_testset_generator.py +++ b/giskard/rag/knowledge_base_testset_generator.py @@ -251,4 +251,4 @@ def generate_dataset( else: logger.warning("Error in question generation, skipping it.") - return QATestset(df=pd.DataFrame(generated_questions), target=None) + return QATestset(pd.DataFrame(generated_questions)) diff --git a/giskard/rag/testset.py b/giskard/rag/testset.py index ad01267572..6a75df1902 100644 --- a/giskard/rag/testset.py +++ b/giskard/rag/testset.py @@ -1,11 +1,23 @@ -from .. import Dataset, Suite +import pandas as pd + +from ..core.suite import Suite from ..testing.tests.llm import test_llm_correctness -class QATestset(Dataset): - """A wrapper class around `Dataset` to allow automatic creation - of a `Suite` based on the question/answer pairs inside the `TestSet`. - """ +class QATestset: + def __init__(self, dataframe: pd.DataFrame): + self._dataframe = dataframe + + def to_pandas(self): + return self._dataframe + + def save(self, path): + self._dataframe.to_json(path, orient="records", lines=True) + + @classmethod + def load(cls, path): + dataframe = pd.read_json(path, orient="records", lines=True) + return cls(dataframe) def to_test_suite(self, name=None): suite_default_params = {"dataset": self} @@ -15,13 +27,4 @@ def to_test_suite(self, name=None): return suite def copy(self): - testset = QATestset( - df=self.df.copy(), - target=self.target, - column_types=self.column_types.copy(), - validation=False, - ) - - if hasattr(self, "column_meta"): - testset.load_metadata_from_instance(self.column_meta) - return testset + return QATestset(self.dataframe.copy()) From 50a51c1a0db93eb03ab47e2d9f66cf5837630179 Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Mon, 12 Feb 2024 16:48:50 +0100 Subject: [PATCH 71/88] Update docs --- docs/open_source/testset_generation/index.md | 89 ++++++-------------- 1 file changed, 27 insertions(+), 62 deletions(-) diff --git a/docs/open_source/testset_generation/index.md b/docs/open_source/testset_generation/index.md index c775c52896..dda45b9b81 100644 --- a/docs/open_source/testset_generation/index.md +++ b/docs/open_source/testset_generation/index.md @@ -1,22 +1,13 @@ # 🧰 RAG toolset -Retrieval Augmented Generative models (RAGs) combine LLM models and data sources to produce domain-specific language models able to answer precise questions whose answer are available inside a knowledge base. These models are often extremely specialized to a use-case defined by the information present inside the knowledge base. The specialization of the model makes generic evaluations irrelevant to verify the model's behavior (e.g. hallucinations, trustworthiness, etc.). To this end, the Giskard python library provides a toolset dedicated to RAG models that generates question/answer pairs from the knowledge base of the model. +The Giskard python library provides a toolset dedicated to Retrieval Augmented Generative models (RAGs) that generates question & answer pairs from the knowledge base of the model. The generated testset is then used to evaluate your model. 
-## How does it work? -The automatic testset generation explores the Knowledge Base (KB) of your model and generate questions and answers related to specific topics available inside the KB. Specifically, we randomly select a topic from the KB, then we extract the related excerpts from the KB to build a `reference_context`. Then we generate a `question` along with a `reference_answer` using an LLM (specifically, we use **OpenAI GPT-4**). +## Generate questions with difficulty levels +You can currently generate questions with three difficulty levels: +- **Easy questions (level 1):** simple questions generated from an excerpt of the knowledge base +- **Complex questions: (level 2)** questions made more complex by paraphrasing +- **Distracting questions (level 3):** questions made even more difficult by adding a distracting element which is related to the knowledge base but irrelevant to the question -The generated testset contains a list of questions specific to the model's knowledge base. The model should theoretically answer all these questions correctly. Yet, hallucination or imprecise answers can be generated by the model. This testset allows to quantify how frequent these undesired behaviors happen. - -### What data are being sent to OpenAI/Azure OpenAI - -In order to perform LLM-assisted detectors, we will be sending the following information to OpenAI/Azure OpenAI: - -- Data provided in your knowledge base -- Text generated by your model -- Model name and description - -### Will the testset generation work in any language? - -The testset quality depends on GPT-4 capabilities regarding your model's language. +These three difficulty levels allows to evaluate different components of your model. Easy questions are directly generated from your knowledge base. They assess the quality of the answer generation from the context, i.e. the quality of the LLM answer. Complex and distracting questions are more challenging as they can perturb the retrieval componenent of the RAG. These questions are more realistic of a user seeking precise information with your model. ## Before starting @@ -72,7 +63,6 @@ they are concatenated automatically. If only some of the columns contains releva ```python knowledge_base_df = pd.read_csv("path/to/your/knowledge_base.csv") feature_names = ["col1", "col2"] -knowledge_base_df["page_content"] = knowledge_base_df[feature_names].apply(" ".join, axis=1) ``` ## Step 2: Generate the testset @@ -87,7 +77,7 @@ generator = KnowledgeBaseTestsetGenerator( knowledge_base_df, model_name="Model name", # Optional, provide a name to your model to get better fitting questions model_description="Description of the model", # Optional, briefly describe the task done by your model - knowledge_base_features=["page_content"] + knowledge_base_features=feature_names ) # Generate a testset with 10 questions & answers for each difficulty level @@ -102,62 +92,27 @@ Here's an example of the generated test set: |----------|-------------------|------------------|------------------| | For which countries can I track my shipping? | What is your shipping policy? We offer free shipping on all orders over \$50. For orders below \$50, we charge a flat rate of \$5.99. We offer shipping services to customers residing in all 50 states of the US, in addition to providing delivery options to Canada and Mexico. ------ How can I track my order? Once your purchase has been successfully confirmed and shipped, you will receive a confirmation email containing your tracking number. 
You can simply click on the link provided in the email or visit our website's order tracking page. | We ship to all 50 states in the US, as well as to Canada and Mexico. We offer tracking for all our shippings. | 1 | -You can select the difficulty level of the generated questions. We currently support three difficulty levels: -- Level 1: simple questions generated from a excerpt of the knowledge base -- Level 2: question made more complex by paraphrasing -- Level 3: questions made more difficult by adding a distracting element which is related to the knowledge base but irrelevant to the question - ## Step 3: Wrap your model -To evaluate your model, you must wrap it as a `giskard.Model`. This step is necessary to ensure a common format for your model and its metadata.You can wrap anything as long as you can represent it in a Python function (for example an API call call to Azure or OpenAI). We also have pre-built wrappers for LangChain objects, or you can create your own wrapper by extending the `giskard.Model` class if you need to wrap a complex object such as a custom-made RAG communicating with a vectorstore. +Before evaluating your model, you must wrap it as a `giskard.Model`. This step is necessary to ensure a common format for your model and its metadata. You can wrap anything as long as you can represent it in a Python function (for example an API call call to Azure or OpenAI). We also have pre-built wrappers for LangChain objects, or you can create your own wrapper by extending the `giskard.Model` class if you need to wrap a complex object such as a custom-made RAG communicating with a vectorstore. To do so, you can follow the instructions from the [LLM Scan feature](../scan/scan_llm/index.md#step-1-wrap-your-model) or from the {doc}`Reference API `. Make sure that you pass `feature_names = "question"` when wrapping your model, so that it matches the question column of the testset. Detailed examples can also be found on our {doc}`LLM tutorials section `. -## Step 4: Evaluate your model -Once your `testset` is ready, you can evaluate your wrapped model using the `CorrectnessEvaluator`. This can be done directly or through a Giskard test which wraps the evaluator. The `CorrectnessEvaluator` asks a question to the given model and compares the model answer with the reference answer from the testset. Specifically, we use GPT-4 to assess whether the model answer is acceptable given the reference answer. - - -:::::::{tab-set} -::::::{tab-item} Direct Evaluation - -The `CorrectnessEvaluator` asks all the questions from the testset to your model and generate a `EvaluationResult` object with all samples from the testset split as pass or fail, and the indices of failed samples in the original testset. -```python -from giskard.llm.evaluators import CorrectnessEvaluator - -correctness_evaluator = CorrectnessEvaluator() -eval_result, failed_indices = correctness_evaluator.evaluate(giskard_model, testset) -``` -:::::: -::::::{tab-item} Giskard test -You can also evaluate your model with the `test_llm_correctness` function, which wraps the `CorrectnessEvaluator` and produce a `TestResult` object as all Giskard test functions. The model passes the test if the ratio of correct answer is above the specified threshold. -```python -from giskard.testing.tests.llm import test_llm_correctness - -test_result = test_llm_correctness(giskard_model, testset, threshold=0.8).execute() -``` -:::::: -::::::: - -## What's next? 
- -The questions generated in the testset may have highlighted some vulnerabilities of your model. There are 2 important actions you can take next: - -### 1. Generate a test suite from the testset: - -Turn the generated testset into an actionable test suite that you can save and reuse in further iterations. Note that you need to pass your wrapped model when executing the suite, since the suite was generated only from the testset. +## Step 4: Generate a test suite to evaluate your model +Once your `testset` is ready, you can turn it into an actionable test suite that you can save and reuse in further iterations. Note that you need to pass your wrapped model when executing the suite, since the suite is generated only from the testset. ```python test_suite = testset.to_test_suite("My first test suite") - -# You can run the test suite locally to verify that it reproduces the issues test_suite.run(giskard_model) ``` Jump to the [test customization](https://docs.giskard.ai/en/latest/open_source/customize_tests/index.html) and [test integration](https://docs.giskard.ai/en/latest/open_source/integrate_tests/index.html) sections to find out everything you can do with test suites. -### 2. Upload your test suite to the Giskard Hub to: + +## Next: upload your test suite to the Giskard Hub +Uploading a test suite to the hub allows you to: * Compare the quality of different models and prompts to decide which one to promote * Create more tests relevant to your use case, combining input prompts that make your model fail and custome evaluation criteria * Share results, and collaborate with your team to integrate business feedback @@ -166,11 +121,21 @@ To upload your test suite, you must have created a project on Giskard Hub and in Then, upload your test suite like this: ```python -test_suite.upload(giskard_client, project_key) +test_suite.upload(giskard_client, project_id) #project_id should be the id of the Giskard project in which you want to upload the suite ``` [Here's a demo](https://huggingface.co/spaces/giskardai/giskard) of the Giskard Hub in action. -## Troubleshooting +## What data are being sent to OpenAI/Azure OpenAI + +In order to perform LLM-assisted detectors, we will be sending the following information to OpenAI/Azure OpenAI: + +- Data provided in your knowledge base +- Text generated by your model +- Model name and description -If you encounter any issues, join our [Discord community](https://discord.gg/fkv7CAr3FE) and ask questions in our #support channel. +## Will the testset generation work in any language? +The testset quality depends on GPT-4 capabilities regarding your model's language. + +## Troubleshooting +If you encounter any issues, join our [Discord community](https://discord.gg/fkv7CAr3FE) and ask questions in our #support channel. 
\ No newline at end of file From e0631fe9e46a56145bfc96c08aebb962c26c75d1 Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Mon, 12 Feb 2024 18:21:06 +0100 Subject: [PATCH 72/88] Fixing correctness evaluator --- giskard/llm/evaluators/correctness.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/giskard/llm/evaluators/correctness.py b/giskard/llm/evaluators/correctness.py index 09027e0559..6ee7338952 100644 --- a/giskard/llm/evaluators/correctness.py +++ b/giskard/llm/evaluators/correctness.py @@ -1,6 +1,7 @@ from ...core.test_result import TestResultStatus, create_test_result_details from ...datasets import Dataset from ...models.base.model import BaseModel +from ..client.base import LLMMessage from ..errors import LLMGenerationError from .base import EVALUATE_MODEL_FUNCTIONS, EvaluationResult, LLMBasedEvaluator @@ -131,9 +132,9 @@ def _evaluate_single(self, model: BaseModel, question, reference_answer, model_o ) out = self.llm_client.complete( - [{"role": "system", "content": prompt}], - functions=self._make_evaluate_functions(), - function_call={"name": "evaluate_model"}, + [LLMMessage(role="system", content=prompt)], + tools=self._make_evaluate_functions(), + tool_choice={"type": "function", "function": {"name": "evaluate_model"}}, temperature=self.llm_temperature, caller_id=self.__class__.__name__, ) From 0c13ebe269604f993b772858b38fe1109f53cb7e Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Mon, 12 Feb 2024 18:21:26 +0100 Subject: [PATCH 73/88] Add conversion to dataset to testset --- giskard/rag/testset.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/giskard/rag/testset.py b/giskard/rag/testset.py index 6a75df1902..cf4c40a8bc 100644 --- a/giskard/rag/testset.py +++ b/giskard/rag/testset.py @@ -1,6 +1,7 @@ import pandas as pd from ..core.suite import Suite +from ..datasets.base import Dataset from ..testing.tests.llm import test_llm_correctness @@ -11,6 +12,9 @@ def __init__(self, dataframe: pd.DataFrame): def to_pandas(self): return self._dataframe + def to_dataset(self): + return Dataset(self._dataframe, name="QA Testset", target=False, validation=False) + def save(self, path): self._dataframe.to_json(path, orient="records", lines=True) From 46fbea2bd1166f44e729a07b8e0bfd5062de50a6 Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Mon, 12 Feb 2024 18:34:32 +0100 Subject: [PATCH 74/88] More fixes to correctness evaluator --- giskard/llm/evaluators/correctness.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/giskard/llm/evaluators/correctness.py b/giskard/llm/evaluators/correctness.py index 6ee7338952..a97c273b16 100644 --- a/giskard/llm/evaluators/correctness.py +++ b/giskard/llm/evaluators/correctness.py @@ -138,7 +138,11 @@ def _evaluate_single(self, model: BaseModel, question, reference_answer, model_o temperature=self.llm_temperature, caller_id=self.__class__.__name__, ) - if out.function_call is None or "passed_test" not in out.function_call.arguments: + + try: + passed_test = out.tool_calls[0].function.arguments["passed_test"] + reason = out.tool_calls[0].function.argumentsget("passed_test") + except (AttributeError, KeyError, IndexError, TypeError): raise LLMGenerationError("Invalid function call arguments received") - return out.function_call.arguments["passed_test"], out.function_call.arguments.get("reason") + return passed_test, reason From 6a7487a766c3718853f955ac0defaa41cc2a5c76 Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Mon, 12 Feb 2024 18:48:01 +0100 Subject: [PATCH 75/88] Fix typo --- 
giskard/llm/evaluators/correctness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/giskard/llm/evaluators/correctness.py b/giskard/llm/evaluators/correctness.py index a97c273b16..bd305ec1b7 100644 --- a/giskard/llm/evaluators/correctness.py +++ b/giskard/llm/evaluators/correctness.py @@ -141,7 +141,7 @@ def _evaluate_single(self, model: BaseModel, question, reference_answer, model_o try: passed_test = out.tool_calls[0].function.arguments["passed_test"] - reason = out.tool_calls[0].function.argumentsget("passed_test") + reason = out.tool_calls[0].function.arguments.get("passed_test") except (AttributeError, KeyError, IndexError, TypeError): raise LLMGenerationError("Invalid function call arguments received") From add75fb6dae44c90931f4e77adac960ae07fdc2d Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Mon, 12 Feb 2024 18:52:24 +0100 Subject: [PATCH 76/88] Fix reason --- giskard/llm/evaluators/correctness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/giskard/llm/evaluators/correctness.py b/giskard/llm/evaluators/correctness.py index bd305ec1b7..0a0838a090 100644 --- a/giskard/llm/evaluators/correctness.py +++ b/giskard/llm/evaluators/correctness.py @@ -141,7 +141,7 @@ def _evaluate_single(self, model: BaseModel, question, reference_answer, model_o try: passed_test = out.tool_calls[0].function.arguments["passed_test"] - reason = out.tool_calls[0].function.arguments.get("passed_test") + reason = out.tool_calls[0].function.arguments.get("reason") except (AttributeError, KeyError, IndexError, TypeError): raise LLMGenerationError("Invalid function call arguments received") From d6b88ecf4fb0ce681ef26bea1fef33dbb01c3f5c Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Mon, 12 Feb 2024 20:01:59 +0100 Subject: [PATCH 77/88] Fixing tests --- giskard/rag/testset.py | 5 +- .../evaluators/test_correctness_evaluator.py | 70 ++++++++++++------- .../test_knowledge_base_testset_generator.py | 16 +++-- tests/rag/test_testset_suite_conversion.py | 2 +- 4 files changed, 61 insertions(+), 32 deletions(-) diff --git a/giskard/rag/testset.py b/giskard/rag/testset.py index cf4c40a8bc..037f1140a5 100644 --- a/giskard/rag/testset.py +++ b/giskard/rag/testset.py @@ -9,6 +9,9 @@ class QATestset: def __init__(self, dataframe: pd.DataFrame): self._dataframe = dataframe + def __len__(self): + return len(self._dataframe) + def to_pandas(self): return self._dataframe @@ -24,7 +27,7 @@ def load(cls, path): return cls(dataframe) def to_test_suite(self, name=None): - suite_default_params = {"dataset": self} + suite_default_params = {"dataset": self.to_dataset()} name = name or "Test suite generated from testset" suite = Suite(name=name, default_params=suite_default_params) suite.add_test(test_llm_correctness, "TestsetCorrectnessTest", "TestsetCorrectnessTest") diff --git a/tests/llm/evaluators/test_correctness_evaluator.py b/tests/llm/evaluators/test_correctness_evaluator.py index 4327ef4b46..9fb9c1f4e4 100644 --- a/tests/llm/evaluators/test_correctness_evaluator.py +++ b/tests/llm/evaluators/test_correctness_evaluator.py @@ -5,6 +5,7 @@ from giskard.datasets.base import Dataset from giskard.llm.client import LLMFunctionCall, LLMMessage +from giskard.llm.client.base import LLMToolCall from giskard.llm.evaluators.correctness import CorrectnessEvaluator from giskard.models.base.model_prediction import ModelPredictionResults @@ -46,20 +47,32 @@ def test_correctness_evaluator_correctly_flags_examples(): client.complete.side_effect = [ LLMMessage( role="assistant", - 
function_call=LLMFunctionCall( - name="evaluate_model", - arguments={"passed_test": True, "reason": ""}, - ), + tool_calls=[ + LLMToolCall( + id="1", + type="function", + function=LLMFunctionCall( + name="evaluate_model", + arguments={"passed_test": True, "reason": ""}, + ), + ) + ], ), LLMMessage( role="assistant", - function_call=LLMFunctionCall( - name="evaluate_model", - arguments={ - "passed_test": False, - "reason": "The model output does not agree with the ground truth: Rome is the capital of Italy", - }, - ), + tool_calls=[ + LLMToolCall( + id="2", + type="function", + function=LLMFunctionCall( + name="evaluate_model", + arguments={ + "passed_test": False, + "reason": "The model output does not agree with the ground truth: Rome is the capital of Italy", + }, + ), + ) + ], ), ] @@ -85,8 +98,8 @@ def test_correctness_evaluator_correctly_flags_examples(): # Check LLM client calls arguments args = client.complete.call_args_list[0] - assert "Your role is to test AI models" in args[0][0][0]["content"] - assert args[1]["functions"][0]["function"]["name"] == "evaluate_model" + assert "Your role is to test AI models" in args[0][0][0].content + assert args[1]["tools"][0]["function"]["name"] == "evaluate_model" def test_correctness_evaluator_handles_generation_errors(): @@ -97,20 +110,29 @@ def test_correctness_evaluator_handles_generation_errors(): client.complete.side_effect = [ LLMMessage( role="assistant", - function_call=LLMFunctionCall( - name="evaluate_model", - arguments={"passed_test": True, "reason": ""}, - ), + tool_calls=[ + LLMToolCall( + id="1", + type="function", + function=LLMFunctionCall(name="evaluate_model", arguments={"passed_test": True, "reason": ""}), + ) + ], ), LLMMessage( role="assistant", - function_call=LLMFunctionCall( - name="evaluate_model", - arguments={ - "pass": False, - "reason": "The model output does not agree with the ground truth: Rome is the capital of Italy", - }, - ), + tool_calls=[ + LLMToolCall( + id="2", + type="function", + function=LLMFunctionCall( + name="evaluate_model", + arguments={ + "pass": False, + "reason": "The model output does not agree with the ground truth: Rome is the capital of Italy", + }, + ), + ) + ], ), ] diff --git a/tests/rag/test_knowledge_base_testset_generator.py b/tests/rag/test_knowledge_base_testset_generator.py index 297eb6d7d3..07983343f1 100644 --- a/tests/rag/test_knowledge_base_testset_generator.py +++ b/tests/rag/test_knowledge_base_testset_generator.py @@ -77,11 +77,15 @@ def test_testset_generation(): ) test_set = testset_generator.generate_dataset(num_samples=2) + assert len(test_set) == 2 - assert test_set.df.loc[0, "question"] == "Where is Camembert from?" - assert test_set.df.loc[0, "reference_answer"] == "Camembert was created in Normandy, in the northwest of France." - assert test_set.df.loc[0, "reference_context"] == CONTEXT_STRING - assert test_set.df.loc[0, "difficulty_level"] == 1 - assert test_set.df.loc[1, "question"] == "Where is Camembert from?" - assert test_set.df.loc[1, "reference_context"] == "\n------\n" + df = test_set.to_pandas() + + assert df.loc[0, "question"] == "Where is Camembert from?" + assert df.loc[0, "reference_answer"] == "Camembert was created in Normandy, in the northwest of France." + assert df.loc[0, "reference_context"] == CONTEXT_STRING + assert df.loc[0, "difficulty_level"] == 1 + + assert df.loc[1, "question"] == "Where is Camembert from?" 
+ assert df.loc[1, "reference_context"] == "\n------\n" diff --git a/tests/rag/test_testset_suite_conversion.py b/tests/rag/test_testset_suite_conversion.py index fa99ece0af..1fddafa80e 100644 --- a/tests/rag/test_testset_suite_conversion.py +++ b/tests/rag/test_testset_suite_conversion.py @@ -21,7 +21,7 @@ def make_testset_df(): def test_testset_suite_conversion(): - testset = QATestset(df=make_testset_df()) + testset = QATestset(make_testset_df()) suite = testset.to_test_suite() assert "dataset" in suite.default_params From 47f89941e7a86307719be438a4f0c1c2fd03b69d Mon Sep 17 00:00:00 2001 From: Pierre Le Jeune Date: Tue, 13 Feb 2024 09:19:17 +0100 Subject: [PATCH 78/88] Fix typo in testset copy method --- giskard/rag/testset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/giskard/rag/testset.py b/giskard/rag/testset.py index 037f1140a5..b500977366 100644 --- a/giskard/rag/testset.py +++ b/giskard/rag/testset.py @@ -34,4 +34,4 @@ def to_test_suite(self, name=None): return suite def copy(self): - return QATestset(self.dataframe.copy()) + return QATestset(self._dataframe.copy()) From af08e5db5c48f733d1579c8d4eccc36172f145c4 Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Tue, 13 Feb 2024 11:07:11 +0100 Subject: [PATCH 79/88] Fixing and reformatting LLMClient --- giskard/llm/client/openai.py | 40 +++++++++---------- ...generator.py => test_testset_generator.py} | 22 +++++----- 2 files changed, 31 insertions(+), 31 deletions(-) rename tests/rag/{test_knowledge_base_testset_generator.py => test_testset_generator.py} (81%) diff --git a/giskard/llm/client/openai.py b/giskard/llm/client/openai.py index fd534aa633..2f891d847c 100644 --- a/giskard/llm/client/openai.py +++ b/giskard/llm/client/openai.py @@ -68,9 +68,9 @@ def _serialize_message(response: LLMMessage) -> Dict: result = { "role": response.role, "content": response.content, - "function_call": BaseOpenAIClient._serialize_function_call(response.function_call) - if response.function_call - else None, + "function_call": ( + BaseOpenAIClient._serialize_function_call(response.function_call) if response.function_call else None + ), "tool_calls": BaseOpenAIClient._serialize_tool_calls(response.tool_calls) if response.tool_calls else None, } @@ -103,12 +103,16 @@ def _parse_message(response) -> LLMMessage: return LLMMessage( role=response["role"], content=response["content"], - function_call=BaseOpenAIClient._parse_function_call(response["function_call"]) - if "function_call" in response and response["function_call"] is not None - else None, - tool_calls=BaseOpenAIClient._parse_tool_calls(response["tool_calls"]) - if "tool_calls" in response and response["tool_calls"] is not None - else None, + function_call=( + BaseOpenAIClient._parse_function_call(response["function_call"]) + if "function_call" in response and response["function_call"] is not None + else None + ), + tool_calls=( + BaseOpenAIClient._parse_tool_calls(response["tool_calls"]) + if "tool_calls" in response and response["tool_calls"] is not None + else None + ), ) def complete( @@ -145,13 +149,9 @@ def _embeddings_generation(self, texts: Sequence[str], model: str): def embeddings( self, texts: Sequence[str], model: str = "text-embedding-ada-002", chunk_size: int = 2048 ) -> np.ndarray: - texts = [t.replace("\n", " ") for t in texts] - if not isinstance(chunk_size, int) or chunk_size > self._max_embedding_chunk_size or chunk_size < 1: - raise ValueError(f"Chunk size must be an integer between 0 and {self._max_embedding_chunk_size}.") - - chunks_indices = 
list(range(chunk_size, len(texts), chunk_size)) + chunks_indices = range(chunk_size, len(texts), chunk_size) chunks = np.split(texts, chunks_indices) - embedded_chunks = [self._embeddings_generation(list(chunk), model) for chunk in chunks] + embedded_chunks = [self._embeddings_generation(chunk, model) for chunk in chunks] return np.stack([emb for embeddings in embedded_chunks for emb in embeddings]) @@ -213,16 +213,16 @@ def _completion( def _embeddings_generation(self, texts: Sequence[str], model: str): try: - out = openai.Embedding.create(input=texts, engine=model) + out = openai.Embedding.create(input=list(texts), engine=model) embeddings = [element["embedding"] for element in out["data"]] - except openai.error.InvalidRequestError as err: raise ValueError( f"The embedding model: '{model}' was not found," "make sure the model is correctly deployed on your endpoint." ) from err except Exception as err: - raise ValueError("Embedding creation failed.") from err + raise RuntimeError("Embedding creation failed.") from err + return embeddings @@ -276,7 +276,7 @@ def _completion( def _embeddings_generation(self, texts: Sequence[str], model: str): try: - out = self._client.embeddings.create(input=texts, model=model) + out = self._client.embeddings.create(input=list(texts), model=model) embeddings = [element.embedding for element in out.data] except openai.NotFoundError as err: raise ValueError( @@ -285,6 +285,6 @@ def _embeddings_generation(self, texts: Sequence[str], model: str): f"the specified endpoint: {self._client._base_url}." ) from err except Exception as err: - raise ValueError("Embedding creation failed.") from err + raise RuntimeError("Embedding creation failed.") from err return embeddings diff --git a/tests/rag/test_knowledge_base_testset_generator.py b/tests/rag/test_testset_generator.py similarity index 81% rename from tests/rag/test_knowledge_base_testset_generator.py rename to tests/rag/test_testset_generator.py index 07983343f1..3b8dc818ec 100644 --- a/tests/rag/test_knowledge_base_testset_generator.py +++ b/tests/rag/test_testset_generator.py @@ -4,7 +4,7 @@ import pandas as pd from giskard.llm.client import LLMMessage -from giskard.rag import KnowledgeBaseTestsetGenerator +from giskard.rag import TestsetGenerator def make_knowledge_base_df(): @@ -58,25 +58,25 @@ def test_testset_generation(): llm_client.embeddings.side_effect = [kb_embeddings] knowledge_base_df = make_knowledge_base_df() - testset_generator = KnowledgeBaseTestsetGenerator( + testset_generator = TestsetGenerator( knowledge_base_df, model_name="Test model", model_description="This is a model for testing purpose.", llm_client=llm_client, context_neighbors=3, ) - testset_generator.rng = Mock() - testset_generator.rng.choice = Mock() - testset_generator.rng.choice.side_effect = list(query_embeddings) - - assert testset_generator.knowledge_base.index.d == 8 - assert testset_generator.knowledge_base.embeddings.shape == (4, 8) - assert len(testset_generator.knowledge_base.documents) == 4 - assert testset_generator.knowledge_base.documents[2].page_content.startswith( + testset_generator._rng = Mock() + testset_generator._rng.choice = Mock() + testset_generator._rng.choice.side_effect = list(query_embeddings) + + assert testset_generator._vector_store.index.d == 8 + assert testset_generator._vector_store.embeddings.shape == (4, 8) + assert len(testset_generator._vector_store.documents) == 4 + assert testset_generator._vector_store.documents[2].page_content.startswith( "Scamorza is a Southern Italian cow's milk cheese." 
) - test_set = testset_generator.generate_dataset(num_samples=2) + test_set = testset_generator.generate_testset(num_questions=2) assert len(test_set) == 2 From 2c14cd098dceed655de161a6496258e190093a7a Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Tue, 13 Feb 2024 11:09:39 +0100 Subject: [PATCH 80/88] Small refactoring --- giskard/llm/evaluators/correctness.py | 47 ++-- giskard/rag/__init__.py | 4 +- .../rag/knowledge_base_testset_generator.py | 254 ------------------ giskard/rag/testset_generator.py | 252 +++++++++++++++++ giskard/rag/vector_store.py | 1 + .../evaluators/test_correctness_evaluator.py | 6 +- 6 files changed, 275 insertions(+), 289 deletions(-) delete mode 100644 giskard/rag/knowledge_base_testset_generator.py create mode 100644 giskard/rag/testset_generator.py diff --git a/giskard/llm/evaluators/correctness.py b/giskard/llm/evaluators/correctness.py index 0a0838a090..c88dad8f9b 100644 --- a/giskard/llm/evaluators/correctness.py +++ b/giskard/llm/evaluators/correctness.py @@ -36,13 +36,9 @@ class CorrectnessEvaluator(LLMBasedEvaluator): - """Correctness evaluator class: assess the correctness of a model answers - given questions and associated reference answers. - """ + """Assess the correctness of a model answers given questions and associated reference answers.""" _default_eval_prompt = CORRECTNESS_EVALUATION_PROMPT - _question_feature_name = "question" - _reference_answer_feature_name = "reference_answer" def _make_evaluate_functions(self): return EVALUATE_MODEL_FUNCTIONS @@ -60,41 +56,32 @@ def evaluate( self, model: BaseModel, dataset: Dataset, - question_feature_name: str = None, - reference_answer_feature_name: str = None, + question_col: str = "question", + reference_answer_col: str = "reference_answer", ): - question_feature_name = ( - question_feature_name if question_feature_name is not None else self._question_feature_name - ) - reference_answer_feature_name = ( - reference_answer_feature_name - if reference_answer_feature_name is not None - else self._reference_answer_feature_name - ) - qa_feature_names = [question_feature_name, reference_answer_feature_name] - - # question and reference_answer feature names must be present in the dataset - if not (question_feature_name in dataset.df and reference_answer_feature_name in dataset.df): + if not (question_col in dataset.df and reference_answer_col in dataset.df): raise ValueError( - f"Missing at least one required feature in the evaluation dataset among: {qa_feature_names}." + f"Missing required columns in the evaluation dataset. Make sure the dataset has columns {question_col} and {reference_answer_col}." ) - # question feature name must be inside model's features - if question_feature_name not in model.feature_names: - raise ValueError(f"Missing question feature: '{question_feature_name}' inside model's features.") + if question_col not in model.feature_names: + raise ValueError( + f"Model has no feature '{question_col}'. Make sure your Model wrapper accepts '{question_col}'." 
+ ) model_outputs = model.predict(dataset).prediction + succeeded = [] failed = [] errored = [] status = [] reasons = [] - for idx, (evaluation_question, model_output) in enumerate(zip(dataset.df.to_dict("records"), model_outputs)): + for evaluation_question, model_output in zip(dataset.df.to_dict("records"), model_outputs): try: passed, reason = self._evaluate_single( model, - evaluation_question[question_feature_name], - evaluation_question[reference_answer_feature_name], + evaluation_question[question_col], + evaluation_question[reference_answer_col], model_output, ) reasons.append(reason) @@ -108,12 +95,12 @@ def evaluate( succeeded.append(sample) status.append(TestResultStatus.PASSED) else: - status.append(TestResultStatus.FAILED) failed.append(sample) + status.append(TestResultStatus.FAILED) except LLMGenerationError as err: - status.append(TestResultStatus.ERROR) - reasons.append(str(err)) errored.append({"message": str(err), "sample": {**evaluation_question, "model_output": model_output}}) + reasons.append(str(err)) + status.append(TestResultStatus.ERROR) return EvaluationResult( failure_examples=failed, @@ -142,7 +129,7 @@ def _evaluate_single(self, model: BaseModel, question, reference_answer, model_o try: passed_test = out.tool_calls[0].function.arguments["passed_test"] reason = out.tool_calls[0].function.arguments.get("reason") - except (AttributeError, KeyError, IndexError, TypeError): + except (AttributeError, KeyError, IndexError): raise LLMGenerationError("Invalid function call arguments received") return passed_test, reason diff --git a/giskard/rag/__init__.py b/giskard/rag/__init__.py index 1149de1ade..fb2ecc8cfb 100644 --- a/giskard/rag/__init__.py +++ b/giskard/rag/__init__.py @@ -1,4 +1,4 @@ -from .knowledge_base_testset_generator import DifficultyLevel, KnowledgeBaseTestsetGenerator from .testset import QATestset +from .testset_generator import DifficultyLevel, TestsetGenerator -__all__ = ["KnowledgeBaseTestsetGenerator", "QATestset", "DifficultyLevel"] +__all__ = ["TestsetGenerator", "QATestset", "DifficultyLevel"] diff --git a/giskard/rag/knowledge_base_testset_generator.py b/giskard/rag/knowledge_base_testset_generator.py deleted file mode 100644 index 9415b3df8c..0000000000 --- a/giskard/rag/knowledge_base_testset_generator.py +++ /dev/null @@ -1,254 +0,0 @@ -from typing import Sequence, Union - -import json -import logging -from enum import Enum - -import numpy as np -import pandas as pd - -from ..llm.generators import BaseDataGenerator -from .prompts import ( - FIX_JSON_FORMAT_PROMPT, - DistractingQuestionPrompt, - QAGenerationPrompt, - QuestionComplexificationPrompt, -) -from .testset import QATestset -from .vector_store import VectorStore - -logger = logging.getLogger(__name__) - - -class DifficultyLevel(int, Enum): - EASY = 1 - COMPLEX = 2 - DISTRACTING_ELEMENT = 3 - - -class KnowledgeBaseTestsetGenerator(BaseDataGenerator): - """Testset generator for testing RAG models. - - Explore a given knowledge base and generate question/answer pairs to test the model. 
- - Each generated item contains the following field - - question: a question about a part of the knowledge base - - reference_answer: the expected answer according to the knowledge base - - reference_context: relevant elements directly extracted from the knowledge base - - difficulty_level: an indicator of how difficult the question is - - Parameters - ---------- - knowledge_df: pd.DataFrame - a dataframe containing the whole knowledge base - model_name: str - name of the model to be tested - model_description: str - a description of the model to be tested, to get more fitting questions - context_neighbors: int - the maximum number of extracted element from the knowledge base to get a relevant context for question generation - context_similarity_threshold: float = 0.2 - a similarity threshold to filter irrelevant element from the knowledge base during context creation - context_window_length: int = 8192 - context window length of the llm used in the `llm_client` of the generator - embedding_fn: Callable = None - an embedding function to build the knowledge base index - language: str = "en" - the language in which question are generated (following ISO 639-1) - knowledge_base_features: Sequence[str] = None - a list of columns from the `knowledge_df` to include inside the knowledge base. If the - `knowledge_df` only has one column, it will be used by default has the content of - the knowledge base elements. If `knowledge_df` has multiple columns they will be - concatenated into a single column with the name of the column before the respective content. - If `knowledge_base_features` is specified, only the columns from it are considered. - - Example: "col_1: content column 1, col_2: content column 2" - seed: int = None - """ - - # _qa_generation_system_prompt = QA_GENERATION_SYSTEM_PROMPT - # _qa_generation_system_prompt_model = QA_GENERATION_SYSTEM_PROMPT_MODEL - # _qa_generation_context_example = QA_GENERATION_CONTEXT_EXAMPLE - # _qa_generation_assistant_example = QA_GENERATION_ASSISTANT_EXAMPLE - _fix_json_prompt = FIX_JSON_FORMAT_PROMPT - - def __init__( - self, - knowledge_df: pd.DataFrame, - model_name: str = None, - model_description: str = None, - context_neighbors: int = 4, - context_similarity_threshold: float = 0.2, - context_window_length: int = 8192, - language: str = "en", - knowledge_base_features: Sequence[str] = None, - seed: int = None, - include_examples: bool = True, - embedding_model: str = "text-embedding-ada-002", - *args, - **kwargs, - ): - super().__init__(*args, **kwargs) - self.model_name = model_name - self.model_description = model_description - self.context_neighbors = context_neighbors - self.context_similarity_threshold = context_similarity_threshold - self.embedding_model = embedding_model - self.context_window_length = context_window_length - self.language = language - self.rng = np.random.default_rng(seed=seed) - self.include_examples = include_examples - - self.knowledge_base = VectorStore.from_df( - knowledge_df, - lambda query: self.llm_client.embeddings(query, model=self.embedding_model), - features=knowledge_base_features, - ) - - def _difficulty_level_mapping(self, level: DifficultyLevel): - if level == DifficultyLevel.EASY: - return self._generate_question_answer_from_context - elif level == DifficultyLevel.COMPLEX: - return self._generate_complex_questions_from_context - elif level == DifficultyLevel.DISTRACTING_ELEMENT: - return self._generate_distraction_questions_from_context - else: - raise NotImplementedError(f"Missing case for difficulty level 
{level}.") - - def _generate_question_answer_from_context(self, context): - messages = QAGenerationPrompt.create_messages( - model_name=self.model_name, - model_description=self.model_description, - language=self.language, - user_content=context, - ) - - generated_qa = self._llm_complete(messages=messages) - generated_qa["difficulty"] = DifficultyLevel.EASY - return generated_qa - - def _generate_complex_questions_from_context(self, context): - generated_qa = self._generate_question_answer_from_context(context) - - messages = QuestionComplexificationPrompt.create_messages( - model_name=self.model_name, - model_description=self.model_description, - language=self.language, - user_content=(generated_qa["question"], context), - ) - generated_qa["difficulty"] = DifficultyLevel.COMPLEX - out = self._llm_complete(messages=messages) - generated_qa["question"] = out["question"] - return generated_qa - - def _generate_distraction_questions_from_context(self, context): - generated_qa = self._generate_question_answer_from_context(context) - - distracting_context = self.rng.choice(self.knowledge_base.documents).page_content - messages = DistractingQuestionPrompt.create_messages( - model_name=self.model_name, - model_description=self.model_description, - language=self.language, - user_content=(generated_qa["question"], generated_qa["answer"], distracting_context), - ) - generated_qa["difficulty"] = DifficultyLevel.DISTRACTING_ELEMENT - out = self._llm_complete(messages=messages) - generated_qa["question"] = out["question"] - return generated_qa - - def _extract_seed_context(self): - seed_embedding = self.rng.choice(self.knowledge_base.embeddings) - relevant_contexts = [ - context - for (context, score) in self.knowledge_base.vector_similarity_search_with_score( - seed_embedding[None], k=self.context_neighbors - ) - if score < self.context_similarity_threshold # should we keep it or not ? - ] - return relevant_contexts - - def _prevent_context_window_overflow(self, prompt): - # Prevent context overflow - # general rule of thumbs to count tokens: 1 token ~ 4 characters - # https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them - return prompt[: self.context_window_length * 4] - - def _llm_complete(self, messages: Sequence[dict]): - try: - out = self.llm_client.complete( - messages=messages, - temperature=self.llm_temperature, - caller_id=self.__class__.__name__, - ) - - generated = json.loads(out.content, strict=False) - except json.decoder.JSONDecodeError: - logger.warning("JSON decoding error, trying to fix the JSON string.") - generated = self._try_fix_json_message(out.content) - return generated - - def _try_fix_json_message(self, incorrect_json): - try: - out = self.llm_client.complete( - messages=[ - {"role": "system", "content": self._fix_json_prompt}, - {"role": "user", "content": incorrect_json}, - ], - temperature=0, - caller_id=self.__class__.__name__, - ) - corrected_message = json.loads(out.content) - except Exception: - logger.warning("Fixing JSON format failed, question generation skipped.") - return None - return corrected_message - - def generate_dataset( - self, - num_samples: int = 10, - difficulty: Union[DifficultyLevel, Sequence[DifficultyLevel]] = DifficultyLevel.EASY, - ) -> QATestset: - """Generates a testset from the knowledge base. - - Parameters - ---------- - num_samples : int - The number of question to generate, by default 10. - - Returns - ------- - QATestset - The generated test set. 
- Each generated question has the following field: - - *question*: a question about a part of the knowledge base - - *reference_answer*: the expected answer according to the knowledge base - - *reference_context*: relevant elements directly extracted from the knowledge base - - *difficulty_level*: an indicator of how difficult the question is - - """ - if not isinstance(difficulty, Sequence): - difficulty = [difficulty] - - generated_questions = [] - for level in difficulty: - for idx in range(num_samples): - logger.info(f"Generating question {idx + 1}/{num_samples} for difficulty level {str(level)}.") - seed_contexts = self._extract_seed_context() - context = QAGenerationPrompt.format_context(seed_contexts) - - generation_fn = self._difficulty_level_mapping(level) - generated_qa = generation_fn(context) - - if generated_qa is not None: - generated_questions.append( - { - "question": generated_qa["question"], - "reference_answer": generated_qa["answer"], - "reference_context": context, - "difficulty_level": generated_qa["difficulty"], - } - ) - else: - logger.warning("Error in question generation, skipping it.") - - return QATestset(pd.DataFrame(generated_questions)) diff --git a/giskard/rag/testset_generator.py b/giskard/rag/testset_generator.py new file mode 100644 index 0000000000..9d25c886a7 --- /dev/null +++ b/giskard/rag/testset_generator.py @@ -0,0 +1,252 @@ +from typing import Optional, Sequence, Union + +import json +import logging +from enum import Enum + +import numpy as np +import pandas as pd + +from ..llm.client import get_default_client +from ..llm.client.base import LLMClient +from .prompts import ( + FIX_JSON_FORMAT_PROMPT, + DistractingQuestionPrompt, + QAGenerationPrompt, + QuestionComplexificationPrompt, +) +from .testset import QATestset +from .vector_store import VectorStore + +logger = logging.getLogger(__name__) + + +class DifficultyLevel(int, Enum): + EASY = 1 + COMPLEX = 2 + DISTRACTING_ELEMENT = 3 + + +class TestsetGenerator: + """Testset generator for testing RAG models. + + Explore a given knowledge base and generate question/answer pairs to test the model. + + Each generated item contains the following field + - question: a question about a part of the knowledge base + - reference_answer: the expected answer according to the knowledge base + - reference_context: relevant elements directly extracted from the knowledge base + - difficulty_level: an indicator of how difficult the question is + + Parameters + ---------- + knowledge_base: pd.DataFrame + A dataframe containing the whole knowledge base. + knowledge_base_columns: Sequence[str], optional + The list of columns from the `knowledge_base` to consider. If not specified, all columns of the knowledge base + dataframe will be concatenated to produce a single document. + Example: if your knowledge base consists in FAQ data with columns "Q" and "A", we will format each row into a + single document "Q: [question]\\nA: [answer]" to generate questions. + language: str = "en" + The language used to generate questions (e.g. "fr", "de", ...) + model_name: str, optional + Name of the model to be tested, to get more fitting questions. + model_description: str, optional + Description of the model to be tested. 
+ context_neighbors: int + The maximum number of extracted element from the knowledge base to get a relevant context for question generation + context_similarity_threshold: float = 0.2 + A similarity threshold to filter irrelevant element from the knowledge base during context creation + context_window_length: int = 8192 + Context window length of the llm used in the `llm_client` of the generator + embedding_fn: Callable = None + Embedding function to build the knowledge base index. + seed: int = None + """ + + def __init__( + self, + knowledge_base: pd.DataFrame, + knowledge_base_columns: Sequence[str] = None, + language: str = "en", + model_name: str = None, + model_description: str = None, + context_neighbors: int = 4, + context_similarity_threshold: float = 0.2, + context_window_length: int = 8192, + seed: int = None, + include_examples: bool = True, + embedding_model: str = "text-embedding-ada-002", + llm_client: Optional[LLMClient] = None, + llm_temperature: float = 0.5, + ): + self._knowledge_base = knowledge_base + self._knowledge_base_columns = knowledge_base_columns + self._language = language + self._model_name = model_name + self._model_description = model_description + self._context_neighbors = context_neighbors + self._context_similarity_threshold = context_similarity_threshold + self._embedding_model = embedding_model + self._context_window_length = context_window_length + self._rng = np.random.default_rng(seed=seed) + self._include_examples = include_examples + self._vector_store_inst = None + self._llm_client = llm_client or get_default_client() + self._llm_temperature = llm_temperature + + @property + def _vector_store(self): + if self._vector_store_inst is None: + logger.debug("Initializing vector store from knowledge base.") + self._vector_store_inst = VectorStore.from_df( + self._knowledge_base, + lambda query: self._llm_client.embeddings(query, model=self._embedding_model), + features=self._knowledge_base_columns, + ) + return self._vector_store_inst + + def _get_generator_method(self, level: DifficultyLevel): + mapping = { + DifficultyLevel.EASY: self._generate_question_easy, + DifficultyLevel.COMPLEX: self._generate_question_complex, + DifficultyLevel.DISTRACTING_ELEMENT: self._generate_question_distracting_element, + } + + try: + return mapping[level] + except KeyError: + raise ValueError(f"Invalid difficulty level: {level}.") + + def _generate_question_easy(self, context: str) -> dict: + messages = QAGenerationPrompt.create_messages( + model_name=self._model_name, + model_description=self._model_description, + language=self._language, + user_content=context, + ) + + generated_qa = self._llm_complete(messages=messages) + generated_qa["difficulty"] = DifficultyLevel.EASY + return generated_qa + + def _generate_question_complex(self, context: str) -> dict: + generated_qa = self._generate_question_easy(context) + + messages = QuestionComplexificationPrompt.create_messages( + model_name=self._model_name, + model_description=self._model_description, + language=self._language, + user_content=(generated_qa["question"], context), + ) + generated_qa["difficulty"] = DifficultyLevel.COMPLEX + out = self._llm_complete(messages=messages) + generated_qa["question"] = out["question"] + return generated_qa + + def _generate_question_distracting_element(self, context: str): + generated_qa = self._generate_question_easy(context) + + distracting_context = self._rng.choice(self._knowledge_base.documents).page_content + messages = DistractingQuestionPrompt.create_messages( + 
model_name=self._model_name, + model_description=self._model_description, + language=self._language, + user_content=(generated_qa["question"], generated_qa["answer"], distracting_context), + ) + generated_qa["difficulty"] = DifficultyLevel.DISTRACTING_ELEMENT + out = self._llm_complete(messages=messages) + generated_qa["question"] = out["question"] + return generated_qa + + def _get_random_document_group(self): + seed_embedding = self._rng.choice(self._vector_store.embeddings) + relevant_contexts = [ + context + for (context, score) in self._vector_store.vector_similarity_search_with_score( + seed_embedding, k=self._context_neighbors + ) + if score < self._context_similarity_threshold + ] + + return relevant_contexts + + def _prevent_context_window_overflow(self, prompt: str): + # Prevent context overflow + # general rule of thumbs to count tokens: 1 token ~ 4 characters + # https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them + return prompt[: self._context_window_length * 4] + + def _llm_complete(self, messages: Sequence[dict]) -> dict: + try: + out = self._llm_client.complete( + messages=messages, + temperature=self._llm_temperature, + caller_id=self.__class__.__name__, + ) + + return json.loads(out.content, strict=False) + except json.decoder.JSONDecodeError: + logger.warning("JSON decoding error, trying to fix the JSON string.") + return self._try_fix_json_message(out.content) + + def _try_fix_json_message(self, incorrect_json: str): + out = self._llm_client.complete( + messages=[ + {"role": "system", "content": FIX_JSON_FORMAT_PROMPT}, + {"role": "user", "content": incorrect_json}, + ], + temperature=0, + caller_id=self.__class__.__name__, + ) + return json.loads(out.content) + + def generate_testset( + self, + num_questions: int = 10, + difficulty: Union[DifficultyLevel, Sequence[DifficultyLevel]] = DifficultyLevel.EASY, + ) -> QATestset: + """Generates a testset from the knowledge base. + + Parameters + ---------- + num_questions : int + The number of question to generate for each difficulty level. By default 10. + difficulty : Union[DifficultyLevel, Sequence[DifficultyLevel]] + The difficulty level of the questions to generate. Can be 1 (:attr:`DifficultyLevel.EASY`), 2 (:attr:`DifficultyLevel.COMPLEX`), + 3 (:attr:`DifficultyLevel.DISTRACTING_ELEMENT`) or a list of these values. By default will use the easy level. + + Returns + ------- + QATestset + The generated test set. + + """ + if not isinstance(difficulty, Sequence): + difficulty = [difficulty] + + generated_questions = [] + for level in difficulty: + for idx in range(num_questions): + logger.info(f"Generating question {idx + 1}/{num_questions} for difficulty level {str(level)}.") + context_docs = self._get_random_document_group() + context = QAGenerationPrompt.format_context(context_docs) + + generation_fn = self._get_generator_method(level) + + try: + generated_qa = generation_fn(context) + except Exception as e: + logger.error(f"Encountered error in question generation: {e}. 
Skipping.") + continue + + generated_questions.append( + { + "question": generated_qa["question"], + "reference_answer": generated_qa["answer"], + "reference_context": context, + "difficulty_level": generated_qa["difficulty"], + } + ) + + return QATestset(pd.DataFrame(generated_questions)) diff --git a/giskard/rag/vector_store.py b/giskard/rag/vector_store.py index 1e8c65eb40..255a37ef83 100644 --- a/giskard/rag/vector_store.py +++ b/giskard/rag/vector_store.py @@ -62,5 +62,6 @@ def similarity_search_with_score(self, query: Sequence[str], k: int) -> Sequence return self.vector_similarity_search_with_score(query_emb, k) def vector_similarity_search_with_score(self, query_emb: np.ndarray, k: int) -> Sequence: + query_emb = np.atleast_2d(query_emb) distances, indices = self.index.search(query_emb, k) return [(self.documents[i], d) for d, i in zip(distances[0], indices[0])] diff --git a/tests/llm/evaluators/test_correctness_evaluator.py b/tests/llm/evaluators/test_correctness_evaluator.py index 9fb9c1f4e4..1269211146 100644 --- a/tests/llm/evaluators/test_correctness_evaluator.py +++ b/tests/llm/evaluators/test_correctness_evaluator.py @@ -146,14 +146,14 @@ def test_correctness_evaluator_handles_generation_errors(): assert result.errors[0]["message"] == "Invalid function call arguments received" -def test_raises_error_if_missing_feature_in_dataset(): +def test_raises_error_if_missing_column_in_dataset(): dataset = _make_eval_dataset() dataset.df = dataset.df.drop("question", axis=1) model = _make_mock_model() evaluator = CorrectnessEvaluator(llm_client=Mock()) - with pytest.raises(ValueError, match="Missing at least one required feature in the evaluation dataset among"): + with pytest.raises(ValueError, match="Missing required columns in the evaluation dataset."): evaluator.evaluate(model, dataset) @@ -163,5 +163,5 @@ def test_raises_error_if_missing_feature_in_model(): model = _make_mock_model(feature_names=["reference_answer"]) evaluator = CorrectnessEvaluator(llm_client=Mock()) - with pytest.raises(ValueError, match="Missing question feature: 'question' inside model's features."): + with pytest.raises(ValueError, match="Model has no feature 'question'"): evaluator.evaluate(model, dataset) From 09801907e169fb34259d28a19b6cfe8831db38b1 Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Tue, 13 Feb 2024 11:24:58 +0100 Subject: [PATCH 81/88] Small refactoring --- giskard/rag/prompts.py | 29 ++++++++++++++--------------- giskard/rag/testset_generator.py | 2 +- giskard/rag/vector_store.py | 15 ++++++--------- tests/rag/test_document_creation.py | 10 +++++----- tests/rag/test_testset_generator.py | 2 +- tests/rag/test_vector_store.py | 6 +++--- 6 files changed, 30 insertions(+), 34 deletions(-) diff --git a/giskard/rag/prompts.py b/giskard/rag/prompts.py index 06f05c95f8..66533542b4 100644 --- a/giskard/rag/prompts.py +++ b/giskard/rag/prompts.py @@ -87,8 +87,7 @@ def _format_example_prompt(cls, examples): @classmethod def format_context(cls, contexts): - context_string = "\n------\n".join(["", *[doc.page_content for doc in contexts], ""]) - return context_string + return "\n------\n".join(["", *[doc.content for doc in contexts], ""]) @classmethod def create_messages( @@ -100,9 +99,7 @@ def create_messages( examples=None, user_content=None, ): - messages = list() - - messages.append(cls._format_system_prompt(model_name, model_description, language)) + messages = [cls._format_system_prompt(model_name, model_description, language)] if add_examples: messages.extend(cls._format_example_prompt(examples)) 
@@ -219,16 +216,21 @@ def create_messages(cls, **kwargs): You will return the reformulated question as a single JSON object, with the key 'question'. Make sure you return a valid JSON object. """ -DISCTRACTING_QUESTION_PROMPT_EXAMPLE = """ -What job offer do you have for engineering student? +DISTRACTING_QUESTION_USER_INPUT = """ +{question} -We have plenty of different jobs for engineering student depending on your speciality: mechanical engineer, data scientist, electronic designer and many more. +{answer} -Sometimes employers assume being accessible and inclusive only means providing physical access like ramps, accessible bathrooms and automatic opening doors. However, there are many other important ways to demonstrate that you welcome and want to attract a diverse workforce including people with disability. - -""" +{context} +""" + +DISCTRACTING_QUESTION_PROMPT_EXAMPLE = DISTRACTING_QUESTION_USER_INPUT.format( + question="What job offer do you have for engineering student?", + answer="We have plenty of different jobs for engineering student depending on your speciality: mechanical engineer, data scientist, electronic designer and many more.", + context="Sometimes employers assume being accessible and inclusive only means providing physical access like ramps, accessible bathrooms and automatic opening doors. However, there are many other important ways to demonstrate that you welcome and want to attract a diverse workforce including people with disability.", +) DISCTRACTING_QUESTION_ANSWER_EXAMPLE = """{ "question": "Do you have any job opening suitable for engineering students with a disability? " @@ -243,7 +245,4 @@ class DistractingQuestionPrompt(QuestionComplexificationPrompt): @classmethod def format_user_content(cls, question, answer, context): - context_string = ( - f"\n{question}\n\n\n{answer}\n\n\n{context}\n" - ) - return context_string + return DISTRACTING_QUESTION_USER_INPUT.format(question=question, answer=answer, context=context) diff --git a/giskard/rag/testset_generator.py b/giskard/rag/testset_generator.py index 9d25c886a7..58d29aa2ce 100644 --- a/giskard/rag/testset_generator.py +++ b/giskard/rag/testset_generator.py @@ -147,7 +147,7 @@ def _generate_question_complex(self, context: str) -> dict: def _generate_question_distracting_element(self, context: str): generated_qa = self._generate_question_easy(context) - distracting_context = self._rng.choice(self._knowledge_base.documents).page_content + distracting_context = self._rng.choice(self._knowledge_base.documents).content messages = DistractingQuestionPrompt.create_messages( model_name=self._model_name, model_description=self._model_description, diff --git a/giskard/rag/vector_store.py b/giskard/rag/vector_store.py index 255a37ef83..f291414637 100644 --- a/giskard/rag/vector_store.py +++ b/giskard/rag/vector_store.py @@ -10,15 +10,12 @@ class Document: """A class to wrap the elements of the knowledge base into a unified format.""" def __init__(self, document: dict, features: Optional[Sequence] = None): - if len(document) == 1: - self.page_content = list(document.values())[0] - elif features is not None and any([feat in document for feat in features]): - if len(features) == 1: - self.page_content = document[features[0]] - else: - self.page_content = "\n".join([f"{feat}: {document[feat]}" for feat in features]) + features = features if features is not None else list(document.keys()) + + if len(features) == 1: + self.content = document[features[0]] else: - self.page_content = "\n".join([f"{key}: {value}" for key, value 
in document.items()]) + self.content = "\n".join(f"{feat}: {document[feat]}" for feat in features) self.metadata = document @@ -51,7 +48,7 @@ def __init__(self, documents: Sequence[Document], embeddings: np.array, embeddin def from_df(cls, df: pd.DataFrame, embedding_fn: Callable, features: Sequence[str] = None): if len(df) > 0: documents = [Document(knowledge_chunk, features=features) for knowledge_chunk in df.to_dict("records")] - raw_texts = [d.page_content for d in documents] + raw_texts = [d.content for d in documents] embeddings = embedding_fn(raw_texts).astype("float32") return cls(documents, embeddings, embedding_fn) else: diff --git a/tests/rag/test_document_creation.py b/tests/rag/test_document_creation.py index 28b262b2fc..495ee7b97c 100644 --- a/tests/rag/test_document_creation.py +++ b/tests/rag/test_document_creation.py @@ -4,7 +4,7 @@ def test_single_feature_document_creation(): doc = Document({"feature": "This a test value for a feature"}) - assert doc.page_content == "This a test value for a feature" + assert doc.content == "This a test value for a feature" assert doc.metadata == {"feature": "This a test value for a feature"} @@ -17,7 +17,7 @@ def test_multiple_features_document_creation(): } ) assert ( - doc.page_content + doc.content == "feat1: This a test value for a feature 1\nfeat2: This a test value for a feature 2\nfeat3: This a test value for a feature 3" ) assert doc.metadata == { @@ -34,7 +34,7 @@ def test_multiple_features_document_creation(): }, features=["feat1"], ) - assert doc.page_content == "This a test value for a feature 1" + assert doc.content == "This a test value for a feature 1" doc = Document( { @@ -44,7 +44,7 @@ def test_multiple_features_document_creation(): }, features=["feat1", "feat2"], ) - assert doc.page_content == "feat1: This a test value for a feature 1\nfeat2: This a test value for a feature 2" + assert doc.content == "feat1: This a test value for a feature 1\nfeat2: This a test value for a feature 2" doc = Document( { @@ -55,6 +55,6 @@ def test_multiple_features_document_creation(): features=["feat4"], ) assert ( - doc.page_content + doc.content == "feat1: This a test value for a feature 1\nfeat2: This a test value for a feature 2\nfeat3: This a test value for a feature 3" ) diff --git a/tests/rag/test_testset_generator.py b/tests/rag/test_testset_generator.py index 3b8dc818ec..d2fc7cd828 100644 --- a/tests/rag/test_testset_generator.py +++ b/tests/rag/test_testset_generator.py @@ -72,7 +72,7 @@ def test_testset_generation(): assert testset_generator._vector_store.index.d == 8 assert testset_generator._vector_store.embeddings.shape == (4, 8) assert len(testset_generator._vector_store.documents) == 4 - assert testset_generator._vector_store.documents[2].page_content.startswith( + assert testset_generator._vector_store.documents[2].content.startswith( "Scamorza is a Southern Italian cow's milk cheese." 
) diff --git a/tests/rag/test_vector_store.py b/tests/rag/test_vector_store.py index 7ce6847714..46ca86fa91 100644 --- a/tests/rag/test_vector_store.py +++ b/tests/rag/test_vector_store.py @@ -60,9 +60,9 @@ def test_vector_store_similarity_search_with_score(): query = ["This is test string 50"] retrieved_elements = store.similarity_search_with_score(query, k=3) assert len(retrieved_elements) == 3 - assert retrieved_elements[0][0].page_content == "This is test string 50" + assert retrieved_elements[0][0].content == "This is test string 50" assert retrieved_elements[0][1] == 0.0 - assert retrieved_elements[1][0].page_content == "This is test string 49" + assert retrieved_elements[1][0].content == "This is test string 49" assert retrieved_elements[1][1] == 8.0 - assert retrieved_elements[2][0].page_content == "This is test string 51" + assert retrieved_elements[2][0].content == "This is test string 51" assert retrieved_elements[2][1] == 8.0 From 79d0ee3775b8638d32f4debbf5d233c1560a9e72 Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Tue, 13 Feb 2024 11:26:33 +0100 Subject: [PATCH 82/88] Fix test --- tests/rag/test_document_creation.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/tests/rag/test_document_creation.py b/tests/rag/test_document_creation.py index 495ee7b97c..b45d3b832b 100644 --- a/tests/rag/test_document_creation.py +++ b/tests/rag/test_document_creation.py @@ -1,3 +1,5 @@ +import pytest + from giskard.rag.vector_store import Document @@ -46,15 +48,12 @@ def test_multiple_features_document_creation(): ) assert doc.content == "feat1: This a test value for a feature 1\nfeat2: This a test value for a feature 2" - doc = Document( - { - "feat1": "This a test value for a feature 1", - "feat2": "This a test value for a feature 2", - "feat3": "This a test value for a feature 3", - }, - features=["feat4"], - ) - assert ( - doc.content - == "feat1: This a test value for a feature 1\nfeat2: This a test value for a feature 2\nfeat3: This a test value for a feature 3" - ) + with pytest.raises(KeyError): + doc = Document( + { + "feat1": "This a test value for a feature 1", + "feat2": "This a test value for a feature 2", + "feat3": "This a test value for a feature 3", + }, + features=["feat4"], + ) From 40a80ca48c2a6a1c661edb81d8287cf1849dbb10 Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Tue, 13 Feb 2024 12:28:49 +0100 Subject: [PATCH 83/88] Fix level 3 generator --- giskard/rag/testset_generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/giskard/rag/testset_generator.py b/giskard/rag/testset_generator.py index 58d29aa2ce..37ca69008b 100644 --- a/giskard/rag/testset_generator.py +++ b/giskard/rag/testset_generator.py @@ -144,10 +144,10 @@ def _generate_question_complex(self, context: str) -> dict: generated_qa["question"] = out["question"] return generated_qa - def _generate_question_distracting_element(self, context: str): + def _generate_question_distracting_element(self, context: str) -> dict: generated_qa = self._generate_question_easy(context) - distracting_context = self._rng.choice(self._knowledge_base.documents).content + distracting_context = self._rng.choice(self._vector_store.documents).content messages = DistractingQuestionPrompt.create_messages( model_name=self._model_name, model_description=self._model_description, From 63306d40f8459d039e5312740ac2499be97c86c7 Mon Sep 17 00:00:00 2001 From: Matteo Dora Date: Tue, 13 Feb 2024 12:28:57 +0100 Subject: [PATCH 84/88] Update RAG toolset docs --- 
docs/open_source/testset_generation/index.md | 105 ++++++++++++------ .../testset_generation/test_suite_widget.png | Bin 0 -> 27041 bytes .../rag-toolset/testset_generation.rst | 4 +- docs/reference/rag-toolset/vector_store.rst | 3 - 4 files changed, 74 insertions(+), 38 deletions(-) create mode 100644 docs/open_source/testset_generation/test_suite_widget.png diff --git a/docs/open_source/testset_generation/index.md b/docs/open_source/testset_generation/index.md index dda45b9b81..662135c259 100644 --- a/docs/open_source/testset_generation/index.md +++ b/docs/open_source/testset_generation/index.md @@ -1,13 +1,15 @@ -# 🧰 RAG toolset -The Giskard python library provides a toolset dedicated to Retrieval Augmented Generative models (RAGs) that generates question & answer pairs from the knowledge base of the model. The generated testset is then used to evaluate your model. +# 🧰 RAG Testset Generation +The Giskard python library provides a toolset dedicated to Retrieval Augmented Generative models (RAGs) that generates question & answer pairs from the knowledge base of the model. The generated test set is then used to evaluate your model. + +(difficulty_levels)= ## Generate questions with difficulty levels You can currently generate questions with three difficulty levels: - **Easy questions (level 1):** simple questions generated from an excerpt of the knowledge base - **Complex questions: (level 2)** questions made more complex by paraphrasing - **Distracting questions (level 3):** questions made even more difficult by adding a distracting element which is related to the knowledge base but irrelevant to the question -These three difficulty levels allows to evaluate different components of your model. Easy questions are directly generated from your knowledge base. They assess the quality of the answer generation from the context, i.e. the quality of the LLM answer. Complex and distracting questions are more challenging as they can perturb the retrieval componenent of the RAG. These questions are more realistic of a user seeking precise information with your model. +These three difficulty levels allow you to evaluate different components of your model. Easy questions are directly generated from your knowledge base. They assess the quality of the answer generation from the context, i.e. the quality of the LLM answer. Complex and distracting questions are more challenging as they can perturb the retrieval component of the RAG. These questions are more realistic of a user seeking precise information with your model. ## Before starting @@ -17,7 +19,7 @@ Before starting, make sure you have installed the LLM flavor of Giskard: pip install "giskard[llm]" ``` -To use the RAG testset generation and evaluation tools, you need to have an OpenAI API key. You can set it in your notebook +To use the RAG test set generation and evaluation tools, you need to have an OpenAI API key. You can set it in your notebook like this: :::::::{tab-set} @@ -55,63 +57,99 @@ set_llm_model('my-gpt-4-model') We are now ready to start. -## Step 1: Format and load your Knowledge Base -The RAG toolset currently only handles knowledge bases as pandas `DataFrame`. If the DataFrame has multiple columns, -they are concatenated automatically. 
If only some of the columns contains relevant information, you can specify it when building the generator by passing a list of column names to the `knowledge_base_features` argument (see [API Reference](https://docs.giskard.ai/en/latest/reference/rag-toolset/testset_generation.html#giskard.rag.KnowledgeBaseTestsetGenerator)). +## Step 1: Automatically generate a Q&A test set + +To start, you only need your data or knowledge base in a pandas `DataFrame`. Then, you can initialize the testset +generator ({class}`giskard.rag.TestsetGenerator`) by passing your dataframe. + +If some columns in your dataframe are not relevant for the generation of questions (e.g. they contain metadata), make sure you specify +column names to the `knowledge_base_columns` argument (see {class}`giskard.rag.TestsetGenerator`). + +To make the question generation more accurate, you can also provide a model name and a model description to the generator. This will help the generator to generate questions that are more relevant to your model's task. You can also specify the language of the generated questions. ```python + +from giskard.rag import TestsetGenerator + +# Load your data knowledge_base_df = pd.read_csv("path/to/your/knowledge_base.csv") -feature_names = ["col1", "col2"] + +# Initialize the testset generator +generator = TestsetGenerator( + knowledge_base_df, + knowledge_base_columns=["column_1", "column_2"], + language="en", # Optional, if you want to generate questions in a specific language + + # Optionally, you can provide a model name and description to improve the question quality + model_name="Shop Assistant", + model_description="A model that answers common questions about our products", +) ``` -## Step 2: Generate the testset -Once the knowledge base is loaded as a pandas `DataFrame`, you can generate the testset with the -`KnowledgeBaseTestsetGenerator`. +We are ready to generate the test set. We can start with a small test set of 10 questions and answers for each difficulty level. +Currently, you can choose the difficulty levels from 1 to 3 (see {ref}`difficulty_levels`) ```python -from giskard.rag import KnowledgeBaseTestsetGenerator +# Generate a testset with 10 questions & answers for each difficulty level (this will take a while) +testset = generator.generate_testset(num_questions=10, difficulty=[1, 2]) -generator = KnowledgeBaseTestsetGenerator( - knowledge_base_df, - model_name="Model name", # Optional, provide a name to your model to get better fitting questions - model_description="Description of the model", # Optional, briefly describe the task done by your model - knowledge_base_features=feature_names -) +# Save the generated testset +testset.save("my_testset.jsonl") + +# Load it back +from giskard.rag import QATestset -# Generate a testset with 10 questions & answers for each difficulty level -testset = generator.generate_dataset(num_samples=10, difficulty=[1, 2]) +loaded_testset = QATestset.load("my_testset.jsonl") ``` -The test set will be a subclass of {ref}`giskard.Dataset`. You can also get it as a pandas DataFrame by accessing `testset.df`. +The test set will be an instance of {ref}`giskard.rag.QATestset`. You can save it and load it later with `QATestset.load("path/to/testset.jsonl")`. 
+ +You can also convert it to a pandas DataFrame with `testset.to_pandas()`: -Here's an example of the generated test set: +```py +# Convert it to a pandas dataframe +df = loaded_testset.to_pandas() +``` + +Let's have a look at the generated questions: | question | reference_context | reference_answer | difficulty_level | |----------|-------------------|------------------|------------------| | For which countries can I track my shipping? | What is your shipping policy? We offer free shipping on all orders over \$50. For orders below \$50, we charge a flat rate of \$5.99. We offer shipping services to customers residing in all 50 states of the US, in addition to providing delivery options to Canada and Mexico. ------ How can I track my order? Once your purchase has been successfully confirmed and shipped, you will receive a confirmation email containing your tracking number. You can simply click on the link provided in the email or visit our website's order tracking page. | We ship to all 50 states in the US, as well as to Canada and Mexico. We offer tracking for all our shippings. | 1 | -## Step 3: Wrap your model -Before evaluating your model, you must wrap it as a `giskard.Model`. This step is necessary to ensure a common format for your model and its metadata. You can wrap anything as long as you can represent it in a Python function (for example an API call call to Azure or OpenAI). We also have pre-built wrappers for LangChain objects, or you can create your own wrapper by extending the `giskard.Model` class if you need to wrap a complex object such as a custom-made RAG communicating with a vectorstore. +As you can see, the data contains 4 columns: +- `question`: the generated question +- `reference_context`: the context that can be used to answer the question +- `reference_answer`: the answer to the question (generated with GPT-4) +- `difficulty_level`: the difficulty level of the question (1, 2 or 3) + +## Step 2: Evaluate your model on the generated test set -To do so, you can follow the instructions from the [LLM Scan feature](../scan/scan_llm/index.md#step-1-wrap-your-model) or from the {doc}`Reference API `. Make sure that you pass `feature_names = "question"` when wrapping your model, so that it matches the question column of the testset. +Before evaluating your model, you must wrap it as a `giskard.Model`. This step is necessary to ensure a common format for your model and its metadata. You can wrap anything as long as you can represent it in a Python function (for example an API call to Azure or OpenAI). We also have pre-built wrappers for LangChain objects, or you can create your own wrapper by extending the `giskard.Model` class if you need to wrap a complex object such as a custom-made RAG communicating with a vectorstore. + +To do so, you can follow the instructions from the [LLM Scan feature](../scan/scan_llm/index.md#step-1-wrap-your-model). Make sure that you pass `feature_names = "question"` when wrapping your model, so that it matches the question column of the test set. Detailed examples can also be found on our {doc}`LLM tutorials section `. +Once you have wrapped your model, we can proceed with evaluation. -## Step 4: Generate a test suite to evaluate your model -Once your `testset` is ready, you can turn it into an actionable test suite that you can save and reuse in further iterations. Note that you need to pass your wrapped model when executing the suite, since the suite is generated only from the testset. 
+Let's convert our test set into an actionable test suite ({class}`giskard.Suite`) that we can save and reuse in further iterations.
 ```python
 test_suite = testset.to_test_suite("My first test suite")
-test_suite.run(giskard_model)
+
+test_suite.run(model=giskard_model)
 ```
+![](./test_suite_widget.png)
+
 Jump to the [test customization](https://docs.giskard.ai/en/latest/open_source/customize_tests/index.html) and [test integration](https://docs.giskard.ai/en/latest/open_source/integrate_tests/index.html) sections to find out everything you can do with test suites.

-## Next: upload your test suite to the Giskard Hub
+## Step 3: Upload your test suite to the Giskard Hub
+
 Uploading a test suite to the hub allows you to:
 * Compare the quality of different models and prompts to decide which one to promote
 * Create more tests relevant to your use case, combining input prompts that make your model fail and custom evaluation criteria
@@ -126,16 +164,17 @@ test_suite.upload(giskard_client, project_id) #project_id should be the id of th
 [Here's a demo](https://huggingface.co/spaces/giskardai/giskard) of the Giskard Hub in action.
+
 ## What data are being sent to OpenAI/Azure OpenAI
-In order to perform LLM-assisted detectors, we will be sending the following information to OpenAI/Azure OpenAI:
+In order to perform the question generation, we will be sending the following information to OpenAI/Azure OpenAI:
 - Data provided in your knowledge base
 - Text generated by your model
 - Model name and description

-## Will the testset generation work in any language?
-The testset quality depends on GPT-4 capabilities regarding your model's language.
+## Will the test set generation work in any language?
+Yes, you can specify the language of the generated questions when you initialize the {class}`giskard.rag.TestsetGenerator`.

 ## Troubleshooting
-If you encounter any issues, join our [Discord community](https://discord.gg/fkv7CAr3FE) and ask questions in our #support channel.
\ No newline at end of file
+If you encounter any issues, join our [Discord community](https://discord.gg/fkv7CAr3FE) and ask questions in our #support channel.
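The upload call in the hunk above (`test_suite.upload(giskard_client, project_id)`) assumes that a `giskard_client` and a `project_id` already exist. A rough sketch of the connection step is shown here: the URL, API key and project key are placeholders, and the exact client signature may differ between Giskard versions, so check the Hub installation docs before relying on it.

```python
from giskard import GiskardClient

# Placeholders: point these at your own Giskard Hub instance.
url = "http://localhost:19000"
api_key = "YOUR_GSK_API_KEY"

giskard_client = GiskardClient(url, api_key)

# Create (or reuse) the project that will hold the test suite.
project_id = "rag_shop_assistant"
giskard_client.create_project(project_id, "Shop Assistant RAG", "Evaluation of the shop assistant RAG model")

test_suite.upload(giskard_client, project_id)
```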
diff --git a/docs/open_source/testset_generation/test_suite_widget.png b/docs/open_source/testset_generation/test_suite_widget.png new file mode 100644 index 0000000000000000000000000000000000000000..1ec3eb2326dc1b1f07d241d96c57f73f3246562f GIT binary patch literal 27041 zcmeFZWmp`~+CG>N5BC+d$LJw`%) z20tRtN1_4$N84ILLPbtOfS58&EN+8q28RoPsivwpm`SeM&{`1@s?7?$VmaF%3mJ@3Ibcr!6Ir3xfS7osI zSeb~!a5`PFacmJ>+j-%|E!bPnINyDWX7b^TuZW{4JGzAl;y8~bvBgsBu9k^+Km{d% zFf9B?HkJ2Jk`5gC0Y4&UO#)3G6FFf;O;e^FAMSZU*zYVuI8=y)faS;rJXXNU;;ZWI zg~G+_7W-|H2nOc3&h{63Kdnd!`Em2p5IX#F?`ZnH4a40r9J?R!Rw`?LH+2`gNM(du zS__~fXQ8P0hyh+ke)KrZ`Vk_$^cemnf`8#oNeq321pmc>erT*px|L)V_~BbMx~&j z5Og-T;8&BB{&#iwe?nANuC9*!tgId$9xNW5EDp|=tn6>zykTYIVCCRohF`(#;%V<{ z0%Epzq5fAR|I>~nz{Skj+R@e8!Jgt*yC$X%ZmvR9RKEoM@87@f69BUQk0g7Sf2##o zkoDIYR(2LP*8goAURChdQGOL`5Wr4X(i#Xi4_t;Y2M?#yks!dyN(kvJof zJI0ed4{L7^!=e#~E=C+^cfYln6cFHCs9(q!v3S6=uxx8@YrCG!I=G+pK5J9j9Lthd zq(z{j`12D_5U_SSY1rO z5q~Nviul)O2s=KNAxx6u5GN$UH4je%$l@xNR8f1pHe z3pA5q#A>D^jnr*2q3MogLARJCt3t`px{+aJy6i9OYtpE+xt7Mf2YA^NKmwHS&*;mf88WDf_%ne&qh1BC4N(86`C!&wKL7A*{haXy|{ipU6^axw{ zr}8+-;i;z49~J&QDT-)=5Kwq3Ph}cuj?3|1_C!HUoMep%rw8npqZCI&UjG?!kN#Ej zZJ6YLHA)P3Da5_MvHLF_Be3FA-y#+;VE+}KK)6S37O-dUBmWFc8gWrm2JGmvza-=~ z!K1PaC}VKL_~$LfBf|wtvX$ZdsY(Du8XlBQq)z+Qf7y@&zpV4O+h8GaHyk5is)k_YD>CR3e{At|B z*k2@brK3s*qEhBE=j&Y}OB4i1Hz>OCxU0H-z%gzZRcuCpx-r={*0(xeB6=*l-C<$X&I^Uk!0TeFWrX!@|a{U6#}=C z0;T)LNVfWfpQhIvH+dCJq8zOkzlU0Yo9KH^Yceo zPqVlz^EJWXQlU3g8U{AjLLOB3L6qFvrv ziz#mPjP$z&CU0j^5fQ`cl5EoUd2;pP(?16Tc#;YvP)8$RslbnQ3f7oo1D4RUp4_=X zREPz)>6TB=NBJw9+1=jd^i_D58_kp$QR`P5=DeTAXVOizv9u}j``t|8q8L;vN2WFt zy;2dd5gFN9CrG@aLZdCDM``|ggaddxTkKI>bfM=VvJ1_(wP^zs%#<$QrV9I~E2eRN zjRTY#Lvu&!PEo{BkAF_rwuKJT>G^F>e&X>z_R;Tfm3r2@jDKh0LNM0oB{iU>rXkYV z<~$XZ!c>P+Pd9ff8&KQZrfaB z7J*IA+VU+AN996Gp_6OVL+4y2ugx_w-4~*T^ijx_*D&7Oj2`yvZYy0c?^%s*mXo5f zEYqlDmIk;jHYzMp0ECz|ZNJGi#q+}KQ8o@e0V_dr_0&O2uWax;0YPoopgw^A^# z!;Vyv54Dt1Nt>_eXR36)Z=5DYjcMC5YYlw(o3~6Y?fp$$RnwM(p=UQ6l*GbrI%!wP zIeoms;OmCvFXJ-P1~=D$QV#8X&jJGa5z%m|6vNiRm{2yay>cXVvJ#=$`C0daJAwAQ zPU1VlG7YH%8J&v;;Fw&9SZoG2IQzP(`XnIm>mf@0;QaD(_rUlj zD6`@F%bCO7u#OgU+B3UpCv^c(UtboRLBcdn-2B5cVQb^K`+S3}g$wAwxziw00^y## z2YSxKDt-5Z3E6VzLY0{e(97p#Pnz<)&p39h$%6w~lX3BLMeYKLdZnfdubbkcv)@CT zLKlHCo9aU)Z!Ltd`r}7UY1!I+ngvnc2ja_U-_u=hK=6x`V!mpWeCsbia?(zDoX$b* zH+|hnxN1!Ow$6h$>mgt3xV3K$*E+Q3qwO4;J8x#4#UpAJ*p%6A?UE^Z4%&<&lq;Mf6lVLw5%%99-rUulZSvE zN_YojIC$bB1!7<kt{Qla0#{);?`L=iPe*01Ype#SjXU49r;_l)42RVJhWGg_s z&&;hCzu3!xfH5{x7`jAsYm>(BtVqE(TdQYd(r!2Cq}IJSYy1Vkm@c4VZe!yjIevJ zAkEb~v&<(ANO9(TO@XsUb&HD_C8e6`kV`kigjbiT zrP9{;rJ5doddLCvi`zCZvro;+b{1!r7+>ijBay6=?Z_yTZ%ZJ`j1#USs$p(cre`u3 zYlQV_4(jpRm=)W5YYk0_J0pRW&-QB~j%!5xWB`-w!;gtc z)s^x9u)B0t*^NM886&Rba(~QEj#S_ z;9s{~S+86cYarsj-5IuD+p+gt$i0?$xr_hRjC9M9G`JC-bs%Gl$2i7{)CBooTBs`u za9=$E`|kSVJ`0yMo#)yVLIV?)ntJ%OHSO8^FYW8;x0nYQPgnT4JY@aTbC%Ze&?(){ zpQl`Y8>RzVLU}maaB?UG4DO)FWUB_ce%Jb5;mi>>cWt#olj3be>te7vC7$^jwkO z;#K@t>ZIy;Qvtd8qxEdm9q-)~ZQ$oQJWAIOhUsVTsEaX9Ov^5acIQMlM|(rtK5lAv zyGJxy*M;iOO!2sz`FS-QWO1^Km#{8)RDS`nN{%o)81D-4E<$f=c&xhfbq0+)Hm*X zDSk_Y+L+Sf7{bAWzPpm!G%3+$*=9I6x|iYym~v!nk|umSB*Pv^-3K^9u65a!vvrF6 z4%t8J*7bRoUCf~)fYQzRu^D@jK0>w@yIC{32>Q=o{xcoQB|?7Dk-B!Lv~9YxD`(u2 zz0tXf-!^->`T!Q_;hLDO;v8LJn$0w)6(a+@Z*E%38daU6E)Z&dy7^ z%3qCeN%oxSWidCC^xkC7OhLUvTj%uUty(S|+0%;A|5Q{wmi5CQ=H_*q5w<^wB2t*V ze(kg+Epxs1-o$`lzo+2KW~Yh6e4B06;ZlxU9l_nz$I%OtwEA~iWoebr<7gtex8{Eg zOA_#5NyDkrRKKCb&g*pf;yOr3=#rh7*Rn$1rSnTR@vX6dRVB5-mt|D3q+l5aied~o z-EuIHIXo!Kd;fsr?o3HjXc=&ByVZMEGgCzp>am*J6}E6-d$`~ED%cexje`#_Uz!hk zA|kj?22uXG&Uj)uHKwnV^V}D>((;(ash5?FiB^3Jj)^1E0;8Qy^^2XaZz%(Z 
zs}?pi;bBqOiaSk1-BZMOXmJ?coKQ&nS3-`@d|Q%zuG^XD#pAzJckwht@x?6j`Kqs& z3NwqL>9{naCM~6#C>~gLBDAm~3~e0E6!nlO&$K61WZpBg2CIzQ8RDpIqieOGwb;~1 zYT#vm9>?TNxeF!a9b@<~n#wCQwa0Nni!-#bWE^s4-`+%7!3IV?^C<0SPb9=bNM$j2 zCa{VZ{ACQt@NrUt6}`JE;2fn5UHL-{6HeE{XSpw1(# z3xIU~rhhi-5ov>a{V>5kp5eNj@Wlwg5o*_+to=gf6j zi6Pi#Bv9!ro;XAN)uh8W;x|F5*I!R|dF7CfI`%VTLDfP1*tS6D@$cI3(8rHXkgAZE+;)-$JV%an=UnA)Td0_yP zPZ|(+m)f*1(|*2Shau4s-fu1iOqZWQSUxOC?z^4qI8xG1oL*XP!=3=W`TQ|d46qekl+`OZ*UBWkPKEC-Z(0o+3@=_vm zZ?zxQsNEy1!@4h-&Vb@%`AWbLzmj&>f#bDOs?dxckKMX@xw=CQR_~mvXC3l}|7tIT zrx=@AlhI|K-L-3Xw*JF1ORGeQWjmR`jX71v5h(ixad%>fG{a}84G%ubc{Lg`a=W8h zGgxK*|G2 zObSG9R7z2jO$HUwq(Pw^X!=0H*=W=*EN#*15!!LEaMg6Pfy+%+G^=N3C}PV?ur%0S z%w|;PtxfYXC};cL?mMvkXllkeo5kpP)NyXymzV5rAT4&OKm;{gb^ODr?!}voNgxv4 zi$LH`2UGRKZC#FeR(yYeoj1sKjc6@88e|UclieRJwk*yTw3WgMGo|{ zK9uh@8{jK!k-k~sg@z{S0_aj0gg`~ITgP`lJ#1DxQK7>XH9mn@-OO$FmXxDHWR$|H zW*NjSu6rGGZ|rqv#xF>EGNhTeBgiP;3yHSnyqtgkj`800zBPxCpD;ZCv=wvUo?tTKJP${t7;yBqZ-1pkJyd z0E7(2zzCSyFr&*+m!u(a9vZ*Ho$=%W{8<7XMm!XB-Dg7|?~lSThZM92`=hrxIlPK{ zo5WMe$;t|YUl^6*GwTN{Y7Fx(kt9Al63#DqG}OQ@K1SO<;v%Kwh-FWJC|^`J!pQ6v z(Z5J%Z{snf9dKwm;j?YD7 z8GB%NZs*lBMvkWz5NTn6u1JncFb^!v<_AuYq?AdKyJN@K%ogbn?s(bqX*08e|C#KR zd_`qYp)j zw@oU-WW+F`Qja(NZX!d@O6i^?4> zs>fhS(yp)Uj80LIkH6tHqCn~zsu(>Ch)e%-6ObIS+gsUJikfE;zvGObcjR)xo<+ST z6hEqsI&lbo@(Ng*Myd$U^6K8m^5n6rwZ69g3j-4UC>n#OzK6ILHocBUz|`P*va!xw zhNapeQ F_c6jR>URO~G8|#IX-Gx&4kKKvH~)&G-{P3Ztk?1dpH*+K_!m?cbzJo+ zKc9-RO55)2?T)UToZMK-d}!Ml^KXlWe-Qt)Ur-&x9-(Q(k%;i0#(@LnB}xd~9FL{k z5L6d`PrJX$SYdE z%0>UT?oPqE$dI~y-0!+Mn8}xlqAAxh{3}YRROk^nEkceQG5h>`4mV+2q$nUzqJXpZOS;Rrc`gs6Tn|}HQ*LisoDvi zsVHpyUa3wfOc_y{?rT(f_8;S@*rG7${LD~H6S=pt%=Rq0yBhaz`uc&+lC|U&W6YyG-=DITT+j$R ztJTC19{&j+{lxdkYjZBoD{L6U*}B8X{pRqQbg-fZ#E?kMfeTGBnoVt}cSn7g=f{KO zgJoUvDJ)I1M&1Jq?QJiv*Heru{2%VmPsT-VGIsIx?MyNpx?1;JV6ihLss-NxyxJLe zsXV}Rp|ffA+VkR;mN_NUUK#5(SDptN2SIn@hi`Z{kum`&d zRxum4O?O?cp%L@grDBi^L#t;xy*+lnm#1`B&sMhE-28Y+t*H*YoHjX!jI-*&t`#ff zqpC_5t>#M_&+HZ#(x;8NH$BTJzn{Xomj&%alrDQ>=Wl5iat)Um970@jrS6h7tBM)$ z`X2~1lPWc84(2U$fc2K+Rk_o31A-s5!&1GrKc(!!El?|RyNcPgo~*xNH!MT$YhIk= z=QYM*vicZ_Lof7^TVv&d&RBsA~$>WGmX2EQPOFF+#pi`c@&Lm3^DJRlE8k$oTgSvYS~wGvp)46 z&9^Ikm?B(cKWfir^v7omTt+MQ(?tBc7U4Dz9RyT1&LpM|y=8cK8Yg^Ra(BMqP>OeR zzHeKp*Z5XZ#vkf?u_zs;Toc2))G}!r={<3=?0;A;emXIc66bVC^v1c+_>+3E(p5#9 z+OSd<|MxMW3n+(C?=fuf##iRu-M)i=z1P`}=3J572ikc>-bHgLb-r{MhqUGKs}uv% zO8w?o7NgQ`IUd#5=UYOeiM&eR1?#jPcHP+B>^C>7D+@KEH`y*4+;21!|R#(e>p0x?ikRo(HJ6tUsxE7UFN+MzqzDX+e^)oo(#yg6t$lCAA1l*Htm z=X=HeP}#gx*Y{kY!a%e8{@V5>rDm0AHfIZg(?bsy!c`PevV`KNymz48+m>jO=YLnM6j5HcTcrt|Epo+y=?D;{ihFpWsqVI;_8>xpOGk4_60QlCK zn9-^%iTvm%J@L-?X$+Pu!4r7ys3cHF zA_L`W(p;`J6*M$VUut}hA@lIwV`RA-vcs&_FI!770PDs(a?pAkq?bnYByMLh! 
zBgAAQS%4>)H;sbV)HzLcWb4)rO=z=g15x#%TtDioubLR!l()N2O3V3&ie}}u0N35{ znl~d+rNHLJCTQKzfwJ1B%k?=oGeQ#O5yZB7enN&k>+D)|Q^f!V9dpNsoDqA&q%`kS zr91)bzsmB%2(S7u?tVf~yeNHrXlEL!G}a;=#YtmA)kkf?uSNp zGw`LRCy)4EMdvx>e98OpWtKT1DL^vSuJOB;AO93NqPm7GyOLcW2Jfj<_sxO8VXY}b zH6=C&9EljyP^?V7n5aJVq|_YTnTr+jh&nqIK(#2hUx2{zIHgfY>+AuK=NL^ptdrPo zWJHz6eS>g!I$zKjhqYf0;~<+R=7F*8pr4t5i0G!RSx{BTM@w73v8O7jkl$rPp&gzd zee8G$-gu={we2tv3%J|05`{e{wUHrl3VsMv(k0tt?L1ZUH~DmwV(hoQDRlZx!q4zo zirPj+{TPF`wx$Kv$>>11<1leoSc(J&j$)}<*Bb$MGx`o^Qw=%9`fRauYdap@H(8c_Avxca$+imcA!4lo zPmmz@H;0GAxKCn3%)YiBjc|^G;!t_p;V~wxsVH@m&amY-lFmKc{H7r)NFdxkNU++%gltIk~OR!0fM-*B0C#I!o6L?~urSu(Fblc>yiPx5sh* z4aD{>J%<8jO;-Lj)!p}yI2fuC{RZ{QYZ;l>p`dr^nAq$&K0hNwjB2`We-8V;q~a7$ z!|FO9Xb~~kT#3DibT{xh9>^YFhL1eaBz<0T{!<6m7X0$xeWcJgiM(&6sR1hh-rPw3LtI0!MwpkIyeI9BPM+`Nvs& zw@KQs_HEHI_1XrQ2faj;Nz5OvcEq~xhK+^C@9(xn_J>sgIp4+a8j|X?JDl?juh-tV zZ{F|{!}_&k7?H;{gx`IA(ce-uYwW*R4Hj`E%fF&{)FR53iw6tH$aeF}A+(jM;O508 zgT5pmSTXgAA>y^)8$LmL+{Y!PDfjXi9Q@V^x7Ui3j7Zo3+WIww?1?y8oY1+BaV#2T z#2YKl-asVNwUlkSsHQX{J_s?3z0}?SToy-+E7eY8c#%W#EoN?LQHDfbV85w zfSztU5u)qfQB>*13h%OIRr}4W%8jB-25c8h%0)9rJ1QBnz2UNe1Q`@~X~6?YkNd;F`tA z^=bN~1s?}lkZ;WS`%RasexWg^;o4;5GpBk6@6f5~pOLB=$WO&Ex!65n7Z)-x!)AmMd#(&_L3>XEYcUWdzVr^BIroQS z-v<>Vu@f8+7?>608Si1{<2rq>G88a220S<_Z}x}~URVmgZH?|nP7@z#+2j-^a2h%h zXBlq5*tk2%d8lx_$yBO%iyq=y5TQJ%VP|oj+XAy?wb3-RM~9dqd8z;<<#nDIfF}nO}5R8+e{MTpLrQ1^mCy5*2}Itn|PSpb%^!8 z{@pRcLe*fT?RJZ<+1|bK{p#?~VkbE>f;PUHgPR|R(mJpm3pZQHb3>;AJR?~rbxH{g z(_pmQL;nZwhi^n3bbAD_^p=vk=LCjidaWqzN2yU_j&AjP!MI9Cua8XGVm293V5VKm zem7TPmR_`;vc;nY2A<#i=R7_0jp#Zw33>6oI9pdw*kYa7X@lQqumWk&abQI~TteA8 znRi7boe9COOi$N(LXUzSVPPnkYaq3)J2Xs8c($ooCNoTere^`m>hVdQ7Yu(kz|&To*)e@6~TrCL-gSbg9z_TSTY)L^eoGP z_yb>#OQX!NWG)9H&)mBw~>5HZ2+OV?STkm`O^Eq=Bxn}LR(#w?~%X zh=Fd)er!yz>!nlWy9^p%QAg}Cyx><0MIX?PgIDrNZu-s$#D?w)IHZ3(1q(5yf9GYm z%_mE$!8mwW1A?C7Dd98LJkojaC-#j%z?l!~}TAq(j+C6%^#WffO%NgBzvA*<@ z-@28q-q`2pizzaFhtoX2O8xgvemLb!=;H1}gm)nVvG~C=WutSuZ+(FCKBKE`&xg)& zFC@sB_rqw_?IMQz*D7sY)vPuiz`+Y*PP323V>uULUF-fI*bkwkthHz!UC=T3&aeok zGR_)Ol)n8=2mSe6$}m;o^Q9x?o86`6d2a7TQ#yOk^yDIHFtrl{&S*(J*laZl@jqm;tU$$nF^yZ5wc05KrEXDv+J`Gf>RO~ZP?Cc6c}plb%mI=7 zLX9l4@7|f27X9T7e3hV<@kt$}AczC%UrM0*vTC`sm))NtkWD98=}oZOmU*v0>b8jx z5q|hwzFAc52nlaKYT!h|K`bdf4!?~rRstfa%A zIcUDF=l6-r6c*eRh!-Y3&LdFU+okl4lVxg%yCy{9Hvc)k^dy)L1|B|HYfrtLw#UJB6T~8Gbax13 zL!bOzs5e14*P$sai-c%W2K>N?&{4-eQv*$&f)p;nm0*smJw__}21Z{2=yZ?=!!+rp zfcng9vc#i&O6gjk@uZf)H}zcw31?eY~uZqg654uCYA#K71H-_Fx?NNfP9@BB z0eM+@Kp!OVZK(HY63IktRmoPqodK-v4q9s zF6h0EN6zXxD2ld9QNLnfF^M&>`9)%(;-G0XhhPaceJCF_$ZSzb0EVHFWaF9kNC@&_ zehMbDK=qJa4r{rI`P_E|Ir?huWG{JYF*!AKN&8H|5;f+lhDVE>wqYJ38<})^fKS~f#h9&yqn`+2K z)-HJ#G3a9pK5kh^n81frRLm5?nqrUmkaJ1$4Qzw?p^%?G)UsP8l>Mv)mDlj?MV`T^jM-Q7*unS6kEQiTL2l3pkH9wgPKS=2R>kU3PiCgdh~4 zx-n6ws{Gc~uWFesZkRv)l40@WgDvs*TBbYFSiuKGOsa(_uYA4-mwjs8$8X381&^dd z+58Tkpht+-sBT2iWq;Z3JV~047deZPH<&X>@GrG{s3vbJ;1QyHC5x>#j(+KoFV65z zO!BFO-Ko2&ef^mJ%fob3Y4NZyk?v)D$)Qld_VS>eQzWtRS;;RZ>9g2}G5^<7aBii* zbWDm~y|s7uTc|S(3Zg@In2!q5;(K&EvhE*92h5k>jjf!ZIq|-Qv|Zt-59d`zR$Dpf z6d~<>LzBQXVDl;uj{^7g64EbKe+JQeWleq{4t7hteNMpJL=bcMd?9xlA}Zc$XIWqQ>msg+402hISer+?!r;Xp!B#n;=uqPmqqkESJ3w$*`BHXIhB0$ zEjH?4rgWzFZmok(LK}H0w)Ap@TgX%qkEy8B(?0$b94XRAc2UmBu_o#XKR)E=U*h`+ zFb2h2_y>l+b#Qqtf5TuKyb&^{)NhVe+sZHawTdYF35ZmViv*(@6-Xzh!g44NSRgjU z>>=P!6ECrF^~_w(q(esvHv>K5l;W>p$#M_hh$lhFleo05VM~tNwln#eXV)MUZHp!z zxemgWL&xjopwD1@!7;6i+l3M$09>fw7v|kS!v5Lc95+wSBMj*xx@(1x8xo1SHzf{n zZs)azUWYy76>yCDRzo~&2{=4;o{jU(0$;*|7;h`JPV0&6KC3=v(;WGffY?CUM=5x+v+&zf}D53 zz`Ks^lcFTX!RYN1&hc@bme7$3)ol7+3M`yc66a%u!})PANv-%C2{{fLmhj*lgZR^e 
zv^5=(U<31V#^tM3ap^CJ1i39>KO6`+Ok?@h%NwL%2 z=2VkHG)VUI7t{~}<|zR>Ec;+b$0xM3Mml_)O(r7`&@ATr>OJ;nQ9?*-^VGMlN$a4N~AqrGkeb zsNzi8DgT`6Gr;eSYCVuL7L8l!v5gpr1U|yrVdW@nyY4G^}R{MA^wyS!8v+`1?lm`CS4erTZsQ0iAy< z?tBD}HTbvMxh}`vO7DetViW1Lz{X0=3c##74r?(=ngY*Ou}c z+jJMx8zc-~^v$Kg;zYVe6obeZSP(w0_KHUH6IgCYE#LJ#rD|y^!2Bwf3h+LZ1-xp zB&3rkyc{>5F8n4LTRsp6eU3>EXx`4lJMF->4^8+SKqj$MY)L4wQn&c7Bj^Z`$^Or1oEBR+6T( zBFJGQ8@(2!BD^51Qk+Mx<_AA}ZSvXr&F?mu^nl|1DP`^_oO}>M1$jJ?I=c0)aG#KjkV+(wAkG`z~uTtA+(-P60CJF$d#ldNG@(@3BqzrrwUEvQBp z-NU8~5?z;=->Qi^La*mIcK`qj)K6H(B-~=TFTi5M;3}A>E0!9wwP=2kjL!#L%QDhRK*EGo# zf*=;ta%UBes;$>M<-XuoWba$ICT_yR5CejcUJ+KgL}y?Vm=A?%H0LP* zy0&X6@0bbHdNEol*t^2fRI%&@Q_Z3$Odc_wY*Xg@c1T1XA+fwe4lF^6VB&`&gPL*W z7(uISK{~Jl-i1Iq0yXI6G&y0dkf5yCqmc9Cm@srznM?u9r-Zzo_F!c4;FrVm%3h#W ztUeMmHg;BRbgS`BeE^CcinQuA8ACJQE&h_%`=?KvYflHz9G8$fr&t%LvlB!n4IW(6S0v3q20A!|{<$%()^}Bm(>!HZi@;z*<89zFRGi6iY<{#`Oj?4oZ??OAQ^ytzk@s4X`11Iuca;!U(!*IE&T)%Kd)IJ2f znTq~EoA|_m;bQ+=?XZLJHu%m-Gm^edsE!V#>;UweqWlrZwS42?_=VwEHU$5|b$bH) zHFBw~arcNBGE%ksWG4FZA58UYp*#$q(|fBXOo;`n1omcXE2J# z5xcRmfGC@?iP<_zICJibo|7TUN)0zz5#NXj9A!?pyo}Buz{EKPPojMlC}}}K3VMl& z%_be~q#htJ1-06)mFST2CVrEREZQ`F{G%t}8$X%V)?M@x@*82nj`(+hr}f*?8C%vr ziuwhFCz*460mR|Du}|%=!yBS%2<2IW11QA+f zzFr487O`~TLIomJ!i}0+z%D`*+X%3rODtjO4T%FGs7Oh9zUw^5PMk(u)=Wu%wXZW_ z|JydcV6bauFB)tKi|X+lT}R{sz1gEjuNr^73jokSDmK^>^A33ojiu%DvUjgw>{Doq z_KH;RoPX@&U06?#m3)ym^2L|W={E{kvaXlMCNDP?*v7E=eYRyWZg7&Nx?g?(zro$? z>ujKc-Tv^Qw&SpAK*ApJ#E-u7->q!E^+fc*0n|P~pO|12V;_)>y}{$=o6H;96%Rz> zY4ZWxWw%?!D$^MmF4*dIrvbkM1$HOv^w%;jJtyOMEH+;2+s3#J)1-fC$ZKnu@mQD zi&;y5j0a@o3ziQ2;m~U57Y9pa#onq8wen=_qiEba#y|Q=7X(eB)S07`^6fHcZ7<{d zBUhZaeDvo5DNk8h4A%yY&)i^LGggzaYnpXl_QvVST;&?1u6D=vpUd<}J)Cr% z&PvUJf=sjtG`SoZm;_6w$2{9^PE%`Zy-!(B4?d-sq>~Y?O}45yipfD*0=`+;Yna44 z7XcmBSHR#HVQ4C6k{KDVy08HxgchY@08)qK@*$mC!9_|?+5G%f>$H!JcP=O#bC69kVL_rky=>mR(pO2GLpb^%;C zwrK#I;K0_wLu@fVdNg`QIV%)2#&zH6SUq(1ZdBNjT)f1wvveyRS5i9Di*c>b9h(-! 
z%+U7C1-R)Kko(n5xzZ{_j=6Y0Tu}1SMSLWoi=sFM&EomD=aR}t_cV=hS2KN_m=`Sk zXh)Njf~z0wYgSqj6SD8ov4_+M0Bn-hB&q~6m1Bo5(TWyEo-!iZx!ISi>=>N7qikm20)D!u3Zpd?pU|#%4)t z;I3YV>P;JOSdTW_Q=x7tl$L#mKOLd@NYD5;%2VzJ_R6bf*k<(EQf&IHS7S>VV&pFT z5oub|;Z95@`%bxt(mMO-7JcnEfAFew;4hwNbCLuSHFA@c0UDzo4Q85+zo{u>zT#`r z?3DRmR?RO$rf4OFN_R=Xh0=%~-_!gxfx$dHWwUeXNnEVR zluubAN)d#ur@4NU763&&Qev|&sxz~6o~`;s6`%3B>>_5}J}3DPhS4ucVmR=(UVQ1x zm;Qq%45w`I!-hIppe9plZ@zoZD$0IM=grKit;;FR~{NoTHnBEUiiP z7dLbP&guQ$L|t`<_a_xrHW*GYB>3HA-3iG?-yoZ!v#mY(|GkC*4RHZ z>Be8QXQ03u#Q0A|0wm#r|6i+sZtUgl)d}zb+NztZCFnhoFXealN3ItCU+dC`R#zS99Co zHq^wOpPZbuDbxMFA+Z8q_cgU2b}jhb9uN9x>01NfTj6rcT`xH;DjI+`62JTA>#vuJ zgcjJ)s~{Nr-!=N4PRQ`sOqT@Ot-=W^+Y8!B@soj2cJ82LjAG!OeVOYpP1(5 zK~j%7Eqx$hx2nDB7qgDX9?EC?EjM7l`{CaGn;6o3u2|4s4-)z5NmOKHg?6=(`;4xo zZUzrn>AjB(3fl4 z#x;^6_+vD=$dd5wkp!zdma=-0Lanji)c{NT9!L9Tws(b5rx)3AFbZyTm>S2n37qft z6LVo&ZV}E4Y5&FMVTEC0!Z3wS<8$E8qhp#zcuA0!_8uTnn$aYGey!i_=XY{!$5zB1UF;~@oz)uXP*yU{azU13da-PRH$2oj1&krqI@^iF7@{}hpq z3J3@&QbYwp5d=b!Zb8LD@1S%93{&Bl0CQH_8O?faNu*cE^wslAzn)l_uBQugkFJ|tJm618INvO5tQ|153jrI48k4AH z;EP~XU_*ZEQziazz%rXEwSXPVl`%R8)>nB!eXEULWA|2Af;Rrk2v#{f78i;BwKD(G z$CRe>vUh!Z)7Jc1?E7|Qv>K`TxPw8>&Ch-DqHWyHtJ^DO?^+_MgQfl2S!_lD*?UC; zRa^M-&O4Wmq^!X0Gl$OX!lRIr9zCZU$yx?2-o(aVrf|f18*`(Q^9hF|t?N>4Fa`fr z+XJ(1te67?qi2haKDG8GUUE}yA0>&jW#NPJLsHgA9~1mmMtb{{2y_Rk+tkS{$Kq3( z0O7lx!Q9q1W9G@jZ~DA5gpZZ3M!{f?m{{J(n>)$}&ju7&a!2gPhB)zzAcjFU2SJQ} ziv)&*oO!yaY_zGl3O2c1pKih(MjxSQMX?9c4oxz9ht(^#c@S#^``bY`@d+iQpa-H3 z0CzSU;MkQDZ|-sdTGAR#$NuHRx`B>%{c}TG7C+cog`O}5&0QQD{*kx%1x3tBwsa|6 zgO1nY$E?stAGPg5AscG+v9>yj9k1^$K)H}xYz1w$f}V4&Q7j6489_=sfL2TMK@GR` zhv*Av@WcnX=UYhrQ6w{j>lj_dvru(+v68LB$7PgJW@ejZ12oQ=Ha%w%vq+TE%e%L; zIWSf&Veh-o(p9S;LzmVrf|*Ft+J86e^gLf11zJt_}VaqHm^un`f;{ zO8*F9WC+l7Rf!mKh^;UoBRz1Q{Xom$;po?o+OVH5zfGQeT5|7|5R*mtlS3vzne$A- z!qQeO39;uqH@VZ}AMD@l&4g zX8G^FS9i@BM6VICRR-pd1WjJBxfGJ<+bcp=#%|-!C&IE{B!phO--bn(e;MTgV`ueU zInS5AcbD{(5zc9lkRUsg!@{i7>fxXC06v+T7!n+oKvbXDMWd7fcgbFb@r{X+W6&?cE_k&vX?S8!cOEzH0iId)x1m{7qM6UDI5-j2cmHu?Kq-s0** z8M|2&ZpM%<67)mt=0MwN6qN+vOCB6aY@Ty^R*v8vWi$t;y=YaMOOi5-5Z4@o+{ z^wID{xDE8^Pmlw*T}TBQdtEUzsYRnE4D@y2t^%`kw`Pxa7o_(pO!P=JEbYdTIg11UtZw+ilOAD=c(&w z2m0$ox~;1v*~LB@AKt>Fkj2&csS2x_Pl7#?ToAY;9P7AUX*`6=-H^sjqJ0&&N((CM zX};d!L_P4nqHX4`>aaB&Ioh>=;k)d_sr|g2(cO3XU;4T5RP9XO)?Mqqw;p2utn&{9 z=S!h@1%90>7o& z?BZ_)!X=2YU!n>zcWc0yU!)rNcGUuH(!^{_ZVs+l_I#7YaHy}S+>^#gxzY5{q^L{#hm1Al?y6M0>QK4 z)ZN28uYUvjr#oOwHm(6|vrpA3?y}<<3|Pw5NnEOW@nzN@0u@IA(buW0aq|Uh`s$U> zZ|%K4+XvKu2z%H1D=?8ub%ZZ|R18!`%vil*Fv6>eguW1`-dto8PHpe=)*j7 z2oiVKkR=Ru@A7PNXXOxF!5h_9Wc(Q&*?PP;8*pg zcDx&PbOlJ&T(qO#%8P{^If9fcU=F z=-wFoYjce@aT^pBd;JGMXH%t3uPo5>mb6oemyjL5Zvx~R4)=aqVQ)solmNByW^&kW zPOC~K3yR#UxJJQd6x{@G+c2Qu@2%?^_$?22F(iE0nId9@;%e15M_o2SNi|L>0n}_T zN{Uk?4|skAyh*bD>m#W&axfsuk|tu$d>Q238kN(H}^U zb;w|%(HnUafCzc@v7pI@Q&y}@+s!M|p8#GVFxLRwI}$Gfxl*k*ATt-ByI?N-tj$-e zjhv;O#TP&EHg0*~_CL#H2=9T2hVtPt4%v68={Xly2;s`mqQw)aQJkt4+tqT7y`>Aa z``asLv;M<2eJX$b`fP>DVrC>J^ZH0>+Ut9pV+sM^uMn+Nz;kGXB0Z(f#Ke$75@G^d zcgq0_99|>3)4Yj2j&L}UkG18*@E9{^+t{gYbn@>N$DY7B>w3-ir1^sfU4T%b!1Wp8 z^-%%7NCgCi>TiW+A98=~XI)*Kx_vgB5j{sMiAlLwSM4)14K-=w4*bf9KRy27BMzDx zDNk6U2&^>9RQ54I3*cZ`RzLp&s4l@38GzG&`z7)>Lw>BVxP-*!!Zct+rm0BU&Hk;-Y3fgHh0>u2-h-g-Oh4xeZ8w)A+59FiW3AFT6r zIe4jo&X?}Rdb_THFxxEoUOi-LZEpUk*bN9j9S{yc;uU(ZemxPCnnXp|6e!^8CF2$O z#|622(%u5qXz|(%#4dHOOHc`9H=?b}72`!+GY)2kSA z-?Wk89?D?n7m>)El^fj4_CD%49G$vujcSIn>RzD=k6i>JAj9G!PG>Yo;mgEI>z5LGm^|+gojjZ1_^v1ucqE5|b z*9T&6mOIoKR1?kQclL;3$=xKLU*)#TL1d%XC}BU5c3Eo-5ZU~$^j>}9Xj)(1gh$&f zL!vYKH>76N%AX~1$anX<*9Vg}@sq!^h7~3w-)1bxK=S8Zx94!OTenS*Y`6E(ekmAB 
zE>vcmT~J+Tef|1pfG`o4nA*@2y-(mCHhLY<_nE*QVq{<2EVH5td+ez9auQqRcp~|- zPosQI60M@~8v?>Fw28ctJO8Gb$J_ICpyDfbr}5-tBJ z=3Y?VAS?Grnmm_k>Ayo`+FCjsp!8WuF;aOheLS7{acC_bnKqtI_Sg#SBijT`c)iP8 zPTD}tr@8NjD7uMSmK?GWD`bIul(hP!S?UPD;jiW!K&FWj%{mUZL}xPk>ulD7u=p5# z>a1Rct={dZ*?M?Xkb$d-IrY1tCi2DZ{GtTwiZF3SHp-|8Xq;I!^67|0hmB5o*^f~@ydPn&H3=B_ zYU%?JyOrsCzwCL6^*0d^h)W{$I2~+w%zq~EXm{d!_-kc7l;dzKR>Z|%L%y{@sx_|| zi{h~|xYWXym8BiS;S%@iS9|hVKF-REIeO}^l6@7>T|XBGa;{jQ^jW*Opp10cl~=N? zl=x#m0BNYz=*dbB=YckUN{yM7$VmXeMcs;ejEC_Vl(u!{QDs@vcDGGbCap_9atcl- z3$=lO`kr%slJK0bq0^wNBK~G|Mv5+Sm%$k=!z9Qy8Wn_$6?b{#*7KavZN~@6EIn>Q z9eH@Qnahxqdd5-qS*#fXj3KX*2aN7 z%MHHi-LND$0rLGRVrS#8=@m74flJK>yeVA~&2fUVotEBKz5Gj1oY4z0nZY@ii}Sgf z9SlzfJmXNFQW$yYOyLnSt>{)v#RayG`xgWWdA)4YV4!eIoSk@@=^5|u^|;%JNE;Cg zH79P78FmX@K|>YJ-&bSBSzFz&H44CKvQP2T=qu1Vy!FJ(A0Dyrs)t7~TMXp$hc31Y zdBR|{$D*&%mdH3ATjNhvg%J^_$qq%SuAVTbE^UN3bY^+Hrep?lcH(gb``po&)ZdZS zvp3pGB6*>6Y0N^Vaxwx;`FfSqmHSaT98O%DLf4Lc4PcPoP2%7dH6IglinaL7kDeUG z+_oA#su`mxRrkKa2WM2Y1f5l+IoCupWh*=1dJ-lifw-FfW$aQ z-%B&0nW$UbIukjEjIGQppcHxQYcn<{GdNu9h!U_-eh)-jd)H6_mbSGrAJ~Vmqx={~ zbkO<{$J7A5ZxjAHoDK=`5)GYY-8Ls6Qn(?VjxaUQOP!`|txS=MoN>aKqNvZ1 zt@D1aeD3#2C&?pB1gv?TjvE?EwbkN{7sDQVM&83Am+v6wTM-|wZt=8997B_ubD1PP zfoQO4lhQpb5Q0rasyY=`N3!la$0}_`TJ8-HkS{_K)-7A`9D|SeMRU5Wq5VKIz#4V` zWORoFXUD7;Ul#;Nn~#02p3knK8y7OZ3C=TF{N+(WZrQfNe?3f%VojuWE+pz-!=oZM8%V6GqiA=MQf9d%2F`#W}2ptfIID$9BeY|PsEaxWyOG{lvy z1S>a9!~HQ5jG!>>#;mw_S2`Sb2FK1MY&+=xBjmw46b71y*HggvWTYcyKs-^p5w_rdP6A;!J? z=j1b$e#>Pp^|xL7TuLGD#l7;r!>f$@f~U{5%Pe$lkg>t}lUSFu+r~=$QIab%FH-Gr za(;EJ+xheFKb8;s=Xhm0Ods_lBq}})bT_jda1A7{xCZw>F?!4uH^1wDOq8%Wh4n28 zPLXH?#gt4_aQrWx^(Nl#D1S2d`RS-9-sS#9pDjwA?4Y=vXRk7WDch?4Z}qk6A@km^ zTvvJmko@wJLomhs&S#NE>Sv~9zg1GXUFnVDeyReu$bS>V>Z!MChpz zcBc?&*A%F4GitM(`p-aB(~CD{I5>ZqH-A#wGBu$ZZz!n5J0%~vyc8~!cNZE}Y|ejk zpj$wpS(dT2u5wMqkc%xxd(@7Ua!u@g<@}H!Ms|+N1ZCWbiupxF>>j$H;RS#`soTRe zYQpzed$N^_v1Mb>&&sDlJ<0d@q;=%9bn{tAI?nI}$tS+73OvjDv5uT&g~2z18J*NuEB;9p`*B;2y&0CLo8a_uw)_K zqwH{Vq4=eyxpOfu&e%#({F5V@M(S-`^5KZlgq)1h{63n2ypcElmr*$HF)E5peKURR zi@!XY%w#FE%>+WO^XG-?H-%GP08xHCd;VWjzWdN&Ixr@cCny!A{`F1&#UjFW+J+O@Wjz5# zr>edA_hRltpPakzt@S~`=|OA}#RLXSXkO?9TgLqRPVYl4&;FmG3gyJiH%@Z~{tl+u zCV$(iXhN8T?mK9`%XEBHo;~0D($W4?+^y78oJt=RrtVr?23GpODc{!D*Vn4#)AnD> z<1=$TwwOvWNT~BAtb*da3V1=-)SHSH{;{Lc@wlBTcXOmCNo84h0yJ6P=*FKhv?>WY zyhkXXGY27T{JPh9xQpzn+^ibXf3PsKSS+R9JnRPeXN+!bbL>Gv+9mtB>mUC94qnPz z3OujL9R>-1uDJ~+qTNUJ Date: Tue, 13 Feb 2024 12:36:25 +0100 Subject: [PATCH 85/88] Nice table in docs --- docs/open_source/testset_generation/index.md | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/docs/open_source/testset_generation/index.md b/docs/open_source/testset_generation/index.md index 662135c259..74a41d5a05 100644 --- a/docs/open_source/testset_generation/index.md +++ b/docs/open_source/testset_generation/index.md @@ -4,10 +4,21 @@ The Giskard python library provides a toolset dedicated to Retrieval Augmented G (difficulty_levels)= ## Generate questions with difficulty levels + You can currently generate questions with three difficulty levels: -- **Easy questions (level 1):** simple questions generated from an excerpt of the knowledge base -- **Complex questions: (level 2)** questions made more complex by paraphrasing -- **Distracting questions (level 3):** questions made even more difficult by adding a distracting element which is related to the knowledge base but irrelevant to the question + +```{list-table} +:header-rows: 1 +:widths: 35, 65 +* - Difficulty Level + - Description +* - **1: Easy questions** + - Simple questions generated 
from an excerpt of the knowledge base
+* - **2: Complex questions**
+  - Questions made more complex by paraphrasing
+* - **3: Distracting questions**
+  - Questions made even more difficult by adding a distracting element which is related to the knowledge base but irrelevant to the question
+```

 These three difficulty levels allow you to evaluate different components of your model. Easy questions are directly generated from your knowledge base. They assess the quality of the answer generation from the context, i.e. the quality of the LLM answer. Complex and distracting questions are more challenging as they can perturb the retrieval component of the RAG. These questions are more realistic of a user seeking precise information with your model.
@@ -159,7 +170,7 @@ To upload your test suite, you must have created a project on Giskard Hub and in
 Then, upload your test suite like this:

 ```python
-test_suite.upload(giskard_client, project_id) #project_id should be the id of the Giskard project in which you want to upload the suite
+test_suite.upload(giskard_client, project_id) # project_id should be the id of the Giskard project in which you want to upload the suite
 ```

 [Here's a demo](https://huggingface.co/spaces/giskardai/giskard) of the Giskard Hub in action.

From 7bc9f6c4ea28dac7af8fc36fb21d47618a8ce2a8 Mon Sep 17 00:00:00 2001
From: Matteo Dora
Date: Tue, 13 Feb 2024 12:40:15 +0100
Subject: [PATCH 86/88] Add docs for QATestset

---
 docs/open_source/testset_generation/index.md |  2 +-
 giskard/rag/testset.py                       | 31 ++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/docs/open_source/testset_generation/index.md b/docs/open_source/testset_generation/index.md
index 74a41d5a05..528d7f7e70 100644
--- a/docs/open_source/testset_generation/index.md
+++ b/docs/open_source/testset_generation/index.md
@@ -115,7 +115,7 @@ from giskard.rag import QATestset
 loaded_testset = QATestset.load("my_testset.jsonl")
 ```

-The test set will be an instance of {ref}`giskard.rag.QATestset`. You can save it and load it later with `QATestset.load("path/to/testset.jsonl")`.
+The test set will be an instance of {class}`~giskard.rag.QATestset`. You can save it and load it later with `QATestset.load("path/to/testset.jsonl")`.

 You can also convert it to a pandas DataFrame with `testset.to_pandas()`:

diff --git a/giskard/rag/testset.py b/giskard/rag/testset.py
index b500977366..c73afee46e 100644
--- a/giskard/rag/testset.py
+++ b/giskard/rag/testset.py
@@ -6,6 +6,8 @@


 class QATestset:
+    """A class to represent a testset for QA models."""
+
     def __init__(self, dataframe: pd.DataFrame):
         self._dataframe = dataframe

@@ -13,20 +15,48 @@ def __len__(self):
         return len(self._dataframe)

     def to_pandas(self):
+        """Return the testset as a pandas DataFrame."""
         return self._dataframe

     def to_dataset(self):
         return Dataset(self._dataframe, name="QA Testset", target=False, validation=False)

     def save(self, path):
+        """Save the testset as a JSONL file.
+
+        Parameters
+        ----------
+        path : str
+            The path to the output JSONL file.
+        """
         self._dataframe.to_json(path, orient="records", lines=True)

     @classmethod
     def load(cls, path):
+        """Load a testset from a JSONL file.
+
+        Parameters
+        ----------
+        path : str
+            The path to the input JSONL file.
+        """
         dataframe = pd.read_json(path, orient="records", lines=True)
         return cls(dataframe)

     def to_test_suite(self, name=None):
+        """
+        Convert the testset to a Giskard test suite.
+
+        Parameters
+        ----------
+        name : str, optional
+            The name of the test suite. If not provided, the name will be "Test suite generated from testset".
+
+        Returns
+        -------
+        giskard.Suite
+            The test suite.
+        """
         suite_default_params = {"dataset": self.to_dataset()}
         name = name or "Test suite generated from testset"
         suite = Suite(name=name, default_params=suite_default_params)
@@ -34,4 +64,5 @@ def to_test_suite(self, name=None):
         return suite

     def copy(self):
+        """Return a copy of the testset."""
         return QATestset(self._dataframe.copy())

From a9cd6883fad5f02f6aa96ceb3a8538367605485f Mon Sep 17 00:00:00 2001
From: Pierre Le Jeune
Date: Tue, 13 Feb 2024 15:09:51 +0100
Subject: [PATCH 87/88] Add warning message in the RAG toolset doc

---
 docs/open_source/testset_generation/index.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/open_source/testset_generation/index.md b/docs/open_source/testset_generation/index.md
index 528d7f7e70..ff688ba1e5 100644
--- a/docs/open_source/testset_generation/index.md
+++ b/docs/open_source/testset_generation/index.md
@@ -1,5 +1,8 @@
 # 🧰 RAG Testset Generation
+> ⚠️ **The RAG toolset is currently in beta version and is subject to change**. Feel free to reach out on our [Discord server](https://discord.gg/fkv7CAr3FE) if you have any trouble with test set generation or to provide feedback.
+
+
 The Giskard python library provides a toolset dedicated to Retrieval Augmented Generative models (RAGs) that generates question & answer pairs from the knowledge base of the model. The generated test set is then used to evaluate your model.

 (difficulty_levels)=
 ## Generate questions with difficulty levels

From 20da7c14d8eeb3f1e5ebf77fe9d9b3ccd60a14e0 Mon Sep 17 00:00:00 2001
From: Pierre Le Jeune
Date: Tue, 13 Feb 2024 15:34:16 +0100
Subject: [PATCH 88/88] Update docs

---
 docs/open_source/testset_generation/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/open_source/testset_generation/index.md b/docs/open_source/testset_generation/index.md
index ff688ba1e5..f52c34a0ed 100644
--- a/docs/open_source/testset_generation/index.md
+++ b/docs/open_source/testset_generation/index.md
@@ -1,6 +1,6 @@
 # 🧰 RAG Testset Generation

-> ⚠️ **The RAG toolset is currently in beta version and is subject to change**. Feel free to reach out on our [Discord server](https://discord.gg/fkv7CAr3FE) if you have any trouble with test set generation or to provide feedback.
+> ⚠️ **The RAG toolset is currently in early version and is subject to change**. Feel free to reach out on our [Discord server](https://discord.gg/fkv7CAr3FE) if you have any trouble with test set generation or to provide feedback.

 The Giskard python library provides a toolset dedicated to Retrieval Augmented Generative models (RAGs) that generates question & answer pairs from the knowledge base of the model. The generated test set is then used to evaluate your model.
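The `QATestset` methods documented in PATCH 86 above compose into a short round trip. A minimal sketch, using only the methods visible in that diff (`load`, `__len__`, `to_pandas`, `copy`, `to_test_suite`); the file name is a placeholder.

```python
from giskard.rag import QATestset

# Load a previously saved test set (placeholder path).
testset = QATestset.load("my_testset.jsonl")
print(len(testset), "questions loaded")

# Inspect it as a pandas DataFrame.
df = testset.to_pandas()
print(df.head())

# Work on a copy, then turn it into a reusable Giskard test suite.
suite = testset.copy().to_test_suite("RAG regression suite")
```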