"""rag_evaluator.py — retrieval precision/recall@k and answer-faithfulness metrics for RAG pipelines."""
import numpy as np
class RAGEvaluator:
    """Evaluate a RAG pipeline: retrieval quality and answer faithfulness."""

    def compute_retrieval_metrics(self, retrieved_ids, relevant_ids, k=5):
        """Compute precision@k and recall@k for a single query.

        Args:
            retrieved_ids: Ranked list of retrieved document ids (best first).
            relevant_ids: Collection of ground-truth relevant document ids.
            k: Rank cutoff; precision is taken over the top-k slice.

        Returns:
            dict with float keys "precision" and "recall".
        """
        # Guard: a non-positive cutoff would otherwise divide by zero.
        if k <= 0:
            return {"precision": 0.0, "recall": 0.0}
        retrieved_at_k = retrieved_ids[:k]
        relevant_set = set(relevant_ids)
        hits = [doc_id for doc_id in retrieved_at_k if doc_id in relevant_set]
        # Precision@k divides by k (not by len(retrieved_at_k)): retrieving
        # fewer than k documents is penalized, per the standard definition.
        precision = len(hits) / k
        # Recall@k: fraction of all relevant docs found; 0.0 when there are
        # no relevant docs (avoids division by zero).
        recall = len(hits) / len(relevant_set) if relevant_set else 0.0
        return {"precision": precision, "recall": recall}

    def compute_faithfulness(self, answer, context):
        """Placeholder faithfulness check: 1.0 iff the answer text appears
        verbatim in the retrieved context, else 0.0.

        Faithfulness asks whether the ANSWER is grounded in the CONTEXT, so
        the membership test is `answer in context` (the original had it
        inverted, testing whether the whole context appeared in the answer).
        A real implementation would call an LLM to verify individual claims.
        """
        return 1.0 if answer in context else 0.0