docs: added a quickstart nb (#41)
jjmachan authored Jun 9, 2023
1 parent efbecf8 commit 6b76cd3
Showing 8 changed files with 372 additions and 318 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -166,3 +166,4 @@ experiments/**/storage
 **/fil-result/
 experiments/baselines/fiqa/datasets
 src/ragas/_version.py
+.python-version
17 changes: 11 additions & 6 deletions README.md
@@ -55,18 +55,23 @@ This is a small example program you can run to see ragas in action!
 ```python
 
 from ragas import evaluate
+from datasets import Dataset
 import os
 
 os.environ["OPENAI_API_KEY"] = "your-openai-key"
 
-ds = Dataset({
-    features: ['question','context','answer'],
-    num_rows: 25
-})
-results = evaluate(ds)
+# prepare your huggingface dataset in the format
+# Dataset({
+#     features: ['question','contexts','answer'],
+#     num_rows: 25
+# })
+
+dataset: Dataset
+
+results = evaluate(dataset)
 
 ```
-If you want a more in-depth explanation of core components, check out our quick-start notebook
+If you want a more in-depth explanation of core components, check out our [quick-start notebook](./examples/quickstart.ipynb)
 ## :luggage: Metrics
 
 Ragas measures your pipeline's performance against two dimensions
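To make the new README snippet concrete, here is a minimal runnable version. The column names and the `evaluate(dataset)` call come from the diff above; the toy rows and the `Dataset.from_dict` construction are illustrative assumptions, not part of the commit.

```python
import os

from datasets import Dataset
from ragas import evaluate

os.environ["OPENAI_API_KEY"] = "your-openai-key"

# Toy rows purely for illustration; a real run would use your RAG
# pipeline's outputs. Column names follow the README diff above.
dataset = Dataset.from_dict(
    {
        "question": ["Where is the Eiffel Tower?"],
        "contexts": [["The Eiffel Tower is located in Paris, France."]],
        "answer": ["It is in Paris."],
    }
)

results = evaluate(dataset)
print(results)
```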
393 changes: 208 additions & 185 deletions examples/quickstart.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion experiments/assesments/metrics_assesments.ipynb
@@ -64,7 +64,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"os.chdir('/Users/shahules/belar/src/')"
+"os.chdir(\"/Users/shahules/belar/src/\")"
 ]
 },
 {
83 changes: 33 additions & 50 deletions experiments/baselines/fiqa/dataset-exploration-and-baseline.ipynb
@@ -48,7 +48,11 @@
 "from beir.datasets.data_loader import GenericDataLoader\n",
 "\n",
 "dataset = \"fiqa\"\n",
-"url = \"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip\".format(dataset)\n",
+"url = (\n",
+"    \"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip\".format(\n",
+"        dataset\n",
+"    )\n",
+")\n",
 "data_path = util.download_and_unzip(url, \"datasets\")"
 ]
 },
@@ -218,7 +222,7 @@
 "source": [
 "with open(os.path.join(data_path, \"corpus.jsonl\")) as f:\n",
 "    cs = [pd.Series(json.loads(l)) for l in f.readlines()]\n",
-"    \n",
+"\n",
 "corpus_df = pd.DataFrame(cs)\n",
 "corpus_df"
 ]
@@ -299,9 +303,7 @@
 }
 ],
 "source": [
-"corpus_df = corpus_df.rename(columns={\n",
-"    \"_id\": \"corpus-id\", \"text\": \"ground_truth\"\n",
-"})\n",
+"corpus_df = corpus_df.rename(columns={\"_id\": \"corpus-id\", \"text\": \"ground_truth\"})\n",
 "corpus_df = corpus_df.drop(columns=[\"title\", \"metadata\"])\n",
 "corpus_df[\"corpus-id\"] = corpus_df[\"corpus-id\"].astype(int)\n",
 "corpus_df.head()"
@@ -387,9 +389,7 @@
 "    qs = [pd.Series(json.loads(l)) for l in f.readlines()]\n",
 "\n",
 "queries_df = pd.DataFrame(qs)\n",
-"queries_df = queries_df.rename(columns={\n",
-"    \"_id\": \"query-id\", \"text\": \"question\"\n",
-"})\n",
+"queries_df = queries_df.rename(columns={\"_id\": \"query-id\", \"text\": \"question\"})\n",
 "queries_df = queries_df.drop(columns=[\"metadata\"])\n",
 "queries_df[\"query-id\"] = queries_df[\"query-id\"].astype(int)\n",
 "queries_df.head()"
@@ -474,10 +474,10 @@
 "splits = [\"dev\", \"test\", \"train\"]\n",
 "split_df = {}\n",
 "for s in splits:\n",
-"    split_df[s] = pd.read_csv(\n",
-"        os.path.join(data_path, f\"qrels/{s}.tsv\"), sep=\"\\t\"\n",
-"    ).drop(columns=[\"score\"])\n",
-"    \n",
+"    split_df[s] = pd.read_csv(os.path.join(data_path, f\"qrels/{s}.tsv\"), sep=\"\\t\").drop(\n",
+"        columns=[\"score\"]\n",
+"    )\n",
+"\n",
 "split_df[\"dev\"].head()"
 ]
 },
@@ -515,10 +515,14 @@
 "    df = queries_df.merge(split_df[split], on=\"query-id\")\n",
 "    df = df.merge(corpus_df, on=\"corpus-id\")\n",
 "    df = df.drop(columns=[\"corpus-id\"])\n",
-"    grouped = df.groupby('query-id').apply(lambda x: pd.Series({\n",
-"        'question': x['question'].sample().values[0],\n",
-"        'ground_truths': x['ground_truth'].tolist()\n",
-"    }))\n",
+"    grouped = df.groupby(\"query-id\").apply(\n",
+"        lambda x: pd.Series(\n",
+"            {\n",
+"                \"question\": x[\"question\"].sample().values[0],\n",
+"                \"ground_truths\": x[\"ground_truth\"].tolist(),\n",
+"            }\n",
+"        )\n",
+"    )\n",
 "\n",
 "    grouped = grouped.reset_index()\n",
 "    grouped = grouped.drop(columns=\"query-id\")\n",
@@ -797,11 +801,8 @@
 "assert os.path.exists(path_to_ds_repo), f\"{path_to_ds_repo} doesnot exist!\"\n",
 "\n",
 "for s in final_split_df:\n",
-"    final_split_df[s].to_csv(\n",
-"        os.path.join(path_to_ds_repo, f\"{s}.csv\"),\n",
-"        index=False\n",
-"    )\n",
-"    \n",
+"    final_split_df[s].to_csv(os.path.join(path_to_ds_repo, f\"{s}.csv\"), index=False)\n",
+"\n",
 "corpus_df.to_csv(os.path.join(path_to_ds_repo, \"corpus.csv\"), index=False)"
 ]
 },
@@ -1009,18 +1010,11 @@
 "from llama_index.node_parser import SimpleNodeParser\n",
 "from langchain.text_splitter import TokenTextSplitter\n",
 "\n",
-"spliter = TokenTextSplitter(\n",
-"    chunk_size = 100,\n",
-"    chunk_overlap = 50\n",
-")\n",
+"spliter = TokenTextSplitter(chunk_size=100, chunk_overlap=50)\n",
 "\n",
-"parser = SimpleNodeParser(\n",
-"    text_splitter=spliter\n",
-")\n",
+"parser = SimpleNodeParser(text_splitter=spliter)\n",
 "\n",
-"nodes = parser.get_nodes_from_documents(\n",
-"    documents=docs\n",
-")"
+"nodes = parser.get_nodes_from_documents(documents=docs)"
 ]
 },
 {
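The collapsed call above configures token-based chunking: 100-token chunks with a 50-token overlap between consecutive chunks. A standalone sketch of the splitter on its own, outside LlamaIndex; `split_text` is langchain's generic text-splitter method, and the toy string is an assumption for illustration:

```python
from langchain.text_splitter import TokenTextSplitter

text = "word " * 500  # toy document; any long string works

# Same parameters as the diff: 100-token chunks, 50-token overlap,
# so consecutive chunks share roughly half their tokens.
splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=50)
chunks = splitter.split_text(text)
print(len(chunks), chunks[0][:40])
```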
@@ -1088,16 +1082,12 @@
 "source": [
 "# create index\n",
 "index = GPTVectorStoreIndex.from_documents(\n",
-"    documents=docs, \n",
+"    documents=docs,\n",
 "    service_context=openai_sc,\n",
 ")\n",
 "\n",
 "# query with embed_model specified\n",
-"qe = index.as_query_engine(\n",
-"    mode=\"embedding\", \n",
-"    verbose=True, \n",
-"    service_context=openai_sc\n",
-")"
+"qe = index.as_query_engine(mode=\"embedding\", verbose=True, service_context=openai_sc)"
 ]
 },
 {
@@ -1171,10 +1161,7 @@
 "\n",
 "# query with embed_model specified\n",
 "qe = index.as_query_engine(\n",
-"    mode=\"embedding\", \n",
-"    verbose=True, \n",
-"    service_context=openai_sc,\n",
-"    use_async = False\n",
+"    mode=\"embedding\", verbose=True, service_context=openai_sc, use_async=False\n",
 ")"
 ]
 },
@@ -1195,15 +1182,13 @@
 "\n",
 "# configure retriever\n",
 "retriever = VectorIndexRetriever(\n",
-"    index=index, \n",
+"    index=index,\n",
 "    similarity_top_k=3,\n",
 ")\n",
 "\n",
 "# configure response synthesizer\n",
 "response_synthesizer = ResponseSynthesizer.from_args(\n",
-"    node_postprocessors=[\n",
-"        SimilarityPostprocessor(similarity_cutoff=0.7)\n",
-"    ]\n",
+"    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)]\n",
 ")\n",
 "\n",
 "# assemble query engine\n",
@@ -1257,9 +1242,10 @@
 "    r = qe.query(row[\"question\"])\n",
 "    row[\"answer\"] = r.response\n",
 "    row[\"contexts\"] = [sn.node.text for sn in r.source_nodes]\n",
-"    \n",
+"\n",
 "    return row\n",
 "\n",
+"\n",
 "# generate_response(test_ds[0])"
 ]
 },
@@ -1530,10 +1516,7 @@
 "from ragas.metrics import factuality, answer_relevancy, context_relevancy\n",
 "from ragas import evaluate\n",
 "\n",
-"evaluate(\n",
-"    gen_ds, \n",
-"    metrics=[factuality, answer_relevancy, context_relevancy]\n",
-")"
+"evaluate(gen_ds, metrics=[factuality, answer_relevancy, context_relevancy])"
 ]
 },
 {
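Reading the last two hunks together, the flow appears to be: map each test question through the query engine to collect an answer and its retrieved contexts, then score the resulting dataset with ragas. A hedged sketch of that glue, assuming `test_ds` is a `datasets.Dataset` with a `question` column and reusing the notebook's `generate_response` and metric imports; the wiring below is an assumption, not shown verbatim in the diff:

```python
from ragas import evaluate
from ragas.metrics import answer_relevancy, context_relevancy, factuality

# `test_ds` and `generate_response` are defined earlier in the notebook;
# mapping adds "answer" and "contexts" columns to each row.
gen_ds = test_ds.map(generate_response)
scores = evaluate(gen_ds, metrics=[factuality, answer_relevancy, context_relevancy])
print(scores)
```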
