# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import dataiku
from langchain.schema import Document
from langchain.retrievers import BM25Retriever
import pickle
import io

# Read the "chunks" dataset into a DataFrame; the cells below use its
# "chunk", "chunk_id" and "url" columns.
chunks_dataset = dataiku.Dataset("chunks")
df = chunks_dataset.get_dataframe()

# Handle on the folder that will receive the pickled retriever
# ("qkyfR2rB" looks like a managed-folder id -- confirm in the project flow).
folder = dataiku.Folder("qkyfR2rB")

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Wrap each row of `df` in a LangChain Document: the chunk text becomes the
# page content, while the chunk id and source URL travel along as metadata.
documents = [
    Document(
        metadata={"chunk_id": df.at[row, "chunk_id"], "url": df.at[row, "url"]},
        page_content=df.at[row, "chunk"],
    )
    for row in df.index
]

# Build the BM25 retriever over all documents. k=5 is forwarded to the
# retriever (presumably the number of hits returned per query -- confirm
# against the LangChain documentation).
bm25_retriever = BM25Retriever.from_documents(documents, k=5)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Serialize the retriever in memory and store the resulting bytes in the
# folder under "bm25result.pkl".
retriever_bytes = pickle.dumps(bm25_retriever)
folder.upload_data("bm25result.pkl", retriever_bytes)
