import os

from langchain.schema.document import Document
from langchain_community.document_loaders import PyPDFLoader
from langchain_elasticsearch import ElasticsearchStore
from langchain_text_splitters import RecursiveCharacterTextSplitter

from get_embedding_function import get_embedding_function

ELASTICSEARCH_URL = "http://localhost:9200"
INDEX_NAME = "rag_index"


def afegirDoc(data_path: str):
    """Create (or update) the Elasticsearch index from the PDF at *data_path*.

    Runs the three pipeline stages in order: ``load_documents`` ->
    ``split_documents`` -> ``add_to_elasticsearch``.

    Args:
        data_path: Path of the PDF file to ingest.

    Returns:
        The status string produced by ``add_to_elasticsearch``:
        ``"success"``, ``"nothing_new_added"``, or an error message.
    """
    return add_to_elasticsearch(split_documents(load_documents(data_path)))


def load_documents(data_path: str):
    """Load the PDF at *data_path* with ``PyPDFLoader``.

    Args:
        data_path: Path of the PDF file to read.

    Returns:
        Whatever ``PyPDFLoader.load()`` yields for that file (the documents
        later passed to ``split_documents``).
    """
    return PyPDFLoader(data_path).load()


def split_documents(documents: list[Document]):
    """Split *documents* into overlapping chunks.

    Uses a ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size=800`` and ``chunk_overlap=80``.

    Args:
        documents: Documents to split.

    Returns:
        The chunks returned by the splitter's ``split_documents``.
    """
    splitter_options = {"chunk_size": 800, "chunk_overlap": 80}
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)


def _find_existing_ids(client, candidate_ids, batch_size=1000):
    """Return the subset of *candidate_ids* already present in ``INDEX_NAME``.

    Only the candidate IDs are looked up (with an ``ids`` query, in batches),
    so the result does not depend on how many documents the index holds and
    is never truncated by the size of a single search page.

    Args:
        client: Elasticsearch client exposing ``search``.
        candidate_ids: List of document IDs to look for.
        batch_size: Maximum number of IDs sent per search request.

    Returns:
        A set with the IDs that were found in the index.
    """
    found = set()
    for start in range(0, len(candidate_ids), batch_size):
        batch = candidate_ids[start:start + batch_size]
        response = client.search(
            index=INDEX_NAME,
            query={"ids": {"values": batch}},
            size=len(batch),  # every requested ID may match
            _source=False,  # only the hit IDs are needed
        )
        hits = response.get("hits", {}).get("hits", [])
        found.update(hit["_id"] for hit in hits)
    return found


def add_to_elasticsearch(chunks: list[Document]):
    """Index the chunks that are not yet stored in Elasticsearch.

    Each chunk gets an ID from ``calculate_chunk_ids``; chunks whose ID is
    already present in ``INDEX_NAME`` are skipped, the rest are embedded and
    added through ``ElasticsearchStore.add_documents``.

    The Elasticsearch user and password can be overridden with the
    ``ELASTICSEARCH_USER`` and ``ELASTICSEARCH_PASSWORD`` environment
    variables; the previous values are kept as fallbacks.

    Args:
        chunks: Document chunks to index.

    Returns:
        ``"success"`` when new chunks were added, ``"nothing_new_added"``
        when every chunk was already indexed (or there were none), or an
        error message string if adding the documents failed.
    """
    # Prepare document IDs
    chunks_with_ids = calculate_chunk_ids(chunks)

    # Connect to Elasticsearch
    # NOTE(security): the fallback credentials below are still committed to
    # source control; prefer setting the environment variables and rotating
    # the password.
    es_store = ElasticsearchStore(
        index_name=INDEX_NAME,
        embedding=get_embedding_function(),
        es_url=ELASTICSEARCH_URL,
        es_user=os.environ.get("ELASTICSEARCH_USER", "elastic"),
        es_password=os.environ.get("ELASTICSEARCH_PASSWORD", "HMkJyNKW"),
        distance_strategy="COSINE",
        strategy=ElasticsearchStore.ApproxRetrievalStrategy(),
        vector_query_field="dense_vector",
    )

    candidate_ids = [chunk.metadata["id"] for chunk in chunks_with_ids]

    # The duplicate check is best-effort: if it fails, every chunk is treated
    # as new and any real connectivity problem resurfaces (and is reported)
    # when adding the documents below.
    existing_ids = set()
    try:
        if es_store.client.indices.exists(index=INDEX_NAME):
            existing_ids = _find_existing_ids(es_store.client, candidate_ids)
        else:
            print("Index may not exist yet. Continuing...")
    except Exception as e:
        print(f"Could not check for existing chunks: {e}. Continuing...")

    # Filter out already existing chunks
    new_chunks = [
        chunk for chunk in chunks_with_ids
        if chunk.metadata["id"] not in existing_ids
    ]
    if not new_chunks:
        return "nothing_new_added"
    new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]

    print(f"👉 Adding {len(new_chunks)} new chunks to Elasticsearch")
    try:
        es_store.add_documents(documents=new_chunks, ids=new_chunk_ids)
    except Exception as e:
        return f"Error adding documents to Elasticsearch: {e}"
    return "success"
    


def calculate_chunk_ids(chunks):
    """Assign an ID of the form ``"<source>:<page>:<index>"`` to every chunk.

    ``source`` and ``page`` are read from each chunk's metadata (``None`` is
    used when a key is missing). ``index`` counts consecutive chunks that
    share the same source/page pair, starting at 0, and restarts whenever the
    pair differs from the previous chunk's.

    Args:
        chunks: Iterable of objects with a mutable ``metadata`` mapping.

    Returns:
        The same *chunks* object, with ``metadata["id"]`` set on each chunk.
    """
    previous_page_key = None
    position = 0

    for item in chunks:
        meta = item.metadata
        page_key = "{}:{}".format(meta.get("source"), meta.get("page"))

        # Same page as the previous chunk -> next index; otherwise restart.
        position = position + 1 if page_key == previous_page_key else 0
        previous_page_key = page_key

        meta["id"] = f"{page_key}:{position}"

    return chunks



# Default PDF ingested when this file is executed directly.
data_path = "docs/OReilly Guide - RAG_in_production_with_Haystack-FINAL.pdf"

# Run the ingestion only when the file is executed as a script, so that
# importing this module (e.g. to call afegirDoc from other code) does not
# contact Elasticsearch as an import-time side effect.
if __name__ == "__main__":
    print(afegirDoc(data_path))
