Commit 8043de40 authored by Hermes

Added API code
SECRET_KEY=27e15496b7e4e2134612e531f6e264043ff627cc3545a29c51e13e735482ab15
JWT_SECRET_KEY=9a0cdbc0d344ac8147ab47d50c84d9e045c13f0a1156b4ce8a711e88c1effb91
SQLALCHEMY_DATABASE_URI=sqlite:///rag.db
JWT_ACCESS_TOKEN_EXPIRES_MINUTES=15
JWT_REFRESH_TOKEN_EXPIRES_DAYS=30
ELASTICSEARCH_URL=http://localhost:9200
ELASTICSEARCH_INDEX_NAME=rag_index
ELASTICSEARCH_USER=elastic
ELASTICSEARCH_PASSWORD=HMkJyNKW
ELASTICSEARCH_QUERRY_FIELD=dense_vector
ELASTICSEARCH_DISTANCE_STRATEGY=COSINE
CHAT_MODEL=qwen2.5:32b
OLLAMA_URL=http://localhost:11434
from flask import Flask, request, jsonify
from flask_jwt_extended import JWTManager, create_access_token, create_refresh_token, get_jwt_identity, jwt_required, get_jwt
from flask_sqlalchemy import SQLAlchemy
import bcrypt
from datetime import timedelta
from dotenv import load_dotenv
import os
from crida_api_rag import chat
from introduccio_docs import afegirDoc
app = Flask(__name__)
load_dotenv()
app.config['SECRET_KEY'] = os.getenv('SECRET_KEY')
app.config["JWT_SECRET_KEY"] = os.getenv('JWT_SECRET_KEY')
app.config['JWT_TOKEN_LOCATION'] = ['headers']
app.config['SQLALCHEMY_DATABASE_URI'] = os.getenv('SQLALCHEMY_DATABASE_URI')
app.config['JWT_ACCESS_TOKEN_EXPIRES'] = timedelta(minutes=int(os.getenv('JWT_ACCESS_TOKEN_EXPIRES_MINUTES')))
app.config['JWT_REFRESH_TOKEN_EXPIRES'] = timedelta(days=int(os.getenv('JWT_REFRESH_TOKEN_EXPIRES_DAYS')))
jwt = JWTManager(app)
db = SQLAlchemy(app)
class User(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    username = db.Column(db.String(20), unique=True, nullable=False)
    password = db.Column(db.String(80), nullable=False)
    is_active = db.Column(db.Boolean(), default=True)

    def __repr__(self):
        return f'<User {self.username}>'

class RevokedToken(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    jti = db.Column(db.String(120), unique=True, nullable=False)

    def __repr__(self):
        return f'<RevokedToken {self.jti}>'

@app.route('/register', methods=['POST'])
def register():
    username = request.json.get('username')
    password = request.json.get('password')
    if not username or not password:
        return jsonify({"msg": "Missing username or password"}), 400
    if User.query.filter_by(username=username).first():
        return jsonify({"msg": "User already exists"}), 400
    hashed_pw = bcrypt.hashpw(password.encode('utf-8'), bcrypt.gensalt())
    user = User(username=username, password=hashed_pw.decode('utf-8'))
    db.session.add(user)
    db.session.commit()
    return jsonify({"msg": "User registered successfully"}), 201

@app.route('/login', methods=['POST'])
def login():
    data = request.get_json()
    username = data['username']
    password = data['password']
    user = User.query.filter_by(username=username).first()
    if user and bcrypt.checkpw(password.encode('utf-8'), user.password.encode('utf-8')):
        access_token = create_access_token(identity=user.id)
        refresh_token = create_refresh_token(identity=user.id)
        return jsonify({'message': 'Login Success', 'access_token': access_token, 'refresh_token': refresh_token}), 200
    else:
        return jsonify({'message': 'Login Failed'}), 401

@app.route('/refresh', methods=['POST'])
@jwt_required(refresh=True)
def refresh():
    current_user_id = get_jwt_identity()
    new_access_token = create_access_token(identity=current_user_id)
    return jsonify({'access_token': new_access_token}), 200
@jwt.token_in_blocklist_loader
def check_if_token_revoked(jwt_header, jwt_payload):
    # The blocklist loader must return a boolean; flask-jwt-extended rejects revoked tokens itself.
    jti = jwt_payload["jti"]
    return RevokedToken.query.filter_by(jti=jti).first() is not None
@app.route('/logout', methods=['DELETE'])
@jwt_required()
def logout():
    jti = get_jwt()['jti']
    revoked_token = RevokedToken(jti=jti)
    db.session.add(revoked_token)
    db.session.commit()
    return jsonify({"msg": "Successfully logged out"}), 200
@app.route('/chat', methods=['POST'])
@jwt_required()
def chat_endpoint():
    current_user = get_jwt_identity()
    if not User.query.get(current_user):
        return jsonify({"msg": "User not found"}), 404
    prompt = request.json.get('prompt')
    if not prompt:
        return jsonify({"msg": "Missing prompt"}), 400
    response = chat(prompt)
    # chat() returns the full graph state; the Document objects in "context" are not
    # JSON-serializable, so only the generated answer is returned to the client.
    return jsonify({"answer": response["answer"]}), 200
@app.route('/add_documents', methods=['POST'])
@jwt_required()
def add_documents():
    current_user = get_jwt_identity()
    if not User.query.get(current_user):
        return jsonify({"msg": "User not found"}), 404
    data_path = request.json.get('data_path')
    if not data_path:
        return jsonify({"msg": "Missing data_path"}), 400
    result = afegirDoc(data_path)
    if result == "success":
        return jsonify({"msg": "Documents added successfully"}), 200
    elif result == "nothing_new_added":
        return jsonify({"msg": "No new documents to add"}), 200
    else:
        return jsonify({"msg": result}), 400

if __name__ == '__main__':
    with app.app_context():
        db.create_all()
    app.run(debug=True)
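For a quick end-to-end check of the API above, the flow below registers a user, logs in, calls the protected endpoints and logs out. This is a minimal sketch using requests; the base URL assumes Flask's default development port, and the username, password and prompt are made-up values.

import requests

BASE = "http://localhost:5000"  # assumed Flask development default

# Register a user (returns 400 if the username already exists).
requests.post(f"{BASE}/register", json={"username": "alice", "password": "s3cret"})

# Log in and keep both tokens.
tokens = requests.post(f"{BASE}/login", json={"username": "alice", "password": "s3cret"}).json()
auth = {"Authorization": f"Bearer {tokens['access_token']}"}

# Ask the RAG pipeline a question.
reply = requests.post(f"{BASE}/chat", json={"prompt": "What is covered in the indexed guide?"}, headers=auth)
print(reply.json())

# Get a fresh access token using the refresh token.
refresh_auth = {"Authorization": f"Bearer {tokens['refresh_token']}"}
new_access = requests.post(f"{BASE}/refresh", headers=refresh_auth).json()["access_token"]

# Log out, which revokes the token presented here.
requests.delete(f"{BASE}/logout", headers={"Authorization": f"Bearer {new_access}"})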
PUT langchain_index
{
  "mappings": {
    "properties": {
      "vector": {
        "type": "dense_vector",
        "dims": 768,
        "index": true,
        "similarity": "cosine",
        "index_options": {
          "type": "int8_hnsw",
          "m": 16,
          "ef_construction": 100
        }
      },
      "content": {
        "type": "text"
      }
    }
  }
}
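The same mapping can also be created from Python with the official Elasticsearch client. A minimal sketch, assuming an Elasticsearch 8.x server and the credentials from the .env file; it targets the rag_index name and dense_vector field that the application code queries (the request above uses langchain_index and vector instead), and the dims value must match the embedding model's output size.

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200", basic_auth=("elastic", "HMkJyNKW"))

# Dense vector field for embeddings plus a text field for the chunk content.
es.indices.create(
    index="rag_index",  # index name expected by the application code
    mappings={
        "properties": {
            "dense_vector": {  # vector_query_field used by ElasticsearchStore
                "type": "dense_vector",
                "dims": 768,  # must equal the embedding dimension
                "index": True,
                "similarity": "cosine",
                "index_options": {"type": "int8_hnsw", "m": 16, "ef_construction": 100},
            },
            "content": {"type": "text"},
        }
    },
)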
import sys
import requests
import json
prompt = sys.argv[1]
ollama_url = "http://127.0.0.1:11434/api/generate"
headers = {"Content-Type": "application/json"}
payload = {
    "model": "qwen2.5:3b",
    "prompt": prompt,
    "stream": False
}

try:
    response = requests.post(ollama_url, headers=headers, json=payload)
    response.raise_for_status()
    print(response.json()["response"])
except Exception as e:
    print(e)
from langchain_core.prompts import ChatPromptTemplate
import os
from langchain_ollama import ChatOllama
from langchain_elasticsearch import ElasticsearchStore
from langchain_core.documents import Document
from typing_extensions import List, TypedDict
from langgraph.graph import START, StateGraph
from get_embedding_function import get_embedding_function
from dotenv import load_dotenv
load_dotenv()
ollama_url = os.getenv('OLLAMA_URL')
model = os.getenv('CHAT_MODEL')
elasticsearch_url = os.getenv('ELASTICSEARCH_URL')
index_name = os.getenv('ELASTICSEARCH_INDEX_NAME')
elasticsearch_user = os.getenv('ELASTICSEARCH_USER')
elasticsearch_password = os.getenv('ELASTICSEARCH_PASSWORD')
elasticsearch_strategy = os.getenv('ELASTICSEARCH_DISTANCE_STRATEGY')
elasticsearch_querry_field = os.getenv('ELASTICSEARCH_QUERRY_FIELD')
# Create embeddings and vector store
embeddings = get_embedding_function()
elastic_vector_search = ElasticsearchStore(
es_url=elasticsearch_url,
index_name=index_name,
embedding=embeddings,
es_user=elasticsearch_user,
es_password=elasticsearch_password,
distance_strategy=elasticsearch_strategy,
strategy=ElasticsearchStore.ApproxRetrievalStrategy(),
vector_query_field=elasticsearch_querry_field
)
# Initialize LLM
llm = ChatOllama(model=model, temperature=0.7, base_url=ollama_url)
# State definition
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

# Steps
def retrieve(state: State):
    retrieved_docs = elastic_vector_search.similarity_search(state["question"])
    return {"context": retrieved_docs}

# Use ChatPromptTemplate inside this step
def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    prompt_template = ChatPromptTemplate.from_messages([
        ("system", "Answer the question based on the context."),
        ("human", "Question: {question}\n\nContext:\n{context}")
    ])
    prompt_value = prompt_template.invoke({
        "question": state["question"],
        "context": docs_content
    })
    response = llm.invoke(prompt_value)
    return {"answer": response.content}

def chat(prompt: str) -> dict:
    # Build graph
    graph_builder = StateGraph(State).add_sequence([retrieve, generate])
    graph_builder.add_edge(START, "retrieve")
    graph = graph_builder.compile()
    # Invoke graph with dictionary input
    response = graph.invoke({"question": prompt})
    return response
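For a quick check outside Flask, chat() can also be called directly. A minimal sketch, assuming Ollama and Elasticsearch are running and the index already contains documents; the question is a placeholder.

if __name__ == "__main__":
    # The returned state holds "question", "context" (the retrieved Documents) and "answer".
    result = chat("What does the indexed documentation say about retrieval?")
    print(result["answer"])
    for doc in result["context"]:
        print("-", doc.metadata.get("id"))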
from langchain_core.prompts import ChatPromptTemplate
import sys
from langchain_ollama import ChatOllama
from langchain_elasticsearch import ElasticsearchStore
from langchain_core.documents import Document
from typing_extensions import List, TypedDict
from langgraph.graph import START, StateGraph
from get_embedding_function import get_embedding_function
import json
base_url = "http://localhost:11434"
model = "qwen2.5:32b"
# Create embeddings and vector store
embeddings = get_embedding_function()
elastic_vector_search = ElasticsearchStore(
es_url="http://localhost:9200",
index_name="rag_index",
embedding=embeddings,
es_user="elastic",
es_password="HMkJyNKW",
distance_strategy="COSINE",
strategy=ElasticsearchStore.ApproxRetrievalStrategy(),
vector_query_field="dense_vector"
)
# Initialize LLM
llm = ChatOllama(model=model, temperature=0.7, base_url=base_url)
# State definition
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

# Steps
def retrieve(state: State):
    retrieved_docs = elastic_vector_search.similarity_search(state["question"])
    return {"context": retrieved_docs}

# Use ChatPromptTemplate inside this step
def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    prompt_template = ChatPromptTemplate.from_messages([
        ("system", "Answer the question based on the context."),
        ("human", "Question: {question}\n\nContext:\n{context}")
    ])
    prompt_value = prompt_template.invoke({
        "question": state["question"],
        "context": docs_content
    })
    response = llm.invoke(prompt_value)
    return {"answer": response.content}
prompt = sys.argv[1]
# Build graph
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()
# Invoke graph with dictionary input
response = graph.invoke({"question": prompt})
print(response)
from langchain_ollama import OllamaEmbeddings
model_embed = "bge-m3:567m"
def get_embedding_function():
    embeddings = OllamaEmbeddings(
        model=model_embed,
    )
    # embeddings = OllamaEmbeddings(model="nomic-embed-text")
    return embeddings
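Because the dense_vector mapping fixes a dims value, it is worth confirming that the embedding model actually returns vectors of that length. A small sketch, assuming the Ollama server is running and the bge-m3:567m model has been pulled.

if __name__ == "__main__":
    embeddings = get_embedding_function()
    vec = embeddings.embed_query("dimension check")
    # Compare this against the "dims" declared in the Elasticsearch index mapping.
    print(f"{model_embed} returns vectors of length {len(vec)}")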
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from get_embedding_function import get_embedding_function
from langchain_elasticsearch import ElasticsearchStore
ELASTICSEARCH_URL = "http://localhost:9200"
INDEX_NAME = "rag_index"
def afegirDoc(data_path: str):
    # Create (or update) the Elasticsearch index.
    documents = load_documents(data_path)
    chunks = split_documents(documents)
    return add_to_elasticsearch(chunks)

def load_documents(data_path: str):
    document_loader = PyPDFLoader(data_path)
    return document_loader.load()

def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
    )
    return text_splitter.split_documents(documents)

def add_to_elasticsearch(chunks: list[Document]):
    # Prepare document IDs
    chunks_with_ids = calculate_chunk_ids(chunks)
    # Connect to Elasticsearch
    embedding_function = get_embedding_function()
    es_store = ElasticsearchStore(
        index_name=INDEX_NAME,
        embedding=embedding_function,
        es_url=ELASTICSEARCH_URL,
        es_user="elastic",
        es_password="HMkJyNKW",
        distance_strategy="COSINE",
        strategy=ElasticsearchStore.ApproxRetrievalStrategy(),
        vector_query_field="dense_vector",
    )
    existing_ids = set()
    try:
        results = es_store.client.search(index=INDEX_NAME, size=10000, _source=False)
        hits = results.get("hits", {}).get("hits", [])
        existing_ids = {hit["_id"] for hit in hits}
    except Exception as e:
        print("Index may not exist yet. Continuing...")
    # Filter out already existing chunks
    new_chunks = []
    new_chunk_ids = []
    for chunk in chunks_with_ids:
        chunk_id = chunk.metadata["id"]
        if chunk_id not in existing_ids:
            new_chunks.append(chunk)
            new_chunk_ids.append(chunk_id)
    if new_chunks:
        print(f"👉 Adding {len(new_chunks)} new chunks to Elasticsearch")
        try:
            es_store.add_documents(documents=new_chunks, ids=new_chunk_ids)
            return "success"
        except Exception as e:
            return f"Error adding documents to Elasticsearch: {e}"
    else:
        return "nothing_new_added"

def calculate_chunk_ids(chunks):
    last_page_id = None
    current_chunk_index = 0
    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id
        chunk.metadata["id"] = chunk_id
    return chunks
data_path = "docs/OReilly Guide - RAG_in_production_with_Haystack-FINAL.pdf"
print(afegirDoc(data_path))
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from get_embedding_function import get_embedding_function
from langchain_elasticsearch import ElasticsearchStore
import os
from dotenv import load_dotenv
load_dotenv()
elasticsearch_url = os.getenv('ELASTICSEARCH_URL')
index_name = os.getenv('ELASTICSEARCH_INDEX_NAME')
elasticsearch_user = os.getenv('ELASTICSEARCH_USER')
elasticsearch_password = os.getenv('ELASTICSEARCH_PASSWORD')
elasticsearch_strategy = os.getenv('ELASTICSEARCH_DISTANCE_STRATEGY')
elasticsearch_querry_field = os.getenv('ELASTICSEARCH_QUERRY_FIELD')
def afegirDoc(data_path: str):
    # Create (or update) the Elasticsearch index.
    documents = load_documents(data_path)
    chunks = split_documents(documents)
    return add_to_elasticsearch(chunks)

def load_documents(data_path: str):
    document_loader = PyPDFDirectoryLoader(data_path, glob="**/*.pdf", recursive=True)
    return document_loader.load()

def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

def add_to_elasticsearch(chunks: list[Document]):
    # Prepare document IDs
    chunks_with_ids = calculate_chunk_ids(chunks)
    # Connect to Elasticsearch
    embedding_function = get_embedding_function()
    elastic_vector_search = ElasticsearchStore(
        es_url=elasticsearch_url,
        index_name=index_name,
        embedding=embedding_function,
        es_user=elasticsearch_user,
        es_password=elasticsearch_password,
        distance_strategy=elasticsearch_strategy,
        strategy=ElasticsearchStore.ApproxRetrievalStrategy(),
        vector_query_field=elasticsearch_querry_field
    )
    existing_ids = set()
    try:
        results = elastic_vector_search.client.search(index=index_name, size=10000, _source=False)
        hits = results.get("hits", {}).get("hits", [])
        existing_ids = {hit["_id"] for hit in hits}
    except Exception as e:
        print("Index may not exist yet. Continuing...")
    # Filter out already existing chunks
    new_chunks = []
    new_chunk_ids = []
    for chunk in chunks_with_ids:
        chunk_id = chunk.metadata["id"]
        if chunk_id not in existing_ids:
            new_chunks.append(chunk)
            new_chunk_ids.append(chunk_id)
    if new_chunks:
        print(f"👉 Adding {len(new_chunks)} new chunks to Elasticsearch")
        try:
            elastic_vector_search.add_documents(documents=new_chunks, ids=new_chunk_ids)
            return "success"
        except Exception as e:
            return f"Error adding documents to Elasticsearch: {e}"
    else:
        return "nothing_new_added"

def calculate_chunk_ids(chunks):
    last_page_id = None
    current_chunk_index = 0
    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id
        chunk.metadata["id"] = chunk_id
    return chunks
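To make the ID scheme concrete: each chunk is keyed as source:page:position, so consecutive chunks from the same page get increasing suffixes and the counter resets on a new page. A small illustration with hypothetical documents:

from langchain.schema.document import Document

# Two chunks from page 3 of the same PDF, then one from page 4 (made-up data).
sample = [
    Document(page_content="...", metadata={"source": "docs/guide.pdf", "page": 3}),
    Document(page_content="...", metadata={"source": "docs/guide.pdf", "page": 3}),
    Document(page_content="...", metadata={"source": "docs/guide.pdf", "page": 4}),
]
for chunk in calculate_chunk_ids(sample):
    print(chunk.metadata["id"])
# docs/guide.pdf:3:0
# docs/guide.pdf:3:1
# docs/guide.pdf:4:0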