Build your own local app with Generative AI using RAG

Rihab Kallel
May 13, 2024

Introduction

This article will help you build your own local generative AI application using Retrieval-Augmented Generation (RAG) over embedded data, with FAISS (Facebook AI Similarity Search) as the vector store and the open-source model Llama 2-7B (CPU compatible) as the LLM.

Design

Project structure

Input data for embeddings:

  • Place your dataset for the RAG context in the data/ folder (the code below reads plain .txt files).

Application Scripts:

  • document_manager.py: Manages the retrieval of documents for embedding.
  • embedding_manager.py: Handles the embedding of documents and saving them in the FAISS vector database.
  • search_engine.py: Conducts similarity searches within the vector database.
  • settings.py: Contains the models and application settings.
  • text_splitter.py: Splits and prepares documents for embedding.
  • main.py: The main script to run the application.
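
Putting these pieces together, a layout consistent with the paths used later in this article (the app/ folder name comes from the run command in the setup section) looks like this:

app/
  document_manager.py
  embedding_manager.py
  search_engine.py
  settings.py
  text_splitter.py
  main.py
data/
models/
requirements.txt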

Requirements

  • Python: version 3.9 or higher.
  • Environment: Either Venv or Conda for managing Python virtual environments.
  • Dependencies: a Hugging Face account and access token are needed to download the open-source models.
    The LangChain framework and the FAISS library are also required for the vector database and similarity-search functionality.

Setup and Installation

1. Hugging Face Account Setup: create a Hugging Face account, accept the Llama 2 license on the meta-llama/Llama-2-7b-hf model page, and generate an access token. Paste the token into HF_TOKEN in settings.py (shown later in this article).
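
If you want to authenticate your machine once instead of relying only on the token in settings.py, you can log in with the Hugging Face CLI:

huggingface-cli login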

2. Model installation: the Hugging Face weights for Llama 2 (used by the tokenizer) are downloaded in step 4 below. The quantized GGUF file referenced by LLM_PATH in settings.py (models/llama-2-7b.Q4_K_M.gguf) also needs to be placed in the models/ folder; see the sketch just below.
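
One way to fetch that GGUF file is from a community conversion on Hugging Face; the repository name below is an assumption on my part, so adjust it to whichever GGUF build you use:

huggingface-cli download TheBloke/Llama-2-7B-GGUF llama-2-7b.Q4_K_M.gguf --local-dir models/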

3. Virtual Environment:

conda create -n <your-env-name> python=<your-python-version>
conda activate <your-env-name>
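
If you prefer venv over Conda (both are listed in the requirements above), the equivalent on macOS/Linux is:

python -m venv .venv
source .venv/bin/activate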

4. Installation of Dependencies:

  • Install necessary Python packages: pip install -r requirements.txt.
  • Download and install the model: huggingface-cli download meta-llama/Llama-2-7b-hf --local-dir models/.

5. Running the Application

  • Now run the script using python app/main.py (main.py reads your question from the first command-line argument; see the example below).
  • To test embeddings and similarity search functionalities, adjust the configurations in main.py (as you will see below).
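
For example, with an illustrative question passed as the first argument:

python app/main.py "What is the refund policy?"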

Development

Requirements.txt

datasets
sentence-transformers
faiss-cpu
transformers
langchain
langchain_experimental
huggingface-hub
llama-cpp-python
langchain-community

document_manager.py

import os

class DocumentManager:
    def __init__(self, base_dir, allowed_ext):
        self.base_dir = base_dir
        self.allowed_ext = allowed_ext

    def extract_text(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    def get_documents(self):
        text_data = []
        metadata = []
        for root, dirs, files in os.walk(self.base_dir):
            for file in files:
                if file.endswith(self.allowed_ext):
                    file_path = os.path.join(root, file)
                    text_data.append(self.extract_text(file_path))
                    metadata.append({"file-name": file_path.replace(self.base_dir, "")})
        return text_data, metadata

embedding_manager.py

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

class EmbeddingManager:
    def __init__(self, model_path, device, normalize_embeddings):
        self.embeddings = HuggingFaceEmbeddings(
            model_name=model_path,
            model_kwargs={'device': device},
            encode_kwargs={'normalize_embeddings': normalize_embeddings}
        )

    def create_vector_store(self, docs):
        return FAISS.from_documents(docs, self.embeddings)
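
The vector store above is rebuilt in memory on every run. If you want to persist it between runs, LangChain's FAISS wrapper can save and reload the index; here is a minimal sketch reusing the names from this article (the folder name is illustrative, and recent langchain-community versions require the allow_dangerous_deserialization flag when loading):

# Build the index once, then persist it to disk
db = embedding_manager.create_vector_store(docs)
db.save_local("faiss_index")

# On a later run, reload it instead of re-embedding the documents
from langchain_community.vectorstores import FAISS
db = FAISS.load_local("faiss_index", embedding_manager.embeddings, allow_dangerous_deserialization=True)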

text_splitter.py

from langchain.text_splitter import RecursiveCharacterTextSplitter

class TextSplitter:
    def __init__(self, chunk_size, chunk_overlap):
        self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    def split(self, text_data, metadata):
        docs = self.splitter.create_documents(text_data, metadata)
        return self.splitter.split_documents(docs)
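
A quick, standalone usage sketch of this class (the input strings and metadata are illustrative); split() returns LangChain Document objects, one per chunk, each carrying the metadata of the file it came from:

from text_splitter import TextSplitter

splitter = TextSplitter(chunk_size=1000, chunk_overlap=150)
chunks = splitter.split(
    ["First document text ...", "Second document text ..."],
    [{"file-name": "doc1.txt"}, {"file-name": "doc2.txt"}],
)
print(len(chunks), chunks[0].metadata)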

settings.py

DATA_BASE_DIR = "data"
DATA_ALLOWED_EXTENSIONS = ".txt"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 150
EMBEDDING_MODEL_PATH = "sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_MODEL_DEVICE = "cpu"
TOKENIZATION_MODEL_PATH = "meta-llama/Llama-2-7b-hf"
TOKENIZATION_MAX_LENGTH = 512
LLM_PATH = "models/llama-2-7b.Q4_K_M.gguf"
LLM_TEMPERATURE = 0.75
LLM_MAX_TOKENS = 4000
LLM_TOP_P = 1
LLM_VERBOSE = True
HF_TOKEN = "<your_hugging_face_token_here>"

search_engine.py

class SearchEngine:
    def __init__(self, db):
        self.db = db

    def search(self, query):
        return self.db.similarity_search(query)

main.py

from document_manager import DocumentManager
from text_splitter import TextSplitter
from embedding_manager import EmbeddingManager
from search_engine import SearchEngine
import settings as settings
from transformers import AutoTokenizer
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp
import sys

## USER QUERY
question = sys.argv[1]

def run_without_rag(llm, question):
    # Query the LLM directly, without any retrieved context
    return llm(question)

def run_with_rag(llm, db, template):
    # Create a retriever from the embeddings db.
    # It retrieves up to 2 relevant documents.
    retriever = db.as_retriever(search_kwargs={"k": 2})
    qa_chain_prompt = PromptTemplate.from_template(template)

    # Create a question-answering instance (qa) using RetrievalQA from the LangChain framework.
    # The chain takes the retrieved documents, inserts them all into a prompt, and passes that prompt to the LLM.
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=False,
        verbose=True,
        chain_type_kwargs={"prompt": qa_chain_prompt, "verbose": True},
    )
    # Ask the question and get a response
    return qa({"query": question})


def run_similarity_search(db):
    search_engine = SearchEngine(db)
    search_results = search_engine.search(question)
    print(search_results[0].page_content)


if __name__ == "__main__":
    doc_manager = DocumentManager(settings.DATA_BASE_DIR, settings.DATA_ALLOWED_EXTENSIONS)
    text_splitter = TextSplitter(settings.CHUNK_SIZE, settings.CHUNK_OVERLAP)
    embedding_manager = EmbeddingManager(settings.EMBEDDING_MODEL_PATH, settings.EMBEDDING_MODEL_DEVICE, False)
    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

    llm = LlamaCpp(
        model_path=settings.LLM_PATH,
        temperature=settings.LLM_TEMPERATURE,
        max_tokens=settings.LLM_MAX_TOKENS,
        top_p=settings.LLM_TOP_P,
        callback_manager=callback_manager,
        verbose=True,  # Verbose is required to pass to the callback manager
    )

    text_data, metadata = doc_manager.get_documents()
    docs = text_splitter.split(text_data, metadata)
    db = embedding_manager.create_vector_store(docs)

    ##### Checking if the search engine works #####
    # run_similarity_search(db)

    # Load the pre-trained tokenizer related to the LLM chosen and loaded above.
    # This tokenizer is used to convert inputs to the format the LLM expects.
    # It is used implicitly by the RetrievalQA chain.
    tokenizer = AutoTokenizer.from_pretrained(
        settings.TOKENIZATION_MODEL_PATH,
        padding=True,
        truncation=True,
        max_length=settings.TOKENIZATION_MAX_LENGTH,
        token=settings.HF_TOKEN)

    ###### Baseline answer from LLM without retrieval (RAG) ######
    # print(run_without_rag(llm, question))

    # Prepare the RAG prompt
    template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible.
Context: {context}
Question: {question}
Helpful Answer:"""

    ######## Answer from LLM with retrieval (RAG) #######
    print(run_with_rag(llm, db, template))

Additional Information

  • Vector Database Compatibility: FAISS is optimized for CPU usage; however, it does not support cloud deployments directly. Check this article for other cloud-compatible vector database choices.
  • Model Flexibility: The system supports integration with other models, such as OpenAI’s GPT models or any other LLM supported by the LangChain framework; a sketch follows below.
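
As an illustration of that flexibility, here is a minimal sketch of swapping the LlamaCpp instance in main.py for an OpenAI chat model via the langchain-openai package (the model name is an example; this assumes pip install langchain-openai and an OPENAI_API_KEY environment variable):

from langchain_openai import ChatOpenAI

# Drop-in replacement for the LlamaCpp instance in main.py
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
# Then call run_with_rag(llm, db, template) exactly as before.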
