import fitz
import os
import numpy as np
import json
from openai import OpenAI

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Extracted text from the PDF.
    """
    mypdf = fitz.open(pdf_path)  # Open the PDF file
    all_text = ""  # Initialize an empty string to store the extracted text

    # Iterate through each page in the PDF
    for page_num in range(mypdf.page_count):
        page = mypdf[page_num]  # Get the page
        text = page.get_text("text")  # Extract text from the page
        all_text += text  # Append the extracted text to the all_text string

    return all_text  # Return the extracted text

def chunk_text(text, n, overlap):
    """
    Chunks the given text into segments of n characters with overlap.

    Args:
        text (str): The text to be chunked.
        n (int): The number of characters in each chunk.
        overlap (int): The number of overlapping characters between chunks.

    Returns:
        List[str]: A list of text chunks.
    """
    chunks = []  # Initialize an empty list to store the chunks

    # Loop through the text with a step size of (n - overlap)
    for i in range(0, len(text), n - overlap):
        # Append a chunk of text from index i to i + n to the chunks list
        chunks.append(text[i:i + n])

    return chunks  # Return the list of text chunks
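
# Illustrative check of the chunking behavior (example values assumed, not from
# the original): with a step of n - overlap, consecutive chunks share `overlap`
# characters, and the step must be positive (overlap < n) or the loop never advances.
assert chunk_text("abcdefghij", n=4, overlap=2) == ["abcd", "cdef", "efgh", "ghij", "ij"]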

# Initialize the OpenAI client with the base URL and API key
client = OpenAI(
    base_url="https://api.studio.nebius.com/v1/",
    api_key=os.getenv("OPENAI_API_KEY")  # Retrieve the API key from environment variables
)

def create_embeddings(text, model="BAAI/bge-en-icl"):
    """
    Creates embeddings for the given text using the specified model.

    Args:
        text (str or List[str]): The input text(s) for which embeddings are to be created.
        model (str): The model to be used for creating embeddings. Default is "BAAI/bge-en-icl".

    Returns:
        The API response object containing the embeddings (accessible via .data).
    """
    # Create embeddings for the input text using the specified model
    response = client.embeddings.create(
        model=model,
        input=text
    )

    return response  # Return the response containing the embeddings
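
# Usage sketch (assumes a valid API key for the endpoint configured above): the
# response follows the OpenAI embeddings schema, so the vector for a single
# input is at response.data[0].embedding, and for a list input, data[i].embedding
# corresponds to the i-th string, e.g.:
#
#   vector = create_embeddings("hello world").data[0].embedding  # List[float]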

def cosine_similarity(vec1, vec2):
    """
    Calculates the cosine similarity between two vectors.

    Args:
        vec1 (np.ndarray): The first vector.
        vec2 (np.ndarray): The second vector.

    Returns:
        float: The cosine similarity between the two vectors.
    """
    # Compute the dot product of the two vectors and divide by the product of their norms
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
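
# Worked example: vectors pointing the same way score 1.0, orthogonal vectors 0.0.
assert np.isclose(cosine_similarity(np.array([1.0, 0.0]), np.array([2.0, 0.0])), 1.0)
assert np.isclose(cosine_similarity(np.array([1.0, 0.0]), np.array([0.0, 1.0])), 0.0)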

def semantic_search(query, text_chunks, embeddings, k=5):
    """
    Performs semantic search on the text chunks using the given query and embeddings.

    Args:
        query (str): The query for the semantic search.
        text_chunks (List[str]): A list of text chunks to search through.
        embeddings (List): A list of embedding objects for the text chunks (each with an .embedding attribute).
        k (int): The number of top relevant text chunks to return. Default is 5.

    Returns:
        List[str]: A list of the top k most relevant text chunks based on the query.
    """
    # Create an embedding for the query
    query_embedding = create_embeddings(query).data[0].embedding
    similarity_scores = []  # Initialize a list to store similarity scores

    # Calculate similarity scores between the query embedding and each text chunk embedding
    for i, chunk_embedding in enumerate(embeddings):
        similarity_score = cosine_similarity(np.array(query_embedding), np.array(chunk_embedding.embedding))
        similarity_scores.append((i, similarity_score))  # Append the index and similarity score

    # Sort the similarity scores in descending order
    similarity_scores.sort(key=lambda x: x[1], reverse=True)

    # Get the indices of the top k most similar text chunks
    top_indices = [index for index, _ in similarity_scores[:k]]

    # Return the top k most relevant text chunks
    return [text_chunks[index] for index in top_indices]
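
# Note: this is a brute-force scan that scores every chunk against the query,
# which is fine at tutorial scale; larger corpora would typically use a vector
# index instead (an observation, not part of the original).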

def generate_response(system_prompt, user_message, model="meta-llama/Llama-3.2-3B-Instruct"):
    """
    Generates a response from the AI model based on the system prompt and user message.

    Args:
        system_prompt (str): The system prompt to guide the AI's behavior.
        user_message (str): The user's message or query.
        model (str): The model to be used for generating the response. Default is "meta-llama/Llama-3.2-3B-Instruct".

    Returns:
        The chat completion response object from the model.
    """
    response = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]
    )

    return response

# Complete end-to-end pipeline
def simple_rag_pipeline(pdf_path, query):
    # 1. Extract text from the PDF
    extracted_text = extract_text_from_pdf(pdf_path)

    # 2. Split the text into overlapping chunks
    text_chunks = chunk_text(extracted_text, 1000, 200)

    # 3. Create embeddings for the chunks
    response = create_embeddings(text_chunks)

    # 4. Semantic search for the most relevant chunks
    top_chunks = semantic_search(query, text_chunks, response.data, k=2)

    # 5. Generate the answer from the retrieved context
    system_prompt = "You are an AI assistant that strictly answers based on the given context. If the answer cannot be derived directly from the provided context, respond with: 'I do not have enough information to answer that.'"
    user_prompt = "\n".join([f"Context {i + 1}:\n{chunk}\n=====================================\n" for i, chunk in enumerate(top_chunks)])
    user_prompt = f"{user_prompt}\nQuestion: {query}"
    ai_response = generate_response(system_prompt, user_prompt)

    return ai_response.choices[0].message.content
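
# Usage sketch: the PDF path and question below are placeholders (not from the
# original); running this requires the OPENAI_API_KEY environment variable to be
# set for the Nebius endpoint configured above.
if __name__ == "__main__":
    answer = simple_rag_pipeline("data/example.pdf", "What is the main topic of this document?")
    print(answer)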