# create_db.py
# Build a Chroma vector database from the Markdown and text files of the law corpus.
# First, import the required libraries
from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from tqdm import tqdm
import os
from langchain.docstore.document import Document
# Collect the paths of every supported file under a directory.
def get_files(dir_path):
    """Return the paths of all ``.md`` and ``.txt`` files below ``dir_path``.

    Args:
        dir_path: Root folder to search; it is walked recursively with
            ``os.walk``.

    Returns:
        A list of paths, each built as
        ``os.path.join(<walked folder>, <file name>)``, in the order
        ``os.walk`` yields them.
    """
    # Only files whose name ends with one of these suffixes are kept.
    wanted_suffixes = (".md", ".txt")
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(dir_path)
        for name in names
        if name.endswith(wanted_suffixes)
    ]
# Load every supported file under a directory into documents.
def get_text(dir_path):
    """Load all supported files under ``dir_path`` with their matching loader.

    Args:
        dir_path: Root folder; the files to load are found with ``get_files``.

    Returns:
        A list holding everything the loaders' ``load()`` calls returned,
        concatenated in the order the files were listed.
    """
    # Map each file suffix to the loader class that handles it.
    loader_by_suffix = {
        "md": UnstructuredMarkdownLoader,
        "txt": UnstructuredFileLoader,
    }
    loaded = []
    # tqdm wraps the file list so progress is shown while loading.
    for path in tqdm(get_files(dir_path)):
        # The suffix is whatever follows the last "." in the path.
        suffix = path.rpartition(".")[-1]
        loader_cls = loader_by_suffix.get(suffix)
        if loader_cls is None:
            # Not a supported file type: skip it.
            continue
        loaded.extend(loader_cls(path).load())
    return loaded
# Source folders whose files are indexed.
tar_dir1 = ["/root/code/law_rag/data/law"]

# Load every document found under the source folders.
docs = []
for dir_path in tar_dir1:
    docs.extend(get_text(dir_path))

# Split each loaded document on blank lines ("\n\n"); every resulting piece
# becomes its own Document carrying the metadata of the file it came from.
split_docs = []
for doc in docs:
    cur_text = doc.page_content.split("\n\n")
    metadata = doc.metadata
    for line in cur_text:
        # Consecutive or trailing blank lines produce empty / whitespace-only
        # pieces; skip them so no empty Document ends up in the vector store.
        if not line.strip():
            continue
        # Copy the metadata so the pieces do not all alias one dict object.
        split_docs.append(Document(page_content=line, metadata=dict(metadata)))

# Optional second corpus (disabled): its documents would be added unsplit.
# tar_dir2 = ["/root/code/law_rag/data/cases"]
# for dir_path in tar_dir2:
#     split_docs.extend(get_text(dir_path))
# Folder in which the vector database is persisted.
persist_directory = "/root/code/law_rag/data_base/vector_db/law"

# Sentence-embedding model, loaded from a local model folder.
embeddings = HuggingFaceEmbeddings(
    model_name="/root/models/sentence-transformer",
)

# Build the Chroma vector database from the split documents.
vectordb = Chroma.from_documents(
    documents=split_docs,
    embedding=embeddings,
    persist_directory=persist_directory,
)

# Persist the freshly built vector database to disk.
vectordb.persist()