{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "d14ad3f1", "metadata": {}, "outputs": [], "source": [ "## 准备部分\n", "# 指定Embed对象\n", "embed_object = \"labors-law\"" ] }, { "cell_type": "code", "execution_count": 2, "id": "29688979-89c0-47c7-84ab-2b4b182d2bd7", "metadata": {}, "outputs": [], "source": [ "# 导入必要的包\n", "from langchain_community.document_loaders import PyPDFLoader\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "import chromadb\n", "import time\n", "from tqdm.notebook import tqdm\n", "from langchain.docstore.document import Document\n", "import os\n", "import sys\n", "import json\n", "import requests\n", "from chromadb import Client\n", "\n", "# 获取当前脚本的目录\n", "current_dir = os.getcwd()\n", "\n", "# 读取设定文件\n", "config_file_path = os.path.join(current_dir, \"..\", \"setup\", \"config_embed.json\")\n", "\n", "# 读取配置项目\n", "try:\n", " with open(config_file_path, \"r\", encoding=\"utf-8\") as f:\n", " dict_config = json.load(f)\n", " # 读取通用项目\n", " API_URL = dict_config[\"API_URL\"]\n", " cohere_access_token = dict_config[\"cohere_access_token\"]\n", " custom_proxies = dict_config[\"custom_proxies\"]\n", " list_chroma_dir = dict_config[\"list_chroma_dir\"]\n", " list_embed_file_path = dict_config[\"list_embed_file_path\"]\n", " # 读取Embed对象项目\n", " model_name = dict_config[\"docs\"][embed_object][\"model_name\"]\n", " embed_file_name = dict_config[\"docs\"][embed_object][\"file_name\"]\n", " split_chunk_size = dict_config[\"docs\"][embed_object][\"split_chunk_size\"]\n", " split_overlap = dict_config[\"docs\"][embed_object][\"split_overlap\"]\n", " model_batch_size = dict_config[\"docs\"][embed_object][\"model_batch_size\"]\n", " collection_ids_prefix = dict_config[\"docs\"][embed_object][\"collection_ids_prefix\"]\n", " embed_collection_name = dict_config[\"docs\"][embed_object][\"collection_name\"]\n", "except Exception as e:\n", " print(\"配置文件读取失败\", e)\n", " sys.exit(\"Stop processing\")\n", "\n", "# 
# Build the storage and input paths from the configured path segments,
# relative to the working directory.
chroma_dir = os.path.join(current_dir, *list_chroma_dir)
embed_file_path = os.path.join(current_dir, *list_embed_file_path, embed_file_name)

# %%
def load_and_split_pdf(file_path, chunk_size=500, chunk_overlap=100):
    """Load a PDF, flatten its text and cut it into overlapping chunks.

    Args:
        file_path: Path of the PDF file handed to PyPDFLoader.
        chunk_size: Passed to RecursiveCharacterTextSplitter as ``chunk_size``.
        chunk_overlap: Passed to RecursiveCharacterTextSplitter as ``chunk_overlap``.

    Returns:
        A list of Document objects, one per chunk, each carrying only
        ``page_content``.
    """
    import re

    pages = PyPDFLoader(file_path).load()

    # Merge the text of all pages into one string, separated by a single space.
    merged_text = " ".join(page.page_content for page in pages)

    # Disabled step: strip page-number markers that sit between pages, e.g. "\n—6—\n".
    # merged_text = re.sub(r"\n—\d+—\n", "\n", merged_text)

    # Drop every line break together with the whitespace around it
    # (the match is replaced by an empty string, not by a space).
    merged_text = re.sub(r"\s*\n\s*", "", merged_text)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # Wrap each text chunk in a Document so downstream code handles one type.
    chunk_documents = []
    for piece in splitter.split_text(merged_text):
        chunk_documents.append(Document(page_content=piece))
    return chunk_documents

# %%
docs = load_and_split_pdf(
    embed_file_path,
    chunk_size=split_chunk_size,
    chunk_overlap=split_overlap,
)

# %%
# Initialise the Chroma persistent client on the configured directory.
client = chromadb.PersistentClient(path=chroma_dir)

# Rebuild the collection: try to delete it first; a failed delete is only printed
# and the collection is then fetched or created.
try:
    client.delete_collection(embed_collection_name)
except Exception as e:
    print(e)

collection = client.get_or_create_collection(name=embed_collection_name)

# %%
from IPython.display import clear_output
"outputs": [], "source": [ "# 装载本地的Hugging Face模型\n", "from sentence_transformers import SentenceTransformer\n", "\n", "model = SentenceTransformer(\"jinaai/jina-embeddings-v3\", trust_remote_code=True)" ] }, { "cell_type": "code", "execution_count": 8, "id": "0b23e16a-c069-4be2-8109-166f5cbb722c", "metadata": {}, "outputs": [], "source": [ "request_docs = []\n", "for idx, doc in enumerate(docs):\n", " # 使用 page_content 提取文档内容文本\n", " text = doc.page_content\n", " request_docs.append(text)\n", "\n", "embedded_docs = []\n", "for i in range(0, len(request_docs), model_batch_size):\n", " # embeddings_response = get_embeddings(request_docs[i:i+model_batch_size])\n", " # embedded_docs.extend(embeddings_response['embeddings'])\n", " embedding = model.encode(request_docs[i:i+model_batch_size])\n", " embedded_docs.extend(embedding)" ] }, { "cell_type": "code", "execution_count": 9, "id": "fb491aa1-4fdc-423f-ba76-861b68959777", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e60b40104823466e8e4df4df70b6fe39", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Processing documents: 0%| | 0/63 [00:00