{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a3f1c2d4",
   "metadata": {},
   "source": [
    "# Embed a PDF into a Chroma collection\n",
    "\n",
    "This notebook loads the PDF configured for `embed_object`, splits its text into overlapping chunks, embeds the chunks with a local SentenceTransformer model, stores them in a persistent Chroma collection, and finally plots a 2-D UMAP projection of the embeddings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d14ad3f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Setup\n",
    "# Key under \"docs\" in config_embed.json that selects the document to embed\n",
    "embed_object = \"labors-law\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29688979-89c0-47c7-84ab-2b4b182d2bd7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports: standard library first, then third-party\n",
    "import gc\n",
    "import json\n",
    "import os\n",
    "import re\n",
    "import sys\n",
    "import time\n",
    "\n",
    "import chromadb\n",
    "import matplotlib.pyplot as plt\n",
    "import mplcursors\n",
    "import numpy as np\n",
    "import requests\n",
    "import umap\n",
    "from chromadb import Client\n",
    "from IPython.display import clear_output\n",
    "from langchain.docstore.document import Document\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain_community.document_loaders import PyPDFLoader\n",
    "from sentence_transformers import SentenceTransformer\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "# Seed for the stochastic UMAP layout so the projection is reproducible\n",
    "SEED = 42\n",
    "\n",
    "# Every path below is resolved relative to the kernel's working directory\n",
    "current_dir = os.getcwd()\n",
    "\n",
    "# Location of the settings file\n",
    "config_file_path = os.path.join(current_dir, \"..\", \"setup\", \"config_embed.json\")\n",
    "\n",
    "# Load the configuration values\n",
    "try:\n",
    "    with open(config_file_path, \"r\", encoding=\"utf-8\") as f:\n",
    "        dict_config = json.load(f)\n",
    "    # General settings\n",
    "    API_URL = dict_config[\"API_URL\"]\n",
    "    cohere_access_token = dict_config[\"cohere_access_token\"]\n",
    "    custom_proxies = dict_config[\"custom_proxies\"]\n",
    "    list_chroma_dir = dict_config[\"list_chroma_dir\"]\n",
    "    list_embed_file_path = dict_config[\"list_embed_file_path\"]\n",
    "    # Settings of the selected embed object\n",
    "    doc_config = dict_config[\"docs\"][embed_object]\n",
    "    model_name = doc_config[\"model_name\"]\n",
    "    embed_file_name = doc_config[\"file_name\"]\n",
    "    split_chunk_size = doc_config[\"split_chunk_size\"]\n",
    "    split_overlap = doc_config[\"split_overlap\"]\n",
    "    model_batch_size = doc_config[\"model_batch_size\"]\n",
    "    collection_ids_prefix = doc_config[\"collection_ids_prefix\"]\n",
    "    embed_collection_name = doc_config[\"collection_name\"]\n",
    "except Exception as e:\n",
    "    # Report the failure, then re-raise so the full traceback is kept and\n",
    "    # a \"Run All\" stops at this cell.\n",
    "    print(\"配置文件读取失败\", e)\n",
    "    raise\n",
    "\n",
    "# Build the Chroma directory and the PDF path from the path components\n",
    "# listed in the configuration.\n",
    "chroma_dir = os.path.join(current_dir, *list_chroma_dir)\n",
    "embed_file_path = os.path.join(current_dir, *list_embed_file_path, embed_file_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b4e2d3c5",
   "metadata": {},
   "source": [
    "## Load and split the PDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f168e7ca-a61a-4e64-9d22-1e95b6f95a4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_and_split_pdf(\n",
    "    file_path: str, chunk_size: int = 500, chunk_overlap: int = 100\n",
    ") -> list[Document]:\n",
    "    \"\"\"Load a PDF and split its text into overlapping chunks.\n",
    "\n",
    "    The text of all pages is joined with a single space between pages,\n",
    "    every line break (together with the whitespace around it) is removed,\n",
    "    and the result is split with RecursiveCharacterTextSplitter.\n",
    "\n",
    "    Args:\n",
    "        file_path: Path of the PDF file to load.\n",
    "        chunk_size: Forwarded to the splitter as chunk_size.\n",
    "        chunk_overlap: Forwarded to the splitter as chunk_overlap.\n",
    "\n",
    "    Returns:\n",
    "        One Document per chunk, carrying only page_content.\n",
    "    \"\"\"\n",
    "    # Load the PDF; the loader returns one document per page\n",
    "    pages = PyPDFLoader(file_path).load()\n",
    "\n",
    "    # Merge the content of all pages into one string\n",
    "    full_text = \" \".join(page.page_content for page in pages)\n",
    "\n",
    "    # Line breaks are removed outright (not replaced by spaces).\n",
    "    # Page markers such as —6— are not stripped from the text.\n",
    "    full_text = re.sub(r\"\\s*\\n\\s*\", \"\", full_text)\n",
    "\n",
    "    # Split the merged text into chunks\n",
    "    text_splitter = RecursiveCharacterTextSplitter(\n",
    "        chunk_size=chunk_size,\n",
    "        chunk_overlap=chunk_overlap,\n",
    "    )\n",
    "    chunks = text_splitter.split_text(full_text)\n",
    "\n",
    "    # Wrap every chunk in a Document for a consistent interface\n",
    "    return [Document(page_content=chunk) for chunk in chunks]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7288509",
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = load_and_split_pdf(embed_file_path, split_chunk_size, split_overlap)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c5f3e4d6",
   "metadata": {},
   "source": [
    "## Embed the chunks and store them in Chroma"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d3ed2e8-97c6-4db4-a4e8-d582013a2ba9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the persistent Chroma database\n",
    "client = chromadb.PersistentClient(path=chroma_dir)\n",
    "\n",
    "# Rebuild the collection: drop any existing one first\n",
    "try:\n",
    "    client.delete_collection(embed_collection_name)\n",
    "except Exception as e:\n",
    "    # Best-effort delete: a failure is only reported\n",
    "    # (presumably the collection did not exist yet - TODO confirm).\n",
    "    print(e)\n",
    "\n",
    "collection = client.get_or_create_collection(name=embed_collection_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5926d5d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the Hugging Face model locally.\n",
    "# NOTE(review): the model id is hard-coded here and `model_name` from the\n",
    "# configuration is not used - confirm which one is intended.\n",
    "model = SentenceTransformer(\"jinaai/jina-embeddings-v3\", trust_remote_code=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b23e16a-c069-4be2-8109-166f5cbb722c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plain text of every chunk, in document order\n",
    "request_docs = [doc.page_content for doc in docs]\n",
    "\n",
    "# Encode in batches of model_batch_size; embedded_docs collects one vector per chunk\n",
    "embedded_docs = []\n",
    "for start in range(0, len(request_docs), model_batch_size):\n",
    "    batch_vectors = model.encode(request_docs[start:start + model_batch_size])\n",
    "    embedded_docs.extend(batch_vectors)\n",
    "\n",
    "assert len(embedded_docs) == len(request_docs), \"expected one embedding per chunk\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb491aa1-4fdc-423f-ba76-861b68959777",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the chunks in Chroma with one add() call per batch instead of one\n",
    "# call per chunk. The progress bar therefore counts batches.\n",
    "for start in tqdm(range(0, len(request_docs), model_batch_size), desc=\"Processing documents\"):\n",
    "    stop = min(start + model_batch_size, len(request_docs))\n",
    "    collection.add(\n",
    "        ids=[collection_ids_prefix + str(idx) for idx in range(start, stop)],\n",
    "        documents=request_docs[start:stop],\n",
    "        metadatas=[{\"chunk\": idx} for idx in range(start, stop)],\n",
    "        # Vectors are handed over as plain Python lists\n",
    "        embeddings=[vector.tolist() for vector in embedded_docs[start:stop]],\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7d24e7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Free the memory held by the model; it is not used by the cells below\n",
    "del model\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d6a4f5e7",
   "metadata": {},
   "source": [
    "## Visualize the embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac99b839",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the UMAP reducer; the fixed seed makes the layout repeatable\n",
    "reducer = umap.UMAP(random_state=SEED)\n",
    "\n",
    "# Reduce the embeddings to two dimensions\n",
    "umap_projection = reducer.fit_transform(np.asarray(embedded_docs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "084d21af",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scatter plot of the two-dimensional projection\n",
    "fig, ax = plt.subplots()\n",
    "scatter = ax.scatter(umap_projection[:, 0], umap_projection[:, 1], s=10)\n",
    "\n",
    "# Interactive labels: show the text of the chunk under the cursor.\n",
    "# NOTE(review): hovering presumably needs an interactive matplotlib\n",
    "# backend; with static inline rendering no label appears - confirm.\n",
    "cursor = mplcursors.cursor(scatter, hover=True)\n",
    "cursor.connect(\"add\", lambda sel: sel.annotation.set_text(request_docs[sel.index]))\n",
    "\n",
    "ax.set_aspect('equal', 'datalim')\n",
    "ax.set_title('UMAP Projection', fontsize=24)\n",
    "ax.set_xlabel('UMAP 1')\n",
    "ax.set_ylabel('UMAP 2')\n",
    "\n",
    "# Render the figure\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}