Commit 0176e840 authored by Administrator

Added QA LLM to the retrieve script

parent 7f1e6463
@@ -150,26 +150,52 @@
        metadatas=[{"chunk": idx}],
        embeddings=[doc]
    )
```
%%%% Output: display_data
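Only the tail of the indexing call is visible above the collapsed hunk. A minimal sketch of the loop shape it implies, assuming a `collection`, the `request_docs` chunk list, and an `embed_text` helper, none of which are shown in this diff:

``` python
# Hypothetical reconstruction of the indexing loop whose tail appears above:
# each chunk's text and embedding are stored in a ChromaDB collection.
for idx, chunk in enumerate(request_docs):
    doc = embed_text(chunk)  # embed_text is a stand-in for the real embedding call
    collection.add(
        ids=[str(idx)],
        documents=[chunk],
        metadatas=[{"chunk": idx}],
        embeddings=[doc]
    )
```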
%% Cell type:code id:e7d24e7f tags:
``` python
# Free memory
del model
import gc
gc.collect()
```
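If the embedding model was loaded onto a GPU, `del` plus `gc.collect()` releases the Python objects but not PyTorch's cached CUDA memory. A small hedged addition, assuming PyTorch is the backend (as the later `transformers` cells suggest):

``` python
# Also release PyTorch's cached GPU memory, if a GPU was used
import torch
if torch.cuda.is_available():
    torch.cuda.empty_cache()
```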
%%%% Output: execute_result
14438
%% Cell type:code id:ac99b839 tags:
``` python
# Import umap and matplotlib
import umap
import matplotlib.pyplot as plt
import numpy as np
import mplcursors
# Create the UMAP reducer
reducer = umap.UMAP()
# Reduce the embeddings to two dimensions
embedding = reducer.fit_transform(embedded_docs)
```
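UMAP is stochastic, so repeated runs of the cell above give different layouts. A hedged variant that seeds the reducer for a reproducible plot:

``` python
# Seed UMAP so repeated runs produce the same 2-D layout
reducer = umap.UMAP(n_components=2, random_state=42)
embedding = reducer.fit_transform(embedded_docs)
```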
%% Cell type:code id:084d21af tags:
``` python
# Plot the 2-D data points
scatter = plt.scatter(embedding[:, 0], embedding[:, 1], s=10)
# Add interactive hover labels showing each point's source document
cursor = mplcursors.cursor(scatter, hover=True)
cursor.connect("add", lambda sel: sel.annotation.set_text(request_docs[sel.index]))
plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP Projection', fontsize=24)
# Show the figure
plt.show()
```
%% Cell type:code id:61d5bbbe tags:
``` python
```
......
@@ -3,11 +3,11 @@
``` python
## Preparation
# Specify the embedding collection
embed_object = "labors-law"
# Specify the user question
query_text = "对于劳动合同的解除有什么规定?"  # "What are the rules on terminating a labor contract?"
query_text = "关于工资福利,有什么规定?"  # "What are the rules on wages and benefits?"
```
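The retrieval step that produces the `results` dict used in the prompt cell further below is collapsed in this diff. A minimal sketch of a ChromaDB query with that return shape, assuming a persistent client path and an `encode_text` helper matching the model that built the index:

``` python
# Hypothetical sketch of the collapsed retrieval step
import chromadb

client = chromadb.PersistentClient(path="./chroma")  # assumed storage path
collection = client.get_collection(name=embed_object)
results = collection.query(
    query_embeddings=[encode_text(query_text)],  # encode_text is a stand-in
    n_results=5,
)
# results["documents"][0] holds the retrieved chunk texts for the query
```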
%% Cell type:code id:4b9d9fa0-1c73-4cef-abc4-397458215159 tags:
``` python
@@ -110,8 +110,61 @@
del model
import gc
gc.collect()
```
%%%% Output: execute_result
28863
%% Cell type:code id:4b9a1869 tags:
``` python
# Load the QA model
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
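`device_map="auto"` requires the accelerate package to be installed. A hedged CPU-only fallback load if it is unavailable:

``` python
# CPU-only fallback load, assuming accelerate is not installed
import torch
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
model = model.to("cpu")
```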
%% Cell type:code id:902860e9 tags:
``` python
# Prepare the prompt
prompt = query_text
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    # The second system message injects the retrieved chunks: "When answering,
    # you must base your answer on the following context:"
    {"role": "system", "content": "在回答的过程中,必须根据以下提示回答:\n" + "\n - ".join(results["documents"][0])},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
```
%% Cell type:code id:f4aa0f73 tags:
``` python
# Generate the answer
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
# Keep only the newly generated tokens, dropping the echoed prompt
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
```
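For interactive use, the same call can stream tokens as they are produced instead of waiting for the full answer; a small variant using transformers' `TextStreamer`:

``` python
# Stream the answer token by token as it is generated
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
_ = model.generate(**model_inputs, max_new_tokens=512, streamer=streamer)
```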
%% Cell type:code id:f2baa5f5 tags:
``` python
```
......