"Open

In [None]:
!pip install -q langchain==0.0.235 openai

In [None]:
!wget https://raw.githubusercontent.com/WTFAcademy/WTF-Langchain/main/01_Hello_Langchain/README.md

## 加载文档

In [None]:
from langchain.document_loaders import TextLoader

# Decode the downloaded file as UTF-8 explicitly so the result does not depend on
# the platform's default text encoding.
loader = TextLoader("./README.md", encoding="utf-8")
docs = loader.load()

In [None]:
# Display the value of `docs` as this cell's output.
docs

## 拆分文档

### 按字符拆分

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Split on blank lines ("\n\n"); sizes are measured with len().
text_splitter = CharacterTextSplitter(
    separator="\n\n", chunk_size=1000, chunk_overlap=200, length_function=len
)
split_docs = text_splitter.split_documents(docs)

# First the length of the first source document, then the length of each chunk.
print(len(docs[0].page_content))
for chunk in split_docs:
    chunk_text = chunk.page_content
    print(len(chunk_text))

### 拆分代码

In [None]:
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

# Sample Python source fed to the splitter below.
PYTHON_CODE = """
def hello_langchain():
 print("Hello, Langchain!")

# Call the function
hello_langchain()
"""

# Build the splitter through from_language with Language.PYTHON,
# using chunk_size=50 and no overlap.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=50,
    chunk_overlap=0,
)

# create_documents takes a list of texts; the result is shown as the cell output.
python_docs = python_splitter.create_documents([PYTHON_CODE])
python_docs

### Markdown文档拆分

In [None]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

# Small markdown sample with three heading levels.
markdown_document = "# Chapter 1\n\n ## Section 1\n\nHi this is the 1st section\n\nWelcome\n\n ### Module 1 \n\n Hi this is the first module \n\n ## Section 2\n\n Hi this is the 2nd section"

# (heading marker, label) pairs: ("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3").
headers_to_split_on = [("#" * level, f"Header {level}") for level in (1, 2, 3)]

splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
splits = splitter.split_text(markdown_document)

# Show the resulting splits as the cell output.
splits

### 按字符递归拆分

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Recursive splitter with chunk_size=100 and chunk_overlap=20; sizes are measured with len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100, chunk_overlap=20, length_function=len
)
texts = text_splitter.split_documents(docs)

# First the length of the first source document, then the length of each chunk.
print(len(docs[0].page_content))
for chunk in texts:
    chunk_text = chunk.page_content
    print(len(chunk_text))

### 按token拆分

In [None]:
!pip install -q tiktoken

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Build the splitter through the tiktoken-based constructor: chunk_size=100, no overlap.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=100,
    chunk_overlap=0,
)
split_docs = text_splitter.split_documents(docs)

# Show the resulting chunks as the cell output.
split_docs

## 向量化文档分块

In [None]:
import os
from getpass import getpass

from langchain.embeddings import OpenAIEmbeddings

# Do not hardcode the key in the notebook: take it from the OPENAI_API_KEY
# environment variable, prompting once (input is not echoed) when it is unset.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

embeddings_model = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])

# Embed three short sample strings; the result is shown as the cell output.
embeddings = embeddings_model.embed_documents(
    [
        "你好!",
        "Langchain!",
        "你真棒!",
    ]
)
embeddings

## 向量数据存储

### 存储

In [None]:
!pip install -q chromadb

In [None]:
import os
from getpass import getpass

from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Do not hardcode the key in the notebook: take it from the OPENAI_API_KEY
# environment variable, prompting once (input is not echoed) when it is unset.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Split the loaded documents (chunk_size=1000, no overlap), then build the Chroma
# store from the chunks using OpenAI embeddings.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(docs)
db = Chroma.from_documents(
    documents, OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])
)

### 检索

In [None]:
query = "什么是WTF Langchain?"

# Keep the search hits under their own name: assigning them to `docs` would
# overwrite the loaded source documents that the splitting/storage cells read,
# so those cells would no longer be re-runnable.
similar_docs = db.similarity_search(query)
similar_docs

In [None]:
# Store this result under its own name rather than in `docs`, so the loaded source
# documents are not overwritten (presumably each hit is paired with a score, per
# the method name — confirm against the displayed output).
docs_with_scores = db.similarity_search_with_score(query)
docs_with_scores