{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyOgQoOin53yoGqil3iR6M6W",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "\"Open"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "MjW9VjNto59d"
      },
      "outputs": [],
      "source": [
        "# langchain 0.0.235 uses the pre-1.0 OpenAI SDK (openai.Embedding), which openai>=1.0 removed,\n",
        "# so keep openai below 1.0. %pip installs into the environment of the running kernel.\n",
        "%pip install -q langchain==0.0.235 \"openai<1\""
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# -O keeps the download idempotent: a re-run overwrites README.md instead of saving README.md.1.\n",
        "!wget -O README.md https://raw.githubusercontent.com/WTFAcademy/WTF-Langchain/main/01_Hello_Langchain/README.md"
      ],
      "metadata": {
        "id": "LS_efmfC5Hp6"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## 加载文档"
      ],
      "metadata": {
        "id": "C-W2t1v65-Gt"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.document_loaders import TextLoader\n",
        "\n",
        "# Decode as UTF-8 explicitly rather than relying on the platform's default encoding.\n",
        "loader = TextLoader(\"./README.md\", encoding=\"utf-8\")\n",
        "docs = loader.load()"
      ],
      "metadata": {
        "id": "e1_VoFqS5GJ4"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "docs"
      ],
      "metadata": {
        "id": "omltifXH6jc7"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## 拆分文档"
      ],
      "metadata": {
        "id": "kRPF6Mfn6Ake"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "### 按字符拆分"
      ],
      "metadata": {
        "id": "FakX37SB6DT4"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.text_splitter import CharacterTextSplitter\n",
        "\n",
        "# Split on blank lines; chunk_size and chunk_overlap are measured with\n",
        "# length_function, i.e. in characters here.\n",
        "char_splitter = CharacterTextSplitter(\n",
        "    separator=\"\\n\\n\",\n",
        "    chunk_size=1000,\n",
        "    chunk_overlap=200,\n",
        "    length_function=len,\n",
        ")\n",
        "\n",
        "# Stage-specific names so later splitting cells do not silently rebind these.\n",
        "char_chunks = char_splitter.split_documents(docs)\n",
        "\n",
        "# Length of the whole document first, then the length of every chunk.\n",
        "print(len(docs[0].page_content))\n",
        "for chunk in char_chunks:\n",
        "    print(len(chunk.page_content))"
      ],
      "metadata": {
        "id": "0gm-A-_r5Wfb"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### 拆分代码"
      ],
      "metadata":
{
        "id": "8avQDR6u6HCP"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.text_splitter import Language, RecursiveCharacterTextSplitter\n",
        "\n",
        "# Sample Python source used as the splitter input.\n",
        "PYTHON_CODE = \"\"\"\n",
        "def hello_langchain():\n",
        " print(\"Hello, Langchain!\")\n",
        "\n",
        "# Call the function\n",
        "hello_langchain()\n",
        "\"\"\"\n",
        "\n",
        "# Recursive splitter configured for Python via from_language.\n",
        "py_splitter = RecursiveCharacterTextSplitter.from_language(\n",
        "    language=Language.PYTHON,\n",
        "    chunk_size=50,\n",
        "    chunk_overlap=0,\n",
        ")\n",
        "py_chunks = py_splitter.create_documents([PYTHON_CODE])\n",
        "py_chunks"
      ],
      "metadata": {
        "id": "OlNC7pR15Z0r"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Markdown文档拆分"
      ],
      "metadata": {
        "id": "O_wWWlWS6JkO"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.text_splitter import MarkdownHeaderTextSplitter\n",
        "\n",
        "# Sample markdown, written as adjacent literals (one per paragraph) that\n",
        "# Python concatenates into a single string.\n",
        "markdown_document = (\n",
        "    \"# Chapter 1\\n\\n\"\n",
        "    \" ## Section 1\\n\\n\"\n",
        "    \"Hi this is the 1st section\\n\\n\"\n",
        "    \"Welcome\\n\\n\"\n",
        "    \" ### Module 1 \\n\\n\"\n",
        "    \" Hi this is the first module \\n\\n\"\n",
        "    \" ## Section 2\\n\\n\"\n",
        "    \" Hi this is the 2nd section\"\n",
        ")\n",
        "\n",
        "# Each pair couples a markdown header prefix with the label used for that level.\n",
        "headers_to_split_on = [(\"#\", \"Header 1\"), (\"##\", \"Header 2\"), (\"###\", \"Header 3\")]\n",
        "\n",
        "md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
        "md_sections = md_splitter.split_text(markdown_document)\n",
        "\n",
        "md_sections"
      ],
      "metadata": {
        "id": "Gg6twioR5cX8"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### 按字符递归拆分"
      ],
      "metadata": {
        "id": "Spo_Nn036Oko"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
        "\n",
        "# Sizes are measured with len, i.e. in characters.\n",
        "recursive_splitter = RecursiveCharacterTextSplitter(\n",
        "    chunk_size=100,\n",
        "    chunk_overlap=20,\n",
        "    length_function=len,\n",
        ")\n",
        "recursive_chunks = recursive_splitter.split_documents(docs)\n",
        "\n",
        "# Length of the whole document first, then one line per chunk length.\n",
        "print(len(docs[0].page_content))\n",
        "for chunk in recursive_chunks:\n",
        "    print(len(chunk.page_content))"
      ],
      "metadata": {
        "id": "RLxIWV3G5nSh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type":
"markdown",
      "source": [
        "### 按token拆分"
      ],
      "metadata": {
        "id": "iH_AHoif6SVQ"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "%pip install -q tiktoken"
      ],
      "metadata": {
        "id": "L0zcXo2y8urg"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.text_splitter import CharacterTextSplitter\n",
        "\n",
        "# from_tiktoken_encoder measures chunk_size / chunk_overlap with a tiktoken encoder\n",
        "# instead of len.\n",
        "text_splitter = CharacterTextSplitter.from_tiktoken_encoder(\n",
        "    chunk_size=100, chunk_overlap=0\n",
        ")\n",
        "split_docs = text_splitter.split_documents(docs)\n",
        "\n",
        "split_docs"
      ],
      "metadata": {
        "id": "WGg-ZOaq5pzl"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## 向量化文档分块"
      ],
      "metadata": {
        "id": "d8dfw22O6Vb2"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "from getpass import getpass\n",
        "\n",
        "from langchain.embeddings import OpenAIEmbeddings\n",
        "\n",
        "# Never hardcode the API key in the notebook: take it from the environment,\n",
        "# or prompt for it once (the input is not echoed and is not saved in the file).\n",
        "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
        "    os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")\n",
        "\n",
        "embeddings_model = OpenAIEmbeddings(openai_api_key=os.environ[\"OPENAI_API_KEY\"])\n",
        "embeddings = embeddings_model.embed_documents(\n",
        "    [\n",
        "        \"你好!\",\n",
        "        \"Langchain!\",\n",
        "        \"你真棒!\"\n",
        "    ]\n",
        ")\n",
        "embeddings"
      ],
      "metadata": {
        "id": "AghMYu8r5zBW"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## 向量数据存储"
      ],
      "metadata": {
        "id": "QYq8gm4g6ZBl"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "### 存储"
      ],
      "metadata": {
        "id": "Jff1dIkk6cwh"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "%pip install -q chromadb"
      ],
      "metadata": {
        "id": "3KT-ziYSMMn9"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "\n",
        "from langchain.document_loaders import TextLoader\n",
        "from langchain.embeddings.openai import OpenAIEmbeddings\n",
        "from langchain.text_splitter import CharacterTextSplitter\n",
        "from langchain.vectorstores import Chroma\n",
        "\n",
        "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
        "documents = text_splitter.split_documents(docs)\n",
        "\n",
        "# The key is read from OPENAI_API_KEY (set in the embeddings cell) rather than\n",
        "# being written into the notebook; a missing key fails loudly with KeyError.\n",
        "db = Chroma.from_documents(\n",
        "    documents,\n",
        "    OpenAIEmbeddings(openai_api_key=os.environ[\"OPENAI_API_KEY\"]),\n",
        ")"
      ],
      "metadata": {
        "id": "vtDRMAx752w_"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### 检索"
],
      "metadata": {
        "id": "3SmtLL016f5l"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "query = \"什么是WTF Langchain?\"\n",
        "\n",
        "# Keep the hits under their own name: rebinding `docs` would clobber the loaded\n",
        "# source documents that the splitting and storage cells read, so re-running\n",
        "# those cells would operate on search results instead of the README.\n",
        "matched_docs = db.similarity_search(query)\n",
        "matched_docs"
      ],
      "metadata": {
        "id": "XqqP4P4554j5"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Same query through the scored variant of the search; the result again gets\n",
        "# its own name instead of overwriting `docs`.\n",
        "scored_matches = db.similarity_search_with_score(query)\n",
        "scored_matches"
      ],
      "metadata": {
        "id": "bAa13Y7DM-rO"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}