[Use Case] Local-LLM + Topic model + Langchain + ChromaDB
In this post, I put together a workflow that classifies collected patent data with a topic model, builds a ChromaDB database for each topic, sets up RAG with Langchain, and has a local LLM answer questions.
As shown in the diagram above, the flow proceeds through the following steps.
1. Extract concepts from the patents
2. Classify the extracted concepts
3. Create a database for each topic
4. Set up RAG
0. Environment
1. Concept Extraction
For the patent data, I reuse the 857 results returned by a Google Patents search for "(semiconductor) country:US after:priority:20230101 language:ENGLISH", which were collected in a previous post.
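The code below assumes these results are already loaded into a pandas DataFrame df with url and description columns. A minimal sketch of that loading step, assuming the previous results were saved as a CSV (the file name and column names here are assumptions; adjust them to your own export):

import pandas as pd
# Hypothetical dump of the 857 collected patents; the later code only needs 'url' and 'description'
df = pd.read_csv(f"{path}/patents_semiconductor_us_2023.csv")
print(len(df), df.columns.tolist())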
As shown below, each patent's objective and key concepts are extracted from its description.
# Specify the model
import pickle
import torch
from tqdm import tqdm
from transformers import pipeline
pipe = pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta", torch_dtype=torch.bfloat16, device_map="auto")
# Extract the objective and key concepts of each patent
concepts_texts = []
for i in tqdm(range(len(df))):
    text = df['description'][i][:10000]  # use the first 10,000 characters of the description
    try:
        messages = [
            {
                "role": "system",
                "content": "You are an excellent technician, have a deep understanding of patents in particular, and always give careful, accurate and clear replies.",
            },
            {"role": "user", "content": f"What are the main objectives and key concepts of the patents shown next?\n-----\n{text}"},
        ]
        prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        outputs = pipe(prompt, max_new_tokens=2048, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
        # Keep only the text after the '<|assistant|>' tag (13 characters plus the following newline)
        concept = outputs[0]["generated_text"][outputs[0]["generated_text"].find('<|assistant|>')+14:]
        concepts_texts.append(concept)
        pickle.dump(concepts_texts, open(f"{path}/concepts.pkl", 'wb'))  # save after each patent
    except Exception:
        concepts_texts.append("")
2. Concept Classification
BERTopic is convenient for this, so I use it here.
For the embeddings, I use bge-large-en-v1.5 from BAAI (Beijing Academy of Artificial Intelligence).
import pickle
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

# Load the concepts extracted in step 1
docs = pickle.load(open(f"{path}/concepts.pkl", 'rb'))

# Topic representations: KeyBERT-inspired keywords plus an MMR-diversified "Aspect"
main_representation_model = KeyBERTInspired()
aspect_representation_model = [KeyBERTInspired(top_n_words=30),
                               MaximalMarginalRelevance(diversity=.5)]
representation_model = {
    "Main": main_representation_model,
    "Aspect": aspect_representation_model
}

model = 'BAAI/bge-large-en-v1.5'
topic_model = BERTopic(verbose=True, embedding_model=model,
                       min_topic_size=10,
                       representation_model=representation_model
                       )
topics, ini_probs = topic_model.fit_transform(docs)
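Before naming the topics it is worth a quick look at what BERTopic found; the columns used in the next step (Representative_Docs and the "Aspect" representation) all appear in get_topic_info():

# Overview of the discovered topics; row 0 is normally the outlier topic -1
topic_model.get_topic_info()[["Topic", "Count", "Name", "Aspect"]].head(10)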
Each topic is then given a name by zephyr-7B-α.
Since this part runs on CPU, I use TheBloke's 8-bit quantized model.
from llama_cpp import Llama
model_path = r".\LlamaCPP\model"  # folder where the gguf model is saved
model_name = "zephyr-7b-alpha.Q8_0.gguf"
llm = Llama(model_path=f"{model_path}/{model_name}", n_ctx=4096)

topic_info = topic_model.get_topic_info()
zephyr_keywords = []
for i in tqdm(range(len(topic_info))):
    DOCUMENTS = topic_info["Representative_Docs"][i][0]
    KEYWORDS = topic_info["Aspect"][i]
    system_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics.
<</SYS>>
"""
    example_prompt = """
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST] Environmental impacts of eating meat
"""
    main_prompt = f"""
[INST]
I have a topic that contains the following documents:
{DOCUMENTS}
The topic is described by the following keywords: '{KEYWORDS}'.
Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST]
"""
    prompt = system_prompt + example_prompt + main_prompt
    output = llm(prompt, temperature=0.1, echo=True)
    # echo=True returns the prompt as well, so keep only the text after the last newline
    keyword = output['choices'][0]['text'][output['choices'][0]['text'].rfind("\n")+1:]
    zephyr_keywords.append(keyword)
topic_model.set_topic_labels(zephyr_keywords)
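It is worth eyeballing the generated labels before they become folder names; a quick check against BERTopic's default topic names:

# Compare the default BERTopic names with the zephyr-generated labels
for default_name, label in zip(topic_model.get_topic_info()["Name"], zephyr_keywords):
    print(default_name, "->", label.strip())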
Create a folder for each of the generated topics and place a txt file with each patent's concept in it.
import os
# Create a folder for each topic label
for keyword in zephyr_keywords:
    os.makedirs(f"{path}/topic folder/{keyword.strip()}", exist_ok=True)

# Save each patent's concept as a txt file in its topic's folder
df['topic'] = topics
df['concept'] = docs
# Map each topic id to its label (row 0 of get_topic_info() is normally the outlier topic -1)
label_for_topic = dict(zip(topic_info["Topic"], zephyr_keywords))
for j in range(len(df)):
    name = df.url[j][df.url[j].rfind("US"):-3]  # patent number taken from the Google Patents URL
    outpath = f"{path}/topic folder/{label_for_topic[df.topic[j]].strip()}"
    with open(f'{outpath}/{name}.txt', 'w', encoding='utf-8') as f:
        f.write(df.concept[j])
3. Database Creation
Create the vector databases used for RAG with Langchain and ChromaDB.
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
txt_path = f"{path}/topic folder/{zephyr_keywords[0].strip()}"  # point at the first topic's data
loader = DirectoryLoader(txt_path, glob="**/*.txt", loader_cls=TextLoader)
docs = loader.load()
len(docs)
# Clean up the source metadata: keep only the file name (the patent number) without the extension
for doc in docs:
    doc.metadata['source'] = doc.metadata['source'][doc.metadata['source'].rfind("\\")+1:-4]
# Split the texts
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
texts = text_splitter.split_documents(docs)
# Create and persist the database
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma
model_name = "BAAI/bge-large-en-v1.5"
encode_kwargs = {'normalize_embeddings': True}
embedding_function = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs
)
persist_directory = f'{txt_path}/vdb'  # where to save the database
vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding_function,
                                 persist_directory=persist_directory)
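The persisted database can be reopened in a later session without re-embedding; a minimal sketch, assuming the same embedding model is supplied when loading:

# Reload the persisted database (the same embedding function must be provided)
vectordb = Chroma(persist_directory=persist_directory,
                  embedding_function=embedding_function)
vectordb.similarity_search("FinFET fabrication", k=1)  # quick smoke test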
4. RAG Setup
Set up the LLM used for question answering. This time I use Starling-LM-7B-alpha.
import torch
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
pipe = pipeline("text-generation", model="berkeley-nest/Starling-LM-7B-alpha", torch_dtype=torch.bfloat16,
                device_map="auto", max_new_tokens=512,
                repetition_penalty=1.15
                )
llm = HuggingFacePipeline(pipeline=pipe)
Set up Langchain's RetrievalQA.
Setting chain_type to "map_reduce" makes the LLM first generate an answer for each text chunk selected by the retriever (map step) and then combine those answers into a single final answer (reduce step).
from langchain.chains import RetrievalQA
retriever = vectordb.as_retriever(search_kwargs={"k": 5})
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                       chain_type="map_reduce",
                                       retriever=retriever,
                                       return_source_documents=True
                                       )
Question:
query = "What are the challenges of the 3D FinFET fabrication process"
llm_response = qa_chain(query)
print(llm_response['result'])
Answer:
Display the sources used for the answer.
[source.metadata for source in llm_response["source_documents"]]
You can also generate an answer using a prompt template with RetrievalQAWithSourcesChain.
from langchain.chains import RetrievalQAWithSourcesChain
from langchain import PromptTemplate, LLMChain
template = """
{summaries}
{question}
"""
retrieval_qa_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={
        "prompt": PromptTemplate(
            template=template,
            input_variables=["summaries", "question"],
        ),
    },
)
llm_response = retrieval_qa_chain(query)
print(llm_response['answer'])
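Because return_source_documents=True is set here as well, the sources can be inspected in the same way as before; RetrievalQAWithSourcesChain additionally returns a 'sources' field:

# Patent numbers the answer was based on, plus the full source metadata
print(llm_response['sources'])
[source.metadata for source in llm_response["source_documents"]]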