import arxiv

# Query arXiv for the five most recently submitted papers matching "NLP".
arxiv_query = "NLP"
search = arxiv.Search(
    query=arxiv_query,
    max_results=5,
    sort_by=arxiv.SortCriterion.SubmittedDate,
)

# Materialize the lazy result iterator once, then project each field
# into its own parallel list (title / abstract / PDF link).
results = list(search.results())
titles = [paper.title for paper in results]
abstracts = [paper.summary for paper in results]
urls = [paper.pdf_url for paper in results]
今回使用する論文は下記5件:
1. "torchdistill Meets Hugging Face Libraries for Reproducible, Coding-Free Deep Learning Studies: A Case Study on NLP"
2. "1D-Touch: NLP-Assisted Coarse Text Selection via a Semi-Direct Gesture"
3. "The Validity of Evaluation Results: Assessing Concurrence Across Compositionality Benchmarks"
4. "LightLM: A Lightweight Deep and Narrow Language Model for Generative Recommendation"
5. "De-novo Chemical Reaction Generation by Means of Temporarily Convolutional Neural Networks"
from tqdm.notebook import tqdm

# Per-abstract summaries produced by Zephyr-7B-beta.
summary_beta = []
for text in tqdm(abstracts):
    messages = [
        {
            "role": "system",
            "content": "You are an excellent technician, have a deep understanding of patents in particular, and always give careful, accurate and clear replies.",
        },
        {"role": "user", "content": f"Please summarize the patent specification section below, using bullet points to identify the main purpose and means used.\n-----\n{text}"},
    ]
    prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    outputs = pipe(prompt, max_new_tokens=1024, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
    generated = outputs[0]["generated_text"]
    # Keep only the assistant's reply: skip the chat-template marker and the
    # newline that follows it. Bug fix: the original hard-coded `+14` and,
    # when the marker was absent, sliced from index 13 (find() returns -1);
    # fall back to the full generation instead of a garbled slice.
    marker = "<|assistant|>"
    idx = generated.find(marker)
    summary = generated[idx + len(marker) + 1:] if idx != -1 else generated
    summary_beta.append(summary)
# Per-abstract summaries produced by Zephyr-7B-alpha (pipeline `pipe_a`).
summary_alpha = []
for text in tqdm(abstracts):
    messages = [
        {
            "role": "system",
            "content": "You are an excellent technician, have a deep understanding of patents in particular, and always give careful, accurate and clear replies.",
        },
        {"role": "user", "content": f"Please summarize the patent specification section below, using bullet points to identify the main purpose and means used.\n-----\n{text}"},
    ]
    prompt = pipe_a.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    outputs = pipe_a(prompt, max_new_tokens=1024, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
    generated = outputs[0]["generated_text"]
    # Keep only the assistant's reply: skip the chat-template marker and the
    # newline that follows it. Bug fix: the original hard-coded `+14` and,
    # when the marker was absent, sliced from index 13 (find() returns -1);
    # fall back to the full generation instead of a garbled slice.
    marker = "<|assistant|>"
    idx = generated.find(marker)
    summary = generated[idx + len(marker) + 1:] if idx != -1 else generated
    summary_alpha.append(summary)
Zephyr-7B-βによる要約
# Display each paper title alongside its Zephyr-7B-beta summary.
for title, summary in zip(titles, summary_beta):
    print(f"Title : {title}")
    print(summary)
    print("\n")
Zephyr-7B-αによる要約
# Display each paper title alongside its Zephyr-7B-alpha summary.
for title, summary in zip(titles, summary_alpha):
    print(f"Title : {title}")
    print(summary)
    print("\n")
2. Arxivの"NLP"論文本文を要約
さきほどの5文献のうち1件の本文を取得
from langchain.document_loaders import OnlinePDFLoader

# Download the first paper's PDF and extract its raw body text.
loader = OnlinePDFLoader(urls[0])
documents = loader.load()
pdf_text = documents[0].page_content
# Split the paper body into model-sized chunks, then build one chat-message
# list (system + user turn) per chunk for the summarization prompts.
chunks = split_text(pdf_text, length=15000)
messages_list = [
    [
        {
            "role": "system",
            "content": "You are an excellent technician, have a deep understanding of patents in particular, and always give careful, accurate and clear replies.",
        },
        {
            "role": "user",
            "content": f"The following text is part of a technical paper. You are a good technician and will summarize it. In particular, please clearly state the purpose of the paper, approach used to achieve the purpose, and what you infer to be the conclusion of the paper. If references are listed, such as author names or journal names, please delete them without summarizing. \n-----\n{chunk}",
        },
    ]
    for chunk in chunks
]
要約
# Zephyr-7B-beta: summarize each body chunk, then fuse the chunk summaries
# into one final itemized summary (`main_summary_beta_l`).
main_summary_beta = []
for messages in tqdm(messages_list):
    try:
        prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        outputs = pipe(prompt, max_new_tokens=1024, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
        generated = outputs[0]["generated_text"]
        # Keep only the assistant reply (skip marker + following newline).
        # Bug fix: the original's `find(...)+14` sliced from index 13 when
        # the marker was missing; fall back to the full generation instead.
        marker = "<|assistant|>"
        idx = generated.find(marker)
        main_summary = generated[idx + len(marker) + 1:] if idx != -1 else generated
        main_summary_beta.append(main_summary)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still
        # propagate; a failed chunk contributes an empty summary rather than
        # aborting the whole run (preserves the original best-effort intent).
        main_summary_beta.append("")

# Second pass: feed the concatenated chunk summaries back to the model for
# a single consolidated, itemized summary.
text = "\n".join(main_summary_beta)
messages = [
    {
        "role": "system",
        "content": "You are an excellent technician, have a deep understanding of patents in particular, and always give careful, accurate and clear replies.",
    },
    {"role": "user", "content": f"Below is the summary text of the patent. Please itemize the main purpose and means used specifically. \n-----\n{text}"},
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=1024, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
generated = outputs[0]["generated_text"]
marker = "<|assistant|>"
idx = generated.find(marker)
main_summary_beta_l = generated[idx + len(marker) + 1:] if idx != -1 else generated
# Zephyr-7B-alpha: summarize each body chunk, then fuse the chunk summaries
# into one final itemized summary (`main_summary_alpha_l`).
main_summary_alpha = []
for messages in tqdm(messages_list):
    try:
        prompt = pipe_a.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        outputs = pipe_a(prompt, max_new_tokens=1024, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
        generated = outputs[0]["generated_text"]
        # Keep only the assistant reply (skip marker + following newline).
        # Bug fix: the original's `find(...)+14` sliced from index 13 when
        # the marker was missing; fall back to the full generation instead.
        marker = "<|assistant|>"
        idx = generated.find(marker)
        main_summary = generated[idx + len(marker) + 1:] if idx != -1 else generated
        main_summary_alpha.append(main_summary)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still
        # propagate; a failed chunk contributes an empty summary rather than
        # aborting the whole run (preserves the original best-effort intent).
        main_summary_alpha.append("")

# Second pass: feed the concatenated chunk summaries back to the model for
# a single consolidated, itemized summary.
text = "\n".join(main_summary_alpha)
messages = [
    {
        "role": "system",
        "content": "You are an excellent technician, have a deep understanding of patents in particular, and always give careful, accurate and clear replies.",
    },
    {"role": "user", "content": f"Below is the summary text of the patent. Please itemize the main purpose and means used specifically. \n-----\n{text}"},
]
prompt = pipe_a.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe_a(prompt, max_new_tokens=1024, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
generated = outputs[0]["generated_text"]
marker = "<|assistant|>"
idx = generated.find(marker)
main_summary_alpha_l = generated[idx + len(marker) + 1:] if idx != -1 else generated