Basic NLP Tasks, Part 2
创始人
2024-03-12 15:54:57

Previous posts in this series covered fastText as well as text correction, MLM, NER, sentiment analysis, and summarization. This post continues with more of the basic NLP tasks.

6. Extractive Question Answering: given a passage, answer reading-comprehension questions by extracting the answer span from the text. The standard dataset for this task is SQuAD.

from transformers import pipeline
question_answerer = pipeline("question-answering")
context = r"""
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
a model on a SQuAD task, you may leverage the examples/pytorch/question-answering/run_squad.py script.
"""
>>> result = question_answerer(question="What is extractive question answering?", context=context)
>>> result
{'score': 0.6177276968955994, 'start': 34, 'end': 95, 'answer': 'the task of extracting an answer from a text given a question'}
>>> question_answerer(question="What is a good example of a question answering dataset?", context=context )
{'score': 0.5152317881584167, 'start': 147, 'end': 160, 'answer': 'SQuAD dataset'}

You can also ask several questions at once, and you can swap in a pretrained model fine-tuned on a larger dataset:

from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""
questions = [
    "How many pretrained models are available in 🤗 Transformers?",
    "What does 🤗 Transformers provide?",
    "🤗 Transformers provides interoperability between which frameworks?",
]
for question in questions:
    inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]
    outputs = model(**inputs)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits
    # Get the most likely beginning of answer with the argmax of the score
    answer_start = torch.argmax(answer_start_scores)
    # Get the most likely end of answer with the argmax of the score
    answer_end = torch.argmax(answer_end_scores) + 1
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
    )
    print(f"Question: {question}")
    print(f"Answer: {answer}")

7. Causal Language Modeling

from transformers import AutoModelForCausalLM, AutoTokenizer, top_k_top_p_filtering
import torch
from torch import nn
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
sequence = f"Hugging Face is based in DUMBO, New York City, and"
inputs = tokenizer(sequence, return_tensors="pt")
input_ids = inputs["input_ids"]
# get logits of last hidden state
next_token_logits = model(**inputs).logits[:, -1, :]
# filter
filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
# sample
probs = nn.functional.softmax(filtered_next_token_logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)
generated = torch.cat([input_ids, next_token], dim=-1)
resulting_string = tokenizer.decode(generated.tolist()[0])
print(resulting_string)
Hugging Face is based in DUMBO, New York City, and is

The result here is a bit underwhelming (a gap to revisit later): only a single extra token ("is") gets appended. Tweaking the parameters of top_k_top_p_filtering does not help much, because the code above only ever samples one next token; num_samples in torch.multinomial controls how many candidate tokens are drawn for that single position, not the length of the continuation. This task really belongs under text generation, covered in the next section.
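To get a longer continuation, the simplest fix is to let model.generate() run the sampling loop for you. Below is a minimal sketch reusing the GPT-2 setup above; the top_k, top_p, and max_new_tokens values are only illustrative, not tuned.

from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
input_ids = tokenizer("Hugging Face is based in DUMBO, New York City, and", return_tensors="pt")["input_ids"]
# Sample up to 30 new tokens with top-k / nucleus sampling (illustrative values)
output_ids = model.generate(
    input_ids,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    max_new_tokens=30,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))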

8. Text Generation

from transformers import pipeline
text_generator = pipeline("text-generation")
>>> print(text_generator("I love you, I will", max_length=10, do_sample=False))
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
[{'generated_text': 'I love you, I will never forget you.'}]

Text generation works better when you supply both a prompt and a padding text (background context), as in the XLNet example below:

from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("xlnet-base-cased")
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
# Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
(except for Alexei and Maria) are discovered.
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
remainder of the story. 1883 Western Siberia,
a young Grigori Rasputin is asked by his father and a group of men to perform magic.
Rasputin has a vision and denounces one of the men as a horse thief. Although his
father initially slaps him for making such an accusation, Rasputin watches as the
man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
with people, even a bishop, begging for his blessing.   """
prompt = "Today the weather is really nice and I am planning on "
inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]
prompt_length = len(tokenizer.decode(inputs[0]))
outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1 :]
print(generated)
>>> generated
"Today the weather is really nice and I am planning on going to a lovely outdoor restaurant. I'm feeling very happy, that's why I'm going to you right now, because I need to get a job for a few more days before I can go out and get a little more paid. I'm just ready to go. If I'm not sure what I can do, it's all important to me."

9. Translation

from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
article_hi = "संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है"
article_ar = "الأمين العام للأمم المتحدة يقول إنه لا يوجد حل عسكري في سوريا."
article_zh ="我爱你到永远。"
article_en ="I love you forver!"
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
# translate Hindi to French
tokenizer.src_lang = "hi_IN"
encoded_hi = tokenizer(article_hi, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"]
)
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "Le chef de l 'ONU affirme qu 'il n 'y a pas de solution militaire dans la Syrie."
# translate Arabic to English
tokenizer.src_lang = "ar_AR"
encoded_ar = tokenizer(article_ar, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_ar, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
)
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "The Secretary-General of the United Nations says there is no military solution in Syria."
tokenizer.src_lang = "zh_CN"
encoded_zh = tokenizer(article_zh, return_tensors="pt")
generated_tokens = model.generate(**encoded_zh,forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
)
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
#['I love you forever.']tokenizer.src_lang = "en_XX"
encoded_en = tokenizer(article_en, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id["zh_CN"]
)
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => a Chinese translation of the English sentence
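Translation can also be run through the pipeline API, just like the question-answering and text-generation examples above. Here is a minimal sketch with the same mBART checkpoint; the src_lang / tgt_lang keyword arguments are how the multilingual translation pipeline selects the language pair (the values here are only for illustration).

from transformers import pipeline
# Reuse the many-to-many mBART checkpoint through the high-level pipeline
translator = pipeline("translation", model="facebook/mbart-large-50-many-to-many-mmt")
# src_lang / tgt_lang pick the language pair for a multilingual model
print(translator("我爱你到永远。", src_lang="zh_CN", tgt_lang="en_XX"))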

Feel free to follow this column; it is updated continuously!

May we meet again one day, and may you still remember the topics we once discussed.
