# transformers: introduction

In [None]:
# first, install the library Transformers
# you only need to install this library once. 

!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [None]:
# import the transformers library, along with the pipeline and set_seed functions

import transformers
from transformers import pipeline, set_seed

## text generation
Generates new text based on an input prompt, like a chatbot.

In [None]:
# build the text generation "pipeline" and store it in the variable
# called "generator"
# the model is named explicitly (gpt2 is what the pipeline falls back to when
# no model is given), so the cell does not depend on the library's default

generator = pipeline('text-generation', model='gpt2')

No model was supplied, defaulted to gpt2 and revision 6c0e608 (https://huggingface.co/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# fix the random seed first: text generation picks words at random, so
# without a seed every run of this cell would return different text
set_seed(42)

# call the generator with a sentence to continue, a maximum length for each
# response, and the number of responses we want back
generator('This summer, I was rock climbing in Yosemite when',
          max_length=50,
          num_return_sequences=2)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "This summer, I was rock climbing in Yosemite when I saw Yosemite Park and how we were both in different places. All of our friends from that group were in the park too. We all went up there together (we didn't even have clothes)."},
 {'generated_text': 'This summer, I was rock climbing in Yosemite when I saw a very unusual looking figure wearing a hooded dress. The man went down there with a white hat and shorts while wearing a black dress. He had the same hair like a man, a'}]

## fill mask
Fills in the blank (the `<mask>` token) in a sentence with the model's best guesses.

In [None]:
# create the "unmasker" variable, set to the "fill-mask" task
# the model is named explicitly (distilroberta-base is what the pipeline
# falls back to when no model is given)

unmasker = pipeline('fill-mask', model='distilroberta-base')

No model was supplied, defaulted to distilroberta-base and revision ec58a5b (https://huggingface.co/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/331M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# the sentence contains a <mask> token marking the blank the model must fill
# "top_k=4" asks for 4 candidate answers

unmasker(
    'To be or not to be; that is the <mask>',
    top_k=4,
)

[{'score': 0.10899456590414047,
  'token': 2249,
  'token_str': ' difference',
  'sequence': 'To be or not to be; that is the difference'},
 {'score': 0.057923685759305954,
  'token': 2031,
  'token_str': ' choice',
  'sequence': 'To be or not to be; that is the choice'},
 {'score': 0.0572822242975235,
  'token': 3157,
  'token_str': ' truth',
  'sequence': 'To be or not to be; that is the truth'},
 {'score': 0.04440426453948021,
  'token': 1948,
  'token_str': ' answer',
  'sequence': 'To be or not to be; that is the answer'}]

In [None]:
# same idea with another sentence: here the blank is the place of work
unmasker(
    'My name is Professor Calado and I teach at <mask>',
    top_k=4,
)

[{'score': 0.1351284384727478,
  'token': 20124,
  'token_str': ' MIT',
  'sequence': 'My name is Professor Calado and I teach at MIT'},
 {'score': 0.07084151357412338,
  'token': 10441,
  'token_str': ' UCLA',
  'sequence': 'My name is Professor Calado and I teach at UCLA'},
 {'score': 0.06717374920845032,
  'token': 8607,
  'token_str': ' Stanford',
  'sequence': 'My name is Professor Calado and I teach at Stanford'},
 {'score': 0.06465509533882141,
  'token': 23706,
  'token_str': ' BYU',
  'sequence': 'My name is Professor Calado and I teach at BYU'}]

## summarization
Takes a longer text and condenses it.

In [None]:
# build the "summarization" pipeline and save it to "summarizer",
# then pass some text into the "summarizer"
# the model is named explicitly (sshleifer/distilbart-cnn-12-6 is what the
# pipeline falls back to when no model is given)

# we use three quotes at the beginning and end of the string
# if we want to put in a text that spans multiple lines

summarizer = pipeline('summarization', model='sshleifer/distilbart-cnn-12-6')
summarizer('''The past 3 years of work in NLP have been characterized 
by the development and deployment of ever larger language models, 
especially for English. BERT, its variants, GPT-2/3, and others, 
most recently Switch-C, have pushed the boundaries of the possible 
both through architectural innovations and through sheer size. Using 
these pretrained models and the methodology of fine-tuning them for 
specific tasks, researchers have extended the state of the art on a 
wide array of tasks as measured by leaderboards on specific benchmarks 
for English. In this paper, we take a step back and ask: How big is too 
big? What are the possible risks associated with this technology and 
what paths are available for mitigating those risks? We provide 
recommendations including weighing the environmental and financial costs 
first, investing resources into curating and carefully documenting 
datasets rather than ingesting everything on the web, carrying out 
pre-development exercises evaluating how the planned approach fits into 
research and development goals and supports stakeholder values, and 
encouraging research directions beyond ever larger language models.''')

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

[{'summary_text': ' The past 3 years of work in NLP have been characterized by the development and deployment of ever larger language models,  especially for English . In this paper, we take a step back and ask: How big is too big? What are the possible risks associated with this technology and what paths are available for mitigating those risks?'}]

## question-answering
Takes an input question and a context, and finds the answer within the context.

In [None]:
# build the question-answering pipeline, then pass it a question and a context
# the pipeline will look into the context to get the answer
# the model is named explicitly (distilbert-base-cased-distilled-squad is what
# the pipeline falls back to when no model is given)
# note: the line break and the leading spaces inside the triple-quoted
# context are part of the text the model receives

question_answer = pipeline('question-answering',
                           model='distilbert-base-cased-distilled-squad')
question_answer(question='Was the writer of Frankenstien a man or a woman?', 
                context='''Frankenstien is a book written by Mary Shelley who is 
                a woman''')

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'score': 0.6520994901657104, 'start': 71, 'end': 78, 'answer': 'a woman'}

## ner (named entity recognition)
Named entity recognition (NER) is a task where the model has to find which parts of the input text correspond to entities such as persons, locations, or organizations.

In [None]:
# build the named entity recognition pipeline and run it on one sentence
# the model is named explicitly (it is the one the pipeline falls back to
# when no model is given)
# aggregation_strategy="simple" groups the pieces of a multi-word entity
# (such as "Filipa Calado") into a single result; it replaces the deprecated
# grouped_entities=True argument

ner = pipeline("ner",
               model="dbmdz/bert-large-cased-finetuned-conll03-english",
               aggregation_strategy="simple")
ner("My name is Filipa Calado and I work at City College in Manhattan.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]



[{'entity_group': 'PER',
  'score': 0.9985998,
  'word': 'Filipa Calado',
  'start': 11,
  'end': 24},
 {'entity_group': 'ORG',
  'score': 0.9940423,
  'word': 'City College',
  'start': 39,
  'end': 51},
 {'entity_group': 'LOC',
  'score': 0.9883624,
  'word': 'Manhattan',
  'start': 55,
  'end': 64}]