HuggingFace Transformers Library

Master the HuggingFace ecosystem — pipeline API, tokenizers, AutoModel classes, and fine-tuning with Trainer.

Intermediate · 22 min read

The HuggingFace Ecosystem

HuggingFace provides a unified API to load and use 500,000+ pre-trained models for NLP, vision, audio, and multimodal tasks — with just a few lines of code.

from transformers import pipeline

# Sentiment analysis
classifier = pipeline("sentiment-analysis")
print(classifier("HuggingFace is absolutely amazing!"))
# [{'label': 'POSITIVE', 'score': 0.9999}]

# Zero-shot classification
zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
result = zero_shot(
    "The new iPhone has an incredible camera.",
    candidate_labels=["technology", "sports", "politics"],
)
print(result['labels'][0])  # technology

# NER
ner = pipeline("ner", aggregation_strategy="simple")
entities = ner("Elon Musk founded SpaceX in Hawthorne, California.")
for e in entities:
    print(f"{e['word']:15} {e['entity_group']:8} ({e['score']:.3f})")

# Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
text = "The Amazon rainforest covers 5.5 million km2 in South America..."
print(summarizer(text, max_length=60, min_length=20)[0]['summary_text'])
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

dataset = load_dataset("imdb")

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)

tokenized = dataset.map(tokenize, batched=True)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    return {"accuracy": accuracy_score(labels, np.argmax(logits, axis=-1))}

args = TrainingArguments(
    output_dir="./distilbert-imdb",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    evaluation_strategy="epoch",
    fp16=True,
)

Trainer(
    model=model, args=args,
    train_dataset=tokenized["train"].select(range(2000)),
    eval_dataset=tokenized["test"].select(range(500)),
    compute_metrics=compute_metrics,
).train()

Part of the NLP & Language Models series on Tekivex. Browse all tutorials or explore our open-source products.