[Refactor] Splitting pipelines.py into its own module. (#9279)
* Splitting pipelines into its own module.
* Moving everything into base.py
* Moving FeatureExtractionPipeline into its own file.
* TextGenerationPipeline.
* TextClassificationPipeline
* ZeroShot + get_framework import.
* FillMaskPipeline
* NerPipeline + TokenClassificationPipeline
* QuestionAnsweringPipeline
* TableQuestionAnsweringPipeline
* ConversationalPipeline
* Text2TextGenerationPipeline, TranslationPipeline, SummarizationPipeline
* Typo import fix.
* Relative imports.
parent d64372fdfc
commit 090d28e32d
File diff suppressed because it is too large.
src/transformers/pipelines/__init__.py
@@ -0,0 +1,418 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.

# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union

from ..configuration_utils import PretrainedConfig
from ..file_utils import is_tf_available, is_torch_available
from ..modelcard import ModelCard
from ..models.auto.configuration_auto import AutoConfig
from ..models.auto.tokenization_auto import AutoTokenizer
from ..tokenization_utils import PreTrainedTokenizer
from ..utils import logging
from .base import (
    ArgumentHandler,
    CsvPipelineDataFormat,
    JsonPipelineDataFormat,
    PipedPipelineDataFormat,
    Pipeline,
    PipelineDataFormat,
    PipelineException,
    get_default_model,
    get_framework,
)
from .conversational import Conversation, ConversationalPipeline
from .feature_extraction import FeatureExtractionPipeline
from .fill_mask import FillMaskPipeline
from .question_answering import QuestionAnsweringArgumentHandler, QuestionAnsweringPipeline
from .table_question_answering import TableQuestionAnsweringArgumentHandler, TableQuestionAnsweringPipeline
from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
from .text_classification import TextClassificationPipeline
from .text_generation import TextGenerationPipeline
from .token_classification import NerPipeline, TokenClassificationArgumentHandler, TokenClassificationPipeline
from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline


if is_tf_available():
    import tensorflow as tf

    from ..models.auto.modeling_tf_auto import (
        TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
        TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
        TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
        TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
        TF_MODEL_WITH_LM_HEAD_MAPPING,
        TFAutoModel,
        TFAutoModelForCausalLM,
        TFAutoModelForMaskedLM,
        TFAutoModelForQuestionAnswering,
        TFAutoModelForSeq2SeqLM,
        TFAutoModelForSequenceClassification,
        TFAutoModelForTokenClassification,
    )

if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import (
        MODEL_FOR_MASKED_LM_MAPPING,
        MODEL_FOR_QUESTION_ANSWERING_MAPPING,
        MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
        MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
        MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
        MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
        AutoModel,
        AutoModelForCausalLM,
        AutoModelForMaskedLM,
        AutoModelForQuestionAnswering,
        AutoModelForSeq2SeqLM,
        AutoModelForSequenceClassification,
        AutoModelForTableQuestionAnswering,
        AutoModelForTokenClassification,
    )

if TYPE_CHECKING:
    from ..modeling_tf_utils import TFPreTrainedModel
    from ..modeling_utils import PreTrainedModel

logger = logging.get_logger(__name__)


# Register all the supported tasks here
SUPPORTED_TASKS = {
    "feature-extraction": {
        "impl": FeatureExtractionPipeline,
        "tf": TFAutoModel if is_tf_available() else None,
        "pt": AutoModel if is_torch_available() else None,
        "default": {"model": {"pt": "distilbert-base-cased", "tf": "distilbert-base-cased"}},
    },
    "sentiment-analysis": {
        "impl": TextClassificationPipeline,
        "tf": TFAutoModelForSequenceClassification if is_tf_available() else None,
        "pt": AutoModelForSequenceClassification if is_torch_available() else None,
        "default": {
            "model": {
                "pt": "distilbert-base-uncased-finetuned-sst-2-english",
                "tf": "distilbert-base-uncased-finetuned-sst-2-english",
            },
        },
    },
    "ner": {
        "impl": TokenClassificationPipeline,
        "tf": TFAutoModelForTokenClassification if is_tf_available() else None,
        "pt": AutoModelForTokenClassification if is_torch_available() else None,
        "default": {
            "model": {
                "pt": "dbmdz/bert-large-cased-finetuned-conll03-english",
                "tf": "dbmdz/bert-large-cased-finetuned-conll03-english",
            },
        },
    },
    "question-answering": {
        "impl": QuestionAnsweringPipeline,
        "tf": TFAutoModelForQuestionAnswering if is_tf_available() else None,
        "pt": AutoModelForQuestionAnswering if is_torch_available() else None,
        "default": {
            "model": {"pt": "distilbert-base-cased-distilled-squad", "tf": "distilbert-base-cased-distilled-squad"},
        },
    },
    "table-question-answering": {
        "impl": TableQuestionAnsweringPipeline,
        "pt": AutoModelForTableQuestionAnswering if is_torch_available() else None,
        "tf": None,
        "default": {
            "model": {
                "pt": "nielsr/tapas-base-finetuned-wtq",
                "tokenizer": "nielsr/tapas-base-finetuned-wtq",
                "tf": "nielsr/tapas-base-finetuned-wtq",
            },
        },
    },
    "fill-mask": {
        "impl": FillMaskPipeline,
        "tf": TFAutoModelForMaskedLM if is_tf_available() else None,
        "pt": AutoModelForMaskedLM if is_torch_available() else None,
        "default": {"model": {"pt": "distilroberta-base", "tf": "distilroberta-base"}},
    },
    "summarization": {
        "impl": SummarizationPipeline,
        "tf": TFAutoModelForSeq2SeqLM if is_tf_available() else None,
        "pt": AutoModelForSeq2SeqLM if is_torch_available() else None,
        "default": {"model": {"pt": "sshleifer/distilbart-cnn-12-6", "tf": "t5-small"}},
    },
    # This task is a special case as it's parametrized by SRC, TGT languages.
    "translation": {
        "impl": TranslationPipeline,
        "tf": TFAutoModelForSeq2SeqLM if is_tf_available() else None,
        "pt": AutoModelForSeq2SeqLM if is_torch_available() else None,
        "default": {
            ("en", "fr"): {"model": {"pt": "t5-base", "tf": "t5-base"}},
            ("en", "de"): {"model": {"pt": "t5-base", "tf": "t5-base"}},
            ("en", "ro"): {"model": {"pt": "t5-base", "tf": "t5-base"}},
        },
    },
    "text2text-generation": {
        "impl": Text2TextGenerationPipeline,
        "tf": TFAutoModelForSeq2SeqLM if is_tf_available() else None,
        "pt": AutoModelForSeq2SeqLM if is_torch_available() else None,
        "default": {"model": {"pt": "t5-base", "tf": "t5-base"}},
    },
    "text-generation": {
        "impl": TextGenerationPipeline,
        "tf": TFAutoModelForCausalLM if is_tf_available() else None,
        "pt": AutoModelForCausalLM if is_torch_available() else None,
        "default": {"model": {"pt": "gpt2", "tf": "gpt2"}},
    },
    "zero-shot-classification": {
        "impl": ZeroShotClassificationPipeline,
        "tf": TFAutoModelForSequenceClassification if is_tf_available() else None,
        "pt": AutoModelForSequenceClassification if is_torch_available() else None,
        "default": {
            "model": {"pt": "facebook/bart-large-mnli", "tf": "roberta-large-mnli"},
            "config": {"pt": "facebook/bart-large-mnli", "tf": "roberta-large-mnli"},
            "tokenizer": {"pt": "facebook/bart-large-mnli", "tf": "roberta-large-mnli"},
        },
    },
    "conversational": {
        "impl": ConversationalPipeline,
        "tf": TFAutoModelForCausalLM if is_tf_available() else None,
        "pt": AutoModelForCausalLM if is_torch_available() else None,
        "default": {"model": {"pt": "microsoft/DialoGPT-medium", "tf": "microsoft/DialoGPT-medium"}},
    },
}
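
# Illustrative sketch (editorial note, not part of the commit): each registry
# entry maps a task name to its pipeline class, per-framework auto-model
# classes, and default checkpoints, e.g.:
#
#   entry = SUPPORTED_TASKS["fill-mask"]
#   entry["impl"]                       # FillMaskPipeline
#   entry["default"]["model"]["pt"]     # "distilroberta-base"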


def check_task(task: str) -> Tuple[Dict, Any]:
    """
    Checks an incoming task string, to validate it's correct, and returns the default Pipeline and Model classes,
    and default models if they exist.

    Args:
        task (:obj:`str`):
            The task defining which pipeline will be returned. Currently accepted tasks are:

            - :obj:`"feature-extraction"`
            - :obj:`"sentiment-analysis"`
            - :obj:`"ner"`
            - :obj:`"question-answering"`
            - :obj:`"fill-mask"`
            - :obj:`"summarization"`
            - :obj:`"translation_xx_to_yy"`
            - :obj:`"translation"`
            - :obj:`"text-generation"`
            - :obj:`"conversational"`

    Returns:
        (task_defaults :obj:`dict`, task_options (:obj:`tuple`, None)): The actual dictionary required to initialize
        the pipeline and some extra task options for parametrized tasks like "translation_XX_to_YY".
    """
    if task in SUPPORTED_TASKS:
        targeted_task = SUPPORTED_TASKS[task]
        return targeted_task, None

    if task.startswith("translation"):
        tokens = task.split("_")
        if len(tokens) == 4 and tokens[0] == "translation" and tokens[2] == "to":
            targeted_task = SUPPORTED_TASKS["translation"]
            return targeted_task, (tokens[1], tokens[3])
        raise KeyError("Invalid translation task {}, use 'translation_XX_to_YY' format".format(task))

    raise KeyError(
        "Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys()) + ["translation_XX_to_YY"])
    )
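
# Illustrative sketch (editorial note): how check_task() resolves strings,
# given the registry above:
#
#   check_task("ner")                   # -> (SUPPORTED_TASKS["ner"], None)
#   check_task("translation_en_to_de")  # -> (SUPPORTED_TASKS["translation"], ("en", "de"))
#   check_task("translation_en-de")     # -> raises KeyError (wrong format)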


def pipeline(
    task: str,
    model: Optional = None,
    config: Optional[Union[str, PretrainedConfig]] = None,
    tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
    framework: Optional[str] = None,
    revision: Optional[str] = None,
    use_fast: bool = True,
    **kwargs
) -> Pipeline:
    """
    Utility factory method to build a :class:`~transformers.Pipeline`.

    Pipelines are made of:

        - A :doc:`tokenizer <tokenizer>` in charge of mapping raw textual input to tokens.
        - A :doc:`model <model>` to make predictions from the inputs.
        - Some (optional) post processing for enhancing the model's output.

    Args:
        task (:obj:`str`):
            The task defining which pipeline will be returned. Currently accepted tasks are:

            - :obj:`"feature-extraction"`: will return a :class:`~transformers.FeatureExtractionPipeline`.
            - :obj:`"sentiment-analysis"`: will return a :class:`~transformers.TextClassificationPipeline`.
            - :obj:`"ner"`: will return a :class:`~transformers.TokenClassificationPipeline`.
            - :obj:`"question-answering"`: will return a :class:`~transformers.QuestionAnsweringPipeline`.
            - :obj:`"fill-mask"`: will return a :class:`~transformers.FillMaskPipeline`.
            - :obj:`"summarization"`: will return a :class:`~transformers.SummarizationPipeline`.
            - :obj:`"translation_xx_to_yy"`: will return a :class:`~transformers.TranslationPipeline`.
            - :obj:`"text2text-generation"`: will return a :class:`~transformers.Text2TextGenerationPipeline`.
            - :obj:`"text-generation"`: will return a :class:`~transformers.TextGenerationPipeline`.
            - :obj:`"zero-shot-classification"`: will return a :class:`~transformers.ZeroShotClassificationPipeline`.
            - :obj:`"conversational"`: will return a :class:`~transformers.ConversationalPipeline`.
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`):
            The model that will be used by the pipeline to make predictions. This can be a model identifier or an
            actual instance of a pretrained model inheriting from :class:`~transformers.PreTrainedModel` (for PyTorch)
            or :class:`~transformers.TFPreTrainedModel` (for TensorFlow).

            If not provided, the default for the :obj:`task` will be loaded.
        config (:obj:`str` or :obj:`~transformers.PretrainedConfig`, `optional`):
            The configuration that will be used by the pipeline to instantiate the model. This can be a model
            identifier or an actual pretrained model configuration inheriting from
            :class:`~transformers.PretrainedConfig`.

            If not provided, the default configuration file for the requested model will be used. That means that if
            :obj:`model` is given, its default configuration will be used. However, if :obj:`model` is not supplied,
            this :obj:`task`'s default model's config is used instead.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be a model
            identifier or an actual pretrained tokenizer inheriting from :class:`~transformers.PreTrainedTokenizer`.

            If not provided, the default tokenizer for the given :obj:`model` will be loaded (if it is a string). If
            :obj:`model` is not specified or not a string, then the default tokenizer for :obj:`config` is loaded (if
            it is a string). However, if :obj:`config` is also not given or not a string, then the default tokenizer
            for the given :obj:`task` will be loaded.
        framework (:obj:`str`, `optional`):
            The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified
            framework must be installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no
            model is provided.
        revision (:obj:`str`, `optional`, defaults to :obj:`"main"`):
            When passing a task name or a string model identifier: the specific model version to use. It can be a
            branch name, a tag name, or a commit id, since we use a git-based system for storing models and other
            artifacts on huggingface.co, so ``revision`` can be any identifier allowed by git.
        use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to use a Fast tokenizer if possible (a :class:`~transformers.PreTrainedTokenizerFast`).
        kwargs:
            Additional keyword arguments passed along to the specific pipeline init (see the documentation for the
            corresponding pipeline class for possible values).

    Returns:
        :class:`~transformers.Pipeline`: A suitable pipeline for the task.

    Examples::

        >>> from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

        >>> # Sentiment analysis pipeline
        >>> pipeline('sentiment-analysis')

        >>> # Question answering pipeline, specifying the checkpoint identifier
        >>> pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased')

        >>> # Named entity recognition pipeline, passing in a specific model and tokenizer
        >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        >>> pipeline('ner', model=model, tokenizer=tokenizer)
    """
    # Retrieve the task
    targeted_task, task_options = check_task(task)

    # Use default model/config/tokenizer for the task if no model is provided
    if model is None:
        # At that point framework might still be undetermined
        model = get_default_model(targeted_task, framework, task_options)

    framework = framework or get_framework(model)

    task_class, model_class = targeted_task["impl"], targeted_task[framework]

    # Try to infer tokenizer from model or config name (if provided as str)
    if tokenizer is None:
        if isinstance(model, str):
            tokenizer = model
        elif isinstance(config, str):
            tokenizer = config
        else:
            # Impossible to guess which tokenizer to use here
            raise Exception(
                "Impossible to guess which tokenizer to use. "
                "Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer."
            )

    modelcard = None
    # Try to infer modelcard from model or config name (if provided as str)
    if isinstance(model, str):
        modelcard = model
    elif isinstance(config, str):
        modelcard = config

    # Instantiate tokenizer if needed
    if isinstance(tokenizer, (str, tuple)):
        if isinstance(tokenizer, tuple):
            # For tuple we have (tokenizer name, {kwargs})
            use_fast = tokenizer[1].pop("use_fast", use_fast)
            tokenizer = AutoTokenizer.from_pretrained(
                tokenizer[0], use_fast=use_fast, revision=revision, **tokenizer[1]
            )
        else:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer, revision=revision, use_fast=use_fast)

    # Instantiate config if needed
    if isinstance(config, str):
        config = AutoConfig.from_pretrained(config, revision=revision)

    # Instantiate modelcard if needed
    if isinstance(modelcard, str):
        modelcard = ModelCard.from_pretrained(modelcard, revision=revision)

    # Instantiate model if needed
    if isinstance(model, str):
        # Handle transparent TF/PT model conversion
        model_kwargs = {}
        if framework == "pt" and model.endswith(".h5"):
            model_kwargs["from_tf"] = True
            logger.warning(
                "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
                "Trying to load the model with PyTorch."
            )
        elif framework == "tf" and model.endswith(".bin"):
            model_kwargs["from_pt"] = True
            logger.warning(
                "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
                "Trying to load the model with TensorFlow."
            )

        if model_class is None:
            raise ValueError(
                f"Pipeline is using the {framework} framework, but this framework is not supported by this pipeline."
            )

        model = model_class.from_pretrained(model, config=config, revision=revision, **model_kwargs)
        if task == "translation" and model.config.task_specific_params:
            for key in model.config.task_specific_params:
                if key.startswith("translation"):
                    task = key
                    warnings.warn(
                        '"translation" task was used, instead of "translation_XX_to_YY", defaulting to "{}"'.format(
                            task
                        ),
                        UserWarning,
                    )
                    break

    return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs)
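
One form the factory handles above but the docstring examples do not show: `tokenizer` may also be passed as a `(name, kwargs)` tuple, with `use_fast` popped out of the kwargs dict before the rest is forwarded to `AutoTokenizer.from_pretrained`. A minimal sketch (editorial, not part of the commit; the checkpoint is just the task's default):

    from transformers import pipeline

    nlp = pipeline(
        "sentiment-analysis",
        tokenizer=("distilbert-base-uncased-finetuned-sst-2-english", {"use_fast": False}),
    )
    nlp("This refactor makes the pipelines much easier to navigate.")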

src/transformers/pipelines/base.py
@@ -0,0 +1,622 @@
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
import json
import os
import pickle
import sys
from abc import ABC, abstractmethod
from contextlib import contextmanager
from os.path import abspath, exists
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

from ..file_utils import add_end_docstrings, is_tf_available, is_torch_available
from ..modelcard import ModelCard
from ..tokenization_utils import PreTrainedTokenizer
from ..utils import logging


if is_tf_available():
    import tensorflow as tf

    from ..models.auto.modeling_tf_auto import TFAutoModel

if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import AutoModel

if TYPE_CHECKING:
    from ..modeling_tf_utils import TFPreTrainedModel
    from ..modeling_utils import PreTrainedModel


logger = logging.get_logger(__name__)


def get_framework(model, revision: Optional[str] = None):
    """
    Select framework (TensorFlow or PyTorch) to use.

    Args:
        model (:obj:`str`, :class:`~transformers.PreTrainedModel` or :class:`~transformers.TFPreTrainedModel`):
            If both frameworks are installed, picks the one corresponding to the model passed (either a model class
            or the model name). If no specific model is provided, defaults to using PyTorch.
    """
    if not is_tf_available() and not is_torch_available():
        raise RuntimeError(
            "At least one of TensorFlow 2.0 or PyTorch should be installed. "
            "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
            "To install PyTorch, read the instructions at https://pytorch.org/."
        )
    if isinstance(model, str):
        if is_torch_available() and not is_tf_available():
            model = AutoModel.from_pretrained(model, revision=revision)
        elif is_tf_available() and not is_torch_available():
            model = TFAutoModel.from_pretrained(model, revision=revision)
        else:
            try:
                model = AutoModel.from_pretrained(model, revision=revision)
            except OSError:
                model = TFAutoModel.from_pretrained(model, revision=revision)

    framework = "tf" if model.__class__.__name__.startswith("TF") else "pt"
    return framework


def get_default_model(targeted_task: Dict, framework: Optional[str], task_options: Optional[Any]) -> str:
    """
    Select a default model to use for a given task. Defaults to PyTorch if ambiguous.

    Args:
        targeted_task (:obj:`Dict`):
            Dictionary representing the given task, that should contain default models.

        framework (:obj:`str`, None):
            "pt", "tf" or None, representing a specific framework if it was specified, or None if we don't know yet.

        task_options (:obj:`Any`, None):
            Any further value required by the task to get fully specified, for instance (SRC, TGT) languages for the
            translation task.

    Returns:

        :obj:`str`: The model string representing the default model for this pipeline.
    """
    if is_torch_available() and not is_tf_available():
        framework = "pt"
    elif is_tf_available() and not is_torch_available():
        framework = "tf"

    defaults = targeted_task["default"]
    if task_options:
        if task_options not in defaults:
            raise ValueError("The task does not provide any default models for options {}".format(task_options))
        default_models = defaults[task_options]["model"]
    elif "model" in defaults:
        default_models = targeted_task["default"]["model"]
    else:
        # XXX This error message needs to be updated to be more generic if more tasks are going to become
        # parametrized
        raise ValueError('The task defaults can\'t be correctly selected. You probably meant "translation_XX_to_YY"')

    if framework is None:
        framework = "pt"

    return default_models[framework]
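
# Illustrative sketch (editorial note): resolution order in get_default_model()
# when both frameworks are installed and none was requested:
#
#   task = {"default": {"model": {"pt": "gpt2", "tf": "gpt2"}}}
#   get_default_model(task, None, None)  # -> "gpt2" (framework falls back to "pt")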


class PipelineException(Exception):
    """
    Raised by a :class:`~transformers.Pipeline` when handling __call__.

    Args:
        task (:obj:`str`): The task of the pipeline.
        model (:obj:`str`): The model used by the pipeline.
        reason (:obj:`str`): The error message to display.
    """

    def __init__(self, task: str, model: str, reason: str):
        super().__init__(reason)

        self.task = task
        self.model = model


class ArgumentHandler(ABC):
    """
    Base interface for handling arguments for each :class:`~transformers.pipelines.Pipeline`.
    """

    @abstractmethod
    def __call__(self, *args, **kwargs):
        raise NotImplementedError()


class PipelineDataFormat:
    """
    Base class for all the pipeline supported data formats, both for reading and writing. Supported data formats
    currently include:

    - JSON
    - CSV
    - stdin/stdout (pipe)

    :obj:`PipelineDataFormat` also includes some utilities to work with multi-columns, like mapping from datasets
    columns to pipelines keyword arguments through the :obj:`dataset_kwarg_1=dataset_column_1` format.

    Args:
        output_path (:obj:`str`, `optional`): Where to save the outgoing data.
        input_path (:obj:`str`, `optional`): Where to look for the input data.
        column (:obj:`str`, `optional`): The column to read.
        overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to overwrite the :obj:`output_path`.
    """

    SUPPORTED_FORMATS = ["json", "csv", "pipe"]

    def __init__(
        self,
        output_path: Optional[str],
        input_path: Optional[str],
        column: Optional[str],
        overwrite: bool = False,
    ):
        self.output_path = output_path
        self.input_path = input_path
        self.column = column.split(",") if column is not None else [""]
        self.is_multi_columns = len(self.column) > 1

        if self.is_multi_columns:
            self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column]

        if output_path is not None and not overwrite:
            if exists(abspath(self.output_path)):
                raise OSError("{} already exists on disk".format(self.output_path))

        if input_path is not None:
            if not exists(abspath(self.input_path)):
                raise OSError("{} doesn't exist on disk".format(self.input_path))

    @abstractmethod
    def __iter__(self):
        raise NotImplementedError()

    @abstractmethod
    def save(self, data: Union[dict, List[dict]]):
        """
        Save the provided data object with the representation for the current
        :class:`~transformers.pipelines.PipelineDataFormat`.

        Args:
            data (:obj:`dict` or list of :obj:`dict`): The data to store.
        """
        raise NotImplementedError()

    def save_binary(self, data: Union[dict, List[dict]]) -> str:
        """
        Save the provided data object as a pickle-formatted binary file on the disk.

        Args:
            data (:obj:`dict` or list of :obj:`dict`): The data to store.

        Returns:
            :obj:`str`: Path where the data has been saved.
        """
        path, _ = os.path.splitext(self.output_path)
        binary_path = os.path.extsep.join((path, "pickle"))

        with open(binary_path, "wb+") as f_output:
            pickle.dump(data, f_output)

        return binary_path

    @staticmethod
    def from_str(
        format: str,
        output_path: Optional[str],
        input_path: Optional[str],
        column: Optional[str],
        overwrite=False,
    ) -> "PipelineDataFormat":
        """
        Creates an instance of the right subclass of :class:`~transformers.pipelines.PipelineDataFormat` depending on
        :obj:`format`.

        Args:
            format (:obj:`str`):
                The format of the desired pipeline. Acceptable values are :obj:`"json"`, :obj:`"csv"` or :obj:`"pipe"`.
            output_path (:obj:`str`, `optional`):
                Where to save the outgoing data.
            input_path (:obj:`str`, `optional`):
                Where to look for the input data.
            column (:obj:`str`, `optional`):
                The column to read.
            overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to overwrite the :obj:`output_path`.

        Returns:
            :class:`~transformers.pipelines.PipelineDataFormat`: The proper data format.
        """
        if format == "json":
            return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "csv":
            return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "pipe":
            return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        else:
            raise KeyError("Unknown reader {} (available readers are json/csv/pipe)".format(format))
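
# Illustrative sketch (editorial note): the dataset_kwarg=dataset_column
# mapping described in the PipelineDataFormat docstring. File names here are
# placeholders:
#
#   fmt = PipelineDataFormat.from_str(
#       "csv",
#       output_path="answers.csv",
#       input_path="squad_rows.csv",
#       column="question=q,context=c",
#   )
#   for item in fmt:
#       ...  # each item is {"question": row["q"], "context": row["c"]}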


class CsvPipelineDataFormat(PipelineDataFormat):
    """
    Support for pipelines using CSV data format.

    Args:
        output_path (:obj:`str`, `optional`): Where to save the outgoing data.
        input_path (:obj:`str`, `optional`): Where to look for the input data.
        column (:obj:`str`, `optional`): The column to read.
        overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to overwrite the :obj:`output_path`.
    """

    def __init__(
        self,
        output_path: Optional[str],
        input_path: Optional[str],
        column: Optional[str],
        overwrite=False,
    ):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

    def __iter__(self):
        with open(self.input_path, "r") as f:
            reader = csv.DictReader(f)
            for row in reader:
                if self.is_multi_columns:
                    yield {k: row[c] for k, c in self.column}
                else:
                    yield row[self.column[0]]

    def save(self, data: List[dict]):
        """
        Save the provided data object with the representation for the current
        :class:`~transformers.pipelines.PipelineDataFormat`.

        Args:
            data (:obj:`List[dict]`): The data to store.
        """
        with open(self.output_path, "w") as f:
            if len(data) > 0:
                writer = csv.DictWriter(f, list(data[0].keys()))
                writer.writeheader()
                writer.writerows(data)


class JsonPipelineDataFormat(PipelineDataFormat):
    """
    Support for pipelines using JSON file format.

    Args:
        output_path (:obj:`str`, `optional`): Where to save the outgoing data.
        input_path (:obj:`str`, `optional`): Where to look for the input data.
        column (:obj:`str`, `optional`): The column to read.
        overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to overwrite the :obj:`output_path`.
    """

    def __init__(
        self,
        output_path: Optional[str],
        input_path: Optional[str],
        column: Optional[str],
        overwrite=False,
    ):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

        with open(input_path, "r") as f:
            self._entries = json.load(f)

    def __iter__(self):
        for entry in self._entries:
            if self.is_multi_columns:
                yield {k: entry[c] for k, c in self.column}
            else:
                yield entry[self.column[0]]

    def save(self, data: dict):
        """
        Save the provided data object in a json file.

        Args:
            data (:obj:`dict`): The data to store.
        """
        with open(self.output_path, "w") as f:
            json.dump(data, f)


class PipedPipelineDataFormat(PipelineDataFormat):
    """
    Read data from piped input to the python process. For multi-column data, columns should be separated by \t.

    If columns are provided, then the output will be a dictionary with {column_x: value_x}.

    Args:
        output_path (:obj:`str`, `optional`): Where to save the outgoing data.
        input_path (:obj:`str`, `optional`): Where to look for the input data.
        column (:obj:`str`, `optional`): The column to read.
        overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to overwrite the :obj:`output_path`.
    """

    def __iter__(self):
        for line in sys.stdin:
            # Split for multi-columns
            if "\t" in line:
                line = line.split("\t")
                if self.column:
                    # Dictionary to map arguments
                    yield {kwargs: l for (kwargs, _), l in zip(self.column, line)}
                else:
                    yield tuple(line)

            # No dictionary to map arguments
            else:
                yield line

    def save(self, data: dict):
        """
        Print the data.

        Args:
            data (:obj:`dict`): The data to store.
        """
        print(data)

    def save_binary(self, data: Union[dict, List[dict]]) -> str:
        if self.output_path is None:
            raise KeyError(
                "When using piped input, a pipeline outputting large objects requires an output file path. "
                "Please provide such an output path through the --output argument."
            )

        return super().save_binary(data)


class _ScikitCompat(ABC):
    """
    Interface layer for the Scikit and Keras compatibility.
    """

    @abstractmethod
    def transform(self, X):
        raise NotImplementedError()

    @abstractmethod
    def predict(self, X):
        raise NotImplementedError()


PIPELINE_INIT_ARGS = r"""
    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`):
            The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified
            framework must be installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no
            model is provided.
        task (:obj:`str`, defaults to :obj:`""`):
            A task-identifier for the pipeline.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to -1):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, a positive integer will run the
            model on the associated CUDA device id.
        binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Flag indicating whether the output of the pipeline should happen in a binary format (i.e., pickle) or as
            raw text.
"""


@add_end_docstrings(PIPELINE_INIT_ARGS)
class Pipeline(_ScikitCompat):
    """
    The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
    different pipelines.

    Base class implementing pipelined operations. Pipeline workflow is defined as a sequence of the following
    operations:

        Input -> Tokenization -> Model Inference -> Post-Processing (task dependent) -> Output

    Pipeline supports running on CPU or GPU through the device argument (see below).

    Some pipelines, like for instance :class:`~transformers.FeatureExtractionPipeline` (:obj:`'feature-extraction'`),
    output large tensor objects as nested lists. In order to avoid dumping such large structures as textual data we
    provide the :obj:`binary_output` constructor argument. If set to :obj:`True`, the output will be stored in the
    pickle format.
    """

    default_input_names = None

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        task: str = "",
        args_parser: ArgumentHandler = None,
        device: int = -1,
        binary_output: bool = False,
    ):

        if framework is None:
            framework = get_framework(model)

        self.task = task
        self.model = model
        self.tokenizer = tokenizer
        self.modelcard = modelcard
        self.framework = framework
        self.device = device if framework == "tf" else torch.device("cpu" if device < 0 else "cuda:{}".format(device))
        self.binary_output = binary_output

        # Special handling
        if self.framework == "pt" and self.device.type == "cuda":
            self.model = self.model.to(self.device)

        # Update config with task specific parameters
        task_specific_params = self.model.config.task_specific_params
        if task_specific_params is not None and task in task_specific_params:
            self.model.config.update(task_specific_params.get(task))

    def save_pretrained(self, save_directory: str):
        """
        Save the pipeline's model and tokenizer.

        Args:
            save_directory (:obj:`str`):
                A path to the directory where to save. It will be created if it doesn't exist.
        """
        if os.path.isfile(save_directory):
            logger.error("Provided path ({}) should be a directory, not a file".format(save_directory))
            return
        os.makedirs(save_directory, exist_ok=True)

        self.model.save_pretrained(save_directory)
        self.tokenizer.save_pretrained(save_directory)
        if self.modelcard is not None:
            self.modelcard.save_pretrained(save_directory)

    def transform(self, X):
        """
        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
        """
        return self(X=X)

    def predict(self, X):
        """
        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
        """
        return self(X=X)

    @contextmanager
    def device_placement(self):
        """
        Context Manager allowing tensor allocation on the user-specified device in framework agnostic way.

        Returns:
            Context manager

        Examples::

            # Explicitly ask for tensor allocation on CUDA device :0
            pipe = pipeline(..., device=0)
            with pipe.device_placement():
                # Every framework specific tensor allocation will be done on the requested device
                output = pipe(...)
        """
        if self.framework == "tf":
            with tf.device("/CPU:0" if self.device == -1 else "/device:GPU:{}".format(self.device)):
                yield
        else:
            if self.device.type == "cuda":
                torch.cuda.set_device(self.device)

            yield

    def ensure_tensor_on_device(self, **inputs):
        """
        Ensure PyTorch tensors are on the specified device.

        Args:
            inputs (keyword arguments that should be :obj:`torch.Tensor`): The tensors to place on :obj:`self.device`.

        Return:
            :obj:`Dict[str, torch.Tensor]`: The same as :obj:`inputs` but on the proper device.
        """
        return {name: tensor.to(self.device) for name, tensor in inputs.items()}

    def check_model_type(self, supported_models: Union[List[str], dict]):
        """
        Check if the model class is supported by the pipeline.

        Args:
            supported_models (:obj:`List[str]` or :obj:`dict`):
                The list of models supported by the pipeline, or a dictionary with model class values.
        """
        if not isinstance(supported_models, list):  # Create from a model mapping
            supported_models = [item[1].__name__ for item in supported_models.items()]
        if self.model.__class__.__name__ not in supported_models:
            raise PipelineException(
                self.task,
                self.model.base_model_prefix,
                f"The model '{self.model.__class__.__name__}' is not supported for {self.task}. Supported models are {supported_models}",
            )

    def _parse_and_tokenize(self, inputs, padding=True, add_special_tokens=True, **kwargs):
        """
        Parse arguments and tokenize
        """
        # Parse arguments
        inputs = self.tokenizer(
            inputs,
            add_special_tokens=add_special_tokens,
            return_tensors=self.framework,
            padding=padding,
        )

        return inputs

    def __call__(self, *args, **kwargs):
        inputs = self._parse_and_tokenize(*args, **kwargs)
        return self._forward(inputs)

    def _forward(self, inputs, return_tensors=False):
        """
        Internal framework specific forward dispatching

        Args:
            inputs: dict holding all the keyword arguments required by the model forward method.
            return_tensors: Whether to return native framework (pt/tf) tensors rather than numpy arrays.

        Returns:
            Numpy array
        """
        # Encode for forward
        with self.device_placement():
            if self.framework == "tf":
                # TODO trace model
                predictions = self.model(inputs.data, training=False)[0]
            else:
                with torch.no_grad():
                    inputs = self.ensure_tensor_on_device(**inputs)
                    predictions = self.model(**inputs)[0].cpu()

        if return_tensors:
            return predictions
        else:
            return predictions.numpy()
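
The base class's docstring above motivates `binary_output` with the feature-extraction case, whose outputs are large nested lists of hidden states. A brief usage sketch (editorial, not part of the commit; the exact shape depends on the checkpoint and tokenization):

    import numpy as np
    from transformers import pipeline

    extractor = pipeline("feature-extraction")
    features = extractor("Hello world")
    # Nested lists of hidden states: (batch, num_tokens, hidden_size),
    # e.g. (1, 4, 768) for the default distilbert-base-cased checkpoint.
    print(np.array(features).shape)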

src/transformers/pipelines/conversational.py
@@ -0,0 +1,341 @@
import uuid
from typing import List, Optional, Union

from ..file_utils import add_end_docstrings, is_tf_available, is_torch_available
from ..utils import logging
from .base import PIPELINE_INIT_ARGS, Pipeline


if is_tf_available():
    import tensorflow as tf

if is_torch_available():
    import torch


logger = logging.get_logger(__name__)


class Conversation:
    """
    Utility class containing a conversation and its history. This class is meant to be used as an input to the
    :class:`~transformers.ConversationalPipeline`. The conversation contains a number of utility functions to manage
    the addition of new user inputs and generated model responses. A conversation needs to contain an unprocessed
    user input before being passed to the :class:`~transformers.ConversationalPipeline`. This user input is either
    created when the class is instantiated, or by calling :obj:`conversation.add_user_input("input")` after a
    conversation turn.

    Arguments:
        text (:obj:`str`, `optional`):
            The initial user input to start the conversation. If not provided, a user input needs to be provided
            manually using the :meth:`~transformers.Conversation.add_user_input` method before the conversation can
            begin.
        conversation_id (:obj:`uuid.UUID`, `optional`):
            Unique identifier for the conversation. If not provided, a random UUID4 id will be assigned to the
            conversation.

    Usage::

        conversation = Conversation("Going to the movies tonight - any suggestions?")

        # Steps usually performed by the model when generating a response:
        # 1. Mark the user input as processed (moved to the history)
        conversation.mark_processed()
        # 2. Append a model response
        conversation.append_response("The Big Lebowski.")

        conversation.add_user_input("Is it good?")
    """

    def __init__(self, text: str = None, conversation_id: uuid.UUID = None):
        if not conversation_id:
            conversation_id = uuid.uuid4()
        self.uuid: uuid.UUID = conversation_id
        self.past_user_inputs: List[str] = []
        self.generated_responses: List[str] = []
        self.history: List[int] = []
        self.new_user_input: Optional[str] = text

    def add_user_input(self, text: str, overwrite: bool = False):
        """
        Add a user input to the conversation for the next round. This populates the internal :obj:`new_user_input`
        field.

        Args:
            text (:obj:`str`): The user input for the next conversation round.
            overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not existing and unprocessed user input should be overwritten when this function is called.
        """
        if self.new_user_input:
            if overwrite:
                logger.warning(
                    'User input added while unprocessed input was existing: "{}" was overwritten with: "{}".'.format(
                        self.new_user_input, text
                    )
                )
                self.new_user_input = text
            else:
                logger.warning(
                    'User input added while unprocessed input was existing: "{}" new input ignored: "{}". '
                    "Set `overwrite` to True to overwrite unprocessed user input".format(self.new_user_input, text)
                )
        else:
            self.new_user_input = text

    def mark_processed(self):
        """
        Mark the conversation as processed (moves the content of :obj:`new_user_input` to :obj:`past_user_inputs`)
        and empties the :obj:`new_user_input` field.
        """
        if self.new_user_input:
            self.past_user_inputs.append(self.new_user_input)
        self.new_user_input = None

    def append_response(self, response: str):
        """
        Append a response to the list of generated responses.

        Args:
            response (:obj:`str`): The model generated response.
        """
        self.generated_responses.append(response)

    def set_history(self, history: List[int]):
        """
        Updates the value of the history of the conversation. The history is represented by a list of
        :obj:`token_ids`. The history is used by the model to generate responses based on the previous conversation
        turns.

        Args:
            history (:obj:`List[int]`): History of tokens provided and generated for this conversation.
        """
        self.history = history

    def __repr__(self):
        """
        Generates a string representation of the conversation.

        Return:
            :obj:`str`:

            Example::

                Conversation id: 7d15686b-dc94-49f2-9c4b-c9eac6a1f114
                user >> Going to the movies tonight - any suggestions?
                bot >> The Big Lebowski
        """
        output = "Conversation id: {} \n".format(self.uuid)
        for user_input, generated_response in zip(self.past_user_inputs, self.generated_responses):
            output += "user >> {} \n".format(user_input)
            output += "bot >> {} \n".format(generated_response)
        if self.new_user_input is not None:
            output += "user >> {} \n".format(self.new_user_input)
        return output


@add_end_docstrings(
    PIPELINE_INIT_ARGS,
    r"""
        min_length_for_response (:obj:`int`, `optional`, defaults to 32):
            The minimum length (in number of tokens) for a response.
    """,
)
class ConversationalPipeline(Pipeline):
    """
    Multi-turn conversational pipeline.

    This conversational pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
    identifier: :obj:`"conversational"`.

    The models that this pipeline can use are models that have been fine-tuned on a multi-turn conversational task,
    currently: `'microsoft/DialoGPT-small'`, `'microsoft/DialoGPT-medium'`, `'microsoft/DialoGPT-large'`. See the
    up-to-date list of available models on `huggingface.co/models
    <https://huggingface.co/models?filter=conversational>`__.

    Usage::

        conversational_pipeline = pipeline("conversational")

        conversation_1 = Conversation("Going to the movies tonight - any suggestions?")
        conversation_2 = Conversation("What's the last book you have read?")

        conversational_pipeline([conversation_1, conversation_2])

        conversation_1.add_user_input("Is it an action movie?")
        conversation_2.add_user_input("What is the genre of this book?")

        conversational_pipeline([conversation_1, conversation_2])
    """

    def __init__(self, min_length_for_response=32, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # We need at least an eos_token
        assert self.tokenizer.eos_token_id is not None, "ConversationalPipeline tokenizer should have an EOS token set"
        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.min_length_for_response = min_length_for_response

    def __call__(
        self,
        conversations: Union[Conversation, List[Conversation]],
        clean_up_tokenization_spaces=True,
        **generate_kwargs
    ):
        r"""
        Generate responses for the conversation(s) given as inputs.

        Args:
            conversations (a :class:`~transformers.Conversation` or a list of :class:`~transformers.Conversation`):
                Conversations to generate responses for.
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to clean up the potential extra spaces in the text output.
            generate_kwargs:
                Additional keyword arguments to pass along to the generate method of the model (see the generate
                method corresponding to your framework `here <./model.html#generative-models>`__).

        Returns:
            :class:`~transformers.Conversation` or a list of :class:`~transformers.Conversation`: Conversation(s)
            with updated generated responses for those containing a new user input.
        """

        if isinstance(conversations, Conversation):
            conversations = [conversations]
        # Input validation
        if isinstance(conversations, list):
            for conversation in conversations:
                assert isinstance(
                    conversation, Conversation
                ), "ConversationalPipeline expects a Conversation or list of Conversations as an input"
                if conversation.new_user_input is None:
                    raise ValueError(
                        "Conversation with UUID {} does not contain new user input to process. "
                        "Add user inputs with the conversation's `add_user_input` method".format(conversation.uuid)
                    )
            assert (
                self.tokenizer.pad_token_id is not None or self.tokenizer.eos_token_id is not None
            ), "Please make sure that the tokenizer has a pad_token_id or eos_token_id when using a batch input"
        else:
            raise ValueError("ConversationalPipeline expects a Conversation or list of Conversations as an input")

        with self.device_placement():
            inputs = self._parse_and_tokenize([conversation.new_user_input for conversation in conversations])
            histories = [conversation.history for conversation in conversations]
            max_length = generate_kwargs.get("max_length", self.model.config.max_length)
            inputs = self._concat_inputs_history(inputs, histories, max_length)

            if self.framework == "pt":
                inputs = self.ensure_tensor_on_device(**inputs)
                input_length = inputs["input_ids"].shape[-1]

            elif self.framework == "tf":
                input_length = tf.shape(inputs["input_ids"])[-1].numpy()

            if input_length > 0.9 * max_length:
                logger.warning(
                    "Longest conversation length: {} is bigger than 0.9 * max_length: {}. "
                    "You might consider trimming the early phase of the conversation".format(input_length, max_length)
                )
            generated_responses = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                **generate_kwargs,
            )

            if self.model.config.is_encoder_decoder:
                if self.framework == "pt":
                    history = torch.cat((inputs["input_ids"], generated_responses[:, 1:]), 1)
                elif self.framework == "tf":
                    history = tf.concat([inputs["input_ids"], generated_responses[:, 1:]], 1)
            else:
                history = generated_responses

            history = self._clean_padding_history(history)
            if self.model.config.is_encoder_decoder:
                start_position = 1
            else:
                start_position = input_length

            output = []
            for conversation_index, conversation in enumerate(conversations):
                conversation.mark_processed()
                conversation.generated_responses.append(
                    self.tokenizer.decode(
                        generated_responses[conversation_index][start_position:],
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )
                )
                conversation.set_history(history[conversation_index])
                output.append(conversation)
            if len(output) == 1:
                return output[0]
            else:
                return output
||||
|
||||
def _parse_and_tokenize(self, inputs, **kwargs):
|
||||
"""
|
||||
Parse arguments and tokenize, adding an EOS token at the end of the user input
|
||||
"""
|
||||
# Parse arguments
|
||||
inputs = self.tokenizer(inputs, add_special_tokens=False, padding=False).get("input_ids", [])
|
||||
for input in inputs:
|
||||
input.append(self.tokenizer.eos_token_id)
|
||||
return inputs
|
||||
|
||||
def _clean_padding_history(self, generated_tensor) -> List[List[int]]:
|
||||
"""
|
||||
Cleans the padding history. Padding may be generated in two places when multiple conversations are provided as
|
||||
an input:
|
||||
|
||||
- at the end of the concatenated history and new user input, so that all input to the model have the same
|
||||
length
|
||||
- at the end of the generated response, as some responses will be longer than others
|
||||
This method cleans up these padding token so that the history for each conversation is not impacted by the
|
||||
batching process.
|
||||
"""
|
||||
outputs = []
|
||||
for sequence in generated_tensor:
|
||||
sequence_tokens = []
|
||||
is_previous_pad = False
|
||||
for token in sequence:
|
||||
if token == self.tokenizer.pad_token_id:
|
||||
if self.tokenizer.pad_token_id != self.tokenizer.eos_token_id:
|
||||
continue
|
||||
if is_previous_pad:
|
||||
continue
|
||||
else:
|
||||
is_previous_pad = True
|
||||
else:
|
||||
is_previous_pad = False
|
||||
if self.framework == "pt":
|
||||
sequence_tokens.append(token.item())
|
||||
else:
|
||||
sequence_tokens.append(int(token.numpy()))
|
||||
|
||||
outputs.append(sequence_tokens)
|
||||
return outputs
|
||||
|
||||
def _concat_inputs_history(self, inputs: List[List[int]], histories: List[Optional[List[int]]], max_length: int):
|
||||
"""
|
||||
Builds an input prepended by the history for this conversation, allowing multi-turn conversation with context
|
||||
"""
|
||||
outputs = []
|
||||
for new_input, history in zip(inputs, histories):
|
||||
if history is not None:
|
||||
new_input = history + new_input
|
||||
if len(new_input) > max_length - self.min_length_for_response:
|
||||
cutoff_eos_index = 0
|
||||
while len(new_input) - cutoff_eos_index > max_length - self.min_length_for_response:
|
||||
if cutoff_eos_index >= len(new_input):
|
||||
break
|
||||
cutoff_eos_index = new_input[cutoff_eos_index:].index(self.tokenizer.eos_token_id)
|
||||
if cutoff_eos_index == 0 or cutoff_eos_index == len(new_input) - 1:
|
||||
break
|
||||
else:
|
||||
new_input = new_input[cutoff_eos_index + 1 :]
|
||||
outputs.append(new_input)
|
||||
padded_outputs = self.tokenizer.pad(
|
||||
{"input_ids": outputs}, padding="longest", return_attention_mask=True, return_tensors=self.framework
|
||||
)
|
||||
return padded_outputs
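

# A minimal usage sketch for the class above, kept as a comment so the module
# stays import-safe. The "conversational" task alias and its default checkpoint
# are assumptions based on the task registry this refactor preserves:
#
#     from transformers import Conversation, pipeline
#
#     chatbot = pipeline("conversational")
#     conversation = Conversation("Going to the movies tonight - any suggestions?")
#     conversation = chatbot(conversation)
#     print(conversation.generated_responses[-1])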
@@ -0,0 +1,82 @@
from typing import TYPE_CHECKING, Optional, Union

from ..modelcard import ModelCard
from ..tokenization_utils import PreTrainedTokenizer
from .base import ArgumentHandler, Pipeline


if TYPE_CHECKING:
    from ..modeling_tf_utils import TFPreTrainedModel
    from ..modeling_utils import PreTrainedModel


# Can't use @add_end_docstrings(PIPELINE_INIT_ARGS) here because this one does not accept `binary_output`
class FeatureExtractionPipeline(Pipeline):
    """
    Feature extraction pipeline using no model head. This pipeline extracts the hidden states from the base
    transformer, which can be used as features in downstream tasks.

    This feature extraction pipeline can currently be loaded from :func:`~transformers.pipeline` using the task
    identifier: :obj:`"feature-extraction"`.

    All models may be used for this pipeline. See a list of all models, including community-contributed models on
    `huggingface.co/models <https://huggingface.co/models>`__.

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`):
            The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
            must be installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified and
            both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no model
            is provided.
        task (:obj:`str`, defaults to :obj:`""`):
            A task-identifier for the pipeline.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to -1):
            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive value will run the
            model on the associated CUDA device id.
    """

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        task: str = "",
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=True,
            task=task,
        )

    def __call__(self, *args, **kwargs):
        """
        Extract the features of the input(s).

        Args:
            args (:obj:`str` or :obj:`List[str]`): One or several texts (or one list of texts) to get the features of.

        Return:
            A nested list of :obj:`float`: The features computed by the model.
        """
        return super().__call__(*args, **kwargs).tolist()
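

# A minimal usage sketch, commented out to keep the module import-safe. The
# default checkpoint resolved by `pipeline("feature-extraction")` is an
# assumption and may be downloaded on first use:
#
#     from transformers import pipeline
#
#     extractor = pipeline("feature-extraction")
#     features = extractor("We are very happy to show you the Transformers library.")
#     # `features` is a nested list of floats with shape [batch, seq_len, hidden_size]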
@@ -0,0 +1,194 @@
from typing import TYPE_CHECKING, Optional, Union

import numpy as np

from ..file_utils import add_end_docstrings, is_tf_available, is_torch_available
from ..modelcard import ModelCard
from ..tokenization_utils import PreTrainedTokenizer
from ..utils import logging
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline, PipelineException


if TYPE_CHECKING:
    from ..modeling_tf_utils import TFPreTrainedModel
    from ..modeling_utils import PreTrainedModel

if is_tf_available():
    import tensorflow as tf

    from ..models.auto.modeling_tf_auto import TF_MODEL_WITH_LM_HEAD_MAPPING

if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import MODEL_FOR_MASKED_LM_MAPPING


logger = logging.get_logger(__name__)


@add_end_docstrings(
    PIPELINE_INIT_ARGS,
    r"""
        top_k (:obj:`int`, defaults to 5): The number of predictions to return.
    """,
)
class FillMaskPipeline(Pipeline):
    """
    Masked language modeling prediction pipeline using any :obj:`ModelWithLMHead`. See the `masked language modeling
    examples <../task_summary.html#masked-language-modeling>`__ for more information.

    This mask filling pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
    identifier: :obj:`"fill-mask"`.

    The models that this pipeline can use are models that have been trained with a masked language modeling objective,
    which includes the bi-directional models in the library. See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=masked-lm>`__.

    .. note::

        This pipeline only works for inputs with exactly one token masked.
    """

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        top_k=5,
        task: str = "",
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=True,
            task=task,
        )

        self.check_model_type(TF_MODEL_WITH_LM_HEAD_MAPPING if self.framework == "tf" else MODEL_FOR_MASKED_LM_MAPPING)
        self.top_k = top_k

    def ensure_exactly_one_mask_token(self, masked_index: np.ndarray):
        numel = np.prod(masked_index.shape)
        if numel > 1:
            raise PipelineException(
                "fill-mask",
                self.model.base_model_prefix,
                f"More than one mask_token ({self.tokenizer.mask_token}) is not supported",
            )
        elif numel < 1:
            raise PipelineException(
                "fill-mask",
                self.model.base_model_prefix,
                f"No mask_token ({self.tokenizer.mask_token}) found on the input",
            )

    def __call__(self, *args, targets=None, top_k: Optional[int] = None, **kwargs):
        """
        Fill the masked token in the text(s) given as inputs.

        Args:
            args (:obj:`str` or :obj:`List[str]`):
                One or several texts (or one list of prompts) with masked tokens.
            targets (:obj:`str` or :obj:`List[str]`, `optional`):
                When passed, the model will return the scores for the passed token or tokens rather than the top k
                predictions in the entire vocabulary. If the provided targets are not in the model vocab, they will be
                tokenized and the first resulting token will be used (with a warning).
            top_k (:obj:`int`, `optional`):
                When passed, overrides the number of predictions to return.

        Return:
            A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the following keys:

            - **sequence** (:obj:`str`) -- The corresponding input with the mask token prediction.
            - **score** (:obj:`float`) -- The corresponding probability.
            - **token** (:obj:`int`) -- The predicted token id (to replace the masked one).
            - **token_str** (:obj:`str`) -- The predicted token (to replace the masked one).
        """
        inputs = self._parse_and_tokenize(*args, **kwargs)
        outputs = self._forward(inputs, return_tensors=True)

        results = []
        batch_size = outputs.shape[0] if self.framework == "tf" else outputs.size(0)

        if targets is not None:
            if len(targets) == 0 or len(targets[0]) == 0:
                raise ValueError("At least one target must be provided when `targets` is passed.")
            if isinstance(targets, str):
                targets = [targets]

            targets_proc = []
            for target in targets:
                target_enc = self.tokenizer.tokenize(target)
                if len(target_enc) > 1 or target_enc[0] == self.tokenizer.unk_token:
                    logger.warning(
                        "The specified target token `{}` does not exist in the model vocabulary. Replacing with `{}`.".format(
                            target, target_enc[0]
                        )
                    )
                targets_proc.append(target_enc[0])
            target_inds = np.array(self.tokenizer.convert_tokens_to_ids(targets_proc))

        for i in range(batch_size):
            input_ids = inputs["input_ids"][i]
            result = []

            if self.framework == "tf":
                masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy()

                # Fill mask pipeline supports only one ${mask_token} per sample
                self.ensure_exactly_one_mask_token(masked_index)

                logits = outputs[i, masked_index.item(), :]
                probs = tf.nn.softmax(logits)
                if targets is None:
                    topk = tf.math.top_k(probs, k=top_k if top_k is not None else self.top_k)
                    values, predictions = topk.values.numpy(), topk.indices.numpy()
                else:
                    values = tf.gather_nd(probs, tf.reshape(target_inds, (-1, 1)))
                    sort_inds = tf.reverse(tf.argsort(values), [0])
                    values = tf.gather_nd(values, tf.reshape(sort_inds, (-1, 1))).numpy()
                    predictions = target_inds[sort_inds.numpy()]
            else:
                masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False)

                # Fill mask pipeline supports only one ${mask_token} per sample
                self.ensure_exactly_one_mask_token(masked_index.numpy())

                logits = outputs[i, masked_index.item(), :]
                probs = logits.softmax(dim=0)
                if targets is None:
                    values, predictions = probs.topk(top_k if top_k is not None else self.top_k)
                else:
                    values = probs[..., target_inds]
                    sort_inds = list(reversed(values.argsort(dim=-1)))
                    values = values[..., sort_inds]
                    predictions = target_inds[sort_inds]

            for v, p in zip(values.tolist(), predictions.tolist()):
                tokens = input_ids.numpy()
                tokens[masked_index] = p
                # Filter padding out:
                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
                result.append(
                    {
                        "sequence": self.tokenizer.decode(tokens),
                        "score": v,
                        "token": p,
                        "token_str": self.tokenizer.convert_ids_to_tokens(p),
                    }
                )

            # Append
            results += [result]

        if len(results) == 1:
            return results[0]
        return results
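

# A minimal usage sketch, commented out to keep the module import-safe. The mask
# token differs between checkpoints, so it is read from the tokenizer rather
# than hard-coded:
#
#     from transformers import pipeline
#
#     unmasker = pipeline("fill-mask")
#     text = f"Paris is the {unmasker.tokenizer.mask_token} of France."
#     print(unmasker(text, top_k=2))
#     # [{"sequence": ..., "score": ..., "token": ..., "token_str": ...}, ...]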
@@ -0,0 +1,488 @@
from collections.abc import Iterable
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

import numpy as np

from ..data import SquadExample, SquadFeatures, squad_convert_examples_to_features
from ..file_utils import add_end_docstrings, is_tf_available, is_torch_available
from ..modelcard import ModelCard
from ..tokenization_utils import PreTrainedTokenizer
from ..tokenization_utils_base import PaddingStrategy
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline


if TYPE_CHECKING:
    from ..modeling_tf_utils import TFPreTrainedModel
    from ..modeling_utils import PreTrainedModel

if is_tf_available():
    import tensorflow as tf

    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING

if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING


class QuestionAnsweringArgumentHandler(ArgumentHandler):
    """
    QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to
    internal :class:`~transformers.SquadExample`.

    QuestionAnsweringArgumentHandler manages all the possible ways to create a :class:`~transformers.SquadExample` from
    the command-line supplied arguments.
    """

    def normalize(self, item):
        if isinstance(item, SquadExample):
            return item
        elif isinstance(item, dict):
            for k in ["question", "context"]:
                if k not in item:
                    raise KeyError("You need to provide a dictionary with keys {question:..., context:...}")
                elif item[k] is None:
                    raise ValueError("`{}` cannot be None".format(k))
                elif isinstance(item[k], str) and len(item[k]) == 0:
                    raise ValueError("`{}` cannot be empty".format(k))

            return QuestionAnsweringPipeline.create_sample(**item)
        raise ValueError("{} argument needs to be of type (SquadExample, dict)".format(item))

    def __call__(self, *args, **kwargs):
        # Detect where the actual inputs are
        if args is not None and len(args) > 0:
            if len(args) == 1:
                inputs = args[0]
            elif len(args) == 2 and {type(el) for el in args} == {str}:
                inputs = [{"question": args[0], "context": args[1]}]
            else:
                inputs = list(args)
        # Generic compatibility with sklearn and Keras
        # Batched data
        elif "X" in kwargs:
            inputs = kwargs["X"]
        elif "data" in kwargs:
            inputs = kwargs["data"]
        elif "question" in kwargs and "context" in kwargs:
            if isinstance(kwargs["question"], list) and isinstance(kwargs["context"], str):
                inputs = [{"question": Q, "context": kwargs["context"]} for Q in kwargs["question"]]
            elif isinstance(kwargs["question"], list) and isinstance(kwargs["context"], list):
                if len(kwargs["question"]) != len(kwargs["context"]):
                    raise ValueError("Questions and contexts don't have the same lengths")

                inputs = [{"question": Q, "context": C} for Q, C in zip(kwargs["question"], kwargs["context"])]
            elif isinstance(kwargs["question"], str) and isinstance(kwargs["context"], str):
                inputs = [{"question": kwargs["question"], "context": kwargs["context"]}]
            else:
                raise ValueError("Arguments can't be understood")
        else:
            raise ValueError("Unknown arguments {}".format(kwargs))

        # Normalize inputs
        if isinstance(inputs, dict):
            inputs = [inputs]
        elif isinstance(inputs, Iterable):
            # Copy to avoid overriding arguments
            inputs = [i for i in inputs]
        else:
            raise ValueError("Invalid arguments {}".format(inputs))

        for i, item in enumerate(inputs):
            inputs[i] = self.normalize(item)

        return inputs


@add_end_docstrings(PIPELINE_INIT_ARGS)
class QuestionAnsweringPipeline(Pipeline):
    """
    Question Answering pipeline using any :obj:`ModelForQuestionAnswering`. See the `question answering examples
    <../task_summary.html#question-answering>`__ for more information.

    This question answering pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
    task identifier: :obj:`"question-answering"`.

    The models that this pipeline can use are models that have been fine-tuned on a question answering task. See the
    up-to-date list of available models on `huggingface.co/models
    <https://huggingface.co/models?filter=question-answering>`__.
    """

    default_input_names = "question,context"

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        device: int = -1,
        task: str = "",
        **kwargs
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            device=device,
            task=task,
            **kwargs,
        )

        self._args_parser = QuestionAnsweringArgumentHandler()
        self.check_model_type(
            TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING if self.framework == "tf" else MODEL_FOR_QUESTION_ANSWERING_MAPPING
        )

    @staticmethod
    def create_sample(
        question: Union[str, List[str]], context: Union[str, List[str]]
    ) -> Union[SquadExample, List[SquadExample]]:
        """
        QuestionAnsweringPipeline leverages the :class:`~transformers.SquadExample` internally. This helper method
        encapsulates all the logic for converting question(s) and context(s) to :class:`~transformers.SquadExample`.

        We currently support extractive question answering.

        Arguments:
            question (:obj:`str` or :obj:`List[str]`): The question(s) asked.
            context (:obj:`str` or :obj:`List[str]`): The context(s) in which we will look for the answer.

        Returns:
            One or a list of :class:`~transformers.SquadExample`: The corresponding :class:`~transformers.SquadExample`
            grouping question and context.
        """
        if isinstance(question, list):
            return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
        else:
            return SquadExample(None, question, context, None, None, None)

    def __call__(self, *args, **kwargs):
        """
        Answer the question(s) given as inputs by using the context(s).

        Args:
            args (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`):
                One or several :class:`~transformers.SquadExample` containing the question and context.
            X (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`, `optional`):
                One or several :class:`~transformers.SquadExample` containing the question and context (will be treated
                the same way as if passed as the first positional argument).
            data (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`, `optional`):
                One or several :class:`~transformers.SquadExample` containing the question and context (will be treated
                the same way as if passed as the first positional argument).
            question (:obj:`str` or :obj:`List[str]`):
                One or several question(s) (must be used in conjunction with the :obj:`context` argument).
            context (:obj:`str` or :obj:`List[str]`):
                One or several context(s) associated with the question(s) (must be used in conjunction with the
                :obj:`question` argument).
            topk (:obj:`int`, `optional`, defaults to 1):
                The number of answers to return (will be chosen by order of likelihood).
            doc_stride (:obj:`int`, `optional`, defaults to 128):
                If the context is too long to fit with the question for the model, it will be split in several chunks
                with some overlap. This argument controls the size of that overlap.
            max_answer_len (:obj:`int`, `optional`, defaults to 15):
                The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
            max_seq_len (:obj:`int`, `optional`, defaults to 384):
                The maximum length of the total sentence (context + question) after tokenization. The context will be
                split in several chunks (using :obj:`doc_stride`) if needed.
            max_question_len (:obj:`int`, `optional`, defaults to 64):
                The maximum length of the question after tokenization. It will be truncated if needed.
            handle_impossible_answer (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not we accept impossible as an answer.

        Return:
            A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the following keys:

            - **score** (:obj:`float`) -- The probability associated to the answer.
            - **start** (:obj:`int`) -- The start index of the answer (in the tokenized version of the input).
            - **end** (:obj:`int`) -- The end index of the answer (in the tokenized version of the input).
            - **answer** (:obj:`str`) -- The answer to the question.
        """
        # Set default values
        kwargs.setdefault("padding", "longest")
        kwargs.setdefault("topk", 1)
        kwargs.setdefault("doc_stride", 128)
        kwargs.setdefault("max_answer_len", 15)
        kwargs.setdefault("max_seq_len", 384)
        kwargs.setdefault("max_question_len", 64)
        kwargs.setdefault("handle_impossible_answer", False)

        if kwargs["topk"] < 1:
            raise ValueError("topk parameter should be >= 1 (got {})".format(kwargs["topk"]))

        if kwargs["max_answer_len"] < 1:
            raise ValueError("max_answer_len parameter should be >= 1 (got {})".format(kwargs["max_answer_len"]))

        # Convert inputs to features
        examples = self._args_parser(*args, **kwargs)
        if not self.tokenizer.is_fast:
            features_list = [
                squad_convert_examples_to_features(
                    examples=[example],
                    tokenizer=self.tokenizer,
                    max_seq_length=kwargs["max_seq_len"],
                    doc_stride=kwargs["doc_stride"],
                    max_query_length=kwargs["max_question_len"],
                    padding_strategy=PaddingStrategy.MAX_LENGTH.value,
                    is_training=False,
                    tqdm_enabled=False,
                )
                for example in examples
            ]
        else:
            features_list = []
            for example in examples:
                # Define the side we want to truncate / pad and the text/pair sorting
                question_first = bool(self.tokenizer.padding_side == "right")

                encoded_inputs = self.tokenizer(
                    text=example.question_text if question_first else example.context_text,
                    text_pair=example.context_text if question_first else example.question_text,
                    padding=kwargs["padding"],
                    truncation="only_second" if question_first else "only_first",
                    max_length=kwargs["max_seq_len"],
                    stride=kwargs["doc_stride"],
                    return_tensors="np",
                    return_token_type_ids=True,
                    return_overflowing_tokens=True,
                    return_offsets_mapping=True,
                    return_special_tokens_mask=True,
                )

                # When the input is too long, it's converted in a batch of inputs with overflowing tokens
                # and a stride of overlap between the inputs. If a batch of inputs is given, a special output
                # "overflow_to_sample_mapping" indicates which member of the encoded batch belongs to which
                # original sample. Here we tokenize examples one-by-one so we don't need to use
                # "overflow_to_sample_mapping". "num_span" is the number of output samples generated from the
                # overflowing tokens.
                num_spans = len(encoded_inputs["input_ids"])

                # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
                # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens)
                p_mask = np.asarray(
                    [
                        [tok != 1 if question_first else 0 for tok in encoded_inputs.sequence_ids(span_id)]
                        for span_id in range(num_spans)
                    ]
                )

                # keep the cls_token unmasked (some models use it to indicate unanswerable questions)
                if self.tokenizer.cls_token_id:
                    cls_index = np.nonzero(encoded_inputs["input_ids"] == self.tokenizer.cls_token_id)
                    p_mask[cls_index] = 0

                features = []
                for span_idx in range(num_spans):
                    features.append(
                        SquadFeatures(
                            input_ids=encoded_inputs["input_ids"][span_idx],
                            attention_mask=encoded_inputs["attention_mask"][span_idx],
                            token_type_ids=encoded_inputs["token_type_ids"][span_idx],
                            p_mask=p_mask[span_idx].tolist(),
                            encoding=encoded_inputs[span_idx],
                            # We don't use the rest of the values - and actually
                            # for Fast tokenizer we could totally avoid using SquadFeatures and SquadExample
                            cls_index=None,
                            token_to_orig_map={},
                            example_index=0,
                            unique_id=0,
                            paragraph_len=0,
                            token_is_max_context=0,
                            tokens=[],
                            start_position=0,
                            end_position=0,
                            is_impossible=False,
                            qas_id=None,
                        )
                    )
                features_list.append(features)

        all_answers = []
        for features, example in zip(features_list, examples):
            model_input_names = self.tokenizer.model_input_names + ["input_ids"]
            fw_args = {k: [feature.__dict__[k] for feature in features] for k in model_input_names}

            # Manage tensor allocation on correct device
            with self.device_placement():
                if self.framework == "tf":
                    fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
                    start, end = self.model(fw_args)[:2]
                    start, end = start.numpy(), end.numpy()
                else:
                    with torch.no_grad():
                        # Retrieve the score for the context tokens only (removing question tokens)
                        fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
                        # On Windows, the default int type in numpy is np.int32 so we get some non-long tensors.
                        fw_args = {k: v.long() if v.dtype == torch.int32 else v for (k, v) in fw_args.items()}
                        start, end = self.model(**fw_args)[:2]
                        start, end = start.cpu().numpy(), end.cpu().numpy()

            min_null_score = 1000000  # large and positive
            answers = []
            for (feature, start_, end_) in zip(features, start, end):
                # Ensure padded tokens & question tokens cannot belong to the set of candidate answers.
                undesired_tokens = np.abs(np.array(feature.p_mask) - 1) & feature.attention_mask

                # Generate mask
                undesired_tokens_mask = undesired_tokens == 0.0

                # Make sure non-context indexes in the tensor cannot contribute to the softmax
                start_ = np.where(undesired_tokens_mask, -10000.0, start_)
                end_ = np.where(undesired_tokens_mask, -10000.0, end_)

                # Normalize logits and spans to retrieve the answer
                start_ = np.exp(start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)))
                end_ = np.exp(end_ - np.log(np.sum(np.exp(end_), axis=-1, keepdims=True)))

                if kwargs["handle_impossible_answer"]:
                    min_null_score = min(min_null_score, (start_[0] * end_[0]).item())

                # Mask CLS
                start_[0] = end_[0] = 0.0

                starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
                if not self.tokenizer.is_fast:
                    char_to_word = np.array(example.char_to_word_offset)

                    # Convert the answer (tokens) back to the original text
                    # Score: score from the model
                    # Start: Index of the first character of the answer in the context string
                    # End: Index of the character following the last character of the answer in the context string
                    # Answer: Plain text of the answer
                    answers += [
                        {
                            "score": score.item(),
                            "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
                            "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
                            "answer": " ".join(
                                example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
                            ),
                        }
                        for s, e, score in zip(starts, ends, scores)
                    ]
                else:
                    # Convert the answer (tokens) back to the original text
                    # Score: score from the model
                    # Start: Index of the first character of the answer in the context string
                    # End: Index of the character following the last character of the answer in the context string
                    # Answer: Plain text of the answer
                    question_first = bool(self.tokenizer.padding_side == "right")
                    enc = feature.encoding

                    # Sometimes the max probability token is in the middle of a word so:
                    # - we start by finding the right word containing the token with `token_to_word`
                    # - then we convert this word into a character span with `word_to_chars`
                    answers += [
                        {
                            "score": score.item(),
                            "start": enc.word_to_chars(
                                enc.token_to_word(s), sequence_index=1 if question_first else 0
                            )[0],
                            "end": enc.word_to_chars(enc.token_to_word(e), sequence_index=1 if question_first else 0)[
                                1
                            ],
                            "answer": example.context_text[
                                enc.word_to_chars(enc.token_to_word(s), sequence_index=1 if question_first else 0)[
                                    0
                                ] : enc.word_to_chars(enc.token_to_word(e), sequence_index=1 if question_first else 0)[
                                    1
                                ]
                            ],
                        }
                        for s, e, score in zip(starts, ends, scores)
                    ]

            if kwargs["handle_impossible_answer"]:
                answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""})

            answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: kwargs["topk"]]
            all_answers += answers

        if len(all_answers) == 1:
            return all_answers[0]
        return all_answers

    def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
        """
        Takes the output of any :obj:`ModelForQuestionAnswering` and generates probabilities for each span to be the
        actual answer.

        In addition, it filters out some unwanted/impossible cases, like the answer length being greater than
        max_answer_len or the answer end position coming before the start position. The method supports outputting the
        k-best answers through the topk argument.

        Args:
            start (:obj:`np.ndarray`): Individual start probabilities for each token.
            end (:obj:`np.ndarray`): Individual end probabilities for each token.
            topk (:obj:`int`): Indicates how many possible answer span(s) to extract from the model output.
            max_answer_len (:obj:`int`): Maximum size of the answer to extract from the model's output.
        """
        # Ensure we have batch axis
        if start.ndim == 1:
            start = start[None]

        if end.ndim == 1:
            end = end[None]

        # Compute the score of each tuple(start, end) to be the real answer
        outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

        # Remove candidates with end < start or end - start > max_answer_len
        candidates = np.tril(np.triu(outer), max_answer_len - 1)

        # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
        scores_flat = candidates.flatten()
        if topk == 1:
            idx_sort = [np.argmax(scores_flat)]
        elif len(scores_flat) < topk:
            idx_sort = np.argsort(-scores_flat)
        else:
            idx = np.argpartition(-scores_flat, topk)[0:topk]
            idx_sort = idx[np.argsort(-scores_flat[idx])]

        start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
        return start, end, candidates[0, start, end]

    def span_to_answer(self, text: str, start: int, end: int) -> Dict[str, Union[str, int]]:
        """
        When decoding from token probabilities, this method maps token indexes to the actual words in the initial
        context.

        Args:
            text (:obj:`str`): The actual context to extract the answer from.
            start (:obj:`int`): The answer starting token index.
            end (:obj:`int`): The answer end token index.

        Returns:
            Dictionary like :obj:`{'answer': str, 'start': int, 'end': int}`
        """
        words = []
        token_idx = char_start_idx = char_end_idx = chars_idx = 0

        for i, word in enumerate(text.split(" ")):
            token = self.tokenizer.tokenize(word)

            # Append words if they are in the span
            if start <= token_idx <= end:
                if token_idx == start:
                    char_start_idx = chars_idx

                if token_idx == end:
                    char_end_idx = chars_idx + len(word)

                words += [word]

            # Stop if we went over the end of the answer
            if token_idx > end:
                break

            # Append the subtokenization length to the running index
            token_idx += len(token)
            chars_idx += len(word) + 1

        # Join text with spaces
        return {
            "answer": " ".join(words),
            "start": max(0, char_start_idx),
            "end": min(len(text), char_end_idx),
        }
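

# A minimal usage sketch, commented out to keep the module import-safe. The
# default question-answering checkpoint is an assumption and may be downloaded
# on first use; the exact answer and score are model-dependent:
#
#     from transformers import pipeline
#
#     qa = pipeline("question-answering")
#     result = qa(question="Where do I live?", context="My name is Sarah and I live in London.")
#     # {"score": <float>, "start": <int>, "end": <int>, "answer": "London"} for a typical model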
@@ -0,0 +1,280 @@
import collections

import numpy as np

from ..file_utils import add_end_docstrings, is_torch_available, requires_pandas
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline


if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING


class TableQuestionAnsweringArgumentHandler(ArgumentHandler):
    """
    Handles arguments for the TableQuestionAnsweringPipeline
    """

    def __call__(self, table=None, query=None, sequential=False, padding=True, truncation=True):
        # Returns tqa_pipeline_inputs of shape:
        # [
        #   {"table": pd.DataFrame, "query": List[str]},
        #   ...,
        #   {"table": pd.DataFrame, "query": List[str]}
        # ]
        requires_pandas(self)
        import pandas as pd

        if table is None:
            raise ValueError("Keyword argument `table` cannot be None.")
        elif query is None:
            if isinstance(table, dict) and table.get("query") is not None and table.get("table") is not None:
                tqa_pipeline_inputs = [table]
            elif isinstance(table, list) and len(table) > 0:
                if not all(isinstance(d, dict) for d in table):
                    raise ValueError(
                        f"Keyword argument `table` should be a list of dict, but is {[type(d) for d in table]}"
                    )

                if table[0].get("query") is not None and table[0].get("table") is not None:
                    tqa_pipeline_inputs = table
                else:
                    raise ValueError(
                        "If keyword argument `table` is a list of dictionaries, each dictionary should have a `table` "
                        f"and a `query` key, but the first dictionary has keys {table[0].keys()}."
                    )
            else:
                raise ValueError(
                    f"Invalid input. Keyword argument `table` should be either of type `dict` or `list`, but "
                    f"is {type(table)}"
                )
        else:
            tqa_pipeline_inputs = [{"table": table, "query": query}]

        for tqa_pipeline_input in tqa_pipeline_inputs:
            if not isinstance(tqa_pipeline_input["table"], pd.DataFrame):
                if tqa_pipeline_input["table"] is None:
                    raise ValueError("Table cannot be None.")

                tqa_pipeline_input["table"] = pd.DataFrame(tqa_pipeline_input["table"])

        return tqa_pipeline_inputs, sequential, padding, truncation


@add_end_docstrings(PIPELINE_INIT_ARGS)
class TableQuestionAnsweringPipeline(Pipeline):
    """
    Table Question Answering pipeline using a :obj:`ModelForTableQuestionAnswering`. This pipeline is only available in
    PyTorch.

    This tabular question answering pipeline can currently be loaded from :func:`~transformers.pipeline` using the
    following task identifier: :obj:`"table-question-answering"`.

    The models that this pipeline can use are models that have been fine-tuned on a tabular question answering task.
    See the up-to-date list of available models on `huggingface.co/models
    <https://huggingface.co/models?filter=table-question-answering>`__.
    """

    default_input_names = "table,query"

    def __init__(self, args_parser=TableQuestionAnsweringArgumentHandler(), *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._args_parser = args_parser

        if self.framework == "tf":
            raise ValueError("The TableQuestionAnsweringPipeline is only available in PyTorch.")

        self.check_model_type(MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING)

        self.aggregate = bool(getattr(self.model.config, "aggregation_labels")) and bool(
            getattr(self.model.config, "num_aggregation_labels")
        )

    def batch_inference(self, **inputs):
        with torch.no_grad():
            return self.model(**inputs)

    def sequential_inference(self, **inputs):
        """
        Inference used for models that need to process sequences in a sequential fashion, like the SQA models which
        handle conversational queries related to a table.
        """
        with torch.no_grad():
            all_logits = []
            all_aggregations = []
            prev_answers = None
            batch_size = inputs["input_ids"].shape[0]

            input_ids = inputs["input_ids"].to(self.device)
            attention_mask = inputs["attention_mask"].to(self.device)
            token_type_ids = inputs["token_type_ids"].to(self.device)
            token_type_ids_example = None

            for index in range(batch_size):
                # If sequences have already been processed, the token type IDs will be created according to the
                # previous answer.
                if prev_answers is not None:
                    prev_labels_example = token_type_ids_example[:, 3]  # shape (seq_len,)
                    model_labels = np.zeros_like(prev_labels_example.cpu().numpy())  # shape (seq_len,)

                    token_type_ids_example = token_type_ids[index]  # shape (seq_len, 7)
                    for i in range(model_labels.shape[0]):
                        segment_id = token_type_ids_example[:, 0].tolist()[i]
                        col_id = token_type_ids_example[:, 1].tolist()[i] - 1
                        row_id = token_type_ids_example[:, 2].tolist()[i] - 1

                        if row_id >= 0 and col_id >= 0 and segment_id == 1:
                            model_labels[i] = int(prev_answers[(col_id, row_id)])

                    token_type_ids_example[:, 3] = torch.from_numpy(model_labels).type(torch.long).to(self.device)

                input_ids_example = input_ids[index]
                attention_mask_example = attention_mask[index]  # shape (seq_len,)
                token_type_ids_example = token_type_ids[index]  # shape (seq_len, 7)
                outputs = self.model(
                    input_ids=input_ids_example.unsqueeze(0),
                    attention_mask=attention_mask_example.unsqueeze(0),
                    token_type_ids=token_type_ids_example.unsqueeze(0),
                )
                logits = outputs.logits

                if self.aggregate:
                    all_aggregations.append(outputs.logits_aggregation)

                all_logits.append(logits)

                dist_per_token = torch.distributions.Bernoulli(logits=logits)
                probabilities = dist_per_token.probs * attention_mask_example.type(torch.float32).to(
                    dist_per_token.probs.device
                )

                coords_to_probs = collections.defaultdict(list)
                for i, p in enumerate(probabilities.squeeze().tolist()):
                    segment_id = token_type_ids_example[:, 0].tolist()[i]
                    col = token_type_ids_example[:, 1].tolist()[i] - 1
                    row = token_type_ids_example[:, 2].tolist()[i] - 1
                    if col >= 0 and row >= 0 and segment_id == 1:
                        coords_to_probs[(col, row)].append(p)

                prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs}

            logits_batch = torch.cat(tuple(all_logits), 0)

            return (logits_batch,) if not self.aggregate else (logits_batch, torch.cat(tuple(all_aggregations), 0))

    def __call__(self, *args, **kwargs):
        r"""
        Answers queries according to a table. The pipeline accepts several types of inputs which are detailed below:

        - ``pipeline(table, query)``
        - ``pipeline(table, [query])``
        - ``pipeline(table=table, query=query)``
        - ``pipeline(table=table, query=[query])``
        - ``pipeline({"table": table, "query": query})``
        - ``pipeline({"table": table, "query": [query]})``
        - ``pipeline([{"table": table, "query": query}, {"table": table, "query": query}])``

        The :obj:`table` argument should be a dict or a DataFrame built from that dict, containing the whole table:

        Example::

            data = {
                "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
                "age": ["56", "45", "59"],
                "number of movies": ["87", "53", "69"],
                "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
            }

        This dictionary can be passed in as such, or can be converted to a pandas DataFrame:

        Example::

            import pandas as pd
            table = pd.DataFrame.from_dict(data)


        Args:
            table (:obj:`pd.DataFrame` or :obj:`Dict`):
                Pandas DataFrame or dictionary that will be converted to a DataFrame containing all the table values.
                See above for an example of dictionary.
            query (:obj:`str` or :obj:`List[str]`):
                Query or list of queries that will be sent to the model alongside the table.
            sequential (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether to do inference sequentially or as a batch. Batching is faster, but models like SQA require the
                inference to be done sequentially to extract relations within sequences, given their conversational
                nature.
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
                Activates and controls padding. Accepts the following values:

                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                  single sequence is provided).
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                  different lengths).

            truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.TapasTruncationStrategy`, `optional`, defaults to :obj:`False`):
                Activates and controls truncation. Accepts the following values:

                * :obj:`True` or :obj:`'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument
                  :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
                  provided. This will truncate row by row, removing rows from the table.
                * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
                  sequence lengths greater than the model maximum admissible input size).


        Return:
            A dictionary or a list of dictionaries containing results: Each result is a dictionary with the following
            keys:

            - **answer** (:obj:`str`) -- The answer of the query given the table. If there is an aggregator, the answer
              will be preceded by :obj:`AGGREGATOR >`.
            - **coordinates** (:obj:`List[Tuple[int, int]]`) -- Coordinates of the cells of the answers.
            - **cells** (:obj:`List[str]`) -- List of strings made up of the answer cell values.
            - **aggregator** (:obj:`str`) -- If the model has an aggregator, this returns the aggregator.
        """
        pipeline_inputs, sequential, padding, truncation = self._args_parser(*args, **kwargs)
        batched_answers = []
        for pipeline_input in pipeline_inputs:
            table, query = pipeline_input["table"], pipeline_input["query"]
            inputs = self.tokenizer(
                table, query, return_tensors=self.framework, truncation="drop_rows_to_fit", padding=padding
            )

            outputs = self.sequential_inference(**inputs) if sequential else self.batch_inference(**inputs)

            if self.aggregate:
                logits, logits_agg = outputs[:2]
                predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits.detach(), logits_agg)
                answer_coordinates_batch, agg_predictions = predictions
                aggregators = {i: self.model.config.aggregation_labels[pred] for i, pred in enumerate(agg_predictions)}

                no_agg_label_index = self.model.config.no_aggregation_label_index
                aggregators_prefix = {
                    i: aggregators[i] + " > " for i, pred in enumerate(agg_predictions) if pred != no_agg_label_index
                }
            else:
                logits = outputs[0]
                predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits.detach())
                answer_coordinates_batch = predictions[0]
                aggregators = {}
                aggregators_prefix = {}

            answers = []
            for index, coordinates in enumerate(answer_coordinates_batch):
                cells = [table.iat[coordinate] for coordinate in coordinates]
                aggregator = aggregators.get(index, "")
                aggregator_prefix = aggregators_prefix.get(index, "")
                answer = {
                    "answer": aggregator_prefix + ", ".join(cells),
                    "coordinates": coordinates,
                    "cells": [table.iat[coordinate] for coordinate in coordinates],
                }
                if aggregator:
                    answer["aggregator"] = aggregator

                answers.append(answer)
            batched_answers.append(answers if len(answers) > 1 else answers[0])
        return batched_answers if len(batched_answers) > 1 else batched_answers[0]
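

# A minimal usage sketch, commented out to keep the module import-safe. It
# requires PyTorch and pandas; the TAPAS checkpoint named below is an
# assumption, not a default pinned by this module:
#
#     from transformers import pipeline
#
#     tqa = pipeline("table-question-answering", model="google/tapas-base-finetuned-wtq")
#     data = {"actors": ["brad pitt", "george clooney"], "number of movies": ["87", "69"]}
#     print(tqa(table=data, query="how many movies does george clooney have?"))
#     # e.g. {"answer": "69", "coordinates": [(1, 1)], "cells": ["69"]}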
@@ -0,0 +1,345 @@
from ..file_utils import add_end_docstrings, is_tf_available, is_torch_available
from ..utils import logging
from .base import PIPELINE_INIT_ARGS, Pipeline


if is_tf_available():
    import tensorflow as tf

    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING

if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING

logger = logging.get_logger(__name__)


@add_end_docstrings(PIPELINE_INIT_ARGS)
class SummarizationPipeline(Pipeline):
    """
    Summarize news articles and other documents.

    This summarizing pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
    identifier: :obj:`"summarization"`.

    The models that this pipeline can use are models that have been fine-tuned on a summarization task, which are
    currently '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'. See the up-to-date
    list of available models on `huggingface.co/models <https://huggingface.co/models?filter=summarization>`__.

    Usage::

        # use bart in pytorch
        summarizer = pipeline("summarization")
        summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20)

        # use t5 in tf
        summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf")
        summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20)
    """

    def __init__(self, *args, **kwargs):
        kwargs.update(task="summarization")
        super().__init__(*args, **kwargs)

        self.check_model_type(
            TF_MODEL_WITH_LM_HEAD_MAPPING if self.framework == "tf" else MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
        )

    def __call__(
        self, *documents, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
    ):
        r"""
        Summarize the text(s) given as inputs.

        Args:
            documents (:obj:`str` or :obj:`List[str]`):
                One or several articles (or one list of articles) to summarize.
            return_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to include the decoded texts in the outputs.
            return_tensors (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to include the tensors of predictions (as token indices) in the outputs.
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to clean up the potential extra spaces in the text output.
            generate_kwargs:
                Additional keyword arguments to pass along to the generate method of the model (see the generate method
                corresponding to your framework `here <./model.html#generative-models>`__).

        Return:
            A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:

            - **summary_text** (:obj:`str`, present when ``return_text=True``) -- The summary of the corresponding
              input.
            - **summary_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``) --
              The token ids of the summary.
        """
        assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"
        assert len(documents) > 0, "Please provide a document to summarize"

        prefix = self.model.config.prefix if self.model.config.prefix is not None else ""

        if isinstance(documents[0], list):
            assert (
                self.tokenizer.pad_token_id is not None
            ), "Please make sure that the tokenizer has a pad_token_id when using a batch input"

            documents = ([prefix + document for document in documents[0]],)
            padding = True

        elif isinstance(documents[0], str):
            documents = (prefix + documents[0],)
            padding = False
        else:
            raise ValueError(
                "`documents[0]`: {} has the wrong format. It should be of type `str` or type `list`".format(
                    documents[0]
                )
            )

        with self.device_placement():
            inputs = self._parse_and_tokenize(*documents, padding=padding)

            if self.framework == "pt":
                inputs = self.ensure_tensor_on_device(**inputs)
                input_length = inputs["input_ids"].shape[-1]
            elif self.framework == "tf":
                input_length = tf.shape(inputs["input_ids"])[-1].numpy()

            min_length = generate_kwargs.get("min_length", self.model.config.min_length)
            if input_length < min_length // 2:
                logger.warning(
                    "Your min_length is set to {}, but your input_length is only {}. You might consider decreasing min_length manually, e.g. summarizer('...', min_length=10)".format(
                        min_length, input_length
                    )
                )

            max_length = generate_kwargs.get("max_length", self.model.config.max_length)
            if input_length < max_length:
                logger.warning(
                    "Your max_length is set to {}, but your input_length is only {}. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)".format(
                        max_length, input_length
                    )
                )

            summaries = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                **generate_kwargs,
            )

            results = []
            for summary in summaries:
                record = {}
                if return_tensors:
                    record["summary_token_ids"] = summary
                if return_text:
                    record["summary_text"] = self.tokenizer.decode(
                        summary,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )
                results.append(record)
            return results
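

# A short sketch of how `generate_kwargs` reach `self.model.generate()` above,
# kept as a comment; the beam-search settings are illustrative, not defaults,
# and ARTICLE stands in for any long input string:
#
#     from transformers import pipeline
#
#     summarizer = pipeline("summarization")
#     summarizer(ARTICLE, min_length=30, max_length=120, num_beams=4)
#     # min_length, max_length and num_beams are forwarded unchanged to generate()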


@add_end_docstrings(PIPELINE_INIT_ARGS)
class TranslationPipeline(Pipeline):
    """
    Translates from one language to another.

    This translation pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
    identifier: :obj:`"translation_xx_to_yy"`.

    The models that this pipeline can use are models that have been fine-tuned on a translation task. See the
    up-to-date list of available models on `huggingface.co/models
    <https://huggingface.co/models?filter=translation>`__.

    Usage::

        en_fr_translator = pipeline("translation_en_to_fr")
        en_fr_translator("How old are you?")
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.check_model_type(
            TF_MODEL_WITH_LM_HEAD_MAPPING if self.framework == "tf" else MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
        )

    def __call__(
        self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
    ):
        r"""
        Translate the text(s) given as inputs.

        Args:
            args (:obj:`str` or :obj:`List[str]`):
                Texts to be translated.
            return_tensors (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to include the tensors of predictions (as token indices) in the outputs.
            return_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to include the decoded texts in the outputs.
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to clean up the potential extra spaces in the text output.
            generate_kwargs:
                Additional keyword arguments to pass along to the generate method of the model (see the generate method
                corresponding to your framework `here <./model.html#generative-models>`__).

        Return:
            A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:

            - **translation_text** (:obj:`str`, present when ``return_text=True``) -- The translation.
            - **translation_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
              -- The token ids of the translation.
        """
        assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"

        prefix = self.model.config.prefix if self.model.config.prefix is not None else ""

        if isinstance(args[0], list):
            assert (
                self.tokenizer.pad_token_id is not None
            ), "Please make sure that the tokenizer has a pad_token_id when using a batch input"
            args = ([prefix + text for text in args[0]],)
            padding = True

        elif isinstance(args[0], str):
            args = (prefix + args[0],)
            padding = False
        else:
            raise ValueError(
                "`args[0]`: {} has the wrong format. It should be of type `str` or type `list`".format(
                    args[0]
                )
            )

        with self.device_placement():
            inputs = self._parse_and_tokenize(*args, padding=padding)

            if self.framework == "pt":
                inputs = self.ensure_tensor_on_device(**inputs)
                input_length = inputs["input_ids"].shape[-1]

            elif self.framework == "tf":
                input_length = tf.shape(inputs["input_ids"])[-1].numpy()

            max_length = generate_kwargs.get("max_length", self.model.config.max_length)
            if input_length > 0.9 * max_length:
                logger.warning(
                    "Your input_length: {} is bigger than 0.9 * max_length: {}. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)".format(
                        input_length, max_length
                    )
                )

            translations = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                **generate_kwargs,
            )
            results = []
            for translation in translations:
                record = {}
                if return_tensors:
                    record["translation_token_ids"] = translation
                if return_text:
                    record["translation_text"] = self.tokenizer.decode(
                        translation,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )
                results.append(record)
            return results
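

# A short sketch of batch translation with the class above, kept as a comment;
# the `translation_en_to_de` task alias follows the `translation_xx_to_yy`
# convention documented in the docstring, and the default checkpoint it
# resolves to is an assumption:
#
#     from transformers import pipeline
#
#     translator = pipeline("translation_en_to_de")
#     outputs = translator(["How old are you?", "Where do you live?"])
#     # [{"translation_text": ...}, {"translation_text": ...}]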
|
||||
|
||||
|
||||
@add_end_docstrings(PIPELINE_INIT_ARGS)
class Text2TextGenerationPipeline(Pipeline):
    """
    Pipeline for text to text generation using seq2seq models.

    This Text2TextGenerationPipeline pipeline can currently be loaded from :func:`~transformers.pipeline` using the
    following task identifier: :obj:`"text2text-generation"`.

    The models that this pipeline can use are models that have been fine-tuned on a sequence-to-sequence task. See
    the up-to-date list of available models on `huggingface.co/models
    <https://huggingface.co/models?filter=seq2seq>`__.

    Usage::

        text2text_generator = pipeline("text2text-generation")
        text2text_generator("question: What is 42 ? context: 42 is the answer to life, the universe and everything")
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.check_model_type(
            TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
            if self.framework == "tf"
            else MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
        )

    def __call__(
        self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
    ):
        r"""
        Generate the output text(s) using text(s) given as inputs.

        Args:
            args (:obj:`str` or :obj:`List[str]`):
                Input text for the encoder.
            return_tensors (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to include the tensors of predictions (as token indices) in the outputs.
            return_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to include the decoded texts in the outputs.
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to clean up the potential extra spaces in the text output.
            generate_kwargs:
                Additional keyword arguments to pass along to the generate method of the model (see the generate
                method corresponding to your framework `here <./model.html#generative-models>`__).

        Return:
            A list or a list of lists of :obj:`dict`: Each result comes as a dictionary with the following keys:

            - **generated_text** (:obj:`str`, present when ``return_text=True``) -- The generated text.
            - **generated_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when
              ``return_tensors=True``) -- The token ids of the generated text.
        """
        assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"

        if isinstance(args[0], list):
            assert (
                self.tokenizer.pad_token_id is not None
            ), "Please make sure that the tokenizer has a pad_token_id when using a batch input"
            padding = True

        elif isinstance(args[0], str):
            padding = False
        else:
            raise ValueError(
                " `args[0]`: {} has the wrong format. It should be either of type `str` or type `list`".format(
                    args[0]
                )
            )

        with self.device_placement():
            inputs = self._parse_and_tokenize(*args, padding=padding)

            if self.framework == "pt":
                inputs = self.ensure_tensor_on_device(**inputs)

            generations = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                **generate_kwargs,
            )
            results = []
            for generation in generations:
                record = {}
                if return_tensors:
                    record["generated_token_ids"] = generation
                if return_text:
                    record["generated_text"] = self.tokenizer.decode(
                        generation,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )
                results.append(record)
            return results
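
As a quick sanity check of the class above, a minimal usage sketch (an editorial addition, not part of the diff; the concrete output string is an illustrative assumption, and which default checkpoint gets loaded depends on the pipeline registry):

    from transformers import pipeline

    # Load by task identifier; a default seq2seq checkpoint is resolved if no model is given.
    text2text_generator = pipeline("text2text-generation")

    result = text2text_generator(
        "question: What is 42 ? context: 42 is the answer to life, the universe and everything"
    )
    # One dict per input, e.g. [{'generated_text': '42'}]
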
@@ -0,0 +1,79 @@
import numpy as np

from ..file_utils import add_end_docstrings, is_tf_available, is_torch_available
from .base import PIPELINE_INIT_ARGS, Pipeline


if is_tf_available():
    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING

if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING


@add_end_docstrings(
    PIPELINE_INIT_ARGS,
    r"""
        return_all_scores (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to return all prediction scores or just the one of the predicted class.
    """,
)
class TextClassificationPipeline(Pipeline):
    """
    Text classification pipeline using any :obj:`ModelForSequenceClassification`. See the `sequence classification
    examples <../task_summary.html#sequence-classification>`__ for more information.

    This text classification pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
    task identifier: :obj:`"sentiment-analysis"` (for classifying sequences according to positive or negative
    sentiments).

    If multiple classification labels are available (:obj:`model.config.num_labels >= 2`), the pipeline will run a
    softmax over the results. If there is a single label, the pipeline will run a sigmoid over the result.

    The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See
    the up-to-date list of available models on `huggingface.co/models
    <https://huggingface.co/models?filter=text-classification>`__.
    """

    def __init__(self, return_all_scores: bool = False, **kwargs):
        super().__init__(**kwargs)

        self.check_model_type(
            TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
            if self.framework == "tf"
            else MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
        )

        self.return_all_scores = return_all_scores

    def __call__(self, *args, **kwargs):
        """
        Classify the text(s) given as inputs.

        Args:
            args (:obj:`str` or :obj:`List[str]`):
                One or several texts (or one list of texts) to classify.

        Return:
            A list or a list of lists of :obj:`dict`: Each result comes as a list of dictionaries with the following
            keys:

            - **label** (:obj:`str`) -- The label predicted.
            - **score** (:obj:`float`) -- The corresponding probability.

            If ``self.return_all_scores=True``, one such dictionary is returned per label.
        """
        outputs = super().__call__(*args, **kwargs)

        if self.model.config.num_labels == 1:
            scores = 1.0 / (1.0 + np.exp(-outputs))
        else:
            scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True)
        if self.return_all_scores:
            return [
                [{"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(item)]
                for item in scores
            ]
        else:
            return [
                {"label": self.model.config.id2label[item.argmax()], "score": item.max().item()} for item in scores
            ]
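
A short usage sketch for TextClassificationPipeline (editorial addition; the label names and scores are illustrative assumptions tied to whatever default checkpoint the registry resolves):

    from transformers import pipeline

    classifier = pipeline("sentiment-analysis")
    print(classifier("I love this movie!"))
    # e.g. [{'label': 'POSITIVE', 'score': 0.9998}]

    # With return_all_scores=True, one dict per label is returned for each input.
    classifier_all = pipeline("sentiment-analysis", return_all_scores=True)
    print(classifier_all("I love this movie!"))
    # e.g. [[{'label': 'NEGATIVE', 'score': 0.0002}, {'label': 'POSITIVE', 'score': 0.9998}]]
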
@@ -0,0 +1,189 @@
from ..file_utils import add_end_docstrings
from .base import PIPELINE_INIT_ARGS, Pipeline


@add_end_docstrings(PIPELINE_INIT_ARGS)
class TextGenerationPipeline(Pipeline):
    """
    Language generation pipeline using any :obj:`ModelWithLMHead`. This pipeline predicts the words that will follow a
    specified text prompt.

    This language generation pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
    task identifier: :obj:`"text-generation"`.

    The models that this pipeline can use are models that have been trained with an autoregressive language modeling
    objective, which includes the uni-directional models in the library (e.g. gpt2). See the list of available models
    on `huggingface.co/models <https://huggingface.co/models?filter=causal-lm>`__.
    """

    # Prefix text to help Transformer-XL and XLNet with short prompts, as proposed by Aman Rusia
    # in https://github.com/rusiaaman/XLNet-gen#methodology
    # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e

    XL_PREFIX = """
    In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) are discovered. The
    voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the remainder of the story. 1883 Western
    Siberia, a young Grigori Rasputin is asked by his father and a group of men to perform magic. Rasputin has a vision
    and denounces one of the men as a horse thief. Although his father initially slaps him for making such an
    accusation, Rasputin watches as the man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
    the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, with people, even a bishop,
    begging for his blessing. <eod> </s> <eos>
    """

    ALLOWED_MODELS = [
        "XLNetLMHeadModel",
        "TransfoXLLMHeadModel",
        "ReformerModelWithLMHead",
        "GPT2LMHeadModel",
        "OpenAIGPTLMHeadModel",
        "CTRLLMHeadModel",
        "TFXLNetLMHeadModel",
        "TFTransfoXLLMHeadModel",
        "TFGPT2LMHeadModel",
        "TFOpenAIGPTLMHeadModel",
        "TFCTRLLMHeadModel",
    ]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.check_model_type(self.ALLOWED_MODELS)

    # Overriding _parse_and_tokenize to allow for unusual language-modeling tokenizer arguments.
    def _parse_and_tokenize(self, inputs, padding=True, add_special_tokens=True, **kwargs):
        """
        Parse arguments and tokenize.
        """
        # Parse arguments
        if self.model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
            tokenizer_kwargs = {"add_space_before_punct_symbol": True}
        else:
            tokenizer_kwargs = {}
        inputs = self.tokenizer(
            inputs,
            add_special_tokens=add_special_tokens,
            return_tensors=self.framework,
            padding=padding,
            **tokenizer_kwargs,
        )

        return inputs

    def __call__(
        self,
        text_inputs,
        return_tensors=False,
        return_text=True,
        clean_up_tokenization_spaces=False,
        prefix=None,
        **generate_kwargs
    ):
        """
        Complete the prompt(s) given as inputs.

        Args:
            text_inputs (:obj:`str` or :obj:`List[str]`):
                One or several prompts (or one list of prompts) to complete.
            return_tensors (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to include the tensors of predictions (as token indices) in the outputs.
            return_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to include the decoded texts in the outputs.
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to clean up the potential extra spaces in the text output.
            prefix (:obj:`str`, `optional`):
                Prefix added to the prompt.
            generate_kwargs:
                Additional keyword arguments to pass along to the generate method of the model (see the generate
                method corresponding to your framework `here <./model.html#generative-models>`__).

        Return:
            A list or a list of lists of :obj:`dict`: Each result comes as a dictionary with the following keys:

            - **generated_text** (:obj:`str`, present when ``return_text=True``) -- The generated text.
            - **generated_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when
              ``return_tensors=True``) -- The token ids of the generated text.
        """

        if isinstance(text_inputs, str):
            text_inputs = [text_inputs]
        results = []
        for prompt_text in text_inputs:
            # Manage correct placement of the tensors
            with self.device_placement():
                prefix = prefix if prefix is not None else self.model.config.prefix
                if prefix is None and self.model.__class__.__name__ in [
                    "XLNetLMHeadModel",
                    "TransfoXLLMHeadModel",
                    "TFXLNetLMHeadModel",
                    "TFTransfoXLLMHeadModel",
                ]:
                    # For XLNet and TransformerXL we add an article to the prompt to give more state to the model.
                    prefix = self.XL_PREFIX

                if prefix:
                    prefix_inputs = self._parse_and_tokenize(prefix, padding=False, add_special_tokens=False)
                    # This impacts the max_length and min_length arguments, which need adjusting.
                    prefix_length = prefix_inputs["input_ids"].shape[-1]
                    if generate_kwargs.get("max_length", None) is not None:
                        generate_kwargs["max_length"] += prefix_length
                    if generate_kwargs.get("min_length", None) is not None:
                        generate_kwargs["min_length"] += prefix_length

                prefix = prefix or ""
                inputs = self._parse_and_tokenize(prefix + prompt_text, padding=False, add_special_tokens=False)

                # Set input_ids to None to allow an empty prompt.
                if inputs["input_ids"].shape[-1] == 0:
                    inputs["input_ids"] = None
                    inputs["attention_mask"] = None

                if self.framework == "pt" and inputs["input_ids"] is not None:
                    inputs = self.ensure_tensor_on_device(**inputs)

                input_ids = inputs["input_ids"]

                # Ensure that batch size = 1 (batch generation not allowed for now)
                assert (
                    input_ids is None or input_ids.shape[0] == 1
                ), "Batch generation is currently not supported. See https://github.com/huggingface/transformers/issues/3021 for more information."

                output_sequences = self.model.generate(input_ids=input_ids, **generate_kwargs)  # BS x SL

            result = []
            for generated_sequence in output_sequences:
                if self.framework == "pt" and generated_sequence is not None:
                    generated_sequence = generated_sequence.cpu()
                generated_sequence = generated_sequence.numpy().tolist()
                record = {}
                if return_tensors:
                    record["generated_token_ids"] = generated_sequence
                if return_text:
                    # Decode text
                    text = self.tokenizer.decode(
                        generated_sequence,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )

                    # Remove the padding prompt from the decoded sequence if an XLNet or Transfo-XL model is used.
                    if input_ids is None:
                        prompt_length = 0
                    else:
                        prompt_length = len(
                            self.tokenizer.decode(
                                input_ids[0],
                                skip_special_tokens=True,
                                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                            )
                        )

                    record["generated_text"] = prompt_text + text[prompt_length:]

                result.append(record)
            results += [result]

        if len(results) == 1:
            return results[0]

        return results
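
A minimal usage sketch for TextGenerationPipeline (editorial addition; sampling makes the continuation non-deterministic, so the output shown is an illustrative assumption). Note that ``max_length`` counts the prompt plus the continuation, and that for XLNet/Transfo-XL the padding article in ``XL_PREFIX`` is prepended internally and stripped from the returned text:

    from transformers import pipeline

    generator = pipeline("text-generation")
    print(generator("Hello, I'm a language model,", max_length=30))
    # e.g. [{'generated_text': "Hello, I'm a language model, and I like to think in words."}]
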
@@ -0,0 +1,303 @@
from typing import TYPE_CHECKING, List, Optional, Union

import numpy as np

from ..file_utils import add_end_docstrings, is_tf_available, is_torch_available
from ..modelcard import ModelCard
from ..models.bert.tokenization_bert import BasicTokenizer
from ..tokenization_utils import PreTrainedTokenizer
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline


if TYPE_CHECKING:
    from ..modeling_tf_utils import TFPreTrainedModel
    from ..modeling_utils import PreTrainedModel

if is_tf_available():
    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING

if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING


class TokenClassificationArgumentHandler(ArgumentHandler):
    """
    Handles arguments for token classification.
    """

    def __call__(self, *args, **kwargs):

        if args is not None and len(args) > 0:
            inputs = list(args)
            batch_size = len(inputs)
        else:
            raise ValueError("At least one input is required.")

        offset_mapping = kwargs.get("offset_mapping")
        if offset_mapping:
            if isinstance(offset_mapping, list) and isinstance(offset_mapping[0], tuple):
                offset_mapping = [offset_mapping]
            if len(offset_mapping) != batch_size:
                raise ValueError("offset_mapping should have the same batch size as the input")
        return inputs, offset_mapping

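# An illustrative sketch of the calling convention above (editorial addition, not part of the
# original module): positional texts come back as a list, and a single sentence's offset_mapping
# (a list of tuples) is wrapped into a batch of size one. The offsets are made up for the example.
#
#     handler = TokenClassificationArgumentHandler()
#     inputs, offset_mapping = handler(
#         "My name is Wolfgang", offset_mapping=[(0, 2), (3, 7), (8, 10), (11, 19)]
#     )
#     # inputs == ["My name is Wolfgang"]
#     # offset_mapping == [[(0, 2), (3, 7), (8, 10), (11, 19)]]
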
@add_end_docstrings(
    PIPELINE_INIT_ARGS,
    r"""
        ignore_labels (:obj:`List[str]`, defaults to :obj:`["O"]`):
            A list of labels to ignore.
        grouped_entities (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to group the tokens corresponding to the same entity together in the predictions.
    """,
)
class TokenClassificationPipeline(Pipeline):
    """
    Named Entity Recognition pipeline using any :obj:`ModelForTokenClassification`. See the `named entity recognition
    examples <../task_summary.html#named-entity-recognition>`__ for more information.

    This token recognition pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
    task identifier: :obj:`"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location
    or miscellaneous).

    The models that this pipeline can use are models that have been fine-tuned on a token classification task. See the
    up-to-date list of available models on `huggingface.co/models
    <https://huggingface.co/models?filter=token-classification>`__.
    """

    default_input_names = "sequences"

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = TokenClassificationArgumentHandler(),
        device: int = -1,
        binary_output: bool = False,
        ignore_labels=["O"],
        task: str = "",
        grouped_entities: bool = False,
        ignore_subwords: bool = False,
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            device=device,
            binary_output=binary_output,
            task=task,
        )

        self.check_model_type(
            TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
            if self.framework == "tf"
            else MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
        )

        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
        self._args_parser = args_parser
        self.ignore_labels = ignore_labels
        self.grouped_entities = grouped_entities
        self.ignore_subwords = ignore_subwords

        if self.ignore_subwords and not self.tokenizer.is_fast:
            raise ValueError(
                "Slow tokenizers cannot ignore subwords. Please set the `ignore_subwords` option "
                "to `False` or use a fast tokenizer."
            )

    def __call__(self, inputs: Union[str, List[str]], **kwargs):
        """
        Classify each token of the text(s) given as inputs.

        Args:
            inputs (:obj:`str` or :obj:`List[str]`):
                One or several texts (or one list of texts) for token classification.

        Return:
            A list or a list of lists of :obj:`dict`: Each result comes as a list of dictionaries (one for each token
            in the corresponding input, or each entity if this pipeline was instantiated with
            :obj:`grouped_entities=True`) with the following keys:

            - **word** (:obj:`str`) -- The token/word classified.
            - **score** (:obj:`float`) -- The corresponding probability for :obj:`entity`.
            - **entity** (:obj:`str`) -- The entity predicted for that token/word (it is named `entity_group` when
              `grouped_entities` is set to :obj:`True`).
            - **index** (:obj:`int`, only present when ``self.grouped_entities=False``) -- The index of the
              corresponding token in the sentence.
            - **start** (:obj:`int`, `optional`) -- The index of the start of the corresponding entity in the
              sentence. Only exists if the offsets are available within the tokenizer.
            - **end** (:obj:`int`, `optional`) -- The index of the end of the corresponding entity in the sentence.
              Only exists if the offsets are available within the tokenizer.
        """

        inputs, offset_mappings = self._args_parser(inputs, **kwargs)

        answers = []

        for i, sentence in enumerate(inputs):

            # Manage correct placement of the tensors
            with self.device_placement():

                tokens = self.tokenizer(
                    sentence,
                    return_attention_mask=False,
                    return_tensors=self.framework,
                    truncation=True,
                    return_special_tokens_mask=True,
                    return_offsets_mapping=self.tokenizer.is_fast,
                )
                if self.tokenizer.is_fast:
                    offset_mapping = tokens.pop("offset_mapping").cpu().numpy()[0]
                elif offset_mappings:
                    offset_mapping = offset_mappings[i]
                else:
                    offset_mapping = None

                special_tokens_mask = tokens.pop("special_tokens_mask").cpu().numpy()[0]

                # Forward
                if self.framework == "tf":
                    entities = self.model(tokens.data)[0][0].numpy()
                    input_ids = tokens["input_ids"].numpy()[0]
                else:
                    with torch.no_grad():
                        tokens = self.ensure_tensor_on_device(**tokens)
                        entities = self.model(**tokens)[0][0].cpu().numpy()
                        input_ids = tokens["input_ids"].cpu().numpy()[0]

            score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)
            labels_idx = score.argmax(axis=-1)

            entities = []
            # Filter to labels not in `self.ignore_labels`
            # Filter special_tokens
            filtered_labels_idx = [
                (idx, label_idx)
                for idx, label_idx in enumerate(labels_idx)
                if (self.model.config.id2label[label_idx] not in self.ignore_labels) and not special_tokens_mask[idx]
            ]

            for idx, label_idx in filtered_labels_idx:
                if offset_mapping is not None:
                    start_ind, end_ind = offset_mapping[idx]
                    word_ref = sentence[start_ind:end_ind]
                    word = self.tokenizer.convert_ids_to_tokens([int(input_ids[idx])])[0]
                    is_subword = len(word_ref) != len(word)

                    if int(input_ids[idx]) == self.tokenizer.unk_token_id:
                        word = word_ref
                        is_subword = False
                else:
                    word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx]))

                    start_ind = None
                    end_ind = None

                entity = {
                    "word": word,
                    "score": score[idx][label_idx].item(),
                    "entity": self.model.config.id2label[label_idx],
                    "index": idx,
                    "start": start_ind,
                    "end": end_ind,
                }

                if self.grouped_entities and self.ignore_subwords:
                    entity["is_subword"] = is_subword

                entities += [entity]

            if self.grouped_entities:
                answers += [self.group_entities(entities)]
            # Append ungrouped entities
            else:
                answers += [entities]

        if len(answers) == 1:
            return answers[0]
        return answers

    def group_sub_entities(self, entities: List[dict]) -> dict:
        """
        Group together the adjacent tokens with the same entity predicted.

        Args:
            entities (:obj:`dict`): The entities predicted by the pipeline.
        """
        # Get the first entity in the entity group
        entity = entities[0]["entity"].split("-")[-1]
        scores = np.nanmean([entity["score"] for entity in entities])
        tokens = [entity["word"] for entity in entities]

        entity_group = {
            "entity_group": entity,
            "score": np.mean(scores),
            "word": self.tokenizer.convert_tokens_to_string(tokens),
            "start": entities[0]["start"],
            "end": entities[-1]["end"],
        }
        return entity_group

    def group_entities(self, entities: List[dict]) -> List[dict]:
        """
        Find and group together the adjacent tokens with the same entity predicted.

        Args:
            entities (:obj:`dict`): The entities predicted by the pipeline.
        """

        entity_groups = []
        entity_group_disagg = []

        if entities:
            last_idx = entities[-1]["index"]

        for entity in entities:

            is_last_idx = entity["index"] == last_idx
            is_subword = self.ignore_subwords and entity["is_subword"]
            if not entity_group_disagg:
                entity_group_disagg += [entity]
                if is_last_idx:
                    entity_groups += [self.group_sub_entities(entity_group_disagg)]
                continue

            # If the current entity is similar and adjacent to the previous entity, append it to the disaggregated
            # entity group. The split is meant to account for the "B" and "I" prefixes; two B-tagged entities should
            # not be merged.
            if (
                (
                    entity["entity"].split("-")[-1] == entity_group_disagg[-1]["entity"].split("-")[-1]
                    and entity["entity"].split("-")[0] != "B"
                )
                and entity["index"] == entity_group_disagg[-1]["index"] + 1
            ) or is_subword:
                # Modify subword type to be previous_type
                if is_subword:
                    entity["entity"] = entity_group_disagg[-1]["entity"].split("-")[-1]
                    entity["score"] = np.nan  # set ignored scores to nan and use np.nanmean

                entity_group_disagg += [entity]
                # Group the entities at the last entity
                if is_last_idx:
                    entity_groups += [self.group_sub_entities(entity_group_disagg)]
            # If the current entity is different from the previous entity, aggregate the disaggregated entity group
            else:
                entity_groups += [self.group_sub_entities(entity_group_disagg)]
                entity_group_disagg = [entity]
                # If it's the last entity, add it to the entity groups
                if is_last_idx:
                    entity_groups += [self.group_sub_entities(entity_group_disagg)]

        return entity_groups


NerPipeline = TokenClassificationPipeline
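
A brief usage sketch for the NER pipeline above (editorial addition; the entity labels, scores, and offsets shown are illustrative assumptions tied to the default checkpoint the registry resolves):

    from transformers import pipeline

    ner = pipeline("ner", grouped_entities=True)
    print(ner("My name is Wolfgang and I live in Berlin"))
    # e.g. [{'entity_group': 'PER', 'score': 0.998, 'word': 'Wolfgang', 'start': 11, 'end': 19},
    #       {'entity_group': 'LOC', 'score': 0.999, 'word': 'Berlin', 'start': 34, 'end': 40}]
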
@@ -0,0 +1,170 @@
from typing import List, Union

import numpy as np

from ..file_utils import add_end_docstrings
from ..utils import logging
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline


logger = logging.get_logger(__name__)


class ZeroShotClassificationArgumentHandler(ArgumentHandler):
    """
    Handles arguments for zero-shot text classification by turning each possible label into an NLI
    premise/hypothesis pair.
    """

    def _parse_labels(self, labels):
        if isinstance(labels, str):
            labels = [label.strip() for label in labels.split(",")]
        return labels

    def __call__(self, sequences, labels, hypothesis_template):
        if len(labels) == 0 or len(sequences) == 0:
            raise ValueError("You must include at least one label and at least one sequence.")
        if hypothesis_template.format(labels[0]) == hypothesis_template:
            raise ValueError(
                (
                    'The provided hypothesis_template "{}" was not able to be formatted with the target labels. '
                    "Make sure the passed template includes formatting syntax such as {{}} where the label should go."
                ).format(hypothesis_template)
            )

        if isinstance(sequences, str):
            sequences = [sequences]
        labels = self._parse_labels(labels)

        sequence_pairs = []
        for sequence in sequences:
            sequence_pairs.extend([[sequence, hypothesis_template.format(label)] for label in labels])

        return sequence_pairs

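# An illustrative sketch (editorial addition, not part of the original module) of the
# premise/hypothesis pairs built above:
#
#     handler = ZeroShotClassificationArgumentHandler()
#     pairs = handler(
#         "Who are you voting for in 2020?", "politics, economics", "This example is {}."
#     )
#     # pairs == [
#     #     ["Who are you voting for in 2020?", "This example is politics."],
#     #     ["Who are you voting for in 2020?", "This example is economics."],
#     # ]
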
@add_end_docstrings(PIPELINE_INIT_ARGS)
class ZeroShotClassificationPipeline(Pipeline):
    """
    NLI-based zero-shot classification pipeline using a :obj:`ModelForSequenceClassification` trained on NLI (natural
    language inference) tasks.

    Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis
    pair and passed to the pretrained model. Then, the logit for `entailment` is taken as the logit for the candidate
    label being valid. Any NLI model can be used, but the id of the `entailment` label must be included in the model
    config's :attr:`~transformers.PretrainedConfig.label2id`.

    This NLI pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task identifier:
    :obj:`"zero-shot-classification"`.

    The models that this pipeline can use are models that have been fine-tuned on an NLI task. See the up-to-date list
    of available models on `huggingface.co/models <https://huggingface.co/models?search=nli>`__.
    """

    def __init__(self, args_parser=ZeroShotClassificationArgumentHandler(), *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._args_parser = args_parser
        if self.entailment_id == -1:
            logger.warning(
                "Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to "
                "-1. Define a descriptive label2id mapping in the model config to ensure correct outputs."
            )

    @property
    def entailment_id(self):
        for label, ind in self.model.config.label2id.items():
            if label.lower().startswith("entail"):
                return ind
        return -1

    def _parse_and_tokenize(
        self, sequences, candidate_labels, hypothesis_template, padding=True, add_special_tokens=True, **kwargs
    ):
        """
        Parse arguments and tokenize with ``truncation="only_first"`` so that the hypothesis (label) is never
        truncated.
        """
        sequence_pairs = self._args_parser(sequences, candidate_labels, hypothesis_template)
        inputs = self.tokenizer(
            sequence_pairs,
            add_special_tokens=add_special_tokens,
            return_tensors=self.framework,
            padding=padding,
            truncation="only_first",
        )

        return inputs

    def __call__(
        self,
        sequences: Union[str, List[str]],
        candidate_labels,
        hypothesis_template="This example is {}.",
        multi_class=False,
    ):
        """
        Classify the sequence(s) given as inputs. See the :obj:`~transformers.ZeroShotClassificationPipeline`
        documentation for more information.

        Args:
            sequences (:obj:`str` or :obj:`List[str]`):
                The sequence(s) to classify, will be truncated if the model input is too large.
            candidate_labels (:obj:`str` or :obj:`List[str]`):
                The set of possible class labels to classify each sequence into. Can be a single label, a string of
                comma-separated labels, or a list of labels.
            hypothesis_template (:obj:`str`, `optional`, defaults to :obj:`"This example is {}."`):
                The template used to turn each label into an NLI-style hypothesis. This template must include a {} or
                similar syntax for the candidate label to be inserted into the template. For example, the default
                template is :obj:`"This example is {}."` With the candidate label :obj:`"sports"`, this would be fed
                into the model like :obj:`"<cls> sequence to classify <sep> This example is sports . <sep>"`. The
                default template works well in many cases, but it may be worthwhile to experiment with different
                templates depending on the task setting.
            multi_class (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not multiple candidate labels can be true. If :obj:`False`, the scores are normalized such
                that the sum of the label likelihoods for each sequence is 1. If :obj:`True`, the labels are considered
                independent and probabilities are normalized for each candidate by doing a softmax of the entailment
                score vs. the contradiction score.

        Return:
            A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the following keys:

            - **sequence** (:obj:`str`) -- The sequence for which this is the output.
            - **labels** (:obj:`List[str]`) -- The labels sorted by order of likelihood.
            - **scores** (:obj:`List[float]`) -- The probabilities for each of the labels.
        """
        if sequences and isinstance(sequences, str):
            sequences = [sequences]

        outputs = super().__call__(sequences, candidate_labels, hypothesis_template)
        num_sequences = len(sequences)
        candidate_labels = self._args_parser._parse_labels(candidate_labels)
        reshaped_outputs = outputs.reshape((num_sequences, len(candidate_labels), -1))

        if len(candidate_labels) == 1:
            multi_class = True

        if not multi_class:
            # softmax the "entailment" logits over all candidate labels
            entail_logits = reshaped_outputs[..., self.entailment_id]
            scores = np.exp(entail_logits) / np.exp(entail_logits).sum(-1, keepdims=True)
        else:
            # softmax over the entailment vs. contradiction dim for each label independently
            entailment_id = self.entailment_id
            contradiction_id = -1 if entailment_id == 0 else 0
            entail_contr_logits = reshaped_outputs[..., [contradiction_id, entailment_id]]
            scores = np.exp(entail_contr_logits) / np.exp(entail_contr_logits).sum(-1, keepdims=True)
            scores = scores[..., 1]

        result = []
        for iseq in range(num_sequences):
            top_inds = list(reversed(scores[iseq].argsort()))
            result.append(
                {
                    "sequence": sequences if isinstance(sequences, str) else sequences[iseq],
                    "labels": [candidate_labels[i] for i in top_inds],
                    "scores": scores[iseq][top_inds].tolist(),
                }
            )

        if len(result) == 1:
            return result[0]
        return result
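
To close out, a minimal usage sketch for the zero-shot pipeline above (editorial addition; the scores are illustrative assumptions, and the default NLI checkpoint depends on the pipeline registry):

    from transformers import pipeline

    classifier = pipeline("zero-shot-classification")
    print(classifier("Who are you voting for in 2020?", candidate_labels="politics, economics"))
    # e.g. {'sequence': 'Who are you voting for in 2020?',
    #       'labels': ['politics', 'economics'],
    #       'scores': [0.972, 0.028]}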