diff --git a/transformers-cli b/transformers-cli
index 168e6e6f32..39b7f5816b 100755
--- a/transformers-cli
+++ b/transformers-cli
@@ -2,6 +2,7 @@
 from argparse import ArgumentParser
 
 from transformers.commands.download import DownloadCommand
+from transformers.commands.run import RunCommand
 from transformers.commands.serving import ServeCommand
 from transformers.commands.user import UserCommands
 from transformers.commands.train import TrainCommand
@@ -14,9 +15,10 @@ if __name__ == '__main__':
     # Register commands
     ConvertCommand.register_subcommand(commands_parser)
     DownloadCommand.register_subcommand(commands_parser)
+    RunCommand.register_subcommand(commands_parser)
     ServeCommand.register_subcommand(commands_parser)
-    UserCommands.register_subcommand(commands_parser)
     TrainCommand.register_subcommand(commands_parser)
+    UserCommands.register_subcommand(commands_parser)
 
     # Let's go
     args = parser.parse_args()
diff --git a/transformers/commands/run.py b/transformers/commands/run.py
new file mode 100644
index 0000000000..bcbb87391d
--- /dev/null
+++ b/transformers/commands/run.py
@@ -0,0 +1,56 @@
+from argparse import ArgumentParser
+
+from transformers.commands import BaseTransformersCLICommand
+from transformers.pipelines import pipeline, Pipeline, PipelineDataFormat, SUPPORTED_TASKS
+
+
+def try_infer_format_from_ext(path: str):
+    for ext in PipelineDataFormat.SUPPORTED_FORMATS:
+        if path.endswith(ext):
+            return ext
+
+    raise Exception(
+        'Unable to determine file format from the extension of {}. '
+        'Please provide the format through --format {}'.format(path, PipelineDataFormat.SUPPORTED_FORMATS)
+    )
+
+
+def run_command_factory(args):
+    nlp = pipeline(task=args.task, model=args.model, tokenizer=args.tokenizer)
+    format = try_infer_format_from_ext(args.input) if args.format == 'infer' else args.format
+    reader = PipelineDataFormat.from_str(format, args.output, args.input, args.column)
+    return RunCommand(nlp, reader)
+
+
+class RunCommand(BaseTransformersCLICommand):
+
+    def __init__(self, nlp: Pipeline, reader: PipelineDataFormat):
+        self._nlp = nlp
+        self._reader = reader
+
+    @staticmethod
+    def register_subcommand(parser: ArgumentParser):
+        run_parser = parser.add_parser('run', help="Run a pipeline through the CLI")
+        run_parser.add_argument('--task', choices=SUPPORTED_TASKS.keys(), help='Task to run')
+        run_parser.add_argument('--model', type=str, required=True, help='Name or path of the model to instantiate.')
+        run_parser.add_argument('--tokenizer', type=str, help='Name of the tokenizer to use. (default: same as the model name)')
+        run_parser.add_argument('--column', type=str, required=True, help='Name of the column to use as input. (For multi-column inputs such as QA, use column1,column2)')
+        run_parser.add_argument('--format', type=str, default='infer', choices=PipelineDataFormat.SUPPORTED_FORMATS, help='Input format to read from')
+        run_parser.add_argument('--input', type=str, required=True, help='Path to the file to use for inference')
+        run_parser.add_argument('--output', type=str, required=True, help='Path to the file where results will be written.')
+        run_parser.add_argument('kwargs', nargs='*', help='Arguments to forward to the file format reader')
+        run_parser.set_defaults(func=run_command_factory)
+
+    def run(self):
+        nlp, output = self._nlp, []
+        for entry in self._reader:
+            if self._reader.is_multi_columns:
+                output += [nlp(**entry)]
+            else:
+                output += [nlp(entry)]
+
+        # Saving data
+        self._reader.save(output)
+
+
diff --git a/transformers/pipelines.py b/transformers/pipelines.py
index 6fbb7e2f04..a5718b822f 100755
--- a/transformers/pipelines.py
+++ b/transformers/pipelines.py
@@ -14,6 +14,8 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function, unicode_literals
 
+import csv
+import json
 import os
 from abc import ABC, abstractmethod
 from itertools import groupby
@@ -25,11 +27,13 @@ from transformers import AutoTokenizer, PreTrainedTokenizer, PretrainedConfig, \
     SquadExample, squad_convert_examples_to_features, is_tf_available, is_torch_available, logger
 
 if is_tf_available():
-    from transformers import TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, TFAutoModelForTokenClassification
+    from transformers import TFAutoModel, TFAutoModelForSequenceClassification, \
+        TFAutoModelForQuestionAnswering, TFAutoModelForTokenClassification
 
 if is_torch_available():
     import torch
-    from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelForTokenClassification
+    from transformers import AutoModel, AutoModelForSequenceClassification, \
+        AutoModelForQuestionAnswering, AutoModelForTokenClassification
 
 
 class Pipeline(ABC):
@@ -58,6 +62,84 @@ class Pipeline(ABC):
         raise NotImplementedError()
 
 
+class PipelineDataFormat:
+    SUPPORTED_FORMATS = ['json', 'csv']
+
+    def __init__(self, output: str, path: str, column: str):
+        self.output = output
+        self.path = path
+        self.column = column.split(',')
+        self.is_multi_columns = len(self.column) > 1
+
+        if self.is_multi_columns:
+            self.column = [tuple(c.split('=')) if '=' in c else (c, c) for c in self.column]
+
+        from os.path import abspath, exists
+        if exists(abspath(self.output)):
+            raise OSError('{} already exists on disk'.format(self.output))
+
+        if not exists(abspath(self.path)):
+            raise OSError("{} doesn't exist on disk".format(self.path))
+
+    @abstractmethod
+    def __iter__(self):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def save(self, data: List[dict]):
+        raise NotImplementedError()
+
+    @staticmethod
+    def from_str(name: str, output: str, path: str, column: str):
+        if name == 'json':
+            return JsonPipelineDataFormat(output, path, column)
+        elif name == 'csv':
+            return CsvPipelineDataFormat(output, path, column)
+        else:
+            raise KeyError('Unknown reader {} (Available readers are json/csv)'.format(name))
+
+
+class CsvPipelineDataFormat(PipelineDataFormat):
+    def __init__(self, output: str, path: str, column: str):
+        super().__init__(output, path, column)
+
+    def __iter__(self):
+        with open(self.path, 'r') as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                if self.is_multi_columns:
+                    yield {k: row[c] for k, c in self.column}
+                else:
+                    yield row[self.column[0]]
+
+    def save(self, data: List[dict]):
+        with open(self.output, 'w') as f:
+            if len(data) > 0:
+                writer = csv.DictWriter(f, list(data[0].keys()))
+                writer.writeheader()
+                writer.writerows(data)
+
+
+class JsonPipelineDataFormat(PipelineDataFormat):
+
+    def __init__(self, output: str, path: str, column: str):
+        super().__init__(output, path, column)
+
+        with open(path, 'r') as f:
+            self._entries = json.load(f)
+
+    def __iter__(self):
+        for entry in self._entries:
+            if self.is_multi_columns:
+                yield {k: entry[c] for k, c in self.column}
+            else:
+                yield entry[self.column[0]]
+
+    def save(self, data: List[dict]):
+        with open(self.output, 'w') as f:
+            json.dump(data, f)
+
+
 class FeatureExtractionPipeline(Pipeline):
 
     def __call__(self, *texts, **kwargs):
@@ -127,7 +209,7 @@ class NerPipeline(Pipeline):
                 label_idx = score.argmax()
 
                 answer += [{
-                    'word': words[idx - 1], 'score': score[label_idx], 'entity': self.model.config.id2label[label_idx]
+                    'word': words[idx - 1], 'score': score[label_idx].item(), 'entity': self.model.config.id2label[label_idx]
                 }]
 
                 # Update token start
@@ -270,16 +352,18 @@ class QuestionAnsweringPipeline(Pipeline):
             char_to_word = np.array(example.char_to_word_offset)
 
             # Convert the answer (tokens) back to the original text
-            answers += [[
+            answers += [
                 {
-                    'score': score,
-                    'start': np.where(char_to_word == feature.token_to_orig_map[s])[0][0],
-                    'end': np.where(char_to_word == feature.token_to_orig_map[e])[0][-1],
+                    'score': score.item(),
+                    'start': np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
+                    'end': np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
                     'answer': ' '.join(example.doc_tokens[feature.token_to_orig_map[s]: feature.token_to_orig_map[e] + 1])
                 }
                 for s, e, score in zip(starts, ends, scores)
-            ]]
+            ]
+
+        if len(answers) == 1:
+            return answers[0]
         return answers
 
     def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
@@ -363,7 +447,7 @@ def pipeline(task: str, model, config: Optional[PretrainedConfig] = None, tokeni
     Utility factory method to build pipeline.
     """
     # Try to infer tokenizer from model name (if provided as str)
-    if not isinstance(tokenizer, PreTrainedTokenizer):
+    if tokenizer is None:
        if not isinstance(model, str):
            # Impossible to guess which tokenizer is the right one here
            raise Exception('Tokenizer cannot be None if provided model is a PreTrainedModel instance')
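Usage sketch (not part of the patch): with the above applied, a single-column CSV can be pushed through a pipeline from the shell. The file names and the model checkpoint below are illustrative assumptions; --format is omitted because try_infer_format_from_ext resolves it from the .csv extension:

    transformers-cli run --task sentiment-analysis \
        --model distilbert-base-uncased-finetuned-sst-2-english \
        --column text --input reviews.csv --output predictions.csv

The programmatic equivalent, mirroring run_command_factory and RunCommand.run under the same assumptions:

    from transformers.pipelines import pipeline, PipelineDataFormat

    nlp = pipeline(task='sentiment-analysis',
                   model='distilbert-base-uncased-finetuned-sst-2-english')

    # 'csv' selects CsvPipelineDataFormat; the base class checks that
    # reviews.csv exists and that predictions.csv does not.
    reader = PipelineDataFormat.from_str('csv', 'predictions.csv', 'reviews.csv', 'text')

    outputs = []
    for entry in reader:           # a single --column value yields row['text'] directly
        outputs.append(nlp(entry))
    reader.save(outputs)           # writes one output row per input row

For multi-column tasks such as question answering, --column question,context makes the reader yield dicts, which RunCommand.run forwards as keyword arguments (nlp(**entry)).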