Extract supported marian models to extra folder

This commit is contained in:
Joshua Lochner 2023-04-05 21:24:11 +02:00
parent 585097059c
commit b6483cc7f5
2 changed files with 46 additions and 96 deletions

View File

@ -1,6 +1,48 @@
import json
from transformers.utils import cached_file
# NOTE: In total, there are 1440 models available on the HuggingFace hub (https://huggingface.co/Helsinki-NLP/).
# We have converted some of these (listed below). If you don't see your model here, feel free to convert it yourself
# and make a pull request to this repo.
# Language-pair suffixes ('<source>-<target>') of the converted Helsinki-NLP models.
# Each entry is combined with the 'Helsinki-NLP/opus-mt-' prefix to form the full model id.
# Pairs listed together on one line are available in both directions; single
# entries are available in the stated direction only.
SUPPORTED_HELSINKI_NLP_MODELS = [
'en-es', 'es-en', # English <-> Spanish
'en-fr', 'fr-en', # English <-> French
'en-hi', 'hi-en', # English <-> Hindi
'en-de', 'de-en', # English <-> German
'en-ru', 'ru-en', # English <-> Russian
'en-it', 'it-en', # English <-> Italian
'en-ar', 'ar-en', # English <-> Arabic
'en-zh', 'zh-en', # English <-> Chinese
'en-sv', 'sv-en', # English <-> Swedish
'en-mul', 'mul-en', # English <-> Multilingual
'en-nl', 'nl-en', # English <-> Dutch
'en-fi', 'fi-en', # English <-> Finnish
'en-jap', 'jap-en', # English <-> Japanese
'en-cs', 'cs-en', # English <-> Czech
'en-vi', 'vi-en', # English <-> Vietnamese
'en-xh', 'xh-en', # English <-> Xhosa
'en-hu', 'hu-en', # English <-> Hungarian
'en-da', 'da-en', # English <-> Danish
'en-id', 'id-en', # English <-> Indonesian
'en-uk', 'uk-en', # English <-> Ukrainian
'en-af', 'af-en', # English <-> Afrikaans
'de-es', 'es-de', # German <-> Spanish
'fr-es', 'es-fr', # French <-> Spanish
'fr-de', 'de-fr', # French <-> German
'es-it', 'it-es', # Spanish <-> Italian
'en-ro', # English --> Romanian
'pl-en', # Polish --> English
'tr-en', # Turkish --> English
'ko-en', # Korean --> English
'es-ru', 'ru-es', # Spanish <-> Russian
'fr-ru', 'ru-fr', # French <-> Russian
'fr-ro', 'ro-fr', # French <-> Romanian
'uk-ru', 'ru-uk', # Ukrainian <-> Russian
]
def generate_tokenizer_json(model_path, tokenizer):
# Marian models use two separate tokenizers for source and target languages.

View File

@ -1,3 +1,4 @@
from .extra.marian import SUPPORTED_HELSINKI_NLP_MODELS
SUPPORTED_TASKS = {
# map tasks to automodels
@ -166,104 +167,11 @@ SUPPORTED_MODELS = {
]
},
'marian': {
'Helsinki-NLP/opus-mt-en-es': [
f'Helsinki-NLP/opus-mt-{x}': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-es-en': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-en-fr': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-fr-en': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-en-hi': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-hi-en': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-en-de': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-de-en': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-en-ru': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-ru-en': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-en-it': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-it-en': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-en-ar': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-ar-en': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-en-zh': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-zh-en': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-en-sv': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-sv-en': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-en-mul': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-mul-en': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-en-nl': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-nl-en': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-en-fi': [
'default',
'seq2seq-lm-with-past',
],
'Helsinki-NLP/opus-mt-fi-en': [
'default',
'seq2seq-lm-with-past',
],
# TODO add more models, or dynamically generate this list
]
for x in SUPPORTED_HELSINKI_NLP_MODELS
},
'mobilebert': {
'google/mobilebert-uncased': [