Compare commits


2 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| Joshua Lochner | 5d924dca69 | Update JSDoc | 2023-12-26 02:55:40 +02:00 |
| Joshua Lochner | d327d31284 | Add support for CLIPSeg models | 2023-12-26 02:54:01 +02:00 |
5 changed files with 71 additions and 0 deletions

View File

@@ -278,6 +278,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.

View File

@@ -13,6 +13,7 @@
1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.

View File

@@ -204,6 +204,14 @@ SUPPORTED_MODELS = {
'openai/clip-vit-large-patch14-336',
],
},
'clipseg': {
# Image segmentation
'image-segmentation': [
'CIDAS/clipseg-rd64-refined',
'CIDAS/clipseg-rd64',
'CIDAS/clipseg-rd16',
],
},
'codegen': {
# Text generation
'text-generation': [

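A minimal sketch of consuming one of the checkpoints registered above from JavaScript, assuming a converted ONNX export exists under the Xenova namespace (the id matches the JSDoc example added later in this change):

```javascript
import { CLIPSegForImageSegmentation } from '@xenova/transformers';

// Assumption: 'Xenova/clipseg-rd64-refined' is the converted counterpart of the
// 'CIDAS/clipseg-rd64-refined' checkpoint listed above.
const model = await CLIPSegForImageSegmentation.from_pretrained('Xenova/clipseg-rd64-refined');
```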
View File

@@ -3095,6 +3095,62 @@ export class ChineseCLIPModel extends ChineseCLIPPreTrainedModel { }
//////////////////////////////////////////////////
//////////////////////////////////////////////////
// CLIPSeg models
export class CLIPSegPreTrainedModel extends PreTrainedModel { }
export class CLIPSegModel extends CLIPSegPreTrainedModel { }
/**
* CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
*
* **Example:** Perform zero-shot image segmentation with a `CLIPSegForImageSegmentation` model.
*
* ```javascript
* import { AutoTokenizer, AutoProcessor, CLIPSegForImageSegmentation, RawImage } from '@xenova/transformers';
*
* // Load tokenizer, processor, and model
* const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clipseg-rd64-refined');
* const processor = await AutoProcessor.from_pretrained('Xenova/clipseg-rd64-refined');
* const model = await CLIPSegForImageSegmentation.from_pretrained('Xenova/clipseg-rd64-refined');
*
* // Run tokenization
* const texts = ['a glass', 'something to fill', 'wood', 'a jar'];
* const text_inputs = tokenizer(texts, { padding: true, truncation: true });
*
* // Read image and run processor
* const image = await RawImage.read('https://github.com/timojl/clipseg/blob/master/example_image.jpg?raw=true');
* const image_inputs = await processor(image);
*
* // Run model with both text and pixel inputs
* const { logits } = await model({ ...text_inputs, ...image_inputs });
* // logits: Tensor {
* // dims: [4, 352, 352],
* // type: 'float32',
* // data: Float32Array(495616) [ ... ],
* // size: 495616
* // }
* ```
*
* You can visualize the predictions as follows:
* ```javascript
* const preds = logits
* .unsqueeze_(1)
* .sigmoid_()
* .mul_(255)
* .round_()
* .to('uint8');
*
* for (let i = 0; i < preds.dims[0]; ++i) {
* const img = RawImage.fromTensor(preds[i]);
* img.save(`prediction_${i}.png`);
* }
* ```
*/
export class CLIPSegForImageSegmentation extends CLIPSegPreTrainedModel { }
//////////////////////////////////////////////////
//////////////////////////////////////////////////
// GPT2 models
export class GPT2PreTrainedModel extends PreTrainedModel {
@@ -4685,6 +4741,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
['xlm-roberta', ['XLMRobertaModel', XLMRobertaModel]],
['clap', ['ClapModel', ClapModel]],
['clip', ['CLIPModel', CLIPModel]],
['clipseg', ['CLIPSegModel', CLIPSegModel]],
['chinese_clip', ['ChineseCLIPModel', ChineseCLIPModel]],
['mobilebert', ['MobileBertModel', MobileBertModel]],
['squeezebert', ['SqueezeBertModel', SqueezeBertModel]],
@@ -4885,6 +4942,7 @@ const MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = new Map([
const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([
['detr', ['DetrForSegmentation', DetrForSegmentation]],
['clipseg', ['CLIPSegForImageSegmentation', CLIPSegForImageSegmentation]],
]);
const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([

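Registering 'clipseg' in MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES should also let the corresponding Auto class resolve CLIPSeg checkpoints. A minimal sketch, assuming the same converted checkpoint as in the JSDoc example above:

```javascript
import { AutoModelForImageSegmentation, CLIPSegForImageSegmentation } from '@xenova/transformers';

// With 'clipseg' registered in the image-segmentation mapping, the Auto class
// should dispatch to CLIPSegForImageSegmentation for CLIPSeg checkpoints.
const model = await AutoModelForImageSegmentation.from_pretrained('Xenova/clipseg-rd64-refined');
console.log(model instanceof CLIPSegForImageSegmentation); // expected: true
```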
View File

@@ -617,6 +617,8 @@ export class ChineseCLIPFeatureExtractor extends ImageFeatureExtractor { }
export class ConvNextFeatureExtractor extends ImageFeatureExtractor { }
export class ConvNextImageProcessor extends ConvNextFeatureExtractor { } // NOTE extends ConvNextFeatureExtractor
export class ViTFeatureExtractor extends ImageFeatureExtractor { }
export class ViTImageProcessor extends ImageFeatureExtractor { }
export class MobileViTFeatureExtractor extends ImageFeatureExtractor { }
export class OwlViTFeatureExtractor extends ImageFeatureExtractor {
/** @type {post_process_object_detection} */
@@ -1709,6 +1711,7 @@ export class AutoProcessor {
DonutFeatureExtractor,
NougatImageProcessor,
ViTImageProcessor,
VitMatteImageProcessor,
SamImageProcessor,
Swin2SRImageProcessor,
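
For completeness, a hedged sketch of why the ViTImageProcessor alias matters for this change: CLIPSeg checkpoints appear to specify a ViT-style image processor in their preprocessor config (not confirmed by this diff), so registering the alias lets AutoProcessor resolve them. The checkpoint id is the one used in the JSDoc example above:

```javascript
import { AutoProcessor } from '@xenova/transformers';

// Assumption: the CLIPSeg preprocessor config names "ViTImageProcessor", which the
// newly added alias maps onto the existing ImageFeatureExtractor implementation.
const processor = await AutoProcessor.from_pretrained('Xenova/clipseg-rd64-refined');
```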