Compare commits

...

3 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| Joshua Lochner | 07c5d86e4f | Update pipelines.test.js | 2023-12-27 00:19:03 +02:00 |
| Joshua Lochner | 4498437025 | Add semantic segmentation unit test | 2023-12-26 20:13:42 +02:00 |
| Joshua Lochner | 32a9e2d4d8 | Add support for Segformer | 2023-12-26 19:25:08 +02:00 |
7 changed files with 206 additions and 5 deletions

View File

@@ -326,6 +326,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.

View File

@@ -61,6 +61,7 @@
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.

View File

@@ -362,7 +362,7 @@ SUPPORTED_MODELS = {
'distilbert-base-cased',
],
},
'dit': { # NOTE: DiT has the same architecture as BEiT.
'dit': { # NOTE: DiT has the same architecture as BEiT.
# Feature extraction
# NOTE: requires --task feature-extraction
'feature-extraction': [
@@ -680,8 +680,8 @@ SUPPORTED_MODELS = {
'hf-tiny-model-private/tiny-random-RoFormerForTokenClassification',
],
# TODO
# # Text generation
# TODO
# # Text generation
# 'text-generation': [
# 'hf-tiny-model-private/tiny-random-RoFormerForCausalLM',
# ],
@@ -736,6 +736,40 @@ SUPPORTED_MODELS = {
# 'facebook/sam-vit-large',
# 'facebook/sam-vit-huge',
# ],
'segformer': {
# Image segmentation
'image-segmentation': [
'mattmdjaga/segformer_b0_clothes',
'mattmdjaga/segformer_b2_clothes',
'jonathandinu/face-parsing',
'nvidia/segformer-b0-finetuned-cityscapes-768-768',
'nvidia/segformer-b0-finetuned-cityscapes-512-1024',
'nvidia/segformer-b0-finetuned-cityscapes-640-1280',
'nvidia/segformer-b0-finetuned-cityscapes-1024-1024',
'nvidia/segformer-b1-finetuned-cityscapes-1024-1024',
'nvidia/segformer-b2-finetuned-cityscapes-1024-1024',
'nvidia/segformer-b3-finetuned-cityscapes-1024-1024',
'nvidia/segformer-b4-finetuned-cityscapes-1024-1024',
'nvidia/segformer-b5-finetuned-cityscapes-1024-1024',
'nvidia/segformer-b0-finetuned-ade-512-512',
'nvidia/segformer-b1-finetuned-ade-512-512',
'nvidia/segformer-b2-finetuned-ade-512-512',
'nvidia/segformer-b3-finetuned-ade-512-512',
'nvidia/segformer-b4-finetuned-ade-512-512',
'nvidia/segformer-b5-finetuned-ade-640-640',
],
# Image classification
'image-classification': [
'nvidia/mit-b0',
'nvidia/mit-b1',
'nvidia/mit-b2',
'nvidia/mit-b3',
'nvidia/mit-b4',
'nvidia/mit-b5',
],
},
'speecht5': {
# Text-to-audio/Text-to-speech

View File

@@ -4736,6 +4736,27 @@ export class VitsModel extends VitsPreTrainedModel {
}
//////////////////////////////////////////////////
//////////////////////////////////////////////////
// Segformer models
export class SegformerPreTrainedModel extends PreTrainedModel { }
/**
* The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top.
*/
export class SegformerModel extends SegformerPreTrainedModel { }
/**
* SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden states) e.g. for ImageNet.
*/
export class SegformerForImageClassification extends SegformerPreTrainedModel { }
/**
* SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes.
*/
export class SegformerForSemanticSegmentation extends SegformerPreTrainedModel { }
//////////////////////////////////////////////////
//////////////////////////////////////////////////
// AutoModels, used to simplify construction of PreTrainedModels
@@ -5020,6 +5041,7 @@ const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
['dinov2', ['Dinov2ForImageClassification', Dinov2ForImageClassification]],
['resnet', ['ResNetForImageClassification', ResNetForImageClassification]],
['swin', ['SwinForImageClassification', SwinForImageClassification]],
['segformer', ['SegformerForImageClassification', SegformerForImageClassification]],
]);
const MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = new Map([
@@ -5036,6 +5058,10 @@ const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([
['detr', ['DetrForSegmentation', DetrForSegmentation]],
]);
const MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = new Map([
['segformer', ['SegformerForSemanticSegmentation', SegformerForSemanticSegmentation]],
]);
const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([
['sam', ['SamModel', SamModel]],
]);
@@ -5081,6 +5107,7 @@ const MODEL_CLASS_TYPE_MAPPING = [
[MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, MODEL_TYPES.Vision2Seq],
[MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
[MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
[MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
[MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
[MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
[MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
@@ -5260,6 +5287,17 @@ export class AutoModelForImageSegmentation extends PretrainedMixin {
static MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES];
}
/**
* Helper class which is used to instantiate pretrained semantic segmentation models with the `from_pretrained` function.
* The chosen model class is determined by the type specified in the model config.
*
* @example
* let model = await AutoModelForSemanticSegmentation.from_pretrained('nvidia/segformer-b3-finetuned-cityscapes-1024-1024');
*/
export class AutoModelForSemanticSegmentation extends PretrainedMixin {
static MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES];
}
/**
* Helper class which is used to instantiate pretrained object detection models with the `from_pretrained` function.
* The chosen model class is determined by the type specified in the model config.

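Outside the pipeline API, the new classes can also be used directly. A minimal sketch, assuming the `@xenova/transformers` entry point re-exports the classes shown above; the image URL is a placeholder and the checkpoint is the one from the `@example` comment:

```js
import { AutoProcessor, AutoModelForSemanticSegmentation, RawImage } from '@xenova/transformers';

// Load a converted SegFormer checkpoint (same one as in the @example above).
const model_id = 'nvidia/segformer-b3-finetuned-cityscapes-1024-1024';
const processor = await AutoProcessor.from_pretrained(model_id);
const model = await AutoModelForSemanticSegmentation.from_pretrained(model_id);

// Preprocess an image and run the forward pass.
const image = await RawImage.read('https://example.com/street.jpg'); // placeholder URL
const inputs = await processor(image);
const outputs = await model(inputs);

// outputs.logits has shape [batch_size, num_labels, h, w]; SegFormer predicts at a reduced resolution.
console.log(outputs.logits.dims);
```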
View File

@@ -33,6 +33,7 @@ import {
AutoModelForVision2Seq,
AutoModelForImageClassification,
AutoModelForImageSegmentation,
AutoModelForSemanticSegmentation,
AutoModelForObjectDetection,
AutoModelForZeroShotObjectDetection,
AutoModelForDocumentQuestionAnswering,
@@ -1710,8 +1711,26 @@ export class ImageSegmentationPipeline extends Pipeline {
}
} else if (subtask === 'semantic') {
throw Error(`semantic segmentation not yet supported.`);
const { segmentation, labels } = fn(output, target_sizes ?? imageSizes)[0];
const id2label = this.model.config.id2label;
for (let label of labels) {
const maskData = new Uint8ClampedArray(segmentation.data.length);
for (let i = 0; i < segmentation.data.length; ++i) {
if (segmentation.data[i] === label) {
maskData[i] = 255;
}
}
const mask = new RawImage(maskData, segmentation.dims[1], segmentation.dims[0], 1);
annotation.push({
score: null,
label: id2label[label],
mask: mask
});
}
} else {
throw Error(`Subtask ${subtask} not supported.`);
}
@@ -2488,7 +2507,7 @@ const SUPPORTED_TASKS = {
"image-segmentation": {
// no tokenizer
"pipeline": ImageSegmentationPipeline,
"model": AutoModelForImageSegmentation,
"model": [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation],
"processor": AutoProcessor,
"default": {
// TODO: replace with original

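With this wiring, SegFormer checkpoints run through the existing `image-segmentation` task. A minimal usage sketch mirroring the unit test added at the end of this comparison (model ID and image URL are taken from that test):

```js
import { pipeline } from '@xenova/transformers';

// Semantic segmentation via the image-segmentation pipeline.
const segmenter = await pipeline('image-segmentation', 'mattmdjaga/segformer_b2_clothes');
const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/young-man-standing-and-leaning-on-car.jpg';
const output = await segmenter(url);

// For the semantic subtask, each entry is { score: null, label, mask },
// where mask is a single-channel binary RawImage covering the pixels of that class.
console.log(output.map(x => x.label)); // e.g. [ 'Background', 'Hair', 'Upper-clothes', ... ]
```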
View File

@@ -609,6 +609,71 @@ export class ImageFeatureExtractor extends FeatureExtractor {
}
export class SegformerFeatureExtractor extends ImageFeatureExtractor {
/**
* Converts the output of `SegformerForSemanticSegmentation` into semantic segmentation maps.
* @param {*} outputs Raw outputs of the model.
* @param {number[][]} [target_sizes=null] List of tuples corresponding to the requested final size
* (height, width) of each prediction. If unset, predictions will not be resized.
* @returns {{segmentation: Tensor; labels: number[]}[]} The semantic segmentation maps.
*/
post_process_semantic_segmentation(outputs, target_sizes = null) {
const logits = outputs.logits;
const batch_size = logits.dims[0];
if (target_sizes !== null && target_sizes.length !== batch_size) {
throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits")
}
const toReturn = [];
for (let i = 0; i < batch_size; ++i) {
const target_size = target_sizes !== null ? target_sizes[i] : null;
let data = logits[i];
// 1. If target_size is not null, we need to resize the masks to the target size
if (target_size !== null) {
// resize the masks to the target size
data = interpolate(data, target_size, 'bilinear', false);
}
const [height, width] = target_size ?? data.dims.slice(-2);
const segmentation = new Tensor(
'int32',
new Int32Array(height * width),
[height, width]
);
// Buffer to store current largest value
const buffer = data[0].data;
for (let j = 1; j < data.dims[0]; ++j) {
const row = data[j].data;
for (let k = 0; k < row.length; ++k) {
if (row[k] > buffer[k]) {
buffer[k] = row[k];
segmentation.data[k] = j;
}
}
}
// Store which objects have labels
// This is much more efficient than creating a set of the final values
const hasLabel = new Array(data.dims[0]);
const out = segmentation.data;
for (let j = 0; j < out.length; ++j) {
const index = out[j];
hasLabel[index] = index;
}
/** @type {number[]} The unique list of labels that were detected */
const labels = hasLabel.filter(x => x !== undefined);
toReturn.push({ segmentation, labels });
}
return toReturn;
}
}
export class BitImageProcessor extends ImageFeatureExtractor { }
export class DPTFeatureExtractor extends ImageFeatureExtractor { }
export class GLPNFeatureExtractor extends ImageFeatureExtractor { }
@@ -1699,6 +1764,7 @@ export class AutoProcessor {
ChineseCLIPFeatureExtractor,
ConvNextFeatureExtractor,
ConvNextImageProcessor,
SegformerFeatureExtractor,
BitImageProcessor,
DPTFeatureExtractor,
GLPNFeatureExtractor,

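The post-processing above is an argmax over the class dimension: `segmentation` becomes an int32 map of class indices and `labels` lists the indices that actually occur. A sketch of calling it directly, assuming `model`, `processor`, `image`, and `outputs` from the earlier example, and assuming the processor exposes the underlying `SegformerFeatureExtractor` as `processor.feature_extractor`:

```js
// Resize predictions back to the original image size: one (height, width) pair per batch item.
const target_sizes = [[image.height, image.width]];
const [{ segmentation, labels }] = processor.feature_extractor
    .post_process_semantic_segmentation(outputs, target_sizes);

// segmentation is an int32 Tensor of shape [height, width]; each value is a class index.
// Map the detected indices to human-readable names via the model config.
const id2label = model.config.id2label;
console.log(labels.map(l => id2label[l]));
```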
View File

@@ -1164,6 +1164,7 @@ describe('Pipelines', () => {
// List all models which will be tested
const models = [
'facebook/detr-resnet-50-panoptic',
'mattmdjaga/segformer_b2_clothes',
];
it(models[0], async () => {
@@ -1195,6 +1196,47 @@ describe('Pipelines', () => {
await segmenter.dispose();
}, MAX_TEST_EXECUTION_TIME);
it(models[1], async () => {
let segmenter = await pipeline('image-segmentation', m(models[1]));
let img = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/young-man-standing-and-leaning-on-car.jpg';
// single
{
let outputs = await segmenter(img);
let expected = [
{ label: 'Background' },
{ label: 'Hair' },
{ label: 'Upper-clothes' },
{ label: 'Pants' },
{ label: 'Left-shoe' },
{ label: 'Right-shoe' },
{ label: 'Face' },
{ label: 'Left-leg' },
{ label: 'Right-leg' },
{ label: 'Left-arm' },
{ label: 'Right-arm' },
];
let outputLabels = outputs.map(x => x.label);
let expectedLabels = expected.map(x => x.label);
expect(outputLabels).toHaveLength(expectedLabels.length);
expect(outputLabels.sort()).toEqual(expectedLabels.sort());
// check that all scores are null, and masks have correct dimensions
for (let output of outputs) {
expect(output.score).toBeNull();
expect(output.mask.width).toEqual(970);
expect(output.mask.height).toEqual(1455);
expect(output.mask.channels).toEqual(1);
}
}
await segmenter.dispose();
}, MAX_TEST_EXECUTION_TIME);
});
describe('Zero-shot image classification', () => {