Compare commits
3 Commits
main...add-segfor

Author | SHA1 | Date
---|---|---
Joshua Lochner | 07c5d86e4f |
Joshua Lochner | 4498437025 |
Joshua Lochner | 32a9e2d4d8 |
@@ -326,6 +326,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
@@ -61,6 +61,7 @@
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
@@ -362,7 +362,7 @@ SUPPORTED_MODELS = {
            'distilbert-base-cased',
        ],
    },
    'dit': { # NOTE: DiT has the same architecture as BEiT.
        # Feature extraction
        # NOTE: requires --task feature-extraction
        'feature-extraction': [
@@ -680,8 +680,8 @@ SUPPORTED_MODELS = {
            'hf-tiny-model-private/tiny-random-RoFormerForTokenClassification',
        ],

        # TODO
        # # Text generation
        # 'text-generation': [
        #     'hf-tiny-model-private/tiny-random-RoFormerForCausalLM',
        # ],
@@ -736,6 +736,40 @@ SUPPORTED_MODELS = {
    #     'facebook/sam-vit-large',
    #     'facebook/sam-vit-huge',
    # ],
    'segformer': {
        # Image segmentation
        'image-segmentation': [
            'mattmdjaga/segformer_b0_clothes',
            'mattmdjaga/segformer_b2_clothes',
            'jonathandinu/face-parsing',

            'nvidia/segformer-b0-finetuned-cityscapes-768-768',
            'nvidia/segformer-b0-finetuned-cityscapes-512-1024',
            'nvidia/segformer-b0-finetuned-cityscapes-640-1280',
            'nvidia/segformer-b0-finetuned-cityscapes-1024-1024',
            'nvidia/segformer-b1-finetuned-cityscapes-1024-1024',
            'nvidia/segformer-b2-finetuned-cityscapes-1024-1024',
            'nvidia/segformer-b3-finetuned-cityscapes-1024-1024',
            'nvidia/segformer-b4-finetuned-cityscapes-1024-1024',
            'nvidia/segformer-b5-finetuned-cityscapes-1024-1024',
            'nvidia/segformer-b0-finetuned-ade-512-512',
            'nvidia/segformer-b1-finetuned-ade-512-512',
            'nvidia/segformer-b2-finetuned-ade-512-512',
            'nvidia/segformer-b3-finetuned-ade-512-512',
            'nvidia/segformer-b4-finetuned-ade-512-512',
            'nvidia/segformer-b5-finetuned-ade-640-640',
        ],

        # Image classification
        'image-classification': [
            'nvidia/mit-b0',
            'nvidia/mit-b1',
            'nvidia/mit-b2',
            'nvidia/mit-b3',
            'nvidia/mit-b4',
            'nvidia/mit-b5',
        ],
    },

    'speecht5': {
        # Text-to-audio/Text-to-speech
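The checkpoint ids added in this hunk appear to drive conversion and testing, as elsewhere in this registry; the trailing numbers in the nvidia ids (e.g. 512-512) are the resolutions the checkpoints were fine-tuned at.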
@@ -4736,6 +4736,27 @@ export class VitsModel extends VitsPreTrainedModel {
}
//////////////////////////////////////////////////

//////////////////////////////////////////////////
// Segformer models
export class SegformerPreTrainedModel extends PreTrainedModel { }

/**
 * The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top.
 */
export class SegformerModel extends SegformerPreTrainedModel { }

/**
 * SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden states) e.g. for ImageNet.
 */
export class SegformerForImageClassification extends SegformerPreTrainedModel { }

/**
 * SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes.
 */
export class SegformerForSemanticSegmentation extends SegformerPreTrainedModel { }

//////////////////////////////////////////////////


//////////////////////////////////////////////////
// AutoModels, used to simplify construction of PreTrainedModels
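The four Segformer classes above follow the same layout as the Python `transformers` implementation: a shared pretrained base class, the bare Mix-Transformer encoder, an image-classification head, and the all-MLP semantic-segmentation decode head.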
@@ -5020,6 +5041,7 @@ const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
    ['dinov2', ['Dinov2ForImageClassification', Dinov2ForImageClassification]],
    ['resnet', ['ResNetForImageClassification', ResNetForImageClassification]],
    ['swin', ['SwinForImageClassification', SwinForImageClassification]],
    ['segformer', ['SegformerForImageClassification', SegformerForImageClassification]],
]);

const MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = new Map([
@@ -5036,6 +5058,10 @@ const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([
    ['detr', ['DetrForSegmentation', DetrForSegmentation]],
]);

const MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = new Map([
    ['segformer', ['SegformerForSemanticSegmentation', SegformerForSemanticSegmentation]],
]);

const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([
    ['sam', ['SamModel', SamModel]],
]);
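As with the neighbouring maps, the key is the `model_type` from the checkpoint's config.json and the value pairs the class name with the class itself; the `PretrainedMixin`-based auto classes below use these maps to resolve a concrete implementation at load time.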
@@ -5081,6 +5107,7 @@ const MODEL_CLASS_TYPE_MAPPING = [
    [MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, MODEL_TYPES.Vision2Seq],
    [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
    [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
    [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
    [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
    [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
    [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
@@ -5260,6 +5287,17 @@ export class AutoModelForImageSegmentation extends PretrainedMixin {
    static MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES];
}

/**
 * Helper class which is used to instantiate pretrained semantic segmentation models with the `from_pretrained` function.
 * The chosen model class is determined by the type specified in the model config.
 *
 * @example
 * let model = await AutoModelForSemanticSegmentation.from_pretrained('nvidia/segformer-b3-finetuned-cityscapes-1024-1024');
 */
export class AutoModelForSemanticSegmentation extends PretrainedMixin {
    static MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES];
}

/**
 * Helper class which is used to instantiate pretrained object detection models with the `from_pretrained` function.
 * The chosen model class is determined by the type specified in the model config.
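For context, a minimal end-to-end sketch combining the new auto class with the `SegformerFeatureExtractor` added in this change. The checkpoint id and image URL are illustrative only, and a converted ONNX export of the checkpoint is assumed to exist:

```js
import { AutoProcessor, AutoModelForSemanticSegmentation, RawImage } from '@xenova/transformers';

// Illustrative checkpoint; a converted ONNX export is assumed to be available.
const checkpoint = 'nvidia/segformer-b0-finetuned-ade-512-512';
const processor = await AutoProcessor.from_pretrained(checkpoint);
const model = await AutoModelForSemanticSegmentation.from_pretrained(checkpoint);

// Preprocess an image (placeholder URL) and run the model.
const image = await RawImage.read('https://example.com/street-scene.jpg');
const { pixel_values } = await processor(image);
const outputs = await model({ pixel_values });

// Upsample the logits back to the input size and take the per-pixel argmax.
const [{ segmentation, labels }] = processor.feature_extractor
    .post_process_semantic_segmentation(outputs, [[image.height, image.width]]);
```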
@@ -33,6 +33,7 @@ import {
    AutoModelForVision2Seq,
    AutoModelForImageClassification,
    AutoModelForImageSegmentation,
    AutoModelForSemanticSegmentation,
    AutoModelForObjectDetection,
    AutoModelForZeroShotObjectDetection,
    AutoModelForDocumentQuestionAnswering,
@@ -1710,8 +1711,26 @@ export class ImageSegmentationPipeline extends Pipeline {
            }

        } else if (subtask === 'semantic') {
-           throw Error(`semantic segmentation not yet supported.`);
            const { segmentation, labels } = fn(output, target_sizes ?? imageSizes)[0];

            const id2label = this.model.config.id2label;

            for (let label of labels) {
                const maskData = new Uint8ClampedArray(segmentation.data.length);
                for (let i = 0; i < segmentation.data.length; ++i) {
                    if (segmentation.data[i] === label) {
                        maskData[i] = 255;
                    }
                }

                const mask = new RawImage(maskData, segmentation.dims[1], segmentation.dims[0], 1);

                annotation.push({
                    score: null,
                    label: id2label[label],
                    mask: mask
                });
            }
        } else {
            throw Error(`Subtask ${subtask} not supported.`);
        }
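A hedged sketch of what this new branch enables through the high-level API. The model id is the one exercised in the tests further down, and an ONNX export is assumed to be available:

```js
import { pipeline } from '@xenova/transformers';

// Model id taken from the test suite below; an ONNX export is assumed to exist.
const segmenter = await pipeline('image-segmentation', 'mattmdjaga/segformer_b2_clothes');
const outputs = await segmenter('https://example.com/person.jpg'); // placeholder URL

// For the semantic subtask, each entry has the shape produced above:
// { score: null, label: 'Pants', mask: RawImage { width, height, channels: 1 } }
```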
@@ -2488,7 +2507,7 @@ const SUPPORTED_TASKS = {
    "image-segmentation": {
        // no tokenizer
        "pipeline": ImageSegmentationPipeline,
-       "model": AutoModelForImageSegmentation,
+       "model": [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation],
        "processor": AutoProcessor,
        "default": {
            // TODO: replace with original
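Changing `"model"` from a single class to an array lets the task factory fall back across auto classes, so DETR-style checkpoints still resolve via `AutoModelForImageSegmentation` while SegFormer checkpoints resolve via `AutoModelForSemanticSegmentation`.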
@@ -609,6 +609,71 @@ export class ImageFeatureExtractor extends FeatureExtractor {

}

export class SegformerFeatureExtractor extends ImageFeatureExtractor {

    /**
     * Converts the output of `SegformerForSemanticSegmentation` into semantic segmentation maps.
     * @param {*} outputs Raw outputs of the model.
     * @param {number[][]} [target_sizes=null] List of tuples corresponding to the requested final size
     * (height, width) of each prediction. If unset, predictions will not be resized.
     * @returns {{segmentation: Tensor; labels: number[]}[]} The semantic segmentation maps.
     */
    post_process_semantic_segmentation(outputs, target_sizes = null) {

        const logits = outputs.logits;
        const batch_size = logits.dims[0];

        if (target_sizes !== null && target_sizes.length !== batch_size) {
            throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits")
        }

        const toReturn = [];
        for (let i = 0; i < batch_size; ++i) {
            const target_size = target_sizes !== null ? target_sizes[i] : null;

            let data = logits[i];

            // 1. If target_size is not null, we need to resize the masks to the target size
            if (target_size !== null) {
                // resize the masks to the target size
                data = interpolate(data, target_size, 'bilinear', false);
            }
            const [height, width] = target_size ?? data.dims.slice(-2);

            const segmentation = new Tensor(
                'int32',
                new Int32Array(height * width),
                [height, width]
            );

            // Buffer to store current largest value
            const buffer = data[0].data;
            for (let j = 1; j < data.dims[0]; ++j) {
                const row = data[j].data;
                for (let k = 0; k < row.length; ++k) {
                    if (row[k] > buffer[k]) {
                        buffer[k] = row[k];
                        segmentation.data[k] = j;
                    }
                }
            }

            // Store which objects have labels
            // This is much more efficient than creating a set of the final values
            const hasLabel = new Array(data.dims[0]);
            const out = segmentation.data;
            for (let j = 0; j < out.length; ++j) {
                const index = out[j];
                hasLabel[index] = index;
            }
            /** @type {number[]} The unique list of labels that were detected */
            const labels = hasLabel.filter(x => x !== undefined);

            toReturn.push({ segmentation, labels });
        }
        return toReturn;
    }
}
export class BitImageProcessor extends ImageFeatureExtractor { }
export class DPTFeatureExtractor extends ImageFeatureExtractor { }
export class GLPNFeatureExtractor extends ImageFeatureExtractor { }
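The nested loop in `post_process_semantic_segmentation` above is a running per-pixel argmax over the class dimension of the logits. A standalone sketch of the same idea on plain typed arrays (all names illustrative, not the library implementation):

```js
// Per-pixel argmax over [num_classes][height * width] logits; a simplified
// restatement of the loop above, not the library implementation.
function argmaxSegmentation(logits) {
    const numClasses = logits.length;
    const numPixels = logits[0].length;
    const segmentation = new Int32Array(numPixels); // best class per pixel (initially class 0)
    const best = Float32Array.from(logits[0]);      // running maximum score per pixel
    for (let c = 1; c < numClasses; ++c) {
        for (let p = 0; p < numPixels; ++p) {
            if (logits[c][p] > best[p]) {
                best[p] = logits[c][p];
                segmentation[p] = c;
            }
        }
    }
    return segmentation;
}

// Example: 2 classes over a 1x2 "image" -> pixel 0 is class 1, pixel 1 is class 0.
console.log(argmaxSegmentation([[0.1, 0.9], [0.8, 0.2]])); // Int32Array [1, 0]
```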
@@ -1699,6 +1764,7 @@ export class AutoProcessor {
        ChineseCLIPFeatureExtractor,
        ConvNextFeatureExtractor,
        ConvNextImageProcessor,
        SegformerFeatureExtractor,
        BitImageProcessor,
        DPTFeatureExtractor,
        GLPNFeatureExtractor,
@@ -1164,6 +1164,7 @@ describe('Pipelines', () => {
        // List all models which will be tested
        const models = [
            'facebook/detr-resnet-50-panoptic',
            'mattmdjaga/segformer_b2_clothes',
        ];

        it(models[0], async () => {
@@ -1195,6 +1196,47 @@ describe('Pipelines', () => {
            await segmenter.dispose();

        }, MAX_TEST_EXECUTION_TIME);

        it(models[1], async () => {
            let segmenter = await pipeline('image-segmentation', m(models[1]));
            let img = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/young-man-standing-and-leaning-on-car.jpg';

            // single
            {
                let outputs = await segmenter(img);

                let expected = [
                    { label: 'Background' },
                    { label: 'Hair' },
                    { label: 'Upper-clothes' },
                    { label: 'Pants' },
                    { label: 'Left-shoe' },
                    { label: 'Right-shoe' },
                    { label: 'Face' },
                    { label: 'Left-leg' },
                    { label: 'Right-leg' },
                    { label: 'Left-arm' },
                    { label: 'Right-arm' },
                ];

                let outputLabels = outputs.map(x => x.label);
                let expectedLabels = expected.map(x => x.label);

                expect(outputLabels).toHaveLength(expectedLabels.length);
                expect(outputLabels.sort()).toEqual(expectedLabels.sort())

                // check that all scores are null, and masks have correct dimensions
                for (let output of outputs) {
                    expect(output.score).toBeNull();
                    expect(output.mask.width).toEqual(970);
                    expect(output.mask.height).toEqual(1455);
                    expect(output.mask.channels).toEqual(1);
                }
            }

            await segmenter.dispose();

        }, MAX_TEST_EXECUTION_TIME);
    });

    describe('Zero-shot image classification', () => {