Add `use_square_size` (the `default_to_square` value used for `size`) to `CLIPImageProcessor` (#26965)

* fix

* fix

* fix

* fix

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
Yih-Dar 2023-10-24 11:08:17 +02:00 committed by GitHub
parent cc7803c0a6
commit fc142bd775
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 78 additions and 24 deletions

View File

@ -84,6 +84,10 @@ class BitImageProcessor(BaseImageProcessor):
Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_rgb (`bool`, *optional*, defaults to `True`):
Whether to convert the image to RGB.
use_square_size (`bool`, *optional*, defaults to `False`):
Value passed as `default_to_square` to `get_size_dict` (and to `get_resize_output_image_size` in
`resize`) when resolving `size`. It only matters when `size` is given as an `int`, in which case it
determines whether that `int` defaults to a square image or not. This attribute has no effect on
`crop_size`, which is resolved by a separate `get_size_dict` call.
"""
model_input_names = ["pixel_values"]
@ -101,11 +105,12 @@ class BitImageProcessor(BaseImageProcessor):
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_rgb: bool = True,
use_square_size: bool = False,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"shortest_edge": 224}
size = get_size_dict(size, default_to_square=False)
size = get_size_dict(size, default_to_square=use_square_size)
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
@ -120,6 +125,7 @@ class BitImageProcessor(BaseImageProcessor):
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
self.do_convert_rgb = do_convert_rgb
self.use_square_size = use_square_size
# Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize
def resize(
@ -147,11 +153,14 @@ class BitImageProcessor(BaseImageProcessor):
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
size = get_size_dict(size, default_to_square=False)
size = get_size_dict(size, default_to_square=self.use_square_size)
if "shortest_edge" not in size:
raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
output_size = get_resize_output_image_size(
image, size=size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
image,
size=size["shortest_edge"],
default_to_square=self.use_square_size,
input_data_format=input_data_format,
)
return resize(
image,
@ -234,7 +243,7 @@ class BitImageProcessor(BaseImageProcessor):
"""
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
size = get_size_dict(size, param_name="size", default_to_square=False)
size = get_size_dict(size, param_name="size", default_to_square=self.use_square_size)
resample = resample if resample is not None else self.resample
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
crop_size = crop_size if crop_size is not None else self.crop_size

View File

@ -84,6 +84,10 @@ class CLIPImageProcessor(BaseImageProcessor):
Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_rgb (`bool`, *optional*, defaults to `True`):
Whether to convert the image to RGB.
use_square_size (`bool`, *optional*, defaults to `False`):
Value passed as `default_to_square` to `get_size_dict` (and to `get_resize_output_image_size` in
`resize`) when resolving `size`. It only matters when `size` is given as an `int`, in which case it
determines whether that `int` defaults to a square image or not. This attribute has no effect on
`crop_size`, which is resolved by a separate `get_size_dict` call.
"""
model_input_names = ["pixel_values"]
@ -101,11 +105,12 @@ class CLIPImageProcessor(BaseImageProcessor):
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_rgb: bool = True,
use_square_size: bool = False,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"shortest_edge": 224}
size = get_size_dict(size, default_to_square=False)
size = get_size_dict(size, default_to_square=use_square_size)
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
@ -120,6 +125,7 @@ class CLIPImageProcessor(BaseImageProcessor):
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
self.do_convert_rgb = do_convert_rgb
self.use_square_size = use_square_size
def resize(
self,
@ -146,11 +152,14 @@ class CLIPImageProcessor(BaseImageProcessor):
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
size = get_size_dict(size, default_to_square=False)
size = get_size_dict(size, default_to_square=self.use_square_size)
if "shortest_edge" not in size:
raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
output_size = get_resize_output_image_size(
image, size=size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
image,
size=size["shortest_edge"],
default_to_square=self.use_square_size,
input_data_format=input_data_format,
)
return resize(
image,
@ -233,7 +242,7 @@ class CLIPImageProcessor(BaseImageProcessor):
"""
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
size = get_size_dict(size, param_name="size", default_to_square=False)
size = get_size_dict(size, param_name="size", default_to_square=self.use_square_size)
resample = resample if resample is not None else self.resample
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
crop_size = crop_size if crop_size is not None else self.crop_size

View File

@ -79,6 +79,10 @@ class MobileNetV1ImageProcessor(BaseImageProcessor):
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
use_square_size (`bool`, *optional*, defaults to `False`):
Value passed as `default_to_square` to `get_size_dict` (and to `get_resize_output_image_size` in
`resize`) when resolving `size`. It only matters when `size` is given as an `int`, in which case it
determines whether that `int` defaults to a square image or not. This attribute has no effect on
`crop_size`, which is resolved by a separate `get_size_dict` call.
"""
model_input_names = ["pixel_values"]
@ -95,11 +99,12 @@ class MobileNetV1ImageProcessor(BaseImageProcessor):
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
use_square_size: bool = False,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"shortest_edge": 256}
size = get_size_dict(size, default_to_square=False)
size = get_size_dict(size, default_to_square=use_square_size)
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
crop_size = get_size_dict(crop_size)
self.do_resize = do_resize
@ -112,6 +117,7 @@ class MobileNetV1ImageProcessor(BaseImageProcessor):
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self.use_square_size = use_square_size
# Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize
def resize(
@ -139,11 +145,14 @@ class MobileNetV1ImageProcessor(BaseImageProcessor):
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
size = get_size_dict(size, default_to_square=False)
size = get_size_dict(size, default_to_square=self.use_square_size)
if "shortest_edge" not in size:
raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
output_size = get_resize_output_image_size(
image, size=size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
image,
size=size["shortest_edge"],
default_to_square=self.use_square_size,
input_data_format=input_data_format,
)
return resize(
image,
@ -222,7 +231,7 @@ class MobileNetV1ImageProcessor(BaseImageProcessor):
"""
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
size = get_size_dict(size, default_to_square=False)
size = get_size_dict(size, default_to_square=self.use_square_size)
resample = resample if resample is not None else self.resample
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
crop_size = crop_size if crop_size is not None else self.crop_size

View File

@ -83,6 +83,10 @@ class MobileNetV2ImageProcessor(BaseImageProcessor):
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
use_square_size (`bool`, *optional*, defaults to `False`):
Value passed as `default_to_square` to `get_size_dict` (and to `get_resize_output_image_size` in
`resize`) when resolving `size`. It only matters when `size` is given as an `int`, in which case it
determines whether that `int` defaults to a square image or not. This attribute has no effect on
`crop_size`, which is resolved by a separate `get_size_dict` call.
"""
model_input_names = ["pixel_values"]
@ -99,11 +103,12 @@ class MobileNetV2ImageProcessor(BaseImageProcessor):
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
use_square_size: bool = False,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"shortest_edge": 256}
size = get_size_dict(size, default_to_square=False)
size = get_size_dict(size, default_to_square=use_square_size)
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
crop_size = get_size_dict(crop_size, param_name="crop_size")
self.do_resize = do_resize
@ -116,6 +121,7 @@ class MobileNetV2ImageProcessor(BaseImageProcessor):
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self.use_square_size = use_square_size
# Copied from transformers.models.mobilenet_v1.image_processing_mobilenet_v1.MobileNetV1ImageProcessor.resize
def resize(
@ -143,11 +149,14 @@ class MobileNetV2ImageProcessor(BaseImageProcessor):
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
size = get_size_dict(size, default_to_square=False)
size = get_size_dict(size, default_to_square=self.use_square_size)
if "shortest_edge" not in size:
raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
output_size = get_resize_output_image_size(
image, size=size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
image,
size=size["shortest_edge"],
default_to_square=self.use_square_size,
input_data_format=input_data_format,
)
return resize(
image,
@ -226,7 +235,7 @@ class MobileNetV2ImageProcessor(BaseImageProcessor):
"""
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
size = get_size_dict(size, default_to_square=False)
size = get_size_dict(size, default_to_square=self.use_square_size)
resample = resample if resample is not None else self.resample
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
crop_size = crop_size if crop_size is not None else self.crop_size

View File

@ -78,6 +78,10 @@ class MobileViTImageProcessor(BaseImageProcessor):
do_flip_channel_order (`bool`, *optional*, defaults to `True`):
Whether to flip the color channels from RGB to BGR. Can be overridden by the `do_flip_channel_order`
parameter in the `preprocess` method.
use_square_size (`bool`, *optional*, defaults to `False`):
Value passed as `default_to_square` to `get_size_dict` (and to `get_resize_output_image_size` in
`resize`) when resolving `size`. It only matters when `size` is given as an `int`, in which case it
determines whether that `int` defaults to a square image or not. This attribute has no effect on
`crop_size`, which is resolved by a separate `get_size_dict` call.
"""
model_input_names = ["pixel_values"]
@ -92,11 +96,12 @@ class MobileViTImageProcessor(BaseImageProcessor):
do_center_crop: bool = True,
crop_size: Dict[str, int] = None,
do_flip_channel_order: bool = True,
use_square_size: bool = False,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"shortest_edge": 224}
size = get_size_dict(size, default_to_square=False)
size = get_size_dict(size, default_to_square=use_square_size)
crop_size = crop_size if crop_size is not None else {"height": 256, "width": 256}
crop_size = get_size_dict(crop_size, param_name="crop_size")
@ -108,6 +113,7 @@ class MobileViTImageProcessor(BaseImageProcessor):
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_flip_channel_order = do_flip_channel_order
self.use_square_size = use_square_size
# Copied from transformers.models.mobilenet_v1.image_processing_mobilenet_v1.MobileNetV1ImageProcessor.resize with PILImageResampling.BICUBIC->PILImageResampling.BILINEAR
def resize(
@ -135,11 +141,14 @@ class MobileViTImageProcessor(BaseImageProcessor):
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
size = get_size_dict(size, default_to_square=False)
size = get_size_dict(size, default_to_square=self.use_square_size)
if "shortest_edge" not in size:
raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
output_size = get_resize_output_image_size(
image, size=size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
image,
size=size["shortest_edge"],
default_to_square=self.use_square_size,
input_data_format=input_data_format,
)
return resize(
image,
@ -237,7 +246,7 @@ class MobileViTImageProcessor(BaseImageProcessor):
)
size = size if size is not None else self.size
size = get_size_dict(size, default_to_square=False)
size = get_size_dict(size, default_to_square=self.use_square_size)
crop_size = crop_size if crop_size is not None else self.crop_size
crop_size = get_size_dict(crop_size, param_name="crop_size")

View File

@ -84,6 +84,10 @@ class ViTHybridImageProcessor(BaseImageProcessor):
Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_rgb (`bool`, *optional*, defaults to `True`):
Whether to convert the image to RGB.
use_square_size (`bool`, *optional*, defaults to `False`):
Value passed as `default_to_square` to `get_size_dict` (and to `get_resize_output_image_size` in
`resize`) when resolving `size`. It only matters when `size` is given as an `int`, in which case it
determines whether that `int` defaults to a square image or not. This attribute has no effect on
`crop_size`, which is resolved by a separate `get_size_dict` call.
"""
model_input_names = ["pixel_values"]
@ -101,11 +105,12 @@ class ViTHybridImageProcessor(BaseImageProcessor):
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_rgb: bool = True,
use_square_size: bool = False,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"shortest_edge": 224}
size = get_size_dict(size, default_to_square=False)
size = get_size_dict(size, default_to_square=use_square_size)
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
@ -120,6 +125,7 @@ class ViTHybridImageProcessor(BaseImageProcessor):
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
self.do_convert_rgb = do_convert_rgb
self.use_square_size = use_square_size
# Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize
def resize(
@ -147,11 +153,14 @@ class ViTHybridImageProcessor(BaseImageProcessor):
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
size = get_size_dict(size, default_to_square=False)
size = get_size_dict(size, default_to_square=self.use_square_size)
if "shortest_edge" not in size:
raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
output_size = get_resize_output_image_size(
image, size=size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
image,
size=size["shortest_edge"],
default_to_square=self.use_square_size,
input_data_format=input_data_format,
)
return resize(
image,
@ -234,7 +243,7 @@ class ViTHybridImageProcessor(BaseImageProcessor):
"""
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
size = get_size_dict(size, param_name="size", default_to_square=False)
size = get_size_dict(size, param_name="size", default_to_square=self.use_square_size)
resample = resample if resample is not None else self.resample
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
crop_size = crop_size if crop_size is not None else self.crop_size