Fix OWLv2 post_process_object_detection for multiple images (#31082)
* Add test for multiple images * [run slow] owlv2 * Fix box rescaling * [run slow] owlv2
This commit is contained in:
parent
c31473ed44
commit
98e2d48e9a
|
@ -524,19 +524,11 @@ class Owlv2ImageProcessor(BaseImageProcessor):
|
||||||
else:
|
else:
|
||||||
img_h, img_w = target_sizes.unbind(1)
|
img_h, img_w = target_sizes.unbind(1)
|
||||||
|
|
||||||
# rescale coordinates
|
# Rescale coordinates, image is padded to square for inference,
|
||||||
width_ratio = 1
|
# that is why we need to scale boxes to the max size
|
||||||
height_ratio = 1
|
size = torch.max(img_h, img_w)
|
||||||
|
scale_fct = torch.stack([size, size, size, size], dim=1).to(boxes.device)
|
||||||
|
|
||||||
if img_w < img_h:
|
|
||||||
width_ratio = img_w / img_h
|
|
||||||
elif img_h < img_w:
|
|
||||||
height_ratio = img_h / img_w
|
|
||||||
|
|
||||||
img_w = img_w / width_ratio
|
|
||||||
img_h = img_h / height_ratio
|
|
||||||
|
|
||||||
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
|
|
||||||
boxes = boxes * scale_fct[:, None, :]
|
boxes = boxes * scale_fct[:, None, :]
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
|
|
|
@ -130,17 +130,42 @@ class Owlv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||||
model = Owlv2ForObjectDetection.from_pretrained(checkpoint)
|
model = Owlv2ForObjectDetection.from_pretrained(checkpoint)
|
||||||
|
|
||||||
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
|
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
|
||||||
inputs = processor(text=["cat"], images=image, return_tensors="pt")
|
text = ["cat"]
|
||||||
|
target_size = image.size[::-1]
|
||||||
|
expected_boxes = torch.tensor(
|
||||||
|
[
|
||||||
|
[341.66656494140625, 23.38756561279297, 642.321044921875, 371.3482971191406],
|
||||||
|
[6.753320693969727, 51.96149826049805, 326.61810302734375, 473.12982177734375],
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
# single image
|
||||||
|
inputs = processor(text=[text], images=[image], return_tensors="pt")
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
|
|
||||||
target_sizes = torch.tensor([image.size[::-1]])
|
results = processor.post_process_object_detection(outputs, threshold=0.2, target_sizes=[target_size])[0]
|
||||||
results = processor.post_process_object_detection(outputs, threshold=0.2, target_sizes=target_sizes)[0]
|
|
||||||
|
|
||||||
boxes = results["boxes"].tolist()
|
boxes = results["boxes"]
|
||||||
self.assertEqual(boxes[0], [341.66656494140625, 23.38756561279297, 642.321044921875, 371.3482971191406])
|
self.assertTrue(
|
||||||
self.assertEqual(boxes[1], [6.753320693969727, 51.96149826049805, 326.61810302734375, 473.12982177734375])
|
torch.allclose(boxes, expected_boxes, atol=1e-2),
|
||||||
|
f"Single image bounding boxes fail. Expected {expected_boxes}, got {boxes}",
|
||||||
|
)
|
||||||
|
|
||||||
|
# batch of images
|
||||||
|
inputs = processor(text=[text, text], images=[image, image], return_tensors="pt")
|
||||||
|
with torch.no_grad():
|
||||||
|
outputs = model(**inputs)
|
||||||
|
results = processor.post_process_object_detection(
|
||||||
|
outputs, threshold=0.2, target_sizes=[target_size, target_size]
|
||||||
|
)
|
||||||
|
|
||||||
|
for result in results:
|
||||||
|
boxes = result["boxes"]
|
||||||
|
self.assertTrue(
|
||||||
|
torch.allclose(boxes, expected_boxes, atol=1e-2),
|
||||||
|
f"Batch image bounding boxes fail. Expected {expected_boxes}, got {boxes}",
|
||||||
|
)
|
||||||
|
|
||||||
@unittest.skip("OWLv2 doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy
|
@unittest.skip("OWLv2 doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy
|
||||||
def test_call_numpy_4_channels(self):
|
def test_call_numpy_4_channels(self):
|
||||||
|
|
Loading…
Reference in New Issue