Fix OWLv2 post_process_object_detection for multiple images (#31082)

* Add test for multiple images

* [run slow] owlv2

* Fix box rescaling

* [run slow] owlv2
This commit is contained in:
Pavel Iakubovskii 2024-05-28 11:06:06 +00:00 committed by GitHub
parent c31473ed44
commit 98e2d48e9a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 35 additions and 18 deletions

View File

@ -524,19 +524,11 @@ class Owlv2ImageProcessor(BaseImageProcessor):
else:
img_h, img_w = target_sizes.unbind(1)
# rescale coordinates
width_ratio = 1
height_ratio = 1
# Rescale coordinates: the image is padded to a square for inference,
# so boxes must be scaled by the larger of its height and width.
size = torch.max(img_h, img_w)
scale_fct = torch.stack([size, size, size, size], dim=1).to(boxes.device)
if img_w < img_h:
width_ratio = img_w / img_h
elif img_h < img_w:
height_ratio = img_h / img_w
img_w = img_w / width_ratio
img_h = img_h / height_ratio
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
boxes = boxes * scale_fct[:, None, :]
results = []

View File

@ -130,17 +130,42 @@ class Owlv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
model = Owlv2ForObjectDetection.from_pretrained(checkpoint)
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = processor(text=["cat"], images=image, return_tensors="pt")
text = ["cat"]
target_size = image.size[::-1]
expected_boxes = torch.tensor(
[
[341.66656494140625, 23.38756561279297, 642.321044921875, 371.3482971191406],
[6.753320693969727, 51.96149826049805, 326.61810302734375, 473.12982177734375],
]
)
# single image
inputs = processor(text=[text], images=[image], return_tensors="pt")
with torch.no_grad():
outputs = model(**inputs)
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, threshold=0.2, target_sizes=target_sizes)[0]
results = processor.post_process_object_detection(outputs, threshold=0.2, target_sizes=[target_size])[0]
boxes = results["boxes"].tolist()
self.assertEqual(boxes[0], [341.66656494140625, 23.38756561279297, 642.321044921875, 371.3482971191406])
self.assertEqual(boxes[1], [6.753320693969727, 51.96149826049805, 326.61810302734375, 473.12982177734375])
boxes = results["boxes"]
self.assertTrue(
torch.allclose(boxes, expected_boxes, atol=1e-2),
f"Single image bounding boxes fail. Expected {expected_boxes}, got {boxes}",
)
# batch of images
inputs = processor(text=[text, text], images=[image, image], return_tensors="pt")
with torch.no_grad():
outputs = model(**inputs)
results = processor.post_process_object_detection(
outputs, threshold=0.2, target_sizes=[target_size, target_size]
)
for result in results:
boxes = result["boxes"]
self.assertTrue(
torch.allclose(boxes, expected_boxes, atol=1e-2),
f"Batch image bounding boxes fail. Expected {expected_boxes}, got {boxes}",
)
@unittest.skip("OWLv2 doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy
def test_call_numpy_4_channels(self):