Skip to content

Evaluation

FacePairsBenchmark

Benchmark procedure for face recognition on matched and mismatched face pairs.

Procedure is based on the procedure described in the LFW dataset README file. It uses K-Fold cross-validation (k=10): use 90% of the data to select the best threshold for similarity score comparison, use the remaining 10% to evaluate the accuracy of the model with the selected threshold, and compute the mean and standard deviation of the accuracy across all folds.

Uses cosine similarity as the similarity measure. Computes accuracy: (TP + TN) / TOTAL

Source code in src/evaluation/face_pairs.py
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
class FacePairsBenchmark:
    """Benchmark procedure for face recognition on matched and mismatched face pairs.

    Procedure is based on the procedure described in LFW dataset README file.
    It uses K-Fold cross-validation (k=10):
    - Use 90% of the data to select the best threshold for similarity score comparison
    - Use the remaining 10% to evaluate the accuracy of the model with the selected threshold
    - Compute the mean and standard deviation of the accuracy across all folds

    Uses cosine similarity as the similarity measure.
    Computes accuracy: (TP + TN) / TOTAL
    """

    def __init__(
        self,
        model: nn.Module,
        dataset: FacePairsDataset,
        config: FacePairsBenchmarkConfig,
    ):
        """Store the components and put `model` on the configured device in eval mode."""
        self.model = model
        self.dataset = dataset
        self.config = config

        self.device = torch.device(config.device)
        self.model.to(self.device)
        self.model.eval()

    @classmethod
    def lfw(
        cls,
        model: nn.Module,
        config: FacePairsBenchmarkConfig,
        pairs_file: str = "data/lfw/pairs.csv",
        root_dir: str = "data/lfw_cropped",
    ):
        """Alternate constructor: benchmark over the LFW test pairs file."""
        bench_transform = cls._get_default_transform()

        dataset = LFWDataset.test_set_from_pairs_file(
            root_dir=root_dir,
            pairs_file=pairs_file,
            transform_1=bench_transform,
            transform_2=bench_transform,
        )
        return cls(model, dataset, config)

    @classmethod
    def rof_sunglasses(cls, model: nn.Module, config: FacePairsBenchmarkConfig):
        """Alternate constructor: benchmark over the ROF sunglasses pairs."""
        bench_transform = cls._get_default_transform()

        dataset = ROFDataset.sunglasses(transform=bench_transform)
        return cls(model, dataset, config)

    @classmethod
    def rof_masked(cls, model: nn.Module, config: FacePairsBenchmarkConfig):
        """Alternate constructor: benchmark over the ROF masked pairs."""
        bench_transform = cls._get_default_transform()
        dataset = ROFDataset.masked(transform=bench_transform)
        return cls(model, dataset, config)

    @staticmethod
    def _get_default_transform():
        """160x160 resize + scaling to roughly [-1, 1] (FaceNet-style input)."""
        return transforms.Compose(
            [
                transforms.Resize((160, 160)),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
            ]
        )

    def _get_data_loader(self):
        return DataLoader(
            self.dataset,
            batch_size=self.config.batch_size,
            num_workers=self.config.num_workers,
            shuffle=False,  # For deterministic results
        )

    def _compute_embeddings(self) -> tuple[Tensor, Tensor, Tensor]:
        """Embed every pair in the dataset.

        Returns L2-normalized embeddings for both images of each pair, plus the
        pair labels, all on CPU with shapes (N, embedding_size) / (N,).
        """
        n_examples = len(self.dataset)
        pair_classes = []
        embeddings_1 = []
        embeddings_2 = []

        data_loader = self._get_data_loader()
        with torch.no_grad():
            for img1, img2, label in data_loader:
                img1, img2 = img1.to(self.device), img2.to(self.device)

                emb1 = self.model(img1)  # Shape: (batch_size, embedding_size)
                emb2 = self.model(img2)  # Shape: (batch_size, embedding_size)
                # L2-normalize so cosine similarity reduces to a dot product.
                emb1 = nn.functional.normalize(emb1, p=2, dim=1)
                emb2 = nn.functional.normalize(emb2, p=2, dim=1)

                embeddings_1.append(emb1.cpu())
                embeddings_2.append(emb2.cpu())
                pair_classes.extend(label.numpy())

        embeddings_1 = torch.cat(embeddings_1, dim=0)
        embeddings_2 = torch.cat(embeddings_2, dim=0)
        pair_classes = torch.tensor(pair_classes, dtype=torch.long)

        assert embeddings_1.shape == (n_examples, self.config.embedding_size)
        assert embeddings_2.shape == (n_examples, self.config.embedding_size)
        assert pair_classes.shape == (n_examples,)

        return embeddings_1, embeddings_2, pair_classes

    def _find_best_threshold(
        self, embeddings_1: Tensor, embeddings_2: Tensor, labels: Tensor
    ) -> float:
        """Return the threshold in [-1, 1] that maximizes accuracy on these pairs.

        Similarities are computed once and compared against all 200 candidate
        thresholds in a single vectorized pass (previously they were recomputed
        for every threshold). Ties resolve to the lowest threshold, matching
        torch.argmax on the per-threshold accuracy list.
        """
        thresholds = torch.linspace(-1, 1, steps=200)
        similarities = nn.functional.cosine_similarity(embeddings_1, embeddings_2)

        # Shape (n_thresholds, n_pairs); predictions in {-1, 1} to match labels.
        predictions = (
            similarities.unsqueeze(0) >= thresholds.unsqueeze(1)
        ).float() * 2 - 1
        accuracies = (predictions == labels.unsqueeze(0)).float().mean(dim=1)

        best_accuracy_idx = torch.argmax(accuracies)
        best_threshold = thresholds[best_accuracy_idx]
        return best_threshold.item()

    def _compute_accuracy(
        self,
        embeddings_1: Tensor,
        embeddings_2: Tensor,
        labels: Tensor,
        threshold: float,
    ) -> float:
        """Accuracy of thresholded cosine similarity against the pair labels.

        Assumes labels use 1 for match and -1 for mismatch (same encoding the
        predictions are converted to below) — verify against the datasets.
        """
        similarities = nn.functional.cosine_similarity(embeddings_1, embeddings_2)
        predictions = (similarities >= threshold).float()  # 1 for match, 0 for mismatch
        predictions = predictions * 2 - 1  # Convert to 1 for match, -1 for mismatch
        accuracy = (predictions == labels).float().mean().item()
        return accuracy

    def evaluate(self) -> FacePairsBenchmarkResult:
        """Run the full K-fold protocol and return mean/std accuracy across folds."""
        embeddings_1, embeddings_2, pair_classes = self._compute_embeddings()

        # shuffle=False keeps folds deterministic and aligned with dataset order.
        kfold = KFold(n_splits=self.config.n_folds, shuffle=False)
        accuracies = []

        indices = torch.arange(len(self.dataset)).numpy()
        for train_index, test_index in kfold.split(indices):
            emb1_train, emb1_test = (
                embeddings_1[train_index],
                embeddings_1[test_index],
            )
            emb2_train, emb2_test = (
                embeddings_2[train_index],
                embeddings_2[test_index],
            )
            labels_train, labels_test = (
                pair_classes[train_index],
                pair_classes[test_index],
            )

            # Threshold chosen on the 90% split, scored on the held-out 10%.
            best_threshold = self._find_best_threshold(
                emb1_train, emb2_train, labels_train
            )
            accuracy = self._compute_accuracy(
                emb1_test, emb2_test, labels_test, best_threshold
            )
            accuracies.append(accuracy)

        return FacePairsBenchmarkResult(
            mean_accuracy=torch.tensor(accuracies).mean().item(),
            std_accuracy=torch.tensor(accuracies).std().item(),
        )

FaceRecognitionSystemFacenetPytorchAdapter

Bases: FaceRecognitionSystem

Adapter for compatibility with the rococo evaluation library.

For using models from the facenet-pytorch library. Allows the use of MTCNN for face detection and InceptionResnetV1 for computing embeddings.

Source code in src/evaluation/rococo.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
class FaceRecognitionSystemFacenetPytorchAdapter(FaceRecognitionSystem):
    """Adapter for compatibility with the rococo evaluation library.

    Wraps models from the facenet-pytorch library: an MTCNN detector crops
    the face, then an InceptionResnetV1 network embeds it.
    """

    def __init__(self, facenet: nn.Module, detector: nn.Module, device: torch.device):
        self.facenet = facenet
        self.detector = detector
        self.device = device

        # Inference only: move both networks to the target device in eval mode.
        self.facenet.to(device).eval()
        self.detector.to(device).eval()

    def feature_vector_length(self) -> int:
        # Embedding dimensionality of the wrapped facenet model.
        return 512

    def compute_feature_vector(self, image: ndarray) -> ndarray:
        pil_image = self.cv2_to_pil(image)
        with torch.no_grad():
            face = self.detector(pil_image)
            if face is None:
                raise NoFaceDetectedException("No face detected in the image.")

            batch = face.unsqueeze(0)  # Add batch dimension
            batch = batch.to(self.device)
            embedding = self.facenet(batch)
            # Drop the batch dimension and hand back a plain numpy vector.
            return embedding.squeeze(0).cpu().numpy()

    @staticmethod
    def cv2_to_pil(image: ndarray) -> Image.Image:
        """
        Converts a cv2 image (BGR numpy array) to an RGB PIL image.
        """
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        return Image.fromarray(rgb)

cv2_to_pil(image) staticmethod

Converts a cv2 image (numpy array) to a PIL image.

Source code in src/evaluation/rococo.py
48
49
50
51
52
53
@staticmethod
def cv2_to_pil(image: ndarray) -> Image.Image:
    """
    Converts a cv2 image (BGR numpy array) to an RGB PIL image.
    """
    return Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

face_pairs

FacePairsBenchmark

Benchmark procedure for face recognition on matched and mismatched face pairs.

Procedure is based on the procedure described in the LFW dataset README file. It uses K-Fold cross-validation (k=10): use 90% of the data to select the best threshold for similarity score comparison, use the remaining 10% to evaluate the accuracy of the model with the selected threshold, and compute the mean and standard deviation of the accuracy across all folds.

Uses cosine similarity as the similarity measure. Computes accuracy: (TP + TN) / TOTAL

Source code in src/evaluation/face_pairs.py
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
class FacePairsBenchmark:
    """Benchmark procedure for face recognition on matched and mismatched face pairs.

    Procedure is based on the procedure described in LFW dataset README file.
    It uses K-Fold cross-validation (k=10):
    - Use 90% of the data to select the best threshold for similarity score comparison
    - Use the remaining 10% to evaluate the accuracy of the model with the selected threshold
    - Compute the mean and standard deviation of the accuracy across all folds

    Uses cosine similarity as the similarity measure.
    Computes accuracy: (TP + TN) / TOTAL
    """

    def __init__(
        self,
        model: nn.Module,
        dataset: FacePairsDataset,
        config: FacePairsBenchmarkConfig,
    ):
        """Store the components and put `model` on the configured device in eval mode."""
        self.model = model
        self.dataset = dataset
        self.config = config

        self.device = torch.device(config.device)
        self.model.to(self.device)
        self.model.eval()

    @classmethod
    def lfw(
        cls,
        model: nn.Module,
        config: FacePairsBenchmarkConfig,
        pairs_file: str = "data/lfw/pairs.csv",
        root_dir: str = "data/lfw_cropped",
    ):
        """Alternate constructor: benchmark over the LFW test pairs file."""
        bench_transform = cls._get_default_transform()

        dataset = LFWDataset.test_set_from_pairs_file(
            root_dir=root_dir,
            pairs_file=pairs_file,
            transform_1=bench_transform,
            transform_2=bench_transform,
        )
        return cls(model, dataset, config)

    @classmethod
    def rof_sunglasses(cls, model: nn.Module, config: FacePairsBenchmarkConfig):
        """Alternate constructor: benchmark over the ROF sunglasses pairs."""
        bench_transform = cls._get_default_transform()

        dataset = ROFDataset.sunglasses(transform=bench_transform)
        return cls(model, dataset, config)

    @classmethod
    def rof_masked(cls, model: nn.Module, config: FacePairsBenchmarkConfig):
        """Alternate constructor: benchmark over the ROF masked pairs."""
        bench_transform = cls._get_default_transform()
        dataset = ROFDataset.masked(transform=bench_transform)
        return cls(model, dataset, config)

    @staticmethod
    def _get_default_transform():
        """160x160 resize + scaling to roughly [-1, 1] (FaceNet-style input)."""
        return transforms.Compose(
            [
                transforms.Resize((160, 160)),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
            ]
        )

    def _get_data_loader(self):
        return DataLoader(
            self.dataset,
            batch_size=self.config.batch_size,
            num_workers=self.config.num_workers,
            shuffle=False,  # For deterministic results
        )

    def _compute_embeddings(self) -> tuple[Tensor, Tensor, Tensor]:
        """Embed every pair in the dataset.

        Returns L2-normalized embeddings for both images of each pair, plus the
        pair labels, all on CPU with shapes (N, embedding_size) / (N,).
        """
        n_examples = len(self.dataset)
        pair_classes = []
        embeddings_1 = []
        embeddings_2 = []

        data_loader = self._get_data_loader()
        with torch.no_grad():
            for img1, img2, label in data_loader:
                img1, img2 = img1.to(self.device), img2.to(self.device)

                emb1 = self.model(img1)  # Shape: (batch_size, embedding_size)
                emb2 = self.model(img2)  # Shape: (batch_size, embedding_size)
                # L2-normalize so cosine similarity reduces to a dot product.
                emb1 = nn.functional.normalize(emb1, p=2, dim=1)
                emb2 = nn.functional.normalize(emb2, p=2, dim=1)

                embeddings_1.append(emb1.cpu())
                embeddings_2.append(emb2.cpu())
                pair_classes.extend(label.numpy())

        embeddings_1 = torch.cat(embeddings_1, dim=0)
        embeddings_2 = torch.cat(embeddings_2, dim=0)
        pair_classes = torch.tensor(pair_classes, dtype=torch.long)

        assert embeddings_1.shape == (n_examples, self.config.embedding_size)
        assert embeddings_2.shape == (n_examples, self.config.embedding_size)
        assert pair_classes.shape == (n_examples,)

        return embeddings_1, embeddings_2, pair_classes

    def _find_best_threshold(
        self, embeddings_1: Tensor, embeddings_2: Tensor, labels: Tensor
    ) -> float:
        """Return the threshold in [-1, 1] that maximizes accuracy on these pairs.

        Similarities are computed once and compared against all 200 candidate
        thresholds in a single vectorized pass (previously they were recomputed
        for every threshold). Ties resolve to the lowest threshold, matching
        torch.argmax on the per-threshold accuracy list.
        """
        thresholds = torch.linspace(-1, 1, steps=200)
        similarities = nn.functional.cosine_similarity(embeddings_1, embeddings_2)

        # Shape (n_thresholds, n_pairs); predictions in {-1, 1} to match labels.
        predictions = (
            similarities.unsqueeze(0) >= thresholds.unsqueeze(1)
        ).float() * 2 - 1
        accuracies = (predictions == labels.unsqueeze(0)).float().mean(dim=1)

        best_accuracy_idx = torch.argmax(accuracies)
        best_threshold = thresholds[best_accuracy_idx]
        return best_threshold.item()

    def _compute_accuracy(
        self,
        embeddings_1: Tensor,
        embeddings_2: Tensor,
        labels: Tensor,
        threshold: float,
    ) -> float:
        """Accuracy of thresholded cosine similarity against the pair labels.

        Assumes labels use 1 for match and -1 for mismatch (same encoding the
        predictions are converted to below) — verify against the datasets.
        """
        similarities = nn.functional.cosine_similarity(embeddings_1, embeddings_2)
        predictions = (similarities >= threshold).float()  # 1 for match, 0 for mismatch
        predictions = predictions * 2 - 1  # Convert to 1 for match, -1 for mismatch
        accuracy = (predictions == labels).float().mean().item()
        return accuracy

    def evaluate(self) -> FacePairsBenchmarkResult:
        """Run the full K-fold protocol and return mean/std accuracy across folds."""
        embeddings_1, embeddings_2, pair_classes = self._compute_embeddings()

        # shuffle=False keeps folds deterministic and aligned with dataset order.
        kfold = KFold(n_splits=self.config.n_folds, shuffle=False)
        accuracies = []

        indices = torch.arange(len(self.dataset)).numpy()
        for train_index, test_index in kfold.split(indices):
            emb1_train, emb1_test = (
                embeddings_1[train_index],
                embeddings_1[test_index],
            )
            emb2_train, emb2_test = (
                embeddings_2[train_index],
                embeddings_2[test_index],
            )
            labels_train, labels_test = (
                pair_classes[train_index],
                pair_classes[test_index],
            )

            # Threshold chosen on the 90% split, scored on the held-out 10%.
            best_threshold = self._find_best_threshold(
                emb1_train, emb2_train, labels_train
            )
            accuracy = self._compute_accuracy(
                emb1_test, emb2_test, labels_test, best_threshold
            )
            accuracies.append(accuracy)

        return FacePairsBenchmarkResult(
            mean_accuracy=torch.tensor(accuracies).mean().item(),
            std_accuracy=torch.tensor(accuracies).std().item(),
        )

rococo

Utilities for the rococo evaluation procedure.

FaceRecognitionSystemFacenetPytorchAdapter

Bases: FaceRecognitionSystem

Adapter for compatibility with the rococo evaluation library.

For using models from the facenet-pytorch library. Allows the use of MTCNN for face detection and InceptionResnetV1 for computing embeddings.

Source code in src/evaluation/rococo.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
class FaceRecognitionSystemFacenetPytorchAdapter(FaceRecognitionSystem):
    """Adapter for compatibility with the rococo evaluation library.

    Wraps models from the facenet-pytorch library: an MTCNN detector crops
    the face, then an InceptionResnetV1 network embeds it.
    """

    def __init__(self, facenet: nn.Module, detector: nn.Module, device: torch.device):
        self.facenet = facenet
        self.detector = detector
        self.device = device

        # Inference only: move both networks to the target device in eval mode.
        self.facenet.to(device).eval()
        self.detector.to(device).eval()

    def feature_vector_length(self) -> int:
        # Embedding dimensionality of the wrapped facenet model.
        return 512

    def compute_feature_vector(self, image: ndarray) -> ndarray:
        pil_image = self.cv2_to_pil(image)
        with torch.no_grad():
            face = self.detector(pil_image)
            if face is None:
                raise NoFaceDetectedException("No face detected in the image.")

            batch = face.unsqueeze(0)  # Add batch dimension
            batch = batch.to(self.device)
            embedding = self.facenet(batch)
            # Drop the batch dimension and hand back a plain numpy vector.
            return embedding.squeeze(0).cpu().numpy()

    @staticmethod
    def cv2_to_pil(image: ndarray) -> Image.Image:
        """
        Converts a cv2 image (BGR numpy array) to an RGB PIL image.
        """
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        return Image.fromarray(rgb)

cv2_to_pil(image) staticmethod

Converts a cv2 image (numpy array) to a PIL image.

Source code in src/evaluation/rococo.py
48
49
50
51
52
53
@staticmethod
def cv2_to_pil(image: ndarray) -> Image.Image:
    """
    Converts a cv2 image (BGR numpy array) to an RGB PIL image.
    """
    return Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))