Skip to content

Evaluation

FacePairsBenchmark

Benchmark procedure for face recognition on matched and mismatched face pairs.

Procedure is based on the procedure described in the LFW dataset README file. It uses K-Fold cross-validation (k=10): use 90% of the data to select the best threshold for similarity score comparison, use the remaining 10% to evaluate the accuracy of the model with the selected threshold, and compute the mean and standard deviation of the accuracy across all folds.

Uses cosine similarity as the similarity measure. Computes accuracy: (TP + TN) / TOTAL

Source code in src/evaluation/face_pairs.py
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
class FacePairsBenchmark:
    """Benchmark procedure for face recognition on matched and mismatched face pairs.

    Procedure is based on the procedure described in LFW dataset README file.
    It uses K-Fold cross-validation (k=10):
    - Use 90% of the data to select the best threshold for similarity score comparison
    - Use the remaining 10% to evaluate the accuracy of the model with the selected threshold
    - Compute the mean and standard deviation of the accuracy across all folds

    Uses cosine similarity as the similarity measure.
    Computes accuracy: (TP + TN) / TOTAL
    """

    def __init__(
        self,
        model: nn.Module,
        dataset: FacePairsDataset,
        config: FacePairsBenchmarkConfig,
    ):
        """Store the components and put `model` on the configured device in eval mode."""
        self.model = model
        self.dataset = dataset
        self.config = config

        self.device = torch.device(config.device)
        self.model.to(self.device)
        self.model.eval()

    @classmethod
    def lfw(
        cls,
        model: nn.Module,
        config: FacePairsBenchmarkConfig,
        pairs_file: str = "data/lfw/pairs.csv",
        root_dir: str = "data/lfw_cropped",
    ):
        """Alternate constructor: benchmark over the LFW test pairs file."""
        bench_transform = cls._get_default_transform()

        dataset = LFWDataset.test_set_from_pairs_file(
            root_dir=root_dir,
            pairs_file=pairs_file,
            transform_1=bench_transform,
            transform_2=bench_transform,
        )
        return cls(model, dataset, config)

    @classmethod
    def rof_sunglasses(cls, model: nn.Module, config: FacePairsBenchmarkConfig):
        """Alternate constructor: benchmark over the ROF sunglasses pairs."""
        bench_transform = cls._get_default_transform()

        dataset = ROFDataset.sunglasses(transform=bench_transform)
        return cls(model, dataset, config)

    @classmethod
    def rof_masked(cls, model: nn.Module, config: FacePairsBenchmarkConfig):
        """Alternate constructor: benchmark over the ROF masked pairs."""
        bench_transform = cls._get_default_transform()
        dataset = ROFDataset.masked(transform=bench_transform)
        return cls(model, dataset, config)

    @staticmethod
    def _get_default_transform():
        """160x160 resize + scaling to roughly [-1, 1] (FaceNet-style input)."""
        return transforms.Compose(
            [
                transforms.Resize((160, 160)),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
            ]
        )

    def _get_data_loader(self):
        return DataLoader(
            self.dataset,
            batch_size=self.config.batch_size,
            num_workers=self.config.num_workers,
            shuffle=False,  # For deterministic results
        )

    def _compute_embeddings(self) -> tuple[Tensor, Tensor, Tensor]:
        """Embed every pair in the dataset.

        Returns L2-normalized embeddings for both images of each pair, plus the
        pair labels, all on CPU with shapes (N, embedding_size) / (N,).
        """
        n_examples = len(self.dataset)
        pair_classes = []
        embeddings_1 = []
        embeddings_2 = []

        data_loader = self._get_data_loader()
        with torch.no_grad():
            for img1, img2, label in data_loader:
                img1, img2 = img1.to(self.device), img2.to(self.device)

                emb1 = self.model(img1)  # Shape: (batch_size, embedding_size)
                emb2 = self.model(img2)  # Shape: (batch_size, embedding_size)
                # L2-normalize so cosine similarity reduces to a dot product.
                emb1 = nn.functional.normalize(emb1, p=2, dim=1)
                emb2 = nn.functional.normalize(emb2, p=2, dim=1)

                embeddings_1.append(emb1.cpu())
                embeddings_2.append(emb2.cpu())
                pair_classes.extend(label.numpy())

        embeddings_1 = torch.cat(embeddings_1, dim=0)
        embeddings_2 = torch.cat(embeddings_2, dim=0)
        pair_classes = torch.tensor(pair_classes, dtype=torch.long)

        assert embeddings_1.shape == (n_examples, self.config.embedding_size)
        assert embeddings_2.shape == (n_examples, self.config.embedding_size)
        assert pair_classes.shape == (n_examples,)

        return embeddings_1, embeddings_2, pair_classes

    def _find_best_threshold(
        self, embeddings_1: Tensor, embeddings_2: Tensor, labels: Tensor
    ) -> float:
        """Return the threshold in [-1, 1] that maximizes accuracy on these pairs.

        Similarities are computed once and compared against all 200 candidate
        thresholds in a single vectorized pass (previously they were recomputed
        for every threshold). Ties resolve to the lowest threshold, matching
        torch.argmax on the per-threshold accuracy list.
        """
        thresholds = torch.linspace(-1, 1, steps=200)
        similarities = nn.functional.cosine_similarity(embeddings_1, embeddings_2)

        # Shape (n_thresholds, n_pairs); predictions in {-1, 1} to match labels.
        predictions = (
            similarities.unsqueeze(0) >= thresholds.unsqueeze(1)
        ).float() * 2 - 1
        accuracies = (predictions == labels.unsqueeze(0)).float().mean(dim=1)

        best_accuracy_idx = torch.argmax(accuracies)
        best_threshold = thresholds[best_accuracy_idx]
        return best_threshold.item()

    def _compute_accuracy(
        self,
        embeddings_1: Tensor,
        embeddings_2: Tensor,
        labels: Tensor,
        threshold: float,
    ) -> float:
        """Accuracy of thresholded cosine similarity against the pair labels.

        Assumes labels use 1 for match and -1 for mismatch (same encoding the
        predictions are converted to below) — verify against the datasets.
        """
        similarities = nn.functional.cosine_similarity(embeddings_1, embeddings_2)
        predictions = (similarities >= threshold).float()  # 1 for match, 0 for mismatch
        predictions = predictions * 2 - 1  # Convert to 1 for match, -1 for mismatch
        accuracy = (predictions == labels).float().mean().item()
        return accuracy

    def evaluate(self) -> FacePairsBenchmarkResult:
        """Run the full K-fold protocol and return mean/std accuracy across folds."""
        embeddings_1, embeddings_2, pair_classes = self._compute_embeddings()

        # shuffle=False keeps folds deterministic and aligned with dataset order.
        kfold = KFold(n_splits=self.config.n_folds, shuffle=False)
        accuracies = []

        indices = torch.arange(len(self.dataset)).numpy()
        for train_index, test_index in kfold.split(indices):
            emb1_train, emb1_test = (
                embeddings_1[train_index],
                embeddings_1[test_index],
            )
            emb2_train, emb2_test = (
                embeddings_2[train_index],
                embeddings_2[test_index],
            )
            labels_train, labels_test = (
                pair_classes[train_index],
                pair_classes[test_index],
            )

            # Threshold chosen on the 90% split, scored on the held-out 10%.
            best_threshold = self._find_best_threshold(
                emb1_train, emb2_train, labels_train
            )
            accuracy = self._compute_accuracy(
                emb1_test, emb2_test, labels_test, best_threshold
            )
            accuracies.append(accuracy)

        return FacePairsBenchmarkResult(
            mean_accuracy=torch.tensor(accuracies).mean().item(),
            std_accuracy=torch.tensor(accuracies).std().item(),
        )

FaceRecognitionSystemFacenetPytorchAdapter

Bases: FaceRecognitionSystem

Adapter for compatibility with the rococo evaluation library.

For using models from the facenet-pytorch library. Allows the use of MTCNN for face detection and InceptionResnetV1 for computing embeddings.

Source code in src/evaluation/rococo.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
class FaceRecognitionSystemFacenetPytorchAdapter(FaceRecognitionSystem):
    """Adapter for compatibility with the rococo evaluation library.

    Wraps models from the facenet-pytorch library: an MTCNN detector crops
    the face, then an InceptionResnetV1 network embeds it.
    """

    def __init__(self, facenet: nn.Module, detector: nn.Module, device: torch.device):
        self.facenet = facenet
        self.detector = detector
        self.device = device

        # Inference only: move both networks to the target device in eval mode.
        self.facenet.to(device).eval()
        self.detector.to(device).eval()

    def feature_vector_length(self) -> int:
        # Embedding dimensionality of the wrapped facenet model.
        return 512

    def compute_feature_vector(self, image: ndarray) -> ndarray:
        pil_image = self.cv2_to_pil(image)
        with torch.no_grad():
            face = self.detector(pil_image)
            if face is None:
                raise NoFaceDetectedException("No face detected in the image.")

            batch = face.unsqueeze(0)  # Add batch dimension
            batch = batch.to(self.device)
            embedding = self.facenet(batch)
            # Drop the batch dimension and hand back a plain numpy vector.
            return embedding.squeeze(0).cpu().numpy()

    @staticmethod
    def cv2_to_pil(image: ndarray) -> Image.Image:
        """
        Converts a cv2 image (BGR numpy array) to an RGB PIL image.
        """
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        return Image.fromarray(rgb)

cv2_to_pil(image) staticmethod

Converts a cv2 image (numpy array) to a PIL image.

Source code in src/evaluation/rococo.py
48
49
50
51
52
53
@staticmethod
def cv2_to_pil(image: ndarray) -> Image.Image:
    """
    Converts a cv2 image (BGR numpy array) to an RGB PIL image.
    """
    return Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

face_pairs

FacePairsBenchmark

Benchmark procedure for face recognition on matched and mismatched face pairs.

Procedure is based on the procedure described in the LFW dataset README file. It uses K-Fold cross-validation (k=10): use 90% of the data to select the best threshold for similarity score comparison, use the remaining 10% to evaluate the accuracy of the model with the selected threshold, and compute the mean and standard deviation of the accuracy across all folds.

Uses cosine similarity as the similarity measure. Computes accuracy: (TP + TN) / TOTAL

Source code in src/evaluation/face_pairs.py
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
class FacePairsBenchmark:
    """Benchmark procedure for face recognition on matched and mismatched face pairs.

    Procedure is based on the procedure described in LFW dataset README file.
    It uses K-Fold cross-validation (k=10):
    - Use 90% of the data to select the best threshold for similarity score comparison
    - Use the remaining 10% to evaluate the accuracy of the model with the selected threshold
    - Compute the mean and standard deviation of the accuracy across all folds

    Uses cosine similarity as the similarity measure.
    Computes accuracy: (TP + TN) / TOTAL
    """

    def __init__(
        self,
        model: nn.Module,
        dataset: FacePairsDataset,
        config: FacePairsBenchmarkConfig,
    ):
        """Store the components and put `model` on the configured device in eval mode."""
        self.model = model
        self.dataset = dataset
        self.config = config

        self.device = torch.device(config.device)
        self.model.to(self.device)
        self.model.eval()

    @classmethod
    def lfw(
        cls,
        model: nn.Module,
        config: FacePairsBenchmarkConfig,
        pairs_file: str = "data/lfw/pairs.csv",
        root_dir: str = "data/lfw_cropped",
    ):
        """Alternate constructor: benchmark over the LFW test pairs file."""
        bench_transform = cls._get_default_transform()

        dataset = LFWDataset.test_set_from_pairs_file(
            root_dir=root_dir,
            pairs_file=pairs_file,
            transform_1=bench_transform,
            transform_2=bench_transform,
        )
        return cls(model, dataset, config)

    @classmethod
    def rof_sunglasses(cls, model: nn.Module, config: FacePairsBenchmarkConfig):
        """Alternate constructor: benchmark over the ROF sunglasses pairs."""
        bench_transform = cls._get_default_transform()

        dataset = ROFDataset.sunglasses(transform=bench_transform)
        return cls(model, dataset, config)

    @classmethod
    def rof_masked(cls, model: nn.Module, config: FacePairsBenchmarkConfig):
        """Alternate constructor: benchmark over the ROF masked pairs."""
        bench_transform = cls._get_default_transform()
        dataset = ROFDataset.masked(transform=bench_transform)
        return cls(model, dataset, config)

    @staticmethod
    def _get_default_transform():
        """160x160 resize + scaling to roughly [-1, 1] (FaceNet-style input)."""
        return transforms.Compose(
            [
                transforms.Resize((160, 160)),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
            ]
        )

    def _get_data_loader(self):
        return DataLoader(
            self.dataset,
            batch_size=self.config.batch_size,
            num_workers=self.config.num_workers,
            shuffle=False,  # For deterministic results
        )

    def _compute_embeddings(self) -> tuple[Tensor, Tensor, Tensor]:
        """Embed every pair in the dataset.

        Returns L2-normalized embeddings for both images of each pair, plus the
        pair labels, all on CPU with shapes (N, embedding_size) / (N,).
        """
        n_examples = len(self.dataset)
        pair_classes = []
        embeddings_1 = []
        embeddings_2 = []

        data_loader = self._get_data_loader()
        with torch.no_grad():
            for img1, img2, label in data_loader:
                img1, img2 = img1.to(self.device), img2.to(self.device)

                emb1 = self.model(img1)  # Shape: (batch_size, embedding_size)
                emb2 = self.model(img2)  # Shape: (batch_size, embedding_size)
                # L2-normalize so cosine similarity reduces to a dot product.
                emb1 = nn.functional.normalize(emb1, p=2, dim=1)
                emb2 = nn.functional.normalize(emb2, p=2, dim=1)

                embeddings_1.append(emb1.cpu())
                embeddings_2.append(emb2.cpu())
                pair_classes.extend(label.numpy())

        embeddings_1 = torch.cat(embeddings_1, dim=0)
        embeddings_2 = torch.cat(embeddings_2, dim=0)
        pair_classes = torch.tensor(pair_classes, dtype=torch.long)

        assert embeddings_1.shape == (n_examples, self.config.embedding_size)
        assert embeddings_2.shape == (n_examples, self.config.embedding_size)
        assert pair_classes.shape == (n_examples,)

        return embeddings_1, embeddings_2, pair_classes

    def _find_best_threshold(
        self, embeddings_1: Tensor, embeddings_2: Tensor, labels: Tensor
    ) -> float:
        """Return the threshold in [-1, 1] that maximizes accuracy on these pairs.

        Similarities are computed once and compared against all 200 candidate
        thresholds in a single vectorized pass (previously they were recomputed
        for every threshold). Ties resolve to the lowest threshold, matching
        torch.argmax on the per-threshold accuracy list.
        """
        thresholds = torch.linspace(-1, 1, steps=200)
        similarities = nn.functional.cosine_similarity(embeddings_1, embeddings_2)

        # Shape (n_thresholds, n_pairs); predictions in {-1, 1} to match labels.
        predictions = (
            similarities.unsqueeze(0) >= thresholds.unsqueeze(1)
        ).float() * 2 - 1
        accuracies = (predictions == labels.unsqueeze(0)).float().mean(dim=1)

        best_accuracy_idx = torch.argmax(accuracies)
        best_threshold = thresholds[best_accuracy_idx]
        return best_threshold.item()

    def _compute_accuracy(
        self,
        embeddings_1: Tensor,
        embeddings_2: Tensor,
        labels: Tensor,
        threshold: float,
    ) -> float:
        """Accuracy of thresholded cosine similarity against the pair labels.

        Assumes labels use 1 for match and -1 for mismatch (same encoding the
        predictions are converted to below) — verify against the datasets.
        """
        similarities = nn.functional.cosine_similarity(embeddings_1, embeddings_2)
        predictions = (similarities >= threshold).float()  # 1 for match, 0 for mismatch
        predictions = predictions * 2 - 1  # Convert to 1 for match, -1 for mismatch
        accuracy = (predictions == labels).float().mean().item()
        return accuracy

    def evaluate(self) -> FacePairsBenchmarkResult:
        """Run the full K-fold protocol and return mean/std accuracy across folds."""
        embeddings_1, embeddings_2, pair_classes = self._compute_embeddings()

        # shuffle=False keeps folds deterministic and aligned with dataset order.
        kfold = KFold(n_splits=self.config.n_folds, shuffle=False)
        accuracies = []

        indices = torch.arange(len(self.dataset)).numpy()
        for train_index, test_index in kfold.split(indices):
            emb1_train, emb1_test = (
                embeddings_1[train_index],
                embeddings_1[test_index],
            )
            emb2_train, emb2_test = (
                embeddings_2[train_index],
                embeddings_2[test_index],
            )
            labels_train, labels_test = (
                pair_classes[train_index],
                pair_classes[test_index],
            )

            # Threshold chosen on the 90% split, scored on the held-out 10%.
            best_threshold = self._find_best_threshold(
                emb1_train, emb2_train, labels_train
            )
            accuracy = self._compute_accuracy(
                emb1_test, emb2_test, labels_test, best_threshold
            )
            accuracies.append(accuracy)

        return FacePairsBenchmarkResult(
            mean_accuracy=torch.tensor(accuracies).mean().item(),
            std_accuracy=torch.tensor(accuracies).std().item(),
        )

rococo

Utilities for the rococo evaluation procedure.

FaceRecognitionSystemFacenetPytorchAdapter

Bases: FaceRecognitionSystem

Adapter for compatibility with the rococo evaluation library.

For using models from the facenet-pytorch library. Allows the use of MTCNN for face detection and InceptionResnetV1 for computing embeddings.

Source code in src/evaluation/rococo.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
class FaceRecognitionSystemFacenetPytorchAdapter(FaceRecognitionSystem):
    """Adapter for compatibility with the rococo evaluation library.

    Wraps models from the facenet-pytorch library: an MTCNN detector crops
    the face, then an InceptionResnetV1 network embeds it.
    """

    def __init__(self, facenet: nn.Module, detector: nn.Module, device: torch.device):
        self.facenet = facenet
        self.detector = detector
        self.device = device

        # Inference only: move both networks to the target device in eval mode.
        self.facenet.to(device).eval()
        self.detector.to(device).eval()

    def feature_vector_length(self) -> int:
        # Embedding dimensionality of the wrapped facenet model.
        return 512

    def compute_feature_vector(self, image: ndarray) -> ndarray:
        pil_image = self.cv2_to_pil(image)
        with torch.no_grad():
            face = self.detector(pil_image)
            if face is None:
                raise NoFaceDetectedException("No face detected in the image.")

            batch = face.unsqueeze(0)  # Add batch dimension
            batch = batch.to(self.device)
            embedding = self.facenet(batch)
            # Drop the batch dimension and hand back a plain numpy vector.
            return embedding.squeeze(0).cpu().numpy()

    @staticmethod
    def cv2_to_pil(image: ndarray) -> Image.Image:
        """
        Converts a cv2 image (BGR numpy array) to an RGB PIL image.
        """
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        return Image.fromarray(rgb)

cv2_to_pil(image) staticmethod

Converts a cv2 image (numpy array) to a PIL image.

Source code in src/evaluation/rococo.py
48
49
50
51
52
53
@staticmethod
def cv2_to_pil(image: ndarray) -> Image.Image:
    """
    Converts a cv2 image (BGR numpy array) to an RGB PIL image.
    """
    return Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))