Source code for gptcache.similarity_evaluation.kreciprocal

import numpy as np
from typing import Dict, Any

from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation
from gptcache.manager.vector_data.base import VectorBase


def euclidean_distance_calculate(vec_l: np.ndarray, vec_r: np.ndarray):
    """Return the squared Euclidean distance between two vectors.

    Note that no square root is taken: the result is the sum of the squared
    element-wise differences.

    :param vec_l: left-hand vector (any array-like accepted by ``numpy.asarray``).
    :type vec_l: numpy.ndarray
    :param vec_r: right-hand vector, broadcast-compatible with ``vec_l``.
    :type vec_r: numpy.ndarray

    :return: sum of squared element-wise differences.
    """
    # Coerce so that plain lists/tuples work as well as numpy arrays.
    diff = np.asarray(vec_l) - np.asarray(vec_r)
    return np.sum(diff ** 2)

class KReciprocalEvaluation(SearchDistanceEvaluation):
    """Using K Reciprocal to evaluate sentences pair similarity.

    This evaluator borrows the popular reranking method K-reciprocal reranking for similarity evaluation. K-reciprocal relation refers to
    the mutual nearest neighbor relationship between two embeddings, where each embedding is the K nearest neighbor of the other based on
    a given distance metric.  This evaluator checks whether the query embedding is in the candidate cache embedding's `top_k` nearest
    neighbors. If the query embedding is not among the candidate's `top_k` neighbors, the pair will be considered a dissimilar pair.
    Otherwise, their distance will be kept and passed on to a `SearchDistanceEvaluation` check.  `max_distance` is used to bound this
    distance to make it between [0-`max_distance`]. `positive` is used to indicate this distance is directly proportional to the
    similarity of two entities. If `positive` is set `False`, `max_distance` will be used to subtract this distance to get the final score.

    :param vectordb: vector database used to retrieve embeddings to test the k-reciprocal relationship.
    :type vectordb: gptcache.manager.vector_data.base.VectorBase
    :param top_k: for each retrieved candidate, this method needs to test if the query is in the top-k of the candidate.
    :type top_k: int
    :param max_distance: the bound of maximum distance.
    :type max_distance: float
    :param positive: if the larger distance indicates more similar of two entities, It is True. Otherwise it is False.
    :type positive: bool

    Example:
        .. code-block:: python

            from gptcache.similarity_evaluation import KReciprocalEvaluation
            from gptcache.manager.vector_data.faiss import Faiss
            from gptcache.manager.vector_data.base import VectorData
            import numpy as np

            faiss = Faiss('./none', 3, 10)
            cached_data = np.array([0.57735027, 0.57735027, 0.57735027])
            faiss.mul_add([VectorData(id=0, data=cached_data)])
            evaluation = KReciprocalEvaluation(vectordb=faiss, top_k=2, max_distance=4.0, positive=False)
            query = np.array([0.61396013, 0.55814557, 0.55814557])
            score = evaluation.evaluation(
                {
                    'question': 'question1',
                    'embedding': query
                },
                {
                    'question': 'question2',
                    'embedding': cached_data
                }
            )
    """

    def __init__(self, vectordb: VectorBase, top_k: int = 3, max_distance: float = 4.0, positive: bool = False):
        super().__init__(max_distance, positive)
        self.vectordb = vectordb
        self.top_k = top_k

    @staticmethod
    def normalize(vec: np.ndarray) -> np.ndarray:
        """Normalize the input vector to unit L2 norm.

        A vector whose norm is zero cannot be scaled to unit length; it is
        returned unchanged instead of producing NaNs from a division by zero.

        :param vec: numpy vector needs to normalize.
        :type vec: numpy.array

        :return: normalized vector.
        """
        vec = np.asarray(vec)
        magnitude = np.linalg.norm(vec)
        if magnitude == 0:
            return vec
        return vec / magnitude

    def evaluation(
        self, src_dict: Dict[str, Any], cache_dict: Dict[str, Any], **_
    ) -> float:
        """Evaluate the similarity score of pair.

        The query embedding is normalized, its squared Euclidean distance to
        the cached embedding is computed, and that distance is compared with
        the distance of the farthest of the ``top_k + 1`` neighbors the vector
        database returns for the cached embedding. If the query is farther
        away than that neighbor, the distance is replaced by the upper bound
        of ``self.range()`` before it is handed to the parent evaluation.

        :param src_dict: the query dictionary to evaluate with cache.
        :type src_dict: Dict
        :param cache_dict: the cache dictionary.
        :type cache_dict: Dict

        :return: evaluation score.
        """
        # Identical question text short-circuits the embedding comparison.
        # NOTE(review): this returns the literal 1 rather than the top of
        # self.range(); confirm that is intended when max_distance != 1.
        if src_dict['question'] == cache_dict['question']:
            return 1

        cache_emb = cache_dict['embedding']
        # NOTE(review): only the query is normalized here; the cached embedding
        # is presumably stored already normalized -- confirm against callers.
        query_emb = self.normalize(src_dict['embedding'])
        # top_k + 1 neighbors are requested for the cached embedding.
        candidates = self.vectordb.search(cache_emb, self.top_k + 1)
        euc_dist = euclidean_distance_calculate(query_emb, cache_emb)

        # With no search result (None or empty) there is no k-th neighbor to
        # compare against, so the reciprocal filter cannot reject the pair;
        # the plain distance is then left for the parent check to judge.
        has_candidates = candidates is not None and len(candidates) > 0
        if has_candidates and euc_dist > candidates[-1][0]:
            # The query is not among the candidate's nearest neighbors: push
            # the distance to the upper bound reported by range().
            euc_dist = self.range()[1]

        result_dict = {'search_result': (euc_dist, None)}
        return super().evaluation(None, result_dict)