Source code for gptcache.similarity_evaluation.kreciprocal

import numpy as np
from typing import Dict, Any

from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation
from gptcache.manager.vector_data.base import VectorBase


def euclidean_distance_calculate(vec_l: np.ndarray, vec_r: np.ndarray):
    """Return the squared Euclidean distance between two vectors.

    The result is the sum of squared element-wise differences; no square
    root is taken, so this is the *squared* L2 distance.

    :param vec_l: left-hand vector (any array-like accepted by ``numpy.asarray``).
    :type vec_l: numpy.ndarray
    :param vec_r: right-hand vector, broadcast-compatible with ``vec_l``.
    :type vec_r: numpy.ndarray

    :return: sum of squared differences as a numpy scalar.
    """
    # Coerce to arrays so plain lists/tuples work too; this is a no-op for
    # inputs that are already ndarrays.
    diff = np.asarray(vec_l) - np.asarray(vec_r)
    return np.sum(diff ** 2)
class KReciprocalEvaluation(SearchDistanceEvaluation):
    """Using K Reciprocal to evaluate sentences pair similarity.

    This evaluator borrows the popular reranking method K-reciprocal reranking
    for similarity evaluation. K-reciprocal relation refers to the mutual
    nearest neighbor relationship between two embeddings, where each embedding
    is the K nearest neighbor of the other based on a given distance metric.
    This evaluator checks whether the query embedding is in the candidate cache
    embedding's `top_k` nearest neighbors. If the query embedding is not among
    the candidate's `top_k` neighbors, the pair will be considered a dissimilar
    pair. Otherwise, their distance will be kept and passed on to a
    `SearchDistanceEvaluation` check. `max_distance` is used to bound this
    distance to make it between [0-`max_distance`]. `positive` is used to
    indicate this distance is directly proportional to the similarity of two
    entities. If `positive` is set `False`, `max_distance` will be used to
    subtract this distance to get the final score.

    :param vectordb: vector database used to retrieve embeddings to test the k-reciprocal relationship.
    :type vectordb: gptcache.manager.vector_data.base.VectorBase
    :param top_k: for each retrieved candidate, this method needs to test if the query is in the top-k of the candidate.
    :type top_k: int
    :param max_distance: the bound of maximum distance.
    :type max_distance: float
    :param positive: if the larger distance indicates more similar of two entities, It is True. Otherwise it is False.
    :type positive: bool

    Example:
        .. code-block:: python

            from gptcache.similarity_evaluation import KReciprocalEvaluation
            from gptcache.manager.vector_data.faiss import Faiss
            from gptcache.manager.vector_data.base import VectorData
            import numpy as np

            faiss = Faiss('./none', 3, 10)
            cached_data = np.array([0.57735027, 0.57735027, 0.57735027])
            faiss.mul_add([VectorData(id=0, data=cached_data)])
            evaluation = KReciprocalEvaluation(vectordb=faiss, top_k=2, max_distance = 4.0, positive=False)
            query = np.array([0.61396013, 0.55814557, 0.55814557])
            score = evaluation.evaluation(
                {
                    'question': 'question1',
                    'embedding': query
                },
                {
                    'question': 'question2',
                    'embedding': cached_data
                }
            )
    """

    def __init__(self, vectordb: VectorBase, top_k: int = 3, max_distance: float = 4.0, positive: bool = False):
        super().__init__(max_distance, positive)
        self.vectordb = vectordb
        self.top_k = top_k

    @staticmethod
    def normalize(vec: np.ndarray):
        """Normalize the input vector.

        A vector whose norm is zero is returned unchanged, because dividing by
        a zero norm would only produce NaN/inf values.

        :param vec: numpy vector needs to normalize.
        :type vec: numpy.array

        :return: normalized vector.
        """
        magnitude = np.linalg.norm(vec)
        if magnitude == 0:
            # Nothing to scale: avoid a division by zero.
            return vec
        return vec / magnitude

    def evaluation(
        self, src_dict: Dict[str, Any], cache_dict: Dict[str, Any], **_
    ) -> float:
        """Evaluate the similarity score of pair.

        :param src_dict: the query dictionary to evaluate with cache.
        :type src_dict: Dict
        :param cache_dict: the cache dictionary.
        :type cache_dict: Dict

        :return: evaluation score.
        """
        # Identical question text short-circuits with a fixed score of 1.
        # NOTE(review): this constant is independent of `max_distance` —
        # confirm that is the intended score for an exact match.
        if src_dict['question'] == cache_dict['question']:
            return 1

        # NOTE(review): only the query embedding is normalized here; the cached
        # embedding is used as given — presumably it was normalized when it was
        # stored; confirm against the caller.
        query_emb = self.normalize(src_dict['embedding'])
        cache_emb = cache_dict['embedding']

        # Ask for top_k + 1 neighbours of the cached embedding (presumably the
        # extra slot accounts for the cached embedding matching itself —
        # TODO confirm).
        candidates = self.vectordb.search(cache_emb, self.top_k + 1)
        euc_dist = euclidean_distance_calculate(query_emb, cache_emb)

        # The first field of the last returned candidate is used as the
        # threshold (presumably the largest neighbour distance — verify against
        # the VectorBase implementation). When the store returns no neighbours
        # the reciprocal relation cannot be confirmed, so the pair is treated
        # as non-reciprocal instead of crashing on `candidates[-1]`.
        no_candidates = candidates is None or len(candidates) == 0
        if no_candidates or euc_dist > candidates[-1][0]:
            # Not a reciprocal neighbour: substitute the upper bound of range().
            euc_dist = self.range()[1]

        # Delegate bounding/scoring to the parent evaluator, which reads the
        # distance from the 'search_result' entry.
        result_dict = {'search_result': (euc_dist, None)}
        return super().evaluation(None, result_dict)