Source code for gptcache.similarity_evaluation.sequence_match

from typing import Tuple, Dict, Any, List

import numpy as np

from gptcache.adapter.api import _get_model
from gptcache.similarity_evaluation import SimilarityEvaluation


def euclidean_distance_calculate(vec_l: np.ndarray, vec_r: np.ndarray):
    """Return the squared Euclidean distance between two vectors.

    Despite the name, no square root is taken: the result is the sum of the
    squared element-wise differences.

    :param vec_l: left-hand vector.
    :type vec_l: numpy.ndarray
    :param vec_r: right-hand vector, must support ``vec_l - vec_r``.
    :type vec_r: numpy.ndarray

    :return: sum of squared element-wise differences.
    """
    delta = vec_l - vec_r
    return np.sum(delta ** 2)


def reweight(weights, length):
    """Truncate ``weights`` to ``length`` entries and rescale them to sum to 1.

    :param weights: sequence of weights.
    :param length: number of leading weights to keep.
    :type length: int

    :return: ``weights`` itself (unchanged, not rescaled) when ``length`` is
        at least ``len(weights)``; otherwise a new list holding the first
        ``length`` weights, each multiplied by ``1 / sum`` of those weights.
    :raises ZeroDivisionError: if the kept weights are plain numbers that
        sum to zero.
    """
    if length >= len(weights):
        return weights
    kept = weights[:length]
    total = sum(kept)
    # Multiply by the reciprocal (rather than divide) to keep the exact
    # floating-point results callers have always received.
    return [w * (1 / total) for w in kept]


class SequenceMatchEvaluation(SimilarityEvaluation):
    """
    Evaluate sentence pair similarity using SequenceMatchEvaluation.

    Each question is split on ``"USER: "`` into segments. The trailing
    segments of the query and the cached question are embedded, normalized
    and compared pairwise (last with last, second-to-last with
    second-to-last, ...); the per-pair scores are combined with ``weights``.

    :param weights: List of weights corresponding to each sequence element for calculating the weighted distance.
        The last weight is applied to the last segment of each question.
    :type weights: List[float]
    :param embedding_extractor: Name of the embedding model; it is passed to ``_get_model`` and the returned object's
        ``to_embeddings`` method is used to embed the text content.
    :type embedding_extractor: str
    :param embedding_config: Optional configuration forwarded to ``_get_model``.


    Example:
        .. code-block:: python

            from gptcache.similarity_evaluation import SequenceMatchEvaluation

            weights = [0.5, 0.3, 0.2]
            evaluation = SequenceMatchEvaluation(weights, 'onnx')

            query = {
                'question': 'USER: "foo2" USER: "foo4"',
            }

            cache = {
                'question': 'USER: "foo6" USER: "foo8"',
            }

            score = evaluation.evaluation(query, cache)
    """

    def __init__(
        self, weights: List[float], embedding_extractor: str, embedding_config=None
    ):
        self.embedding_extractor = _get_model(embedding_extractor, embedding_config)
        self.weights = weights

    @staticmethod
    def normalize(vec: np.ndarray):
        """Normalize the input vector.

        :param vec: numpy vector needs to normalize.
        :type vec: numpy.array

        :return: normalized vector.
        """
        magnitude = np.linalg.norm(vec)
        normalized_v = vec / magnitude
        return normalized_v

    def _embed(self, content: str):
        """Return the normalized embedding of one text segment."""
        return self.normalize(self.embedding_extractor.to_embeddings(content))

    def evaluation(
        self, src_dict: Dict[str, Any], cache_dict: Dict[str, Any], **_
    ) -> float:
        """Evaluate the similarity score of pair.

        :param src_dict: the query dictionary to evaluate with cache; its
            ``"question"`` entry is read.
        :type src_dict: Dict
        :param cache_dict: the cache dictionary; its ``"question"`` entry is read.
        :type cache_dict: Dict

        :return: evaluation score.
        :raises AssertionError: if either question has no non-empty segment
            or ``weights`` is empty.
        """
        src_contents = [
            content for content in src_dict["question"].split("USER: ") if content
        ]
        cache_contents = [
            content for content in cache_dict["question"].split("USER: ") if content
        ]

        length = min(len(src_contents), len(cache_contents), len(self.weights))
        if length <= 0:
            # Raised explicitly (instead of a bare ``assert``) so the check
            # still runs under ``python -O`` while keeping the exception type.
            raise AssertionError(
                "SequenceMatchEvaluation needs at least one weight and one "
                "non-empty segment in both questions"
            )

        # Reverse so that index 0 is the last weight / last segment.
        ws = reweight(self.weights[::-1], length)

        # Only the trailing ``length`` segments are scored, so only those
        # are embedded; earlier segments never contribute to the result.
        src_embs = [self._embed(content) for content in src_contents[::-1][:length]]
        cache_embs = [
            self._embed(content) for content in cache_contents[::-1][:length]
        ]

        weighted_distance = 0
        for weight, src_emb, cache_emb in zip(ws, src_embs, cache_embs):
            # Squared distance between unit vectors lies in [0, 4];
            # subtracting from 4 turns it into a similarity.
            weighted_distance += (
                4 - euclidean_distance_calculate(src_emb, cache_emb)
            ) * weight
        return weighted_distance

    def range(self) -> Tuple[float, float]:
        """Range of similarity score.

        :return: minimum and maximum of similarity score.
        """
        return 0.0, 4.0