# Source code for gptcache.similarity_evaluation.sequence_match
from typing import Tuple, Dict, Any, List
import numpy as np
from gptcache.adapter.api import _get_model
from gptcache.similarity_evaluation import SimilarityEvaluation
def euclidean_distance_calculate(vec_l: np.ndarray, vec_r: np.ndarray) -> float:
    """Return the SQUARED Euclidean distance between two vectors.

    Note: the result is deliberately not square-rooted.  For two
    L2-normalized vectors the value lies in ``[0, 4]``, which is what
    ``SequenceMatchEvaluation`` relies on when it maps distances to
    similarity scores (``4 - distance``).

    :param vec_l: left-hand vector.
    :param vec_r: right-hand vector, same shape as ``vec_l``.
    :return: sum of squared element-wise differences.
    """
    return np.sum((vec_l - vec_r) ** 2)
def reweight(weights, length):
    """Truncate ``weights`` to its first ``length`` entries and rescale
    them so they sum to 1.

    If ``length`` covers the whole list (or more), the weights are
    returned unchanged — and therefore *not* renormalized.

    :param weights: per-position weight list.
    :param length: number of leading weights actually in use.
    :return: the (possibly truncated and renormalized) weight list.
    """
    if length >= len(weights):
        return weights
    used = weights[:length]
    total = sum(used)
    # Multiply by the reciprocal (rather than dividing) to keep the exact
    # arithmetic of the original implementation.
    return [w * (1 / total) for w in used]
class SequenceMatchEvaluation(SimilarityEvaluation):
    """Evaluate sentence-pair similarity as a weighted match over dialogue turns.

    Both the query question and the cached question are split on the
    ``"USER: "`` delimiter into individual turns; each turn is embedded,
    L2-normalized, and compared with its counterpart via squared Euclidean
    distance.  Per-turn scores (``4 - distance``, higher is more similar)
    are combined using ``weights``, aligned from the most recent turn
    backwards.

    :param weights: weights for the sequence elements used in the weighted
        distance; only as many as there are comparable turns are used,
        renormalized to sum to 1.
    :type weights: List[float]
    :param embedding_extractor: name of the embedding model used to obtain
        embeddings from the text content (resolved via ``_get_model``).
    :type embedding_extractor: str

    Example:
        .. code-block:: python

            from gptcache.similarity_evaluation import SequenceMatchEvaluation
            from gptcache.embedding import Onnx

            weights = [0.5, 0.3, 0.2]
            evaluation = SequenceMatchEvaluation(weights, 'onnx')

            query = {'question': 'USER: "foo2" USER: "foo4"'}
            cache = {'question': 'USER: "foo6" USER: "foo8"'}
            score = evaluation.evaluation(query, cache)
    """

    def __init__(
        self, weights: List[float], embedding_extractor: str, embedding_config=None
    ):
        # Resolve the embedding backend by name so callers only pass a string.
        self.embedding_extractor = _get_model(embedding_extractor, embedding_config)
        self.weights = weights

    @staticmethod
    def normalize(vec: np.ndarray) -> np.ndarray:
        """Return ``vec`` scaled to unit L2 norm.

        NOTE(review): a zero vector would divide by zero and yield NaNs;
        embedding backends are presumably never returning one — confirm.

        :param vec: numpy vector to normalize.
        :return: normalized vector.
        """
        magnitude = np.linalg.norm(vec)
        return vec / magnitude

    def evaluation(
        self, src_dict: Dict[str, Any], cache_dict: Dict[str, Any], **_
    ) -> float:
        """Evaluate the similarity score of a query/cache pair.

        :param src_dict: the query dictionary to evaluate with cache;
            must contain a ``"question"`` entry.
        :param cache_dict: the cache dictionary; must contain a
            ``"question"`` entry.
        :return: weighted similarity score in ``[0, 4]``; higher is closer.
        :raises AssertionError: if either question contains no turns or
            ``self.weights`` is empty.
        """
        src_contents = [
            c for c in src_dict["question"].split("USER: ") if len(c) > 0
        ]
        cache_contents = [
            c for c in cache_dict["question"].split("USER: ") if len(c) > 0
        ]

        # Only the trailing `length` turns take part in the comparison.
        length = min(len(src_contents), len(cache_contents), len(self.weights))
        assert length > 0, "no comparable turns (empty question or weights)"

        # Align from the most recent turn backwards: reverse the weights so
        # the last weight pairs with the last turn, then renormalize to the
        # number of turns actually compared.
        ws = reweight(self.weights[::-1], length)

        # Embed only the turns that are actually compared — avoids wasted
        # calls to the embedding backend for turns beyond `length`.
        src_embs = [
            self.normalize(self.embedding_extractor.to_embeddings(c))
            for c in src_contents[::-1][:length]
        ]
        cache_embs = [
            self.normalize(self.embedding_extractor.to_embeddings(c))
            for c in cache_contents[::-1][:length]
        ]

        # Per-turn score is 4 - squared distance (in [0, 4] for unit
        # vectors), combined with the renormalized weights.
        return sum(
            (4 - euclidean_distance_calculate(src_embs[i], cache_embs[i])) * ws[i]
            for i in range(length)
        )

    def range(self) -> Tuple[float, float]:
        """Range of the similarity score.

        :return: minimum and maximum of the similarity score.
        """
        return 0.0, 4.0