Source code for gptcache.similarity_evaluation.onnx
from typing import Dict, List, Tuple, Any
import numpy as np
from gptcache.similarity_evaluation import SimilarityEvaluation
from gptcache.utils import (
import_onnxruntime,
import_huggingface_hub,
import_huggingface,
)
import_onnxruntime()
import_huggingface_hub()
import_huggingface()
from transformers import AutoTokenizer # pylint: disable=C0413
from huggingface_hub import hf_hub_download # pylint: disable=C0413
import onnxruntime # pylint: disable=C0413
def pad_sequence(input_ids_list: List[np.ndarray], padding_value: int = 0):
    """Right-pad variable-length sequences and stack them into one 2-D array.

    :param input_ids_list: the sequences to stack; each one becomes a row.
    :type input_ids_list: List[np.ndarray]
    :param padding_value: value written to the positions past the end of each
        sequence. Default is 0. The dtype of the result is inferred from this
        value by ``np.full``.
    :type padding_value: int

    :return: array of shape ``(len(input_ids_list), max_len)`` where ``max_len``
        is the length of the longest sequence; an empty ``(0, 0)`` array when
        no sequences are given.
    """
    # ``len`` rather than truthiness so that an ndarray argument does not trip
    # numpy's "truth value of an array is ambiguous" error.
    if len(input_ids_list) == 0:
        # ``max`` of an empty iterable raises ValueError; return a well-formed
        # empty batch instead.
        return np.full((0, 0), padding_value)

    max_len = max(len(sequence) for sequence in input_ids_list)
    # Start from a matrix filled with the padding value, then overwrite the
    # leading part of each row with the real sequence.
    padded_sequences = np.full((len(input_ids_list), max_len), padding_value)
    for i, sequence in enumerate(input_ids_list):
        padded_sequences[i, : len(sequence)] = sequence
    return padded_sequences
class OnnxModelEvaluation(SimilarityEvaluation):
    """Using ONNX model to evaluate sentences pair similarity.

    This evaluator uses the ONNX model to evaluate the similarity of two sentences.

    :param model: model name of OnnxModelEvaluation. Default is 'GPTCache/albert-duplicate-onnx'.
    :type model: str

    Example:
        .. code-block:: python

            from gptcache.similarity_evaluation import OnnxModelEvaluation

            evaluation = OnnxModelEvaluation()
            score = evaluation.evaluation(
                {
                    'question': 'What is the color of sky?'
                },
                {
                    'question': 'hello'
                }
            )
    """

    def __init__(self, model: str = "GPTCache/albert-duplicate-onnx"):
        # The tokenizer is fixed to ALBERT regardless of ``model``; only the
        # ONNX weights are selected by the ``model`` repo id.
        tokenizer_name = "albert-base-v2"
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.model = model
        # Fetch ``model.onnx`` from the Hugging Face Hub repo and load it into
        # an ONNX Runtime session.
        onnx_model_path = hf_hub_download(repo_id=model, filename="model.onnx")
        self.ort_session = onnxruntime.InferenceSession(onnx_model_path)

    # WARNING: the model cannot evaluate text with more than 512 tokens
    def evaluation(
        self, src_dict: Dict[str, Any], cache_dict: Dict[str, Any], **_
    ) -> float:
        """Evaluate the similarity score of pair.

        Questions that are equal ignoring case score 1.0 without running the
        model. Any failure (missing ``"question"`` key, non-string question,
        tokenizer or model error) is treated as "not similar" and scores 0.0.

        :param src_dict: the query dictionary to evaluate with cache.
        :type src_dict: Dict
        :param cache_dict: the cache dictionary.
        :type cache_dict: Dict

        :return: evaluation score.
        """
        try:
            src_question = src_dict["question"]
            cache_question = cache_dict["question"]
            # Shortcut: a case-insensitive exact match needs no inference.
            if src_question.lower() == cache_question.lower():
                return 1.0
            return self.inference(src_question, [cache_question])
        # Deliberate best-effort: a failed evaluation must not break the
        # caller, it simply counts as the lowest score.
        except Exception:  # pylint: disable=W0703
            return 0.0

    def range(self) -> Tuple[float, float]:
        """Range of similarity score.

        :return: minimum and maximum of similarity score.
        """
        return 0.0, 1.0

    def inference(self, reference: str, candidates: List[str]) -> float:
        """Inference the ONNX model.

        Every ``(reference, candidate)`` pair is tokenized, the pairs are padded
        to a common length and run through the model as one batch. Only the
        score of the first candidate is returned.

        :param reference: reference sentence.
        :type reference: str
        :param candidates: candidate sentences; must not be empty.
        :type candidates: List[str]

        :return: probability score indicates how much is reference similar to
            the first candidate.
        :raises ValueError: if ``candidates`` is empty.
        """
        n_candidates = len(candidates)
        if n_candidates == 0:
            # Previously surfaced as an opaque ValueError from ``max()`` inside
            # ``pad_sequence``; same exception type, clearer message.
            raise ValueError("candidates must contain at least one sentence")

        inference_texts = [
            {"text_a": reference, "text_b": candidate} for candidate in candidates
        ]
        batch_encoding_list = [
            self.tokenizer.encode_plus(
                text["text_a"], text["text_b"], padding="longest"
            )
            for text in inference_texts
        ]

        input_ids_list = [np.array(encode.input_ids) for encode in batch_encoding_list]
        attention_mask_list = [
            np.array(encode.attention_mask) for encode in batch_encoding_list
        ]
        token_type_ids_list = [
            np.array(encode.token_type_ids) for encode in batch_encoding_list
        ]

        padded_input_ids = pad_sequence(
            input_ids_list, padding_value=self.tokenizer.pad_token_id
        )
        # Padding positions must be masked out, so the mask is always padded
        # with 0, independent of the tokenizer's pad token id.
        padded_attention_mask = pad_sequence(attention_mask_list, padding_value=0)
        # Segment ids are padded with the tokenizer's padding segment id
        # (falling back to 0), not with the pad *token* id.
        padded_token_type_ids = pad_sequence(
            token_type_ids_list,
            padding_value=getattr(self.tokenizer, "pad_token_type_id", 0),
        )

        ort_inputs = {
            "input_ids": padded_input_ids.reshape(n_candidates, -1),
            "attention_mask": padded_attention_mask.reshape(n_candidates, -1),
            "token_type_ids": padded_token_type_ids.reshape(n_candidates, -1),
        }
        ort_outputs = self.ort_session.run(None, ort_inputs)
        # Column 1 of the first model output is used as the per-pair score
        # (presumably the "duplicate" class probability -- confirm against the
        # model card).
        scores = ort_outputs[0][:, 1]
        return float(scores[0])