Source code for gptcache.embedding.huggingface
import numpy as np

from gptcache.embedding.base import BaseEmbedding
from gptcache.utils import import_huggingface, import_torch

import_torch()
import_huggingface()

import torch  # pylint: disable=C0413
from transformers import AutoTokenizer, AutoModel  # pylint: disable=C0413


class Huggingface(BaseEmbedding):
    """Generate a sentence embedding for the given text using pretrained models from Huggingface transformers.

    :param model: model name, defaults to 'distilbert-base-uncased'.
    :type model: str

    Example:
        .. code-block:: python

            from gptcache.embedding import Huggingface

            test_sentence = 'Hello, world.'
            encoder = Huggingface(model='distilbert-base-uncased')
            embed = encoder.to_embeddings(test_sentence)

            test_sentence = '什么是Github'  # 'What is GitHub?'
            huggingface = Huggingface(model='uer/albert-base-chinese-cluecorpussmall')
            embed = huggingface.to_embeddings(test_sentence)
    """
    def __init__(self, model: str = "distilbert-base-uncased"):
        self.model = AutoModel.from_pretrained(model)
        self.model.eval()

        self.tokenizer = AutoTokenizer.from_pretrained(model)
        if not self.tokenizer.pad_token:
            self.tokenizer.pad_token = "[PAD]"

        try:
            self.__dimension = self.model.config.hidden_size
        except Exception:  # pylint: disable=W0703
            from transformers import AutoConfig  # pylint: disable=C0415

            config = AutoConfig.from_pretrained(model)
            self.__dimension = config.hidden_size

    def to_embeddings(self, data, **_):
        """Generate an embedding for the given text input.

        :param data: text in string.
        :type data: str

        :return: a text embedding in shape of (dim,).
        """
        if not isinstance(data, list):
            data = [data]
        inputs = self.tokenizer(
            data, padding=True, truncation=True, return_tensors="pt"
        )
        # Inference only: no_grad avoids building the autograd graph.
        with torch.no_grad():
            outs = self.model(**inputs).last_hidden_state
        emb = self.post_proc(outs, inputs).squeeze(0).detach().numpy()
        return np.array(emb).astype("float32")

    def post_proc(self, token_embeddings, inputs):
        """Mean-pool the token embeddings into a single sentence embedding.

        Padding tokens are excluded via the attention mask, and the divisor
        is clamped to avoid division by zero for an all-zero mask.
        """
        attention_mask = inputs["attention_mask"]
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        sentence_embs = torch.sum(
            token_embeddings * input_mask_expanded, 1
        ) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sentence_embs
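
    # Worked example of the pooling above: for a sentence whose tokens embed
    # to t0 and t1 with one padded position (attention mask [1, 1, 0]),
    # post_proc returns (t0 + t1) / 2 — the padded token does not contribute.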

    @property
    def dimension(self):
        """Embedding dimension.

        :return: embedding dimension
        """
        return self.__dimension
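

# A minimal usage sketch, assuming the default 'distilbert-base-uncased'
# weights can be downloaded from the Huggingface hub (hidden size 768 for
# that model); it prints the embedding shape and the reported dimension.
if __name__ == "__main__":
    encoder = Huggingface()
    vec = encoder.to_embeddings("Hello, world.")
    print(vec.shape)          # expected: (768,)
    print(encoder.dimension)  # expected: 768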