Source code for gptcache.embedding.fasttext
import numpy as np
import os
from gptcache.utils import import_fasttext
from gptcache.embedding.base import BaseEmbedding
import_fasttext()
import fasttext.util # pylint: disable=C0413
[docs]class FastText(BaseEmbedding):
"""Generate sentence embedding for given text using pretrained models of different languages from fastText.
:param model: model name, defaults to 'en'.
:type model: str
:param dim: reduced dimension of embedding. If this parameter is not provided, the embedding dimension (300) will not change.
:type dim: int
Example:
.. code-block:: python
from gptcache.embedding import FastText
test_sentence = 'Hello, world.'
encoder = FastText(model='en', dim=100)
embed = encoder.to_embeddings(test_sentence)
"""
def __init__(self, model: str = "en", dim: int = None):
self.model_path = os.path.abspath(fasttext.util.download_model(model))
self.ft = fasttext.load_model(self.model_path)
if dim:
fasttext.util.reduce_model(self.ft, dim)
self.__dimension = self.ft.get_dimension()
[docs] def to_embeddings(self, data, **_):
"""Generate embedding given text input
:param data: text in string.
:type data: str
:return: a text embedding in shape of (dim,).
"""
assert isinstance(data, str), "Only allow string as input."
emb = self.ft.get_sentence_vector(data)
return np.array(emb).astype("float32")
@property
def dimension(self):
"""Embedding dimension.
:return: embedding dimension
"""
return self.__dimension