Source code for gptcache.adapter.llama_cpp
import time
from typing import Iterator

from gptcache.adapter.adapter import adapt
from gptcache.manager.scalar_data.base import DataType, Answer
from gptcache.utils import import_llama_cpp_python

import_llama_cpp_python()

import llama_cpp  # pylint: disable=wrong-import-position
class Llama(llama_cpp.Llama):
"""llama.cpp wrapper
You should have the llama-cpp-python library installed.
https://github.com/abetlen/llama-cpp-python
Example:
.. code-block:: python
onnx = Onnx()
m = manager_factory('sqlite,faiss,local', data_dir=root, vector_params={"dimension": onnx.dimension})
llm_cache = Cache()
llm_cache.init(
pre_embedding_func=get_prompt,
data_manager=m,
embedding_func=onnx.to_embeddings
)
llm = Llama('./models/7B/ggml-model.bin')
answer = llm(prompt=question, cache_obj=llm_cache)
"""

    def __call__(
        self,
        prompt: str,
        **kwargs
    ):
        def update_cache_callback(llm_data, update_cache_func, *args, **kwargs):  # pylint: disable=unused-argument
            if not isinstance(llm_data, Iterator):
                # Non-streaming response: store the generated text in the cache
                # and return the response unchanged.
                update_cache_func(Answer(llm_data["choices"][0]["text"], DataType.STR))
                return llm_data
            else:
                # Streaming response: yield each chunk as it arrives and only
                # store the concatenated answer once the stream is exhausted.
                def stream_answer(it):
                    total_answer = ""
                    for item in it:
                        total_answer += item["choices"][0]["text"]
                        yield item
                    update_cache_func(Answer(total_answer, DataType.STR))
                return stream_answer(llm_data)

        def cache_data_convert(cache_data):
            # On a cache hit, shape the cached text like a llama.cpp completion,
            # as a stream of chunks if the caller requested streaming.
            if kwargs.get("stream", False):
                return _construct_stream_resp_from_cache(cache_data)
            return _construct_resp_from_cache(cache_data)

        # adapt() is gptcache's generic entry point: it looks the prompt up in
        # the cache, converts a hit with cache_data_convert, and on a miss calls
        # create_completion and hands the result to update_cache_callback.
        return adapt(
            self.create_completion,
            cache_data_convert,
            update_cache_callback,
            prompt=prompt,
            **kwargs
        )


def _construct_resp_from_cache(return_message):
    # Build a llama.cpp-style completion response from the cached answer.
    return {
        "gptcache": True,
        "choices": [
            {
                "text": return_message,
                "finish_reason": "stop",
                "index": 0,
            }
        ],
        "created": int(time.time()),
        "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
        "object": "chat.completion",
    }


def _construct_stream_resp_from_cache(return_message):
    # Build a single-chunk stream from the cached answer so callers that
    # requested streaming can iterate over a cache hit as usual.
    return [
        {
            "gptcache": True,
            "choices": [
                {
                    "text": return_message,
                    "finish_reason": None,
                    "index": 0,
                }
            ],
            "created": int(time.time()),
            "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
            "object": "chat.completion",
        }
    ]
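

# Usage sketch (not part of the module source): a minimal, hypothetical example
# of calling the wrapper with streaming enabled, assuming the cache setup shown
# in the class docstring (llm_cache, question); the model path is a placeholder.
#
#     llm = Llama('./models/7B/ggml-model.bin')
#     for chunk in llm(prompt=question, stream=True, cache_obj=llm_cache):
#         print(chunk["choices"][0]["text"], end="")
#
# On a cache hit the loop iterates over the single chunk produced by
# _construct_stream_resp_from_cache; on a miss the chunks are streamed from
# llama.cpp and the concatenated answer is written back to the cache by
# update_cache_callback.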