Commit 04402d4a authored by Danial Hezarkhani

Merge branch 'feature/quantized_llm' into 'main'

quantized llm to master

See merge request !11
parents 26c7396e 3dd70794
Showing 646 additions and 0 deletions
**/__pycache__
*.gguf
*gradio_cached_examples
\ No newline at end of file
FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime
RUN apt-get update -y
RUN pip install --no-cache-dir -U pip \
&& python -m pip install --upgrade build
COPY requirements.docker.txt ./requirements.docker.txt
RUN pip install --no-cache-dir -r requirements.docker.txt
ENV SHARED_FOLDER_PATH "/data"
ENV JUPYTER_PASSWORD "password"
ENV PORT 8062
WORKDIR /home/llm
COPY jupyter_server_config.py /etc/jupyter/
#EXPOSE 8061 8062
ENTRYPOINT ["python","-m", "jupyterlab"]
\ No newline at end of file
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
# mypy: ignore-errors
import os

from jupyter_server.auth import passwd

c = get_config()  # noqa: F821

# Listen on all interfaces so the server is reachable from outside the container.
c.ServerApp.ip = "0.0.0.0"
c.ServerApp.open_browser = False
c.ServerApp.allow_root = True
c.ServerApp.allow_origin = "*"
c.ServerApp.port = int(os.getenv("PORT", 8888))
c.ServerApp.allow_remote_access = True
# The password is taken from the JUPYTER_PASSWORD environment variable set in the Dockerfile.
c.ServerApp.password = passwd(os.getenv("JUPYTER_PASSWORD", "test"))
c.FileContentsManager.delete_to_trash = True
\ No newline at end of file
## Step 3: Push to Docker
Currently our container registry is used to host the Docker images:
docker build -t cicd.ai4eu-dev.eu:7444/tutorials/quantized_llm/llm_hf:latest .
docker run -p 8062:8062 cicd.ai4eu-dev.eu:7444/tutorials/quantized_llm/llm_hf:latest
docker push cicd.ai4eu-dev.eu:7444/tutorials/quantized_llm/llm_hf:latest
Any registry can be used to host the images.
\ No newline at end of file
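To use the GPU inside the container and keep the shared folder on the host, a run command along these lines should work (the `--gpus all` flag, the host directory, and the password value are illustrative additions, not from this repository; `/data` and `JUPYTER_PASSWORD` are defined in the Dockerfile above):
docker run --gpus all -p 8062:8062 -v $(pwd)/data:/data -e JUPYTER_PASSWORD=changeme cicd.ai4eu-dev.eu:7444/tutorials/quantized_llm/llm_hf:latest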
timm~=0.9.8
pytorch-lightning~=2.1.0
transformers~=4.34.1
pycocotools~=2.0.7
scipy~=1.11.4
tensorboard~=2.15.0
jupyter
jupyterlab
accelerate
langchain
\ No newline at end of file
%% Cell type:code id: tags:
```
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch
import os
access_token = os.getenv("HFTOKEN")
cachefolder = "/p/scratch/hai_westai_api/llm/cache"
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", cache_dir=cachefolder, token=access_token)
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", cache_dir=cachefolder, token=access_token, device_map="cuda")
```
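%% Cell type:markdown id: tags:
`BitsAndBytesConfig` is imported above but not used in this notebook. Since this merge request is about quantized LLMs, here is a minimal sketch of loading the same model in 4-bit with it; this is an illustration rather than part of the original notebook, and it assumes the `bitsandbytes` package is installed and a CUDA GPU is available.
%% Cell type:code id: tags:
```
# Sketch: 4-bit quantized loading via bitsandbytes (not part of the original notebook).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model_4bit = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    cache_dir=cachefolder,
    token=access_token,
    quantization_config=bnb_config,
    device_map="auto",
)
```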
%% Cell type:markdown id: tags:
# Simple Way
%% Cell type:code id: tags:
```
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
```
%% Cell type:code id: tags:
```
prompt = "As a data scientist, can you explain the concept of regularization in machine learning?"
sequences = pipe(
    prompt,
    do_sample=True,
    max_new_tokens=100,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1,
)
print(sequences[0]['generated_text'])
```
%% Cell type:markdown id: tags:
# Second Way
%% Cell type:code id: tags:
```
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=5000)
hf = HuggingFacePipeline(pipeline=pipe)
```
%% Cell type:code id: tags:
```
from langchain_core.prompts import PromptTemplate
template = """Question: {question}
Answer: Let's think step by step."""
prompt = PromptTemplate.from_template(template)
chain = prompt | hf
question = "What is electroencephalography?"
print(chain.invoke({"question": question}))
```
%% Cell type:markdown id: tags:
# Third Way
%% Cell type:code id: tags:
```
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1000,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)
hf = HuggingFacePipeline(pipeline=pipe)
template = """SYSTEM: You are a helpful, respectful and honest INTP-T AI Assistant named Buddy. You are talking to a human User.
Always answer as helpfully and logically as possible, while being safe.
USER: {question}
ASSISTANT:
"""
prompt = PromptTemplate(template=template, input_variables=["question"])
llm_chain = LLMChain(prompt=prompt, llm=hf)
print(llm_chain.invoke("Who is the Pope?"))
```
FROM python:3.10
ENV SHARED_FOLDER_PATH "/data"
ENV JUPYTER_PASSWORD "password"
ENV PORT 8062
RUN pip install jupyter -U && pip install jupyterlab
#This will be the home of jupyter lab.
#WORKDIR /jupyter
COPY jupyter_server_config.py /etc/jupyter/
#EXPOSE 8062
#ENTRYPOINT ["python","-m", "jupyterlab", "--allow-root", "--ip=*", "--port=8888", "--no-browser"]
#ENTRYPOINT ["python","-m", "jupyterlab", "--config", "/jupyter/jupyter_notebook_config.py"]
ENTRYPOINT ["python","-m", "jupyterlab"]
\ No newline at end of file
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
# mypy: ignore-errors
import os

from jupyter_server.auth import passwd

c = get_config()  # noqa: F821

# Listen on all interfaces so the server is reachable from outside the container.
c.ServerApp.ip = "0.0.0.0"
c.ServerApp.open_browser = False
c.ServerApp.allow_root = True
c.ServerApp.allow_origin = "*"
c.ServerApp.port = int(os.getenv("PORT", 8888))
c.ServerApp.allow_remote_access = True
# The password is taken from the JUPYTER_PASSWORD environment variable set in the Dockerfile.
c.ServerApp.password = passwd(os.getenv("JUPYTER_PASSWORD", "test"))
c.FileContentsManager.delete_to_trash = True
\ No newline at end of file
{
"$schema": "https://raw.githubusercontent.com/acumos/license-manager/master/license-manager-client-library/src/main/resources/schema/1.0.0/license-profile.json",
"keyword": "Apache-2.0",
"licenseName": "Apache License 2.0",
"copyright": {
"year": 2019,
"company": "Company A",
"suffix": "All Rights Reserved"
},
"softwareType": "Machine Learning Model",
"companyName": "Company A",
"contact": {
"name": "Company A Team Member",
"URL": "http://companya.com",
"email": "support@companya.com"
},
"rtuRequired": false
}
\ No newline at end of file
syntax = "proto3";

message Empty {
}

message JupyterStatus {
    int32 status = 1;
}

service JupyterLab {
    rpc loggging(JupyterStatus) returns (JupyterStatus);
}
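The stub modules `model_pb2.py` and `model_pb2_grpc.py` copied elsewhere in this merge request are the ones `grpcio-tools` generates from this file (note the RPC is spelled `loggging` in the proto, and the generated stub method follows that spelling). A minimal client sketch; the target address `localhost:8061` is an assumption, not taken from the repository:
```
# Regenerate the stubs with:  python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. model.proto
import grpc

import model_pb2
import model_pb2_grpc

# Assumed address; adjust to wherever the JupyterLab gRPC service is listening.
channel = grpc.insecure_channel("localhost:8061")
stub = model_pb2_grpc.JupyterLabStub(channel)
reply = stub.loggging(model_pb2.JupyterStatus(status=1))
print(reply.status)
```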
docker run -p 8888:8888 <image-name>
Push to Docker
Currently our container registry is used to host the Docker images:
docker build -t cicd.ai4eu-dev.eu:7444/tutorials/quantized_llm/jupyter_lab:latest .
docker run -p 8062:8062 cicd.ai4eu-dev.eu:7444/tutorials/quantized_llm/jupyter_lab:latest
docker push cicd.ai4eu-dev.eu:7444/tutorials/quantized_llm/jupyter_lab:latest
#
/dev
\ No newline at end of file
FROM python:3.10
# We need to set the host to 0.0.0.0 to allow outside access
#ENV HOST 0.0.0.0
RUN apt-get update -y
RUN pip install --no-cache-dir -U pip \
&& python -m pip install --upgrade build
# Install dependencies
RUN python3 -m pip install llama-cpp-python
ENV SHARED_FOLDER_PATH "/test"
ENV PORT 8062
# Download default model
ENV DEFAULT_MODEL_NAME "mistral-7b-instruct-v0.1.Q2_K.gguf"
ENV DEFAULT_MODEL_DL_URL "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q2_K.gguf"
ENV DEFAULT_MODEL_PATH "/default_model"
WORKDIR ${DEFAULT_MODEL_PATH}
RUN wget ${DEFAULT_MODEL_DL_URL}
# Copy scripts
WORKDIR /llm
COPY license-1.0.0.json config.py __init__.py logger.py model.proto gradioui.py ql_server.py llm.py requirements.txt ./
COPY model_pb2_grpc.py model_pb2.py ./
COPY static ./static
COPY templates ./templates
RUN python3 -m pip install -r requirements.txt
ENTRYPOINT [ "python3","ql_server.py" ]
\ No newline at end of file
#FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime
FROM nvidia/cuda:11.7.1-devel-ubuntu22.04
# We need to set the host to 0.0.0.0 to allow outside access
ENV HOST 0.0.0.0
RUN apt-get update && apt-get upgrade -y
RUN apt-get install -y git build-essential \
python3 python3-pip gcc wget \
ocl-icd-opencl-dev opencl-headers clinfo \
libclblast-dev libopenblas-dev
RUN mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
# setting build related env vars
ENV CUDA_DOCKER_ARCH=all
ENV LLAMA_CUBLAS=1
# Install dependencies
RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
# Install llama-cpp-python (build with cuda)
RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
ENV PYTHONUNBUFFERED=1
ENV SHARED_FOLDER_PATH "/test"
ENV PORT 8062
WORKDIR ${SHARED_FOLDER_PATH}
WORKDIR /llm
COPY license-1.0.0.json config.py __init__.py logger.py model.proto app.py ql_server.py requirements.txt ./
COPY static ./static
COPY templates ./templates
RUN python3 -m pip install -r requirements.txt
#ENV MODEL_NAME "mistral-7b-instruct-v0.1.Q4_K_M.gguf"
ENV MODEL_NAME "mistral-7b-instruct-v0.1.Q2_K.gguf"
ENV MODEL_DL_URL "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q2_K.gguf"
#COPY mistral-7b-instruct-v0.1.Q2_K.gguf ${SHARED_FOLDER_PATH}
ENTRYPOINT [ "python3","ql_server.py" ]
import os


def create_not_existing_dirs(list_of_paths):
    for dir in list_of_paths:
        if not os.path.exists(dir):
            # If not, create the folder.
            os.makedirs(dir)
            print(f"Folder '{dir}' created successfully.")


SHARED_FOLDER = os.getenv("SHARED_FOLDER_PATH", "./dev/local_shared_folder/")
port = int(os.getenv("PORT", "8062"))
default_model_name = os.getenv("DEFAULT_MODEL_NAME", "mistral-7b-instruct-v0.1.Q2_K.gguf")
default_model_path = os.getenv("DEFAULT_MODEL_PATH", "./dev")
create_not_existing_dirs([SHARED_FOLDER])
\ No newline at end of file
import gradio as gr
import requests

from llm import LLMModel
from logger import logging

port = 8061
llm = LLMModel()

with gr.Blocks() as gradio_ui:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")
    # submit_btn = gr.Button("submit")

    def send_request_to_localllm(data):
        # Call the in-process model directly.
        resp = llm.chat(data)
        return resp

    def send_chat_request(data):
        # Alternative: call a separately running server over HTTP (currently unused).
        url = "http://localhost:{}".format(port)
        endpoint = url + "/chat"
        # endpoint = "http://127.0.0.1:8062" + "/chat"
        try:
            r = requests.post(url=endpoint, json=data)
            resp_json = r.json()
            return resp_json
        except Exception as e:
            logging.error("Error calling llm endpoint.")
            logging.error(e)
            # raise gr.Error("Connection to server is failing!")
            return "Error: could not reach the llm endpoint."

    def convert_history_llama(history):
        history_dic_list = []
        # Add the system prompt: what the chatbot should do.
        system_message = "You are a friendly chatbot."
        new_prompt = {
            "role": "system",
            "content": system_message,
        }
        history_dic_list.append(new_prompt)
        # Add the previous turns to the new message, if any exist.
        if history:
            for item in history:
                user_prompt = {
                    "role": "user",
                    "content": item[0],
                }
                assistant_answer = {
                    "role": "assistant",
                    "content": item[1],
                }
                history_dic_list.append(user_prompt)
                history_dic_list.append(assistant_answer)
        return history_dic_list

    def bot(message, history):
        history_dic_list = convert_history_llama(history)
        # Add the current prompt.
        new_prompt = {
            "role": "user",
            "content": message,
        }
        history_dic_list.append(new_prompt)
        resp = send_request_to_localllm(history_dic_list)
        # resp = send_chat_request(history_dic_list)
        if resp:
            return "", history + [[message, resp]]
        else:
            return "", history

    msg.submit(bot, [msg, chatbot], [msg, chatbot], queue=False)
    clear.click(lambda: None, None, chatbot, queue=False)


def serve_gradio():
    gradio_ui.queue()
    gradio_ui.launch(server_port=8062, server_name="0.0.0.0")

# serve()
\ No newline at end of file
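`gradioui.py` only defines `serve_gradio()`; `ql_server.py`, which is not part of this diff, presumably calls it. A minimal sketch for running the UI on its own during local testing:
```
# Sketch: launch the Gradio chat UI directly (normally done by ql_server.py, not shown here).
from gradioui import serve_gradio

if __name__ == "__main__":
    serve_gradio()  # serves on 0.0.0.0:8062 as configured in gradioui.py
```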
{
"$schema": "https://raw.githubusercontent.com/acumos/license-manager/master/license-manager-client-library/src/main/resources/schema/1.0.0/license-profile.json",
"keyword": "Apache-2.0",
"licenseName": "Apache License 2.0",
"copyright": {
"year": 2019,
"company": "Company A",
"suffix": "All Rights Reserved"
},
"softwareType": "Machine Learning Model",
"companyName": "Company A",
"contact": {
"name": "Company A Team Member",
"URL": "http://companya.com",
"email": "support@companya.com"
},
"rtuRequired": false
}
\ No newline at end of file
import json
import os

from llama_cpp.llama import Llama

import config
from logger import Logger

logger = Logger(__name__)
class LLMModel:
    def __init__(self):
        self.path_to_parameter = os.path.join(config.SHARED_FOLDER, "parameter.json")
        self.default_parameters = {
            # "model_dl_url": config.default_model_dl_url,
            "model_name": config.default_model_name,
            "temp": 0.7,
            "top_p": 0.95,
            "top_k": 50,
            "n_ctx": 4096,  # The max sequence length to use; longer sequence lengths require much more resources.
            "echo": False,
            # "n_threads": n_threads,  # The number of CPU threads to use, tailor to your system and the resulting performance.
            # "n_gpu_layers": n_gpu_layers,  # The number of layers to offload to GPU, if you have GPU acceleration available.
        }
        self.parameters = None
        self.model = None
        self.parameters_changed = True
        # Save the first parameter dictionary.
        self.save_parameter(self.default_parameters)
        # Load the default model.
        self.get_parameters()
        self.get_model()
    def get_model(self):
        modelpath = None
        if self.parameters_changed or self.model is None:
            # Check whether the default model or a user-uploaded model should be used.
            if self.parameters["model_name"] == config.default_model_name:
                logger.debug("Default model is being used.")
                modelpath = os.path.join(config.default_model_path, self.parameters["model_name"])
            else:
                logger.debug("Model uploaded by user is being used.")
                modelpath = os.path.join(config.SHARED_FOLDER, self.parameters["model_name"])
            logger.info("Loading new model.")
            try:
                # Sampling settings (temp, top_p, top_k) are generation-time options and
                # are passed to create_chat_completion() in chat(), not to the constructor.
                self.model = Llama(
                    model_path=modelpath,
                    n_ctx=self.parameters["n_ctx"],
                )
            except ValueError:
                logger.error("Could not load model from path: {}".format(modelpath))
        else:
            logger.debug("Model is already loaded")
        return modelpath
    def save_parameter(self, para):
        with open(self.path_to_parameter, mode="w") as f:
            json.dump(para, f)

    def _check_if_parameter_changed(self, new_parameters):
        if self.parameters:
            for key, item in self.parameters.items():
                if item != new_parameters[key]:
                    self.parameters_changed = True
                    return
            self.parameters_changed = False
        else:
            self.parameters_changed = True
    def get_parameters(self):
        try:
            # Reading JSON data from the file.
            with open(self.path_to_parameter, mode="r") as file:
                data_read = json.load(file)
            self._check_if_parameter_changed(data_read)
            self.parameters = data_read
        except FileNotFoundError:
            logger.info(f"File not found: {self.path_to_parameter}. Please restart the container")
        except json.JSONDecodeError as e:
            logger.error(f"Error decoding JSON: {e}")
    def chat(self, prompt_dic):
        self.get_parameters()
        modelpath = None
        if self.parameters_changed or not self.model:
            modelpath = self.get_model()
        if not self.model:
            raise FileNotFoundError("Model could not be loaded. Please check if the model exists under: {}".format(modelpath))
        resp = self.model.create_chat_completion(
            messages=prompt_dic,
            temperature=self.parameters["temp"],
            top_p=self.parameters["top_p"],
            top_k=self.parameters["top_k"],
        )
        resp_return = resp["choices"][0]
        logger.info("finishing reason: " + resp_return["finish_reason"])
        logger.info("chat request finished")
        return resp_return["message"]["content"]
# def app_run():
# app.secret_key = "qlmodel"
# bootstrap = Bootstrap(app)
# app.run(host="0.0.0.0", port=config.port)
# if __name__ == '__main__':
# llm.chat([
# {
# "role": "system",
# "content": "You are a story writing assistant."
# },
# {
# "role": "user",
# "content": "hi how are you."
# }])
\ No newline at end of file
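The model served by `LLMModel` is selected through `parameter.json` in the shared folder. A minimal sketch of pointing the service at a user-uploaded GGUF file by rewriting that file (the file name `my-model.Q4_K_M.gguf` is a placeholder, not part of this repository):
```
import json
import os

import config

# Placeholder file name: the GGUF file is assumed to already be in the shared folder.
params = {
    "model_name": "my-model.Q4_K_M.gguf",
    "temp": 0.7,
    "top_p": 0.95,
    "top_k": 50,
    "n_ctx": 4096,
    "echo": False,
}
with open(os.path.join(config.SHARED_FOLDER, "parameter.json"), "w") as f:
    json.dump(params, f)
```
On the next `chat()` call, `get_parameters()` detects the change and `get_model()` reloads the model from the shared folder.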
import logging


class Logger(logging.Logger):
    def __init__(self, name):
        super().__init__(name)
        formatter = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.DEBUG)
        console_handler.setFormatter(formatter)
        self.addHandler(console_handler)
\ No newline at end of file