Commit 04402d4a authored by Danial Hezarkhani

Merge branch 'feature/quantized_llm' into 'main'

quantized llm to master

See merge request !11
parents 26c7396e 3dd70794
Showing 646 additions and 0 deletions
**/__pycache__
*.gguf
*gradio_cached_examples
\ No newline at end of file
FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime
RUN apt-get update -y
RUN pip install --no-cache-dir -U pip \
&& python -m pip install --upgrade build
COPY requirements.docker.txt ./requirements.docker.txt
RUN pip install --no-cache-dir -r requirements.docker.txt
ENV SHARED_FOLDER_PATH "/data"
ENV JUPYTER_PASSWORD "password"
ENV PORT 8062
WORKDIR /home/llm
COPY jupyter_server_config.py /etc/jupyter/
#EXPOSE 8061 8062
ENTRYPOINT ["python","-m", "jupyterlab"]
\ No newline at end of file
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
# mypy: ignore-errors
import os

from jupyter_server.auth import passwd

c = get_config()  # noqa: F821

# Listen on all interfaces so the server is reachable from outside the container.
c.ServerApp.ip = "0.0.0.0"
c.ServerApp.open_browser = False
c.ServerApp.allow_root = True
c.ServerApp.allow_origin = "*"
c.ServerApp.port = int(os.getenv("PORT", 8888))
c.ServerApp.allow_remote_access = True
# The password is taken from the JUPYTER_PASSWORD environment variable set in the Dockerfile.
c.ServerApp.password = passwd(os.getenv("JUPYTER_PASSWORD", "test"))
c.FileContentsManager.delete_to_trash = True
\ No newline at end of file
## Step 3: Push to Docker
Currently our container registry is used to host the Docker images:
docker build -t cicd.ai4eu-dev.eu:7444/tutorials/quantized_llm/llm_hf:latest .
docker run -p 8062:8062 cicd.ai4eu-dev.eu:7444/tutorials/quantized_llm/llm_hf:latest
docker push cicd.ai4eu-dev.eu:7444/tutorials/quantized_llm/llm_hf:latest
Any registry can be used to host the images.
\ No newline at end of file
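To use the GPU inside the container and keep the shared folder on the host, a run command along these lines should work (the `--gpus all` flag, the host directory, and the password value are illustrative additions, not from this repository; `/data` and `JUPYTER_PASSWORD` are defined in the Dockerfile above):
docker run --gpus all -p 8062:8062 -v $(pwd)/data:/data -e JUPYTER_PASSWORD=changeme cicd.ai4eu-dev.eu:7444/tutorials/quantized_llm/llm_hf:latest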
timm~=0.9.8
pytorch-lightning~=2.1.0
transformers~=4.34.1
pycocotools~=2.0.7
scipy~=1.11.4
tensorboard~=2.15.0
jupyter
jupyterlab
accelerate
langchain
\ No newline at end of file
%% Cell type:code id: tags:
```
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch
import os
access_token = os.getenv("HFTOKEN")
cachefolder = "/p/scratch/hai_westai_api/llm/cache"
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", cache_dir=cachefolder, token=access_token)
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", cache_dir=cachefolder, token=access_token, device_map="cuda")
```
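%% Cell type:markdown id: tags:
`BitsAndBytesConfig` is imported above but not used in this notebook. Since this merge request is about quantized LLMs, here is a minimal sketch of loading the same model in 4-bit with it; this is an illustration rather than part of the original notebook, and it assumes the `bitsandbytes` package is installed and a CUDA GPU is available.
%% Cell type:code id: tags:
```
# Sketch: 4-bit quantized loading via bitsandbytes (not part of the original notebook).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model_4bit = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    cache_dir=cachefolder,
    token=access_token,
    quantization_config=bnb_config,
    device_map="auto",
)
```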
%% Cell type:markdown id: tags:
# Simple Way
%% Cell type:code id: tags:
```
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
```
%% Cell type:code id: tags:
```
prompt = "As a data scientist, can you explain the concept of regularization in machine learning?"
sequences = pipe(
    prompt,
    do_sample=True,
    max_new_tokens=100,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1,
)
print(sequences[0]['generated_text'])
```
%% Cell type:markdown id: tags:
# Second Way
%% Cell type:code id: tags:
```
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=5000)
hf = HuggingFacePipeline(pipeline=pipe)
```
%% Cell type:code id: tags:
```
from langchain_core.prompts import PromptTemplate
template = """Question: {question}
Answer: Let's think step by step."""
prompt = PromptTemplate.from_template(template)
chain = prompt | hf
question = "What is electroencephalography?"
print(chain.invoke({"question": question}))
```
%% Cell type:markdown id: tags:
# Third Way
%% Cell type:code id: tags:
```
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1000,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)
hf = HuggingFacePipeline(pipeline=pipe)
template = """SYSTEM: You are a helpful, respectful and honest INTP-T AI Assistant named Buddy. You are talking to a human User.
Always answer as helpfully and logically as possible, while being safe.
USER: {question}
ASSISTANT:
"""
prompt = PromptTemplate(template=template, input_variables=["question"])
llm_chain = LLMChain(prompt=prompt, llm=hf)
print(llm_chain.invoke("Who is the Pope?"))
```
FROM python:3.10
ENV SHARED_FOLDER_PATH "/data"
ENV JUPYTER_PASSWORD "password"
ENV PORT 8062
RUN pip install jupyter -U && pip install jupyterlab
#This will be the home of jupyter lab.
#WORKDIR /jupyter
COPY jupyter_server_config.py /etc/jupyter/
#EXPOSE 8062
#ENTRYPOINT ["python","-m", "jupyterlab", "--allow-root", "--ip=*", "--port=8888", "--no-browser"]
#ENTRYPOINT ["python","-m", "jupyterlab", "--config", "/jupyter/jupyter_notebook_config.py"]
ENTRYPOINT ["python","-m", "jupyterlab"]
\ No newline at end of file
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
# mypy: ignore-errors
import os

from jupyter_server.auth import passwd

c = get_config()  # noqa: F821

# Listen on all interfaces so the server is reachable from outside the container.
c.ServerApp.ip = "0.0.0.0"
c.ServerApp.open_browser = False
c.ServerApp.allow_root = True
c.ServerApp.allow_origin = "*"
c.ServerApp.port = int(os.getenv("PORT", 8888))
c.ServerApp.allow_remote_access = True
# The password is taken from the JUPYTER_PASSWORD environment variable set in the Dockerfile.
c.ServerApp.password = passwd(os.getenv("JUPYTER_PASSWORD", "test"))
c.FileContentsManager.delete_to_trash = True
\ No newline at end of file
{
"$schema": "https://raw.githubusercontent.com/acumos/license-manager/master/license-manager-client-library/src/main/resources/schema/1.0.0/license-profile.json",
"keyword": "Apache-2.0",
"licenseName": "Apache License 2.0",
"copyright": {
"year": 2019,
"company": "Company A",
"suffix": "All Rights Reserved"
},
"softwareType": "Machine Learning Model",
"companyName": "Company A",
"contact": {
"name": "Company A Team Member",
"URL": "http://companya.com",
"email": "support@companya.com"
},
"rtuRequired": false
}
\ No newline at end of file
syntax = "proto3";

message Empty {
}

message JupyterStatus {
    int32 status = 1;
}

service JupyterLab {
    rpc loggging(JupyterStatus) returns (JupyterStatus);
}
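The stub modules `model_pb2.py` and `model_pb2_grpc.py` copied elsewhere in this merge request are the ones `grpcio-tools` generates from this file (note the RPC is spelled `loggging` in the proto, and the generated stub method follows that spelling). A minimal client sketch; the target address `localhost:8061` is an assumption, not taken from the repository:
```
# Regenerate the stubs with:  python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. model.proto
import grpc

import model_pb2
import model_pb2_grpc

# Assumed address; adjust to wherever the JupyterLab gRPC service is listening.
channel = grpc.insecure_channel("localhost:8061")
stub = model_pb2_grpc.JupyterLabStub(channel)
reply = stub.loggging(model_pb2.JupyterStatus(status=1))
print(reply.status)
```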
docker run -p 8888:8888 <image-name>
Push to Docker
Currently our container registry is used to host the Docker images:
docker build -t cicd.ai4eu-dev.eu:7444/tutorials/quantized_llm/jupyter_lab:latest .
docker run -p 8062:8062 cicd.ai4eu-dev.eu:7444/tutorials/quantized_llm/jupyter_lab:latest
docker push cicd.ai4eu-dev.eu:7444/tutorials/quantized_llm/jupyter_lab:latest
#
/dev
\ No newline at end of file
FROM python:3.10
# We need to set the host to 0.0.0.0 to allow outside access
#ENV HOST 0.0.0.0
RUN apt-get update -y
RUN pip install --no-cache-dir -U pip \
&& python -m pip install --upgrade build
# Install dependencies
RUN python3 -m pip install llama-cpp-python
ENV SHARED_FOLDER_PATH "/test"
ENV PORT 8062
# Download default model
ENV DEFAULT_MODEL_NAME "mistral-7b-instruct-v0.1.Q2_K.gguf"
ENV DEFAULT_MODEL_DL_URL "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q2_K.gguf"
ENV DEFAULT_MODEL_PATH "/default_model"
WORKDIR ${DEFAULT_MODEL_PATH}
RUN wget ${DEFAULT_MODEL_DL_URL}
# Copy scripts
WORKDIR /llm
COPY license-1.0.0.json config.py __init__.py logger.py model.proto gradioui.py ql_server.py llm.py requirements.txt ./
COPY model_pb2_grpc.py model_pb2.py ./
COPY static ./static
COPY templates ./templates
RUN python3 -m pip install -r requirements.txt
ENTRYPOINT [ "python3","ql_server.py" ]
\ No newline at end of file
#FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime
FROM nvidia/cuda:11.7.1-devel-ubuntu22.04
# We need to set the host to 0.0.0.0 to allow outside access
ENV HOST 0.0.0.0
RUN apt-get update && apt-get upgrade -y
RUN apt-get install -y git build-essential \
python3 python3-pip gcc wget \
ocl-icd-opencl-dev opencl-headers clinfo \
libclblast-dev libopenblas-dev
RUN mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
# setting build related env vars
ENV CUDA_DOCKER_ARCH=all
ENV LLAMA_CUBLAS=1
# Install dependencies
RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
# Install llama-cpp-python (build with cuda)
RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
ENV PYTHONUNBUFFERED=1
ENV SHARED_FOLDER_PATH "/test"
ENV PORT 8062
WORKDIR ${SHARED_FOLDER_PATH}
WORKDIR /llm
COPY license-1.0.0.json config.py __init__.py logger.py model.proto app.py ql_server.py requirements.txt ./
COPY static ./static
COPY templates ./templates
RUN python3 -m pip install -r requirements.txt
#ENV MODEL_NAME "mistral-7b-instruct-v0.1.Q4_K_M.gguf"
ENV MODEL_NAME "mistral-7b-instruct-v0.1.Q2_K.gguf"
ENV MODEL_DL_URL "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q2_K.gguf"
#COPY mistral-7b-instruct-v0.1.Q2_K.gguf ${SHARED_FOLDER_PATH}
ENTRYPOINT [ "python3","ql_server.py" ]
import os


def create_not_existing_dirs(list_of_paths):
    for dir in list_of_paths:
        if not os.path.exists(dir):
            # If not, create the folder.
            os.makedirs(dir)
            print(f"Folder '{dir}' created successfully.")


SHARED_FOLDER = os.getenv("SHARED_FOLDER_PATH", "./dev/local_shared_folder/")
port = int(os.getenv("PORT", "8062"))
default_model_name = os.getenv("DEFAULT_MODEL_NAME", "mistral-7b-instruct-v0.1.Q2_K.gguf")
default_model_path = os.getenv("DEFAULT_MODEL_PATH", "./dev")
create_not_existing_dirs([SHARED_FOLDER])
\ No newline at end of file
import gradio as gr
import requests

from llm import LLMModel
from logger import logging

port = 8061
llm = LLMModel()

with gr.Blocks() as gradio_ui:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")
    # submit_btn = gr.Button("submit")

    def send_request_to_localllm(data):
        # Call the in-process model directly.
        resp = llm.chat(data)
        return resp

    def send_chat_request(data):
        # Alternative: call a separately running server over HTTP (currently unused).
        url = "http://localhost:{}".format(port)
        endpoint = url + "/chat"
        # endpoint = "http://127.0.0.1:8062" + "/chat"
        try:
            r = requests.post(url=endpoint, json=data)
            resp_json = r.json()
            return resp_json
        except Exception as e:
            logging.error("Error calling llm endpoint.")
            logging.error(e)
            # raise gr.Error("Connection to server is failing!")
            return "Error: could not reach the llm endpoint."

    def convert_history_llama(history):
        history_dic_list = []
        # Add the system prompt: what the chatbot should do.
        system_message = "You are a friendly chatbot."
        new_prompt = {
            "role": "system",
            "content": system_message,
        }
        history_dic_list.append(new_prompt)
        # Add the previous turns to the new message, if any exist.
        if history:
            for item in history:
                user_prompt = {
                    "role": "user",
                    "content": item[0],
                }
                assistant_answer = {
                    "role": "assistant",
                    "content": item[1],
                }
                history_dic_list.append(user_prompt)
                history_dic_list.append(assistant_answer)
        return history_dic_list

    def bot(message, history):
        history_dic_list = convert_history_llama(history)
        # Add the current prompt.
        new_prompt = {
            "role": "user",
            "content": message,
        }
        history_dic_list.append(new_prompt)
        resp = send_request_to_localllm(history_dic_list)
        # resp = send_chat_request(history_dic_list)
        if resp:
            return "", history + [[message, resp]]
        else:
            return "", history

    msg.submit(bot, [msg, chatbot], [msg, chatbot], queue=False)
    clear.click(lambda: None, None, chatbot, queue=False)


def serve_gradio():
    gradio_ui.queue()
    gradio_ui.launch(server_port=8062, server_name="0.0.0.0")

# serve()
\ No newline at end of file
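`gradioui.py` only defines `serve_gradio()`; `ql_server.py`, which is not part of this diff, presumably calls it. A minimal sketch for running the UI on its own during local testing:
```
# Sketch: launch the Gradio chat UI directly (normally done by ql_server.py, not shown here).
from gradioui import serve_gradio

if __name__ == "__main__":
    serve_gradio()  # serves on 0.0.0.0:8062 as configured in gradioui.py
```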
{
"$schema": "https://raw.githubusercontent.com/acumos/license-manager/master/license-manager-client-library/src/main/resources/schema/1.0.0/license-profile.json",
"keyword": "Apache-2.0",
"licenseName": "Apache License 2.0",
"copyright": {
"year": 2019,
"company": "Company A",
"suffix": "All Rights Reserved"
},
"softwareType": "Machine Learning Model",
"companyName": "Company A",
"contact": {
"name": "Company A Team Member",
"URL": "http://companya.com",
"email": "support@companya.com"
},
"rtuRequired": false
}
\ No newline at end of file
import json
import os

from llama_cpp.llama import Llama

import config
from logger import Logger

logger = Logger(__name__)
class LLMModel:
    def __init__(self):
        self.path_to_parameter = os.path.join(config.SHARED_FOLDER, "parameter.json")
        self.default_parameters = {
            # "model_dl_url": config.default_model_dl_url,
            "model_name": config.default_model_name,
            "temp": 0.7,
            "top_p": 0.95,
            "top_k": 50,
            "n_ctx": 4096,  # The max sequence length to use; longer sequence lengths require much more resources.
            "echo": False,
            # "n_threads": n_threads,  # The number of CPU threads to use, tailor to your system and the resulting performance.
            # "n_gpu_layers": n_gpu_layers,  # The number of layers to offload to GPU, if you have GPU acceleration available.
        }
        self.parameters = None
        self.model = None
        self.parameters_changed = True
        # Save the first parameter dictionary.
        self.save_parameter(self.default_parameters)
        # Load the default model.
        self.get_parameters()
        self.get_model()
    def get_model(self):
        modelpath = None
        if self.parameters_changed or self.model is None:
            # Check whether the default model or a user-uploaded model should be used.
            if self.parameters["model_name"] == config.default_model_name:
                logger.debug("Default model is being used.")
                modelpath = os.path.join(config.default_model_path, self.parameters["model_name"])
            else:
                logger.debug("Model uploaded by user is being used.")
                modelpath = os.path.join(config.SHARED_FOLDER, self.parameters["model_name"])
            logger.info("Loading new model.")
            try:
                # Sampling settings (temp, top_p, top_k) are generation-time options and
                # are passed to create_chat_completion() in chat(), not to the constructor.
                self.model = Llama(
                    model_path=modelpath,
                    n_ctx=self.parameters["n_ctx"],
                )
            except ValueError:
                logger.error("Could not load model from path: {}".format(modelpath))
        else:
            logger.debug("Model is already loaded")
        return modelpath
    def save_parameter(self, para):
        with open(self.path_to_parameter, mode="w") as f:
            json.dump(para, f)

    def _check_if_parameter_changed(self, new_parameters):
        if self.parameters:
            for key, item in self.parameters.items():
                if item != new_parameters[key]:
                    self.parameters_changed = True
                    return
            self.parameters_changed = False
        else:
            self.parameters_changed = True
    def get_parameters(self):
        try:
            # Reading JSON data from the file.
            with open(self.path_to_parameter, mode="r") as file:
                data_read = json.load(file)
            self._check_if_parameter_changed(data_read)
            self.parameters = data_read
        except FileNotFoundError:
            logger.info(f"File not found: {self.path_to_parameter}. Please restart the container")
        except json.JSONDecodeError as e:
            logger.error(f"Error decoding JSON: {e}")
    def chat(self, prompt_dic):
        self.get_parameters()
        modelpath = None
        if self.parameters_changed or not self.model:
            modelpath = self.get_model()
        if not self.model:
            raise FileNotFoundError("Model could not be loaded. Please check if the model exists under: {}".format(modelpath))
        resp = self.model.create_chat_completion(
            messages=prompt_dic,
            temperature=self.parameters["temp"],
            top_p=self.parameters["top_p"],
            top_k=self.parameters["top_k"],
        )
        resp_return = resp["choices"][0]
        logger.info("finishing reason: " + resp_return["finish_reason"])
        logger.info("chat request finished")
        return resp_return["message"]["content"]
# def app_run():
# app.secret_key = "qlmodel"
# bootstrap = Bootstrap(app)
# app.run(host="0.0.0.0", port=config.port)
# if __name__ == '__main__':
# llm.chat([
# {
# "role": "system",
# "content": "You are a story writing assistant."
# },
# {
# "role": "user",
# "content": "hi how are you."
# }])
\ No newline at end of file
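The model served by `LLMModel` is selected through `parameter.json` in the shared folder. A minimal sketch of pointing the service at a user-uploaded GGUF file by rewriting that file (the file name `my-model.Q4_K_M.gguf` is a placeholder, not part of this repository):
```
import json
import os

import config

# Placeholder file name: the GGUF file is assumed to already be in the shared folder.
params = {
    "model_name": "my-model.Q4_K_M.gguf",
    "temp": 0.7,
    "top_p": 0.95,
    "top_k": 50,
    "n_ctx": 4096,
    "echo": False,
}
with open(os.path.join(config.SHARED_FOLDER, "parameter.json"), "w") as f:
    json.dump(params, f)
```
On the next `chat()` call, `get_parameters()` detects the change and `get_model()` reloads the model from the shared folder.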
import logging


class Logger(logging.Logger):
    def __init__(self, name):
        super().__init__(name)
        formatter = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.DEBUG)
        console_handler.setFormatter(formatter)
        self.addHandler(console_handler)
\ No newline at end of file