Commit cea201ec authored by SM Farhad Ali

CSV data analysis with Pandas AI added

parent 5b9888be
Branch: main
# Use an appropriate Python base image with Python 3.9 or later
FROM python:3.9-slim
# Set the working directory inside the container
WORKDIR /app
# Update and install necessary system packages
RUN apt-get update && apt-get install -y \
        build-essential \
        curl \
        software-properties-common \
        git \
    && rm -rf /var/lib/apt/lists/*
# Copy just the requirements file first to leverage Docker caching
COPY requirements.txt .
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Copy the rest of the application code into the container
COPY . .
# Expose the port that Streamlit uses
EXPOSE 8501
# Healthcheck to verify Streamlit app is running
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health || exit 1
# Command to run the Streamlit app
CMD ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
File added: docker-compose.yml
version: '3.8'

services:
  streamlit:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: streamlit
    ports:
      - "8501:8501"
    environment:
      - PYTHONUNBUFFERED=1  # Ensures Python output is logged in real time
      # NOTE: a credential committed to the repo; better supplied via an untracked .env file
      - LLM_AUTH_TOKEN=c2FsaTpQYXNzd29yZEAx
      - LLM_ENDPOINT=https://genai.iais.fraunhofer.de/api/v1/models/Mistral-7B-Instruct-v0.3_t2t  # LLM endpoint reachable from the streamlit service
    restart: unless-stopped
    volumes:
      - ./exports/charts:/app/exports/charts
File added: csv_data_analysis/exports/charts/temp_chart.png (14.9 KiB, binary)

import requests

from pandasai.llm.base import LLM
from pandasai.prompts.base import BasePrompt
from pandasai.helpers.memory import Memory


class CustomMistralLLM(LLM):
    """PandasAI LLM wrapper for a Mistral model served behind a custom REST endpoint."""

    def __init__(self, api_url: str, api_token: str):
        self.api_url = api_url
        self.api_token = api_token

    @property
    def type(self) -> str:
        return "custom_mistral"

    def call(self, instruction: BasePrompt, context: Memory = None) -> str:
        headers = {
            "accept": "application/json; charset=utf-8",
            "Process-Mode": "sync",
            "Authorization": f"Basic {self.api_token}",
            "Content-Type": "application/json; charset=utf-8",
        }
        payload = {
            "prompts": [{"role": "user", "content": instruction.to_string()}],
            "doSample": True,
            "maxTokens": 1200,
            "numBeams": 1,
            "repPenalty": 1.2,
            "temperature": 0,
            "topK": 50,
            "topP": 0.6,
        }
        # timeout added so a stalled endpoint does not hang the app indefinitely
        response = requests.post(self.api_url, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
        data = response.json()
        return data["payload"]["data"]["text"]
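A minimal smoke test for the wrapper above; a sketch assuming the LLM_ENDPOINT and LLM_AUTH_TOKEN environment variables from docker-compose.yml are set and the endpoint is reachable:

import os
import pandas as pd
from pandasai import SmartDataframe
from llms.CustomMistralLLM import CustomMistralLLM

# Hypothetical standalone check, outside Streamlit.
llm = CustomMistralLLM(os.environ["LLM_ENDPOINT"], os.environ["LLM_AUTH_TOKEN"])
df = pd.DataFrame({"country": ["DE", "FR"], "sales": [120, 80]})
sdf = SmartDataframe(df, config={"llm": llm})
print(sdf.chat("Which country has the higher sales?"))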
syntax = "proto3";
message Empty {
}
message NewsText {
string text = 1;
}
service NewsDatabroker {
rpc pullData(Empty) returns(NewsText);
}
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: model.proto
# Protobuf Python Version: 5.26.1
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0bmodel.proto\"\x07\n\x05\x45mpty\"\x18\n\x08NewsText\x12\x0c\n\x04text\x18\x01 \x01(\t2/\n\x0eNewsDatabroker\x12\x1d\n\x08pullData\x12\x06.Empty\x1a\t.NewsTextb\x06proto3')
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'model_pb2', _globals)
if not _descriptor._USE_C_DESCRIPTORS:
  DESCRIPTOR._loaded_options = None
  _globals['_EMPTY']._serialized_start=15
  _globals['_EMPTY']._serialized_end=22
  _globals['_NEWSTEXT']._serialized_start=24
  _globals['_NEWSTEXT']._serialized_end=48
  _globals['_NEWSDATABROKER']._serialized_start=50
  _globals['_NEWSDATABROKER']._serialized_end=97
# @@protoc_insertion_point(module_scope)
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
import warnings
import model_pb2 as model__pb2
GRPC_GENERATED_VERSION = '1.64.1'
GRPC_VERSION = grpc.__version__
EXPECTED_ERROR_RELEASE = '1.65.0'
SCHEDULED_RELEASE_DATE = 'June 25, 2024'
_version_not_supported = False
try:
    from grpc._utilities import first_version_is_lower
    _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
except ImportError:
    _version_not_supported = True

if _version_not_supported:
    warnings.warn(
        f'The grpc package installed is at version {GRPC_VERSION},'
        + f' but the generated code in model_pb2_grpc.py depends on'
        + f' grpcio>={GRPC_GENERATED_VERSION}.'
        + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
        + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
        + f' This warning will become an error in {EXPECTED_ERROR_RELEASE},'
        + f' scheduled for release on {SCHEDULED_RELEASE_DATE}.',
        RuntimeWarning
    )


class NewsDatabrokerStub(object):
    """Missing associated documentation comment in .proto file."""

    def __init__(self, channel):
        """Constructor.

        Args:
            channel: A grpc.Channel.
        """
        self.pullData = channel.unary_unary(
            '/NewsDatabroker/pullData',
            request_serializer=model__pb2.Empty.SerializeToString,
            response_deserializer=model__pb2.NewsText.FromString,
            _registered_method=True)


class NewsDatabrokerServicer(object):
    """Missing associated documentation comment in .proto file."""

    def pullData(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')


def add_NewsDatabrokerServicer_to_server(servicer, server):
    rpc_method_handlers = {
        'pullData': grpc.unary_unary_rpc_method_handler(
            servicer.pullData,
            request_deserializer=model__pb2.Empty.FromString,
            response_serializer=model__pb2.NewsText.SerializeToString,
        ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
        'NewsDatabroker', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
    server.add_registered_method_handlers('NewsDatabroker', rpc_method_handlers)


# This class is part of an EXPERIMENTAL API.
class NewsDatabroker(object):
    """Missing associated documentation comment in .proto file."""

    @staticmethod
    def pullData(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(
            request,
            target,
            '/NewsDatabroker/pullData',
            model__pb2.Empty.SerializeToString,
            model__pb2.NewsText.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)
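For completeness, a minimal sketch of wiring the generated classes into a server and client; DemoNewsDatabroker, the fixed headline, and the port are illustrative, not part of this commit:

import grpc
from concurrent import futures

import model_pb2
import model_pb2_grpc


class DemoNewsDatabroker(model_pb2_grpc.NewsDatabrokerServicer):
    # Hypothetical servicer: returns a fixed headline; a real broker
    # would pull from an actual news source.
    def pullData(self, request, context):
        return model_pb2.NewsText(text="Example headline")


def serve(port: int = 50051):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
    model_pb2_grpc.add_NewsDatabrokerServicer_to_server(DemoNewsDatabroker(), server)
    server.add_insecure_port(f"[::]:{port}")
    server.start()
    server.wait_for_termination()

# Client side:
# with grpc.insecure_channel("localhost:50051") as channel:
#     stub = model_pb2_grpc.NewsDatabrokerStub(channel)
#     print(stub.pullData(model_pb2.Empty()).text)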
langchain
langchain_community
pandasai==2.2.4
streamlit
requests
python-dotenv
protobuf
grpcio  # required by the generated gRPC code (model_pb2_grpc.py)
import os
from os.path import dirname, join

import streamlit as st
import pandas as pd
from dotenv import load_dotenv
from pandasai import SmartDataframe

from llms.CustomMistralLLM import CustomMistralLLM

# Load environment variables
load_dotenv()

st.set_page_config(page_title='Data Analysis with Pandas AI')
st.title('Data Analysis with Pandas AI')

uploaded_file = st.file_uploader("Choose a .csv file", type="csv")

# Endpoint and token come from docker-compose (or a local .env file)
LLM_ENDPOINT = os.getenv('LLM_ENDPOINT')
# Alternative local backend: llm = Ollama(model="mistral", base_url=LLM_ENDPOINT)
api_url = LLM_ENDPOINT
api_token = os.getenv("LLM_AUTH_TOKEN")
llm = CustomMistralLLM(api_url, api_token)

# Check if a file has been uploaded
if uploaded_file:
    # Read the uploaded CSV file into a pandas dataframe
    data = pd.read_csv(uploaded_file)
    st.subheader(f"File uploaded: {uploaded_file.name}")
    st.write(data.head(2))

    # Define a path to save charts (if needed)
    base_path = dirname(__file__)
    save_charts_path = join(base_path, 'exports', 'charts')

    # Initialize SmartDataframe for interactive data analysis
    smart_df = SmartDataframe(data, config={
        "llm": llm,
        "verbose": True,
        "open_charts": True,
        # "save_charts": True,  # Save generated charts to save_charts_path
        # "enable_cache": False,
    })

    prompt = st.text_area("Enter your prompt:")

    # Generate a response when the button is clicked
    if st.button("Generate"):
        if prompt:
            with st.spinner("Generating response..."):
                try:
                    # Get the response from the SmartDataframe
                    response = smart_df.chat(prompt)
                    if isinstance(response, pd.DataFrame):
                        st.write(response)
                    elif isinstance(response, str) and response.endswith(".png"):
                        st.image(response)
                    else:
                        st.write(response)
                except Exception as e:
                    # Display an error message if there's an issue
                    st.error(f"Error generating response: {e}")
        else:
            # Warn the user if the prompt is empty
            st.warning("Please enter a prompt!")