Create a Simple Data Science Assistant Agent

Building a simple data science agent that can handle

  • Basic data operations (load, filter, aggregate)
  • Code generation and execution
  • Visualization with auto-display
  • Excel/CSV support
  • Conversation context

To make the system more robust, we need to incorporate advanced data handling capabilities — including large file support, database connectivity, multi-DataFrame input, and data versioning to enable undo or rollback functions. It could also include machine learning analytics, report generation, and dashboard creation.

On the user experience side, enhancements such as streaming responses, progress indicators, error recovery, and multimodal input should be considered.

From a safety perspective, the current execution environment isn’t secure — we need sandboxing, memory and CPU limits, and data privacy features like PII detection or masking, along with audit logging.

For commercial readiness, user authentication, role-based access control, team workspaces, cost tracking, and compliance mechanisms will also be essential.

"""
Simple Chat Agent using Azure OpenAI with Data Science capabilities
Reads configuration from config/config.ini
"""

import configparser
import os
import pandas as pd
import numpy as np
import json
from pathlib import Path
from openai import AzureOpenAI
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image


def load_config(config_path="config/config.ini"):
    """Load configuration from ini file"""
    config = configparser.ConfigParser()
    config.read(config_path)
    return config


def create_chat_agent(config):
    """Create Azure OpenAI client from config"""
    azure_config = config['AzureOpenAI']
    
    # Set up SSL certificate
    import httpx
    ca_bundle_path = "config/ca-bundle-full.crt"
    http_client = httpx.Client(verify=ca_bundle_path)
    
    client = AzureOpenAI(
        api_key=azure_config.get('api_key'),
        api_version=azure_config.get('api_version', '2024-02-15-preview'),
        azure_endpoint=azure_config.get('endpoint'),
        http_client=http_client
    )
    
    return client, azure_config.get('deployment_name', 'gpt-4')


def chat(client, deployment_name, user_message, conversation_history=None):
    """Send a message and get response"""
    if conversation_history is None:
        conversation_history = []
    
    # Add user message to history
    conversation_history.append({"role": "user", "content": user_message})
    
    # Get response
    response = client.chat.completions.create(
        model=deployment_name,
        messages=conversation_history,
        temperature=0.7,
        max_tokens=1500
    )
    
    # Extract assistant message
    assistant_message = response.choices[0].message.content
    conversation_history.append({"role": "assistant", "content": assistant_message})
    
    return assistant_message, conversation_history


def execute_python_code(code, df=None, output_dir=None):
    """Execute Python code safely and return result"""
    # Pre-process code to fix output_dir references
    if output_dir and 'output_dir = ' in code:
        code = code.replace('output_dir = "."', f'output_dir = r"{output_dir}"')
        code = code.replace("output_dir = '.'", f'output_dir = r"{output_dir}"')
    
    local_vars = {
        'pd': pd,
        'np': np,
        'plt': plt,
        'sns': sns,
        'df': df,
        'output_dir': str(output_dir) if output_dir else '.'
    }
    try:
        exec(code, {}, local_vars)
        return local_vars.get('result', local_vars.get('df', None)), None
    except Exception as e:
        return None, str(e)


def load_csv_file(filepath):
    """Load CSV or Excel file into DataFrame"""
    try:
        if filepath.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(filepath)
        else:
            df = pd.read_csv(filepath)
        return df, None
    except Exception as e:
        return None, str(e)


def save_dataframe(df, output_path):
    """Save DataFrame to CSV or Excel"""
    try:
        if str(output_path).endswith(('.xlsx', '.xls')):
            df.to_excel(output_path, index=False)
        else:
            df.to_csv(output_path, index=False)
        return output_path, None
    except Exception as e:
        return None, str(e)


def main():
    """Main chat loop"""
    print("Loading configuration...")
    config = load_config()
    
    print("Initializing chat agent...")
    client, deployment_name = create_chat_agent(config)
    
    # Data science context
    current_df = None
    current_file = None
    output_dir = Path("outputs")
    output_dir.mkdir(exist_ok=True)
    
    system_prompt = """You are an advanced data science assistant with expertise in pandas, numpy, matplotlib, and seaborn.

When generating code:
1. Use 'df' for the DataFrame variable
2. Store final results in 'result' variable
3. Wrap code in ```python``` blocks
4. For plots: ALWAYS use plt.savefig(f'{output_dir}/plotname.png') and plt.close()
5. NEVER reassign output_dir - it's already set correctly
6. When summing numeric columns, use pd.to_numeric() to handle string values properly

Available libraries: pd (pandas), np (numpy), plt (matplotlib.pyplot), sns (seaborn)

Capabilities:
- Data cleaning: handle missing values, duplicates, outliers
- Transformations: pivot, melt, merge, join, groupby, aggregations
- Statistics: describe, correlations, distributions
- Visualizations: histograms, scatter plots, heatmaps, box plots
- Feature engineering: encoding, scaling, binning
- Time series: resampling, rolling windows, date operations
- Machine learning prep: train/test split, feature selection

Commands:
- "load <file>" - Load CSV/Excel
- "save <file>" - Save DataFrame
- "show" - Display info
- Ask any data science question"""
    
    conversation_history = [
        {"role": "system", "content": system_prompt}
    ]
    
    print("\n" + "="*50)
    print("Advanced Data Science Agent Ready!")
    print("Commands: load, save, show, quit")
    print("Capabilities: cleaning, stats, viz, ML prep")
    print("="*50 + "\n")
    
    while True:
        user_input = input("You: ").strip()
        
        if user_input.lower() in ['quit', 'exit', 'q']:
            print("Goodbye!")
            break
        
        if not user_input:
            continue
        
        # Handle special commands
        if user_input.lower().startswith('load '):
            filepath = user_input[5:].strip()
            current_df, error = load_csv_file(filepath)
            if error:
                print(f"\nError loading file: {error}\n")
            else:
                current_file = filepath
                print(f"\nLoaded {filepath}")
                print(f"Shape: {current_df.shape}")
                print(f"Columns: {list(current_df.columns)}\n")
            continue
        
        if user_input.lower().startswith('save '):
            if current_df is None:
                print("\nNo DataFrame loaded. Use 'load <file>' first.\n")
                continue
            filename = user_input[5:].strip()
            output_path = output_dir / filename
            saved_path, error = save_dataframe(current_df, output_path)
            if error:
                print(f"\nError saving file: {error}\n")
            else:
                print(f"\nSaved to: {saved_path}\n")
            continue
        
        if user_input.lower() in ['show', 'display', 'head']:
            if current_df is None:
                print("\nNo DataFrame loaded.\n")
            else:
                print(f"\nCurrent DataFrame ({current_file}):")
                print(f"Shape: {current_df.shape}")
                print(f"Columns: {list(current_df.columns)}")
                print(f"\nFirst 5 rows:\n{current_df.head()}\n")
            continue
        
        try:
            # Add DataFrame context to user message if available
            context_message = user_input
            if current_df is not None:
                df_preview = current_df.head(10).to_string()
                context_message = f"""[Current DataFrame Info:
Shape: {current_df.shape}
Columns: {list(current_df.columns)}

First 10 rows:
{df_preview}
]

User question: {user_input}"""
            
            # Get AI response
            response, conversation_history = chat(
                client, 
                deployment_name, 
                context_message, 
                conversation_history
            )
            
            print(f"\nAssistant: {response}\n")
            
            # Extract and execute Python code if present
            if '```python' in response:
                code_blocks = response.split('```python')[1:]
                for block in code_blocks:
                    code = block.split('```')[0].strip()
                    print(f"Executing code...\n")
                    
                    # Track existing plots before execution
                    existing_plots = set(output_dir.glob('*.png'))
                    
                    result, error = execute_python_code(code, current_df, output_dir)
                    
                    if error:
                        print(f"Execution Error: {error}\n")
                    else:
                        # Check for NEW plots only
                        current_plots = set(output_dir.glob('*.png'))
                        new_plots = current_plots - existing_plots
                        
                        if new_plots:
                            latest_plot = max(new_plots, key=lambda p: p.stat().st_mtime)
                            print(f"✓ Plot saved: {latest_plot}")
                            
                            # Display the plot
                            try:
                                img = Image.open(latest_plot)
                                img.show()
                                print("✓ Plot displayed\n")
                            except Exception as e:
                                print(f"Could not display plot: {e}\n")
                        
                        if isinstance(result, pd.DataFrame):
                            current_df = result
                            print(f"Result DataFrame shape: {result.shape}")
                            print(f"\n{result.head()}\n")
                        elif isinstance(result, pd.Series):
                            print(f"Result Series:\n{result}\n")
                        elif result is not None:
                            print(f"Result: {result}\n")
            
        except Exception as e:
            print(f"\nError: {e}\n")


if __name__ == "__main__":
    main()

Leave a comment

This site uses Akismet to reduce spam. Learn how your comment data is processed.