Building a simple data science agent that can handle
- Basic data operations (load, filter, aggregate)
- Code generation and execution
- Visualization with auto-display
- Excel/CSV support
- Conversation context
To make the system more robust, we need to incorporate advanced data handling capabilities — including large file support, database connectivity, multi-DataFrame input, and data versioning to enable undo or rollback functions. It could also include machine learning analytics, report generation, and dashboard creation.
On the user experience side, enhancements such as streaming responses, progress indicators, error recovery, and multimodal input should be considered.
From a safety perspective, the current execution environment isn’t secure — we need sandboxing, memory and CPU limits, and data privacy features like PII detection or masking, along with audit logging.
For commercial readiness, user authentication, role-based access control, team workspaces, cost tracking, and compliance mechanisms will also be essential.
"""
Simple Chat Agent using Azure OpenAI with Data Science capabilities
Reads configuration from config/config.ini
"""
import configparser
import os
import pandas as pd
import numpy as np
import json
from pathlib import Path
from openai import AzureOpenAI
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
def load_config(config_path="config/config.ini"):
"""Load configuration from ini file"""
config = configparser.ConfigParser()
config.read(config_path)
return config
def create_chat_agent(config):
"""Create Azure OpenAI client from config"""
azure_config = config['AzureOpenAI']
# Set up SSL certificate
import httpx
ca_bundle_path = "config/ca-bundle-full.crt"
http_client = httpx.Client(verify=ca_bundle_path)
client = AzureOpenAI(
api_key=azure_config.get('api_key'),
api_version=azure_config.get('api_version', '2024-02-15-preview'),
azure_endpoint=azure_config.get('endpoint'),
http_client=http_client
)
return client, azure_config.get('deployment_name', 'gpt-4')
def chat(client, deployment_name, user_message, conversation_history=None):
"""Send a message and get response"""
if conversation_history is None:
conversation_history = []
# Add user message to history
conversation_history.append({"role": "user", "content": user_message})
# Get response
response = client.chat.completions.create(
model=deployment_name,
messages=conversation_history,
temperature=0.7,
max_tokens=1500
)
# Extract assistant message
assistant_message = response.choices[0].message.content
conversation_history.append({"role": "assistant", "content": assistant_message})
return assistant_message, conversation_history
def execute_python_code(code, df=None, output_dir=None):
"""Execute Python code safely and return result"""
# Pre-process code to fix output_dir references
if output_dir and 'output_dir = ' in code:
code = code.replace('output_dir = "."', f'output_dir = r"{output_dir}"')
code = code.replace("output_dir = '.'", f'output_dir = r"{output_dir}"')
local_vars = {
'pd': pd,
'np': np,
'plt': plt,
'sns': sns,
'df': df,
'output_dir': str(output_dir) if output_dir else '.'
}
try:
exec(code, {}, local_vars)
return local_vars.get('result', local_vars.get('df', None)), None
except Exception as e:
return None, str(e)
def load_csv_file(filepath):
"""Load CSV or Excel file into DataFrame"""
try:
if filepath.endswith(('.xlsx', '.xls')):
df = pd.read_excel(filepath)
else:
df = pd.read_csv(filepath)
return df, None
except Exception as e:
return None, str(e)
def save_dataframe(df, output_path):
"""Save DataFrame to CSV or Excel"""
try:
if str(output_path).endswith(('.xlsx', '.xls')):
df.to_excel(output_path, index=False)
else:
df.to_csv(output_path, index=False)
return output_path, None
except Exception as e:
return None, str(e)
def main():
"""Main chat loop"""
print("Loading configuration...")
config = load_config()
print("Initializing chat agent...")
client, deployment_name = create_chat_agent(config)
# Data science context
current_df = None
current_file = None
output_dir = Path("outputs")
output_dir.mkdir(exist_ok=True)
system_prompt = """You are an advanced data science assistant with expertise in pandas, numpy, matplotlib, and seaborn.
When generating code:
1. Use 'df' for the DataFrame variable
2. Store final results in 'result' variable
3. Wrap code in ```python``` blocks
4. For plots: ALWAYS use plt.savefig(f'{output_dir}/plotname.png') and plt.close()
5. NEVER reassign output_dir - it's already set correctly
6. When summing numeric columns, use pd.to_numeric() to handle string values properly
Available libraries: pd (pandas), np (numpy), plt (matplotlib.pyplot), sns (seaborn)
Capabilities:
- Data cleaning: handle missing values, duplicates, outliers
- Transformations: pivot, melt, merge, join, groupby, aggregations
- Statistics: describe, correlations, distributions
- Visualizations: histograms, scatter plots, heatmaps, box plots
- Feature engineering: encoding, scaling, binning
- Time series: resampling, rolling windows, date operations
- Machine learning prep: train/test split, feature selection
Commands:
- "load <file>" - Load CSV/Excel
- "save <file>" - Save DataFrame
- "show" - Display info
- Ask any data science question"""
conversation_history = [
{"role": "system", "content": system_prompt}
]
print("\n" + "="*50)
print("Advanced Data Science Agent Ready!")
print("Commands: load, save, show, quit")
print("Capabilities: cleaning, stats, viz, ML prep")
print("="*50 + "\n")
while True:
user_input = input("You: ").strip()
if user_input.lower() in ['quit', 'exit', 'q']:
print("Goodbye!")
break
if not user_input:
continue
# Handle special commands
if user_input.lower().startswith('load '):
filepath = user_input[5:].strip()
current_df, error = load_csv_file(filepath)
if error:
print(f"\nError loading file: {error}\n")
else:
current_file = filepath
print(f"\nLoaded {filepath}")
print(f"Shape: {current_df.shape}")
print(f"Columns: {list(current_df.columns)}\n")
continue
if user_input.lower().startswith('save '):
if current_df is None:
print("\nNo DataFrame loaded. Use 'load <file>' first.\n")
continue
filename = user_input[5:].strip()
output_path = output_dir / filename
saved_path, error = save_dataframe(current_df, output_path)
if error:
print(f"\nError saving file: {error}\n")
else:
print(f"\nSaved to: {saved_path}\n")
continue
if user_input.lower() in ['show', 'display', 'head']:
if current_df is None:
print("\nNo DataFrame loaded.\n")
else:
print(f"\nCurrent DataFrame ({current_file}):")
print(f"Shape: {current_df.shape}")
print(f"Columns: {list(current_df.columns)}")
print(f"\nFirst 5 rows:\n{current_df.head()}\n")
continue
try:
# Add DataFrame context to user message if available
context_message = user_input
if current_df is not None:
df_preview = current_df.head(10).to_string()
context_message = f"""[Current DataFrame Info:
Shape: {current_df.shape}
Columns: {list(current_df.columns)}
First 10 rows:
{df_preview}
]
User question: {user_input}"""
# Get AI response
response, conversation_history = chat(
client,
deployment_name,
context_message,
conversation_history
)
print(f"\nAssistant: {response}\n")
# Extract and execute Python code if present
if '```python' in response:
code_blocks = response.split('```python')[1:]
for block in code_blocks:
code = block.split('```')[0].strip()
print(f"Executing code...\n")
# Track existing plots before execution
existing_plots = set(output_dir.glob('*.png'))
result, error = execute_python_code(code, current_df, output_dir)
if error:
print(f"Execution Error: {error}\n")
else:
# Check for NEW plots only
current_plots = set(output_dir.glob('*.png'))
new_plots = current_plots - existing_plots
if new_plots:
latest_plot = max(new_plots, key=lambda p: p.stat().st_mtime)
print(f"✓ Plot saved: {latest_plot}")
# Display the plot
try:
img = Image.open(latest_plot)
img.show()
print("✓ Plot displayed\n")
except Exception as e:
print(f"Could not display plot: {e}\n")
if isinstance(result, pd.DataFrame):
current_df = result
print(f"Result DataFrame shape: {result.shape}")
print(f"\n{result.head()}\n")
elif isinstance(result, pd.Series):
print(f"Result Series:\n{result}\n")
elif result is not None:
print(f"Result: {result}\n")
except Exception as e:
print(f"\nError: {e}\n")
if __name__ == "__main__":
main()