mic3333 committed on
Commit
c4848ed
·
verified ·
1 Parent(s): aa35d25

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +419 -0
app.py ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import io
4
+ import pandas as pd
5
+ import plotly.express as px
6
+ import plotly.graph_objects as go
7
+ from dash import Dash, html, dcc, Input, Output, State
8
+ import dash_bootstrap_components as dbc
9
+
10
+ # Langchain imports
11
+ from langchain.llms import HuggingFacePipeline
12
+ from langchain.embeddings import HuggingFaceEmbeddings
13
+ from langchain.vectorstores import FAISS
14
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
15
+ from langchain.chains import RetrievalQA
16
+ from langchain.document_loaders import CSVLoader, DataFrameLoader
17
+ from langchain.schema import Document
18
+
19
# Create the Dash application with the Bootstrap theme and publish its
# underlying server object under the module-level name `server`.
app = Dash(
    __name__,
    external_stylesheets=[dbc.themes.BOOTSTRAP],
)
server = app.server
22
+
23
+ # Initialize Langchain components
24
def init_langchain():
    """Build the embedding model and text splitter used by the AI features.

    This function is intentionally undecorated. Streamlit's
    ``st.cache_resource`` is not available in this Dash module (``st`` is
    never imported), and the function is invoked a single time at module
    level, so no caching layer is required.

    Returns:
        tuple: ``(embeddings, text_splitter)`` on success, or
        ``(None, None)`` if constructing either component raises.
    """
    try:
        # Use a lightweight model for embeddings
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
        )

        # Initialize text splitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )

        return embeddings, text_splitter
    except Exception as e:
        # Best-effort start-up: report the problem and let the app run
        # without the Langchain components.
        print(f"Error initializing Langchain: {e}")
        return None, None
44
+
45
# Module-level state shared by the helper functions and callbacks.
# `vector_store` starts empty and is filled in by create_vector_store().
vector_store = None
# Either component is None when init_langchain() could not build it.
embeddings, text_splitter = init_langchain()
48
+
49
# App layout
# Inline CSS for the dashed drag-and-drop upload target.
_UPLOAD_STYLE = {
    'width': '100%',
    'height': '60px',
    'lineHeight': '60px',
    'borderWidth': '1px',
    'borderStyle': 'dashed',
    'borderRadius': '5px',
    'textAlign': 'center',
    'margin': '10px',
}

# Page header: title, tagline and a divider.
_header_row = dbc.Row([
    dbc.Col(
        [
            html.H1("πŸ€– AI-Powered Data Analytics", className="text-center mb-4"),
            html.P(
                "Upload data, ask questions, and get AI-powered insights!",
                className="text-center text-muted",
            ),
            html.Hr(),
        ],
        width=12,
    ),
])

# Upload widget and the placeholder that reports the upload result.
_upload_section = [
    html.H4("πŸ“ Data Upload", className="card-title"),
    dcc.Upload(
        id='upload-data',
        children=html.Div(['Drag and Drop or ', html.A('Select Files')]),
        style=_UPLOAD_STYLE,
        multiple=False,
        accept='.csv,.xlsx,.txt',
    ),
    html.Div(id='upload-status', className="mt-2"),
]

# Free-text question box, its submit button and the answer placeholder.
_assistant_section = [
    html.H4("πŸ€– AI Assistant", className="card-title"),
    dbc.InputGroup([
        dbc.Input(
            id="ai-question",
            placeholder="Ask questions about your data...",
            type="text",
            style={"fontSize": "14px"},
        ),
        dbc.Button("Ask AI", id="ask-button", color="primary", n_clicks=0),
    ]),
    html.Div(id="ai-response", className="mt-3"),
]

# One-click analytics buttons and their output placeholder.
_quick_section = [
    html.H4("πŸ“Š Quick Analytics", className="card-title"),
    dbc.ButtonGroup(
        [
            dbc.Button("Summary Stats", id="stats-btn", size="sm"),
            dbc.Button("Correlations", id="corr-btn", size="sm"),
            dbc.Button("Missing Data", id="missing-btn", size="sm"),
        ],
        className="w-100",
    ),
    html.Div(id="quick-analytics", className="mt-3"),
]

# Left column: a single card stacking the three control sections.
_controls_column = dbc.Col(
    [
        dbc.Card([
            dbc.CardBody([
                *_upload_section,
                html.Hr(),
                *_assistant_section,
                html.Hr(),
                *_quick_section,
            ])
        ])
    ],
    width=4,
)

# Right column: main chart on top, data preview underneath.
_results_column = dbc.Col(
    [
        dbc.Card([
            dbc.CardBody([
                html.H4("πŸ“ˆ Visualizations", className="card-title"),
                dcc.Graph(id='main-graph', style={'height': '400px'}),
            ])
        ]),
        dbc.Card(
            [
                dbc.CardBody([
                    html.H4("πŸ” Data Explorer", className="card-title"),
                    html.Div(id='data-table'),
                ])
            ],
            className="mt-3",
        ),
    ],
    width=8,
)

app.layout = dbc.Container(
    [
        _header_row,
        dbc.Row([_controls_column, _results_column], className="mt-4"),

        # Store components
        dcc.Store(id='stored-data'),
        dcc.Store(id='data-context'),
    ],
    fluid=True,
)
140
+
141
def create_vector_store(df):
    """Create the global FAISS vector store from descriptions of ``df``.

    Up to four text documents are embedded: the schema (row/column counts,
    column names, dtypes), ``df.describe()``, the first ten rows, and, when
    more than one numeric column exists, the correlation matrix.

    Args:
        df: The uploaded pandas DataFrame.

    Returns:
        bool: ``True`` when the store was built and assigned to the
        module-level ``vector_store``; ``False`` when embeddings are
        unavailable or any step raised.
    """
    global vector_store

    if embeddings is None:
        return False

    try:
        # Column labels are not guaranteed to be strings (Excel headers can
        # be numbers or dates), so convert them before joining.
        column_names = ', '.join(str(name) for name in df.columns)

        # Add column information
        col_info = f"Dataset has {len(df)} rows and {len(df.columns)} columns.\n"
        col_info += f"Columns: {column_names}\n"
        col_info += f"Data types: {df.dtypes.to_string()}\n"
        documents = [Document(page_content=col_info, metadata={"type": "schema"})]

        # Add summary statistics
        summary = df.describe().to_string()
        documents.append(Document(page_content=f"Summary statistics:\n{summary}",
                                  metadata={"type": "statistics"}))

        # Add sample rows
        sample_data = df.head(10).to_string()
        documents.append(Document(page_content=f"Sample data:\n{sample_data}",
                                  metadata={"type": "sample"}))

        # Add correlation information for numeric columns
        numeric_cols = df.select_dtypes(include=['number']).columns
        if len(numeric_cols) > 1:
            corr = df[numeric_cols].corr().to_string()
            documents.append(Document(page_content=f"Correlations:\n{corr}",
                                      metadata={"type": "correlation"}))

        # Create vector store
        vector_store = FAISS.from_documents(documents, embeddings)
        return True

    except Exception as e:
        # Best-effort: the caller reports limited AI support on False.
        print(f"Error creating vector store: {e}")
        return False
182
+
183
def get_ai_response(question, df):
    """Answer a question about ``df`` with keyword-matched canned analyses.

    The question is lower-cased and matched against keyword groups
    (summary/overview, correlation/relationship, missing/null,
    recommend/suggest); anything else gets a help message. The module-level
    ``vector_store`` is only checked for presence, as a signal that data has
    been uploaded.

    Args:
        question: Free-text question typed by the user.
        df: DataFrame holding the uploaded data.

    Returns:
        str: A Markdown-formatted answer, or an error message if processing
        the question raised.
    """
    global vector_store

    if vector_store is None:
        return "Please upload data first to enable AI features."

    try:
        # Simple keyword-based responses for demo
        question_lower = question.lower()

        if "summary" in question_lower or "overview" in question_lower:
            return f"""πŸ“Š **Data Summary**:
- **Shape**: {df.shape[0]} rows Γ— {df.shape[1]} columns
- **Columns**: {', '.join(df.columns)}
- **Missing values**: {df.isnull().sum().sum()} total
- **Numeric columns**: {len(df.select_dtypes(include=['number']).columns)}
"""

        elif "correlation" in question_lower or "relationship" in question_lower:
            numeric_cols = df.select_dtypes(include=['number']).columns
            if len(numeric_cols) > 1:
                corr = df[numeric_cols].corr()
                # Undefined correlations (e.g. constant columns) are NaN and
                # cannot be ranked, so drop them first.
                corr_vals = corr.abs().unstack().dropna()
                # Remove self-correlations by label, not by value: filtering
                # on "< 1.0" would also discard distinct columns that are
                # perfectly correlated with each other.
                first_label = corr_vals.index.get_level_values(0)
                second_label = corr_vals.index.get_level_values(1)
                corr_vals = corr_vals[first_label != second_label]
                corr_vals = corr_vals.sort_values(ascending=False)
                if not corr_vals.empty:
                    top_corr = corr_vals.iloc[0]
                    col1, col2 = corr_vals.index[0]
                    return f"""πŸ”— **Correlation Analysis**:
- Strongest relationship: **{col1}** and **{col2}** (r = {top_corr:.3f})
- This suggests a {'strong' if top_corr > 0.7 else 'moderate' if top_corr > 0.5 else 'weak'} correlation
"""
            return "No numeric columns found for correlation analysis."

        elif "missing" in question_lower or "null" in question_lower:
            missing = df.isnull().sum()
            missing = missing[missing > 0]
            if missing.empty:
                return "βœ… **Great news!** No missing values found in your dataset."
            else:
                return f"""⚠️ **Missing Data Found**:
{missing.to_string()}

**Recommendation**: Consider filling or removing missing values before analysis.
"""

        elif "recommend" in question_lower or "suggest" in question_lower:
            suggestions = []
            numeric_cols = df.select_dtypes(include=['number']).columns
            categorical_cols = df.select_dtypes(include=['object']).columns

            if len(numeric_cols) >= 2:
                suggestions.append("πŸ“ˆ Try scatter plots to explore relationships between numeric variables")
            if len(categorical_cols) > 0 and len(numeric_cols) > 0:
                suggestions.append("πŸ“Š Create bar charts to compare numeric values across categories")
            if len(numeric_cols) > 0:
                suggestions.append("πŸ“‰ Use histograms to understand data distributions")

            return f"""πŸ’‘ **Analysis Suggestions**:
{chr(10).join(['β€’ ' + s for s in suggestions])}
"""

        else:
            return f"""πŸ€– **AI Assistant**: I can help you with:
- Data summaries and overviews
- Correlation and relationship analysis
- Missing data detection
- Visualization recommendations

Try asking: "What's the summary?" or "Any missing data?"
"""

    except Exception as e:
        return f"Error processing question: {str(e)}"
258
+
259
def parse_contents(contents, filename):
    """Decode an uploaded file payload into a DataFrame.

    Args:
        contents: Upload payload of the form ``"<content type>,<base64>"``.
        filename: Name of the uploaded file; its extension (matched
            case-insensitively) selects the parser. ``.csv`` is read as
            UTF-8 CSV and any ``.xls*`` extension is read with
            ``pd.read_excel``.

    Returns:
        tuple: ``(df, None)`` on success, or ``(None, message)`` when the
        file type is unsupported or the payload cannot be decoded or parsed.
    """
    try:
        # Split on the first comma only; everything after it is the payload.
        _content_type, separator, content_string = contents.partition(',')
        if not separator:
            raise ValueError("upload payload has no base64 section")
        decoded = base64.b64decode(content_string)

        # Decide by the real extension rather than a substring match, so a
        # name like "csv_export.xlsx" is not sent to the CSV reader and
        # upper-case extensions are still recognised.
        extension = os.path.splitext(filename or '')[1].lower()
        if extension == '.csv':
            df = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
        elif extension.startswith('.xls'):
            df = pd.read_excel(io.BytesIO(decoded))
        else:
            return None, "Unsupported file type"

        return df, None
    except Exception as e:
        return None, f"Error processing file: {str(e)}"
275
+
276
@app.callback(
    [Output('stored-data', 'data'),
     Output('upload-status', 'children'),
     Output('data-table', 'children')],
    [Input('upload-data', 'contents')],
    [State('upload-data', 'filename')]
)
def update_data(contents, filename):
    """Parse a newly uploaded file and refresh the stored data and preview.

    Args:
        contents: Upload payload from the ``upload-data`` component, or
            ``None`` when nothing has been uploaded.
        filename: Name of the uploaded file.

    Returns:
        tuple: ``(records, status, table)`` where ``records`` is the data as
        a list of row dicts, ``status`` is an alert describing the outcome
        and ``table`` previews the first ten rows. On a parse error the
        records are ``None``, the status is a danger alert and the table is
        an empty string; with no upload all three are empty.
    """
    if contents is None:
        return None, "", ""

    df, error = parse_contents(contents, filename)

    if error:
        return None, dbc.Alert(error, color="danger"), ""

    # Create vector store for AI
    vector_success = create_vector_store(df)

    # Create data table preview
    table = dbc.Table.from_dataframe(
        df.head(10),
        striped=True,
        bordered=True,
        hover=True,
        size='sm'
    )

    ai_status = "πŸ€– AI Ready" if vector_success else "⚠️ AI Limited"

    # Column labels may be non-strings (e.g. numeric Excel headers); convert
    # them so the join cannot raise and abort the callback.
    column_names = ', '.join(str(name) for name in df.columns.tolist())

    success_msg = dbc.Alert([
        html.H6(f"βœ… File uploaded successfully! {ai_status}"),
        html.P(f"Shape: {df.shape[0]} rows Γ— {df.shape[1]} columns"),
        html.P(f"Columns: {column_names}")
    ], color="success")

    return df.to_dict('records'), success_msg, table
314
+
315
@app.callback(
    Output('ai-response', 'children'),
    [Input('ask-button', 'n_clicks')],
    [State('ai-question', 'value'),
     State('stored-data', 'data')]
)
def handle_ai_question(n_clicks, question, data):
    """Answer the typed question when the "Ask AI" button is clicked.

    Returns an empty string until the button has been clicked, a question
    has been entered and data has been stored; otherwise returns an info
    alert wrapping the Markdown answer from ``get_ai_response``.
    """
    ready = n_clicks and question and data
    if not ready:
        return ""

    answer = get_ai_response(question, pd.DataFrame(data))
    return dbc.Alert(dcc.Markdown(answer), color="info")
333
+
334
@app.callback(
    Output('quick-analytics', 'children'),
    [Input('stats-btn', 'n_clicks'),
     Input('corr-btn', 'n_clicks'),
     Input('missing-btn', 'n_clicks')],
    [State('stored-data', 'data')]
)
def quick_analytics(stats_clicks, corr_clicks, missing_clicks, data):
    """Render the result for whichever quick-analytics button was clicked.

    Args:
        stats_clicks: Click count of the "Summary Stats" button (unused;
            the triggering button is read from the callback context).
        corr_clicks: Click count of the "Correlations" button (unused).
        missing_clicks: Click count of the "Missing Data" button (unused).
        data: Stored records of the uploaded dataset, or a falsy value when
            nothing has been uploaded.

    Returns:
        A summary-statistics table, a correlation heatmap, a missing-values
        report, a warning alert, or an empty string when there is no data
        or no triggering button.
    """
    # Imported here because the module-level dash import does not bring in
    # `callback_context`; without it every button click raised NameError.
    from dash import callback_context

    if not data:
        return ""

    df = pd.DataFrame(data)
    ctx = callback_context

    if not ctx.triggered:
        return ""

    # prop_id has the form "<component id>.<property>"; keep the id part.
    button_id = ctx.triggered[0]['prop_id'].split('.')[0]

    if button_id == 'stats-btn':
        stats = df.describe()
        return dbc.Alert([
            html.H6("πŸ“Š Summary Statistics"),
            dbc.Table.from_dataframe(stats.reset_index(), size='sm')
        ], color="light")

    elif button_id == 'corr-btn':
        numeric_df = df.select_dtypes(include=['number'])
        if len(numeric_df.columns) > 1:
            corr = numeric_df.corr()
            fig = px.imshow(corr, text_auto=True, aspect="auto",
                            title="Correlation Matrix")
            return dcc.Graph(figure=fig, style={'height': '300px'})
        return dbc.Alert("No numeric columns for correlation analysis", color="warning")

    elif button_id == 'missing-btn':
        missing = df.isnull().sum()
        missing = missing[missing > 0]
        if missing.empty:
            return dbc.Alert("βœ… No missing values!", color="success")
        return dbc.Alert([
            html.H6("⚠️ Missing Values"),
            html.Pre(missing.to_string())
        ], color="warning")

    return ""
381
+
382
@app.callback(
    Output('main-graph', 'figure'),
    [Input('stored-data', 'data')]
)
def update_main_graph(data):
    """Choose and draw a default chart for the stored dataset.

    With no data an empty dict is returned. Otherwise the chart depends on
    the column types: two or more numeric columns give a scatter plot of
    the first two, one numeric plus a categorical column gives a bar chart,
    a lone numeric column gives a histogram, and a dataset without numeric
    columns gives a figure carrying only a text annotation.
    """
    if not data:
        return {}

    frame = pd.DataFrame(data)
    numbers = frame.select_dtypes(include=['number']).columns
    categories = frame.select_dtypes(include=['object']).columns
    numeric_count = len(numbers)

    if numeric_count == 0:
        # Default message
        fig = go.Figure()
        fig.add_annotation(text="Upload data to see visualizations",
                           x=0.5, y=0.5, showarrow=False)
    elif numeric_count >= 2:
        # Scatter plot for numeric data
        x_col, y_col = numbers[0], numbers[1]
        fig = px.scatter(frame, x=x_col, y=y_col,
                         title=f"Relationship: {y_col} vs {x_col}")
    elif len(categories) >= 1:
        # Bar chart for mixed data
        category, value = categories[0], numbers[0]
        fig = px.bar(frame, x=category, y=value,
                     title=f"Distribution: {value} by {category}")
    else:
        # Histogram for single numeric
        only_numeric = numbers[0]
        fig = px.histogram(frame, x=only_numeric,
                           title=f"Distribution of {only_numeric}")

    fig.update_layout(template="plotly_white")
    return fig
417
+
418
if __name__ == '__main__':
    # `run_server` is obsolete in recent Dash releases in favour of `run`.
    # Prefer `run` when it exists and fall back to `run_server` on older
    # versions that lack it.
    _start = getattr(app, 'run', None) or app.run_server
    _start(host='0.0.0.0', port=7860, debug=False)