Pydata-visualizer Examples

This document provides various examples of how to use Pydata-visualizer in different scenarios.

Table of Contents

  1. Basic Usage

  2. Customizing Analysis

  3. Working with Different Data Types

  4. Processing Large Datasets

  5. Accessing Results Programmatically

  6. Integration Examples

  7. Common Workflows

Basic Usage

Simple Analysis and Report Generation

import pandas as pd
from data_visualizer.profiler import AnalysisReport

# Read the sample CSV into a DataFrame
sample_df = pd.read_csv("sample_data.csv")

# Profile the DataFrame and write the HTML report in a single step
AnalysisReport(sample_df).to_html("basic_report.html")

Loading Data from Different Sources

import sqlite3

import pandas as pd
from data_visualizer.profiler import AnalysisReport

# From CSV
df_csv = pd.read_csv("data.csv")
report_csv = AnalysisReport(df_csv)
report_csv.to_html("csv_report.html")

# From Excel
df_excel = pd.read_excel("data.xlsx", sheet_name="Sheet1")
report_excel = AnalysisReport(df_excel)
report_excel.to_html("excel_report.html")

# From database using SQL
conn = sqlite3.connect("database.db")
try:
    df_sql = pd.read_sql_query("SELECT * FROM table_name", conn)
finally:
    # Close the connection even if the query fails, so the database
    # file handle is not leaked
    conn.close()
report_sql = AnalysisReport(df_sql)
report_sql.to_html("sql_report.html")

# From JSON
df_json = pd.read_json("data.json")
report_json = AnalysisReport(df_json)
report_json.to_html("json_report.html")

Customizing Analysis

Using Settings to Configure Analysis

import pandas as pd
from data_visualizer.profiler import AnalysisReport, Settings

# Read the customer data
customer_df = pd.read_csv("customer_data.csv")

# Collect the analysis options first, then build the Settings object from them
analysis_options = {
    'minimal': False,             # Full analysis with all features
    'top_n_values': 5,            # Show top 5 values in categorical columns
    'skewness_threshold': 1.5,    # Lower threshold for skewness alerts
    'outlier_method': 'iqr',      # Use IQR method for outlier detection
    'outlier_threshold': 1.5,     # Standard IQR multiplier
    'duplicate_threshold': 3.0,   # Alert if duplicates exceed 3%
    'text_analysis': True,        # Enable word cloud and frequency analysis
}
custom_settings = Settings(**analysis_options)

# Profile the data with the custom settings and write the HTML report
AnalysisReport(customer_df, settings=custom_settings).to_html("custom_report.html")

Minimal Analysis for Quick Overview

import pandas as pd
from data_visualizer.profiler import AnalysisReport, Settings

# Read the (large) dataset
big_df = pd.read_csv("large_dataset.csv")

# Profile with minimal=True for fast processing and write the overview report
AnalysisReport(big_df, settings=Settings(minimal=True)).to_html("quick_overview.html")

Working with Different Data Types

Mixed Data Types Analysis

import pandas as pd
import numpy as np
from data_visualizer.profiler import AnalysisReport

# Build a 1000-row frame with one column per data type
# (random columns are drawn in the same order as the columns appear)
mixed_df = pd.DataFrame({
    'numeric': np.random.normal(0, 1, 1000),
    'categorical': np.random.choice(['A', 'B', 'C', 'D'], 1000),
    'boolean': np.random.choice([True, False], 1000),
    'datetime': pd.date_range('2023-01-01', periods=1000),
    'text': [f'Text sample {row}' for row in range(1000)],
})

# Profile the frame and write the HTML report
AnalysisReport(mixed_df).to_html("mixed_types_report.html")

Financial Data Analysis

import pandas as pd
import numpy as np
from data_visualizer.profiler import AnalysisReport, Settings

# Seed the generator so the sample data is reproducible
np.random.seed(42)

# Build 100 rows of sample financial data
# (random columns are drawn in the same order as the columns appear)
financial_df = pd.DataFrame({
    'date': pd.date_range('2022-01-01', periods=100),
    'price': np.random.normal(100, 15, 100).cumsum() + 1000,
    'volume': np.random.randint(1000, 100000, 100),
    'change_pct': np.random.normal(0, 1, 100),
    'sector': np.random.choice(['Tech', 'Finance', 'Healthcare', 'Energy'], 100),
})

# Raise both thresholds for financial analysis (higher skewness tolerance)
tolerant_settings = Settings(skewness_threshold=3.0, outlier_threshold=3.0)

# Profile the data with those settings and write the HTML report
AnalysisReport(financial_df, settings=tolerant_settings).to_html("financial_report.html")

Text Data Analysis

Analyzing Text Columns with Word Clouds

import pandas as pd
from data_visualizer.profiler import AnalysisReport, Settings

# Six review texts and their ratings, repeated to fill most of the dataset
review_cycle = [
    'excellent product quality amazing', 'good value for money',
    'poor customer service disappointed', 'fast delivery great experience',
    'highly recommend excellent', 'defective item bad quality'
]
rating_cycle = [5, 4, 2, 5, 5, 1]

# 6 * 16 + 4 = 100 rows, matching the 100 product ids
reviews_df = pd.DataFrame({
    'product_id': range(1, 101),
    'customer_review': review_cycle * 16 + ['great product'] * 4,
    'rating': rating_cycle * 16 + [5] * 4
})

# Enable text analysis for word clouds
text_settings = Settings(text_analysis=True, top_n_values=10)

# Create report - will generate word clouds for text columns
AnalysisReport(reviews_df, settings=text_settings).to_html("text_analysis_report.html")

# Word clouds will show frequently occurring words in the reviews
# Bar charts will show the most common complete review texts

Disabling Text Analysis for Performance

import pandas as pd
from data_visualizer.profiler import AnalysisReport, Settings

# Load a dataset that contains many text columns
# (replace the file name with your own data)
large_text_df = pd.read_csv("large_text_dataset.csv")

# For datasets with many text columns, disable text analysis for faster processing
settings = Settings(text_analysis=False)

# Create report without word frequency analysis
report = AnalysisReport(large_text_df, settings=settings)
report.to_html("no_text_analysis_report.html")

Processing Large Datasets

Using Sampling for Large Datasets

import pandas as pd
from data_visualizer.profiler import AnalysisReport, Settings

# Load large dataset
large_df = pd.read_csv("large_dataset.csv")

# Sample the dataset for faster processing.
# Cap the sample size at the number of rows: DataFrame.sample raises a
# ValueError when asked for more rows than exist (without replacement).
sample_size = min(10_000, len(large_df))
sampled_df = large_df.sample(n=sample_size, random_state=42)

# Create report with sampled data
sampled_report = AnalysisReport(sampled_df)
sampled_report.to_html("sampled_report.html")

Analyzing Specific Columns Only

import pandas as pd
from data_visualizer.profiler import AnalysisReport

# Select only important columns
important_columns = ['customer_id', 'purchase_amount', 'product_category', 'purchase_date']

# Load just those columns so the rest of the large file is never held in memory.
# usecols ignores the order of the list, so re-index afterwards to keep the
# columns in the order given above.
subset_df = pd.read_csv("large_dataset.csv", usecols=important_columns)[important_columns]

# Create report with subset of columns
subset_report = AnalysisReport(subset_df)
subset_report.to_html("important_columns_report.html")

Accessing Results Programmatically

Getting Analysis Results as Python Objects

import pandas as pd
from data_visualizer.profiler import AnalysisReport

# Read the survey responses
survey_df = pd.read_csv("survey_data.csv")

# Run the analysis and keep the raw results
analysis = AnalysisReport(survey_df).analyse()

# Pull out the result sections used below
summary = analysis['overview']
per_column = analysis['variables']
correlation_data = analysis['Correlations_JSON']

# Report the dataset size and overall missing rate
print(f"Dataset has {summary['num_Row']} responses with {summary['missing_percentage']:.2f}% missing data")

# Collect every variable whose absolute skewness exceeds 2.0
skewed = [
    (name, stats['skewness'])
    for name, stats in per_column.items()
    if 'skewness' in stats and abs(stats['skewness']) > 2.0
]

print(f"Found {len(skewed)} highly skewed variables:")
for name, skewness in skewed:
    print(f"- {name}: {skewness:.2f}")

# Walk the upper triangle of the Pearson matrix and keep pairs with |r| > 0.7
if 'pearson' in correlation_data:
    matrix = pd.DataFrame(correlation_data['pearson'])
    labels = matrix.columns
    strong_pairs = []

    for row, left in enumerate(labels):
        for col in range(row + 1, len(labels)):
            coefficient = matrix.iloc[row, col]
            if abs(coefficient) > 0.7:
                strong_pairs.append((left, labels[col], coefficient))

    print(f"Found {len(strong_pairs)} strongly correlated pairs:")
    for left, right, coefficient in strong_pairs:
        print(f"- {left} & {right}: {coefficient:.3f}")

Extracting Visualizations from Results

import pandas as pd
import matplotlib.pyplot as plt
import base64
from data_visualizer.profiler import AnalysisReport
import io

def save_base64_image(encoded, output_path):
    """Decode a base64-encoded image string and write the bytes to output_path.

    Args:
        encoded: Base64 text, optionally preceded by header info and a comma
            (the original example strips such a header before decoding).
        output_path: Path of the image file to write.
    """
    # Keep everything after the first comma; if there is no comma the whole
    # string is treated as the payload instead of raising IndexError.
    payload = encoded.split(',', 1)[-1]
    with open(output_path, "wb") as f:
        f.write(base64.b64decode(payload))


# Load data
df = pd.read_csv("product_data.csv")

# Create report and run analysis
report = AnalysisReport(df)
results = report.analyse()

# Extract and save a specific column's visualization
if 'price' in results['variables']:
    price_info = results['variables']['price']
    if 'plot_base64' in price_info:
        save_base64_image(price_info['plot_base64'], "price_distribution.png")
        print("Saved price distribution plot as price_distribution.png")

# Extract and save correlation heatmap
if 'pearson' in results['Correlations_Plots']:
    save_base64_image(results['Correlations_Plots']['pearson'], "correlation_heatmap.png")
    print("Saved correlation heatmap as correlation_heatmap.png")

Integration Examples

Integrating with Web Applications (Flask)

import os
import uuid

import pandas as pd
from flask import Flask, render_template, request, redirect, url_for
from werkzeug.utils import secure_filename

from data_visualizer.profiler import AnalysisReport

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'uploads'
# Write reports inside the app's own static folder so the files are served by
# Flask's built-in static route regardless of the current working directory.
app.config['REPORT_FOLDER'] = os.path.join(app.static_folder, 'reports')

os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['REPORT_FOLDER'], exist_ok=True)

# Accepted upload extensions (lower-case) and the pandas reader for each
READERS = {
    '.csv': pd.read_csv,
    '.xls': pd.read_excel,
    '.xlsx': pd.read_excel,
}


@app.route('/', methods=['GET', 'POST'])
def index():
    """Show the upload form (GET) or profile an uploaded file (POST).

    On POST, a missing or unnamed file redirects back to the form, an
    unsupported extension returns HTTP 400, and a supported file is saved,
    profiled, and the client is redirected to the generated report.
    """
    if request.method == 'POST':
        file = request.files.get('file')
        if file is None or file.filename == '':
            return redirect(request.url)

        # Sanitize the client-supplied name before using it in any path
        filename = secure_filename(file.filename)
        stem, extension = os.path.splitext(filename)

        # Validate the extension (case-insensitively) before touching disk
        reader = READERS.get(extension.lower())
        if reader is None:
            return "Unsupported file format", 400

        # A random token keeps concurrent uploads with the same file name
        # from overwriting each other's upload or report
        token = uuid.uuid4().hex
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], f"{token}_{filename}")
        file.save(filepath)

        # Generate a unique report name
        report_name = f"report_{stem}_{token}.html"
        report_path = os.path.join(app.config['REPORT_FOLDER'], report_name)

        # Generate report
        df = reader(filepath)
        report = AnalysisReport(df)
        report.to_html(report_path)

        return redirect(url_for('show_report', report_name=report_name))

    return '''
    <!doctype html>
    <title>Upload Data for Analysis</title>
    <h1>Upload your data file</h1>
    <form method=post enctype=multipart/form-data>
      <input type=file name=file accept=".csv,.xls,.xlsx">
      <input type=submit value=Analyze>
    </form>
    '''


@app.route('/report/<report_name>')
def show_report(report_name):
    """Redirect to the static URL of a previously generated report."""
    # Build the URL through Flask instead of hard-coding '/static/...'
    return redirect(url_for('static', filename=f'reports/{report_name}'))


if __name__ == '__main__':
    # NOTE: debug=True enables the interactive debugger, which allows
    # arbitrary code execution. Use it for local development only.
    app.run(debug=True)

Integration with Jupyter Notebooks

import os
import webbrowser
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import IFrame, display, HTML

from data_visualizer.profiler import AnalysisReport, Settings

# Load data
df = pd.read_csv("dataset.csv")

# Create report with custom settings
settings = Settings(top_n_values=7, skewness_threshold=1.5)
report = AnalysisReport(df, settings=settings)

# Generate HTML report
report_path = "notebook_report.html"
report.to_html(report_path)

# Show a link that opens the full report in a new tab
display(HTML(f'<a href="{report_path}" target="_blank">Click to open full report</a>'))

# Show report in an iframe (limited functionality)
display(IFrame(src=report_path, width=800, height=600))

# Alternatively, open in a browser.
# Path.as_uri() builds a valid file:// URL on every platform (drive letters,
# backslashes and spaces are handled), unlike manual string concatenation.
full_path = os.path.abspath(report_path)
webbrowser.open(Path(full_path).as_uri())

Common Workflows

Data Cleaning Workflow

import pandas as pd
import numpy as np
from data_visualizer.profiler import AnalysisReport

# Read the raw, uncleaned data
raw_df = pd.read_csv("messy_data.csv")

# Step 1: profile the raw data to find out what needs fixing
baseline_report = AnalysisReport(raw_df)
profile = baseline_report.analyse()
baseline_report.to_html("initial_report.html")

# Step 2: clean a copy of the data, guided by the profiling results
cleaned = raw_df.copy()
column_stats = profile['variables']

# Columns that have any missing values, worst first
incomplete = [
    (name, info['missing_%'])
    for name, info in column_stats.items()
    if info['missing_%'] > 0
]
incomplete.sort(key=lambda pair: pair[1], reverse=True)

print("Columns with missing values:")
for name, missing_pct in incomplete:
    print(f"- {name}: {missing_pct:.2f}%")

    # More than half missing: drop the column outright
    if missing_pct > 50:
        print(f"  Dropping column {name} due to high missing rate")
        cleaned = cleaned.drop(columns=[name])
        continue

    # Otherwise fill the gaps: median for numeric columns, mode for the rest
    if pd.api.types.is_numeric_dtype(cleaned[name]):
        print(f"  Filling {name} with median")
        cleaned[name] = cleaned[name].fillna(cleaned[name].median())
    else:
        print(f"  Filling {name} with mode")
        cleaned[name] = cleaned[name].fillna(cleaned[name].mode()[0])

# Cap extreme values in strongly skewed numeric columns
for name, info in column_stats.items():
    # Skip columns that were dropped above or are not numeric
    if name not in cleaned or not pd.api.types.is_numeric_dtype(cleaned[name]):
        continue
    # Only act when the profile reports an absolute skewness above 3
    if 'skewness' not in info or not abs(info['skewness']) > 3:
        continue

    print(f"Handling outliers in {name} (skewness: {info['skewness']:.2f})")
    # Prefer the quartiles from the profile; fall back to the current data
    first_quartile = info.get('25%', cleaned[name].quantile(0.25))
    third_quartile = info.get('75%', cleaned[name].quantile(0.75))
    spread = third_quartile - first_quartile

    # Clip to the 1.5 * IQR fences
    cleaned[name] = cleaned[name].clip(
        first_quartile - 1.5 * spread,
        third_quartile + 1.5 * spread,
    )

# Step 3: profile the cleaned data
AnalysisReport(cleaned).to_html("cleaned_data_report.html")

# Save the cleaned data and report its final shape
cleaned.to_csv("cleaned_data.csv", index=False)
row_count, column_count = cleaned.shape
print(f"Cleaned data saved with {row_count} rows and {column_count} columns")

Feature Selection Workflow

import pandas as pd
import numpy as np
from data_visualizer.profiler import AnalysisReport

# Load dataset with many features
df = pd.read_csv("features_dataset.csv")

# Assume the target column name is 'target'
TARGET = 'target'

# Step 1: Profile the data
report = AnalysisReport(df)
results = report.analyse()
variables = results['variables']

# Step 2: Filter columns based on profiling insights

# Keep only feature columns with less than 20% missing values
selected_features = [
    col for col, stats in variables.items()
    if col != TARGET and stats['missing_%'] < 20
]

print(f"After removing high-missing columns: {len(selected_features)} features left")

# Filter out highly correlated features
if 'pearson' in results['Correlations_JSON']:
    pearson = pd.DataFrame(results['Correlations_JSON']['pearson'])

    # Find pairs of highly correlated features
    correlated_pairs = []
    for i, col1 in enumerate(selected_features):
        for col2 in selected_features[i + 1:]:
            # .loc[col1, col2] needs col1 as a row label and col2 as a column
            # label, so check each against the matching axis
            if col1 in pearson.index and col2 in pearson.columns:
                correlation = abs(pearson.loc[col1, col2])
                if correlation > 0.85:  # High correlation threshold
                    correlated_pairs.append((col1, col2, correlation))

    # Sort by correlation strength so the strongest pairs are resolved first
    correlated_pairs.sort(key=lambda pair: pair[2], reverse=True)

    # Remove one feature from each highly correlated pair
    removed_features = set()
    for col1, col2, _ in correlated_pairs:
        # A pair is already resolved once either member has been removed
        if col1 in removed_features or col2 in removed_features:
            continue
        # Keep the one with lower missing value percentage
        if variables[col1]['missing_%'] > variables[col2]['missing_%']:
            removed_features.add(col1)
        else:
            removed_features.add(col2)

    # Rebuild the list in one pass instead of repeated list.remove calls
    selected_features = [
        col for col in selected_features if col not in removed_features
    ]

print(f"After removing highly correlated features: {len(selected_features)} features left")

# Create a new dataset with selected features
selected_df = df[selected_features + [TARGET]]

# Profile the reduced feature set
reduced_report = AnalysisReport(selected_df)
reduced_report.to_html("reduced_features_report.html")

# Save the dataset with selected features
selected_df.to_csv("selected_features.csv", index=False)

These examples demonstrate various ways to use Pydata-visualizer in real-world scenarios. You can adapt them to your specific needs and data types.