181 lines
No EOL
5.8 KiB
Python
181 lines
No EOL
5.8 KiB
Python
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
|
|
def read_csv_with_fallback(file_path):
    """
    Attempt to read a CSV file with multiple parsing strategies.

    First tries to parse the file assuming it has a header row; if that
    fails, retries headerless with an explicit column list.

    Args:
        file_path (str): Path to the CSV file

    Returns:
        pandas.DataFrame: Parsed DataFrame with a numeric 'timestamp' column

    Raises:
        ValueError: If a required column is missing after parsing.
    """
    # Column names used when the file has no header row.
    fallback_columns = ['session_id', 'timestamp', 'service_time', 'db_time',
                        'cache_time', 'db_rows_read', 'db_rows_written',
                        'db_total_rows', 'cache_hits', 'cache_misses']
    try:
        # First, try reading with a header row.
        try:
            df = pd.read_csv(file_path,
                             dtype={'timestamp': str},  # read as string so cleanup below works
                             skipinitialspace=True,
                             skip_blank_lines=True)
        except Exception:
            # If that fails, try reading without a header and specify column names.
            df = pd.read_csv(file_path,
                             names=fallback_columns,
                             header=None,
                             dtype={'timestamp': str},
                             skipinitialspace=True,
                             skip_blank_lines=True)

        # Validate required columns BEFORE touching df['timestamp'] —
        # doing it afterwards (as before) surfaced a missing column as an
        # opaque KeyError instead of this intended, clearer ValueError.
        required_columns = ['timestamp', 'service_time', 'db_time', 'cache_time']
        for col in required_columns:
            if col not in df.columns:
                raise ValueError(f"Missing required column: {col}")

        # Remove stray repeated header rows (cells literally equal to 'timestamp').
        df = df[df['timestamp'] != 'timestamp']

        # Convert timestamp to numeric; unparseable values become NaN.
        df['timestamp'] = pd.to_numeric(df['timestamp'], errors='coerce')

        return df

    except Exception as e:
        print(f"Error reading CSV: {e}")
        print("Please check the file format and ensure it matches the expected structure.")
        raise
|
|
|
|
def convert_timestamps(df):
    """
    Convert the 'timestamp' column from epoch milliseconds to datetime.

    The conversion is done in place on the given frame, which is also
    returned for call-chaining convenience.

    Args:
        df (pandas.DataFrame): Input DataFrame

    Returns:
        pandas.DataFrame: DataFrame with converted timestamps
    """
    # Millisecond epoch values -> pandas datetime64 series.
    as_datetime = pd.to_datetime(df['timestamp'], unit='ms')
    df['timestamp'] = as_datetime
    return df
|
|
|
|
def _column_stats(series):
    """Return avg/p50/p95/p99/max summary for one latency column."""
    return {
        'avg': series.mean(),
        'p50': series.quantile(0.5),
        'p95': series.quantile(0.95),
        'p99': series.quantile(0.99),
        'max': series.max(),
    }


def analyze_latency_data(df):
    """
    Calculate latency statistics.

    Args:
        df (pandas.DataFrame): Input DataFrame with 'service_time',
            'db_time' and 'cache_time' columns.

    Returns:
        dict: Mapping of category ('overall', 'db', 'cache') to a dict
        with 'avg', 'p50', 'p95', 'p99' and 'max' values.
    """
    # The same five metrics apply to every latency column, so the
    # per-column summary lives in one helper instead of three copies.
    return {
        'overall': _column_stats(df['service_time']),
        'db': _column_stats(df['db_time']),
        'cache': _column_stats(df['cache_time']),
    }
|
|
|
|
def plot_latency_graph(df):
    """
    Create a multi-axis time series plot for latencies.

    Draws overall service time on the primary y-axis and DB/cache
    times on two twin axes so each series keeps its own scale.

    Args:
        df (pandas.DataFrame): DataFrame with timestamp and time columns
    """
    plt.figure(figsize=(15, 7))

    # Primary axis: overall service time.
    service_ax = plt.gca()
    service_ax.plot(df['timestamp'], df['service_time'],
                    label='Overall Service Time', color='blue')

    # Twin axis for DB time.
    db_ax = service_ax.twinx()
    db_ax.plot(df['timestamp'], df['db_time'],
               label='DB Time', color='red', linestyle='--')

    # Third axis for cache time, pushed right so its scale stays readable.
    cache_ax = service_ax.twinx()
    cache_ax.spines['right'].set_position(('axes', 1.2))
    cache_ax.plot(df['timestamp'], df['cache_time'],
                  label='Cache Time', color='green', linestyle=':')

    # Axis labels, colored to match their series.
    service_ax.set_xlabel('Timestamp')
    service_ax.set_ylabel('Overall Service Time (ms)', color='blue')
    db_ax.set_ylabel('DB Time (ms)', color='red')
    cache_ax.set_ylabel('Cache Time (ms)', color='green')

    # Tilt timestamp tick labels for readability.
    plt.gcf().autofmt_xdate()
    plt.title('Latency Breakdown Over Time')

    # Merge the three per-axis legends into one box on the primary axis.
    handles, labels = [], []
    for ax in (service_ax, db_ax, cache_ax):
        ax_handles, ax_labels = ax.get_legend_handles_labels()
        handles += ax_handles
        labels += ax_labels
    service_ax.legend(handles, labels, loc='best')

    plt.tight_layout()
    plt.show()
|
|
|
|
def main(file_path='server_metrics.csv'):
    """
    Main function to process and visualize server metrics.

    Args:
        file_path (str, optional): Path to the CSV file. Defaults to 'server_metrics.csv'.
    """
    try:
        # Load, clean and chronologically order the metrics.
        df = read_csv_with_fallback(file_path)
        df = convert_timestamps(df)
        df = df.sort_values('timestamp')

        # Compute and report per-category latency summaries.
        stats = analyze_latency_data(df)
        print("Latency Statistics:")
        for section, section_metrics in stats.items():
            print(f"\n{section.capitalize()} Latency:")
            for name, value in section_metrics.items():
                print(f"{name}: {value:.2f} ms")

        # Finally, visualize the latency breakdown over time.
        plot_latency_graph(df)

    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found. Please ensure the CSV file exists in the same directory.")
    except Exception as e:
        print(f"An error occurred: {e}")
|
|
|
|
# Script entry point: analyze the default CSV path when run directly.
if __name__ == "__main__":
    main()