# Source code for maui.eda.eda

"""
This module provides utilities for Exploratory Data Analysis (EDA) with an
emphasis on visual and summary outputs for categorical data within a DataFrame.
Utilizing Plotly for dynamic visualizations and FPDF for PDF report generation,
it supports the creation of summary cards and plots, enhancing data
understanding and presentation.

Features include:
- Generation of summary cards for data overview.
- Creation of various plots (scatter, bar, etc.) for data comparison and trend
  analysis.
- PDF report generation for easy sharing and presentation.

Exceptions:
- CategoryLimitError: Raised when an attempt is made to process more than the
  allowed number of categories.

Dependencies:
- plotly: For creating interactive plots.
- fpdf: For generating PDF reports.
- pandas: Assumed for DataFrame manipulation and input.

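Functions:
- card_summary(df, categories, show_plot=True): Generates a summary card and
  plots for specified categories.
- heatmap_analysis(df, x_axis, y_axis, ...): Heatmap of sample counts for two
  categorical columns.
- histogram_analysis(df, x_axis, category_column, show_plot=True): Histogram
  of samples along one column, coloured by another.
- duration_analysis(df, category_column, duration_column, show_plot=True):
  Box plot of a numeric column per category.
- daily_distribution_analysis(df, date_column, category_column,
  show_plot=True): Histogram of samples per day, coloured by category.
- duration_distribution(df, time_unit='s', show_plot=True): Distribution plot
  of the 'duration' column.
- export_file_names_summary_pdf_leec(df, file_name, analysis_title=None,
  width=210): Writes a PDF report combining the plots above.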

Note:
- This module is designed to work with pandas DataFrames and expects a specific
  structure/format for input data.
"""
import warnings
import tempfile
import time
import pandas as pd

import pkg_resources
from fpdf import FPDF
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots


class CategoryLimitError(Exception):
    """Signal that a caller asked for more categories than a plot can hold.

    Raised by ``card_summary`` when more than two category columns are given.
    """



[docs]
def card_summary(df, categories, show_plot: bool = True):
    """
    Build a summary card (dictionary plus Plotly indicator grid) for a
    DataFrame.

    The card reports the number of distinct ``file_path`` values (samples),
    the number of distinct ``dt`` values (days), the total and mean of the
    ``duration`` column divided by 60 (i.e. minutes when ``duration`` is in
    seconds), and the number of distinct values of each requested category
    column.

    Parameters
    ----------
    df : pandas.DataFrame
            The input DataFrame containing at least the columns 'file_path',
            'dt' and 'duration', plus one column per entry in `categories`.
    categories : list of str
            Column names in `df` to include in the summary and plot.
            At most two categories can be specified.
    show_plot : bool, optional
            If True (default), the figure is displayed with ``fig.show()``.
            The figure is built and returned either way.

    Raises
    ------
    CategoryLimitError
            If more than two categories are specified; the 2x3 indicator grid
            only has two cells left after the four fixed statistics.

    Returns
    -------
    tuple
            A tuple containing:

            - card_dict (dict): keys 'n_samples', 'distinct_days',
                'total_time_duration', 'mean_time_duration' and one key per
                category, mapped to the computed statistics.

            - fig (plotly.graph_objs._figure.Figure): a figure with one
                "number" indicator per entry of `card_dict`. It is always
                returned, regardless of `show_plot`.

    Notes
    -----
    The 'duration' column is expected to be in seconds; durations are
    displayed with a " min" suffix after dividing by 60.

    Examples
    --------
    >>> from maui import samples, eda
    >>> df = samples.get_audio_sample(dataset="leec")
    >>> categories = ['landscape', 'environment']
    >>> card_dict, fig = eda.card_summary(df, categories)
    """

    if len(categories) > 2:
        raise CategoryLimitError("At most two categories should be selected.")

    df_count = df.nunique(axis=0)
    duration_mean = df["duration"].mean() / 60
    duration_total = df["duration"].sum() / 60

    card_dict = {
        "n_samples": df_count["file_path"],
        "distinct_days": df_count["dt"],
        "total_time_duration": duration_total,
        "mean_time_duration": duration_mean,
    }

    # One entry per grid cell, in row-major order:
    # (subplot title, key into card_dict, Plotly ``number`` formatting).
    panels = [
        ("Distinct Days", "distinct_days", {"suffix": ""}),
        ("Total Duration", "total_time_duration", {"suffix": " min"}),
        ("Mean Duration", "mean_time_duration", {"suffix": " min"}),
        ("Samples", "n_samples", {"prefix": ""}),
    ]
    for category in categories:
        card_dict[category] = df_count[category]
        panels.append((category, category, {"prefix": ""}))

    n_rows, n_cols = 2, 3
    # Fresh dict per cell so no spec object is shared between subplots.
    specs = [[{"type": "indicator"} for _ in range(n_cols)] for _ in range(n_rows)]

    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=[panel_title for panel_title, _, _ in panels],
        specs=specs,
    )

    for position, (_, key, number_format) in enumerate(panels):
        row, col = divmod(position, n_cols)
        indicator = go.Indicator(
            mode="number",
            value=card_dict[key],
            number=number_format,
            # NOTE(review): delta presumably has no visible effect while
            # mode is "number"; kept so the figure definition is unchanged.
            delta={"position": "top", "reference": 320},
            domain={"x": [0, 1], "y": [0, 1]},
        )
        # add_trace(row=, col=) replaces the deprecated append_trace.
        fig.add_trace(indicator, row=row + 1, col=col + 1)

    if show_plot:
        fig.show()

    return card_dict, fig



# ----------------------------------------------------------------------------



[docs]
def heatmap_analysis(
    df,
    x_axis: str,
    y_axis: str,
    color_continuous_scale="Viridis",
    show_plot: bool = True,
    **kwargs,
):
    """
    Generates a heatmap to analyze the relationship between two categorical
    variables in a DataFrame.

    The data is grouped by `x_axis` and `y_axis`, the non-null 'file_path'
    entries of each group are counted, and every combination of the observed
    `x_axis` and `y_axis` values that has no rows is added with a count of
    zero. The resulting table is pivoted and drawn with ``plotly.express.imshow``.

    Parameters
    ----------
    df : pandas.DataFrame
            The input DataFrame containing the data to be analyzed.
            Must include the columns specified by `x_axis` and `y_axis`,
            as well as a 'file_path' column used for counting occurrences.
    x_axis : str
            The name of the column in `df` used for the heatmap rows
            (the pivot index).
    y_axis : str
            The name of the column in `df` used for the heatmap columns.
    color_continuous_scale : str, optional
            The name of the color scale to use for the heatmap.
            Defaults to 'Viridis'.
    show_plot : bool, optional
            If True (default), displays the heatmap plot.
            If False, the plot is not displayed but is still returned.
    **kwargs : dict
            Only `height` (default 600) and `width` (default 800) are read;
            they set the figure size.

    Returns
    -------
    tuple
            A tuple containing:
            - df_group_complete (pandas.DataFrame): one row per combination of
            observed `x_axis` and `y_axis` values, with a 'count' column
            (zero for combinations that never occur together).
            - fig (plotly.graph_objs._figure.Figure): A Plotly figure object
            containing the heatmap.

    Notes
    -----
    Rows whose `x_axis` or `y_axis` value is missing are ignored: ``groupby``
    drops them by default, and missing values are likewise excluded when the
    axis categories are collected, so a column mixing strings and NaN does not
    break the sort.

    Examples
    --------
    >>> from maui import samples, eda
    >>> df = samples.get_audio_sample(dataset="leec")
    >>> df_group, fig = eda.heatmap_analysis(df, 'landscape', 'environment')
    """

    # Count occurrences for every (x, y) pair present in the data.
    df_group = df.groupby([x_axis, y_axis], as_index=False)["file_path"].count()
    df_group = df_group.rename(columns={"file_path": "count"})

    # Collect the observed category values. Missing values are dropped first:
    # they are not part of any group above and cannot be ordered against
    # strings by sorted().
    x_values = sorted(df[x_axis].dropna().unique())
    y_values = sorted(df[y_axis].dropna().unique())
    full_index = pd.MultiIndex.from_product(
        [x_values, y_values], names=[x_axis, y_axis]
    )

    # Reindex so every combination exists, filling absent ones with zero.
    df_group_complete = (
        df_group.set_index([x_axis, y_axis])
        .reindex(full_index, fill_value=0)
        .reset_index()
    )

    # Wide table (x values as rows, y values as columns) for the heatmap.
    df_group_pivot = df_group_complete.pivot(
        index=x_axis, columns=y_axis, values="count"
    )

    # NOTE(review): the title is in Portuguese while the other plots in this
    # module use English titles; left as is because it is user-facing text.
    fig = px.imshow(
        df_group_pivot,
        color_continuous_scale=color_continuous_scale,
        text_auto=True,
        title=f"Heatmap - Número de arquivos de áudio por {x_axis} e {y_axis}",
    )
    fig.update_layout(
        title_x=0.5,
        height=kwargs.get("height", 600),
        width=kwargs.get("width", 800),
    )

    if show_plot:
        fig.show()

    return df_group_complete, fig



# ----------------------------------------------------------------------------



[docs]
def histogram_analysis(df, x_axis: str, category_column: str, show_plot: bool = True):
    """
    Plot how many rows fall under each `x_axis` value, coloured by category.

    The bars are ordered by the total number of rows per `x_axis` value,
    largest first, and each bar is split by the values of `category_column`.

    Parameters
    ----------
    df : pandas.DataFrame
            Data to plot; must contain `x_axis` and `category_column`.
    x_axis : str
            Column whose values are placed on the x-axis.
    category_column : str
            Column used to colour (segment) the bars.
    show_plot : bool, optional
            If True (default), the figure is displayed before being returned.

    Returns
    -------
    plotly.graph_objs._figure.Figure
            The histogram figure, which can be customised or saved afterwards.

    Examples
    --------
    >>> from maui import samples, eda
    >>> df = samples.get_audio_sample(dataset="leec")
    >>> fig = eda.histogram_analysis(df, 'landscape', 'environment')
    """

    # Rank the x-axis values by how many rows each one has.
    rows_per_value = df.groupby(x_axis).size()
    ranked_values = rows_per_value.sort_values(ascending=False).index.tolist()

    plot_title = f"Amount of samples by {x_axis} and segmented by {category_column}"
    fig = px.histogram(
        df,
        x=x_axis,
        color=category_column,
        opacity=0.7,
        title=plot_title,
        category_orders={x_axis: ranked_values},
    )
    fig.update_layout(bargap=0.1, title_x=0.5)

    if show_plot:
        fig.show()
    return fig



# ----------------------------------------------------------------------------



[docs]
def duration_analysis(df, category_column: str, duration_column: str, show_plot=True):
    """
    Draw a box plot of a numeric column for each value of a category column.

    Parameters
    ----------
    df : pandas.DataFrame
            Data to plot; must contain `category_column` and
            `duration_column`.
    category_column : str
            Column placed on the x-axis; one box is drawn per value.
    duration_column : str
            Numeric column placed on the y-axis.
    show_plot : bool, optional
            If True (default), the figure is displayed before being returned.

    Returns
    -------
    plotly.graph_objs._figure.Figure
            The box-plot figure, returned whether or not it was displayed.

    Examples
    --------
    >>> from maui import samples, eda
    >>> df = samples.get_audio_sample(dataset="leec")
    >>> fig = eda.duration_analysis(df, 'landscape', 'duration')
    """

    box_options = {
        "x": category_column,
        "y": duration_column,
        "title": f"Duration distribution by {category_column}",
    }
    fig = px.box(df, **box_options)
    fig.update_layout(title_x=0.5)

    if show_plot:
        fig.show()
    return fig



# ----------------------------------------------------------------------------



[docs]
def daily_distribution_analysis(
    df, date_column: str, category_column: str, show_plot=True
):
    """
    Analyzes and visualizes the daily distribution of samples by categories.

    The values of `date_column` are parsed with ``pandas.to_datetime``; rows
    that cannot be parsed are reported through a ``UserWarning`` and left out
    of the plot. The remaining rows are drawn as a histogram over time,
    coloured by `category_column`.

    The input DataFrame is not modified: parsing happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
            The DataFrame containing the data to be analyzed.
            It must include the specified `date_column` and `category_column`.
    date_column : str
            The name of the column in `df` that contains date information,
            in any form ``pandas.to_datetime`` can parse.
    category_column : str
            The name of the column in `df` that contains categorical data,
            which will be used to color the bars in the histogram.
    show_plot : bool, optional
            If True (default), the function will display the generated plot.
            If False, the plot will not be displayed but will still be
            returned.

    Returns
    -------
    plotly.graph_objs._figure.Figure
            A Plotly figure object representing the histogram of daily sample
            distribution, coloured by `category_column`.

    Warns
    -----
    UserWarning
            If one or more values of `date_column` cannot be parsed as dates.

    Examples
    --------
    >>> from maui import samples, eda
    >>> df = samples.get_audio_sample(dataset="leec")
    >>> fig = eda.daily_distribution_analysis(df, 'dt', 'landscape')
    """

    total_records = len(df)

    # Parse on a copy so the caller's DataFrame keeps its original column.
    # errors="coerce" turns unparseable values into NaT instead of raising.
    plot_df = df.copy()
    plot_df[date_column] = pd.to_datetime(plot_df[date_column], errors="coerce")

    invalid_dates_count = int(plot_df[date_column].isna().sum())
    if invalid_dates_count > 0:
        warnings.warn(
            f"Found {invalid_dates_count} invalid dates out of {total_records} total records. "
            f"These will be removed from the analysis.",
            UserWarning,
            stacklevel=2,  # attribute the warning to the caller's line
        )

    # Keep only rows with a valid date.
    plot_df = plot_df.dropna(subset=[date_column])

    # Number of distinct calendar days, used as the requested bin count.
    num_days = plot_df[date_column].dt.date.nunique()

    fig = px.histogram(
        plot_df,
        x=date_column,
        color=category_column,
        opacity=0.7,
        title=f"Amount of samples by Day and {category_column}",
        nbins=num_days,  # aims for one bin per distinct day
    )
    fig.update_layout(bargap=0.1, title_x=0.5)

    if show_plot:
        fig.show()

    return fig



# ----------------------------------------------------------------------------



[docs]
def duration_distribution(df, time_unit='s', show_plot=True):
    """
    Plot the distribution of the 'duration' column of a DataFrame.

    The figure is built with ``plotly.figure_factory.create_distplot`` using a
    single group labelled "duration".

    Parameters
    ----------
    df : pandas.DataFrame
            Data to plot; must contain a numeric column named 'duration'.
    time_unit : str, optional
            Unit shown in the x-axis label, e.g. 's' (default). It only
            affects the label, not the data.
    show_plot : bool, optional
            If True (default), the figure is displayed before being returned.

    Returns
    -------
    plotly.graph_objs._figure.Figure
            The distribution-plot figure for the 'duration' column.

    Examples
    --------
    >>> from maui import samples, eda
    >>> df = samples.get_audio_sample(dataset="leec")
    >>> fig = eda.duration_distribution(df)
    """

    durations = df["duration"].values
    fig = ff.create_distplot([durations], ["duration"])

    layout_options = {
        "bargap": 0.005,
        "title_text": "Duration distribution",
        "title_x": 0.5,
        "xaxis_title": f"Duration ({time_unit})",
        "yaxis_title": "Density (number of files)",
    }
    fig.update_layout(**layout_options)

    if show_plot:
        fig.show()
    return fig



# ----------------------------------------------------------------------------


class PDF(FPDF):
    """
    FPDF subclass used by the report export; it only customises the footer.
    """

    def footer(self):
        """
        Write a centred, grey, italic credit line with the current page number.

        Presumably invoked by FPDF itself for every page - confirm against the
        FPDF version in use.
        """
        footer_text = f"Generated with <3 by Maui Software - Page {self.page_no()}"

        self.set_y(-15)  # negative offset, as in the original footer placement
        self.set_font("Helvetica", style="I", size=8)
        self.set_text_color(128)
        self.cell(0, 10, footer_text, border=0, ln=0, align="C")


def create_letterhead(pdf, width, image):
    """
    Place the letterhead image at the top-left corner of the current page.

    The image is drawn at position (0, 0) with the given `width`.
    """
    pdf.image(image, x=0, y=0, w=width)


def create_title(pdf, title, subtitle=None):
    """
    Write the report heading: main title, optional subtitle and today's date.

    The date is formatted as DD/MM/YYYY and written in grey; the text colour
    is left grey afterwards.
    """
    report_date = time.strftime("%d/%m/%Y")

    # Main title, pushed down the page by a 100-unit line break.
    pdf.set_font("Helvetica", style="b", size=20)
    pdf.ln(100)
    pdf.write(5, title)
    pdf.ln(15)

    # Subtitle is skipped entirely when none was given.
    if subtitle is not None:
        pdf.set_font("Helvetica", style="b", size=16)
        pdf.write(5, subtitle)
        pdf.ln(10)

    # Report date in grey, followed by a trailing gap.
    pdf.set_font("Helvetica", style="", size=14)
    pdf.set_text_color(r=128, g=128, b=128)
    pdf.write(4, report_date)
    pdf.ln(30)


def write_to_pdf(pdf, words):
    """
    Write body text: black, regular Helvetica, size 12.
    """
    pdf.set_text_color(r=0, g=0, b=0)
    pdf.set_font("Helvetica", style="", size=12)
    pdf.write(5, words)


def write_subtitle(pdf, words):
    """
    Write a section heading: black, bold Helvetica, size 14.
    """
    pdf.set_text_color(r=0, g=0, b=0)
    pdf.set_font("Helvetica", style="b", size=14)
    pdf.write(5, words)



[docs]
def export_file_names_summary_pdf_leec(
    df, file_name: str, analysis_title=None, width=210):
    """
    Export a summary PDF report with analysis of file names for LEEC project.

    Every plot is rendered to a PNG inside a temporary directory, the PNGs
    are laid out on the pages of a PDF, and the PDF is saved as `file_name`.
    The temporary directory is removed when the function returns.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing the data to be analyzed. The plots read the
        'file_path', 'dt', 'duration', 'landscape' and 'environment' columns.
    file_name : str
        Name of the output PDF file.
    analysis_title : str, optional
        Subtitle printed under the main title on the cover page. Omitted
        when None.
    width : int, optional
        Page width used to scale the letterhead and centre the images
        (210 by default, matching the A4 page noted where the PDF is created).

    Returns
    -------
    None

    Notes
    -----
    The report has a cover page (summary card and introduction) followed by
    one page each for the landscape analysis, the environment analysis and
    the duration analysis.

    Examples
    --------
    >>> export_file_names_summary_pdf_leec(df, 'summary_report.pdf',
                                           analysis_title='Audio Files Analysis')
    """

    categories = ["landscape", "environment"]
    # One report section per pair: (category on the x-axis, category used
    # to colour the histogram).
    sections = [("landscape", "environment"), ("environment", "landscape")]
    plot_size = {"height": 400, "width": 1200}

    with tempfile.TemporaryDirectory() as temp_dir:

        def image_path(stem):
            """Return the path of the temporary PNG called `stem`."""
            return f"{temp_dir}/{stem}.png"

        # ---- Render all figures to temporary PNG files ----
        _, fig = card_summary(df, categories, show_plot=False)
        fig.write_image(image_path("summary1"), height=300, width=1200)

        # NOTE(review): summary2.png is rendered but never placed in the PDF
        # below - confirm whether the heatmap was meant to appear in the
        # report.
        _, fig = heatmap_analysis(
            df,
            "landscape",
            "environment",
            color_continuous_scale="Viridis",
            show_plot=False,
        )
        fig.write_image(image_path("summary2"))

        for category, segment in sections:
            fig = histogram_analysis(df, category, segment, show_plot=False)
            fig.write_image(image_path(f"{category}1"), **plot_size)
            fig = duration_analysis(df, category, "duration", show_plot=False)
            fig.write_image(image_path(f"{category}2"), **plot_size)
            fig = daily_distribution_analysis(df, "dt", category, show_plot=False)
            fig.write_image(image_path(f"{category}3"), **plot_size)

        fig = duration_distribution(df, show_plot=False)
        fig.write_image(image_path("duration1"), **plot_size)

        # ---- Assemble the PDF ----
        title = "Audio Files Exploratory Data Analysis"

        pdf = PDF()  # A4 (210 by 297 mm)

        # NOTE(review): pkg_resources is deprecated by setuptools; consider
        # importlib.resources once the minimum Python version allows it.
        letterhead_cover = pkg_resources.resource_filename(
            "maui", "data/letterhead_cover.png"
        )
        letterhead = pkg_resources.resource_filename("maui", "data/letterhead.png")

        # Cover page: letterhead, title block, summary card and introduction.
        pdf.add_page()
        create_letterhead(pdf, width, letterhead_cover)
        create_title(pdf, title, analysis_title)

        w = 200
        x = (width - w) / 2  # centres a w-wide image on the page
        pdf.image(image_path("summary1"), w=w, x=x)
        pdf.ln(5)

        # The leading newline and indentation are kept from the original text.
        intro_text = (
            "\n        This report contains a brief exploratory data analysis "
            "comprehending the data obtained by audio file names. "
            "The objective is to present an overview of the acoustic "
            "landscapes and environments of the recordings, as well as their duration. "
            "Further analysis such as false color spectrograms and acoustic indices "
            "summarization can be performed with Maui Software analysis and "
            "visualization tools."
        )
        write_to_pdf(pdf, intro_text)

        # One page per category section: "1. Landscape Analysis",
        # "2. Environment Analysis".
        for number, (category, _) in enumerate(sections, start=1):
            pdf.add_page()
            create_letterhead(pdf, width, letterhead)

            pdf.ln(20)
            write_subtitle(pdf, f"{number}. {category.capitalize()} Analysis")
            pdf.ln(20)
            pdf.image(image_path(f"{category}1"), w=w, x=x)
            pdf.ln(5)
            pdf.image(image_path(f"{category}2"), w=w, x=x)
            pdf.ln(5)
            pdf.image(image_path(f"{category}3"), w=w, x=x)
            pdf.ln(10)

        # Final page: overall duration distribution.
        pdf.add_page()
        create_letterhead(pdf, width, letterhead)

        pdf.ln(20)
        write_subtitle(pdf, "3. Duration Analysis")
        pdf.ln(20)
        pdf.image(image_path("duration1"), w=w, x=x)

        # Written while the temporary images still exist.
        pdf.output(file_name, "F")