Source code for maui.eda.eda

"""
This module provides utilities for Exploratory Data Analysis (EDA) with an
emphasis on visual and summary outputs for categorical data within a DataFrame.
Utilizing Plotly for dynamic visualizations and FPDF for PDF report generation,
it supports the creation of summary cards and plots, enhancing data
understanding and presentation.

Features include:
- Generation of summary cards for data overview.
- Creation of various plots (scatter, bar, etc.) for data comparison and trend
  analysis.
- PDF report generation for easy sharing and presentation.

Exceptions:
- CategoryLimitError: Raised when an attempt is made to process more than the
  allowed number of categories.

Dependencies:
- plotly: For creating interactive plots.
- fpdf: For generating PDF reports.
- pandas: Assumed for DataFrame manipulation and input.

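Functions:
- card_summary(df, categories, show_plot=True): Generates a summary card and
  plots for specified categories.
- heatmap_analysis(df, x_axis, y_axis, ...): Heatmap of sample counts for two
  categorical columns.
- histogram_analysis(df, x_axis, category_column, show_plot=True): Histogram of
  samples along one column, colored by another.
- duration_analysis(df, category_column, duration_column, show_plot=True): Box
  plot of a numeric column per category.
- daily_distribution_analysis(df, date_column, category_column, show_plot=True):
  Histogram of samples per day, colored by category.
- duration_distribution(df, time_unit='s', show_plot=True): Distribution plot
  of the 'duration' column.
- export_file_names_summary_pdf_leec(df, file_name, analysis_title=None,
  width=210): Exports the analyses above as a PDF report.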

Note:
- This module is designed to work with pandas DataFrames and expects a specific
  structure/format for input data.
"""
import warnings
import tempfile
import time
import pandas as pd

import pkg_resources
from fpdf import FPDF
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots


class CategoryLimitError(Exception):
    """
    Signal that too many categories were requested.

    Raised by the summary-card routine when the caller passes more than two
    category names.
    """


def card_summary(df, categories, show_plot: bool = True):
    """
    Generates a summary card and plots for specified categories from a DataFrame.

    The card reports the number of distinct samples (``file_path``), the
    number of distinct days (``dt``), the total and mean duration converted
    to minutes, and the number of distinct values of each requested category.
    Every statistic is also rendered as a Plotly "number" indicator laid out
    on a 2 x 3 grid.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame containing at least the following columns:
        'file_path', 'dt', and 'duration'. Additional columns should match the
        specified categories if any.
    categories : list of str
        A list of category names (column names in `df`) to include in the
        summary and plot. At most two categories can be specified, because
        the grid only has two free cells after the four fixed indicators.
    show_plot : bool, optional
        If True (default), the figure is displayed with ``fig.show()``.
        If False, the figure is built but not displayed.

    Raises
    ------
    CategoryLimitError
        If more than two categories are specified.

    Returns
    -------
    tuple
        Returns a tuple containing:
        - card_dict (dict): A dictionary with keys for 'n_samples',
          'distinct_days', 'total_time_duration', 'mean_time_duration', and
          one key per category specified. The values are the respective
          computed statistics.
        - fig (plotly.graph_objs._figure.Figure): A Plotly figure object with
          indicators for each of the statistics and categories specified.
          The figure is always returned, regardless of `show_plot`.

    Notes
    -----
    The 'duration' column is expected to be in seconds; totals and means are
    divided by 60 and labelled as minutes.

    Examples
    --------
    >>> from maui import samples, eda
    >>> df = samples.get_audio_sample(dataset="leec")
    >>> categories = ['landscape', 'environment']
    >>> card_dict, fig = eda.card_summary(df, categories)
    """
    if len(categories) > 2:
        raise CategoryLimitError("At most two categories should be selected.")

    # Distinct-value count for every column; indexed by column name below.
    df_count = df.nunique(axis=0)
    duration_mean = df["duration"].mean() / 60
    duration_total = df["duration"].sum() / 60

    card_dict = {
        "n_samples": df_count["file_path"],
        "distinct_days": df_count["dt"],
        "total_time_duration": duration_total,
        "mean_time_duration": duration_mean,
    }

    # (subplot title, value, number formatting), listed in row-major grid order.
    cards = [
        ("Distinct Days", card_dict["distinct_days"], {"suffix": ""}),
        ("Total Duration", card_dict["total_time_duration"], {"suffix": " min"}),
        ("Mean Duration", card_dict["mean_time_duration"], {"suffix": " min"}),
        ("Samples", card_dict["n_samples"], {"prefix": ""}),
    ]
    for category in categories:
        card_dict[category] = df_count[category]
        cards.append((category, card_dict[category], {"prefix": ""}))

    n_rows, n_cols = 2, 3
    specs = [[{"type": "indicator"} for _ in range(n_cols)] for _ in range(n_rows)]
    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=[title for title, _, _ in cards],
        specs=specs,
    )

    for slot, (_, value, number_format) in enumerate(cards):
        row, col = divmod(slot, n_cols)
        indicator = go.Indicator(
            mode="number",
            value=value,
            number=number_format,
            # Kept as in the original trace definition so the figure is unchanged.
            delta={"position": "top", "reference": 320},
            domain={"x": [0, 1], "y": [0, 1]},
        )
        # add_trace with row/col replaces the deprecated Figure.append_trace.
        fig.add_trace(indicator, row=row + 1, col=col + 1)

    if show_plot:
        fig.show()

    return card_dict, fig
# ----------------------------------------------------------------------------
def heatmap_analysis(
    df,
    x_axis: str,
    y_axis: str,
    color_continuous_scale="Viridis",
    show_plot: bool = True,
    **kwargs,
):
    """
    Generates a heatmap to analyze the relationship between two categorical
    variables in a DataFrame.

    The data is grouped by `x_axis` and `y_axis`, the non-null 'file_path'
    entries of each group are counted, and every combination of observed
    `x_axis` and `y_axis` values that has no rows is added with a count of 0.
    The completed table is pivoted and drawn with Plotly Express ``imshow``.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame containing the data to be analyzed. Must include
        the columns specified by `x_axis` and `y_axis`, as well as a
        'file_path' column used for counting occurrences.
    x_axis : str
        The name of the column in `df` used as the pivot index.
    y_axis : str
        The name of the column in `df` used as the pivot columns.
    color_continuous_scale : str, optional
        The name of the color scale to use for the heatmap. Defaults to
        'Viridis'.
    show_plot : bool, optional
        If True (default), displays the heatmap plot. If False, the plot is
        not displayed but is still returned.
    **kwargs : dict
        Only `height` (default 600) and `width` (default 800) are read; they
        set the figure size.

    Returns
    -------
    tuple
        A tuple containing:
        - df_group_complete (pandas.DataFrame): One row per combination of
          `x_axis` and `y_axis` values with a 'count' column; combinations
          absent from the data have a count of 0.
        - fig (plotly.graph_objs._figure.Figure): A Plotly figure object
          containing the heatmap.

    Notes
    -----
    Missing values in `x_axis` or `y_axis` are left out of the axis values.
    The grouping step already ignores such rows, and sorting a column that
    mixes strings with NaN would otherwise raise ``TypeError``.

    Examples
    --------
    >>> from maui import samples, eda
    >>> df = samples.get_audio_sample(dataset="leec")
    >>> df_group, fig = eda.heatmap_analysis(df, 'landscape', 'environment')
    """
    # Count occurrences for each observed (x, y) pair
    df_group = df.groupby([x_axis, y_axis], as_index=False)["file_path"].count()
    df_group = df_group.rename(columns={"file_path": "count"})

    # All possible combinations of the observed (non-missing) values
    x_values = sorted(df[x_axis].dropna().unique())
    y_values = sorted(df[y_axis].dropna().unique())
    full_index = pd.MultiIndex.from_product(
        [x_values, y_values], names=[x_axis, y_axis]
    )

    # Reindex so every combination exists, filling missing ones with zero
    df_group_complete = (
        df_group.set_index([x_axis, y_axis])
        .reindex(full_index, fill_value=0)
        .reset_index()
    )

    # Wide table: rows are x values, columns are y values, cells are counts
    df_group_pivot = df_group_complete.pivot(
        index=x_axis, columns=y_axis, values="count"
    )

    fig = px.imshow(
        df_group_pivot,
        color_continuous_scale=color_continuous_scale,
        text_auto=True,
        title=f"Heatmap - Número de arquivos de áudio por {x_axis} e {y_axis}",
    )
    fig.update_layout(
        title_x=0.5,
        height=kwargs.get("height", 600),
        width=kwargs.get("width", 800),
    )

    if show_plot:
        fig.show()

    return df_group_complete, fig
# ----------------------------------------------------------------------------
def histogram_analysis(df, x_axis: str, category_column: str, show_plot: bool = True):
    """
    Build a histogram of samples along `x_axis`, colored by `category_column`.

    The x-axis values are ordered from the most to the least frequent, based
    on the number of rows each value has in `df`. Bars use an opacity of 0.7
    and a bar gap of 0.1, and the title is centered.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. Must include the columns named by `x_axis` and
        `category_column`.
    x_axis : str
        Column placed on the x-axis of the histogram.
    category_column : str
        Column whose values determine the bar colors.
    show_plot : bool, optional
        If True (default), the figure is displayed before being returned.
        If False, it is only returned.

    Returns
    -------
    plotly.graph_objs._figure.Figure
        The histogram figure.

    Examples
    --------
    >>> from maui import samples, eda
    >>> df = samples.get_audio_sample(dataset="leec")
    >>> fig = eda.histogram_analysis(df, 'landscape', 'environment')
    """
    # Rank the x-axis values by row count, most frequent first
    frequency = df.groupby(x_axis).size()
    ranked_values = frequency.sort_values(ascending=False).index.tolist()

    plot_title = f"Amount of samples by {x_axis} and segmented by {category_column}"
    fig = px.histogram(
        df,
        x=x_axis,
        color=category_column,
        opacity=0.7,
        title=plot_title,
        category_orders={x_axis: ranked_values},
    )
    fig.update_layout(bargap=0.1, title_x=0.5)

    if not show_plot:
        return fig

    fig.show()
    return fig
# ----------------------------------------------------------------------------
def duration_analysis(df, category_column: str, duration_column: str, show_plot=True):
    """
    Build a box plot of `duration_column` grouped by `category_column`.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. Must include the columns named by `category_column` and
        `duration_column`.
    category_column : str
        Column placed on the x-axis; one box is drawn per distinct value.
    duration_column : str
        Column placed on the y-axis, holding the values summarized by each box.
    show_plot : bool, optional
        If True (default), the figure is displayed before being returned.
        If False, it is only returned.

    Returns
    -------
    plotly.graph_objs._figure.Figure
        The box plot figure, titled "Duration distribution by
        <category_column>" with the title centered.

    Examples
    --------
    >>> from maui import samples, eda
    >>> df = samples.get_audio_sample(dataset="leec")
    >>> fig = eda.duration_analysis(df, 'landscape', 'duration')
    """
    box_options = {
        "x": category_column,
        "y": duration_column,
        "title": f"Duration distribution by {category_column}",
    }
    fig = px.box(df, **box_options)
    fig.update_layout(title_x=0.5)

    if not show_plot:
        return fig

    fig.show()
    return fig
# ----------------------------------------------------------------------------
def daily_distribution_analysis(
    df, date_column: str, category_column: str, show_plot=True
):
    """
    Analyzes and visualizes the daily distribution of samples by categories.

    The values of `date_column` are parsed with ``pandas.to_datetime``; values
    that cannot be parsed become NaT, trigger a ``UserWarning`` and are left
    out of the plot. The remaining rows are drawn as a histogram over time,
    colored by `category_column`.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame containing the data to be analyzed. It must include the
        specified `date_column` and `category_column`. The DataFrame passed in
        is not modified; parsing and filtering happen on a copy.
    date_column : str
        The name of the column in `df` that contains date information.
    category_column : str
        The name of the column in `df` that contains categorical data, which
        will be used to color the bars in the histogram.
    show_plot : bool, optional
        If True (default), the function will display the generated plot. If
        False, the plot will not be displayed but will still be returned.

    Returns
    -------
    plotly.graph_objs._figure.Figure
        A Plotly figure object representing the histogram of daily sample
        distribution by the specified category.

    Warns
    -----
    UserWarning
        If one or more values of `date_column` are missing or cannot be
        parsed as dates.

    Examples
    --------
    >>> from maui import samples, eda
    >>> df = samples.get_audio_sample(dataset="leec")
    >>> fig = eda.daily_distribution_analysis(df, 'dt', 'landscape')
    """
    df_original_size = len(df)

    # errors="coerce" turns unparseable values into NaT instead of raising
    parsed_dates = pd.to_datetime(df[date_column], errors="coerce")

    invalid_dates_count = parsed_dates.isna().sum()
    if invalid_dates_count > 0:
        warnings.warn(
            f"Found {invalid_dates_count} invalid dates out of {df_original_size} total records. "
            f"These will be removed from the analysis.",
            UserWarning,
            stacklevel=2,
        )

    # Work on a copy so the caller's DataFrame keeps its original column
    plot_df = df.copy()
    plot_df[date_column] = parsed_dates
    plot_df = plot_df.dropna(subset=[date_column])

    # Number of distinct calendar days among the valid rows
    num_days = plot_df[date_column].dt.date.nunique()

    fig = px.histogram(
        plot_df,
        x=date_column,
        color=category_column,
        opacity=0.7,
        title=f"Amount of samples by Day and {category_column}",
        nbins=num_days,  # request one bin per distinct day
    )
    fig.update_layout(bargap=0.1, title_x=0.5)

    if show_plot:
        fig.show()

    return fig
# ----------------------------------------------------------------------------
def duration_distribution(df, time_unit='s', show_plot=True):
    """
    Build a distribution plot for the 'duration' column of `df`.

    The figure is produced by ``plotly.figure_factory.create_distplot`` from
    the values of the 'duration' column, using a single group labelled
    "duration".

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. Must include a numeric column named 'duration'.
    time_unit : str, optional
        Unit shown in the x-axis title, as in "Duration (<time_unit>)".
        Defaults to 's'. It only affects the label; values are not converted.
    show_plot : bool, optional
        If True (default), the figure is displayed before being returned.
        If False, it is only returned.

    Returns
    -------
    plotly.graph_objs._figure.Figure
        The distribution plot figure.

    Examples
    --------
    >>> from maui import samples, eda
    >>> df = samples.get_audio_sample(dataset="leec")
    >>> fig = eda.duration_distribution(df)
    """
    durations = df["duration"].values
    fig = ff.create_distplot([durations], ["duration"])

    layout_options = {
        "bargap": 0.005,
        "title_text": "Duration distribution",
        "title_x": 0.5,
        "xaxis_title": f"Duration ({time_unit})",
        "yaxis_title": "Density (number of files)",
    }
    fig.update_layout(**layout_options)

    if not show_plot:
        return fig

    fig.show()
    return fig
# ----------------------------------------------------------------------------


class PDF(FPDF):
    """
    FPDF subclass used for the report; it only customizes the page footer.
    """

    def footer(self):
        """
        Draw the centered footer line with the current page number.
        """
        footer_text = f"Generated with <3 by Maui Software - Page {self.page_no()}"
        # Position the footer 15 units above the bottom edge of the page
        self.set_y(-15)
        self.set_font("Helvetica", "I", 8)
        self.set_text_color(128)
        self.cell(0, 10, footer_text, 0, 0, "C")


def create_letterhead(pdf, width, image):
    """
    Place the letterhead image at the top-left corner of the current page.

    `image` is drawn at position (0, 0) with the given `width`.
    """
    pdf.image(image, 0, 0, width)


def create_title(pdf, title, subtitle=None):
    """
    Write the report title, an optional subtitle and today's date.
    """
    # Main title
    pdf.set_font("Helvetica", "b", 20)
    pdf.ln(100)
    pdf.write(5, title)
    pdf.ln(15)

    # Subtitle, only when one was provided
    if subtitle is not None:
        pdf.set_font("Helvetica", "b", 16)
        pdf.write(5, subtitle)
        pdf.ln(10)

    # Report date in grey, formatted as day/month/year
    report_date = time.strftime("%d/%m/%Y")
    pdf.set_font("Helvetica", "", 14)
    pdf.set_text_color(r=128, g=128, b=128)
    pdf.write(4, report_date)

    # Vertical gap before whatever follows the title block
    pdf.ln(30)


def write_to_pdf(pdf, words):
    """
    Write `words` as body text: black, regular Helvetica, size 12.
    """
    pdf.set_text_color(r=0, g=0, b=0)
    pdf.set_font("Helvetica", "", 12)
    pdf.write(5, words)


def write_subtitle(pdf, words):
    """
    Write `words` as a section heading: black, bold Helvetica, size 14.
    """
    pdf.set_text_color(r=0, g=0, b=0)
    pdf.set_font("Helvetica", "b", 14)
    pdf.write(5, words)
def export_file_names_summary_pdf_leec(
    df, file_name: str, analysis_title=None, width=210
):
    """
    Export a summary PDF report with analysis of file names for LEEC project.

    Every figure is rendered to a PNG inside a temporary directory, the PDF is
    assembled from those images, and the result is written to `file_name`.
    The report has a cover page (letterhead, title, summary card and an
    introductory paragraph) followed by three sections: landscape analysis,
    environment analysis and duration analysis.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing the data to be analyzed. The analyses read the
        'file_path', 'dt', 'duration', 'landscape' and 'environment' columns.
    file_name : str
        Name of the output PDF file.
    analysis_title : str, optional
        Subtitle printed under the main title on the cover page.
    width : int, optional
        Page width used to scale the letterhead and to center the images.
        Defaults to 210 (the A4 width in millimeters).

    Returns
    -------
    None

    Notes
    -----
    Figures are exported with ``Figure.write_image``.
    NOTE(review): static image export presumably needs Plotly's image-export
    engine installed in the environment - confirm against the project
    requirements.

    Examples
    --------
    >>> export_file_names_summary_pdf_leec(df, 'summary_report.pdf',
    ...                                    analysis_title='Audio Files Analysis')
    """
    categories = ["landscape", "environment"]
    wide_figure = {"height": 400, "width": 1200}

    with tempfile.TemporaryDirectory() as temp_dir:

        def image_path(stem):
            # Location of a rendered figure inside the temporary directory.
            return f"{temp_dir}/{stem}.png"

        # --- Render every figure to a PNG --------------------------------
        _, fig = card_summary(df, categories, show_plot=False)
        fig.write_image(image_path("summary1"), height=300, width=1200)

        # NOTE(review): summary2.png is rendered but never placed in the PDF.
        _, fig = heatmap_analysis(
            df,
            "landscape",
            "environment",
            color_continuous_scale="Viridis",
            show_plot=False,
        )
        fig.write_image(image_path("summary2"))

        # Each category gets the same three plots, segmented by the other one.
        for category, other in (
            ("landscape", "environment"),
            ("environment", "landscape"),
        ):
            fig = histogram_analysis(df, category, other, show_plot=False)
            fig.write_image(image_path(f"{category}1"), **wide_figure)

            fig = duration_analysis(df, category, "duration", show_plot=False)
            fig.write_image(image_path(f"{category}2"), **wide_figure)

            fig = daily_distribution_analysis(df, "dt", category, show_plot=False)
            fig.write_image(image_path(f"{category}3"), **wide_figure)

        fig = duration_distribution(df, show_plot=False)
        fig.write_image(image_path("duration1"), **wide_figure)

        # --- Assemble the PDF ---------------------------------------------
        title = "Audio Files Exploratory Data Analysis"
        subtitle = analysis_title

        pdf = PDF()  # A4 (210 by 297 mm)
        pdf.add_page()

        letterhead_cover = pkg_resources.resource_filename(
            "maui", "data/letterhead_cover.png"
        )
        letterhead = pkg_resources.resource_filename("maui", "data/letterhead.png")

        # Cover page: letterhead, title block, summary card and intro text
        create_letterhead(pdf, width, letterhead_cover)
        create_title(pdf, title, subtitle)

        image_width = 200
        image_x = (width - image_width) / 2  # centers the image on the page

        pdf.image(image_path("summary1"), w=image_width, x=image_x)
        pdf.ln(5)

        intro_text = (
            "This report contains a brief exploratory data analysis "
            "comprehending the data obtained by audio file names. "
            "The objective is to present an overview of the acoustic "
            "landscapes and environments of the recordings, as well as their duration. "
            "Further analysis such as false color spectrograms and acoustic indices "
            "summarization can be performed with Maui Software analysis and "
            "visualization tools."
        )
        write_to_pdf(pdf, intro_text)

        # (heading, [(image stem, line break after the image or None), ...])
        sections = [
            (
                "1. Landscape Analysis",
                [("landscape1", 5), ("landscape2", 5), ("landscape3", 10)],
            ),
            (
                "2. Environment Analysis",
                [("environment1", 5), ("environment2", 5), ("environment3", 10)],
            ),
            ("3. Duration Analysis", [("duration1", None)]),
        ]
        for heading, images in sections:
            pdf.add_page()
            create_letterhead(pdf, width, letterhead)
            pdf.ln(20)
            write_subtitle(pdf, heading)
            pdf.ln(20)
            for stem, gap_after in images:
                pdf.image(image_path(stem), w=image_width, x=image_x)
                if gap_after is not None:
                    pdf.ln(gap_after)

        # Written while the temporary directory (and its images) still exists
        pdf.output(file_name, "F")