Source code for maui.samples.audio_samples

"""
    This module offers a streamlined approach to retrieving information on
    audio samples within the Maui project framework. It serves to abstract the
    complexities of accessing and parsing audio file metadata, providing a simple
    method for users to obtain a structured and comprehensive overview of available
    audio samples. Each method returns a pandas DataFrame detailing
    the samples, including aspects such as file paths, durations, and other
    pertinent metadata.

    The functionality leverages the `maui.io` module for the extraction of audio
    information, ensuring consistency and reliability in the data presented.

    Functionality:
    - Simplifies the retrieval of audio sample metadata within the Maui
      framework.

    Usage:
    - Intended for use in data analysis workflows requiring access to structured
      information about specific sets of audio samples.

    Dependencies:
    - os: For handling file and directory paths.
    - maui.io: For underlying audio information extraction processes.

    Examples and additional details are provided in the function docstrings,
    guiding users in applying the module to their specific needs.
"""

import os
import zipfile
import urllib.error
import urllib.request
from pathlib import Path

import pandas as pd
import requests
import gdown
from tqdm import tqdm

import maui.io


def get_dataset_url(
    dataset: str,
) -> str:
    """
    Generate a Google Drive URL for the specified dataset.

    This function returns a direct download URL for the dataset based on the provided dataset name. 
    Currently, it supports the "leec" dataset, for which it constructs a URL from a pre-defined
    Google Drive file ID.

    Parameters
    ----------
    dataset : str
        The name of the dataset for which the download URL is requested.
        Currently supported datasets:
        - "leec": Returns the download URL for the "leec" dataset.

    Returns
    -------
    str
        A string containing the direct download URL from Google Drive for the specified dataset.
        If the dataset is not supported, the returned URL contains an empty file ID and is
        not a valid download link.

    Examples
    --------
    >>> get_dataset_url("leec")
    'https://drive.google.com/uc?id=1tw7BpPNBeS6Dz0XJOwwYuJOYJgd4XSUE'
    
    Notes
    -----
    - This function is designed to handle future datasets by mapping their names to specific Google
      Drive file IDs.
    - If an unsupported dataset is provided, an empty file ID will result in an invalid URL.
    """
    file_id = ""
    if dataset == "leec":
        file_id = "1tw7BpPNBeS6Dz0XJOwwYuJOYJgd4XSUE"
    return f"https://drive.google.com/uc?id={file_id}"


def get_audio_sample(dataset: str, extract_path: str = None) -> pd.DataFrame:
    """
    Get Leec Audio Samples available in maui.

    Parameters
    ----------
    dataset : str
        Dataset to be loaded. The available datasets are: leec
    extract_path : str
        Directory to extract sample files

    Returns
    -------
    df : pandas.DataFrame
        A DataFrame containing information about the audio samples.

    Examples
    --------
    To retrieve Leec audio samples and store the information in a DataFrame,
    you can call this function as follows:

    >>> from maui import samples
    >>> df = samples.get_audio_sample(dataset="leec")
    """
    available_datasets = ["leec"]
    if dataset not in available_datasets:
        raise Exception("Dataset not available")

    dataset_format_name = "unknown"
    if dataset == "leec":
        dataset_format_name = "LEEC_FILE_FORMAT"

    zip_file_name = f"{dataset}.zip"
    zip_file_path = os.path.join(os.path.dirname(__file__), "data", zip_file_name)
    if extract_path is None:
        extract_path = os.path.join(os.path.dirname(__file__), "data", dataset)

    print(zip_file_path)
    print(extract_path)

    file_url = get_dataset_url(dataset)

    os.makedirs(os.path.dirname(zip_file_path), exist_ok=True)
    gdown.download(file_url, zip_file_path, quiet=False)

    # Check if the file is a valid zip file
    try:
        with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
            zip_ref.testzip()
            # Extract all files to the specified directory
            zip_ref.extractall(extract_path)
            print("Extraction complete.")
    except zipfile.BadZipFile:
        print("Error: The downloaded file is not a valid zip file.")
        return None

    df = maui.io.get_audio_info(
        extract_path,
        format_name=dataset_format_name,
        store_duration=1,
        perc_sample=1,
    )

    return df
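

# Example (illustrative sketch): passing extract_path keeps the extracted sample
# files in a caller-chosen directory instead of the package's data folder;
# "./leec_samples" is an arbitrary example path.
#
# >>> from maui import samples
# >>> df = samples.get_audio_sample(dataset="leec", extract_path="./leec_samples")
# >>> df.head()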


def _get_xc_dataset(q: dict) -> pd.DataFrame:
    """
    Retrieves a dataset from the Xeno-canto API based on the provided query
    parameters.

    Parameters
    ----------
    q : dict
        A dictionary containing the query parameters to filter recordings from
        the Xeno-canto API. Example of keys: 'gen', 'sp', 'cnt', etc.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the recordings data retrieved from the Xeno-canto
        API. If no recordings are found or an error occurs, an empty DataFrame
        is returned.

    Notes
    -----
    - The method constructs the API URL from the query dictionary and sends a
      GET request.
    - If the API response contains recordings, they are converted into a
      DataFrame.
    """
    url = "https://xeno-canto.org/api/2/recordings?query="
    for i, (key, value) in enumerate(q.items()):
        if i >= 1:
            url += " "
        url += key + ":" + value

    response = requests.get(url)

    if response.status_code == 200:
        jsondata = response.json()

        # Check if there are recordings
        if "recordings" in jsondata and jsondata["recordings"]:
            # Create a DataFrame from the recordings data
            df_xc = pd.DataFrame(jsondata["recordings"])
        else:
            print("No recordings found.")
            df_xc = pd.DataFrame()  # Empty DataFrame in case no results
    else:
        print(f"Error: {response.status_code}")
        df_xc = pd.DataFrame()  # Empty DataFrame in case of an error

    return df_xc


def _download_xc_files(df: pd.DataFrame, extract_path: str) -> pd.DataFrame:
    """
    Downloads audio files from Xeno-canto based on a DataFrame of recordings and
    saves them locally.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the recordings information, including the file URLs
        and file names.
    extract_path : str
        The directory where the audio files will be saved. If the directory does
        not exist, it is created.

    Returns
    -------
    pd.DataFrame
        The input DataFrame with two additional columns:
        - 'local_file_path': The local path where the file was saved (or None if
          download failed).
        - 'file_downloaded': A boolean indicating whether the file was
          successfully downloaded.

    Notes
    -----
    - Files that already exist in the destination folder are skipped.
    - If a file download fails, the method will print an error and add a None
      value to the file path.
    """
    extract_path = Path(extract_path)
    os.makedirs(extract_path, exist_ok=True)

    path_list = []
    downloaded_list = []

    # Adding tqdm progress bar to the loop
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Downloading files"):
        file_name_xc = row["file-name"]
        full_path = os.path.join(extract_path, file_name_xc)

        if os.path.exists(full_path):
            print(f"File {file_name_xc} already exists.")
            path_list.append(full_path)
            downloaded_list.append(True)
        else:
            try:
                _, _ = urllib.request.urlretrieve(row["file"], full_path)
                path_list.append(full_path)
                downloaded_list.append(True)
            except (urllib.error.URLError, OSError) as e:
                print(f"File {file_name_xc} could not be downloaded. Error: {str(e)}")
                path_list.append(None)
                downloaded_list.append(False)

    df["local_file_path"] = path_list
    df["file_downloaded"] = downloaded_list

    return df
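

# Example (illustrative sketch): _get_xc_dataset() joins the query dictionary into a
# single Xeno-canto query string, one "key:value" pair per entry, separated by spaces
# (requests percent-encodes the spaces when the request is sent). The genus and
# country values below are arbitrary examples.
#
# >>> q = {"gen": "Pitangus", "cnt": "brazil"}
# builds: https://xeno-canto.org/api/2/recordings?query=gen:Pitangus cnt:brazil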


def get_xc_data(q: dict, extract_path: str) -> pd.DataFrame:
    """
    Retrieves and downloads Xeno-canto data based on a set of query parameters.

    Parameters
    ----------
    q : dict
        A dictionary of query parameters to filter recordings from Xeno-canto.
    extract_path : str
        The directory where the audio files will be saved.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the recordings data from Xeno-canto, with
        additional columns for the local file paths and file download status.

    Raises
    ------
    ValueError
        If unexpected query parameters are provided.

    Notes
    -----
    - The method first validates the query dictionary to ensure only valid keys
      are used.
    - After retrieving the recordings data using the `_get_xc_dataset` method,
      it downloads the audio files using the `_download_xc_files` method.

    Examples
    --------
    >>> from maui import samples
    >>> params = {
    ...     'cnt': 'brazil'
    ... }
    >>> df = samples.get_xc_data(q=params, extract_path="./xc_data")
    """
    query_params_list = [
        "id", "gen", "sp", "ssp", "group", "en", "rec", "cnt", "loc", "lat",
        "lng", "type", "sex", "stage", "method", "url", "file", "file-name",
        "sono", "osci", "lic", "q", "length", "time", "date", "uploaded",
        "also", "rmk", "bird-seen", "animal-seen", "playback-used",
        "temperature", "regnr", "auto", "dvc", "mic", "smp",
    ]

    extra_keys = set(q.keys()) - set(query_params_list)
    if extra_keys:
        raise ValueError(f"Unexpected keys found: {extra_keys}")

    df_xc = _get_xc_dataset(q)
    df_xc = _download_xc_files(df_xc, extract_path)

    return df_xc
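

# Example (illustrative sketch): the DataFrame returned by get_xc_data() carries the
# 'file_downloaded' flag added by _download_xc_files(), so failed downloads can be
# filtered out. The country filter and extract path below are arbitrary examples.
#
# >>> from maui import samples
# >>> df = samples.get_xc_data(q={"cnt": "brazil"}, extract_path="./xc_data")
# >>> downloaded = df[df["file_downloaded"]]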