Source code for maui.files_metadata.files_metadata
"""
This module provides utilities for managing and extracting metadata from
strings according to specified formats. It leverages YAML files to define
and verify the formats, enabling dynamic configuration of metadata extraction
procedures. Functions within this module allow for verification of YAML format
configurations, retrieval of format-specific configurations, and extraction of
metadata based on the defined formats.
Capabilities include:
- Verifying the structure and validity of YAML configuration files.
- Retrieving configuration data for specified formats from YAML files.
- Extracting metadata from strings based on configurable format definitions.
The module is designed to be flexible and extensible, supporting various
metadata tag configurations and formats, including custom date and time
processing functions for specialized needs.
Main Functions:
- verify_yaml_format(data): Checks if YAML data follows the expected structure.
- get_format_config(format_name, format_file_path): Retrieves the configuration
for a specific format from a YAML file.
- extract_metadata(string, format_name, date_time_func, format_file_path):
Extracts metadata from strings according to the specified format.
These functions support a wide range of applications in data processing and
analysis tasks, particularly where metadata extraction and validation against
pre-defined formats are required.
Note:
- The module depends on `re` for regex operations, `datetime` for handling date
and time data, `importlib.resources` for resource management, and `yaml` for
parsing YAML files.
Examples and detailed descriptions of parameters, return types, and exceptions
are provided in each function's docstring, guiding their use in specific
scenarios.
"""
import re
import datetime
from importlib import resources
import yaml
[docs]
def verify_yaml_format(data):
"""
Verify if the provided YAML data follows a specific format.
Parameters
----------
data : dict
A dictionary representing YAML data.
Returns
-------
bool
True if the YAML data follows the expected format, False otherwise.
"""
if "formats" not in data or not isinstance(data["formats"], list):
# If 'formats' key is missing or its value is not a list, return False
return False
for format_data in data["formats"]:
# Iterate through each format data in the 'formats' list
if (
"format_name" not in format_data
or "file_name_format" not in format_data
or "file_extension" not in format_data
or "metadata_tag_info" not in format_data
):
# If any of the required keys are missing in format data, return False
return False
metadata_tag_info = format_data["metadata_tag_info"]
for _, tag_info in metadata_tag_info.items():
# Iterate through each metadata tag info in the 'metadata_tag_info' dictionary
if (
"description" not in tag_info
or "type" not in tag_info
or "format" not in tag_info
):
# If any of the required keys are missing in tag info, return False
return False
# If all checks pass, return True indicating the YAML data follows the expected format
return True
# ---------------------------------------------------
[docs]
def get_format_config(format_name, format_file_path):
"""
Retrieve configuration for a specific format from a YAML file.
Parameters
----------
format_name : str
Name of the format to retrieve configuration for.
format_file_path : str
Path to the YAML file containing format configurations.
Returns
-------
dict
A dictionary containing configuration information for the specified format.
Raises
------
ValueError
If the provided YAML file is not properly formatted or if the specified format is not found.
"""
# Load data from .yaml
if format_file_path is None:
data = yaml.safe_load(
resources.files("maui.files_metadata")
.joinpath("files_formats.yaml").read_text(encoding="utf-8")
)
else:
with open(format_file_path, "r", encoding="utf-8") as file:
data = yaml.safe_load(file)
# Verify if YAML data follows the expected format
if not verify_yaml_format(data):
raise ValueError("The provided YAML is not properly formatted")
# Search for the format with provided name
for format_data in data["formats"]:
if format_data["format_name"] == format_name:
selected_format = format_data
break
else:
raise ValueError(f"{format_name} not found in the YAML data")
return selected_format
# ---------------------------------------------------
[docs]
def extract_metadata(string, format_name, date_time_func=None, format_file_path=None):
"""
Extract metadata from a string based on a specified format.
Parameters
----------
string : str
The string from which metadata will be extracted.
format_name : str
Name of the format to use for metadata extraction.
date_time_func : function, optional
A function to handle date and time processing for extracted metadata.
Default is None.
format_file_path : str, optional
Path to the YAML file containing format configurations.
Default is 'files_formats.yaml'.
Returns
-------
dict or None
A dictionary containing extracted metadata if successful, None otherwise.
Raises
------
ValueError
If the specified format is not found in the format file.
"""
# Retrieve format configuration from YAML file
file_format_config = get_format_config(format_name, format_file_path)
# Extract pattern and metadata dictionary from format configuration
pattern = file_format_config["file_name_format"]
metadata_dict = file_format_config["metadata_tag_info"]
# Fill pattern with metadata format placeholders
pattern_filled = pattern
for key in metadata_dict.keys():
pattern_filled = pattern_filled.replace(key, metadata_dict[key]["format"])
# Compile regex pattern and match against input string
regex = re.compile(pattern_filled)
result = regex.match(string)
# If match is found, extract metadata values and return as a dictionary
if result:
values = result.groups()
values = dict(zip(metadata_dict.keys(), values))
# If the format is "LEEC_FILE_FORMAT", handle specific date and time format
if format_name == "LEEC_FILE_FORMAT":
values["timestamp_init"] = datetime.datetime.strptime(
values["date"] + " " + values["time"], "%Y%m%d %H%M%S"
)
# If a date_time_func is provided, apply it to the metadata values
elif date_time_func is not None:
values = date_time_func(values)
return values
return None