API Reference#
Data Rules#
- class metadata_guardian.data_rules.AvailableCategory(value)#
Available Data Rules Categories.
- class metadata_guardian.data_rules.DataRule(*, rule_name, regex_pattern, documentation)#
DataRule instance with a regex pattern and its documentation.
- Parameters
rule_name (str) –
regex_pattern (str) –
documentation (str) –
- Return type
None
- class metadata_guardian.data_rules.DataRules(data_rules)#
Data Rules instances.
- Parameters
data_rules (RawDataRules) –
- Return type
None
- classmethod from_available_category(category)#
Get Data Rules from an available category.
- Parameters
category (metadata_guardian.data_rules.AvailableCategory) – the available category of the data rules
- Returns
the Data Rules instance
- Return type
- classmethod from_new_category(category, data_rules)#
Create data rules from a given category and data rules.
- Parameters
category (str) – the category of the Data Rules
data_rules (List[metadata_guardian.data_rules.DataRule]) – the list of data rules
- Returns
the Data Rules instance
- Return type
- classmethod from_path(path)#
Get Data Rules from a path.
- Parameters
path (str) – the path of the yaml file
- Returns
the Data Rules instance
- Return type
- validate_file(path)#
Validate a file content with the data rules defined.
- Parameters
path (str) – the file path
- Returns
the metadata guardian results
- Return type
- validate_word(word)#
Validate a word with the data rules defined.
- Parameters
word (str) – the word to validate
- Returns
the metadata guardian results
- Return type
- validate_words(words)#
Validate a list of words with the data rules defined.
- Parameters
words (List[str]) – the words to validate
- Returns
the metadata guardian results
- Return type
- class metadata_guardian.data_rules.MetadataGuardianResults(*, category, content, data_rules)#
Metadata Guardian Results instance with the content that matches the data rules.
- Parameters
category (str) –
content (str) –
data_rules (List[metadata_guardian.data_rules.DataRule]) –
- Return type
None
Configuration#
- metadata_guardian.conf.configure_logger()#
Configure the loguru logger with Rich.
- Returns
- Return type
None
Scanner#
- class metadata_guardian.scanner.ColumnScanner(*, data_rules, progression_bar_disabled=True)#
Column Scanner instance.
- Parameters
data_rules (metadata_guardian.data_rules.DataRules) –
progression_bar_disabled (bool) –
- Return type
None
- scan_external(source, database_name, table_name=None, include_comment=False)#
Scan the column names from the external source using a table name or a database name.
- Parameters
source (metadata_guardian.source.external.external_metadata_source.ExternalMetadataSource) – the ExternalMetadataSource to scan
database_name (str) – the name of the database
table_name (Optional[str]) – the name of the table
include_comment (bool) – whether the scan includes the comment section
- Returns
a Metadata Guardian report
- Return type
- async scan_external_async(source, database_name, tasks_limit=2, table_name=None, include_comment=False)#
Scan the column names from the external source using a table name or a database name. Note that it can generate multiple concurrent calls to your metadata source.
- Parameters
source (metadata_guardian.source.external.external_metadata_source.ExternalMetadataSource) – the ExternalMetadataSource to scan
database_name (str) – the name of the database
tasks_limit (int) – the limit of the tasks to run in parallel
table_name (Optional[str]) – the name of the table
include_comment (bool) – whether the scan includes the comment section
- Returns
a Metadata Guardian report
- Return type
- scan_local(source)#
Scan the column names from the local source.
- Parameters
source (metadata_guardian.source.local.local_metadata_source.LocalMetadataSource) – the MetadataSource to scan
- Returns
a Metadata Guardian report
- Return type
- class metadata_guardian.scanner.ContentFilesScanner(*, data_rules, progression_bar_disabled=True)#
Content Files Scanner instance.
- Parameters
data_rules (metadata_guardian.data_rules.DataRules) –
progression_bar_disabled (bool) –
- Return type
None
- scan_directory(directory_path, file_names_extension)#
Scan all the files inside directory path with the file name extension.
- Parameters
directory_path (str) – the directory path to scan
file_names_extension (str) – the file name extension to include (without the “.”)
- Returns
a Metadata Guardian report
- Return type
- scan_local_file(path)#
Scan a file with data rules.
- Parameters
path (str) – the path of the file to scan
- Returns
a Metadata Guardian report
- Return type
- class metadata_guardian.scanner.Scanner#
Scanner Interface.
- Return type
None
- abstract scan_external(source, database_name, table_name=None, include_comment=False)#
Scan the column names from the external source.
- Parameters
source (metadata_guardian.source.external.external_metadata_source.ExternalMetadataSource) – the ExternalMetadataSource to scan
database_name (str) – the name of the database
table_name (Optional[str]) – the name of the table
include_comment (bool) – whether the scan includes the comment section
- Returns
a Metadata Guardian report
- Return type
- abstract async scan_external_async(source, database_name, tasks_limit, table_name=None, include_comment=False)#
Scan the column names from the external source asynchronously.
- Parameters
source (metadata_guardian.source.external.external_metadata_source.ExternalMetadataSource) – the ExternalMetadataSource to scan
database_name (str) – the name of the database
tasks_limit (int) – the limit of the tasks to run in parallel
table_name (Optional[str]) – the name of the table
include_comment (bool) – whether the scan includes the comment section
- Returns
a Metadata Guardian report
- Return type
- abstract scan_local(source)#
Scan the local source.
- Parameters
source (metadata_guardian.source.local.local_metadata_source.LocalMetadataSource) – the LocalMetadataSource to scan
- Returns
a Metadata Guardian report
- Return type
Report#
- class metadata_guardian.report.MetadataGuardianReport(*, report_results=None)#
Metadata Guardian Report.
- Parameters
report_results (List[metadata_guardian.report.ReportResults]) –
- Return type
None
- append(other_report)#
Concatenate the results of another report with this one before making the report.
- Parameters
other_report (metadata_guardian.report.MetadataGuardianReport) – other report to append
- Returns
- Return type
None
- to_console()#
Display the metadata guardian results to the console.
- Returns
- Return type
None
- to_csv(file_path)#
Save the metadata guardian results to a CSV file.
- Parameters
file_path (str) – the path of the CSV file
- Returns
- Return type
None
- class metadata_guardian.report.ProgressionBar(disable)#
Progression Bar provides a progression bar to display the results of the scanner.
- Parameters
disable (bool) –
- Return type
None
- add_task_with_item(item_name, source_type, total, current_item='Starting')#
Add task in the Progression Bar.
- Parameters
item_name (str) – the name of the item to search
current_item (str) – the name of the current item
source_type (str) – the source type
total (int) – the total number of tables
- Returns
the created Task
- Return type
None
- update_item(current_item)#
Update the current item of the task.
- Parameters
current_item (str) – the name of the current item
- Returns
- Return type
None
- class metadata_guardian.report.ReportResults(*, source, results=None)#
Metadata Guardian Results.
- Parameters
source (str) –
results (List[metadata_guardian.data_rules.MetadataGuardianResults]) –
- Return type
None
Source#
- class metadata_guardian.source.metadata_source.ColumnMetadata(*, column_name, column_comment=None)#
Column Metadata instance.
- Parameters
column_name (str) –
column_comment (Optional[str]) –
- Return type
None
- as_list()#
Return as a raw list of strings.
- Returns
a list of strings
- Return type
Iterator[str]
Local Sources#
- class metadata_guardian.source.local.avro_schema_source.AvroSchemaSource(*, local_path, fs=<pyarrow._fs.LocalFileSystem object>, extra_connection_args=None)#
Instance for a local Avro Schema file.
- Parameters
local_path (str) –
fs (pyarrow._fs.FileSystem) –
extra_connection_args (Dict[str, Any]) –
- Return type
None
- get_column_names()#
Get column names from the AVRO Schema file.
- Returns
the list of the column names
- Return type
Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]
- property namespace: str#
Namespace of the AVRO schema.
- Returns
the namespace
- read()#
Read the AVRO Schema file.
- Return type
Union[str, bytes]
- classmethod type()#
The type of the source.
- Returns
the name of the source.
- Return type
str
- class metadata_guardian.source.local.avro_source.AvroSource(*, local_path, fs=<pyarrow._fs.LocalFileSystem object>, extra_connection_args=None)#
Instance for a local Avro file.
- Parameters
local_path (str) –
fs (pyarrow._fs.FileSystem) –
extra_connection_args (Dict[str, Any]) –
- Return type
None
- get_column_names()#
Get column names from the AVRO file.
- Returns
the list of the column names
- Return type
Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]
- get_field_attribute(attribute_name)#
Get the specific attribute from the AVRO Schema.
- Parameters
attribute_name (str) – the attribute name to get
- Returns
the list of attributes in the fields
- Return type
List[Optional[metadata_guardian.source.metadata_source.ColumnMetadata]]
- property namespace: str#
Namespace of the AVRO schema.
- Returns
the namespace
- read()#
Read the AVRO file.
- Return type
avro.datafile.DataFileReader
- classmethod type()#
The type of the source.
- Returns
the name of the source.
- Return type
str
- class metadata_guardian.source.local.local_metadata_source.LocalMetadataSource(*, local_path, fs=<pyarrow._fs.LocalFileSystem object>, extra_connection_args=None)#
LocalMetadataSource contract.
- Parameters
local_path (str) –
fs (pyarrow._fs.FileSystem) –
extra_connection_args (Dict[str, Any]) –
- Return type
None
- get_column_names()#
Get the column names from the schema.
- Returns
the list of the column names
- Return type
Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]
- read()#
Read the source local file.
- Returns
the file content
- Return type
pyarrow._dataset.Dataset
- class metadata_guardian.source.local.parquet_source.ParquetSource(*, local_path, fs=<pyarrow._fs.LocalFileSystem object>, extra_connection_args=None)#
Instance for a local Parquet file.
- Parameters
local_path (str) –
fs (pyarrow._fs.FileSystem) –
extra_connection_args (Dict[str, Any]) –
- Return type
None
- classmethod type()#
The type of the source.
- Returns
the name of the source.
- Return type
str
External Sources#
- class metadata_guardian.source.external.aws_source.AthenaSource(*, s3_staging_dir, catalog_name='AWSDataCatalog', region_name=None, aws_access_key_id=None, aws_secret_access_key=None, extra_connection_args=None)#
Athena Source instance.
- Parameters
s3_staging_dir (str) –
catalog_name (str) –
region_name (Optional[str]) –
aws_access_key_id (Optional[str]) –
aws_secret_access_key (Optional[str]) –
extra_connection_args (Dict[str, Any]) –
- Return type
None
- close_connection()#
Close the connection of the source.
- Returns
- Return type
None
- create_connection()#
Create the Athena connection.
- Return type
None
- get_column_names(database_name, table_name, include_comment=False)#
Get the column names from the table.
- Parameters
database_name (str) – the database name
table_name (str) – the table name
include_comment (bool) – include the comment
- Returns
the list of the column names
- Return type
Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]
- get_table_names_list(database_name)#
Get the table names list from the database in AWS Athena.
- Parameters
database_name (str) – the database name
- Returns
the list of the table names of the database
- Return type
Iterator[str]
- classmethod type()#
The type of the source.
- Returns
the name of the source.
- Return type
str
- class metadata_guardian.source.external.aws_source.GlueSource(*, region_name=None, aws_access_key_id=None, aws_secret_access_key=None, extra_connection_args=None)#
Glue Source instance.
- Parameters
region_name (Optional[str]) –
aws_access_key_id (Optional[str]) –
aws_secret_access_key (Optional[str]) –
extra_connection_args (Dict[str, Any]) –
- Return type
None
- close_connection()#
Close the connection of the source.
- Returns
- Return type
None
- create_connection()#
Create the Glue connection.
- Returns
- Return type
None
- get_column_names(database_name, table_name, include_comment=False)#
Get the column names from AWS Glue table.
- Parameters
database_name (str) – the name of the database
table_name (str) – the name of the table
include_comment (bool) – include the comments
- Returns
the list of the column names
- Return type
Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]
- get_table_names_list(database_name)#
Get the table names list from the database in AWS Glue.
- Parameters
database_name (str) – the database name
- Returns
the list of the table names of the database
- Return type
Iterator[str]
- classmethod type()#
The type of the source.
- Returns
the name of the source.
- Return type
str
- class metadata_guardian.source.external.deltatable_source.DeltaTableSource(*, uri, data_catalog=DataCatalog.AWS, external_data_catalog_disable=True, extra_connection_args=None)#
- Parameters
uri (str) –
data_catalog (deltalake.data_catalog.DataCatalog) –
external_data_catalog_disable (bool) –
extra_connection_args (Dict[str, Any]) –
- Return type
None
- close_connection()#
Close the connection of the source.
- Returns
- Return type
None
- create_connection()#
Create the DeltaTable instance.
- Returns
- Return type
None
- get_column_names(database_name=None, table_name=None, include_comment=False)#
Get column names from the Delta table.
- Parameters
database_name (Optional[str]) – the database name
table_name (Optional[str]) – the table name
include_comment (bool) – include the comment
- Returns
the list of the column names
- Return type
Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]
- get_table_names_list(database_name)#
Not relevant, just return the current Delta Table URI.
- Parameters
database_name (str) – the database name
- Returns
the list of the table names of the database
- Return type
Iterator[str]
- classmethod type()#
The type of the source.
- Returns
the name of the source.
- Return type
str
- class metadata_guardian.source.external.external_metadata_source.ExternalMetadataSource#
ExternalMetadataSource Source.
- Return type
None
- close_connection()#
Close the connection of the source.
- Returns
- Return type
None
- abstract create_connection()#
Create the connection of the source.
- Returns
- Return type
None
- abstract get_column_names(database_name, table_name, include_comment=False)#
Get the column names from the schema.
- Parameters
database_name (str) – the database name
table_name (str) – the table name
include_comment (bool) – include the comment
- Returns
the list of the column names
- Return type
Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]
- abstract get_table_names_list(database_name)#
Get the table names list from the database.
- Parameters
database_name (str) – the database name
- Returns
the list of the table names of the database
- Return type
Iterator[str]
- exception metadata_guardian.source.external.external_metadata_source.ExternalMetadataSourceException#
Raised when an error occurs in an external metadata source.
- class metadata_guardian.source.external.gcp_source.BigQuerySource(*, service_account_json_path, project=None, location=None, extra_connection_args=None)#
Instance of a BigQuery source.
- Parameters
service_account_json_path (str) –
project (Optional[str]) –
location (Optional[str]) –
extra_connection_args (Dict[str, Any]) –
- Return type
None
- close_connection()#
Close the BigQuery connection.
- Returns
- Return type
None
- create_connection()#
Get the BigQuery connection.
- Returns
- Return type
None
- get_column_names(database_name, table_name, include_comment=False)#
Get column names from the table of the dataset.
- Parameters
database_name (str) – the dataset name (in this case, the dataset plays the role of the database)
table_name (str) – the table name
include_comment (bool) – include the comment
- Returns
the list of the column names
- Return type
Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]
- get_table_names_list(database_name)#
Get the table names list from the GCP dataset.
- Parameters
database_name (str) – the dataset name (in this case, the dataset plays the role of the database)
- Returns
the list of the table names of the dataset
- Return type
Iterator[str]
- classmethod type()#
The type of the source.
- Returns
the name of the source.
- Return type
str
- class metadata_guardian.source.external.kafka_schema_registry_source.KafkaSchemaRegistryAuthentication(value)#
Authentication method for Kafka Schema Registry source.
- class metadata_guardian.source.external.kafka_schema_registry_source.KafkaSchemaRegistrySource(*, url, ssl_certificate_location=None, ssl_key_location=None, authenticator=KafkaSchemaRegistryAuthentication.USER_PWD, comment_field_name='doc', extra_connection_args=None)#
Instance of a Kafka Schema Registry source.
- Parameters
url (str) –
ssl_certificate_location (Optional[str]) –
ssl_key_location (Optional[str]) –
authenticator (metadata_guardian.source.external.kafka_schema_registry_source.KafkaSchemaRegistryAuthentication) –
comment_field_name (str) –
extra_connection_args (Dict[str, Any]) –
- Return type
None
- close_connection()#
Close the Kafka Schema Registry connection.
- Returns
- Return type
None
- create_connection()#
Create the connection of the Kafka Schema Registry.
- Returns
- Return type
None
- get_column_names(database_name, table_name, include_comment=False)#
Get the column names from the subject.
- Parameters
database_name (str) – not relevant
table_name (str) – the subject name
include_comment (bool) – include the comment
- Returns
the list of the column names
- Return type
Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]
- get_table_names_list(database_name)#
Get all the subjects from the Schema Registry.
- Parameters
database_name (str) – not relevant in that case
- Returns
the list of the table names of the database
- Return type
Iterator[str]
- classmethod type()#
The type of the source.
- Returns
the name of the source.
- Return type
str
- class metadata_guardian.source.external.mysql_source.MySQLAuthenticator(value)#
Authentication method for MySQL source.
- class metadata_guardian.source.external.mysql_source.MySQLSource(*, user, password, host, database=None, authenticator=MySQLAuthenticator.USER_PWD, extra_connection_args=None)#
Instance of a MySQL source.
- Parameters
user (str) –
password (str) –
host (str) –
database (Optional[str]) –
authenticator (metadata_guardian.source.external.mysql_source.MySQLAuthenticator) –
extra_connection_args (Dict[str, Any]) –
- Return type
None
- create_connection()#
Create a MySQL connection based on the MySQLAuthenticator.
- Returns
- Return type
None
- get_column_names(database_name, table_name, include_comment=False)#
Get column names from the table.
- Parameters
database_name (str) – the database name
table_name (str) – the table name
include_comment (bool) – include the comment
- Returns
the list of the column names
- Return type
Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]
- get_table_names_list(database_name)#
Get the table names list from the MySQL database.
- Parameters
database_name (str) – the database name
- Returns
the list of the table names of the database
- Return type
Iterator[str]
- classmethod type()#
The type of the source.
- Returns
the name of the source.
- Return type
str
- class metadata_guardian.source.external.snowflake_source.SnowflakeAuthenticator(value)#
Authentication method for Snowflake source.
- class metadata_guardian.source.external.snowflake_source.SnowflakeSource(*, sf_account, sf_user, sf_password, warehouse, schema_name, okta_account_name=None, oauth_token=None, oauth_host=None, authenticator=SnowflakeAuthenticator.USER_PWD, extra_connection_args=None)#
Instance of a Snowflake source.
- Parameters
sf_account (str) –
sf_user (str) –
sf_password (str) –
warehouse (str) –
schema_name (str) –
okta_account_name (Optional[str]) –
oauth_token (Optional[str]) –
oauth_host (Optional[str]) –
authenticator (metadata_guardian.source.external.snowflake_source.SnowflakeAuthenticator) –
extra_connection_args (Dict[str, Any]) –
- Return type
None
- close_connection()#
Close the Snowflake connection.
- Return type
None
- create_connection()#
Create a Snowflake connection based on the SnowflakeAuthenticator.
- Returns
- Return type
None
- get_column_names(database_name, table_name, include_comment=False)#
Get column names from the table.
- Parameters
database_name (str) – the database name
table_name (str) – the table name
include_comment (bool) – include the comment
- Returns
the list of the column names
- Return type
Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]
- get_table_names_list(database_name)#
Get the table names list from the Snowflake database.
- Parameters
database_name (str) – the database name
- Returns
the list of the table names of the database
- Return type
Iterator[str]
- classmethod type()#
The type of the source.
- Returns
the name of the source.
- Return type
str