API Reference#

Data Rules#

class metadata_guardian.data_rules.AvailableCategory(value)#

Available Data Rules Categories.

class metadata_guardian.data_rules.DataRule(*, rule_name, regex_pattern, documentation)#

DataRule instance with a regex pattern and a documentation.

Parameters
  • rule_name (str) –

  • regex_pattern (str) –

  • documentation (str) –

Return type

None

class metadata_guardian.data_rules.DataRules(data_rules)#

Data Rules instances.

Parameters

data_rules (RawDataRules) –

Return type

None

classmethod from_available_category(category)#

Get Data Rules from an available category.

Parameters

category (metadata_guardian.data_rules.AvailableCategory) – the available category of the data rules

Returns

the Data Rules instance

Return type

metadata_guardian.data_rules.DataRules

classmethod from_new_category(category, data_rules)#

Create data rules from a given category and data rules.

Parameters
Returns

the Data Rules instance

Return type

metadata_guardian.data_rules.DataRules

classmethod from_path(path)#

Get Data Rules from a path.

Parameters

path (str) – the path of the yaml file

Returns

the Data Rules instance

Return type

metadata_guardian.data_rules.DataRules

validate_file(path)#

Validate a file content with the data rules defined.

Parameters

path (str) – the file path

Returns

the metadata guardian results

Return type

List[metadata_guardian.data_rules.MetadataGuardianResults]

validate_word(word)#

Validate a word with the data rules defined.

Parameters

word (str) – the word to validate

Returns

the metadata guardian results

Return type

metadata_guardian.data_rules.MetadataGuardianResults

validate_words(words)#

Validate a list of words with the data rules defined.

Parameters

words (List[str]) – the words to validate

Returns

the metadata guardian results

Return type

List[metadata_guardian.data_rules.MetadataGuardianResults]

class metadata_guardian.data_rules.MetadataGuardianResults(*, category, content, data_rules)#

Metadata Guardian Results instance with the content that matches the data rules.

Parameters
Return type

None

Configuration#

metadata_guardian.conf.configure_logger()#

Configure the loguru logger with Rich.

Returns

Return type

None

Scanner#

class metadata_guardian.scanner.ColumnScanner(*, data_rules, progression_bar_disabled=True)#

Column Scanner instance.

Parameters
Return type

None

scan_external(source, database_name, table_name=None, include_comment=False)#

Scan the column names from the external source using a table name or a database name.

Parameters
Returns

a Metadata Guardian report

Return type

metadata_guardian.report.MetadataGuardianReport

async scan_external_async(source, database_name, tasks_limit=2, table_name=None, include_comment=False)#

Scan the column names from the external source using a table name or a database name. Note that it can generate multiple concurrent calls to your metadata source.

Parameters
Returns

a Metadata Guardian report

Return type

metadata_guardian.report.MetadataGuardianReport

scan_local(source)#

Scan the column names from the local source.

Parameters

source (metadata_guardian.source.local.local_metadata_source.LocalMetadataSource) – the MetadataSource to scan

Returns

a Metadata Guardian report

Return type

metadata_guardian.report.MetadataGuardianReport

class metadata_guardian.scanner.ContentFilesScanner(*, data_rules, progression_bar_disabled=True)#

Content Files Scanner instance.

Parameters
Return type

None

scan_directory(directory_path, file_names_extension)#

Scan all the files inside the directory path that have the given file name extension.

Parameters
  • directory_path (str) – the directory path to scan

  • file_names_extension (str) – the file name extension to include (without the “.”)

Returns

a Metadata Guardian report

Return type

metadata_guardian.report.MetadataGuardianReport

scan_local_file(path)#

Scan a file with data rules.

Parameters

path (str) – the path of the file to scan

Returns

a Metadata Guardian report

Return type

metadata_guardian.report.MetadataGuardianReport

class metadata_guardian.scanner.Scanner#

Scanner Interface.

Return type

None

abstract scan_external(source, database_name, table_name=None, include_comment=False)#

Scan the column names from the external source.

Parameters
Returns

a Metadata Guardian report

Return type

metadata_guardian.report.MetadataGuardianReport

abstract async scan_external_async(source, database_name, tasks_limit, table_name=None, include_comment=False)#

Scan the column names from the external source asynchronously.

Parameters
Returns

a Metadata Guardian report

Return type

metadata_guardian.report.MetadataGuardianReport

abstract scan_local(source)#

Scan the local source.

Parameters

source (metadata_guardian.source.local.local_metadata_source.LocalMetadataSource) – the LocalMetadataSource to scan

Returns

a Metadata Guardian report

Return type

metadata_guardian.report.MetadataGuardianReport

Report#

class metadata_guardian.report.MetadataGuardianReport(*, report_results=None)#

Metadata Guardian Report.

Parameters

report_results (List[metadata_guardian.report.ReportResults]) –

Return type

None

append(other_report)#

Concatenate the results before making the report.

Parameters

other_report (metadata_guardian.report.MetadataGuardianReport) – other report to append

Returns

Return type

None

to_console()#

Display the metadata guardian results to the console.

Returns

Return type

None

to_csv(file_path)#

Save the metadata guardian results to a CSV file.

Parameters

file_path (str) – the path of the CSV file

Returns

Return type

None

class metadata_guardian.report.ProgressionBar(disable)#

Progression Bar provides a progression bar to display the results of the scanner.

Parameters

disable (bool) –

Return type

None

add_task_with_item(item_name, source_type, total, current_item='Starting')#

Add task in the Progression Bar.

Parameters
  • item_name (str) – the name of the item to search

  • current_item (str) – the name of the current item

  • source_type (str) – the source type

  • total (int) – the total number of tables

Returns

the created Task

Return type

None

update_item(current_item)#

Update the current item of the task.

Parameters

current_item (str) – the name of the current item

Returns

Return type

None

class metadata_guardian.report.ReportResults(*, source, results=None)#

Metadata Guardian Results.

Parameters
Return type

None

Source#

class metadata_guardian.source.metadata_source.ColumnMetadata(*, column_name, column_comment=None)#

Column Metadata instance.

Parameters
  • column_name (str) –

  • column_comment (Optional[str]) –

Return type

None

as_list()#

Return as a raw list of strings.

Returns

a list of strings

Return type

Iterator[str]

class metadata_guardian.source.metadata_source.Metadata#

Metadata contract.

Return type

None

abstract as_list()#

Return as a raw list of strings.

Returns

a list of strings

Return type

Iterator[str]

class metadata_guardian.source.metadata_source.MetadataSource#

Metadata Source contract.

Return type

None

classmethod type()#

The type of the source.

Returns

the name of the source.

Return type

str

Local Sources#

class metadata_guardian.source.local.avro_schema_source.AvroSchemaSource(*, local_path, fs=<pyarrow._fs.LocalFileSystem object>, extra_connection_args=None)#

Instance for a local Avro Schema file.

Parameters
  • local_path (str) –

  • fs (pyarrow._fs.FileSystem) –

  • extra_connection_args (Dict[str, Any]) –

Return type

None

get_column_names()#

Get column names from the AVRO Schema file.

Returns

the list of the column names

Return type

Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]

property namespace: str#

Namespace of the AVRO schema.

Returns

the namespace

read()#

Read the AVRO Schema file.

Return type

Union[str, bytes]

classmethod type()#

The type of the source.

Returns

the name of the source.

Return type

str

class metadata_guardian.source.local.avro_source.AvroSource(*, local_path, fs=<pyarrow._fs.LocalFileSystem object>, extra_connection_args=None)#

Instance for a local Avro file.

Parameters
  • local_path (str) –

  • fs (pyarrow._fs.FileSystem) –

  • extra_connection_args (Dict[str, Any]) –

Return type

None

get_column_names()#

Get column names from the AVRO file.

Returns

the list of the column names

Return type

Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]

get_field_attribute(attribute_name)#

Get the specific attribute from the AVRO Schema.

Parameters

attribute_name (str) – the attribute name to get

Returns

the list of attributes in the fields

Return type

List[Optional[metadata_guardian.source.metadata_source.ColumnMetadata]]

property namespace: str#

Namespace of the AVRO schema.

Returns

the namespace

read()#

Read the AVRO file.

Return type

avro.datafile.DataFileReader

classmethod type()#

The type of the source.

Returns

the name of the source.

Return type

str

class metadata_guardian.source.local.local_metadata_source.LocalMetadataSource(*, local_path, fs=<pyarrow._fs.LocalFileSystem object>, extra_connection_args=None)#

Local Metadata Source contract.

Parameters
  • local_path (str) –

  • fs (pyarrow._fs.FileSystem) –

  • extra_connection_args (Dict[str, Any]) –

Return type

None

get_column_names()#

Get the column names from the schema.

Returns

the list of the column names

Return type

Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]

read()#

Read the source local file.

Returns

the file content

Return type

pyarrow._dataset.Dataset

class metadata_guardian.source.local.parquet_source.ParquetSource(*, local_path, fs=<pyarrow._fs.LocalFileSystem object>, extra_connection_args=None)#

Instance for a local Parquet file.

Parameters
  • local_path (str) –

  • fs (pyarrow._fs.FileSystem) –

  • extra_connection_args (Dict[str, Any]) –

Return type

None

classmethod type()#

The type of the source.

Returns

the name of the source.

Return type

str

External Sources#

class metadata_guardian.source.external.aws_source.AthenaSource(*, s3_staging_dir, catalog_name='AWSDataCatalog', region_name=None, aws_access_key_id=None, aws_secret_access_key=None, extra_connection_args=None)#

Athena Source instance.

Parameters
  • s3_staging_dir (str) –

  • catalog_name (str) –

  • region_name (Optional[str]) –

  • aws_access_key_id (Optional[str]) –

  • aws_secret_access_key (Optional[str]) –

  • extra_connection_args (Dict[str, Any]) –

Return type

None

close_connection()#

Close the connection of the source.

Returns

Return type

None

create_connection()#

Create the Athena connection.

Return type

None

get_column_names(database_name, table_name, include_comment=False)#

Get the column names from the table.

Parameters
  • database_name (str) – the database name

  • table_name (str) – the table name

  • include_comment (bool) – include the comment

Returns

the list of the column names

Return type

Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]

get_table_names_list(database_name)#

Get the table names list from the database in AWS Athena.

Parameters

database_name (str) – the database name

Returns

the list of the table names of the database

Return type

Iterator[str]

classmethod type()#

The type of the source.

Returns

the name of the source.

Return type

str

class metadata_guardian.source.external.aws_source.GlueSource(*, region_name=None, aws_access_key_id=None, aws_secret_access_key=None, extra_connection_args=None)#

Glue Source instance.

Parameters
  • region_name (Optional[str]) –

  • aws_access_key_id (Optional[str]) –

  • aws_secret_access_key (Optional[str]) –

  • extra_connection_args (Dict[str, Any]) –

Return type

None

close_connection()#

Close the connection of the source.

Returns

Return type

None

create_connection()#

Create the Glue connection.

Returns

Return type

None

get_column_names(database_name, table_name, include_comment=False)#

Get the column names from AWS Glue table.

Parameters
  • database_name (str) – the name of the database

  • table_name (str) – the name of the table

  • include_comment (bool) – include the comments

Returns

the list of the column names

Return type

Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]

get_table_names_list(database_name)#

Get the table names list from the database in AWS Glue.

Parameters

database_name (str) – the database name

Returns

the list of the table names of the database

Return type

Iterator[str]

classmethod type()#

The type of the source.

Returns

the name of the source.

Return type

str

class metadata_guardian.source.external.deltatable_source.DeltaTableSource(*, uri, data_catalog=DataCatalog.AWS, external_data_catalog_disable=True, extra_connection_args=None)#
Parameters
  • uri (str) –

  • data_catalog (deltalake.data_catalog.DataCatalog) –

  • external_data_catalog_disable (bool) –

  • extra_connection_args (Dict[str, Any]) –

Return type

None

close_connection()#

Close the connection of the source.

Returns

Return type

None

create_connection()#

Create the DeltaTable instance.

Returns

Return type

None

get_column_names(database_name=None, table_name=None, include_comment=False)#

Get column names from the Delta table.

Parameters
  • database_name (Optional[str]) – the database name

  • table_name (Optional[str]) – the table name

  • include_comment (bool) – include the comment

Returns

the list of the column names

Return type

Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]

get_table_names_list(database_name)#

Not relevant, just return the current Delta Table URI.

Parameters

database_name (str) – the database name

Returns

the list of the table names of the database

Return type

Iterator[str]

classmethod type()#

The type of the source.

Returns

the name of the source.

Return type

str

class metadata_guardian.source.external.external_metadata_source.ExternalMetadataSource#

External Metadata Source contract.

Return type

None

close_connection()#

Close the connection of the source.

Returns

Return type

None

abstract create_connection()#

Create the connection of the source.

Returns

Return type

None

abstract get_column_names(database_name, table_name, include_comment=False)#

Get the column names from the schema.

Parameters
  • database_name (str) – the database name

  • table_name (str) – the table name

  • include_comment (bool) – include the comment

Returns

the list of the column names

Return type

Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]

abstract get_table_names_list(database_name)#

Get the table names list from the database.

Parameters

database_name (str) – the database name

Returns

the list of the table names of the database

Return type

Iterator[str]

exception metadata_guardian.source.external.external_metadata_source.ExternalMetadataSourceException#

Raised when an error occurs in an external metadata source.

class metadata_guardian.source.external.gcp_source.BigQuerySource(*, service_account_json_path, project=None, location=None, extra_connection_args=None)#

Instance of a BigQuery source.

Parameters
  • service_account_json_path (str) –

  • project (Optional[str]) –

  • location (Optional[str]) –

  • extra_connection_args (Dict[str, Any]) –

Return type

None

close_connection()#

Close the BigQuery connection.

Returns

Return type

None

create_connection()#

Create the BigQuery connection.

Returns

Return type

None

get_column_names(database_name, table_name, include_comment=False)#

Get column names from the table of the dataset.

Parameters
  • database_name (str) – the dataset name (in this case, the dataset takes the place of the database)

  • table_name (str) – the table name

  • include_comment (bool) – include the comment

Returns

the list of the column names

Return type

Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]

get_table_names_list(database_name)#

Get the table names list from the GCP dataset.

Parameters

database_name (str) – the dataset name (in this case, the dataset takes the place of the database)

Returns

the list of the table names of the dataset

Return type

Iterator[str]

classmethod type()#

The type of the source.

Returns

the name of the source.

Return type

str

class metadata_guardian.source.external.kafka_schema_registry_source.KafkaSchemaRegistryAuthentication(value)#

Authentication method for Kafka Schema Registry source.

class metadata_guardian.source.external.kafka_schema_registry_source.KafkaSchemaRegistrySource(*, url, ssl_certificate_location=None, ssl_key_location=None, authenticator=KafkaSchemaRegistryAuthentication.USER_PWD, comment_field_name='doc', extra_connection_args=None)#

Instance of a Kafka Schema Registry source.

Parameters
Return type

None

close_connection()#

Close the Kafka Schema Registry connection.

Returns

Return type

None

create_connection()#

Create the connection of the Kafka Schema Registry.

Returns

Return type

None

get_column_names(database_name, table_name, include_comment=False)#

Get the column names from the subject.

Parameters
  • database_name (str) – not relevant

  • table_name (str) – the subject name

  • include_comment (bool) – include the comment

Returns

the list of the column names

Return type

Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]

get_table_names_list(database_name)#

Get all the subjects from the Schema Registry.

Parameters

database_name (str) – not relevant in that case

Returns

the list of the table names of the database

Return type

Iterator[str]

classmethod type()#

The type of the source.

Returns

the name of the source.

Return type

str

class metadata_guardian.source.external.mysql_source.MySQLAuthenticator(value)#

Authentication method for MySQL source.

class metadata_guardian.source.external.mysql_source.MySQLSource(*, user, password, host, database=None, authenticator=MySQLAuthenticator.USER_PWD, extra_connection_args=None)#

Instance of a MySQL source.

Parameters
Return type

None

create_connection()#

Create a MySQL connection based on the MySQLAuthenticator.

Returns

Return type

None

get_column_names(database_name, table_name, include_comment=False)#

Get column names from the table.

Parameters
  • database_name (str) – the database name

  • table_name (str) – the table name

  • include_comment (bool) – include the comment

Returns

the list of the column names

Return type

Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]

get_table_names_list(database_name)#

Get the table names list from the MySQL database.

Parameters

database_name (str) – the database name

Returns

the list of the table names of the database

Return type

Iterator[str]

classmethod type()#

The type of the source.

Returns

the name of the source.

Return type

str

class metadata_guardian.source.external.snowflake_source.SnowflakeAuthenticator(value)#

Authentication method for Snowflake source.

class metadata_guardian.source.external.snowflake_source.SnowflakeSource(*, sf_account, sf_user, sf_password, warehouse, schema_name, okta_account_name=None, oauth_token=None, oauth_host=None, authenticator=SnowflakeAuthenticator.USER_PWD, extra_connection_args=None)#

Instance of a Snowflake source.

Parameters
Return type

None

close_connection()#

Close the Snowflake connection.

Return type

None

create_connection()#

Create a Snowflake connection based on the SnowflakeAuthenticator.

Returns

Return type

None

get_column_names(database_name, table_name, include_comment=False)#

Get column names from the table.

Parameters
  • database_name (str) – the database name

  • table_name (str) – the table name

  • include_comment (bool) – include the comment

Returns

the list of the column names

Return type

Iterator[metadata_guardian.source.metadata_source.ColumnMetadata]

get_table_names_list(database_name)#

Get the table names list from the Snowflake database.

Parameters

database_name (str) – the database name

Returns

the list of the table names of the database

Return type

Iterator[str]

classmethod type()#

The type of the source.

Returns

the name of the source.

Return type

str