Checks

Module containing data quality check classes.

`AverageCheck`

Bases: ColumnTransformationCheck

Compute the average (AVG) of a numeric column for the filtered rows.

Inherits from ColumnTransformationCheck. Thresholds apply to the computed average.

Source code in src/koality/checks.py

class AverageCheck(ColumnTransformationCheck):
    """Data quality check on the mean of a numeric column.

    The metric is the SQL ``AVG`` of ``check_column``; the lower and upper
    thresholds are compared against that value. Everything except the fixed
    transformation name ``"avg"`` is handled by ``ColumnTransformationCheck``.
    """

    def __init__(
        self,
        database_accessor: str,
        database_provider: DatabaseProvider | None,
        table: str,
        check_column: str,
        lower_threshold: float = -math.inf,
        upper_threshold: float = math.inf,
        *,
        filters: dict[str, Any] | None = None,
        identifier_format: str = "identifier",
        identifier_placeholder: str = "ALL",
        date_info: str | None = None,
        extra_info: str | None = None,
        monitor_only: bool = False,
    ) -> None:
        """Forward all settings to the parent class with transformation name ``"avg"``."""
        parent_kwargs: dict[str, Any] = {
            "database_accessor": database_accessor,
            "database_provider": database_provider,
            "table": table,
            "check_column": check_column,
            "lower_threshold": lower_threshold,
            "upper_threshold": upper_threshold,
            "filters": filters,
            "identifier_format": identifier_format,
            "identifier_placeholder": identifier_placeholder,
            "date_info": date_info,
            "extra_info": extra_info,
            "monitor_only": monitor_only,
        }
        super().__init__(transformation_name="avg", **parent_kwargs)

    def transformation_statement(self) -> str:
        """Build the ``AVG(<column>) AS <check name>`` select expression."""
        column = self.in_memory_column
        alias = self.name
        return f"AVG({column}) AS {alias}"

`__init__(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

Initialize the average check.

Source code in src/koality/checks.py

def __init__(
    self,
    database_accessor: str,
    database_provider: DatabaseProvider | None,
    table: str,
    check_column: str,
    lower_threshold: float = -math.inf,
    upper_threshold: float = math.inf,
    *,
    filters: dict[str, Any] | None = None,
    identifier_format: str = "identifier",
    identifier_placeholder: str = "ALL",
    date_info: str | None = None,
    extra_info: str | None = None,
    monitor_only: bool = False,
) -> None:
    """Forward all settings to the parent class with transformation name ``"avg"``."""
    parent_kwargs: dict[str, Any] = {
        "database_accessor": database_accessor,
        "database_provider": database_provider,
        "table": table,
        "check_column": check_column,
        "lower_threshold": lower_threshold,
        "upper_threshold": upper_threshold,
        "filters": filters,
        "identifier_format": identifier_format,
        "identifier_placeholder": identifier_placeholder,
        "date_info": date_info,
        "extra_info": extra_info,
        "monitor_only": monitor_only,
    }
    super().__init__(transformation_name="avg", **parent_kwargs)

`transformation_statement()`

Return the SQL statement for computing the average.

Source code in src/koality/checks.py

def transformation_statement(self) -> str:
    """Build the ``AVG(<column>) AS <check name>`` select expression."""
    column = self.in_memory_column
    alias = self.name
    return f"AVG({column}) AS {alias}"

`ColumnTransformationCheck`

Bases: DataQualityCheck, ABC

Abstract class for data quality checks performing checks on a specific column of a table.

Parameters:

Name	Type	Description	Default
`transformation_name`	`str`	The name to refer to this check (in combination with check_column)	required
`table`	`str`	Name of BQ table (e.g., "project.dataset.table")	required
`check_column`	`str \| None`	Name of column to be checked (e.g., "category")	`None`
`lower_threshold`	`float`	Check will fail if check result < lower_threshold	`-inf`
`upper_threshold`	`float`	Check will fail if check result > upper_threshold	`inf`
`monitor_only`	`bool`	If True, the check still runs and its value is recorded, but the value is not compared against the thresholds	`False`
`extra_info`	`str \| None`	Optional additional text that will be added to the end of the failure message	`None`

Source code in src/koality/checks.py

class ColumnTransformationCheck(DataQualityCheck, abc.ABC):
    """Base class for checks that apply one SQL transformation to one column.

    Subclasses supply the select expression via ``transformation_statement``;
    this class wraps it into a full query and adds the WHERE clause derived
    from the configured filters.

    Args:
        transformation_name: The name to refer to this check (in combination with check_column)
        table: Name of BQ table (e.g., "project.dataset.table")
        check_column: Name of column to be checked (e.g., "category")
        lower_threshold: Check will fail if check result < lower_threshold
        upper_threshold: Check will fail if check result > upper_threshold
        monitor_only: If True, the check result is recorded without being
            compared against the thresholds
        extra_info: Optional additional text that will be added to the end of the failure message

    """

    def __init__(
        self,
        database_accessor: str,
        database_provider: DatabaseProvider | None,
        transformation_name: str,
        table: str,
        check_column: str | None = None,
        lower_threshold: float = -math.inf,
        upper_threshold: float = math.inf,
        *,
        filters: dict[str, Any] | None = None,
        identifier_format: str = "identifier",
        identifier_placeholder: str = "ALL",
        date_info: str | None = None,
        extra_info: str | None = None,
        monitor_only: bool = False,
    ) -> None:
        """Record the transformation name, then delegate the rest to ``DataQualityCheck``."""
        # Must be set before the parent constructor runs: the parent calls
        # assemble_name(), which reads transformation_name.
        self.transformation_name = transformation_name

        parent_kwargs: dict[str, Any] = {
            "database_accessor": database_accessor,
            "database_provider": database_provider,
            "table": table,
            "check_column": check_column,
            "lower_threshold": lower_threshold,
            "upper_threshold": upper_threshold,
            "filters": filters,
            "identifier_format": identifier_format,
            "identifier_placeholder": identifier_placeholder,
            "date_info": date_info,
            "extra_info": extra_info,
            "monitor_only": monitor_only,
        }
        super().__init__(**parent_kwargs)

    def assemble_name(self) -> str:
        """Return ``<last dotted segment of check_column>_<transformation_name>``."""
        leaf_column = self.check_column.rsplit(".", 1)[-1]
        return f"{leaf_column}_{self.transformation_name}"

    @abc.abstractmethod
    def transformation_statement(self) -> str:
        """Return the SQL select expression that computes this check's metric."""

    def query_boilerplate(self, metric_statement: str) -> str:
        """Wrap ``metric_statement`` in a ``SELECT ... FROM "<table>"`` skeleton."""
        return (
            "\n"
            "        SELECT\n"
            f"            {metric_statement}\n"
            "        FROM\n"
            f'            "{self.table}"\n'
            "        "
        )

    def assemble_query(self) -> str:
        """Build the full check query: select skeleton plus optional WHERE clause."""
        select_part = self.query_boilerplate(self.transformation_statement())

        active_filters = self.filters.copy()
        if isinstance(self, IqrOutlierCheck):
            # IQR outlier checks drop date-type filters from the WHERE clause.
            active_filters = {key: spec for key, spec in active_filters.items() if spec.get("type") != "date"}

        where_part = self.assemble_where_statement(active_filters, database_accessor=self.database_accessor)
        if not where_part:
            return select_part

        return select_part + "\n" + where_part

    def assemble_data_exists_query(self) -> str:
        """Build the query that yields '' when rows exist, else the table name."""
        table = self.table
        data_exists_query = (
            "\n"
            "        SELECT\n"
            f"            IF(COUNT(*) > 0, '', '{table}') AS empty_table\n"
            "        FROM\n"
            f'            "{table}"\n'
            "        "
        )

        where_part = self.assemble_where_statement(self.filters, database_accessor=self.database_accessor)
        if not where_part:
            return data_exists_query

        return f"{data_exists_query}\n{where_part}"

`__init__(database_accessor, database_provider, transformation_name, table, check_column=None, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

Initialize the column transformation check.

Source code in src/koality/checks.py

def __init__(
    self,
    database_accessor: str,
    database_provider: DatabaseProvider | None,
    transformation_name: str,
    table: str,
    check_column: str | None = None,
    lower_threshold: float = -math.inf,
    upper_threshold: float = math.inf,
    *,
    filters: dict[str, Any] | None = None,
    identifier_format: str = "identifier",
    identifier_placeholder: str = "ALL",
    date_info: str | None = None,
    extra_info: str | None = None,
    monitor_only: bool = False,
) -> None:
    """Record the transformation name, then delegate the rest to the parent class."""
    # Set first so the name is already available while the parent constructor runs.
    self.transformation_name = transformation_name

    parent_kwargs: dict[str, Any] = {
        "database_accessor": database_accessor,
        "database_provider": database_provider,
        "table": table,
        "check_column": check_column,
        "lower_threshold": lower_threshold,
        "upper_threshold": upper_threshold,
        "filters": filters,
        "identifier_format": identifier_format,
        "identifier_placeholder": identifier_placeholder,
        "date_info": date_info,
        "extra_info": extra_info,
        "monitor_only": monitor_only,
    }
    super().__init__(**parent_kwargs)

`assemble_data_exists_query()`

Assemble the SQL query to check if data exists in the table.

Source code in src/koality/checks.py

def assemble_data_exists_query(self) -> str:
    """Build the query that yields '' when rows exist, else the table name."""
    table = self.table
    data_exists_query = (
        "\n"
        "    SELECT\n"
        f"        IF(COUNT(*) > 0, '', '{table}') AS empty_table\n"
        "    FROM\n"
        f'        "{table}"\n'
        "    "
    )

    where_part = self.assemble_where_statement(self.filters, database_accessor=self.database_accessor)
    if not where_part:
        return data_exists_query

    return f"{data_exists_query}\n{where_part}"

`assemble_name()`

Return the check name combining column and transformation.

Source code in src/koality/checks.py

def assemble_name(self) -> str:
    """Return ``<last dotted segment of check_column>_<transformation_name>``."""
    leaf_column = self.check_column.rsplit(".", 1)[-1]
    return f"{leaf_column}_{self.transformation_name}"

`assemble_query()`

Assemble the complete SQL query for this check.

Source code in src/koality/checks.py

def assemble_query(self) -> str:
    """Build the full check query: select skeleton plus optional WHERE clause."""
    select_part = self.query_boilerplate(self.transformation_statement())

    active_filters = self.filters.copy()
    if isinstance(self, IqrOutlierCheck):
        # IQR outlier checks drop date-type filters from the WHERE clause.
        active_filters = {key: spec for key, spec in active_filters.items() if spec.get("type") != "date"}

    where_part = self.assemble_where_statement(active_filters, database_accessor=self.database_accessor)
    if not where_part:
        return select_part

    return select_part + "\n" + where_part

`query_boilerplate(metric_statement)`

Return the base SQL query structure with the given metric statement.

Source code in src/koality/checks.py

def query_boilerplate(self, metric_statement: str) -> str:
    """Wrap ``metric_statement`` in a ``SELECT ... FROM "<table>"`` skeleton."""
    return (
        "\n"
        "    SELECT\n"
        f"        {metric_statement}\n"
        "    FROM\n"
        f'        "{self.table}"\n'
        "    "
    )

`transformation_statement()` `abstractmethod`

Return the SQL transformation statement for this check.

Source code in src/koality/checks.py

@abc.abstractmethod
def transformation_statement(self) -> str:
    """Return the SQL select expression that computes this check's metric.

    The concrete checks shown here return ``<AGGREGATE>(...) AS <self.name>``;
    ``assemble_query`` passes the returned text to ``query_boilerplate``, which
    places it in the SELECT list.
    """

`CountCheck`

Bases: ColumnTransformationCheck

Check the number of rows or distinct values of a specific column.

Inherits from koality.checks.ColumnTransformationCheck, and thus, we refer to argument descriptions in its super class, except for the distinct argument which is added in this subclass.

Parameters:

Name	Type	Description	Default
`distinct`	`bool`	Indicates if the count should count all rows or only distinct values of a specific column. Note: distinct=True cannot be used with check_column="*".	`False`

Example: CountCheck( database_accessor="my-gcp-project.SHOP01", database_provider=None, table="my-gcp-project.SHOP01.skufeed_latest", check_column="sku_id", distinct=True, filters={ "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"}, "date": {"column": "DATE", "value": "2023-01-01", "type": "date"}, }, lower_threshold=10000.0, upper_threshold=99999.0, )

Source code in src/koality/checks.py

class CountCheck(ColumnTransformationCheck):
    """Count rows, or distinct values, of a column and compare against thresholds.

    All arguments except `distinct` are described on
    `koality.checks.ColumnTransformationCheck`, from which this class inherits.

    Args:
        distinct: When True, count only distinct values of `check_column`
                  instead of all rows.
                  Note: distinct=True cannot be used with check_column="*".

    Example:
    CountCheck(
        database_accessor="my-gcp-project.SHOP01",
        database_provider=None,
        table="my-gcp-project.SHOP01.skufeed_latest",
        check_column="sku_id",
        distinct=True,
        filters={
            "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"},
            "date": {"column": "DATE", "value": "2023-01-01", "type": "date"},
        },
        lower_threshold=10000.0,
        upper_threshold=99999.0,
    )

    """

    def __init__(
        self,
        database_accessor: str,
        database_provider: DatabaseProvider | None,
        table: str,
        check_column: str,
        lower_threshold: float = -math.inf,
        upper_threshold: float = math.inf,
        *,
        distinct: bool = False,
        filters: dict[str, Any] | None = None,
        identifier_format: str = "identifier",
        identifier_placeholder: str = "ALL",
        date_info: str | None = None,
        extra_info: str | None = None,
        monitor_only: bool = False,
    ) -> None:
        """Validate the distinct/wildcard combination and configure the count.

        Raises:
            KoalityError: If `distinct` is True while `check_column` is "*".

        """
        if distinct and check_column == "*":
            msg = "Cannot COUNT(DISTINCT *)! Either set check_column != '*' or distinct = False."
            raise KoalityError(msg)

        # Set before the parent constructor runs, like the transformation name.
        self.distinct = distinct

        count_kind = "distinct_count" if distinct else "count"
        parent_kwargs: dict[str, Any] = {
            "database_accessor": database_accessor,
            "database_provider": database_provider,
            "transformation_name": count_kind,
            "table": table,
            "check_column": check_column,
            "lower_threshold": lower_threshold,
            "upper_threshold": upper_threshold,
            "filters": filters,
            "identifier_format": identifier_format,
            "identifier_placeholder": identifier_placeholder,
            "date_info": date_info,
            "extra_info": extra_info,
            "monitor_only": monitor_only,
        }
        super().__init__(**parent_kwargs)

    def transformation_statement(self) -> str:
        """Build ``COUNT([DISTINCT ]<column>) AS <check name>``."""
        modifier = "DISTINCT " if self.distinct else ""
        return f"COUNT({modifier}{self.in_memory_column}) AS {self.name}"

    def assemble_name(self) -> str:
        """Name wildcard counts ``row_<transformation>``; otherwise defer to the parent."""
        if self.check_column != "*":
            return super().assemble_name()

        return f"row_{self.transformation_name}"

`__init__(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, distinct=False, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

Initialize the count check.

Source code in src/koality/checks.py

def __init__(
    self,
    database_accessor: str,
    database_provider: DatabaseProvider | None,
    table: str,
    check_column: str,
    lower_threshold: float = -math.inf,
    upper_threshold: float = math.inf,
    *,
    distinct: bool = False,
    filters: dict[str, Any] | None = None,
    identifier_format: str = "identifier",
    identifier_placeholder: str = "ALL",
    date_info: str | None = None,
    extra_info: str | None = None,
    monitor_only: bool = False,
) -> None:
    """Validate the distinct/wildcard combination and configure the count.

    Raises:
        KoalityError: If `distinct` is True while `check_column` is "*".

    """
    if distinct and check_column == "*":
        msg = "Cannot COUNT(DISTINCT *)! Either set check_column != '*' or distinct = False."
        raise KoalityError(msg)

    # Set before the parent constructor runs, like the transformation name.
    self.distinct = distinct

    count_kind = "distinct_count" if distinct else "count"
    parent_kwargs: dict[str, Any] = {
        "database_accessor": database_accessor,
        "database_provider": database_provider,
        "transformation_name": count_kind,
        "table": table,
        "check_column": check_column,
        "lower_threshold": lower_threshold,
        "upper_threshold": upper_threshold,
        "filters": filters,
        "identifier_format": identifier_format,
        "identifier_placeholder": identifier_placeholder,
        "date_info": date_info,
        "extra_info": extra_info,
        "monitor_only": monitor_only,
    }
    super().__init__(**parent_kwargs)

`assemble_name()`

Return the check name, using 'row_' prefix for wildcard columns.

Source code in src/koality/checks.py

def assemble_name(self) -> str:
    """Name wildcard counts ``row_<transformation>``; otherwise defer to the parent."""
    if self.check_column != "*":
        return super().assemble_name()

    return f"row_{self.transformation_name}"

`transformation_statement()`

Return the SQL statement for counting rows or distinct values.

Source code in src/koality/checks.py

def transformation_statement(self) -> str:
    """Build ``COUNT([DISTINCT ]<column>) AS <check name>``."""
    modifier = "DISTINCT " if self.distinct else ""
    return f"COUNT({modifier}{self.in_memory_column}) AS {self.name}"

`DataQualityCheck`

Bases: ABC

Abstract class for all data quality checks.

Provides generic methods relevant to all data quality check classes.

Parameters:

Name	Type	Description	Default
`table`	`str`	Name of BQ table (e.g., "project.dataset.table")	required
`check_column`	`str \| None`	Name of column to be checked (e.g., "category")	`None`
`lower_threshold`	`float`	Check will fail if check result < lower_threshold	`-inf`
`upper_threshold`	`float`	Check will fail if check result > upper_threshold	`inf`
`monitor_only`	`bool`	If True, the check still runs and its value is recorded, but the value is not compared against the thresholds	`False`
`extra_info`	`str \| None`	Optional additional text that will be added to the end of the failure message	`None`

Source code in src/koality/checks.py

class DataQualityCheck(abc.ABC):
    """Abstract class for all data quality checks.

    Provides generic methods relevant to all data quality check classes.

    Args:
        table: Name of BQ table (e.g., "project.dataset.table")
        check_column: Name of column to be checked (e.g., "category")
        lower_threshold: Check will fail if check result < lower_threshold
        upper_threshold: Check will fail if check result > upper_threshold
        monitor_only: If True, no checks will be performed
        extra_info: Optional additional text that will be added to the end of the failure message

    """

    def __init__(
        self,
        database_accessor: str,
        database_provider: DatabaseProvider | None,
        table: str,
        check_column: str | None = None,
        lower_threshold: float = -math.inf,
        upper_threshold: float = math.inf,
        *,
        filters: dict[str, Any] | None = None,
        identifier_format: str = "identifier",
        identifier_placeholder: str = "ALL",
        date_info: str | None = None,
        extra_info: str | None = None,
        monitor_only: bool = False,
    ) -> None:
        """Initialize the data quality check with configuration parameters."""
        self.database_provider = database_provider
        self.database_accessor = database_accessor
        self.query_wrapped = database_provider.type.lower() == "bigquery" if database_provider else False
        self.table = table
        self.lower_threshold = lower_threshold
        self.upper_threshold = upper_threshold
        self.monitor_only = monitor_only
        self.extra_info_string = f" {extra_info}" if extra_info else ""
        self.date_info_string = f" ({date_info})" if date_info else ""

        self.status = "NOT_EXECUTED"
        self.message: str | None = None
        self.bytes_billed: int = 0

        # Identifier format configuration
        self.identifier_format = identifier_format
        self.identifier_placeholder = identifier_placeholder

        # for where filter handling
        self.filters = self.get_filters(filters or {})

        # Find identifier filter by type and format based on identifier_format setting
        identifier_filter_result = self.get_identifier_filter(self.filters)
        if identifier_filter_result:
            filter_name, filter_config = identifier_filter_result
            # If value key is missing or explicitly None, treat as identifier_placeholder (meaning "no specific value")
            if "value" in filter_config and filter_config["value"] is not None:
                value = filter_config["value"]
            else:
                value = self.identifier_placeholder
            column = filter_config.get("column", "")

            if self.identifier_format == "identifier":
                # Format as "column=value"
                self.identifier = f"{column}={value}" if column else str(value)
                self.identifier_column = "IDENTIFIER"
            elif self.identifier_format == "filter_name":
                # Use filter name as column, value as-is
                self.identifier = str(value)
                self.identifier_column = filter_name.upper()
            else:  # column_name
                # Use database column name as column, value as-is
                self.identifier = str(value)
                self.identifier_column = column.upper() if column else "IDENTIFIER"
        else:
            self.identifier = "ALL"
            self.identifier_column = "IDENTIFIER"

        # Find date filter by type and store the filter dict
        date_filter_result = self.get_date_filter(self.filters)
        if date_filter_result:
            self.date_filter = date_filter_result[1]
        else:
            self.date_filter = None

        if check_column is None:
            self.check_column = "*"
        else:
            self.check_column = check_column

        self.name = self.assemble_name()
        self.result: dict[str, Any] | None = None

    @property
    def in_memory_column(self) -> str:
        """Return the column name to reference in in-memory queries.

        If a configured column references a nested field (e.g. "value.shopId"):
        - When querying data loaded via database_accessor: uses underscores ("value_shopId")
          because the executor flattens struct columns with underscore aliases
        - When querying existing DuckDB tables (no accessor): keeps dots ("value.shopId")
          to support native DuckDB struct column syntax

        This property provides the appropriate name without modifying the original
        configured `self.check_column` which is still used for result writing.
        """
        if isinstance(self.check_column, str) and "." in self.check_column:  # noqa: SIM102
            # Only convert to underscores if data was loaded via database_accessor
            # (which flattens structs). For native DuckDB tables, keep dotted notation.
            if self.database_accessor:
                return self.check_column.replace(".", "_")
        return self.check_column

    @property
    def query(self) -> str:
        """Return the SQL query for this check.

        The query is not cached: every access calls ``assemble_query()`` again.
        """
        return self.assemble_query()

    @abc.abstractmethod
    def assemble_query(self) -> str:
        """Assemble and return the SQL query for this check.

        ``check()`` reads the metric from the first result row under the column
        named ``self.name``, so the query must expose a column with that name.
        """

    @abc.abstractmethod
    def assemble_data_exists_query(self) -> str:
        """Assemble and return the SQL query to check if data exists.

        ``data_check()`` inspects the first column of the first result row: a
        falsy value (e.g. an empty string) means data exists, anything else is
        reported as the name of the empty table.
        """

    @abc.abstractmethod
    def assemble_name(self) -> str:
        """Assemble and return the name for this check.

        Called once at the end of ``__init__``; the result is stored in
        ``self.name`` and reported as ``METRIC_NAME`` by ``check()``.
        """

    def __repr__(self) -> str:
        """Return string representation combining identifier and check name."""
        if not hasattr(self, "identifier"):
            return self.name

        return f"{self.identifier}_{self.name}"

    def data_check(self, duckdb_client: duckdb.DuckDBPyConnection) -> dict:
        """Verify that the table queried by this check contains data.

        Note: The returned dict and failure message are aggregated later so that
        the same missing table is not reported more than once.

        Args:
            duckdb_client: DuckDB client for interacting with DuckDB

        Returns:
            An empty dict when data exists; otherwise a dict describing the
            missing data (date, metric name "data_exists", identifier, table).

        Raises:
            DatabaseError: If the data-exists query fails in DuckDB.

        """
        try:
            first_row = execute_query(
                self.assemble_data_exists_query(),
                duckdb_client,
                None,
                None,
            ).fetchone()
        except duckdb.Error as exc:
            msg = f"Error while executing data check query on {self.table}"
            raise DatabaseError(msg) from exc

        # The query yields '' when rows exist; no row at all counts as empty too.
        empty_table = first_row[0] if first_row else self.table
        if not empty_table:
            return {}

        if self.date_filter:
            date = self.date_filter["value"]
        else:
            date = dt.datetime.now(tz=dt.UTC).date().isoformat()

        self.message = f"No data in {empty_table} on {date} for: {self.identifier}"
        self.status = "FAIL"
        return {
            "DATE": date,
            "METRIC_NAME": "data_exists",
            self.identifier_column: self.identifier,
            "TABLE": empty_table,
        }

    def _check(self, duckdb_client: duckdb.DuckDBPyConnection, query: str) -> tuple[list[dict], str | None]:
        """Run ``query`` and return ``(rows, error)``.

        On success, ``rows`` is a list of column-name-to-value dicts and ``error``
        is None. A DuckDB error is not raised; it is returned as its string form
        together with an empty row list.
        """
        try:
            relation = execute_query(
                query,
                duckdb_client,
                None,
                None,
            )
        except duckdb.Error as exc:
            return [], str(exc)

        rows = [dict(zip(relation.columns, values, strict=False)) for values in relation.fetchall()]
        return rows, None

    def check(self, duckdb_client: duckdb.DuckDBPyConnection) -> dict:
        """Perform the data quality check and return results.

        If the check is set to `monitor_only`, the results of the
        check will be documented without comparison to the lower and
        upper thresholds.

        Args:
            duckdb_client: DuckDB client for interacting with DuckDB

        Returns:
            A dict containing all information and the result of the check

        """
        result, error = self._check(duckdb_client, self.query)

        check_value = result[0][self.name] if result else None
        check_value = float(check_value) if check_value is not None else None

        date = self.date_filter["value"] if self.date_filter else dt.datetime.now(tz=dt.UTC).date().isoformat()

        if error:
            # Map BigQuery binder "dataset not found" errors to a table existence failure
            err_str = str(error)
            # Any "not found" / "does not exist" error (BigQuery Binder or DuckDB) maps to table_exists
            lowered = err_str.lower()
            if "not found" in lowered or "does not exist" in lowered:
                self.status = "FAIL"
                result_dict = {
                    "DATE": date,
                    "METRIC_NAME": "table_exists",
                    self.identifier_column: self.identifier,
                    "TABLE": self.table,
                    "COLUMN": None,
                    "VALUE": None,
                    "LOWER_THRESHOLD": None,
                    "UPPER_THRESHOLD": None,
                    "RESULT": "FAIL",
                }
                self.result = result_dict
                return result_dict
            result = "ERROR"
            self.message = f"{self.identifier}: Metric {self.name} query errored with {error}"
        elif self.monitor_only:
            result = "MONITOR_ONLY"
        else:
            success = check_value is not None and self.lower_threshold <= check_value <= self.upper_threshold
            result = "SUCCESS" if success else "FAIL"

        result_dict = {
            "DATE": date,
            "METRIC_NAME": self.name,
            self.identifier_column: self.identifier,
            "TABLE": self.table,
            "COLUMN": self.check_column,
            "VALUE": check_value,
            "LOWER_THRESHOLD": self.lower_threshold,
            "UPPER_THRESHOLD": self.upper_threshold,
            "RESULT": result,
        }

        if result_dict["RESULT"] == "FAIL":
            value_string = f"{result_dict['VALUE']:.{FLOAT_PRECISION}f}" if result_dict["VALUE"] is not None else "NULL"
            self.message = (
                f"{self.identifier}: Metric {self.name} failed on {date}{self.date_info_string} "
                f"for {self.table}. Value {value_string} is not between {self.lower_threshold} and "
                f"{self.upper_threshold}.{self.extra_info_string}"
            )
        self.status = result_dict["RESULT"]
        self.result = result_dict

        return result_dict

    def __call__(self, duckdb_client: duckdb.DuckDBPyConnection) -> dict:
        """Execute the data quality check and return results."""
        data_check_result = self.data_check(duckdb_client)
        if data_check_result:
            return data_check_result

        return self.check(duckdb_client)

    @staticmethod
    def get_filters(filters_config: dict[str, Any]) -> dict[str, dict[str, Any]]:
        """Normalize filter configurations into plain filter dicts.

        Args:
            filters_config: Dictionary containing filter configurations.

        Example YAML:
            filters:
              partition_date:
                column: BQ_PARTITIONTIME
                value: yesterday-2  # 2 days before yesterday
                type: date  # auto-parses value as date
              shop_id:
                column: shopId
                value: EC0601
                type: identifier
              revenue:
                column: total_revenue
                value: 1000
                operator: ">="
              category:
                column: category
                value: ["toys", "electronics"]
                operator: "IN"

        Returns:
            A dict of the format:
                {"partition_date": {"column": "DATE", "value": "2020-01-01", "operator": "=", "type": "date"}, ...}

        """
        normalized: dict[str, dict[str, Any]] = {}

        for filter_name, raw_config in filters_config.items():
            # Accept FilterConfig objects (via model_dump()), plain dicts, or a bare value.
            if isinstance(raw_config, FilterConfig):
                settings = raw_config.model_dump()
            elif isinstance(raw_config, dict):
                settings = raw_config
            else:
                settings = {"value": raw_config}

            column = settings.get("column")
            value = settings.get("value")
            filter_type = settings.get("type", "other")

            # Values are parsed as dates for type "date" or when parse_as_date is set.
            parse_requested = filter_type == "date" or settings.get("parse_as_date", False)
            if value is not None and parse_requested:
                value = parse_date(str(value))

            operator = settings.get("operator", "=")

            # Without a column, only identifier-type filters are kept (they are
            # still needed for naming); every other column-less filter is dropped.
            if column is None and filter_type != "identifier":
                continue

            # The 'column' key is left out entirely when no column was given, so
            # downstream code can recognise naming-only filters.
            column_part = {} if column is None else {"column": column}
            normalized[filter_name] = {
                "value": value,
                "operator": operator,
                "type": filter_type,
                **column_part,
            }

        return normalized

    @staticmethod
    def get_date_filter(filters: dict[str, dict[str, Any]]) -> tuple[str, dict[str, Any]] | None:
        """Find the date filter (type='date') from the filters dict.

        Args:
            filters: The filters dict from get_filters().

        Returns:
            A tuple of (filter_name, filter_config) if found, None otherwise.

        """
        for name, config in filters.items():
            if config.get("type") == "date":
                return name, config
        return None

    @staticmethod
    def get_identifier_filter(filters: dict[str, dict[str, Any]]) -> tuple[str, dict[str, Any]] | None:
        """Find the identifier filter (type='identifier') from the filters dict.

        Args:
            filters: The filters dict from get_filters().

        Returns:
            A tuple of (filter_name, filter_config) if found, None otherwise.

        """
        for name, config in filters.items():
            if config.get("type") == "identifier":
                return (name, config)
        return None

    @staticmethod
    def assemble_where_statement(  # noqa: C901
        filters: dict[str, dict[str, Any]],
        *,
        strip_dotted_columns: bool = True,
        database_accessor: str | None = None,
    ) -> str:
        """Generate the where statement for the check query using the specified filters.

        Args:
            filters: A dict containing filter specifications, e.g.,
            strip_dotted_columns: When True (default), dotted column names (e.g. "a.b") are
                transformed based on the data source:
                - With database_accessor: converted to underscores ("a_b") for flattened data
                - Without database_accessor: kept as dots ("a.b") for native DuckDB structs
                If False, the full dotted expression is preserved regardless (used when
                querying source databases that expect the original dotted column syntax).
            database_accessor: Optional database accessor string. When provided and non-empty,
                indicates data was loaded from external source and dotted columns should be
                converted to underscores.

                Example filters:
                `{
                    'identifier': {
                        'column': 'shop_code',
                        'value': 'SHOP01',
                        'operator': '=',
                        'type': 'identifier'
                    },
                    'date': {
                        'column': 'date',
                        'value': '2023-01-01',
                        'operator': '=',
                        'type': 'date'
                    },
                    'revenue': {
                        'column': 'total_revenue',
                        'value': 1000,
                        'operator': '>='
                    },
                    'category': {
                        'column': 'category',
                        'value': ['toys', 'electronics'],
                        'operator': 'IN'
                    }
                }`

        Returns:
            A WHERE statement to restrict the data being used for the check, e.g.,
            'WHERE shop_code = 'SHOP01' AND date = '2023-01-01' AND total_revenue >= 1000'

        """
        if len(filters) == 0:
            return ""

        filters_statements = []
        for filter_dict in filters.values():
            column = filter_dict.get("column")
            value = filter_dict.get("value")
            # Skip identifier filters that do not specify a concrete value (used only for naming)
            if filter_dict.get("type") == "identifier" and (value is None or value == "*"):
                continue
            # If column is not provided we cannot build a WHERE condition
            if column is None:
                continue
            # If the column references a nested field (e.g. "value.shopId"):
            # - With database_accessor: convert to underscores (value_shopId) for flattened data
            # - Without database_accessor: keep dots (value.shopId) for native DuckDB structs
            # Callers can disable this by setting `strip_dotted_columns=False`
            # (used when querying source DBs where the original dotted expression is needed).
            if isinstance(column, str) and "." in column and strip_dotted_columns and database_accessor:
                column = column.replace(".", "_")
                # else: keep dotted notation for native DuckDB struct support

            operator = filter_dict.get("operator", "=")

            # Cast date columns for proper comparison
            is_date_filter = filter_dict.get("type") == "date"
            if is_date_filter:
                column = f"CAST({column} AS DATE)"

            # Handle NULL values with IS NULL / IS NOT NULL
            if value is None:
                if operator == "!=":
                    filters_statements.append(f"    {column} IS NOT NULL")
                else:
                    filters_statements.append(f"    {column} IS NULL")
                continue

            formatted_value = format_filter_value(value, operator)
            # Prefix DATE for date type filters
            if is_date_filter and operator not in ("BETWEEN", "IN", "NOT IN"):
                formatted_value = f"DATE {formatted_value}"
            filters_statements.append(f"    {column} {operator} {formatted_value}")

        if len(filters_statements) == 0:
            return ""

        return "WHERE\n" + "\nAND\n".join(filters_statements)

`in_memory_column` `property`

Return the column name to reference in in-memory queries.

If a configured column references a nested field (e.g. "value.shopId"):

- When querying data loaded via database_accessor: uses underscores ("value_shopId") because the executor flattens struct columns with underscore aliases.
- When querying existing DuckDB tables (no accessor): keeps dots ("value.shopId") to support native DuckDB struct column syntax.

This property provides the appropriate name without modifying the original configured self.check_column which is still used for result writing.

`query` `property`

Return the assembled SQL query for this check.

`call(duckdb_client)`

Execute the data quality check and return results.

Source code in src/koality/checks.py

def __call__(self, duckdb_client: duckdb.DuckDBPyConnection) -> dict:
    """Run the data-existence pre-check and, if it reports nothing, the actual check.

    A non-empty (truthy) result from data_check() is returned as-is and
    check() is not executed; otherwise the result of check() is returned.
    """
    return self.data_check(duckdb_client) or self.check(duckdb_client)

`init(database_accessor, database_provider, table, check_column=None, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

Initialize the data quality check with configuration parameters.

Source code in src/koality/checks.py

def __init__(
    self,
    database_accessor: str,
    database_provider: DatabaseProvider | None,
    table: str,
    check_column: str | None = None,
    lower_threshold: float = -math.inf,
    upper_threshold: float = math.inf,
    *,
    filters: dict[str, Any] | None = None,
    identifier_format: str = "identifier",
    identifier_placeholder: str = "ALL",
    date_info: str | None = None,
    extra_info: str | None = None,
    monitor_only: bool = False,
) -> None:
    """Store the check configuration and derive identifier, date filter and name.

    The raw ``filters`` config is normalized via get_filters(). The first
    identifier-typed filter determines ``self.identifier`` and
    ``self.identifier_column`` according to ``identifier_format``; the first
    date-typed filter is kept as ``self.date_filter``. ``check_column``
    falls back to ``"*"`` when not given. ``self.name`` is computed last via
    assemble_name(), after all other attributes are set.
    """
    self.database_provider = database_provider
    self.database_accessor = database_accessor
    # Only a configured provider of type "bigquery" (case-insensitive) sets this flag.
    if database_provider:
        self.query_wrapped = database_provider.type.lower() == "bigquery"
    else:
        self.query_wrapped = False
    self.table = table
    self.lower_threshold = lower_threshold
    self.upper_threshold = upper_threshold
    self.monitor_only = monitor_only
    # Pre-formatted message suffixes; empty when the info is not provided.
    self.extra_info_string = f" {extra_info}" if extra_info else ""
    self.date_info_string = f" ({date_info})" if date_info else ""

    # Execution state, updated when the check runs.
    self.status = "NOT_EXECUTED"
    self.message: str | None = None
    self.bytes_billed: int = 0

    # Identifier format configuration
    self.identifier_format = identifier_format
    self.identifier_placeholder = identifier_placeholder

    # Normalized filters used for WHERE clause handling
    self.filters = self.get_filters(filters or {})

    identifier_match = self.get_identifier_filter(self.filters)
    if not identifier_match:
        # No identifier filter configured at all.
        self.identifier = "ALL"
        self.identifier_column = "IDENTIFIER"
    else:
        filter_name, filter_config = identifier_match
        # A missing or explicitly None value means "no specific value":
        # fall back to the configured placeholder.
        configured_value = filter_config.get("value")
        value = self.identifier_placeholder if configured_value is None else configured_value
        column = filter_config.get("column", "")

        if self.identifier_format == "identifier":
            # "column=value", or just the value when no column is known
            self.identifier = f"{column}={value}" if column else str(value)
            self.identifier_column = "IDENTIFIER"
        elif self.identifier_format == "filter_name":
            # Result column is named after the filter; value is used as-is
            self.identifier = str(value)
            self.identifier_column = filter_name.upper()
        else:  # column_name
            # Result column is named after the database column; value is used as-is
            self.identifier = str(value)
            self.identifier_column = column.upper() if column else "IDENTIFIER"

    # Keep only the spec (not the name) of the date filter, if any
    date_match = self.get_date_filter(self.filters)
    self.date_filter = date_match[1] if date_match else None

    self.check_column = "*" if check_column is None else check_column

    self.name = self.assemble_name()
    self.result: dict[str, Any] | None = None

`repr()`

Return string representation combining identifier and check name.

Source code in src/koality/checks.py

def __repr__(self) -> str:
    """Return ``"<identifier>_<name>"``, or just the name if no identifier is set yet."""
    try:
        prefix = self.identifier
    except AttributeError:
        # The identifier attribute has not been assigned on this instance.
        return self.name
    return f"{prefix}_{self.name}"

`assemble_data_exists_query()` `abstractmethod`

Assemble and return the SQL query to check if data exists.

Source code in src/koality/checks.py

@abc.abstractmethod
def assemble_data_exists_query(self) -> str:
    """Assemble and return the SQL query to check if data exists.

    data_check() executes this query and reads the first column of the first
    returned row: a falsy value means data exists, while a truthy value is
    reported as the name of the table that has no data. When the query
    returns no row at all, data_check() reports ``self.table`` as empty.
    """

`assemble_name()` `abstractmethod`

Assemble and return the name for this check.

Source code in src/koality/checks.py

@abc.abstractmethod
def assemble_name(self) -> str:
    """Assemble and return the name for this check.

    The returned value is stored as ``self.name`` at the end of ``__init__``.
    check() uses it as the key for reading the metric value from the first
    result row and reports it as ``METRIC_NAME``.
    """

`assemble_query()` `abstractmethod`

Assemble and return the SQL query for this check.

Source code in src/koality/checks.py

@abc.abstractmethod
def assemble_query(self) -> str:
    """Assemble and return the SQL query for this check.

    NOTE(review): presumably this is what the ``query`` property exposes and
    what check() executes via ``self.query`` -- confirm against the property.
    check() reads the value under the key ``self.name`` from the first result
    row, so the query is expected to return a column with that name.
    """

`assemble_where_statement(filters, *, strip_dotted_columns=True, database_accessor=None)` `staticmethod`

Generate the where statement for the check query using the specified filters.

Parameters:

Name	Type	Description	Default
`filters`	`dict[str, dict[str, Any]]`	A dict containing filter specifications, e.g.,	required
`strip_dotted_columns`	`bool`	When True (default), dotted column names (e.g. "a.b") are transformed based on the data source: - With database_accessor: converted to underscores ("a_b") for flattened data - Without database_accessor: kept as dots ("a.b") for native DuckDB structs If False, the full dotted expression is preserved regardless (used when querying source databases that expect the original dotted column syntax).	`True`
`database_accessor`	`str \| None`	Optional database accessor string. When provided and non-empty, indicates data was loaded from external source and dotted columns should be converted to underscores. Example filters: `{ 'identifier': { 'column': 'shop_code', 'value': 'SHOP01', 'operator': '=', 'type': 'identifier' }, 'date': { 'column': 'date', 'value': '2023-01-01', 'operator': '=', 'type': 'date' }, 'revenue': { 'column': 'total_revenue', 'value': 1000, 'operator': '>=' }, 'category': { 'column': 'category', 'value': ['toys', 'electronics'], 'operator': 'IN' } }`	`None`

Returns:

Type	Description
`str`	A WHERE statement to restrict the data being used for the check, e.g.,
`str`	'WHERE shop_code = 'SHOP01' AND date = '2023-01-01' AND total_revenue >= 1000'

Source code in src/koality/checks.py

@staticmethod
def assemble_where_statement(  # noqa: C901
    filters: dict[str, dict[str, Any]],
    *,
    strip_dotted_columns: bool = True,
    database_accessor: str | None = None,
) -> str:
    """Generate the where statement for the check query using the specified filters.

    Args:
        filters: A dict containing filter specifications, e.g.,
        strip_dotted_columns: When True (default), dotted column names (e.g. "a.b") are
            transformed based on the data source:
            - With database_accessor: converted to underscores ("a_b") for flattened data
            - Without database_accessor: kept as dots ("a.b") for native DuckDB structs
            If False, the full dotted expression is preserved regardless (used when
            querying source databases that expect the original dotted column syntax).
        database_accessor: Optional database accessor string. When provided and non-empty,
            indicates data was loaded from external source and dotted columns should be
            converted to underscores.

            Example filters:
            `{
                'identifier': {
                    'column': 'shop_code',
                    'value': 'SHOP01',
                    'operator': '=',
                    'type': 'identifier'
                },
                'date': {
                    'column': 'date',
                    'value': '2023-01-01',
                    'operator': '=',
                    'type': 'date'
                },
                'revenue': {
                    'column': 'total_revenue',
                    'value': 1000,
                    'operator': '>='
                },
                'category': {
                    'column': 'category',
                    'value': ['toys', 'electronics'],
                    'operator': 'IN'
                }
            }`

    Returns:
        A WHERE statement to restrict the data being used for the check, e.g.,
        'WHERE shop_code = 'SHOP01' AND date = '2023-01-01' AND total_revenue >= 1000'

    """
    if len(filters) == 0:
        return ""

    filters_statements = []
    for filter_dict in filters.values():
        column = filter_dict.get("column")
        value = filter_dict.get("value")
        # Skip identifier filters that do not specify a concrete value (used only for naming)
        if filter_dict.get("type") == "identifier" and (value is None or value == "*"):
            continue
        # If column is not provided we cannot build a WHERE condition
        if column is None:
            continue
        # If the column references a nested field (e.g. "value.shopId"):
        # - With database_accessor: convert to underscores (value_shopId) for flattened data
        # - Without database_accessor: keep dots (value.shopId) for native DuckDB structs
        # Callers can disable this by setting `strip_dotted_columns=False`
        # (used when querying source DBs where the original dotted expression is needed).
        if isinstance(column, str) and "." in column and strip_dotted_columns and database_accessor:
            column = column.replace(".", "_")
            # else: keep dotted notation for native DuckDB struct support

        operator = filter_dict.get("operator", "=")

        # Cast date columns for proper comparison
        is_date_filter = filter_dict.get("type") == "date"
        if is_date_filter:
            column = f"CAST({column} AS DATE)"

        # Handle NULL values with IS NULL / IS NOT NULL
        if value is None:
            if operator == "!=":
                filters_statements.append(f"    {column} IS NOT NULL")
            else:
                filters_statements.append(f"    {column} IS NULL")
            continue

        formatted_value = format_filter_value(value, operator)
        # Prefix DATE for date type filters
        if is_date_filter and operator not in ("BETWEEN", "IN", "NOT IN"):
            formatted_value = f"DATE {formatted_value}"
        filters_statements.append(f"    {column} {operator} {formatted_value}")

    if len(filters_statements) == 0:
        return ""

    return "WHERE\n" + "\nAND\n".join(filters_statements)

`check(duckdb_client)`

Perform the data quality check and return results.

If the check is set to monitor_only, the results of the check will be documented without comparison to the lower and upper thresholds.

Parameters:

Name	Type	Description	Default
`duckdb_client`	`DuckDBPyConnection`	DuckDB client for interacting with DuckDB	required

Returns:

Type	Description
`dict`	A dict containing all information and the result of the check

Source code in src/koality/checks.py

def check(self, duckdb_client: duckdb.DuckDBPyConnection) -> dict:
    """Run the check query and evaluate its value against the thresholds.

    Outcomes, as stored in the ``RESULT`` key and in ``self.status``:

    - ``"FAIL"`` with ``METRIC_NAME`` ``"table_exists"`` when the query error
      text contains "not found" or "does not exist" (returned early).
    - ``"ERROR"`` for any other query error.
    - ``"MONITOR_ONLY"`` when the check is set to `monitor_only`; the value
      is documented without comparison to the thresholds.
    - ``"SUCCESS"`` when the value is not None and lies within
      ``[lower_threshold, upper_threshold]``, otherwise ``"FAIL"``.

    Args:
        duckdb_client: DuckDB client for interacting with DuckDB

    Returns:
        A dict containing all information and the result of the check

    """
    rows, error = self._check(duckdb_client, self.query)

    # The metric is read from the first row under the check's name and coerced to float.
    raw_value = rows[0][self.name] if rows else None
    check_value = None if raw_value is None else float(raw_value)

    # Report against the date filter's value, falling back to today's UTC date.
    if self.date_filter:
        date = self.date_filter["value"]
    else:
        date = dt.datetime.now(tz=dt.UTC).date().isoformat()

    if error:
        # Any "not found" / "does not exist" error (BigQuery Binder or DuckDB)
        # is reported as a failed table existence check instead of an ERROR.
        lowered = str(error).lower()
        if any(marker in lowered for marker in ("not found", "does not exist")):
            self.status = "FAIL"
            missing_table_result = {
                "DATE": date,
                "METRIC_NAME": "table_exists",
                self.identifier_column: self.identifier,
                "TABLE": self.table,
                "COLUMN": None,
                "VALUE": None,
                "LOWER_THRESHOLD": None,
                "UPPER_THRESHOLD": None,
                "RESULT": "FAIL",
            }
            self.result = missing_table_result
            return missing_table_result
        outcome = "ERROR"
        self.message = f"{self.identifier}: Metric {self.name} query errored with {error}"
    elif self.monitor_only:
        outcome = "MONITOR_ONLY"
    else:
        within_thresholds = (
            check_value is not None and self.lower_threshold <= check_value <= self.upper_threshold
        )
        outcome = "SUCCESS" if within_thresholds else "FAIL"

    result_dict = {
        "DATE": date,
        "METRIC_NAME": self.name,
        self.identifier_column: self.identifier,
        "TABLE": self.table,
        "COLUMN": self.check_column,
        "VALUE": check_value,
        "LOWER_THRESHOLD": self.lower_threshold,
        "UPPER_THRESHOLD": self.upper_threshold,
        "RESULT": outcome,
    }

    if result_dict["RESULT"] == "FAIL":
        if result_dict["VALUE"] is not None:
            value_string = f"{result_dict['VALUE']:.{FLOAT_PRECISION}f}"
        else:
            value_string = "NULL"
        self.message = (
            f"{self.identifier}: Metric {self.name} failed on {date}{self.date_info_string} "
            f"for {self.table}. Value {value_string} is not between {self.lower_threshold} and "
            f"{self.upper_threshold}.{self.extra_info_string}"
        )
    self.status = result_dict["RESULT"]
    self.result = result_dict

    return result_dict

`data_check(duckdb_client)`

Check if database tables used in the actual check contain data.

Note: The returned result dict and failure message will be later aggregated in order to avoid duplicates in the reported failures.

Parameters:

Name	Type	Description	Default
`duckdb_client`	`DuckDBPyConnection`	DuckDB client for interacting with DuckDB	required

Returns:

Type	Description
`dict`	If there is a table without data, a dict containing information about
`dict`	missing data will be returned, otherwise an empty dict indicating that
`dict`	data exists.

Source code in src/koality/checks.py

def data_check(self, duckdb_client: duckdb.DuckDBPyConnection) -> dict:
    """Verify that the tables used by the actual check contain data.

    Runs the query from assemble_data_exists_query() and inspects the first
    column of the first row: a truthy value is taken as the name of a table
    without data. If the query yields no row, ``self.table`` is reported.

    Note: The returned result dict and failure message will be later
    aggregated in order to avoid duplicates in the reported failures.

    Args:
        duckdb_client: DuckDB client for interacting with DuckDB

    Returns:
        An empty dict when data exists; otherwise a dict describing the
        missing data (``DATE``, ``METRIC_NAME`` = ``"data_exists"``, the
        identifier column and the affected ``TABLE``). In the latter case
        ``self.message`` and ``self.status`` (``"FAIL"``) are set as well.

    Raises:
        DatabaseError: If executing the data-exists query raises ``duckdb.Error``.

    """
    try:
        first_row = execute_query(
            self.assemble_data_exists_query(),
            duckdb_client,
            None,
            None,
        ).fetchone()
    except duckdb.Error as e:
        msg = f"Error while executing data check query on {self.table}"
        raise DatabaseError(msg) from e

    # No row at all is treated like "the configured table is empty".
    empty_table = first_row[0] if first_row else self.table
    if not bool(empty_table):
        return {}

    if self.date_filter:
        date = self.date_filter["value"]
    else:
        date = dt.datetime.now(tz=dt.UTC).date().isoformat()

    self.message = f"No data in {empty_table} on {date} for: {self.identifier}"
    self.status = "FAIL"
    return {
        "DATE": date,
        "METRIC_NAME": "data_exists",
        self.identifier_column: self.identifier,
        "TABLE": empty_table,
    }

`get_date_filter(filters)` `staticmethod`

Find the date filter (type='date') from the filters dict.

Parameters:

Name	Type	Description	Default
`filters`	`dict[str, dict[str, Any]]`	The filters dict from get_filters().	required

Returns:

Type	Description
`tuple[str, dict[str, Any]] \| None`	A tuple of (filter_name, filter_config) if found, None otherwise.

Source code in src/koality/checks.py

@staticmethod
def get_date_filter(filters: dict[str, dict[str, Any]]) -> tuple[str, dict[str, Any]] | None:
    """Find the date filter (type='date') from the filters dict.

    Args:
        filters: The filters dict from get_filters().

    Returns:
        A tuple of (filter_name, filter_config) if found, None otherwise.

    """
    for name, config in filters.items():
        if config.get("type") == "date":
            return name, config
    return None

`get_filters(filters_config)` `staticmethod`

Generate a filter dict from filter configurations.

Parameters:

Name	Type	Description	Default
`filters_config`	`dict[str, Any]`	Dictionary containing filter configurations.	required

Example YAML

```yaml
filters:
  partition_date:
    column: BQ_PARTITIONTIME
    value: yesterday-2  # 2 days before yesterday
    type: date  # auto-parses value as date
  shop_id:
    column: shopId
    value: EC0601
    type: identifier
  revenue:
    column: total_revenue
    value: 1000
    operator: ">="
  category:
    column: category
    value: ["toys", "electronics"]
    operator: "IN"
```

Returns:

Type	Description
`dict[str, dict[str, Any]]`	A dict of the format: {"partition_date": {"column": "DATE", "value": "2020-01-01", "operator": "=", "type": "date"}, ...}

Source code in src/koality/checks.py

@staticmethod
def get_filters(filters_config: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Normalize raw filter configurations into a uniform filter dict.

    Each config may be a FilterConfig (dumped to a dict), a plain dict, or a
    bare value (wrapped as ``{"value": ...}``). Values are parsed as dates
    when the filter type is "date" or ``parse_as_date`` is truthy. Filters
    without a column are dropped unless they are identifier filters, which
    may exist purely for naming.

    Args:
        filters_config: Dictionary containing filter configurations.

    Example YAML:
        filters:
          partition_date:
            column: BQ_PARTITIONTIME
            value: yesterday-2  # 2 days before yesterday
            type: date  # auto-parses value as date
          shop_id:
            column: shopId
            value: EC0601
            type: identifier
          revenue:
            column: total_revenue
            value: 1000
            operator: ">="
          category:
            column: category
            value: ["toys", "electronics"]
            operator: "IN"

    Returns:
        A dict of the format:
            {"partition_date": {"column": "DATE", "value": "2020-01-01", "operator": "=", "type": "date"}, ...}
        The "operator" defaults to "=" and the "type" to "other"; the
        "column" key is absent when no column was configured.

    """
    normalized: dict[str, dict[str, Any]] = {}

    for filter_name, config in filters_config.items():
        # Bring every supported config shape into plain-dict form.
        if isinstance(config, FilterConfig):
            spec = config.model_dump()
        elif isinstance(config, dict):
            spec = config
        else:
            spec = {"value": config}

        filter_type = spec.get("type", "other")
        value = spec.get("value")

        # Auto-parse date values when type is "date" or parse_as_date is True
        wants_date_parsing = filter_type == "date" or spec.get("parse_as_date", False)
        if value is not None and wants_date_parsing:
            value = parse_date(str(value))

        column = spec.get("column")
        has_column = column is not None

        # Column-less filters are only kept for identifier-type filters (used for naming).
        if not has_column and filter_type != "identifier":
            continue

        entry: dict[str, Any] = {
            "value": value,
            "operator": spec.get("operator", "="),
            "type": filter_type,
        }
        # Leave out 'column' entirely when unset so downstream code can
        # recognize filters intended only for naming.
        if has_column:
            entry["column"] = column

        normalized[filter_name] = entry

    return normalized

`get_identifier_filter(filters)` `staticmethod`

Find the identifier filter (type='identifier') from the filters dict.

Parameters:

Name	Type	Description	Default
`filters`	`dict[str, dict[str, Any]]`	The filters dict from get_filters().	required

Returns:

Type	Description
`tuple[str, dict[str, Any]] \| None`	A tuple of (filter_name, filter_config) if found, None otherwise.

Source code in src/koality/checks.py

@staticmethod
def get_identifier_filter(filters: dict[str, dict[str, Any]]) -> tuple[str, dict[str, Any]] | None:
    """Find the identifier filter (type='identifier') from the filters dict.

    Args:
        filters: The filters dict from get_filters().

    Returns:
        A tuple of (filter_name, filter_config) if found, None otherwise.

    """
    for name, config in filters.items():
        if config.get("type") == "identifier":
            return (name, config)
    return None

`DuplicateCheck`

Bases: ColumnTransformationCheck

Check the number of duplicates for a specific column.

Counts all rows minus distinct counts. Inherits from ColumnTransformationCheck.

Example:

```python
DuplicateCheck(
    database_accessor="my-gcp-project.SHOP01",
    database_provider=None,
    table="my-gcp-project.SHOP01.skufeed_latest",
    check_column="sku_id",
    filters={
        "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"},
        "date": {"column": "DATE", "value": "2023-01-01", "type": "date"},
    },
    lower_threshold=0.0,
    upper_threshold=0.0,
)
```

Source code in src/koality/checks.py

class DuplicateCheck(ColumnTransformationCheck):
    """Check the number of duplicates for a specific column.

    The metric is computed as ``COUNT(*) - COUNT(DISTINCT <column>)``, i.e.
    all rows minus the number of distinct column values. Inherits from
    ColumnTransformationCheck, to which all constructor arguments are
    forwarded with ``transformation_name="duplicates"``.

    Note: in standard SQL, ``COUNT(DISTINCT ...)`` does not count NULLs while
    ``COUNT(*)`` does, so rows where the column is NULL add to the result.

    Example:
        DuplicateCheck(
            database_accessor="my-gcp-project.SHOP01",
            database_provider=None,
            table="my-gcp-project.SHOP01.skufeed_latest",
            check_column="sku_id",
            filters={
                "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"},
                "date": {"column": "DATE", "value": "2023-01-01", "type": "date"},
            },
            lower_threshold=0.0,
            upper_threshold=0.0,
        )

    """

    def __init__(
        self,
        database_accessor: str,
        database_provider: DatabaseProvider | None,
        table: str,
        check_column: str,
        lower_threshold: float = -math.inf,
        upper_threshold: float = math.inf,
        *,
        filters: dict[str, Any] | None = None,
        identifier_format: str = "identifier",
        identifier_placeholder: str = "ALL",
        date_info: str | None = None,
        extra_info: str | None = None,
        monitor_only: bool = False,
    ) -> None:
        """Initialize the duplicate check.

        All arguments are passed through unchanged to
        ``ColumnTransformationCheck.__init__``; the only value fixed here is
        ``transformation_name="duplicates"``. Unlike the base check
        signature, ``check_column`` is required.
        """
        super().__init__(
            database_accessor=database_accessor,
            database_provider=database_provider,
            transformation_name="duplicates",
            table=table,
            check_column=check_column,
            lower_threshold=lower_threshold,
            upper_threshold=upper_threshold,
            filters=filters,
            identifier_format=identifier_format,
            identifier_placeholder=identifier_placeholder,
            date_info=date_info,
            extra_info=extra_info,
            monitor_only=monitor_only,
        )

    def transformation_statement(self) -> str:
        """Return the SQL statement for counting duplicates.

        The expression subtracts the distinct count of ``self.in_memory_column``
        from the total row count and aliases the result as ``self.name``.
        """
        return f"COUNT(*) - COUNT(DISTINCT {self.in_memory_column}) AS {self.name}"

`init(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

Initialize the duplicate check.

Source code in src/koality/checks.py

def __init__(
    self,
    database_accessor: str,
    database_provider: DatabaseProvider | None,
    table: str,
    check_column: str,
    lower_threshold: float = -math.inf,
    upper_threshold: float = math.inf,
    *,
    filters: dict[str, Any] | None = None,
    identifier_format: str = "identifier",
    identifier_placeholder: str = "ALL",
    date_info: str | None = None,
    extra_info: str | None = None,
    monitor_only: bool = False,
) -> None:
    """Initialize the duplicate check.

    All arguments are passed through unchanged to the parent initializer;
    the only value fixed here is ``transformation_name="duplicates"``.
    ``check_column`` is a required argument in this signature.
    """
    super().__init__(
        database_accessor=database_accessor,
        database_provider=database_provider,
        transformation_name="duplicates",
        table=table,
        check_column=check_column,
        lower_threshold=lower_threshold,
        upper_threshold=upper_threshold,
        filters=filters,
        identifier_format=identifier_format,
        identifier_placeholder=identifier_placeholder,
        date_info=date_info,
        extra_info=extra_info,
        monitor_only=monitor_only,
    )

`transformation_statement()`

Return the SQL statement for counting duplicates.

Source code in src/koality/checks.py

def transformation_statement(self) -> str:
    """Build the SQL aggregate that counts duplicated values of the check column.

    The expression is ``COUNT(*)`` minus ``COUNT(DISTINCT <column>)``,
    aliased to the name of this check.
    """
    column = self.in_memory_column
    alias = self.name
    return f"COUNT(*) - COUNT(DISTINCT {column}) AS {alias}"

`IqrOutlierCheck`

Bases: ColumnTransformationCheck

Check if a column value is an outlier based on the interquartile range (IQR) method.

Inherits from koality.checks.ColumnTransformationCheck, and thus, we refer to argument descriptions in its super class.

The IQR method is based on the 25th and 75th percentiles of the data. The thresholds are calculated as follows:

- lower_threshold = q25 - iqr_factor * (q75 - q25)
- upper_threshold = q75 + iqr_factor * (q75 - q25)

Note: lower_threshold and upper_threshold are calculated internally and should not be provided directly in the configuration for IqrOutlierCheck.

Parameters:

Name	Type	Description	Default
`filters`	`dict[str, Any] \| None`	Filter configuration dict. Must include a filter with type='date'.	`None`
`interval_days`	`int`	Number of historic days to use for IQR calculation.	required
`how`	`Literal['both', 'upper', 'lower']`	Check mode - 'both', 'upper', or 'lower' outliers.	required
`iqr_factor`	`float`	Multiplier for IQR range (minimum 1.5).	required

Example: IqrOutlierCheck( database_accessor="my-gcp-project.SHOP01", database_provider=None, check_column="num_orders", table="my-gcp-project.SHOP01.orders", interval_days=14, how="both", iqr_factor=1.5, filters={ "partition_date": {"column": "DATE", "value": "2023-01-01", "type": "date"}, "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"}, }, )

Source code in src/koality/checks.py

class IqrOutlierCheck(ColumnTransformationCheck):
    """Check if a column value is an outlier based on the interquartile range (IQR) method.

    Inherits from `koality.checks.ColumnTransformationCheck`, and thus, we refer to
    argument descriptions in its super class.

    The IQR method is based on the 25th and 75th percentiles of the data. The
    thresholds are calculated as follows:
        - lower_threshold = q25 - iqr_factor * (q75 - q25)
        - upper_threshold = q75 + iqr_factor * (q75 - q25)

    The percentiles are computed from the rows dated strictly before the value
    of the date filter, going back at most `interval_days` days; the rows dated
    exactly at the date filter value are the values being checked.

    Note: `lower_threshold` and `upper_threshold` are calculated internally and should
    not be provided directly in the configuration for IqrOutlierCheck.

    Args:
        filters: Filter configuration dict. Must include a filter with type='date'
            that defines both a column and a value.
        interval_days: Number of historic days to use for IQR calculation (at least 1).
        how: Check mode - 'both', 'upper', or 'lower' outliers. Only the matching
            threshold(s) are replaced by the computed IQR bounds.
        iqr_factor: Multiplier for IQR range (minimum 1.5).

    Raises:
        KoalityError: If no usable date filter is given, if `interval_days` is
            smaller than 1, if `how` is not one of the allowed modes, or if
            `iqr_factor` is below `MIN_IQR_FACTOR`.

    Example:
        IqrOutlierCheck(
            database_accessor="my-gcp-project.SHOP01",
            database_provider=None,
            check_column="num_orders",
            table="my-gcp-project.SHOP01.orders",
            interval_days=14,
            how="both",
            iqr_factor=1.5,
            filters={
                "partition_date": {"column": "DATE", "value": "2023-01-01", "type": "date"},
                "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"},
            },
        )

    """

    # Smallest accepted `iqr_factor`; smaller values are rejected in `__init__`.
    MIN_IQR_FACTOR = 1.5

    def __init__(
        self,
        database_accessor: str,
        database_provider: DatabaseProvider | None,
        check_column: str,
        table: str,
        interval_days: int,
        how: Literal["both", "upper", "lower"],
        iqr_factor: float,
        *,
        filters: dict[str, Any] | None = None,
        identifier_format: str = "identifier",
        identifier_placeholder: str = "ALL",
        date_info: str | None = None,
        extra_info: str | None = None,
        monitor_only: bool = False,
    ) -> None:
        """Validate the IQR-specific arguments and initialize the check.

        Raises:
            KoalityError: If the date filter is missing or incomplete, or if
                `interval_days`, `how` or `iqr_factor` is out of range.

        """
        # Find the first filter with type "date". Entries may be plain dicts or
        # FilterConfig models; models are converted to dicts for inspection.
        filters = filters or {}
        date_filter = None
        for config in filters.values():
            cfg = config.model_dump() if isinstance(config, FilterConfig) else config
            if cfg.get("type") == "date":
                date_filter = cfg
                break

        # The date filter must carry both a column and a value. The local
        # `date_filter` is only used for this validation.
        if not date_filter or not date_filter.get("column") or date_filter.get("value") is None:
            msg = "IqrOutlierCheck requires a filter with type='date'"
            raise KoalityError(msg)

        if interval_days < 1:
            msg = "interval_days must be at least 1"
            raise KoalityError(msg)
        self.interval_days = int(interval_days)
        if how not in ["both", "upper", "lower"]:
            msg = "how must be one of 'both', 'upper', 'lower'"
            raise KoalityError(msg)
        self.how = how
        # reasonable lower bound for iqr_factor
        if iqr_factor < self.MIN_IQR_FACTOR:
            msg = f"iqr_factor must be at least {self.MIN_IQR_FACTOR}"
            raise KoalityError(msg)
        self.iqr_factor = float(iqr_factor)

        # The thresholds start out unbounded; `_check` replaces them with the
        # IQR bounds returned by the query. The transformation name encodes the
        # mode and the factor (with "." replaced by "_").
        super().__init__(
            database_accessor=database_accessor,
            database_provider=database_provider,
            transformation_name=f"outlier_iqr_{self.how}_{str(self.iqr_factor).replace('.', '_')}",
            table=table,
            check_column=check_column,
            lower_threshold=-math.inf,
            upper_threshold=math.inf,
            filters=filters,
            identifier_format=identifier_format,
            identifier_placeholder=identifier_placeholder,
            date_info=date_info,
            extra_info=extra_info,
            monitor_only=monitor_only,
        )

    def transformation_statement(self) -> str:
        """Return the SQL statement for IQR-based outlier detection.

        The statement is a chain of CTEs:
            - base: rows between (date - interval_days) and date, restricted by
              all non-date filters
            - compare: rows of `base` dated before the check date
            - slice: rows of `base` dated at the check date
            - percentiles: 25th and 75th percentile of the check column in `compare`
            - stats: rows of `slice` with the check column renamed to the check
              name plus the computed `lower_threshold` and `upper_threshold`

        The final SELECT on `stats` is appended by `query_boilerplate`.
        """
        # TODO: currently we only raise an error if there is no data for the date
        #       we could also raise an error if there is not enough data for the
        #       IQR calculation
        where_statement = ""
        filter_columns = ""
        # NOTE(review): `self.date_filter` is not assigned in this class;
        # presumably it is provided by the base class - confirm.
        date_col = self.date_filter["column"]
        date_val = self.date_filter["value"]

        # The date filter is handled explicitly through the BETWEEN clause below,
        # so only the remaining filters go through the generic WHERE assembly.
        filters = {k: v for k, v in self.filters.items() if v["type"] != "date"}

        if filters:
            # Select the filter columns as well, prefixed with a comma so they
            # can be appended after the check column.
            filter_columns = ",\n".join([v["column"] for v in filters.values()])
            filter_columns = ",\n" + filter_columns
            where_statement = self.assemble_where_statement(filters, database_accessor=self.database_accessor)
            # The template already contains WHERE, so a leading "WHERE\n" is
            # stripped and the conditions are chained with AND instead.
            where_statement = "\nAND\n" + where_statement.removeprefix("WHERE\n")
        return f"""
        WITH
            base AS (
                SELECT
                    DATE({date_col}) AS {date_col},
                    {self.in_memory_column}
                    {filter_columns}
                FROM
                    "{self.table}"
                WHERE
                    DATE({date_col}) BETWEEN (DATE '{date_val}' - INTERVAL {self.interval_days} DAY)
                    AND DATE '{date_val}'
                    {where_statement}
            ),
            compare AS (
                SELECT * FROM base WHERE {date_col} < DATE '{date_val}'
            ),
            slice AS (
                SELECT * FROM base WHERE {date_col} = DATE '{date_val}'
            ),
            percentiles AS (
                SELECT
                  QUANTILE_CONT(CAST({self.in_memory_column} AS FLOAT), 0.25) AS q25,
                  QUANTILE_CONT(CAST({self.in_memory_column} AS FLOAT), 0.75) AS q75
                FROM
                  compare
            ),
            stats AS (
                SELECT
                  * exclude ({self.in_memory_column}),
                  {self.in_memory_column} AS {self.name},
                  (percentiles.q25 - {self.iqr_factor} * (percentiles.q75 - percentiles.q25)) AS lower_threshold,
                  (percentiles.q75 + {self.iqr_factor} * (percentiles.q75 - percentiles.q25)) AS upper_threshold,
                FROM
                  slice
                LEFT JOIN percentiles
                ON TRUE
            )
        """

    def query_boilerplate(self, metric_statement: str) -> str:
        """Return the query structure for IQR outlier detection.

        Args:
            metric_statement: SQL text placed in front of the final SELECT. The
                SELECT reads from `stats`, so the statement has to define it
                (as `transformation_statement` does).

        Returns:
            `metric_statement` followed by `SELECT * FROM stats`.

        """
        return f"""
            {metric_statement}

            SELECT
                *
            FROM
                stats
        """

    def _check(self, duckdb_client: duckdb.DuckDBPyConnection, query: str) -> tuple[list[dict], str | None]:
        """Execute check and update thresholds from IQR calculation.

        Delegates to the parent `_check` and, if it returned any rows, copies
        the thresholds of the first row onto this instance according to `how`.

        Returns:
            The result rows and error value exactly as returned by the parent.

        """
        result, error = super()._check(duckdb_client, query)
        # overwrite the lower and upper thresholds as required
        if result:
            # A threshold not selected by `how` keeps its infinite default.
            if self.how in ["both", "lower"]:
                self.lower_threshold = result[0]["lower_threshold"]
            if self.how in ["both", "upper"]:
                self.upper_threshold = result[0]["upper_threshold"]
        return result, error

    def assemble_data_exists_query(self) -> str:
        """Assemble the query to check if data exists for IQR outlier detection.

        Returns:
            A query with a single `empty_table` column: an empty string if the
            check column has at least one non-NULL value for the check date
            (and the non-date filters), otherwise the table name.

        """
        data_exists_query = f"""
        SELECT
            IF(COUNTIF({self.in_memory_column} IS NOT NULL) > 0, '', '{self.table}') AS empty_table
        FROM
            "{self.table}"
        """
        date_col = self.date_filter["column"]
        date_val = self.date_filter["value"]

        # The date condition is appended explicitly below, so it is excluded
        # from the generic WHERE assembly.
        filters = {k: v for k, v in self.filters.items() if v["type"] != "date"}

        where_statement = self.assemble_where_statement(filters, database_accessor=self.database_accessor)
        if where_statement:
            where_statement = f"{where_statement} AND CAST({date_col} AS DATE) = DATE '{date_val}'"
        else:
            # No other filters: the date condition is the whole WHERE clause.
            where_statement = f"WHERE CAST({date_col} AS DATE) = DATE '{date_val}'"
        return f"{data_exists_query}\n{where_statement}"

`init(database_accessor, database_provider, check_column, table, interval_days, how, iqr_factor, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

Initialize the IQR outlier check.

Source code in src/koality/checks.py

def __init__(
    self,
    database_accessor: str,
    database_provider: DatabaseProvider | None,
    check_column: str,
    table: str,
    interval_days: int,
    how: Literal["both", "upper", "lower"],
    iqr_factor: float,
    *,
    filters: dict[str, Any] | None = None,
    identifier_format: str = "identifier",
    identifier_placeholder: str = "ALL",
    date_info: str | None = None,
    extra_info: str | None = None,
    monitor_only: bool = False,
) -> None:
    """Validate the IQR-specific arguments and set up the check.

    Raises:
        KoalityError: If no filter of type 'date' with a column and a value is
            given, if `interval_days` is below 1, if `how` is not an allowed
            mode, or if `iqr_factor` is below `MIN_IQR_FACTOR`.

    """
    filters = filters or {}

    # Entries may be FilterConfig models or plain dicts; look at them as dicts
    # and stop at the first one declared as the date filter.
    as_dicts = (entry.model_dump() if isinstance(entry, FilterConfig) else entry for entry in filters.values())
    date_filter = next((entry for entry in as_dicts if entry.get("type") == "date"), None)

    date_filter_usable = (
        bool(date_filter) and bool(date_filter.get("column")) and date_filter.get("value") is not None
    )
    if not date_filter_usable:
        msg = "IqrOutlierCheck requires a filter with type='date'"
        raise KoalityError(msg)

    if interval_days < 1:
        msg = "interval_days must be at least 1"
        raise KoalityError(msg)
    self.interval_days = int(interval_days)

    if how not in ("both", "upper", "lower"):
        msg = "how must be one of 'both', 'upper', 'lower'"
        raise KoalityError(msg)
    self.how = how

    # Factors below the class minimum are rejected.
    if iqr_factor < self.MIN_IQR_FACTOR:
        msg = f"iqr_factor must be at least {self.MIN_IQR_FACTOR}"
        raise KoalityError(msg)
    self.iqr_factor = float(iqr_factor)

    # The name encodes mode and factor, e.g. a factor of 1.5 becomes "1_5".
    factor_tag = str(self.iqr_factor).replace(".", "_")

    # Thresholds are passed as unbounded to the parent initializer.
    super().__init__(
        database_accessor=database_accessor,
        database_provider=database_provider,
        transformation_name=f"outlier_iqr_{self.how}_{factor_tag}",
        table=table,
        check_column=check_column,
        lower_threshold=-math.inf,
        upper_threshold=math.inf,
        filters=filters,
        identifier_format=identifier_format,
        identifier_placeholder=identifier_placeholder,
        date_info=date_info,
        extra_info=extra_info,
        monitor_only=monitor_only,
    )

`assemble_data_exists_query()`

Assemble the query to check if data exists for IQR outlier detection.

Source code in src/koality/checks.py

def assemble_data_exists_query(self) -> str:
    """Build the query that tells whether the check date has any usable data.

    Returns:
        A query with a single `empty_table` column: an empty string if the
        check column has at least one non-NULL value for the check date (and
        the non-date filters), otherwise the table name.

    """
    data_exists_query = f"""
    SELECT
        IF(COUNTIF({self.in_memory_column} IS NOT NULL) > 0, '', '{self.table}') AS empty_table
    FROM
        "{self.table}"
    """
    date_col = self.date_filter["column"]
    date_val = self.date_filter["value"]
    date_condition = f"CAST({date_col} AS DATE) = DATE '{date_val}'"

    # The date condition is appended by hand, so only the remaining filters
    # go through the generic WHERE assembly.
    other_filters = {key: spec for key, spec in self.filters.items() if spec["type"] != "date"}
    where_statement = self.assemble_where_statement(other_filters, database_accessor=self.database_accessor)

    # Either extend the existing WHERE clause or start a new one.
    prefix = f"{where_statement} AND " if where_statement else "WHERE "
    return f"{data_exists_query}\n{prefix}{date_condition}"

`query_boilerplate(metric_statement)`

Return the query structure for IQR outlier detection.

Source code in src/koality/checks.py

def query_boilerplate(self, metric_statement: str) -> str:
    """Wrap the metric statement into the final IQR outlier query.

    Args:
        metric_statement: SQL text placed in front of the final SELECT. The
            SELECT reads from `stats`, so the statement has to define it.

    Returns:
        `metric_statement` followed by `SELECT * FROM stats`.

    """
    full_query = f"""
        {metric_statement}

        SELECT
            *
        FROM
            stats
    """
    return full_query

`transformation_statement()`

Return the SQL statement for IQR-based outlier detection.

Source code in src/koality/checks.py

def transformation_statement(self) -> str:
    """Build the CTE chain used for IQR-based outlier detection.

    The statement defines these CTEs:
        - base: rows between (date - interval_days) and date, restricted by
          all non-date filters
        - compare: rows of `base` dated before the check date
        - slice: rows of `base` dated at the check date
        - percentiles: 25th and 75th percentile of the check column in `compare`
        - stats: rows of `slice` with the check column renamed to the check
          name plus the computed `lower_threshold` and `upper_threshold`
    """
    # TODO: currently we only raise an error if there is no data for the date
    #       we could also raise an error if there is not enough data for the
    #       IQR calculation
    date_col = self.date_filter["column"]
    date_val = self.date_filter["value"]

    # The date filter is expressed through the BETWEEN clause of the template,
    # so only the remaining filters go through the generic WHERE assembly.
    other_filters = {key: spec for key, spec in self.filters.items() if spec["type"] != "date"}

    if other_filters:
        # Extra columns are appended after the check column, hence the leading comma.
        filter_columns = ",\n" + ",\n".join(spec["column"] for spec in other_filters.values())
        assembled = self.assemble_where_statement(other_filters, database_accessor=self.database_accessor)
        # The template already opens the WHERE clause; chain with AND instead.
        where_statement = "\nAND\n" + assembled.removeprefix("WHERE\n")
    else:
        filter_columns = ""
        where_statement = ""

    column = self.in_memory_column
    factor = self.iqr_factor
    return f"""
    WITH
        base AS (
            SELECT
                DATE({date_col}) AS {date_col},
                {column}
                {filter_columns}
            FROM
                "{self.table}"
            WHERE
                DATE({date_col}) BETWEEN (DATE '{date_val}' - INTERVAL {self.interval_days} DAY)
                AND DATE '{date_val}'
                {where_statement}
        ),
        compare AS (
            SELECT * FROM base WHERE {date_col} < DATE '{date_val}'
        ),
        slice AS (
            SELECT * FROM base WHERE {date_col} = DATE '{date_val}'
        ),
        percentiles AS (
            SELECT
              QUANTILE_CONT(CAST({column} AS FLOAT), 0.25) AS q25,
              QUANTILE_CONT(CAST({column} AS FLOAT), 0.75) AS q75
            FROM
              compare
        ),
        stats AS (
            SELECT
              * exclude ({column}),
              {column} AS {self.name},
              (percentiles.q25 - {factor} * (percentiles.q75 - percentiles.q25)) AS lower_threshold,
              (percentiles.q75 + {factor} * (percentiles.q75 - percentiles.q25)) AS upper_threshold,
            FROM
              slice
            LEFT JOIN percentiles
            ON TRUE
        )
    """

`MatchRateCheck`

Bases: DataQualityCheck

Checks the match rate between two tables after joining on specific columns.

If join_columns_left (or join_columns_right) is defined, these columns will be used for joining the respective table. Otherwise, join_columns will be used as fallback for that table.

Parameters:

Name	Type	Description	Default
`left_table`	`str`	Name of table for left part of join (e.g., "my-gcp-project.SHOP01.identifier_base")	required
`right_table`	`str`	Name of table for right part of join (e.g., "my-gcp-project.SHOP01.feature_baseline")	required
`check_column`	`str`	Name of column to be checked (e.g., "product_number")	required
`join_columns`	`list[str] \| None`	List of columns to join data on (e.g., ["PREDICTION_DATE", "product_number"])	`None`
`join_columns_left`	`list[str] \| None`	List of columns of left table to join data on (e.g., ["BQ_PARTITIONTIME", "productId"])	`None`
`join_columns_right`	`list[str] \| None`	List of columns of right table to join data on (e.g., ["PREDICTION_DATE", "product_number"])	`None`
`lower_threshold`	`float`	Check will fail if check result < lower_threshold	`-inf`
`upper_threshold`	`float`	Check will fail if check result > upper_threshold	`inf`
`monitor_only`	`bool`	If True, no checks will be performed	`False`
`extra_info`	`str \| None`	Optional additional text that will be added to the end of the failure message	`None`

Example: MatchRateCheck( database_accessor="my-gcp-project.SHOP01", database_provider=None, left_table="my-gcp-project.SHOP01.pdp_views", right_table="my-gcp-project.SHOP01.skufeed_latest", join_columns_left=["DATE", "product_number_v2"], join_columns_right=["DATE", "product_number"], check_column="product_number", filters={ "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"}, "date": {"column": "DATE", "value": "2023-01-01", "type": "date"}, }, filters_left={ "status": {"column": "order_status", "value": "completed"}, }, filters_right={ "active": {"column": "is_active", "value": True}, }, )

Source code in src/koality/checks.py

class MatchRateCheck(DataQualityCheck):
    """Checks the match rate between two tables after joining on specific columns.

    The match rate is the share of rows of the left table that find a partner
    in the right table, rounded to three decimals (0.0 if the left side is empty).

    If join_columns_left (or join_columns_right) is defined, these columns will be
    used for joining the respective table. If not, join_columns will be used as
    fallback for that table.

    Args:
        left_table: Name of table for left part of join
                    (e.g., "my-gcp-project.SHOP01.identifier_base")
        right_table: Name of table for right part of join
                     (e.g., "my-gcp-project.SHOP01.feature_baseline")
        check_column: Name of column to be checked (e.g., "product_number")
        join_columns: List of columns to join data on (e.g., ["PREDICTION_DATE", "product_number"])
        join_columns_left: List of columns of left table to join data on
                           (e.g., ["BQ_PARTITIONTIME", "productId"])
        join_columns_right: List of columns of right table to join data on
                            (e.g., ["PREDICTION_DATE", "product_number"])
        lower_threshold: Check will fail if check result < lower_threshold
        upper_threshold: Check will fail if check result > upper_threshold
        monitor_only: If True, no checks will be performed
        extra_info: Optional additional text that will be added to the end of the failure message
        filters: Filter configuration applied to both tables
        filters_left: Additional filter configuration applied to the left table only
        filters_right: Additional filter configuration applied to the right table only

    Raises:
        KoalityError: If no join columns are provided or if the left and right
            join column lists differ in length.

    Example:
        MatchRateCheck(
            database_accessor="my-gcp-project.SHOP01",
            database_provider=None,
            left_table="my-gcp-project.SHOP01.pdp_views",
            right_table="my-gcp-project.SHOP01.skufeed_latest",
            join_columns_left=["DATE", "product_number_v2"],
            join_columns_right=["DATE", "product_number"],
            check_column="product_number",
            filters={
                "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"},
                "date": {"column": "DATE", "value": "2023-01-01", "type": "date"},
            },
            filters_left={
                "status": {"column": "order_status", "value": "completed"},
            },
            filters_right={
                "active": {"column": "is_active", "value": True},
            },
        )

    """

    def __init__(
        self,
        database_accessor: str,
        database_provider: DatabaseProvider | None,
        left_table: str,
        right_table: str,
        check_column: str,
        join_columns: list[str] | None = None,
        join_columns_left: list[str] | None = None,
        join_columns_right: list[str] | None = None,
        lower_threshold: float = -math.inf,
        upper_threshold: float = math.inf,
        *,
        monitor_only: bool = False,
        extra_info: str | None = None,
        filters: dict[str, Any] | None = None,
        filters_left: dict[str, Any] | None = None,
        filters_right: dict[str, Any] | None = None,
        identifier_format: str = "identifier",
        identifier_placeholder: str = "ALL",
        date_info: str | None = None,
    ) -> None:
        """Validate the join configuration and initialize the match rate check.

        Raises:
            KoalityError: If neither `join_columns` nor both side-specific
                lists are given, or if the resolved lists differ in length.

        """
        self.left_table = left_table
        self.right_table = right_table

        # Either the shared list or BOTH side-specific lists must be present.
        if not (join_columns or (join_columns_left and join_columns_right)):
            msg = "No join_columns was provided. Use either join_columns or join_columns_left and join_columns_right"
            raise KoalityError(msg)

        # mypy typing does not understand that None is not possible, thus, we
        # add `or []`
        self.join_columns_left: list[str] = join_columns_left if join_columns_left else join_columns or []
        self.join_columns_right: list[str] = join_columns_right if join_columns_right else join_columns or []

        # NOTE(review): given the guard above, both lists are non-empty here,
        # so this second check looks unreachable - confirm before removing.
        if not self.join_columns_right or not self.join_columns_left:
            msg = "No join_columns was provided. Use join_columns, join_columns_left, and/or join_columns_right"
            raise KoalityError(msg)

        # Columns are paired by position, so both lists need the same length.
        if len(self.join_columns_left) != len(self.join_columns_right):
            msg = (
                f"join_columns_left and join_columns_right need to have equal length"
                f" ({len(self.join_columns_left)} vs. {len(self.join_columns_right)})."
            )
            raise KoalityError(msg)

        # The parent is given a synthetic table name built from both table names.
        super().__init__(
            database_accessor=database_accessor,
            database_provider=database_provider,
            table=f"{self.left_table}_JOIN_{self.right_table}",
            check_column=check_column,
            lower_threshold=lower_threshold,
            upper_threshold=upper_threshold,
            filters=filters,
            identifier_format=identifier_format,
            identifier_placeholder=identifier_placeholder,
            date_info=date_info,
            extra_info=extra_info,
            monitor_only=monitor_only,
        )

        # Support table-specific filters via filters_left and filters_right
        # The shared filters are combined with the table-specific ones; with
        # `|`, the right-hand (table-specific) operand takes precedence.
        self.filters_left = self.filters | self.get_filters(filters_left or {})
        self.filters_right = self.filters | self.get_filters(filters_right or {})

    def assemble_name(self) -> str:
        """Return the check name for match rate.

        The name is the last dot-separated part of the check column followed
        by `_matchrate`.
        """
        return f"{self.check_column.split('.')[-1]}_matchrate"

    def assemble_query(self) -> str:
        """Assemble the SQL query for calculating match rate between tables.

        Returns:
            A query that left-joins the filtered left table to the distinct
            join columns of the filtered right table and yields the share of
            left rows with a match, rounded to three decimals (0.0 if there
            are no rows).

        """
        # Transform dotted column names based on data source:
        # - With database_accessor: convert to underscores (value.shopId → value_shopId) for flattened data
        # - Without database_accessor: keep dots for SELECT (value.shopId), use last part for JOIN (shopId)
        if self.database_accessor:
            right_column_statement = ",\n    ".join([col.replace(".", "_") for col in self.join_columns_right])
            join_on_statement = "\n    AND\n    ".join(
                [
                    f"lefty.{left_col.replace('.', '_')} = righty.{right_col.replace('.', '_')}"
                    for left_col, right_col in zip(self.join_columns_left, self.join_columns_right, strict=False)
                ],
            )
        else:
            # For native DuckDB struct columns: SELECT uses dotted notation,
            # but DuckDB names the result column as just the last part
            right_column_statement = ",\n    ".join(self.join_columns_right)
            join_on_statement = "\n    AND\n    ".join(
                [
                    f"lefty.{left_col.split('.')[-1]} = righty.{right_col.split('.')[-1]}"
                    for left_col, right_col in zip(self.join_columns_left, self.join_columns_right, strict=False)
                ],
            )

        # `righty` is de-duplicated on the join columns and tagged with
        # `in_right_table`; after the LEFT JOIN, unmatched left rows carry NULL
        # in that flag and are therefore not counted as matches.
        return f"""
        WITH
            righty AS (
                SELECT DISTINCT
                    {right_column_statement},
                    TRUE AS in_right_table
                FROM
                    "{self.right_table}"
                {self.assemble_where_statement(self.filters_right, database_accessor=self.database_accessor)}
            ),
            lefty AS (
                SELECT
                    *
                FROM
                    "{self.left_table}"
                {self.assemble_where_statement(self.filters_left, database_accessor=self.database_accessor)}
            )

            SELECT
                CASE
                    WHEN COUNT(*) = 0 THEN 0.0
                    ELSE ROUND(COUNTIF(in_right_table IS TRUE) / COUNT(*), 3)
                END AS {self.name}
            FROM
                lefty
            LEFT JOIN
                righty
            ON
                {join_on_statement}
        """

    def assemble_data_exists_query(self) -> str:
        """First checks left, then right table for data.

        Both tables are counted after applying their respective filters.

        Returns:
            Empty table name or empty string

        """
        return f"""
        WITH
        righty AS (
            SELECT
                COUNT(*) AS right_counter,
            FROM
                "{self.right_table}"
            {self.assemble_where_statement(self.filters_right, database_accessor=self.database_accessor)}
        ),

        lefty AS (
            SELECT
                COUNT(*) AS left_counter,
            FROM
                "{self.left_table}"
            {self.assemble_where_statement(self.filters_left, database_accessor=self.database_accessor)}
        )

        SELECT
            IF(
                (SELECT * FROM lefty) > 0,
                IF((SELECT * FROM righty) > 0, '', '{self.right_table}'),
                '{self.left_table}'
            ) AS empty_table
        """

`init(database_accessor, database_provider, left_table, right_table, check_column, join_columns=None, join_columns_left=None, join_columns_right=None, lower_threshold=-math.inf, upper_threshold=math.inf, *, monitor_only=False, extra_info=None, filters=None, filters_left=None, filters_right=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None)`

Initialize the match rate check.

Source code in src/koality/checks.py

def __init__(
    self,
    database_accessor: str,
    database_provider: DatabaseProvider | None,
    left_table: str,
    right_table: str,
    check_column: str,
    join_columns: list[str] | None = None,
    join_columns_left: list[str] | None = None,
    join_columns_right: list[str] | None = None,
    lower_threshold: float = -math.inf,
    upper_threshold: float = math.inf,
    *,
    monitor_only: bool = False,
    extra_info: str | None = None,
    filters: dict[str, Any] | None = None,
    filters_left: dict[str, Any] | None = None,
    filters_right: dict[str, Any] | None = None,
    identifier_format: str = "identifier",
    identifier_placeholder: str = "ALL",
    date_info: str | None = None,
) -> None:
    """Validate the join configuration and set up the match rate check.

    Raises:
        KoalityError: If neither `join_columns` nor both side-specific lists
            are given, or if the resolved lists differ in length.

    """
    self.left_table = left_table
    self.right_table = right_table

    # Either the shared list or BOTH side-specific lists must be present.
    if not join_columns and not (join_columns_left and join_columns_right):
        msg = "No join_columns was provided. Use either join_columns or join_columns_left and join_columns_right"
        raise KoalityError(msg)

    # Each side prefers its own list and falls back to the shared one; the
    # trailing `or []` only exists to keep the attribute type `list[str]`.
    self.join_columns_left: list[str] = join_columns_left or join_columns or []
    self.join_columns_right: list[str] = join_columns_right or join_columns or []

    if not (self.join_columns_right and self.join_columns_left):
        msg = "No join_columns was provided. Use join_columns, join_columns_left, and/or join_columns_right"
        raise KoalityError(msg)

    # Columns are paired by position, so both lists need the same length.
    left_count = len(self.join_columns_left)
    right_count = len(self.join_columns_right)
    if left_count != right_count:
        msg = f"join_columns_left and join_columns_right need to have equal length ({left_count} vs. {right_count})."
        raise KoalityError(msg)

    # The parent is given a synthetic table name built from both table names.
    joined_table_name = f"{self.left_table}_JOIN_{self.right_table}"
    super().__init__(
        database_accessor=database_accessor,
        database_provider=database_provider,
        table=joined_table_name,
        check_column=check_column,
        lower_threshold=lower_threshold,
        upper_threshold=upper_threshold,
        filters=filters,
        identifier_format=identifier_format,
        identifier_placeholder=identifier_placeholder,
        date_info=date_info,
        extra_info=extra_info,
        monitor_only=monitor_only,
    )

    # Table-specific filters are combined with the shared ones; with `|`,
    # the right-hand (table-specific) operand takes precedence.
    left_only = self.get_filters(filters_left or {})
    self.filters_left = self.filters | left_only
    right_only = self.get_filters(filters_right or {})
    self.filters_right = self.filters | right_only

`assemble_data_exists_query()`

First checks left, then right table for data.

Returns:

Type	Description
`str`	Empty table name or empty string

Source code in src/koality/checks.py

def assemble_data_exists_query(self) -> str:
    """Build the query that reports which of the two tables is empty, if any.

    The left table is checked first, then the right table; both are counted
    after applying their respective filters.

    Returns:
        Empty table name or empty string

    """
    right_where = self.assemble_where_statement(self.filters_right, database_accessor=self.database_accessor)
    left_where = self.assemble_where_statement(self.filters_left, database_accessor=self.database_accessor)
    right_name = self.right_table
    left_name = self.left_table

    return f"""
    WITH
    righty AS (
        SELECT
            COUNT(*) AS right_counter,
        FROM
            "{right_name}"
        {right_where}
    ),

    lefty AS (
        SELECT
            COUNT(*) AS left_counter,
        FROM
            "{left_name}"
        {left_where}
    )

    SELECT
        IF(
            (SELECT * FROM lefty) > 0,
            IF((SELECT * FROM righty) > 0, '', '{right_name}'),
            '{left_name}'
        ) AS empty_table
    """

`assemble_name()`

Return the check name for match rate.

Source code in src/koality/checks.py

def assemble_name(self) -> str:
    """Derive the check name from the check column.

    Only the last dot-separated part of the column is used, followed by
    `_matchrate`.
    """
    column_leaf = self.check_column.rsplit(".", 1)[-1]
    return f"{column_leaf}_matchrate"

`assemble_query()`

Assemble the SQL query for calculating match rate between tables.

Source code in src/koality/checks.py

def assemble_query(self) -> str:
    """Assemble the SQL query for calculating match rate between tables."""
    # Transform dotted column names based on data source:
    # - With database_accessor: convert to underscores (value.shopId → value_shopId) for flattened data
    # - Without database_accessor: keep dots for SELECT (value.shopId), use last part for JOIN (shopId)
    if self.database_accessor:
        right_column_statement = ",\n    ".join([col.replace(".", "_") for col in self.join_columns_right])
        join_on_statement = "\n    AND\n    ".join(
            [
                f"lefty.{left_col.replace('.', '_')} = righty.{right_col.replace('.', '_')}"
                for left_col, right_col in zip(self.join_columns_left, self.join_columns_right, strict=False)
            ],
        )
    else:
        # For native DuckDB struct columns: SELECT uses dotted notation,
        # but DuckDB names the result column as just the last part
        right_column_statement = ",\n    ".join(self.join_columns_right)
        join_on_statement = "\n    AND\n    ".join(
            [
                f"lefty.{left_col.split('.')[-1]} = righty.{right_col.split('.')[-1]}"
                for left_col, right_col in zip(self.join_columns_left, self.join_columns_right, strict=False)
            ],
        )

    return f"""
    WITH
        righty AS (
            SELECT DISTINCT
                {right_column_statement},
                TRUE AS in_right_table
            FROM
                "{self.right_table}"
            {self.assemble_where_statement(self.filters_right, database_accessor=self.database_accessor)}
        ),
        lefty AS (
            SELECT
                *
            FROM
                "{self.left_table}"
            {self.assemble_where_statement(self.filters_left, database_accessor=self.database_accessor)}
        )

        SELECT
            CASE
                WHEN COUNT(*) = 0 THEN 0.0
                ELSE ROUND(COUNTIF(in_right_table IS TRUE) / COUNT(*), 3)
            END AS {self.name}
        FROM
            lefty
        LEFT JOIN
            righty
        ON
            {join_on_statement}
    """

`MaxCheck`

Bases: ColumnTransformationCheck

Compute the maximum (MAX) of a column for the filtered rows.

Inherits from ColumnTransformationCheck. Thresholds apply to the computed maximum.

Source code in src/koality/checks.py

class MaxCheck(ColumnTransformationCheck):
    """Data quality check on the largest value of a column.

    Built on ColumnTransformationCheck: the ``MAX`` aggregate of the check
    column is computed over the filtered rows, and the configured thresholds
    are applied to that maximum.
    """

    def __init__(
        self,
        database_accessor: str,
        database_provider: DatabaseProvider | None,
        table: str,
        check_column: str,
        lower_threshold: float = -math.inf,
        upper_threshold: float = math.inf,
        *,
        filters: dict[str, Any] | None = None,
        identifier_format: str = "identifier",
        identifier_placeholder: str = "ALL",
        date_info: str | None = None,
        extra_info: str | None = None,
        monitor_only: bool = False,
    ) -> None:
        """Set up the max check; all arguments are forwarded to the parent initializer."""
        super().__init__(
            # Fixed by this check type.
            transformation_name="max",
            # Caller-supplied arguments, forwarded as-is.
            database_accessor=database_accessor,
            database_provider=database_provider,
            table=table,
            check_column=check_column,
            filters=filters,
            lower_threshold=lower_threshold,
            upper_threshold=upper_threshold,
            identifier_format=identifier_format,
            identifier_placeholder=identifier_placeholder,
            date_info=date_info,
            extra_info=extra_info,
            monitor_only=monitor_only,
        )

    def transformation_statement(self) -> str:
        """Build the ``MAX(...)`` SELECT expression, aliased to the check name."""
        column = self.in_memory_column
        alias = self.name
        return f"MAX({column}) AS {alias}"

`init(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

Initialize the max check.

Source code in src/koality/checks.py

def __init__(
    self,
    database_accessor: str,
    database_provider: DatabaseProvider | None,
    table: str,
    check_column: str,
    lower_threshold: float = -math.inf,
    upper_threshold: float = math.inf,
    *,
    filters: dict[str, Any] | None = None,
    identifier_format: str = "identifier",
    identifier_placeholder: str = "ALL",
    date_info: str | None = None,
    extra_info: str | None = None,
    monitor_only: bool = False,
) -> None:
    """Set up the max check.

    All arguments are forwarded unchanged to the parent initializer; the only
    value fixed here is the transformation name ``"max"``.
    """
    super().__init__(
        # Fixed by this check type.
        transformation_name="max",
        # Caller-supplied arguments, forwarded as-is.
        database_accessor=database_accessor,
        database_provider=database_provider,
        table=table,
        check_column=check_column,
        filters=filters,
        lower_threshold=lower_threshold,
        upper_threshold=upper_threshold,
        identifier_format=identifier_format,
        identifier_placeholder=identifier_placeholder,
        date_info=date_info,
        extra_info=extra_info,
        monitor_only=monitor_only,
    )

`transformation_statement()`

Return the SQL statement for computing the maximum.

Source code in src/koality/checks.py

def transformation_statement(self) -> str:
    """Build the ``MAX(...)`` SELECT expression, aliased to the check name."""
    column = self.in_memory_column
    alias = self.name
    return f"MAX({column}) AS {alias}"

`MinCheck`

Bases: ColumnTransformationCheck

Compute the minimum (MIN) of a column for the filtered rows.

Inherits from ColumnTransformationCheck. Thresholds apply to the computed minimum.

Source code in src/koality/checks.py

class MinCheck(ColumnTransformationCheck):
    """Data quality check on the smallest value of a column.

    Built on ColumnTransformationCheck: the ``MIN`` aggregate of the check
    column is computed over the filtered rows, and the configured thresholds
    are applied to that minimum.
    """

    def __init__(
        self,
        database_accessor: str,
        database_provider: DatabaseProvider | None,
        table: str,
        check_column: str,
        lower_threshold: float = -math.inf,
        upper_threshold: float = math.inf,
        *,
        filters: dict[str, Any] | None = None,
        identifier_format: str = "identifier",
        identifier_placeholder: str = "ALL",
        date_info: str | None = None,
        extra_info: str | None = None,
        monitor_only: bool = False,
    ) -> None:
        """Set up the min check; all arguments are forwarded to the parent initializer."""
        super().__init__(
            # Fixed by this check type.
            transformation_name="min",
            # Caller-supplied arguments, forwarded as-is.
            database_accessor=database_accessor,
            database_provider=database_provider,
            table=table,
            check_column=check_column,
            filters=filters,
            lower_threshold=lower_threshold,
            upper_threshold=upper_threshold,
            identifier_format=identifier_format,
            identifier_placeholder=identifier_placeholder,
            date_info=date_info,
            extra_info=extra_info,
            monitor_only=monitor_only,
        )

    def transformation_statement(self) -> str:
        """Build the ``MIN(...)`` SELECT expression, aliased to the check name."""
        column = self.in_memory_column
        alias = self.name
        return f"MIN({column}) AS {alias}"

`init(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

Initialize the min check.

Source code in src/koality/checks.py

def __init__(
    self,
    database_accessor: str,
    database_provider: DatabaseProvider | None,
    table: str,
    check_column: str,
    lower_threshold: float = -math.inf,
    upper_threshold: float = math.inf,
    *,
    filters: dict[str, Any] | None = None,
    identifier_format: str = "identifier",
    identifier_placeholder: str = "ALL",
    date_info: str | None = None,
    extra_info: str | None = None,
    monitor_only: bool = False,
) -> None:
    """Set up the min check.

    All arguments are forwarded unchanged to the parent initializer; the only
    value fixed here is the transformation name ``"min"``.
    """
    super().__init__(
        # Fixed by this check type.
        transformation_name="min",
        # Caller-supplied arguments, forwarded as-is.
        database_accessor=database_accessor,
        database_provider=database_provider,
        table=table,
        check_column=check_column,
        filters=filters,
        lower_threshold=lower_threshold,
        upper_threshold=upper_threshold,
        identifier_format=identifier_format,
        identifier_placeholder=identifier_placeholder,
        date_info=date_info,
        extra_info=extra_info,
        monitor_only=monitor_only,
    )

`transformation_statement()`

Return the SQL statement for computing the minimum.

Source code in src/koality/checks.py

def transformation_statement(self) -> str:
    """Build the ``MIN(...)`` SELECT expression, aliased to the check name."""
    column = self.in_memory_column
    alias = self.name
    return f"MIN({column}) AS {alias}"

`NullRatioCheck`

Bases: ColumnTransformationCheck

Check the share of NULL values in a specific column of a table.

Inherits from ColumnTransformationCheck; see its documentation for argument descriptions.

Example: NullRatioCheck( database_accessor="project.dataset", database_provider=None, table="project.dataset.table", check_column="orders", filters={ "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"}, "date": {"column": "date", "value": "2023-01-01", "type": "date"}, }, lower_threshold=0.9, upper_threshold=1.0, )

Source code in src/koality/checks.py

class NullRatioCheck(ColumnTransformationCheck):
    """Measure the fraction of NULL values in one column of a table.

    Built on ColumnTransformationCheck, whose documentation describes the
    constructor arguments. The check result is the number of NULL entries
    divided by the number of rows, rounded to three decimals.

    Example:
    NullRatioCheck(
        database_accessor="project.dataset",
        database_provider=None,
        table="project.dataset.table",
        check_column="orders",
        filters={
            "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"},
            "date": {"column": "date", "value": "2023-01-01", "type": "date"},
        },
        lower_threshold=0.9,
        upper_threshold=1.0,
    )

    """

    def __init__(
        self,
        database_accessor: str,
        database_provider: DatabaseProvider | None,
        table: str,
        check_column: str,
        lower_threshold: float = -math.inf,
        upper_threshold: float = math.inf,
        *,
        filters: dict[str, Any] | None = None,
        identifier_format: str = "identifier",
        identifier_placeholder: str = "ALL",
        date_info: str | None = None,
        extra_info: str | None = None,
        monitor_only: bool = False,
    ) -> None:
        """Set up the null ratio check; all arguments are forwarded to the parent initializer."""
        super().__init__(
            # Fixed by this check type.
            transformation_name="null_ratio",
            # Caller-supplied arguments, forwarded as-is.
            database_accessor=database_accessor,
            database_provider=database_provider,
            table=table,
            check_column=check_column,
            filters=filters,
            lower_threshold=lower_threshold,
            upper_threshold=upper_threshold,
            identifier_format=identifier_format,
            identifier_placeholder=identifier_placeholder,
            date_info=date_info,
            extra_info=extra_info,
            monitor_only=monitor_only,
        )

    def transformation_statement(self) -> str:
        """Build the SELECT expression for the NULL share (0.0 when no rows exist)."""
        column = self.in_memory_column
        alias = self.name
        # The COUNT(*) = 0 branch guards the division against an empty row set.
        return f"""
            CASE
                WHEN COUNT(*) = 0 THEN 0.0
                ELSE ROUND(COUNTIF({column} IS NULL) / COUNT(*), 3)
            END AS {alias}
        """

`init(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

Initialize the null ratio check.

Source code in src/koality/checks.py

def __init__(
    self,
    database_accessor: str,
    database_provider: DatabaseProvider | None,
    table: str,
    check_column: str,
    lower_threshold: float = -math.inf,
    upper_threshold: float = math.inf,
    *,
    filters: dict[str, Any] | None = None,
    identifier_format: str = "identifier",
    identifier_placeholder: str = "ALL",
    date_info: str | None = None,
    extra_info: str | None = None,
    monitor_only: bool = False,
) -> None:
    """Set up the null ratio check.

    All arguments are forwarded unchanged to the parent initializer; the only
    value fixed here is the transformation name ``"null_ratio"``.
    """
    super().__init__(
        # Fixed by this check type.
        transformation_name="null_ratio",
        # Caller-supplied arguments, forwarded as-is.
        database_accessor=database_accessor,
        database_provider=database_provider,
        table=table,
        check_column=check_column,
        filters=filters,
        lower_threshold=lower_threshold,
        upper_threshold=upper_threshold,
        identifier_format=identifier_format,
        identifier_placeholder=identifier_placeholder,
        date_info=date_info,
        extra_info=extra_info,
        monitor_only=monitor_only,
    )

`transformation_statement()`

Return the SQL statement for calculating null ratio.

Source code in src/koality/checks.py

def transformation_statement(self) -> str:
    """Build the SELECT expression for the NULL share (0.0 when no rows exist)."""
    column = self.in_memory_column
    alias = self.name
    # The COUNT(*) = 0 branch guards the division against an empty row set.
    return f"""
        CASE
            WHEN COUNT(*) = 0 THEN 0.0
            ELSE ROUND(COUNTIF({column} IS NULL) / COUNT(*), 3)
        END AS {alias}
    """

`OccurrenceCheck`

Bases: ColumnTransformationCheck

Check how often any value in a column occurs.

Inherits from koality.checks.ColumnTransformationCheck; see that class for descriptions of the shared arguments. Useful, for example, to detect a single product occurring unusually often (which is likely an error).

Parameters:

Name	Type	Description	Default
`max_or_min`	`Literal['max', 'min']`	Check either the maximum or minimum occurrence of any value. If you want to check if any value occurs more than x times, use 'max' and upper_threshold=x If you want to check if any value occurs less than y times, use 'min' and lower_threshold=y	required

Example: OccurrenceCheck( database_accessor="my-gcp-project.SHOP01", database_provider=None, max_or_min="max", table="my-gcp-project.SHOP01.skufeed_latest", check_column="sku_id", filters={ "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"}, "date": {"column": "DATE", "value": "2023-01-01", "type": "date"}, }, lower_threshold=0, upper_threshold=500, )

Source code in src/koality/checks.py

class OccurrenceCheck(ColumnTransformationCheck):
    """Check how often the most (or least) frequent value of a column occurs.

    Built on `koality.checks.ColumnTransformationCheck`; see that class for the
    shared argument descriptions.
    Useful e.g. to detect a single product occurring unusually often (likely an error).

    Args:
        max_or_min: Check either the maximum or minimum occurrence of any value.
                    To check whether any value occurs more than x times, use 'max' and upper_threshold=x.
                    To check whether any value occurs less than y times, use 'min' and lower_threshold=y.

    Example:
    OccurrenceCheck(
        database_accessor="my-gcp-project.SHOP01",
        database_provider=None,
        max_or_min="max",
        table="my-gcp-project.SHOP01.skufeed_latest",
        check_column="sku_id",
        filters={
            "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"},
            "date": {"column": "DATE", "value": "2023-01-01", "type": "date"},
        },
        lower_threshold=0,
        upper_threshold=500,
    )

    """

    def __init__(
        self,
        database_accessor: str,
        database_provider: DatabaseProvider | None,
        max_or_min: Literal["max", "min"],
        table: str,
        check_column: str,
        lower_threshold: float = -math.inf,
        upper_threshold: float = math.inf,
        *,
        filters: dict[str, Any] | None = None,
        identifier_format: str = "identifier",
        identifier_placeholder: str = "ALL",
        date_info: str | None = None,
        extra_info: str | None = None,
        monitor_only: bool = False,
    ) -> None:
        """Validate the mode, store it, then delegate to the parent initializer.

        Raises:
            KoalityError: If ``max_or_min`` is neither ``"max"`` nor ``"min"``.
        """
        supported_modes = ("max", "min")
        if max_or_min not in supported_modes:
            msg = "'max_or_min' must be one of supported modes 'min' or 'max'"
            raise KoalityError(msg)

        # Stored before the parent initializer runs, as in the original ordering.
        self.max_or_min = max_or_min
        super().__init__(
            # Derived from the validated mode.
            transformation_name=f"occurrence_{max_or_min}",
            # Caller-supplied arguments, forwarded as-is.
            database_accessor=database_accessor,
            database_provider=database_provider,
            table=table,
            check_column=check_column,
            filters=filters,
            lower_threshold=lower_threshold,
            upper_threshold=upper_threshold,
            identifier_format=identifier_format,
            identifier_placeholder=identifier_placeholder,
            date_info=date_info,
            extra_info=extra_info,
            monitor_only=monitor_only,
        )

    def transformation_statement(self) -> str:
        """Build the SELECT list: the grouped column plus its row count under the check name."""
        column = self.in_memory_column
        alias = self.name
        return f"{column}, COUNT(*) AS {alias}"

    def assemble_query(self) -> str:
        """Build the query returning the single most or least frequent value with its count."""
        # Only the first result row is evaluated, so the per-value counts are sorted
        # descending for 'max' and ascending for 'min' and cut off after one row.
        order = {"max": "DESC", "min": "ASC"}[self.max_or_min]
        select_part = self.query_boilerplate(self.transformation_statement())
        where_part = self.assemble_where_statement(self.filters, database_accessor=self.database_accessor)
        return f"""
            {select_part}
            {where_part}
            GROUP BY {self.in_memory_column}
            ORDER BY {self.name} {order}
            LIMIT 1  -- only the first entry is needed
        """

`init(database_accessor, database_provider, max_or_min, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

Initialize the occurrence check.

Source code in src/koality/checks.py

def __init__(
    self,
    database_accessor: str,
    database_provider: DatabaseProvider | None,
    max_or_min: Literal["max", "min"],
    table: str,
    check_column: str,
    lower_threshold: float = -math.inf,
    upper_threshold: float = math.inf,
    *,
    filters: dict[str, Any] | None = None,
    identifier_format: str = "identifier",
    identifier_placeholder: str = "ALL",
    date_info: str | None = None,
    extra_info: str | None = None,
    monitor_only: bool = False,
) -> None:
    """Validate the mode, store it, then delegate to the parent initializer.

    Raises:
        KoalityError: If ``max_or_min`` is neither ``"max"`` nor ``"min"``.
    """
    supported_modes = ("max", "min")
    if max_or_min not in supported_modes:
        msg = "'max_or_min' must be one of supported modes 'min' or 'max'"
        raise KoalityError(msg)

    # Stored before the parent initializer runs, as in the original ordering.
    self.max_or_min = max_or_min
    super().__init__(
        # Derived from the validated mode.
        transformation_name=f"occurrence_{max_or_min}",
        # Caller-supplied arguments, forwarded as-is.
        database_accessor=database_accessor,
        database_provider=database_provider,
        table=table,
        check_column=check_column,
        filters=filters,
        lower_threshold=lower_threshold,
        upper_threshold=upper_threshold,
        identifier_format=identifier_format,
        identifier_placeholder=identifier_placeholder,
        date_info=date_info,
        extra_info=extra_info,
        monitor_only=monitor_only,
    )

`assemble_query()`

Assemble query to find max or min occurrence of any value.

Source code in src/koality/checks.py

def assemble_query(self) -> str:
    """Build the query returning the single most or least frequent value with its count."""
    # Only the first result row is evaluated, so the per-value counts are sorted
    # descending for 'max' and ascending for 'min' and cut off after one row.
    order = {"max": "DESC", "min": "ASC"}[self.max_or_min]
    select_part = self.query_boilerplate(self.transformation_statement())
    where_part = self.assemble_where_statement(self.filters, database_accessor=self.database_accessor)
    return f"""
        {select_part}
        {where_part}
        GROUP BY {self.in_memory_column}
        ORDER BY {self.name} {order}
        LIMIT 1  -- only the first entry is needed
    """

`transformation_statement()`

Return the SQL statement for counting occurrences.

Source code in src/koality/checks.py

def transformation_statement(self) -> str:
    """Build the SELECT list: the grouped column plus its row count under the check name."""
    column = self.in_memory_column
    alias = self.name
    return f"{column}, COUNT(*) AS {alias}"

`RegexMatchCheck`

Bases: ColumnTransformationCheck

Check the share of values matching a regex in a specific column of a table.

Inherits from ColumnTransformationCheck; see its documentation for argument descriptions.

Parameters:

Name	Type	Description	Default
`regex_to_match`	`str`	The regular expression to be checked on check_column (e.g., "SHOP[0-9]{2}-.*" to check for a shop code prefix like "SHOP01-")	required

Example: RegexMatchCheck( database_accessor="project.dataset", database_provider=None, table="project.dataset.table", check_column="orders", regex_to_match="^SHOP[0-9]{2}-.*", filters={"date": {"column": "date", "value": "2023-01-01", "type": "date"}}, lower_threshold=0.9, upper_threshold=1.0, )

Source code in src/koality/checks.py

class RegexMatchCheck(ColumnTransformationCheck):
    """Check the share of values matching a regex in a specific column of a table.

    Inherits from ColumnTransformationCheck; see its documentation for argument descriptions.

    Args:
        regex_to_match: The regular expression to be checked on check_column (e.g.,
                        "SHOP[0-9]{2}-.*" to check for a shop code prefix like "SHOP01-").
                        Single quotes inside the pattern are escaped automatically when
                        the SQL is generated, so pass the pattern unescaped.

    Example:
    RegexMatchCheck(
        database_accessor="project.dataset",
        database_provider=None,
        table="project.dataset.table",
        check_column="orders",
        regex_to_match="^SHOP[0-9]{2}-.*",
        filters={"date": {"column": "date", "value": "2023-01-01", "type": "date"}},
        lower_threshold=0.9,
        upper_threshold=1.0,
    )

    """

    def __init__(
        self,
        database_accessor: str,
        database_provider: DatabaseProvider | None,
        table: str,
        check_column: str,
        regex_to_match: str,
        lower_threshold: float = -math.inf,
        upper_threshold: float = math.inf,
        *,
        filters: dict[str, Any] | None = None,
        identifier_format: str = "identifier",
        identifier_placeholder: str = "ALL",
        date_info: str | None = None,
        extra_info: str | None = None,
        monitor_only: bool = False,
    ) -> None:
        """Store the pattern, then delegate all other arguments to the parent initializer."""
        # Stored before the parent initializer runs, as in the original ordering.
        self.regex_to_match = regex_to_match

        super().__init__(
            database_accessor=database_accessor,
            database_provider=database_provider,
            transformation_name="regex_match_ratio",
            table=table,
            check_column=check_column,
            lower_threshold=lower_threshold,
            upper_threshold=upper_threshold,
            filters=filters,
            identifier_format=identifier_format,
            identifier_placeholder=identifier_placeholder,
            date_info=date_info,
            extra_info=extra_info,
            monitor_only=monitor_only,
        )

    def transformation_statement(self) -> str:
        """Return the SQL expression for the regex match ratio.

        Each row contributes 1 when ``regexp_matches`` is true for the check
        column and 0 otherwise; the average of these values is aliased to the
        check name.
        """
        # The pattern is embedded in a single-quoted SQL string literal, so any single
        # quote it contains must be doubled; otherwise it would end the literal early
        # and break (or alter) the generated query.
        escaped_regex = self.regex_to_match.replace("'", "''")
        return f"""AVG(IF(regexp_matches({self.in_memory_column}, '{escaped_regex}'), 1, 0)) AS {self.name}"""

`init(database_accessor, database_provider, table, check_column, regex_to_match, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

Initialize the regex match check.

Source code in src/koality/checks.py

def __init__(
    self,
    database_accessor: str,
    database_provider: DatabaseProvider | None,
    table: str,
    check_column: str,
    regex_to_match: str,
    lower_threshold: float = -math.inf,
    upper_threshold: float = math.inf,
    *,
    filters: dict[str, Any] | None = None,
    identifier_format: str = "identifier",
    identifier_placeholder: str = "ALL",
    date_info: str | None = None,
    extra_info: str | None = None,
    monitor_only: bool = False,
) -> None:
    """Store the pattern, then delegate all other arguments to the parent initializer.

    The transformation name is fixed to ``"regex_match_ratio"``.
    """
    # Stored before the parent initializer runs, as in the original ordering.
    self.regex_to_match = regex_to_match

    super().__init__(
        # Fixed by this check type.
        transformation_name="regex_match_ratio",
        # Caller-supplied arguments, forwarded as-is.
        database_accessor=database_accessor,
        database_provider=database_provider,
        table=table,
        check_column=check_column,
        filters=filters,
        lower_threshold=lower_threshold,
        upper_threshold=upper_threshold,
        identifier_format=identifier_format,
        identifier_placeholder=identifier_placeholder,
        date_info=date_info,
        extra_info=extra_info,
        monitor_only=monitor_only,
    )

`transformation_statement()`

Return the SQL statement for calculating regex match ratio.

Source code in src/koality/checks.py

def transformation_statement(self) -> str:
    """Return the SQL expression for the regex match ratio.

    Each row contributes 1 when ``regexp_matches`` is true for the check column
    and 0 otherwise; the average of these values is aliased to the check name.
    """
    # The pattern is embedded in a single-quoted SQL string literal, so any single
    # quote it contains must be doubled; otherwise it would end the literal early
    # and break (or alter) the generated query.
    escaped_regex = self.regex_to_match.replace("'", "''")
    return f"""AVG(IF(regexp_matches({self.in_memory_column}, '{escaped_regex}'), 1, 0)) AS {self.name}"""

`RelCountChangeCheck`

Bases: DataQualityCheck

Check the relative change of a count compared to historic average.

Compares the count to the average counts of a number of historic days before the check date.

Parameters:

Name	Type	Description	Default
`table`	`str`	Name of table (e.g., "my-gcp-project.SHOP01.feature_category")	required
`check_column`	`str`	Name of column to be checked (e.g., "category")	required
`rolling_days`	`int`	The number of historic days to be taken into account for the historic average baseline for the comparison (e.g., 7).	required
`lower_threshold`	`float`	Check will fail if check result < lower_threshold	`-inf`
`upper_threshold`	`float`	Check will fail if check result > upper_threshold	`inf`
`monitor_only`	`bool`	If True, no checks will be performed	`False`
`extra_info`	`str \| None`	Optional additional text that will be added to the end of the failure message	`None`
`filters`	`dict[str, Any] \| None`	Filter configuration dict. Must include a 'date' filter with column and value.	`None`

Example: RelCountChangeCheck( database_accessor="my-gcp-project.SHOP01", database_provider=None, table="my-gcp-project.SHOP01.skufeed_latest", check_column="sku_id", rolling_days=7, filters={ "partition_date": {"column": "DATE", "value": "2023-01-01", "type": "date"}, "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"}, }, lower_threshold=-0.15, upper_threshold=0.15, )

Source code in src/koality/checks.py

class RelCountChangeCheck(DataQualityCheck):  # TODO: (non)distinct counts parameter?
    """Check the relative change of a count compared to historic average.

    Compares the distinct count of ``check_column`` on the check date to the
    average of the daily distinct counts over the ``rolling_days`` days before
    the check date. The result is ``(current - average) / average``, rounded to
    three decimals.

    Args:
        table: Name of table (e.g., "my-gcp-project.SHOP01.feature_category")
        check_column: Name of column to be checked (e.g., "category")
        rolling_days: The number of historic days to be taken into account for
                      the historic average baseline for the comparison (e.g., 7).
        lower_threshold: Check will fail if check result < lower_threshold
        upper_threshold: Check will fail if check result > upper_threshold
        monitor_only: If True, no checks will be performed
        extra_info: Optional additional text that will be added to the end of the failure message
        filters: Filter configuration dict. Must include a 'date' filter with column and value.

    Example:
    RelCountChangeCheck(
        database_accessor="my-gcp-project.SHOP01",
        database_provider=None,
        table="my-gcp-project.SHOP01.skufeed_latest",
        check_column="sku_id",
        rolling_days=7,
        filters={
            "partition_date": {"column": "DATE", "value": "2023-01-01", "type": "date"},
            "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"},
        },
        lower_threshold=-0.15,
        upper_threshold=0.15,
    )

    """

    def __init__(
        self,
        database_accessor: str,
        database_provider: DatabaseProvider | None,
        table: str,
        check_column: str,
        rolling_days: int,
        lower_threshold: float = -math.inf,
        upper_threshold: float = math.inf,
        *,
        filters: dict[str, Any] | None = None,
        identifier_format: str = "identifier",
        identifier_placeholder: str = "ALL",
        date_info: str | None = None,
        extra_info: str | None = None,
        monitor_only: bool = False,
    ) -> None:
        """Initialize the relative count change check.

        Raises:
            KoalityError: If no filter of type 'date' is configured, or that
                filter lacks a column or has no value.
        """
        self.rolling_days = rolling_days

        # Find date filter by type (the first one wins); FilterConfig objects are
        # converted to plain dicts so both input forms can be inspected uniformly.
        filters = filters or {}
        date_filter = None
        for config in filters.values():
            cfg = config.model_dump() if isinstance(config, FilterConfig) else config
            if cfg.get("type") == "date":
                date_filter = cfg
                break

        if not date_filter or not date_filter.get("column") or date_filter.get("value") is None:
            msg = "RelCountChangeCheck requires a filter with type='date'"
            raise KoalityError(msg)

        super().__init__(
            database_accessor=database_accessor,
            database_provider=database_provider,
            table=table,
            check_column=check_column,
            lower_threshold=lower_threshold,
            upper_threshold=upper_threshold,
            filters=filters,
            identifier_format=identifier_format,
            identifier_placeholder=identifier_placeholder,
            date_info=date_info,
            extra_info=extra_info,
            monitor_only=monitor_only,
        )

        # Remove date filter from WHERE clause (it's used in the rolling window SQL, not WHERE)
        # NOTE(review): the query builders below read self.date_filter, which is not assigned
        # in this class - presumably DataQualityCheck.__init__ sets it from `filters`; confirm.
        self.filters = {name: cfg for name, cfg in self.filters.items() if cfg.get("type") != "date"}

    def assemble_name(self) -> str:
        """Return the check name: last dotted segment of the check column plus '_count_change'."""
        return f"{self.check_column.split('.')[-1]}_count_change"

    def assemble_query(self) -> str:
        """Assemble the SQL query for calculating relative count change.

        The query computes daily distinct counts for the window
        ``[date - rolling_days, date]``, averages the days before the check
        date, and relates the check date's count to that average. A missing
        current day counts as 0; an average of 0 yields 0.0.
        """
        # The non-date filters are appended to an existing WHERE block, so the clause's
        # leading keyword is turned into AND. Only the first occurrence is replaced so
        # that "WHERE" appearing inside a column name or a literal value is left intact.
        where_statement = self.assemble_where_statement(self.filters, database_accessor=self.database_accessor).replace(
            "WHERE",
            "AND",
            1,
        )
        date_col = self.date_filter["column"]
        date_val = self.date_filter["value"]

        return f"""
        WITH
            base AS (
                SELECT
                    CAST({date_col} AS DATE) AS date_col_cast,
                    COUNT(DISTINCT {self.in_memory_column}) AS dist_cnt
                FROM
                    "{self.table}"
                WHERE
                    CAST({date_col} AS DATE) BETWEEN (DATE '{date_val}' - INTERVAL {self.rolling_days} DAY)
                    AND DATE '{date_val}'
                {where_statement}
                GROUP BY
                    CAST({date_col} AS DATE)
            ),
            rolling_avgs AS (
                SELECT
                    AVG(dist_cnt) AS rolling_avg
                FROM
                    base
                WHERE
                    date_col_cast BETWEEN (DATE '{date_val}' - INTERVAL {self.rolling_days} DAY)
                AND
                    (DATE '{date_val}' - INTERVAL 1 DAY)
            ),

            -- Helper is needed to cover case where no current data is available
            dist_cnt_helper AS (
                SELECT
                    MAX(dist_cnt) AS dist_cnt
                FROM
                    (
                        SELECT dist_cnt FROM base WHERE date_col_cast = DATE '{date_val}'
                        UNION ALL
                        SELECT 0 AS dist_cnt
                    )
            )

            SELECT
                CASE
                    WHEN rolling_avg = 0 THEN 0.0
                    ELSE ROUND((dist_cnt - rolling_avg) / rolling_avg, 3)
                END AS {self.name}
            FROM
                dist_cnt_helper
            JOIN
                rolling_avgs
            ON TRUE
        """

    def assemble_data_exists_query(self) -> str:
        """Assemble the SQL query to check if data exists for the check date.

        The query yields an ``empty_table`` column containing the table name
        when no row matches the check date (and the other filters), or an
        empty string otherwise.
        """
        data_exists_query = f"""
        SELECT
            IF(COUNT(*) > 0, '', '{self.table}') AS empty_table
        FROM
            "{self.table}"
        """
        date_col = self.date_filter["column"]
        date_val = self.date_filter["value"]

        # The date filter was removed from self.filters in __init__, so the date
        # condition is appended explicitly to (or used as) the WHERE clause.
        where_statement = self.assemble_where_statement(self.filters, database_accessor=self.database_accessor)
        if where_statement:
            return f"{data_exists_query}\n{where_statement} AND CAST({date_col} AS DATE) = DATE '{date_val}'"
        return f"{data_exists_query}\nWHERE CAST({date_col} AS DATE) = DATE '{date_val}'"

`init(database_accessor, database_provider, table, check_column, rolling_days, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

Initialize the relative count change check.

Source code in src/koality/checks.py

def __init__(
    self,
    database_accessor: str,
    database_provider: DatabaseProvider | None,
    table: str,
    check_column: str,
    rolling_days: int,
    lower_threshold: float = -math.inf,
    upper_threshold: float = math.inf,
    *,
    filters: dict[str, Any] | None = None,
    identifier_format: str = "identifier",
    identifier_placeholder: str = "ALL",
    date_info: str | None = None,
    extra_info: str | None = None,
    monitor_only: bool = False,
) -> None:
    """Initialize the relative count change check.

    Stores ``rolling_days``, verifies that ``filters`` contains an entry of
    type ``"date"`` with a column and a non-None value, delegates the remaining
    setup to the parent initializer and finally drops date-type entries from
    ``self.filters``.

    Raises:
        KoalityError: If no date filter with a column and a value is configured.
    """
    self.rolling_days = rolling_days

    # Locate the first filter of type "date"; FilterConfig objects are dumped to dicts first.
    filters = filters or {}
    as_dicts = (entry.model_dump() if isinstance(entry, FilterConfig) else entry for entry in filters.values())
    date_filter = next((entry for entry in as_dicts if entry.get("type") == "date"), None)

    has_usable_date = (
        bool(date_filter) and bool(date_filter.get("column")) and date_filter.get("value") is not None
    )
    if not has_usable_date:
        msg = "RelCountChangeCheck requires a filter with type='date'"
        raise KoalityError(msg)

    super().__init__(
        database_accessor=database_accessor,
        database_provider=database_provider,
        table=table,
        check_column=check_column,
        lower_threshold=lower_threshold,
        upper_threshold=upper_threshold,
        filters=filters,
        identifier_format=identifier_format,
        identifier_placeholder=identifier_placeholder,
        date_info=date_info,
        extra_info=extra_info,
        monitor_only=monitor_only,
    )

    # The date filter drives the rolling window SQL, so keep it out of the generic WHERE clause.
    self.filters = {
        filter_name: filter_cfg
        for filter_name, filter_cfg in self.filters.items()
        if filter_cfg.get("type") != "date"
    }

`assemble_data_exists_query()`

Assemble the SQL query to check if data exists for the check date.

Source code in src/koality/checks.py

def assemble_data_exists_query(self) -> str:
    """Build the query that tells whether the table has rows on the check date.

    ``empty_table`` is an empty string when at least one row matches and the
    table name otherwise. The date condition is combined with the WHERE clause
    assembled from ``self.filters`` if that clause is non-empty.
    """
    table_probe = f"""
    SELECT
        IF(COUNT(*) > 0, '', '{self.table}') AS empty_table
    FROM
        "{self.table}"
    """
    column, check_date = self.date_filter["column"], self.date_filter["value"]
    on_check_date = f"CAST({column} AS DATE) = DATE '{check_date}'"

    filter_clause = self.assemble_where_statement(self.filters, database_accessor=self.database_accessor)
    if not filter_clause:
        # No other filters: the date condition is the whole WHERE clause.
        return f"{table_probe}\nWHERE {on_check_date}"
    return f"{table_probe}\n{filter_clause} AND {on_check_date}"

`assemble_name()`

Return the check name for count change.

Source code in src/koality/checks.py

def assemble_name(self) -> str:
    """Return the check name: last dot-separated part of the check column plus ``_count_change``."""
    column_name = self.check_column.rpartition(".")[2]
    return f"{column_name}_count_change"

`assemble_query()`

Assemble the SQL query for calculating relative count change.

Source code in src/koality/checks.py

def assemble_query(self) -> str:
    """Assemble the SQL query for calculating relative count change.

    The query consists of three CTEs and a final projection:

    - ``base``: distinct count of the check column per day, for the window from
      ``rolling_days`` days before the check date up to and including it,
      restricted by the non-date filters.
    - ``rolling_avgs``: average of those daily counts over the days before the
      check date.
    - ``dist_cnt_helper``: the check date's count, falling back to 0 when the
      check date has no rows.

    The result is ``(dist_cnt - rolling_avg) / rolling_avg`` rounded to three
    decimals, or ``0.0`` when the rolling average is 0.
    """
    # The filter clause is appended to an existing WHERE, so its leading keyword
    # becomes AND. Only the first occurrence is swapped: replacing every "WHERE"
    # would also corrupt filter values or column names containing that text.
    where_statement = self.assemble_where_statement(self.filters, database_accessor=self.database_accessor).replace(
        "WHERE",
        "AND",
        1,
    )
    date_col = self.date_filter["column"]
    date_val = self.date_filter["value"]

    return f"""
    WITH
        base AS (
            SELECT
                CAST({date_col} AS DATE) AS date_col_cast,
                COUNT(DISTINCT {self.in_memory_column}) AS dist_cnt
            FROM
                "{self.table}"
            WHERE
                CAST({date_col} AS DATE) BETWEEN (DATE '{date_val}' - INTERVAL {self.rolling_days} DAY)
                AND DATE '{date_val}'
            {where_statement}
            GROUP BY
                CAST({date_col} AS DATE)
        ),
        rolling_avgs AS (
            SELECT
                AVG(dist_cnt) AS rolling_avg
            FROM
                base
            WHERE
                date_col_cast BETWEEN (DATE '{date_val}' - INTERVAL {self.rolling_days} DAY)
            AND
                (DATE '{date_val}' - INTERVAL 1 DAY)
        ),

        -- Helper is needed to cover case where no current data is available
        dist_cnt_helper AS (
            SELECT
                MAX(dist_cnt) AS dist_cnt
            FROM
                (
                    SELECT dist_cnt FROM base WHERE date_col_cast = DATE '{date_val}'
                    UNION ALL
                    SELECT 0 AS dist_cnt
                )
        )

        SELECT
            CASE
                WHEN rolling_avg = 0 THEN 0.0
                ELSE ROUND((dist_cnt - rolling_avg) / rolling_avg, 3)
            END AS {self.name}
        FROM
            dist_cnt_helper
        JOIN
            rolling_avgs
        ON TRUE
    """

`RollingValuesInSetCheck`

Bases: ValuesInSetCheck

Check share of values matching a value set over a rolling time period.

Similar to ValuesInSetCheck, but the share is computed for a longer time period (currently also including data of the 14 days before the actual check date). It inherits from koality.checks.ValuesInSetCheck, and thus, also from koality.checks.ColumnTransformationCheck, so we refer to argument descriptions in its super class.

Parameters:

Name	Type	Description	Default
`filters`	`dict[str, Any] \| None`	Filter configuration dict. Must include a 'date' filter with column and value.	`None`

Example:

    RollingValuesInSetCheck(
        database_accessor="my-gcp-project.SHOP01",
        database_provider=None,
        table="my-gcp-project.SHOP01.orders",
        check_column="category",
        value_set='("toys", "shoes")',
        filters={
            "partition_date": {"column": "DATE", "value": "2023-01-01", "type": "date"},
            "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"},
        },
        lower_threshold=0.9,
        upper_threshold=1.0,
    )

Source code in src/koality/checks.py

class RollingValuesInSetCheck(ValuesInSetCheck):
    """Check share of values matching a value set over a rolling time period.

    Similar to `ValuesInSetCheck`, but the share is computed for a longer time period
    (the check date plus the `rolling_days` days before it; 14 days by default).
    It inherits from `koality.checks.ValuesInSetCheck`, and thus, also from
    `koality.checks.ColumnTransformationCheck`, so we refer to argument descriptions
    in its super class.

    Args:
        filters: Filter configuration dict. Must include a 'date' filter with column and value.
        rolling_days: Number of days before the check date that are included in the
                      evaluated window. Defaults to 14.

    Raises:
        KoalityError: If no date filter with column and value is configured, or if
                      `rolling_days` is not a non-negative integer.

    Example:
    RollingValuesInSetCheck(
        database_accessor="my-gcp-project.SHOP01",
        database_provider=None,
        table="my-gcp-project.SHOP01.orders",
        check_column="category",
        value_set='("toys", "shoes")',
        filters={
            "partition_date": {"column": "DATE", "value": "2023-01-01", "type": "date"},
            "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"},
        },
        lower_threshold=0.9,
        upper_threshold=1.0,
    )

    """

    def __init__(
        self,
        database_accessor: str,
        database_provider: DatabaseProvider | None,
        table: str,
        check_column: str,
        value_set: str | bytes | Iterable,
        lower_threshold: float = -math.inf,
        upper_threshold: float = math.inf,
        *,
        filters: dict[str, Any] | None = None,
        identifier_format: str = "identifier",
        identifier_placeholder: str = "ALL",
        date_info: str | None = None,
        extra_info: str | None = None,
        monitor_only: bool = False,
        rolling_days: int = 14,
    ) -> None:
        """Initialize the rolling values in set check."""
        # rolling_days is interpolated into the SQL text, so only accept a plain non-negative integer.
        if not isinstance(rolling_days, int) or rolling_days < 0:
            msg = "'rolling_days' must be a non-negative integer"
            raise KoalityError(msg)
        # Set before the parent initializer runs so the attribute exists for anything it triggers.
        self.rolling_days = rolling_days

        # Find date filter by type
        filters = filters or {}
        date_filter = None
        for config in filters.values():
            cfg = config.model_dump() if isinstance(config, FilterConfig) else config
            if cfg.get("type") == "date":
                date_filter = cfg
                break

        if not date_filter or not date_filter.get("column") or date_filter.get("value") is None:
            msg = "RollingValuesInSetCheck requires a filter with type='date'"
            raise KoalityError(msg)

        super().__init__(
            database_accessor=database_accessor,
            database_provider=database_provider,
            transformation_name="rolling_values_in_set_ratio",
            table=table,
            value_set=value_set,
            check_column=check_column,
            lower_threshold=lower_threshold,
            upper_threshold=upper_threshold,
            filters=filters,
            identifier_format=identifier_format,
            identifier_placeholder=identifier_placeholder,
            date_info=date_info,
            extra_info=extra_info,
            monitor_only=monitor_only,
        )

        # Remove date filter from WHERE clause (it's used in the rolling window SQL, not WHERE)
        self.filters = {name: cfg for name, cfg in self.filters.items() if cfg.get("type") != "date"}

    def assemble_query(self) -> str:
        """Assemble query with rolling date range for values in set check.

        The WHERE clause restricts rows to the check date and the `rolling_days`
        days before it; the remaining (non-date) filters are appended with AND.
        """
        main_query = self.query_boilerplate(self.transformation_statement())
        date_col = self.date_filter["column"]
        date_val = self.date_filter["value"]

        main_query += (
            "WHERE\n    "
            f"CAST({date_col} AS DATE) BETWEEN (DATE '{date_val}' - INTERVAL {self.rolling_days} DAY) "
            f"AND DATE '{date_val}'"
        )

        if where_statement := self.assemble_where_statement(self.filters, database_accessor=self.database_accessor):
            # The date range already opened the WHERE clause, so drop the duplicate keyword.
            return main_query + "\nAND\n" + where_statement.removeprefix("WHERE\n")

        return main_query

`__init__(database_accessor, database_provider, table, check_column, value_set, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

Initialize the rolling values in set check.

Source code in src/koality/checks.py

def __init__(
    self,
    database_accessor: str,
    database_provider: DatabaseProvider | None,
    table: str,
    check_column: str,
    value_set: str | bytes | Iterable,
    lower_threshold: float = -math.inf,
    upper_threshold: float = math.inf,
    *,
    filters: dict[str, Any] | None = None,
    identifier_format: str = "identifier",
    identifier_placeholder: str = "ALL",
    date_info: str | None = None,
    extra_info: str | None = None,
    monitor_only: bool = False,
) -> None:
    """Initialize the rolling values in set check.

    Verifies that ``filters`` contains an entry of type ``"date"`` with a column
    and a non-None value, delegates to the parent initializer with the
    transformation name ``rolling_values_in_set_ratio`` and then drops date-type
    entries from ``self.filters``.

    Raises:
        KoalityError: If no date filter with a column and a value is configured.
    """
    filters = filters or {}

    # Take the first filter whose type is "date"; FilterConfig objects are dumped to dicts first.
    candidates = (entry.model_dump() if isinstance(entry, FilterConfig) else entry for entry in filters.values())
    date_filter = next((entry for entry in candidates if entry.get("type") == "date"), None)

    date_is_usable = (
        bool(date_filter) and bool(date_filter.get("column")) and date_filter.get("value") is not None
    )
    if not date_is_usable:
        msg = "RollingValuesInSetCheck requires a filter with type='date'"
        raise KoalityError(msg)

    super().__init__(
        database_accessor=database_accessor,
        database_provider=database_provider,
        transformation_name="rolling_values_in_set_ratio",
        table=table,
        value_set=value_set,
        check_column=check_column,
        lower_threshold=lower_threshold,
        upper_threshold=upper_threshold,
        filters=filters,
        identifier_format=identifier_format,
        identifier_placeholder=identifier_placeholder,
        date_info=date_info,
        extra_info=extra_info,
        monitor_only=monitor_only,
    )

    # The date filter is applied through the rolling window SQL, not the generic WHERE clause.
    self.filters = {
        filter_name: filter_cfg
        for filter_name, filter_cfg in self.filters.items()
        if filter_cfg.get("type") != "date"
    }

`assemble_query()`

Assemble query with rolling date range for values in set check.

Source code in src/koality/checks.py

def assemble_query(self) -> str:
    """Assemble the values-in-set query over a rolling date range.

    Rows are limited to the check date and the 14 days before it; the WHERE
    clause assembled from ``self.filters`` is appended with AND when non-empty.
    """
    boilerplate = self.query_boilerplate(self.transformation_statement())
    column = self.date_filter["column"]
    check_date = self.date_filter["value"]

    # TODO: maybe parameterize interval days
    window = f"CAST({column} AS DATE) BETWEEN (DATE '{check_date}' - INTERVAL 14 DAY) AND DATE '{check_date}'"
    query = boilerplate + "WHERE\n    " + window

    extra_filters = self.assemble_where_statement(self.filters, database_accessor=self.database_accessor)
    if not extra_filters:
        return query
    # The date range already opened the WHERE clause, so strip the duplicate keyword.
    return query + "\nAND\n" + extra_filters.removeprefix("WHERE\n")

`ValuesInSetCheck`

Bases: ColumnTransformationCheck

Check the share of values that match any value of a value set in a column.

Inherits from ColumnTransformationCheck; see its documentation for argument descriptions.

Parameters:

Name	Type	Description	Default
`value_set`	`str \| bytes \| Iterable`	A list of values (or a string representation of such a list) to be checked. Single values are also allowed. Examples for valid inputs: - ["shoes", "clothing"] - "clothing" - '("shoes", "toys")'	required

Example:

    ValuesInSetCheck(
        database_accessor="project.dataset",
        database_provider=None,
        table="project.dataset.table",
        check_column="category",
        value_set='("toys", "shoes")',
        filters={
            "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"},
            "date": {"column": "date", "value": "2023-01-01", "type": "date"},
        },
        lower_threshold=0.9,
        upper_threshold=1.0,
    )

Source code in src/koality/checks.py

class ValuesInSetCheck(ColumnTransformationCheck):
    """Measure which share of a column's values belongs to a given value set.

    Inherits from ColumnTransformationCheck; see its documentation for argument descriptions.

    Args:
        value_set: A list of values (or a string representation of such a list) to be checked.
                   Single values are also allowed. Examples for valid inputs:
                   - ["shoes", "clothing"]
                   - "clothing"
                   - '("shoes", "toys")'

    Raises:
        KoalityError: If the value set is empty after conversion.

    Example:
    ValuesInSetCheck(
        database_accessor="project.dataset",
        database_provider=None,
        table="project.dataset.table",
        check_column="category",
        value_set='("toys", "shoes")',
        filters={
            "identifier": {"column": "shop_code", "value": "SHOP01", "type": "identifier"},
            "date": {"column": "date", "value": "2023-01-01", "type": "date"},
        },
        lower_threshold=0.9,
        upper_threshold=1.0,
    )

    """

    def __init__(
        self,
        database_accessor: str,
        database_provider: DatabaseProvider | None,
        table: str,
        check_column: str,
        value_set: str | bytes | Iterable,
        lower_threshold: float = -math.inf,
        upper_threshold: float = math.inf,
        *,
        monitor_only: bool = False,
        transformation_name: str | None = None,
        extra_info: str | None = None,
        filters: dict[str, Any] | None = None,
        identifier_format: str = "identifier",
        identifier_placeholder: str = "ALL",
        date_info: str | None = None,
    ) -> None:
        """Convert and validate the value set, then run the parent initializer."""
        normalized_values = to_set(value_set)
        self.value_set = normalized_values
        if not normalized_values:
            msg = "'value_set' must not be empty"
            raise KoalityError(msg)
        # Pre-render the set for use in the SQL IN expression.
        self.value_set_string = format_filter_value(normalized_values, "IN")

        super().__init__(
            database_accessor=database_accessor,
            database_provider=database_provider,
            # Subclasses may pass their own name; fall back to the default otherwise.
            transformation_name=transformation_name or "values_in_set_ratio",
            table=table,
            check_column=check_column,
            lower_threshold=lower_threshold,
            upper_threshold=upper_threshold,
            filters=filters,
            identifier_format=identifier_format,
            identifier_placeholder=identifier_placeholder,
            date_info=date_info,
            extra_info=extra_info,
            monitor_only=monitor_only,
        )

    def transformation_statement(self) -> str:
        """Return the SQL expression averaging a 1/0 membership flag per row."""
        membership_flag = f"IF({self.in_memory_column} IN {self.value_set_string}, 1, 0)"
        return f"AVG({membership_flag}) AS {self.name}"

`__init__(database_accessor, database_provider, table, check_column, value_set, lower_threshold=-math.inf, upper_threshold=math.inf, *, monitor_only=False, transformation_name=None, extra_info=None, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None)`

Initialize the values in set check.

Source code in src/koality/checks.py

def __init__(
    self,
    database_accessor: str,
    database_provider: DatabaseProvider | None,
    table: str,
    check_column: str,
    value_set: str | bytes | Iterable,
    lower_threshold: float = -math.inf,
    upper_threshold: float = math.inf,
    *,
    monitor_only: bool = False,
    transformation_name: str | None = None,
    extra_info: str | None = None,
    filters: dict[str, Any] | None = None,
    identifier_format: str = "identifier",
    identifier_placeholder: str = "ALL",
    date_info: str | None = None,
) -> None:
    """Initialize the values in set check.

    Converts ``value_set`` with ``to_set``, rejects an empty result, stores the
    rendered IN-list string and delegates the remaining setup to the parent
    initializer, using ``values_in_set_ratio`` when no transformation name is given.

    Raises:
        KoalityError: If the converted value set is empty.
    """
    converted = to_set(value_set)
    self.value_set = converted
    if not converted:
        msg = "'value_set' must not be empty"
        raise KoalityError(msg)
    self.value_set_string = format_filter_value(converted, "IN")

    effective_name = transformation_name or "values_in_set_ratio"
    super().__init__(
        database_accessor=database_accessor,
        database_provider=database_provider,
        transformation_name=effective_name,
        table=table,
        check_column=check_column,
        lower_threshold=lower_threshold,
        upper_threshold=upper_threshold,
        filters=filters,
        identifier_format=identifier_format,
        identifier_placeholder=identifier_placeholder,
        date_info=date_info,
        extra_info=extra_info,
        monitor_only=monitor_only,
    )

`transformation_statement()`

Return the SQL statement for calculating values in set ratio.

Source code in src/koality/checks.py

def transformation_statement(self) -> str:
    """Return the SQL expression that averages a 1/0 membership flag, aliased to the check name."""
    membership_flag = f"IF({self.in_memory_column} IN {self.value_set_string}, 1, 0)"
    return f"AVG({membership_flag}) AS {self.name}"

Checks

AverageCheck

__init__(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)

transformation_statement()

ColumnTransformationCheck

__init__(database_accessor, database_provider, transformation_name, table, check_column=None, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)

assemble_data_exists_query()

assemble_name()

assemble_query()

query_boilerplate(metric_statement)

transformation_statement() abstractmethod

CountCheck

__init__(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, distinct=False, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)

assemble_name()

transformation_statement()

DataQualityCheck

in_memory_column property

query property

__call__(duckdb_client)

__init__(database_accessor, database_provider, table, check_column=None, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)

__repr__()

assemble_data_exists_query() abstractmethod

assemble_name() abstractmethod

assemble_query() abstractmethod

assemble_where_statement(filters, *, strip_dotted_columns=True, database_accessor=None) staticmethod

check(duckdb_client)

data_check(duckdb_client)

get_date_filter(filters) staticmethod

get_filters(filters_config) staticmethod

get_identifier_filter(filters) staticmethod

DuplicateCheck

__init__(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)

transformation_statement()

IqrOutlierCheck

__init__(database_accessor, database_provider, check_column, table, interval_days, how, iqr_factor, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)

assemble_data_exists_query()

query_boilerplate(metric_statement)

transformation_statement()

MatchRateCheck

assemble_data_exists_query()

assemble_name()

assemble_query()

MaxCheck

__init__(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)

transformation_statement()

MinCheck

__init__(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)

transformation_statement()

NullRatioCheck

__init__(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)

transformation_statement()

OccurrenceCheck

__init__(database_accessor, database_provider, max_or_min, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)

assemble_query()

transformation_statement()

RegexMatchCheck

__init__(database_accessor, database_provider, table, check_column, regex_to_match, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)

transformation_statement()

RelCountChangeCheck

__init__(database_accessor, database_provider, table, check_column, rolling_days, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)

assemble_data_exists_query()

assemble_name()

assemble_query()

RollingValuesInSetCheck

__init__(database_accessor, database_provider, table, check_column, value_set, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)

assemble_query()

ValuesInSetCheck

__init__(database_accessor, database_provider, table, check_column, value_set, lower_threshold=-math.inf, upper_threshold=math.inf, *, monitor_only=False, transformation_name=None, extra_info=None, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None)

transformation_statement()

`AverageCheck`

`init(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

`transformation_statement()`

`ColumnTransformationCheck`

`init(database_accessor, database_provider, transformation_name, table, check_column=None, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

`assemble_data_exists_query()`

`assemble_name()`

`assemble_query()`

`query_boilerplate(metric_statement)`

`transformation_statement()` `abstractmethod`

`CountCheck`

`init(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, distinct=False, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

`assemble_name()`

`transformation_statement()`

`DataQualityCheck`

`in_memory_column` `property`

`query` `property`

`__call__(duckdb_client)`

`init(database_accessor, database_provider, table, check_column=None, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

`__repr__()`

`assemble_data_exists_query()` `abstractmethod`

`assemble_name()` `abstractmethod`

`assemble_query()` `abstractmethod`

`assemble_where_statement(filters, *, strip_dotted_columns=True, database_accessor=None)` `staticmethod`

`check(duckdb_client)`

`data_check(duckdb_client)`

`get_date_filter(filters)` `staticmethod`

`get_filters(filters_config)` `staticmethod`

`get_identifier_filter(filters)` `staticmethod`

`DuplicateCheck`

`init(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

`transformation_statement()`

`IqrOutlierCheck`

`init(database_accessor, database_provider, check_column, table, interval_days, how, iqr_factor, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

`assemble_data_exists_query()`

`query_boilerplate(metric_statement)`

`transformation_statement()`

`MatchRateCheck`

`assemble_data_exists_query()`

`assemble_name()`

`assemble_query()`

`MaxCheck`

`init(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

`transformation_statement()`

`MinCheck`

`init(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

`transformation_statement()`

`NullRatioCheck`

`init(database_accessor, database_provider, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

`transformation_statement()`

`OccurrenceCheck`

`init(database_accessor, database_provider, max_or_min, table, check_column, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

`assemble_query()`

`transformation_statement()`

`RegexMatchCheck`

`init(database_accessor, database_provider, table, check_column, regex_to_match, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

`transformation_statement()`

`RelCountChangeCheck`

`init(database_accessor, database_provider, table, check_column, rolling_days, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

`assemble_data_exists_query()`

`assemble_name()`

`assemble_query()`

`RollingValuesInSetCheck`

`init(database_accessor, database_provider, table, check_column, value_set, lower_threshold=-math.inf, upper_threshold=math.inf, *, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None, extra_info=None, monitor_only=False)`

`assemble_query()`

`ValuesInSetCheck`

`init(database_accessor, database_provider, table, check_column, value_set, lower_threshold=-math.inf, upper_threshold=math.inf, *, monitor_only=False, transformation_name=None, extra_info=None, filters=None, identifier_format='identifier', identifier_placeholder='ALL', date_info=None)`

`transformation_statement()`