Feature Validation¶

mlforge provides a validation system to ensure data quality before features are materialized. Validators run on the output of your feature function, before any metrics are computed.

Why Validate?¶

Validation helps catch data quality issues early:

- Prevent bad data from entering your feature store
- Document data expectations in code
- Fail fast instead of discovering issues in production
- Track validation rules in metadata for auditing

Using Built-in Validators¶

mlforge includes common validators for typical data quality checks.

Basic Example¶

import mlforge as mlf

@mlf.feature(
    keys=["merchant_id"],
    source="data/transactions.parquet",
    validators={
        "merchant_id": [mlf.not_null()],
        "amount": [mlf.not_null(), mlf.greater_than_or_equal(0)],
    }
)
def merchant_transactions(df):
    return df.select(["merchant_id", "amount", "transaction_date"])

If validation fails, the build will stop and report which validations failed:

ERROR: Feature validation failed for merchant_transactions
  - Column 'amount': 3 values < 0 (greater_than_or_equal(0))

Available Validators¶

Null Checks¶

import mlforge as mlf

@mlf.feature(
    keys=["user_id"],
    source="users.parquet",
    validators={
        "user_id": [mlf.not_null()],
        "email": [mlf.not_null()],
    }
)
def user_features(df):
    return df

Uniqueness¶

import mlforge as mlf

@mlf.feature(
    keys=["user_id"],
    source="users.parquet",
    validators={
        "user_id": [mlf.unique()],  # Ensure no duplicate user IDs
    }
)
def user_features(df):
    return df

Numeric Comparisons¶

import mlforge as mlf

@mlf.feature(
    keys=["product_id"],
    source="products.parquet",
    validators={
        "price": [mlf.greater_than(0)],
        "discount_pct": [mlf.greater_than_or_equal(0), mlf.less_than_or_equal(100)],
        "stock": [mlf.greater_than_or_equal(0)],
    }
)
def product_features(df):
    return df

Range Validation¶

import mlforge as mlf

@mlf.feature(
    keys=["user_id"],
    source="users.parquet",
    validators={
        "age": [mlf.in_range(18, 120)],  # inclusive by default
        "score": [mlf.in_range(0, 100, inclusive=True)],
    }
)
def user_features(df):
    return df

Pattern Matching¶

import mlforge as mlf

@mlf.feature(
    keys=["user_id"],
    source="users.parquet",
    validators={
        "email": [mlf.matches_regex(r"^\w+@\w+\.\w+$")],
        "phone": [mlf.matches_regex(r"^\+?1?\d{9,15}$")],
    }
)
def user_features(df):
    return df

Set Membership¶

import mlforge as mlf

@mlf.feature(
    keys=["transaction_id"],
    source="transactions.parquet",
    validators={
        "status": [mlf.is_in(["pending", "approved", "rejected"])],
        "payment_method": [mlf.is_in(["card", "bank", "wallet"])],
    }
)
def transaction_features(df):
    return df

Combining Validators¶

You can apply multiple validators to a single column:

import mlforge as mlf

@mlf.feature(
    keys=["user_id"],
    source="users.parquet",
    validators={
        "age": [
            mlf.not_null(),           # Must have a value
            mlf.greater_than(0),      # Must be positive
            mlf.less_than(150),       # Must be reasonable
        ],
    }
)
def user_features(df):
    return df

Validators run in order. If any validator fails, validation stops and reports the failure.

Creating Custom Validators¶

You can create custom validators for domain-specific validation logic.

Basic Custom Validator¶

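A custom validator is created by a factory function that returns a Validator object. The Validator wraps a check function that takes a single column (a pl.Series) and returns a ValidationResult: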

from mlforge.validators import Validator, ValidationResult
import polars as pl

def is_valid_email() -> Validator:
    """Build a validator that rejects null values and values without an "@"."""

    def validate(series: pl.Series) -> ValidationResult:
        # A value is invalid when it lacks an "@" or is missing altogether.
        lacks_at_sign = ~series.str.contains("@")
        bad_mask = lacks_at_sign | series.is_null()
        bad_count = len(series.filter(bad_mask))

        if bad_count == 0:
            return ValidationResult(passed=True)

        return ValidationResult(
            passed=False,
            message=f"{bad_count} invalid email addresses",
            failed_count=bad_count,
        )

    return Validator(name="is_valid_email", fn=validate)

Use it like any built-in validator:

import mlforge as mlf

@mlf.feature(
    keys=["user_id"],
    source="users.parquet",
    validators={
        "email": [is_valid_email()],
    }
)
def user_features(df):
    return df

Parameterized Custom Validators¶

Create validators that accept parameters:

from mlforge.validators import Validator, ValidationResult
import polars as pl

def min_length(length: int) -> Validator:
    """Validate that string values have a minimum length.

    Args:
        length: Minimum number of characters each value must have.

    Returns:
        A Validator named ``min_length(<length>)`` that fails when any
        value is shorter than ``length`` characters.

    Note:
        Null values are not counted as too short, because the length
        comparison yields null for them and ``filter`` drops null mask
        entries. Combine with ``not_null()`` if nulls must be rejected.
    """

    def validate(series: pl.Series) -> ValidationResult:
        # `str.len_chars()` counts Unicode characters. The older
        # `str.lengths()` was deprecated in Polars 0.19.8 and removed in 1.0.
        too_short = series.filter(series.str.len_chars() < length)

        if len(too_short) > 0:
            return ValidationResult(
                passed=False,
                message=f"{len(too_short)} values shorter than {length}",
                failed_count=len(too_short),
            )

        return ValidationResult(passed=True)

    return Validator(name=f"min_length({length})", fn=validate)


@mlf.feature(
    keys=["user_id"],
    source="users.parquet",
    validators={
        "username": [min_length(3)],
        "password_hash": [min_length(60)],  # bcrypt hashes are 60 chars
    }
)
def user_features(df):
    return df

Business Logic Validators¶

Implement complex business rules:

from mlforge.validators import Validator, ValidationResult
import polars as pl

def is_valid_transaction() -> Validator:
    """
    Validate transaction amounts against a business rule.

    Only one rule is enforced here: no transaction may have an amount of
    exactly zero. Rules that depend on other columns (for example, refunds
    requiring a parent transaction, or high-value transactions requiring
    approval) are NOT checked by this validator, because ``validate``
    receives a single column rather than whole rows.

    Returns:
        A Validator named "is_valid_transaction" wrapping the zero-amount
        check.
    """

    def validate(series: pl.Series) -> ValidationResult:
        # Only the validated column is available here, not the full row.
        # Rules spanning several columns need a different approach, e.g. a
        # derived boolean column computed in the feature function.

        invalid = series.filter(series == 0)  # No zero-amount transactions

        if len(invalid) > 0:
            return ValidationResult(
                passed=False,
                message=f"{len(invalid)} transactions with zero amount",
                failed_count=len(invalid),
            )

        return ValidationResult(passed=True)

    return Validator(name="is_valid_transaction", fn=validate)

Statistical Validators¶

Validate statistical properties:

from mlforge.validators import Validator, ValidationResult
import polars as pl

def within_std_devs(n_std: float) -> Validator:
    """Build a validator that flags values far from the column mean.

    Args:
        n_std: Allowed distance from the mean, in standard deviations.

    Returns:
        A Validator named ``within_std_devs(<n_std>)`` that fails when any
        value lies outside ``mean ± n_std * std``.
    """

    def validate(series: pl.Series) -> ValidationResult:
        center, spread = series.mean(), series.std()

        # Without both statistics there is nothing to compare against.
        if center is None or spread is None:
            return ValidationResult(passed=True)  # Skip if insufficient data

        margin = n_std * spread
        below = series < (center - margin)
        above = series > (center + margin)
        outlier_count = len(series.filter(below | above))

        if outlier_count == 0:
            return ValidationResult(passed=True)

        return ValidationResult(
            passed=False,
            message=f"{outlier_count} outliers beyond {n_std} std devs",
            failed_count=outlier_count,
        )

    return Validator(name=f"within_std_devs({n_std})", fn=validate)


@mlf.feature(
    keys=["user_id"],
    source="transactions.parquet",
    validators={
        "amount": [within_std_devs(3)],  # Catch extreme outliers
    }
)
def transaction_features(df):
    return df

Multi-Column Validators¶

Validators receive a single column, so they cannot compare columns directly. To check a relationship between columns, compute the comparison as a boolean column in your feature function and validate that column:

import mlforge as mlf
from mlforge.validators import Validator, ValidationResult
import polars as pl

def discount_less_than_price() -> Validator:
    """Build a validator for a precomputed "discount < price" boolean column."""

    def validate(series: pl.Series) -> ValidationResult:
        # The column is expected to hold booleans produced by the feature
        # function; every False entry marks a row that breaks the rule.
        violation_count = len(series.filter(~series))

        if violation_count == 0:
            return ValidationResult(passed=True)

        return ValidationResult(
            passed=False,
            message=f"{violation_count} rows where discount >= price",
            failed_count=violation_count,
        )

    return Validator(name="discount_less_than_price", fn=validate)


@mlf.feature(
    keys=["product_id"],
    source="products.parquet",
    validators={
        "valid_pricing": [discount_less_than_price()],
    }
)
def product_features(df):
    return df.with_columns([
        (pl.col("discount") < pl.col("price")).alias("valid_pricing")
    ])

Validation in Metadata¶

Validators are tracked in feature metadata for auditing and documentation:

{
  "name": "merchant_transactions",
  "columns": [
    {
      "name": "merchant_id",
      "dtype": "String"
    },
    {
      "name": "amount",
      "dtype": "Float64",
      "validators": [
        {
          "validator": "not_null"
        },
        {
          "validator": "greater_than_or_equal",
          "value": 0
        }
      ]
    }
  ],
  "features": [...]
}

This metadata shows:

- Which columns have validators
- What validation rules are applied
- Parameters for each validator

Validation CLI Command¶

Run validation without building features:

# Validate all features
mlforge validate

# Validate specific features
mlforge validate --features merchant_transactions

# Validate by tag
mlforge validate --tags transactions

This is useful for:

- Testing new validators before building
- Validating source data changes
- CI/CD data quality checks

Best Practices¶

1. Validate at the Source¶

Add validators close to where data enters your feature store:

@mlf.feature(
    keys=["user_id"],
    source="raw/users.parquet",
    validators={
        "user_id": [mlf.not_null(), mlf.unique()],
        "created_at": [mlf.not_null()],
        "email": [mlf.not_null(), mlf.matches_regex(r"^\w+@\w+\.\w+$")],
    }
)
def user_base_features(df):
    return df

2. Use Validators for Assumptions¶

Document assumptions your feature logic makes:

@mlf.feature(
    keys=["transaction_id"],
    source="transactions.parquet",
    validators={
        "amount": [mlf.greater_than(0)],  # Feature assumes positive amounts
    }
)
def transaction_features(df):
    # This logic assumes amount > 0
    return df.with_columns([
        (100.0 / pl.col("amount")).alias("amount_inverse")
    ])

3. Keep Validators Simple¶

Each validator should check one thing:

# Good: Each validator checks one aspect
validators={
    "age": [mlf.not_null(), mlf.greater_than(0), mlf.less_than(150)]
}

# Bad: Complex multi-condition validator
validators={
    "age": [validate_age_is_valid_and_reasonable_and_not_null()]
}

4. Provide Clear Error Messages¶

Make validation failures actionable:

def is_valid_phone_number() -> Validator:
    """Build a validator for phone numbers: optional "+" then 10-15 digits."""

    def validate(series: pl.Series) -> ValidationResult:
        matches_format = series.str.contains(r"^\+?\d{10,15}$")
        bad_count = len(series.filter(~matches_format))

        if bad_count == 0:
            return ValidationResult(passed=True)

        # Spell out the expected format so the failure is actionable.
        return ValidationResult(
            passed=False,
            message=f"{bad_count} invalid phone numbers (expected format: +1234567890)",
            failed_count=bad_count,
        )

    return Validator(name="is_valid_phone_number", fn=validate)

5. Test Your Validators¶

Write tests for custom validators:

def test_min_length_validator():
    """min_length(3) must reject the two values shorter than 3 characters."""
    usernames = pl.Series(["abc", "ab", "a"])
    outcome = min_length(3)(usernames)

    assert not outcome.passed
    # "ab" and "a" are too short; "abc" meets the minimum.
    assert outcome.failed_count == 2