Skip to content

Types API

The types module provides a unified type system for consistent schema handling across different compute engines (Polars, DuckDB).

Classes

DataType

mlforge.types.DataType dataclass

Immutable, hashable type representation.

This is the canonical type used throughout mlforge for type comparisons, schema hashing, and metadata storage.

Attributes:

Name Type Description
kind TypeKind

The fundamental type category

nullable bool

Whether null values are allowed (default True)

timezone str | None

Timezone for DATETIME types (e.g., "UTC", "America/New_York")

precision int | None

Decimal precision (total digits)

scale int | None

Decimal scale (digits after decimal point)

Example

Simple types

DataType(TypeKind.INT64)
DataType(TypeKind.STRING, nullable=False)

Datetime with timezone

DataType(TypeKind.DATETIME, timezone="UTC")

Decimal with precision

DataType(TypeKind.DECIMAL, precision=38, scale=10)

Source code in src/mlforge/types.py
@dataclass(frozen=True)
class DataType:
    """
    Engine-independent description of a column type.

    Instances are frozen (and therefore hashable), so mlforge can use them
    directly for type comparisons, schema hashing, and metadata storage.

    Attributes:
        kind: The fundamental type category
        nullable: Whether null values are allowed (default True)
        timezone: Timezone for DATETIME types (e.g., "UTC", "America/New_York")
        precision: Decimal precision (total digits)
        scale: Decimal scale (digits after decimal point)

    Example:
        # Simple types
        DataType(TypeKind.INT64)
        DataType(TypeKind.STRING, nullable=False)

        # Datetime with timezone
        DataType(TypeKind.DATETIME, timezone="UTC")

        # Decimal with precision
        DataType(TypeKind.DECIMAL, precision=38, scale=10)
    """

    kind: TypeKind
    nullable: bool = True
    timezone: str | None = None
    precision: int | None = None
    scale: int | None = None

    def to_canonical_string(self) -> str:
        """
        Render this type as its canonical string.

        The canonical string is the form written to metadata files: it is
        meant to be human readable and to hash consistently across engines.
        Nullability is not part of the string.

        Returns:
            The kind's value, followed by "[timezone]" for a DATETIME with a
            timezone, or "[precision,scale]" / "[precision]" for a DECIMAL
            with a precision (e.g., "int64", "datetime[UTC]")
        """
        name = self.kind.value

        if self.kind == TypeKind.DATETIME:
            # An unset (or empty) timezone leaves the bare kind name
            return f"{name}[{self.timezone}]" if self.timezone else name

        if self.kind == TypeKind.DECIMAL and self.precision is not None:
            if self.scale is None:
                return f"{name}[{self.precision}]"
            return f"{name}[{self.precision},{self.scale}]"

        return name

    def to_json(self) -> dict[str, Any]:
        """
        Build a JSON-compatible dict describing this type.

        Only "kind" is always present; "nullable" is written only when it is
        False, and the remaining fields only when they are not None.

        Returns:
            Dictionary suitable for JSON serialization
        """
        payload: dict[str, Any] = {"kind": self.kind.value}

        if not self.nullable:
            payload["nullable"] = False

        # Optional fields, in the order they appear in the output
        optional_fields = (
            ("timezone", self.timezone),
            ("precision", self.precision),
            ("scale", self.scale),
        )
        payload.update(
            {key: value for key, value in optional_fields if value is not None}
        )

        return payload

    @classmethod
    def from_json(cls, data: dict[str, Any]) -> "DataType":
        """
        Rebuild a DataType from the dict form produced by to_json.

        Args:
            data: Dictionary from JSON deserialization; "kind" is required,
                a missing "nullable" means True, other missing keys mean None

        Returns:
            DataType instance
        """
        fields: dict[str, Any] = {
            "kind": TypeKind(data["kind"]),
            "nullable": data.get("nullable", True),
        }
        for key in ("timezone", "precision", "scale"):
            fields[key] = data.get(key)

        return cls(**fields)

    @classmethod
    def from_canonical_string(cls, s: str) -> "DataType":
        """
        Build a DataType from its canonical string.

        Args:
            s: Canonical type string (e.g., "int64", "datetime[UTC]")

        Returns:
            DataType instance
        """
        # Plain kind names carry no bracketed parameters
        has_params = "[" in s and s.endswith("]")
        if not has_params:
            return cls(kind=TypeKind(s))

        # "datetime[UTC]" -> ("datetime", "UTC"); "decimal[38,10]" -> ("decimal", "38,10")
        name, _, args = s[:-1].partition("[")
        kind = TypeKind(name)

        if kind == TypeKind.DATETIME:
            return cls(kind=kind, timezone=args)

        if kind == TypeKind.DECIMAL:
            first, *rest = args.split(",")
            precision = int(first)
            scale = int(rest[0]) if rest else None
            return cls(kind=kind, precision=precision, scale=scale)

        # Other kinds take no parameters: look the whole string up as a kind
        return cls(kind=TypeKind(s))

    def is_numeric(self) -> bool:
        """Return True for integer, floating point, and decimal kinds."""
        integer_kinds = {
            TypeKind.INT8,
            TypeKind.INT16,
            TypeKind.INT32,
            TypeKind.INT64,
            TypeKind.UINT8,
            TypeKind.UINT16,
            TypeKind.UINT32,
            TypeKind.UINT64,
        }
        fractional_kinds = {TypeKind.FLOAT32, TypeKind.FLOAT64, TypeKind.DECIMAL}
        return self.kind in integer_kinds | fractional_kinds

    def is_integer(self) -> bool:
        """Return True for signed and unsigned integer kinds."""
        signed_kinds = {
            TypeKind.INT8,
            TypeKind.INT16,
            TypeKind.INT32,
            TypeKind.INT64,
        }
        unsigned_kinds = {
            TypeKind.UINT8,
            TypeKind.UINT16,
            TypeKind.UINT32,
            TypeKind.UINT64,
        }
        return self.kind in signed_kinds | unsigned_kinds

    def is_floating(self) -> bool:
        """Return True for the FLOAT32 and FLOAT64 kinds."""
        floating_kinds = {TypeKind.FLOAT32, TypeKind.FLOAT64}
        return self.kind in floating_kinds

    def is_temporal(self) -> bool:
        """Return True for date, datetime, time, and duration kinds."""
        temporal_kinds = {
            TypeKind.DATE,
            TypeKind.DATETIME,
            TypeKind.TIME,
            TypeKind.DURATION,
        }
        return self.kind in temporal_kinds

from_canonical_string classmethod

from_canonical_string(s: str) -> DataType

Parse canonical string representation.

Parameters:

Name Type Description Default
s str

Canonical type string (e.g., "int64", "datetime[UTC]")

required

Returns:

Type Description
DataType

DataType instance

Source code in src/mlforge/types.py
@classmethod
def from_canonical_string(cls, s: str) -> "DataType":
    """
    Build a DataType from its canonical string.

    Args:
        s: Canonical type string (e.g., "int64", "datetime[UTC]")

    Returns:
        DataType instance
    """
    # Plain kind names carry no bracketed parameters
    has_params = "[" in s and s.endswith("]")
    if not has_params:
        return cls(kind=TypeKind(s))

    # "datetime[UTC]" -> ("datetime", "UTC"); "decimal[38,10]" -> ("decimal", "38,10")
    name, _, args = s[:-1].partition("[")
    kind = TypeKind(name)

    if kind == TypeKind.DATETIME:
        return cls(kind=kind, timezone=args)

    if kind == TypeKind.DECIMAL:
        first, *rest = args.split(",")
        precision = int(first)
        scale = int(rest[0]) if rest else None
        return cls(kind=kind, precision=precision, scale=scale)

    # Other kinds take no parameters: look the whole string up as a kind
    return cls(kind=TypeKind(s))

from_json classmethod

from_json(data: dict[str, Any]) -> DataType

Deserialize from JSON dict.

Parameters:

Name Type Description Default
data dict[str, Any]

Dictionary from JSON deserialization

required

Returns:

Type Description
DataType

DataType instance

Source code in src/mlforge/types.py
@classmethod
def from_json(cls, data: dict[str, Any]) -> "DataType":
    """
    Rebuild a DataType from its JSON dict form.

    Args:
        data: Dictionary from JSON deserialization; "kind" is required,
            a missing "nullable" means True, other missing keys mean None

    Returns:
        DataType instance
    """
    fields: dict[str, Any] = {
        "kind": TypeKind(data["kind"]),
        "nullable": data.get("nullable", True),
    }
    for key in ("timezone", "precision", "scale"):
        fields[key] = data.get(key)

    return cls(**fields)

is_floating

is_floating() -> bool

Check if this is a floating point type.

Source code in src/mlforge/types.py
def is_floating(self) -> bool:
    """Return True for the FLOAT32 and FLOAT64 kinds."""
    floating_kinds = {TypeKind.FLOAT32, TypeKind.FLOAT64}
    return self.kind in floating_kinds

is_integer

is_integer() -> bool

Check if this is an integer type.

Source code in src/mlforge/types.py
def is_integer(self) -> bool:
    """Return True for signed and unsigned integer kinds."""
    signed_kinds = {
        TypeKind.INT8,
        TypeKind.INT16,
        TypeKind.INT32,
        TypeKind.INT64,
    }
    unsigned_kinds = {
        TypeKind.UINT8,
        TypeKind.UINT16,
        TypeKind.UINT32,
        TypeKind.UINT64,
    }
    return self.kind in signed_kinds | unsigned_kinds

is_numeric

is_numeric() -> bool

Check if this is a numeric type.

Source code in src/mlforge/types.py
def is_numeric(self) -> bool:
    """Return True for integer, floating point, and decimal kinds."""
    integer_kinds = {
        TypeKind.INT8,
        TypeKind.INT16,
        TypeKind.INT32,
        TypeKind.INT64,
        TypeKind.UINT8,
        TypeKind.UINT16,
        TypeKind.UINT32,
        TypeKind.UINT64,
    }
    fractional_kinds = {TypeKind.FLOAT32, TypeKind.FLOAT64, TypeKind.DECIMAL}
    return self.kind in integer_kinds | fractional_kinds

is_temporal

is_temporal() -> bool

Check if this is a temporal type.

Source code in src/mlforge/types.py
def is_temporal(self) -> bool:
    """Return True for date, datetime, time, and duration kinds."""
    temporal_kinds = {
        TypeKind.DATE,
        TypeKind.DATETIME,
        TypeKind.TIME,
        TypeKind.DURATION,
    }
    return self.kind in temporal_kinds

to_canonical_string

to_canonical_string() -> str

Convert to canonical string representation.

This is the format used in metadata files for human readability and consistent hashing across engines.

Returns:

Type Description
str

Canonical type string (e.g., "int64", "datetime[UTC]")

Source code in src/mlforge/types.py
def to_canonical_string(self) -> str:
    """
    Render this type as its canonical string.

    The canonical string is the form written to metadata files: it is
    meant to be human readable and to hash consistently across engines.
    Nullability is not part of the string.

    Returns:
        The kind's value, followed by "[timezone]" for a DATETIME with a
        timezone, or "[precision,scale]" / "[precision]" for a DECIMAL
        with a precision (e.g., "int64", "datetime[UTC]")
    """
    name = self.kind.value

    if self.kind == TypeKind.DATETIME:
        # An unset (or empty) timezone leaves the bare kind name
        return f"{name}[{self.timezone}]" if self.timezone else name

    if self.kind == TypeKind.DECIMAL and self.precision is not None:
        if self.scale is None:
            return f"{name}[{self.precision}]"
        return f"{name}[{self.precision},{self.scale}]"

    return name

to_json

to_json() -> dict[str, Any]

Serialize to JSON-compatible dict.

Returns:

Type Description
dict[str, Any]

Dictionary suitable for JSON serialization

Source code in src/mlforge/types.py
def to_json(self) -> dict[str, Any]:
    """
    Build a JSON-compatible dict describing this type.

    Only "kind" is always present; "nullable" is written only when it is
    False, and the remaining fields only when they are not None.

    Returns:
        Dictionary suitable for JSON serialization
    """
    payload: dict[str, Any] = {"kind": self.kind.value}

    if not self.nullable:
        payload["nullable"] = False

    # Optional fields, in the order they appear in the output
    optional_fields = (
        ("timezone", self.timezone),
        ("precision", self.precision),
        ("scale", self.scale),
    )
    payload.update(
        {key: value for key, value in optional_fields if value is not None}
    )

    return payload

TypeKind

mlforge.types.TypeKind

Bases: Enum

Canonical type kinds for mlforge.

Type names are Arrow-compatible for industry standard interoperability.

Source code in src/mlforge/types.py
class TypeKind(Enum):
    """
    Canonical type kinds for mlforge.

    Type names are Arrow-compatible for industry standard interoperability.

    Each member's value is the lowercase name that DataType writes into its
    canonical string and JSON forms, and that DataType looks up when parsing
    those forms back.
    """

    # Integer types (signed); the numeric suffix is the width in bits
    INT8 = "int8"
    INT16 = "int16"
    INT32 = "int32"
    INT64 = "int64"

    # Integer types (unsigned)
    UINT8 = "uint8"
    UINT16 = "uint16"
    UINT32 = "uint32"
    UINT64 = "uint64"

    # Floating point types
    FLOAT32 = "float32"
    FLOAT64 = "float64"

    # String type
    STRING = "string"

    # Boolean type
    BOOLEAN = "boolean"

    # Temporal types (DATETIME may carry a timezone on DataType)
    DATE = "date"
    DATETIME = "datetime"
    TIME = "time"
    DURATION = "duration"

    # Complex types (for future expansion)
    # DECIMAL may carry precision/scale on DataType
    DECIMAL = "decimal"
    LIST = "list"
    STRUCT = "struct"

    # Fallback for unknown types (used when an engine type has no mapping)
    UNKNOWN = "unknown"

Functions

from_polars

mlforge.types.from_polars

from_polars(dtype: Any) -> DataType

Convert Polars dtype to canonical DataType.

Parameters:

Name Type Description Default
dtype Any

Polars DataType object (e.g., pl.Int64, pl.Utf8)

required

Returns:

Type Description
DataType

Canonical DataType representation

Example

from_polars(pl.Int64)  # DataType(TypeKind.INT64)
from_polars(pl.Datetime("us", "UTC"))  # DataType(TypeKind.DATETIME, timezone="UTC")

Source code in src/mlforge/types.py
def from_polars(dtype: Any) -> DataType:
    """
    Convert Polars dtype to canonical DataType.

    Datetime dtypes keep their timezone. Every other dtype is resolved by
    name through the module's Polars lookup table; names missing from that
    table become TypeKind.UNKNOWN.

    Args:
        dtype: Polars DataType object (e.g., pl.Int64, pl.Utf8)

    Returns:
        Canonical DataType representation

    Example:
        from_polars(pl.Int64)  # DataType(TypeKind.INT64)
        from_polars(pl.Datetime("us", "UTC"))  # DataType(TypeKind.DATETIME, timezone="UTC")
    """
    # The string form is "Int64" for simple dtypes and e.g.
    # "Datetime(time_unit='us', time_zone='UTC')" for parameterized ones
    dtype_str = str(dtype)

    if dtype_str.startswith("Datetime"):
        # Read the timezone straight off the dtype, so Polars need not be
        # imported here. A dtype without a time_zone attribute (e.g. the
        # bare, unparameterized Datetime) is treated as timezone-naive.
        return DataType(
            TypeKind.DATETIME, timezone=getattr(dtype, "time_zone", None)
        )

    # Drop any parameter list: "Int64" stays "Int64", "List(Int64)" -> "List"
    base_name = dtype_str.split("(")[0].strip()

    # NOTE(review): parameters of non-Datetime dtypes are discarded here, so a
    # Polars Decimal would lose its precision/scale while from_duckdb keeps
    # them — confirm whether that asymmetry is intended.
    kind = _POLARS_TO_KIND.get(base_name, TypeKind.UNKNOWN)
    return DataType(kind)

from_duckdb

mlforge.types.from_duckdb

from_duckdb(dtype_str: str) -> DataType

Convert DuckDB type string to canonical DataType.

Parameters:

Name Type Description Default
dtype_str str

DuckDB type string (e.g., "BIGINT", "VARCHAR")

required

Returns:

Type Description
DataType

Canonical DataType representation

Example

from_duckdb("BIGINT")  # DataType(TypeKind.INT64)
from_duckdb("DOUBLE")  # DataType(TypeKind.FLOAT64)

Source code in src/mlforge/types.py
def from_duckdb(dtype_str: str) -> DataType:
    """
    Convert DuckDB type string to canonical DataType.

    Matching is case-insensitive and ignores surrounding or repeated
    whitespace. Timestamps and parameterized decimals are handled
    explicitly; every other type is resolved by its base name (the text
    before any "(") through the module's DuckDB lookup table, and names
    missing from that table become TypeKind.UNKNOWN.

    Args:
        dtype_str: DuckDB type string (e.g., "BIGINT", "VARCHAR")

    Returns:
        Canonical DataType representation

    Raises:
        ValueError: If the precision or scale of a DECIMAL/NUMERIC type is
            not an integer

    Example:
        from_duckdb("BIGINT")  # DataType(TypeKind.INT64)
        from_duckdb("DOUBLE")  # DataType(TypeKind.FLOAT64)
        from_duckdb("DECIMAL(10,2)")  # DataType(TypeKind.DECIMAL, precision=10, scale=2)
    """
    # Uppercase and collapse whitespace runs so multi-word names compare exactly
    dtype_upper = " ".join(dtype_str.upper().split())

    # Base name before any parameter list, e.g. "DECIMAL" from "DECIMAL(10,2)"
    base_type, open_paren, remainder = dtype_upper.partition("(")
    base_type = base_type.strip()

    # Timestamps are decided on the base name and the suffix, not on a
    # substring search: a nested type such as
    # "STRUCT(ts TIMESTAMP WITH TIME ZONE)" is not itself a timestamp, and
    # "WITHOUT TIME ZONE" must not be read as timezone-aware.
    if base_type.startswith("TIMESTAMP"):
        if dtype_upper.endswith(" WITH TIME ZONE"):
            # The type string carries no zone name, so UTC is used
            return DataType(TypeKind.DATETIME, timezone="UTC")
        if dtype_upper.endswith(" WITHOUT TIME ZONE"):
            return DataType(TypeKind.DATETIME)

    # Handle DECIMAL with precision/scale
    if base_type in ("DECIMAL", "NUMERIC") and open_paren:
        params, _, trailing = remainder.partition(")")
        if trailing.strip():
            # Anything after the closing parenthesis (for example a "[]"
            # list marker) means this is not a scalar decimal
            return DataType(TypeKind.UNKNOWN)

        parts = params.split(",")
        precision = int(parts[0].strip())
        # A decimal declared with only a precision has scale 0
        scale = int(parts[1].strip()) if len(parts) > 1 else 0
        return DataType(TypeKind.DECIMAL, precision=precision, scale=scale)

    kind = _DUCKDB_TO_KIND.get(base_type, TypeKind.UNKNOWN)
    return DataType(kind)