Edit on GitHub

sqlmesh.integrations.dlt

View Source

  1import typing as t
  2import click
  3from datetime import datetime, timedelta, timezone
  4from pydantic import ValidationError
  5from sqlglot import exp, parse_one
  6from sqlmesh.core.config.connection import parse_connection_config
  7from sqlmesh.core.context import Context
  8from sqlmesh.utils.date import yesterday_ds
  9
 10
 11def generate_dlt_models_and_settings(
 12    pipeline_name: str,
 13    dialect: str,
 14    tables: t.Optional[t.List[str]] = None,
 15    dlt_path: t.Optional[str] = None,
 16) -> t.Tuple[t.Set[t.Tuple[str, str]], t.Optional[str], str]:
 17    """
 18    This function attaches to a DLT pipeline and retrieves the connection configs and
 19    SQLMesh models based on the tables present in the pipeline's default schema.
 20
 21    Args:
 22        pipeline_name: The name of the DLT pipeline to attach to.
 23        dialect: The SQL dialect to use for generating SQLMesh models.
 24        tables: A list of table names to include.
 25        dlt_path: The path to the DLT pipelines working directory, where DLT stores
 26            pipeline state (by default ~/.dlt/pipelines).
 27
 28    Returns:
 29        A tuple containing a set of the SQLMesh model definitions, the connection config and the start date.
 30    """
 31
 32    import dlt
 33    from dlt.common.schema.utils import has_table_seen_data, is_complete_column
 34    from dlt.pipeline.exceptions import CannotRestorePipelineException
 35
 36    try:
 37        pipeline = dlt.attach(pipeline_name=pipeline_name, pipelines_dir=dlt_path or "")
 38    except CannotRestorePipelineException as e:
 39        from pathlib import Path
 40        from dlt.common.pipeline import get_dlt_pipelines_dir
 41
 42        searched_dir = dlt_path or get_dlt_pipelines_dir()
 43        msg = f"Could not attach to pipeline {pipeline_name}.\nSearched in: {searched_dir}\n{e}"
 44        if dlt_path and (Path(get_dlt_pipelines_dir()) / pipeline_name).exists():
 45            msg += (
 46                f"\nHint: A pipeline named '{pipeline_name}' exists in the default pipelines "
 47                f"working directory '{get_dlt_pipelines_dir()}'. Note that --dlt-path must "
 48                "point to the directory where DLT stores pipeline working state (by default "
 49                "~/.dlt/pipelines), not the directory containing your pipeline scripts. "
 50                "Try omitting --dlt-path."
 51            )
 52        raise click.ClickException(msg)
 53
 54    schema = pipeline.default_schema
 55    dataset = pipeline.dataset_name
 56
 57    # Get the start date from the load_ids
 58    storage_ids = list(pipeline._get_load_storage().list_loaded_packages())
 59    start_date = get_start_date(storage_ids)
 60
 61    # Get the connection credentials
 62    db_type = pipeline.destination.to_name(pipeline.destination)
 63    if db_type == "filesystem":
 64        connection_config = None
 65    else:
 66        client = pipeline.destination_client()
 67        config = client.config
 68        credentials = config.credentials
 69        configs = {
 70            key: value
 71            for key in dir(credentials)
 72            if not key.startswith("_")
 73            and not callable(value := getattr(credentials, key))
 74            and value is not None
 75        }
 76        connection_config = format_config(configs, db_type)
 77
 78    dlt_tables = {
 79        name: table
 80        for name, table in schema.tables.items()
 81        if (
 82            (has_table_seen_data(table) and not name.startswith(schema._dlt_tables_prefix))
 83            or name == schema.loads_table_name
 84        )
 85        and (name in tables if tables else True)
 86    }
 87
 88    sqlmesh_models = set()
 89    for table_name, table in dlt_tables.items():
 90        dlt_columns = {}
 91        primary_key = []
 92
 93        # is_complete_column returns true if column contains a name and a data type
 94        for col in filter(is_complete_column, table["columns"].values()):
 95            dlt_columns[col["name"]] = exp.DataType.build(str(col["data_type"]), dialect=dialect)
 96            if col.get("primary_key"):
 97                primary_key.append(str(col["name"]))
 98
 99        load_id = next(
100            (col for col in ["_dlt_load_id", "load_id"] if col in dlt_columns),
101            None,
102        )
103        load_key = "c." + load_id if load_id else ""
104        parent_table = None
105
106        # Handling for nested tables: https://dlthub.com/docs/general-usage/destination-tables#nested-tables
107        if not load_id:
108            if (
109                "_dlt_parent_id" in dlt_columns
110                and (parent_table := table["parent"])
111                and parent_table in dlt_tables
112            ):
113                load_key = "p._dlt_load_id"
114                parent_table = dataset + "." + parent_table
115            else:
116                break
117
118        column_types = [
119            exp.cast(exp.column(column, table="c"), data_type, dialect=dialect)
120            .as_(column)
121            .sql(dialect=dialect)
122            for column, data_type in dlt_columns.items()
123            if isinstance(column, str)
124        ]
125        select_columns = (
126            ",\n".join(f"  {column_name}" for column_name in column_types) if column_types else ""
127        )
128
129        grain = f"\n  grain ({', '.join(primary_key)})," if primary_key else ""
130        incremental_model_name = f"{dataset}_sqlmesh.incremental_{table_name}"
131        incremental_model_sql = generate_incremental_model(
132            incremental_model_name,
133            select_columns,
134            grain,
135            dataset + "." + table_name,
136            dialect,
137            load_key,
138            parent_table,
139        )
140        sqlmesh_models.add((incremental_model_name, incremental_model_sql))
141
142    return sqlmesh_models, connection_config, start_date
143
144
def generate_dlt_models(
    context: Context,
    pipeline_name: str,
    tables: t.List[str],
    force: bool,
    dlt_path: t.Optional[str] = None,
) -> t.List[str]:
    """
    Generate SQLMesh model files for the tables of a DLT pipeline.

    Args:
        context: The SQLMesh context; supplies the dialect, the project path and the
            already existing models.
        pipeline_name: The name of the DLT pipeline to attach to.
        tables: Table names to generate models for. When empty, all eligible tables are
            considered.
        force: When no tables are given, also regenerate models whose names already
            exist in the context.
        dlt_path: The path to the DLT pipelines working directory.

    Returns:
        The names of the models handed to `_create_object_files`, or an empty list when
        there was nothing to generate.
    """
    from sqlmesh.cli.project_init import _create_object_files

    sqlmesh_models, _, _ = generate_dlt_models_and_settings(
        pipeline_name=pipeline_name,
        dialect=context.config.dialect or "",
        tables=tables if tables else None,
        dlt_path=dlt_path,
    )

    # Existing models are only filtered out when neither specific tables nor --force
    # were requested.
    if not tables and not force:
        existing_models = {model.name for model in context.models.values()}
        sqlmesh_models = {
            (name, sql) for name, sql in sqlmesh_models if name not in existing_models
        }

    if not sqlmesh_models:
        return []

    # Files are keyed by the last dotted component of the model name.
    _create_object_files(
        context.path / "models",
        {name.split(".")[-1]: sql for name, sql in sqlmesh_models},
        "sql",
    )
    return [name for name, _ in sqlmesh_models]
173
174
def generate_incremental_model(
    model_name: str,
    select_columns: str,
    grain: str,
    from_table: str,
    dialect: str,
    load_id: str,
    parent_table: t.Optional[str] = None,
) -> str:
    """
    Generate the SQL definition for an incremental model.

    Args:
        model_name: The fully qualified name of the model.
        select_columns: The pre-rendered, comma-separated projection (may be empty).
        grain: The pre-rendered grain property, or an empty string.
        from_table: The source table, aliased as ``c`` in the query.
        dialect: The SQL dialect used to render the time column expression.
        load_id: The column expression holding the load id, e.g. ``c._dlt_load_id``.
        parent_table: Optional parent table, joined as ``p`` on
            ``c._dlt_parent_id = p._dlt_id``.

    Returns:
        The MODEL definition followed by the SELECT query, as a single string.
    """

    # The load id is cast to DOUBLE and converted to a timestamp in the target dialect.
    time_column = parse_one(f"to_timestamp(CAST({load_id} AS DOUBLE))").sql(dialect=dialect)

    from_clause = f"{from_table} as c"
    if parent_table:
        from_clause += (
            "\nJOIN\n"
            f"  {parent_table} as p\n"
            "ON\n"
            "  c._dlt_parent_id = p._dlt_id"
        )

    # Only emit the separating comma when there are columns to select; otherwise the
    # query would start with a dangling "," and be invalid SQL.
    projection = f"{select_columns},\n" if select_columns else ""

    return f"""MODEL (
  name {model_name},
  kind INCREMENTAL_BY_TIME_RANGE (
    time_column _dlt_load_time,
  ),{grain}
);

SELECT
{projection}  {time_column} as _dlt_load_time
FROM
  {from_clause}
WHERE
  {time_column} BETWEEN @start_ts AND @end_ts
"""
210
211
def format_config(configs: t.Dict[str, str], db_type: str) -> str:
    """
    Generate a string for the gateway connection config.

    Args:
        configs: The connection settings to render. ``username`` is renamed to ``user``
            and the ``password`` value is wrapped in double quotes.
        db_type: The connection type, emitted as the ``type`` field.

    Returns:
        One ``key: value`` line per field, each indented by six spaces and joined with
        newlines. Fields reported as invalid by `parse_connection_config` are omitted.
    """
    config = {
        "type": db_type,
    }

    for key, value in configs.items():
        if key == "password":
            config[key] = f'"{value}"'
        elif key == "username":
            config["user"] = value
        else:
            config[key] = value

    # Validate the connection config fields
    invalid_fields = set()
    try:
        parse_connection_config(config)
    except ValidationError as e:
        for error in e.errors():
            # An error may carry an empty location; indexing it blindly would raise
            # IndexError.
            location = error.get("loc") or ()
            if location:
                invalid_fields.add(location[0])

    return "\n".join(
        f"      {key}: {value}" for key, value in config.items() if key not in invalid_fields
    )
237
238
def get_start_date(load_ids: t.List[str]) -> str:
    """
    Convert the earliest load_id to UTC timestamp, subtract a day and format as 'YYYY-MM-DD'.

    Args:
        load_ids: Load ids, each parseable as a float number of seconds since the epoch.

    Returns:
        The formatted start date, or the result of `yesterday_ds()` when no load ids
        are given.
    """

    epoch_seconds = [float(load_id) for load_id in load_ids]
    if not epoch_seconds:
        return yesterday_ds()

    # Only the earliest load matters, so a single datetime conversion is enough.
    earliest = datetime.fromtimestamp(min(epoch_seconds), tz=timezone.utc)
    return (earliest - timedelta(days=1)).strftime("%Y-%m-%d")

def generate_dlt_models_and_settings( pipeline_name: str, dialect: str, tables: Optional[List[str]] = None, dlt_path: Optional[str] = None) -> Tuple[Set[Tuple[str, str]], Optional[str], str]: View Source

 12def generate_dlt_models_and_settings(
 13    pipeline_name: str,
 14    dialect: str,
 15    tables: t.Optional[t.List[str]] = None,
 16    dlt_path: t.Optional[str] = None,
 17) -> t.Tuple[t.Set[t.Tuple[str, str]], t.Optional[str], str]:
 18    """
 19    This function attaches to a DLT pipeline and retrieves the connection configs and
 20    SQLMesh models based on the tables present in the pipeline's default schema.
 21
 22    Args:
 23        pipeline_name: The name of the DLT pipeline to attach to.
 24        dialect: The SQL dialect to use for generating SQLMesh models.
 25        tables: A list of table names to include.
 26        dlt_path: The path to the DLT pipelines working directory, where DLT stores
 27            pipeline state (by default ~/.dlt/pipelines).
 28
 29    Returns:
 30        A tuple containing a set of the SQLMesh model definitions, the connection config and the start date.
 31    """
 32
 33    import dlt
 34    from dlt.common.schema.utils import has_table_seen_data, is_complete_column
 35    from dlt.pipeline.exceptions import CannotRestorePipelineException
 36
 37    try:
 38        pipeline = dlt.attach(pipeline_name=pipeline_name, pipelines_dir=dlt_path or "")
 39    except CannotRestorePipelineException as e:
 40        from pathlib import Path
 41        from dlt.common.pipeline import get_dlt_pipelines_dir
 42
 43        searched_dir = dlt_path or get_dlt_pipelines_dir()
 44        msg = f"Could not attach to pipeline {pipeline_name}.\nSearched in: {searched_dir}\n{e}"
 45        if dlt_path and (Path(get_dlt_pipelines_dir()) / pipeline_name).exists():
 46            msg += (
 47                f"\nHint: A pipeline named '{pipeline_name}' exists in the default pipelines "
 48                f"working directory '{get_dlt_pipelines_dir()}'. Note that --dlt-path must "
 49                "point to the directory where DLT stores pipeline working state (by default "
 50                "~/.dlt/pipelines), not the directory containing your pipeline scripts. "
 51                "Try omitting --dlt-path."
 52            )
 53        raise click.ClickException(msg)
 54
 55    schema = pipeline.default_schema
 56    dataset = pipeline.dataset_name
 57
 58    # Get the start date from the load_ids
 59    storage_ids = list(pipeline._get_load_storage().list_loaded_packages())
 60    start_date = get_start_date(storage_ids)
 61
 62    # Get the connection credentials
 63    db_type = pipeline.destination.to_name(pipeline.destination)
 64    if db_type == "filesystem":
 65        connection_config = None
 66    else:
 67        client = pipeline.destination_client()
 68        config = client.config
 69        credentials = config.credentials
 70        configs = {
 71            key: value
 72            for key in dir(credentials)
 73            if not key.startswith("_")
 74            and not callable(value := getattr(credentials, key))
 75            and value is not None
 76        }
 77        connection_config = format_config(configs, db_type)
 78
 79    dlt_tables = {
 80        name: table
 81        for name, table in schema.tables.items()
 82        if (
 83            (has_table_seen_data(table) and not name.startswith(schema._dlt_tables_prefix))
 84            or name == schema.loads_table_name
 85        )
 86        and (name in tables if tables else True)
 87    }
 88
 89    sqlmesh_models = set()
 90    for table_name, table in dlt_tables.items():
 91        dlt_columns = {}
 92        primary_key = []
 93
 94        # is_complete_column returns true if column contains a name and a data type
 95        for col in filter(is_complete_column, table["columns"].values()):
 96            dlt_columns[col["name"]] = exp.DataType.build(str(col["data_type"]), dialect=dialect)
 97            if col.get("primary_key"):
 98                primary_key.append(str(col["name"]))
 99
100        load_id = next(
101            (col for col in ["_dlt_load_id", "load_id"] if col in dlt_columns),
102            None,
103        )
104        load_key = "c." + load_id if load_id else ""
105        parent_table = None
106
107        # Handling for nested tables: https://dlthub.com/docs/general-usage/destination-tables#nested-tables
108        if not load_id:
109            if (
110                "_dlt_parent_id" in dlt_columns
111                and (parent_table := table["parent"])
112                and parent_table in dlt_tables
113            ):
114                load_key = "p._dlt_load_id"
115                parent_table = dataset + "." + parent_table
116            else:
117                break
118
119        column_types = [
120            exp.cast(exp.column(column, table="c"), data_type, dialect=dialect)
121            .as_(column)
122            .sql(dialect=dialect)
123            for column, data_type in dlt_columns.items()
124            if isinstance(column, str)
125        ]
126        select_columns = (
127            ",\n".join(f"  {column_name}" for column_name in column_types) if column_types else ""
128        )
129
130        grain = f"\n  grain ({', '.join(primary_key)})," if primary_key else ""
131        incremental_model_name = f"{dataset}_sqlmesh.incremental_{table_name}"
132        incremental_model_sql = generate_incremental_model(
133            incremental_model_name,
134            select_columns,
135            grain,
136            dataset + "." + table_name,
137            dialect,
138            load_key,
139            parent_table,
140        )
141        sqlmesh_models.add((incremental_model_name, incremental_model_sql))
142
143    return sqlmesh_models, connection_config, start_date

This function attaches to a DLT pipeline and retrieves the connection configs and SQLMesh models based on the tables present in the pipeline's default schema.

Arguments:

pipeline_name: The name of the DLT pipeline to attach to.
dialect: The SQL dialect to use for generating SQLMesh models.
tables: A list of table names to include.
dlt_path: The path to the DLT pipelines working directory, where DLT stores pipeline state (by default ~/.dlt/pipelines).

Returns:

A tuple containing a set of the SQLMesh model definitions, the connection config and the start date.

def generate_dlt_models( context: sqlmesh.core.context.Context, pipeline_name: str, tables: List[str], force: bool, dlt_path: Optional[str] = None) -> List[str]: View Source

def generate_dlt_models(
    context: Context,
    pipeline_name: str,
    tables: t.List[str],
    force: bool,
    dlt_path: t.Optional[str] = None,
) -> t.List[str]:
    """
    Generate SQLMesh model files for the tables of a DLT pipeline.

    Args:
        context: The SQLMesh context; supplies the dialect, the project path and the
            already existing models.
        pipeline_name: The name of the DLT pipeline to attach to.
        tables: Table names to generate models for. When empty, all eligible tables are
            considered.
        force: When no tables are given, also regenerate models whose names already
            exist in the context.
        dlt_path: The path to the DLT pipelines working directory.

    Returns:
        The names of the models handed to `_create_object_files`, or an empty list when
        there was nothing to generate.
    """
    from sqlmesh.cli.project_init import _create_object_files

    sqlmesh_models, _, _ = generate_dlt_models_and_settings(
        pipeline_name=pipeline_name,
        dialect=context.config.dialect or "",
        tables=tables if tables else None,
        dlt_path=dlt_path,
    )

    # Existing models are only filtered out when neither specific tables nor --force
    # were requested.
    if not tables and not force:
        existing_models = {model.name for model in context.models.values()}
        sqlmesh_models = {
            (name, sql) for name, sql in sqlmesh_models if name not in existing_models
        }

    if not sqlmesh_models:
        return []

    # Files are keyed by the last dotted component of the model name.
    _create_object_files(
        context.path / "models",
        {name.split(".")[-1]: sql for name, sql in sqlmesh_models},
        "sql",
    )
    return [name for name, _ in sqlmesh_models]

def generate_incremental_model( model_name: str, select_columns: str, grain: str, from_table: str, dialect: str, load_id: str, parent_table: Optional[str] = None) -> str: View Source

def generate_incremental_model(
    model_name: str,
    select_columns: str,
    grain: str,
    from_table: str,
    dialect: str,
    load_id: str,
    parent_table: t.Optional[str] = None,
) -> str:
    """
    Generate the SQL definition for an incremental model.

    Args:
        model_name: The fully qualified name of the model.
        select_columns: The pre-rendered, comma-separated projection (may be empty).
        grain: The pre-rendered grain property, or an empty string.
        from_table: The source table, aliased as ``c`` in the query.
        dialect: The SQL dialect used to render the time column expression.
        load_id: The column expression holding the load id, e.g. ``c._dlt_load_id``.
        parent_table: Optional parent table, joined as ``p`` on
            ``c._dlt_parent_id = p._dlt_id``.

    Returns:
        The MODEL definition followed by the SELECT query, as a single string.
    """

    # The load id is cast to DOUBLE and converted to a timestamp in the target dialect.
    time_column = parse_one(f"to_timestamp(CAST({load_id} AS DOUBLE))").sql(dialect=dialect)

    from_clause = f"{from_table} as c"
    if parent_table:
        from_clause += (
            "\nJOIN\n"
            f"  {parent_table} as p\n"
            "ON\n"
            "  c._dlt_parent_id = p._dlt_id"
        )

    # Only emit the separating comma when there are columns to select; otherwise the
    # query would start with a dangling "," and be invalid SQL.
    projection = f"{select_columns},\n" if select_columns else ""

    return f"""MODEL (
  name {model_name},
  kind INCREMENTAL_BY_TIME_RANGE (
    time_column _dlt_load_time,
  ),{grain}
);

SELECT
{projection}  {time_column} as _dlt_load_time
FROM
  {from_clause}
WHERE
  {time_column} BETWEEN @start_ts AND @end_ts
"""

Generate the SQL definition for an incremental model.

def format_config(configs: Dict[str, str], db_type: str) -> str: View Source

def format_config(configs: t.Dict[str, str], db_type: str) -> str:
    """
    Generate a string for the gateway connection config.

    Args:
        configs: The connection settings to render. ``username`` is renamed to ``user``
            and the ``password`` value is wrapped in double quotes.
        db_type: The connection type, emitted as the ``type`` field.

    Returns:
        One ``key: value`` line per field, each indented by six spaces and joined with
        newlines. Fields reported as invalid by `parse_connection_config` are omitted.
    """
    config = {
        "type": db_type,
    }

    for key, value in configs.items():
        if key == "password":
            config[key] = f'"{value}"'
        elif key == "username":
            config["user"] = value
        else:
            config[key] = value

    # Validate the connection config fields
    invalid_fields = set()
    try:
        parse_connection_config(config)
    except ValidationError as e:
        for error in e.errors():
            # An error may carry an empty location; indexing it blindly would raise
            # IndexError.
            location = error.get("loc") or ()
            if location:
                invalid_fields.add(location[0])

    return "\n".join(
        f"      {key}: {value}" for key, value in config.items() if key not in invalid_fields
    )

Generate a string for the gateway connection config.

def get_start_date(load_ids: List[str]) -> str: View Source

def get_start_date(load_ids: t.List[str]) -> str:
    """
    Convert the earliest load_id to UTC timestamp, subtract a day and format as 'YYYY-MM-DD'.

    Args:
        load_ids: Load ids, each parseable as a float number of seconds since the epoch.

    Returns:
        The formatted start date, or the result of `yesterday_ds()` when no load ids
        are given.
    """

    epoch_seconds = [float(load_id) for load_id in load_ids]
    if not epoch_seconds:
        return yesterday_ds()

    # Only the earliest load matters, so a single datetime conversion is enough.
    earliest = datetime.fromtimestamp(min(epoch_seconds), tz=timezone.utc)
    return (earliest - timedelta(days=1)).strftime("%Y-%m-%d")

Convert the earliest load_id to UTC timestamp, subtract a day and format as 'YYYY-MM-DD'.