Skip to content

Pipeline API

odibi.pipeline

Pipeline executor and orchestration.

PipelineManager

Manages multiple pipelines from a YAML configuration.

Source code in odibi/pipeline.py
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
class PipelineManager:
    """Manages multiple pipelines from a YAML configuration."""

    def __init__(
        self,
        project_config: ProjectConfig,
        connections: Dict[str, Any],
    ):
        """Initialize pipeline manager.

        Side effects, in order: configures global logging, creates the
        OpenLineage adapter, optionally bootstraps the System Catalog
        (Spark-backed when the engine is "spark", pandas-backed otherwise),
        then builds one ``Pipeline`` instance per configured pipeline.

        Args:
            project_config: Validated project configuration
            connections: Connection objects (already instantiated)
        """
        self.project_config = project_config
        self.connections = connections
        # Pipeline name -> Pipeline instance; a duplicate name later in the
        # config overwrites the earlier entry (environment override).
        self._pipelines: Dict[str, Pipeline] = {}
        self.catalog_manager = None
        self.lineage_adapter = None

        # Configure logging
        configure_logging(
            structured=project_config.logging.structured, level=project_config.logging.level.value
        )

        # Create manager-level logging context
        self._ctx = create_logging_context(engine=project_config.engine)

        self._ctx.info(
            "Initializing PipelineManager",
            project=project_config.project,
            engine=project_config.engine,
            pipeline_count=len(project_config.pipelines),
            connection_count=len(connections),
        )

        # Initialize Lineage Adapter
        self.lineage_adapter = OpenLineageAdapter(project_config.lineage)

        # Initialize CatalogManager if configured
        if project_config.system:
            from odibi.catalog import CatalogManager

            spark = None
            engine_instance = None

            if project_config.engine == "spark":
                # Guarded local import so non-Spark projects don't need pyspark.
                try:
                    from odibi.engine.spark_engine import SparkEngine

                    temp_engine = SparkEngine(connections=connections, config={})
                    spark = temp_engine.spark
                    self._ctx.debug("Spark session initialized for System Catalog")
                except Exception as e:
                    self._ctx.warning(
                        f"Failed to initialize Spark for System Catalog: {e}",
                        suggestion="Check Spark configuration",
                    )

            sys_conn = connections.get(project_config.system.connection)
            if sys_conn:
                base_path = sys_conn.get_path(project_config.system.path)

                # Fall back to a pandas engine when no Spark session is available.
                if not spark:
                    try:
                        from odibi.engine.pandas_engine import PandasEngine

                        engine_instance = PandasEngine(config={})
                        self._ctx.debug("PandasEngine initialized for System Catalog")
                    except Exception as e:
                        self._ctx.warning(
                            f"Failed to initialize PandasEngine for System Catalog: {e}"
                        )

                if spark or engine_instance:
                    self.catalog_manager = CatalogManager(
                        spark=spark,
                        config=project_config.system,
                        base_path=base_path,
                        engine=engine_instance,
                        connection=sys_conn,
                    )
                    # Set project name for tagging all catalog records
                    self.catalog_manager.project = project_config.project

                    # Skip bootstrap if catalog writes are disabled
                    skip_catalog = (
                        getattr(project_config.performance, "skip_catalog_writes", False)
                        if project_config.performance
                        else False
                    )
                    if not skip_catalog:
                        self.catalog_manager.bootstrap()
                        self._ctx.info(
                            "System Catalog initialized",
                            path=base_path,
                            project=project_config.project,
                        )
                    else:
                        self._ctx.debug(
                            "System Catalog bootstrap skipped (skip_catalog_writes=true)"
                        )
            else:
                self._ctx.warning(
                    f"System connection '{project_config.system.connection}' not found",
                    suggestion="Configure the system connection in your config",
                )

        # Get story configuration
        story_config = self._get_story_config()

        # Create all pipeline instances
        self._ctx.debug(
            "Creating pipeline instances",
            pipelines=[p.pipeline for p in project_config.pipelines],
        )
        for pipeline_config in project_config.pipelines:
            pipeline_name = pipeline_config.pipeline

            if pipeline_name in self._pipelines:
                self._ctx.warning(
                    f"Pipeline '{pipeline_name}' defined multiple times, "
                    f"using last definition (environment override)",
                    pipeline=pipeline_name,
                )

            self._pipelines[pipeline_name] = Pipeline(
                pipeline_config=pipeline_config,
                engine=project_config.engine,
                connections=connections,
                generate_story=story_config.get("auto_generate", True),
                story_config=story_config,
                retry_config=project_config.retry,
                alerts=project_config.alerts,
                performance_config=project_config.performance,
                catalog_manager=self.catalog_manager,
                lineage_adapter=self.lineage_adapter,
            )
            # Give each pipeline a back-reference to the full project config.
            self._pipelines[pipeline_name].project_config = project_config

        self._ctx.info(
            "PipelineManager ready",
            pipelines=list(self._pipelines.keys()),
        )

    def _get_story_config(self) -> Dict[str, Any]:
        """Assemble StoryGenerator settings from ``project_config.story``.

        The story output path is resolved through the configured connection,
        and any connection-level credentials are forwarded as
        ``storage_options``.

        Returns:
            Dictionary for StoryGenerator initialization
        """
        cfg = self.project_config.story

        # Resolve the output location through the story connection.
        conn = self.connections[cfg.connection]
        resolved_path = conn.get_path(cfg.path)

        # Forward storage credentials when the connection exposes them.
        opts = (
            conn.pandas_storage_options()
            if hasattr(conn, "pandas_storage_options")
            else {}
        )

        # Serialize the optional docs sub-config to a plain dict.
        docs = cfg.docs.model_dump() if cfg.docs else None

        return {
            "auto_generate": cfg.auto_generate,
            "max_sample_rows": cfg.max_sample_rows,
            "output_path": resolved_path,
            "storage_options": opts,
            "async_generation": cfg.async_generation,
            "docs": docs,
        }

    @classmethod
    def from_yaml(
        cls: Type["PipelineManager"], yaml_path: str, env: Optional[str] = None
    ) -> "PipelineManager":
        """Create PipelineManager from YAML file.

        Loads an optional ``.env`` next to the config file, auto-discovers a
        ``transforms.py`` in the config directory (and the current working
        directory when it differs), validates the YAML into a
        ``ProjectConfig``, and builds the connection objects.

        Args:
            yaml_path: Path to YAML configuration file
            env: Environment name to apply overrides (e.g. 'prod')

        Returns:
            PipelineManager instance ready to run pipelines

        Raises:
            FileNotFoundError: If the YAML file does not exist.

        Example:
            >>> manager = PipelineManager.from_yaml("config.yaml", env="prod")
            >>> results = manager.run()  # Run all pipelines
        """
        logger.info(f"Loading configuration from: {yaml_path}")

        register_standard_library()

        yaml_path_obj = Path(yaml_path)
        config_dir = yaml_path_obj.parent.absolute()

        import importlib.util
        import os
        import sys

        # Load .env file from config directory if it exists
        env_file = config_dir / ".env"
        if env_file.exists():
            try:
                from dotenv import load_dotenv

                # override=True: values from .env win over pre-existing env vars.
                load_dotenv(env_file, override=True)
                logger.debug(f"Loaded environment from: {env_file}")
            except ImportError:
                logger.warning("python-dotenv not installed, skipping .env file")

        def load_transforms_module(path: str) -> None:
            # Best-effort import of user transform functions; failures are
            # logged but never abort configuration loading.
            if os.path.exists(path):
                try:
                    spec = importlib.util.spec_from_file_location("transforms_autodiscovered", path)
                    if spec and spec.loader:
                        module = importlib.util.module_from_spec(spec)
                        sys.modules["transforms_autodiscovered"] = module
                        spec.loader.exec_module(module)
                        logger.info(f"Auto-loaded transforms from: {path}")
                except Exception as e:
                    logger.warning(f"Failed to auto-load transforms from {path}: {e}")

        load_transforms_module(os.path.join(config_dir, "transforms.py"))

        # Also check the working directory when it differs from the config dir.
        cwd = os.getcwd()
        if os.path.abspath(cwd) != str(config_dir):
            load_transforms_module(os.path.join(cwd, "transforms.py"))

        try:
            config = load_yaml_with_env(str(yaml_path_obj), env=env)
            logger.debug("Configuration loaded successfully")
        except FileNotFoundError:
            # Re-raise with an actionable message for the user.
            logger.error(f"YAML file not found: {yaml_path}")
            raise FileNotFoundError(
                f"YAML file not found: {yaml_path}. "
                f"Verify the file exists and consider using an absolute path."
            )

        project_config = ProjectConfig(**config)
        logger.debug(
            "Project config validated",
            project=project_config.project,
            pipelines=len(project_config.pipelines),
        )

        connections = cls._build_connections(project_config.connections)

        return cls(
            project_config=project_config,
            connections=connections,
        )

    @staticmethod
    def _build_connections(conn_configs: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
        """Convert connection configs to connection objects.

        Registers built-in connection factories and plugins, instantiates
        each configured connection via its factory, then (best-effort)
        configures connections in parallel.

        Args:
            conn_configs: Connection configurations from ProjectConfig

        Returns:
            Dictionary of connection name -> connection object

        Raises:
            ValueError: If connection type is not supported, or a factory
                fails to create the connection.
        """
        from odibi.connections.factory import register_builtins

        logger.debug(f"Building {len(conn_configs)} connections")

        connections = {}

        register_builtins()
        load_plugins()

        for conn_name, conn_config in conn_configs.items():
            # Normalize pydantic models to plain dicts: v2 exposes
            # model_dump(), v1-style objects expose dict().
            if hasattr(conn_config, "model_dump"):
                conn_config = conn_config.model_dump()
            elif hasattr(conn_config, "dict"):
                # Fixed: this branch previously called model_dump(), which
                # does not exist on objects that only provide .dict().
                conn_config = conn_config.dict()

            conn_type = conn_config.get("type", "local")

            factory = get_connection_factory(conn_type)
            if factory:
                try:
                    connections[conn_name] = factory(conn_name, conn_config)
                    logger.debug(
                        f"Connection created: {conn_name}",
                        type=conn_type,
                    )
                except Exception as e:
                    logger.error(
                        f"Failed to create connection '{conn_name}'",
                        type=conn_type,
                        error=str(e),
                    )
                    raise ValueError(
                        f"Failed to create connection '{conn_name}' (type={conn_type}): {e}"
                    ) from e
            else:
                logger.error(
                    f"Unsupported connection type: {conn_type}",
                    connection=conn_name,
                    suggestion="Check supported connection types in docs",
                )
                raise ValueError(
                    f"Unsupported connection type: {conn_type}. "
                    f"Supported types: local, azure_adls, azure_sql, delta, etc. "
                    f"See docs for connection setup."
                )

        # Best-effort parallel configuration; errors are logged, not raised.
        try:
            from odibi.utils import configure_connections_parallel

            connections, errors = configure_connections_parallel(connections, verbose=False)
            if errors:
                for error in errors:
                    logger.warning(error)
        except ImportError:
            pass

        logger.info(f"Built {len(connections)} connections successfully")

        return connections

    def register_outputs(
        self,
        pipelines: Optional[Union[str, List[str]]] = None,
    ) -> Dict[str, int]:
        """
        Pre-register node outputs from pipeline configs without running them.

        Scans pipeline nodes for output locations (write blocks, merge/scd2 params)
        and registers them to meta_outputs, enabling cross-pipeline references
        before the source pipelines have ever run.

        Args:
            pipelines: Pipeline name(s) to register. If None, registers all pipelines.

        Returns:
            Dict mapping pipeline name to number of outputs registered

        Example:
            >>> manager = PipelineManager.from_yaml("pipelines.yaml")
            >>> counts = manager.register_outputs("silver")  # Register just silver
            >>> counts = manager.register_outputs()  # Register all pipelines
        """
        # Normalize the selection to a list of pipeline names.
        if pipelines is None:
            selected = list(self._pipelines.keys())
        elif isinstance(pipelines, str):
            selected = [pipelines]
        else:
            selected = pipelines

        counts: Dict[str, int] = {}
        for pipeline_name in selected:
            target = self._pipelines.get(pipeline_name)
            if target is None:
                # Unknown names are skipped with a warning, not an error.
                self._ctx.warning(f"Pipeline not found: {pipeline_name}")
                continue
            counts[pipeline_name] = target.register_outputs()

        total = sum(counts.values())
        self._ctx.info(f"Pre-registered {total} outputs from {len(counts)} pipelines")
        return counts

    def run(
        self,
        pipelines: Optional[Union[str, List[str]]] = None,
        dry_run: bool = False,
        resume_from_failure: bool = False,
        parallel: bool = False,
        max_workers: int = 4,
        on_error: Optional[str] = None,
        tag: Optional[str] = None,
        node: Optional[Union[str, List[str]]] = None,
        console: bool = False,
    ) -> Union[PipelineResults, Dict[str, PipelineResults]]:
        """Run one, multiple, or all pipelines.

        Args:
            pipelines: Pipeline name(s) to run.
            dry_run: Whether to simulate execution.
            resume_from_failure: Whether to skip successfully completed nodes from last run.
            parallel: Whether to run nodes in parallel.
            max_workers: Maximum number of worker threads for parallel execution.
            on_error: Override error handling strategy (fail_fast, fail_later, ignore).
            tag: Filter nodes by tag (only nodes with this tag will run).
            node: Run only specific node(s) by name - can be a string or list of strings.
            console: Whether to show rich console output with progress.

        Returns:
            PipelineResults or Dict of results
        """
        import time

        t_start = time.time()
        overhead_timings = {}

        if pipelines is None:
            pipeline_names = list(self._pipelines.keys())
        elif isinstance(pipelines, str):
            pipeline_names = [pipelines]
        else:
            pipeline_names = pipelines

        for name in pipeline_names:
            if name not in self._pipelines:
                available = ", ".join(self._pipelines.keys())
                self._ctx.error(
                    f"Pipeline not found: {name}",
                    available=list(self._pipelines.keys()),
                )
                raise ValueError(f"Pipeline '{name}' not found. Available pipelines: {available}")

        # Phase 2: Auto-register pipelines and nodes before execution
        t_register_start = time.time()
        if self.catalog_manager:
            self._auto_register_pipelines(pipeline_names)
        t_register_end = time.time()
        overhead_timings["auto_register"] = t_register_end - t_register_start

        self._ctx.info(
            f"Running {len(pipeline_names)} pipeline(s)",
            pipelines=pipeline_names,
            dry_run=dry_run,
            parallel=parallel,
        )

        results = {}
        inter_pipeline_gaps = []
        lineage_futures = []  # Track incremental lineage building

        for idx, name in enumerate(pipeline_names):
            t_pre_pipeline = time.time()

            self._ctx.info(
                f"Executing pipeline {idx + 1}/{len(pipeline_names)}: {name}",
                pipeline=name,
                order=idx + 1,
            )

            t_pipeline_start = time.time()
            results[name] = self._pipelines[name].run(
                dry_run=dry_run,
                resume_from_failure=resume_from_failure,
                parallel=parallel,
                max_workers=max_workers,
                on_error=on_error,
                tag=tag,
                node=node,
                console=console,
            )

            result = results[name]
            status = "SUCCESS" if not result.failed else "FAILED"
            self._ctx.info(
                f"Pipeline {status}: {name}",
                status=status,
                duration_s=round(result.duration, 2),
                completed=len(result.completed),
                failed=len(result.failed),
            )

            if result.story_path:
                self._ctx.debug(f"Story generated: {result.story_path}")

            # Start building lineage for this pipeline incrementally (if async enabled)
            has_story = hasattr(self.project_config, "story") and self.project_config.story
            generate_lineage_enabled = has_story and self.project_config.story.generate_lineage
            async_lineage = True
            if self.project_config and self.project_config.system:
                async_lineage = getattr(self.project_config.system, "async_lineage", True)

            if generate_lineage_enabled and async_lineage and result.story_path:
                # Build this pipeline's lineage piece in background
                from concurrent.futures import ThreadPoolExecutor

                if not hasattr(self, "_lineage_executor"):
                    self._lineage_executor = ThreadPoolExecutor(
                        max_workers=3, thread_name_prefix="Lineage"
                    )

                future = self._lineage_executor.submit(
                    self._build_pipeline_lineage_piece, name, result.story_path
                )
                lineage_futures.append((name, future))
                self._ctx.debug(f"Started incremental lineage building for {name}")

            # Track inter-pipeline overhead (time between pipelines excluding actual execution)
            if idx > 0:
                gap = t_pipeline_start - t_pre_pipeline
                inter_pipeline_gaps.append(gap)

        overhead_timings["inter_pipeline_gaps_total"] = sum(inter_pipeline_gaps)

        # Generate combined lineage if configured
        t_lineage_start = time.time()
        has_story = hasattr(self.project_config, "story") and self.project_config.story
        generate_lineage_enabled = has_story and self.project_config.story.generate_lineage
        async_lineage = True
        if self.project_config and self.project_config.system:
            async_lineage = getattr(self.project_config.system, "async_lineage", True)

        self._ctx.debug(
            "Lineage check",
            has_story=has_story,
            generate_lineage_enabled=generate_lineage_enabled,
        )

        if generate_lineage_enabled:
            # Flush any pending async story writes before generating lineage
            self._ctx.info("Generating combined lineage...")
            self.flush_stories()

            try:
                if async_lineage and lineage_futures:
                    # Merge incrementally-built lineage pieces (fast!)
                    self._ctx.debug(
                        f"Waiting for {len(lineage_futures)} incremental lineage pieces..."
                    )
                    lineage_pieces = []
                    for pipeline_name, future in lineage_futures:
                        try:
                            piece = future.result(timeout=30)  # Should be done already
                            if piece:
                                lineage_pieces.append(piece)
                            self._ctx.debug(f"Got lineage piece for {pipeline_name}")
                        except Exception as e:
                            self._ctx.warning(
                                f"Failed to get lineage piece for {pipeline_name}: {e}"
                            )

                    # Merge the pieces
                    if lineage_pieces:
                        lineage_result = self._merge_lineage_pieces(lineage_pieces)
                        if lineage_result:
                            self._ctx.info(
                                "Combined lineage generated from incremental pieces",
                                nodes=len(lineage_result.nodes),
                                edges=len(lineage_result.edges),
                                json_path=lineage_result.json_path,
                            )
                    else:
                        self._ctx.warning("No lineage pieces generated")

                    # Cleanup executor
                    if hasattr(self, "_lineage_executor"):
                        self._lineage_executor.shutdown(wait=False)
                else:
                    # Synchronous - build entire lineage at once (original behavior)
                    lineage_result = generate_lineage(self.project_config)
                    if lineage_result:
                        self._ctx.info(
                            "Combined lineage generated",
                            nodes=len(lineage_result.nodes),
                            edges=len(lineage_result.edges),
                            json_path=lineage_result.json_path,
                        )
                    else:
                        self._ctx.warning("Lineage generation returned None")
            except Exception as e:
                self._ctx.warning(f"Failed to generate combined lineage: {e}")
        t_lineage_end = time.time()
        overhead_timings["lineage_generation"] = t_lineage_end - t_lineage_start

        # Wait for any pending async catalog syncs to complete (with reduced timeout)
        t_sync_start = time.time()

        # Check if we should skip waiting in Databricks
        skip_sync_wait = False
        if self.project_config and self.project_config.system:
            skip_in_databricks = getattr(
                self.project_config.system, "skip_sync_wait_in_databricks", True
            )
            if skip_in_databricks and self._is_databricks():
                skip_sync_wait = True
                self._ctx.info(
                    "Running in Databricks - skipping sync wait (threads continue in background)"
                )

        if not skip_sync_wait:
            sync_timeout = 30.0  # Default reduced timeout for performance
            if self.project_config and self.project_config.system:
                sync_timeout = getattr(self.project_config.system, "sync_timeout_seconds", 30.0)

            for name in pipeline_names:
                pipeline = self._pipelines[name]
                if hasattr(pipeline, "flush_sync"):
                    # Configurable timeout (default 30s, was 300s)
                    # Sync is incremental, so incomplete syncs will catch up next run
                    pipeline.flush_sync(timeout=sync_timeout)

        t_sync_end = time.time()
        overhead_timings["catalog_sync"] = t_sync_end - t_sync_start

        t_end = time.time()
        total_overhead = t_end - t_start

        # Calculate actual pipeline execution time
        actual_execution = sum(r.duration for r in results.values())
        overhead_timings["total_overhead"] = total_overhead
        overhead_timings["actual_execution"] = actual_execution
        overhead_timings["overhead_delta"] = total_overhead - actual_execution

        # Print overhead audit report
        def _pct(value, total):
            return 100 * value / total if total > 0 else 0.0

        print("\n" + "=" * 100)
        print("INTER-PIPELINE OVERHEAD AUDIT")
        print("=" * 100)
        print(f"Total wall-clock time:          {total_overhead:>8.2f}s (100.0%)")
        print(
            f"Actual pipeline execution:      {actual_execution:>8.2f}s ({_pct(actual_execution, total_overhead):>5.1f}%)"
        )
        print(
            f"Total overhead:                 {overhead_timings['overhead_delta']:>8.2f}s ({_pct(overhead_timings['overhead_delta'], total_overhead):>5.1f}%)"
        )
        print("-" * 100)
        print("OVERHEAD BREAKDOWN:")
        print(
            f"  Auto-register pipelines:      {overhead_timings.get('auto_register', 0):>8.2f}s ({_pct(overhead_timings.get('auto_register', 0), total_overhead):>5.1f}%)"
        )
        print(
            f"  Inter-pipeline gaps:          {overhead_timings.get('inter_pipeline_gaps_total', 0):>8.2f}s ({_pct(overhead_timings.get('inter_pipeline_gaps_total', 0), total_overhead):>5.1f}%)"
        )
        print(
            f"  Lineage generation:           {overhead_timings.get('lineage_generation', 0):>8.2f}s ({_pct(overhead_timings.get('lineage_generation', 0), total_overhead):>5.1f}%)"
        )
        print(
            f"  Catalog sync flush:           {overhead_timings.get('catalog_sync', 0):>8.2f}s ({_pct(overhead_timings.get('catalog_sync', 0), total_overhead):>5.1f}%)"
        )
        print("=" * 100 + "\n")

        if len(pipeline_names) == 1:
            return results[pipeline_names[0]]
        else:
            return results

    def list_pipelines(self) -> List[str]:
        """Return the names of all registered pipelines.

        Returns:
            List of pipeline names
        """
        return [pipeline_name for pipeline_name in self._pipelines]

    def flush_stories(self, timeout: float = 60.0) -> Dict[str, Optional[str]]:
        """Block until every pipeline's pending async story write finishes.

        Call this before any operation that reads story files from disk,
        such as lineage generation with SemanticLayerRunner.

        Args:
            timeout: Maximum seconds to wait per pipeline

        Returns:
            Dict mapping pipeline name to story path (or None if no pending story)

        Example:
            >>> manager.run(pipelines=['bronze', 'silver', 'gold'])
            >>> manager.flush_stories()  # Wait for all stories to be written
            >>> semantic_runner.run()    # Now lineage can read the stories
        """
        flushed: Dict[str, Optional[str]] = {}
        for pipeline_name, pipeline in self._pipelines.items():
            path = pipeline.flush_stories(timeout=timeout)
            if path:
                flushed[pipeline_name] = path
                self._ctx.debug(f"Story flushed for {pipeline_name}", path=path)
        if flushed:
            self._ctx.info(f"Flushed {len(flushed)} pending story writes")
        return flushed

    def get_pipeline(self, name: str) -> Pipeline:
        """Fetch a pipeline instance by name.

        Args:
            name: Pipeline name

        Returns:
            Pipeline instance

        Raises:
            ValueError: If pipeline not found
        """
        if name in self._pipelines:
            return self._pipelines[name]
        known = ", ".join(self._pipelines.keys())
        raise ValueError(f"Pipeline '{name}' not found. Available: {known}")

    def deploy(self, pipelines: Optional[Union[str, List[str]]] = None) -> bool:
        """Register pipeline and node definitions in the System Catalog.

        Writing the configurations to the catalog enables drift detection
        and governance features.

        Args:
            pipelines: Optional pipeline name(s) to deploy. If None, deploys all.

        Returns:
            True if deployment succeeded, False otherwise.

        Example:
            >>> manager = PipelineManager.from_yaml("odibi.yaml")
            >>> manager.deploy()  # Deploy all pipelines
            >>> manager.deploy("sales_daily")  # Deploy specific pipeline
        """
        if not self.catalog_manager:
            self._ctx.warning(
                "System Catalog not configured. Cannot deploy.",
                suggestion="Configure system catalog in your YAML config",
            )
            return False

        # Normalize the selector: None means everything, a string means one.
        if pipelines is None:
            selected = self.project_config.pipelines
        else:
            wanted = {pipelines} if isinstance(pipelines, str) else set(pipelines)
            selected = [p for p in self.project_config.pipelines if p.pipeline in wanted]

        if not selected:
            self._ctx.warning("No matching pipelines found to deploy.")
            return False

        self._ctx.info(
            f"Deploying {len(selected)} pipeline(s) to System Catalog",
            pipelines=[p.pipeline for p in selected],
        )

        try:
            # Bootstrap the catalog unless catalog writes are globally disabled.
            perf = self.project_config.performance
            skip_catalog = getattr(perf, "skip_catalog_writes", False) if perf else False
            if not skip_catalog:
                self.catalog_manager.bootstrap()

            for cfg in selected:
                self._ctx.debug(
                    f"Deploying pipeline: {cfg.pipeline}",
                    node_count=len(cfg.nodes),
                )
                self.catalog_manager.register_pipeline(cfg, self.project_config)
                for node in cfg.nodes:
                    self.catalog_manager.register_node(cfg.pipeline, node)

            self._ctx.info(
                f"Deployment complete: {len(selected)} pipeline(s)",
                deployed=[p.pipeline for p in selected],
            )
            return True

        except Exception as e:
            self._ctx.error(
                f"Deployment failed: {e}",
                error_type=type(e).__name__,
                suggestion="Check catalog configuration and permissions",
            )
            return False

    def _auto_register_pipelines(self, pipeline_names: List[str]) -> None:
        """Auto-register pipelines and nodes before execution.

        This ensures meta_pipelines and meta_nodes are populated automatically
        when running pipelines, without requiring explicit deploy() calls.

        Uses "check-before-write" pattern with batch writes for performance:
        - Reads existing hashes in one read
        - Compares version_hash to skip unchanged records
        - Batch writes only changed/new records

        Args:
            pipeline_names: List of pipeline names to register
        """
        if not self.catalog_manager:
            return

        try:
            import hashlib
            import json
            import time

            t_fetch_start = time.time()
            existing_pipelines = self.catalog_manager.get_all_registered_pipelines()
            existing_nodes = self.catalog_manager.get_all_registered_nodes(pipeline_names)
            t_fetch_end = time.time()
            print(
                f"[OVERHEAD] Auto-register: Fetched existing metadata in {t_fetch_end - t_fetch_start:.2f}s",
                flush=True,
            )

            pipeline_records = []
            node_records = []

            for name in pipeline_names:
                pipeline = self._pipelines[name]
                config = pipeline.config

                if hasattr(config, "model_dump"):
                    dump = config.model_dump(mode="json")
                else:
                    dump = config.model_dump()
                dump_str = json.dumps(dump, sort_keys=True)
                pipeline_hash = hashlib.md5(dump_str.encode("utf-8")).hexdigest()

                if existing_pipelines.get(name) != pipeline_hash:
                    all_tags = set()
                    for node in config.nodes:
                        if node.tags:
                            all_tags.update(node.tags)

                    pipeline_records.append(
                        {
                            "pipeline_name": name,
                            "version_hash": pipeline_hash,
                            "description": config.description or "",
                            "layer": config.layer or "",
                            "schedule": "",
                            "tags_json": json.dumps(list(all_tags)),
                        }
                    )

                pipeline_existing_nodes = existing_nodes.get(name, {})
                for node in config.nodes:
                    if hasattr(node, "model_dump"):
                        node_dump = node.model_dump(
                            mode="json", exclude={"description", "tags", "log_level"}
                        )
                    else:
                        node_dump = node.model_dump(exclude={"description", "tags", "log_level"})
                    node_dump_str = json.dumps(node_dump, sort_keys=True)
                    node_hash = hashlib.md5(node_dump_str.encode("utf-8")).hexdigest()

                    if pipeline_existing_nodes.get(node.name) != node_hash:
                        node_type = "transform"
                        if node.read:
                            node_type = "read"
                        if node.write:
                            node_type = "write"

                        node_records.append(
                            {
                                "pipeline_name": name,
                                "node_name": node.name,
                                "version_hash": node_hash,
                                "type": node_type,
                                "config_json": json.dumps(node_dump),
                            }
                        )

            if pipeline_records:
                t_write_p_start = time.time()
                self.catalog_manager.register_pipelines_batch(pipeline_records)
                t_write_p_end = time.time()
                print(
                    f"[OVERHEAD] Batch registered {len(pipeline_records)} changed pipeline(s) in {t_write_p_end - t_write_p_start:.2f}s",
                    flush=True,
                )
            else:
                self._ctx.debug("All pipelines unchanged - skipping registration")

            if node_records:
                t_write_n_start = time.time()
                self.catalog_manager.register_nodes_batch(node_records)
                t_write_n_end = time.time()
                print(
                    f"[OVERHEAD] Batch registered {len(node_records)} changed node(s) in {t_write_n_end - t_write_n_start:.2f}s",
                    flush=True,
                )
            else:
                self._ctx.debug("All nodes unchanged - skipping registration")

        except Exception as e:
            self._ctx.warning(
                f"Auto-registration failed (non-fatal): {e}",
                error_type=type(e).__name__,
            )

    # -------------------------------------------------------------------------
    # Utility Helpers
    # -------------------------------------------------------------------------

    def _is_databricks(self) -> bool:
        """Check if running in Databricks environment.

        Returns:
            True if running in Databricks, False otherwise
        """
        import os

        # Check for Databricks runtime environment variable
        if os.getenv("DATABRICKS_RUNTIME_VERSION"):
            return True

        # Check if dbutils is available (Databricks-specific)
        try:
            import builtins

            if hasattr(builtins, "dbutils"):
                return True
        except Exception:
            pass

        return False

    # -------------------------------------------------------------------------
    # Lineage Helpers
    # -------------------------------------------------------------------------

    def _build_pipeline_lineage_piece(self, pipeline_name: str, story_path: str):
        """Build lineage piece for a single pipeline (runs in background thread).

        Args:
            pipeline_name: Name of pipeline
            story_path: Path to generated story file

        Returns:
            Lineage piece dict or None
        """
        try:
            from odibi.story.lineage_utils import generate_lineage_for_pipeline

            return generate_lineage_for_pipeline(self.project_config, pipeline_name)
        except ImportError:
            self._ctx.debug("generate_lineage_for_pipeline not available, using generate_lineage")
            try:
                from odibi.story.lineage_utils import generate_lineage

                return generate_lineage(self.project_config)
            except Exception as e:
                self._ctx.warning(f"Failed to build lineage piece for {pipeline_name}: {e}")
                return None
        except Exception as e:
            self._ctx.warning(f"Failed to build lineage piece for {pipeline_name}: {e}")
            return None

    def _merge_lineage_pieces(self, pieces: list):
        """Merge incrementally-built lineage pieces into combined lineage.

        Args:
            pieces: List of lineage dicts from individual pipelines

        Returns:
            Combined lineage result
        """
        try:
            from odibi.story.lineage_utils import merge_lineage_pieces

            return merge_lineage_pieces(pieces, self.project_config)
        except ImportError:
            # Fallback: If merge function doesn't exist, use original generate_lineage
            self._ctx.debug("merge_lineage_pieces not available, using generate_lineage")
            from odibi.story.lineage_utils import generate_lineage

            return generate_lineage(self.project_config)
        except Exception as e:
            self._ctx.warning(f"Failed to merge lineage pieces: {e}")
            return None

    # -------------------------------------------------------------------------
    # Phase 5: List/Query Methods
    # -------------------------------------------------------------------------

    def list_registered_pipelines(self) -> "pd.DataFrame":
        """List all registered pipelines from the system catalog.

        Returns:
            DataFrame with pipeline metadata from meta_pipelines (empty on
            error or when the catalog is not configured)
        """
        import pandas as pd

        if not self.catalog_manager:
            self._ctx.warning("Catalog manager not configured")
            return pd.DataFrame()

        try:
            return self.catalog_manager._read_local_table(
                self.catalog_manager.tables["meta_pipelines"]
            )
        except Exception as e:
            self._ctx.warning(f"Failed to list pipelines: {e}")
            return pd.DataFrame()

    def list_registered_nodes(self, pipeline: Optional[str] = None) -> "pd.DataFrame":
        """List nodes from the system catalog.

        Args:
            pipeline: Optional pipeline name to filter by

        Returns:
            DataFrame with node metadata from meta_nodes (empty on error or
            when the catalog is not configured)
        """
        import pandas as pd

        if not self.catalog_manager:
            self._ctx.warning("Catalog manager not configured")
            return pd.DataFrame()

        try:
            nodes_df = self.catalog_manager._read_local_table(
                self.catalog_manager.tables["meta_nodes"]
            )
            if pipeline and not nodes_df.empty:
                nodes_df = nodes_df[nodes_df["pipeline_name"] == pipeline]
            return nodes_df
        except Exception as e:
            self._ctx.warning(f"Failed to list nodes: {e}")
            return pd.DataFrame()

    def list_runs(
        self,
        pipeline: Optional[str] = None,
        node: Optional[str] = None,
        status: Optional[str] = None,
        limit: int = 10,
    ) -> "pd.DataFrame":
        """List recent runs with optional filters.

        Args:
            pipeline: Optional pipeline name to filter by
            node: Optional node name to filter by
            status: Optional status to filter by (SUCCESS, FAILURE)
            limit: Maximum number of runs to return

        Returns:
            DataFrame with run history from meta_runs (newest first when a
            timestamp column exists; empty on error)
        """
        import pandas as pd

        if not self.catalog_manager:
            self._ctx.warning("Catalog manager not configured")
            return pd.DataFrame()

        try:
            runs = self.catalog_manager._read_local_table(self.catalog_manager.tables["meta_runs"])
            if runs.empty:
                return runs

            # Apply each requested equality filter in turn.
            for column, wanted in (
                ("pipeline_name", pipeline),
                ("node_name", node),
                ("status", status),
            ):
                if wanted:
                    runs = runs[runs[column] == wanted]

            # Newest first, when the table carries a timestamp.
            if "timestamp" in runs.columns:
                runs = runs.sort_values("timestamp", ascending=False)

            return runs.head(limit)
        except Exception as e:
            self._ctx.warning(f"Failed to list runs: {e}")
            return pd.DataFrame()

    def list_tables(self) -> "pd.DataFrame":
        """List registered assets from meta_tables.

        Returns:
            DataFrame with table/asset metadata (empty on error or when the
            catalog is not configured)
        """
        import pandas as pd

        if not self.catalog_manager:
            self._ctx.warning("Catalog manager not configured")
            return pd.DataFrame()

        try:
            return self.catalog_manager._read_local_table(
                self.catalog_manager.tables["meta_tables"]
            )
        except Exception as e:
            self._ctx.warning(f"Failed to list tables: {e}")
            return pd.DataFrame()

    # -------------------------------------------------------------------------
    # Discovery API
    # -------------------------------------------------------------------------

    def discover(
        self,
        connection_name: Optional[str] = None,
        dataset: Optional[str] = None,
        include_schema: bool = False,
        include_stats: bool = False,
        profile: bool = False,
        sample_rows: int = 1000,
        **kwargs,
    ) -> Dict[str, Any]:
        """Discover data sources through a connection's discovery API.

        Thin convenience wrapper over connection.discover_catalog(),
        connection.get_table_info(), and connection.profile().

        When called without a connection_name, discovers across ALL connections.

        Args:
            connection_name: Name of connection from YAML config (None = all connections)
            dataset: Optional specific dataset to inspect (table name or file path)
            include_schema: Include column schemas (catalog mode)
            include_stats: Include row counts/stats (catalog mode)
            profile: Run detailed profiling (requires dataset)
            sample_rows: Rows to sample for profiling
            **kwargs: Additional connection-specific options
                - limit: Max datasets per connection (default: 200)
                - pattern: Filter datasets by pattern (e.g. "*.csv", "fact_*")
                - columns: Specific columns for profiling

        Returns:
            Discovery result dict (CatalogSummary, Schema, or TableProfile)

        Examples:
            # Discover ALL connections at once
            pm.discover()

            # List all tables in a database
            pm.discover("crm_db")

            # Get schema for specific table
            pm.discover("crm_db", dataset="dbo.Orders", include_schema=True)

            # Profile a table
            pm.discover("crm_db", dataset="dbo.Orders", profile=True, sample_rows=5000)

            # Discover ADLS folder
            pm.discover("raw_adls", dataset="sales/2024/")

            # Search across all connections with a pattern
            pm.discover(pattern="fact_*")
        """
        limit = kwargs.get("limit", 200)
        pattern = kwargs.get("pattern", "")

        # No connection named: fan out across every configured connection.
        if connection_name is None:
            per_connection: Dict[str, Any] = {}
            for name, conn in self.connections.items():
                try:
                    per_connection[name] = conn.discover_catalog(
                        include_schema=include_schema,
                        include_stats=include_stats,
                        limit=limit,
                        pattern=pattern,
                    )
                except NotImplementedError:
                    per_connection[name] = {
                        "connection_type": type(conn).__name__,
                        "note": "Discovery not supported",
                    }
                except Exception as e:
                    per_connection[name] = {"error": str(e)}
            return {
                "connections_scanned": len(per_connection),
                "results": per_connection,
            }

        # Single connection discovery.
        conn = self.connections.get(connection_name)
        if not conn:
            available = list(self.connections.keys())
            return {
                "error": {
                    "code": "CONNECTION_NOT_FOUND",
                    "message": f"Connection '{connection_name}' not found",
                    "available_connections": available,
                    "fix": f"Use one of: {', '.join(available)}",
                }
            }

        try:
            if profile and dataset:
                # Detailed profiling of a single dataset.
                return conn.profile(
                    dataset=dataset, sample_rows=sample_rows, columns=kwargs.get("columns")
                )
            if dataset and (include_schema or not profile):
                # Schema/metadata for a single dataset.
                return conn.get_table_info(dataset)
            # Whole-catalog listing.
            return conn.discover_catalog(
                include_schema=include_schema,
                include_stats=include_stats,
                limit=limit,
                pattern=pattern,
            )
        except NotImplementedError as e:
            return {
                "error": {
                    "code": "NOT_SUPPORTED",
                    "message": str(e),
                    "connection_type": type(conn).__name__,
                    "fix": "This connection type does not support discovery yet",
                }
            }
        except Exception as e:
            self._ctx.error(
                f"Discovery failed for {connection_name}", error=str(e), dataset=dataset
            )
            return {
                "error": {
                    "code": "DISCOVERY_FAILED",
                    "message": str(e),
                    "connection": connection_name,
                    "dataset": dataset,
                }
            }

    def preview(
        self,
        connection_name: str,
        dataset: str,
        rows: int = 5,
        columns: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """Preview sample rows from a dataset.

        Args:
            connection_name: Name of connection from YAML config
            dataset: Table name or file path to preview
            rows: Number of rows to return (default: 5, max: 100)
            columns: Specific columns to include (None = all)

        Returns:
            PreviewResult dict with sample rows

        Examples:
            # Preview a SQL table
            pm.preview("crm_db", "dbo.Orders", rows=10)

            # Preview specific columns
            pm.preview("crm_db", "dbo.Orders", columns=["order_id", "total"])

            # Preview a CSV file
            pm.preview("raw_data", "sales/2024.csv")
        """
        conn = self.connections.get(connection_name)
        if not conn:
            available = list(self.connections.keys())
            return {
                "error": {
                    "code": "CONNECTION_NOT_FOUND",
                    "message": f"Connection '{connection_name}' not found",
                    "available_connections": available,
                }
            }

        # Enforce the documented cap ("max: 100"); previously any value was
        # forwarded to the connection unchecked.
        rows = min(rows, 100)

        try:
            return conn.preview(dataset=dataset, rows=rows, columns=columns)
        except NotImplementedError as e:
            return {
                "error": {
                    "code": "NOT_SUPPORTED",
                    "message": str(e),
                    "fix": "This connection type does not support preview",
                }
            }
        except Exception as e:
            self._ctx.error(f"Preview failed for {connection_name}", error=str(e), dataset=dataset)
            return {"error": {"code": "PREVIEW_FAILED", "message": str(e)}}

    def freshness(
        self,
        connection_name: str,
        dataset: str,
        timestamp_column: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Check how fresh a dataset's data is.

        Args:
            connection_name: Name of connection from YAML config
            dataset: Table name or file path
            timestamp_column: Column to check for max timestamp (SQL only)

        Returns:
            FreshnessResult dict with last_updated, age_hours, etc.

        Examples:
            # Check freshness using table metadata
            pm.freshness("crm_db", "dbo.Orders")

            # Check freshness using a specific timestamp column
            pm.freshness("crm_db", "dbo.Orders", timestamp_column="order_date")

            # Check file freshness
            pm.freshness("raw_data", "sales/2024.csv")
        """
        connection = self.connections.get(connection_name)
        if not connection:
            known = list(self.connections.keys())
            return {
                "error": {
                    "code": "CONNECTION_NOT_FOUND",
                    "message": f"Connection '{connection_name}' not found",
                    "available_connections": known,
                }
            }

        try:
            return connection.get_freshness(dataset=dataset, timestamp_column=timestamp_column)
        except TypeError:
            # LocalConnection/ADLS don't accept timestamp_column
            return connection.get_freshness(dataset=dataset)
        except NotImplementedError as e:
            return {
                "error": {
                    "code": "NOT_SUPPORTED",
                    "message": str(e),
                    "fix": "This connection type does not support freshness checks",
                }
            }
        except Exception as e:
            self._ctx.error(
                f"Freshness check failed for {connection_name}", error=str(e), dataset=dataset
            )
            return {"error": {"code": "FRESHNESS_FAILED", "message": str(e)}}

    def relationships(
        self,
        connection_name: str,
        schema: Optional[str] = None,
    ) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
        """Discover foreign-key relationships in a SQL database.

        Args:
            connection_name: Name of SQL connection from YAML config
            schema: Limit to specific schema (default: all schemas)

        Returns:
            List of Relationship dicts, or error dict

        Examples:
            # Discover all FKs
            pm.relationships("crm_db")

            # FKs in a specific schema
            pm.relationships("crm_db", schema="dbo")
        """
        connection = self.connections.get(connection_name)
        if not connection:
            known = list(self.connections.keys())
            return {
                "error": {
                    "code": "CONNECTION_NOT_FOUND",
                    "message": f"Connection '{connection_name}' not found",
                    "available_connections": known,
                }
            }

        # FK discovery is a SQL-only capability; detect it structurally.
        if not hasattr(connection, "relationships"):
            return {
                "error": {
                    "code": "NOT_SUPPORTED",
                    "message": f"{type(connection).__name__} does not support relationship discovery",
                    "fix": "Only SQL connections support FK discovery",
                }
            }

        try:
            return connection.relationships(schema=schema)
        except Exception as e:
            self._ctx.error(f"Relationship discovery failed for {connection_name}", error=str(e))
            return {"error": {"code": "DISCOVERY_FAILED", "message": str(e)}}

    def scaffold_project(
        self, project_name: str, connections: Dict[str, Dict[str, Any]], **kwargs
    ) -> str:
        """Generate a project YAML scaffold.

        Args:
            project_name: Name of the project
            connections: Dict of connection name -> connection config
            **kwargs: Additional options (imports, story_connection, system_connection)

        Returns:
            YAML string for project.yaml

        Example:
            >>> connections = {
            ...     "local": {"type": "local", "base_path": "data/"},
            ...     "azure": {"type": "azure_blob", "account_name": "myaccount"}
            ... }
            >>> yaml = pm.scaffold_project("my_project", connections)
        """
        from odibi.scaffold import generate_project_yaml

        project_yaml = generate_project_yaml(project_name, connections, **kwargs)
        return project_yaml

    def scaffold_sql_pipeline(
        self,
        pipeline_name: str,
        source_connection: str,
        target_connection: str,
        tables: List[Dict[str, Any]],
        **kwargs,
    ) -> str:
        """Generate a SQL ingestion pipeline as YAML.

        Args:
            pipeline_name: Name for the pipeline
            source_connection: SQL database connection name
            target_connection: Target storage connection name
            tables: List of table specs (see generate_sql_pipeline for details)
            **kwargs: Additional options (target_format, target_schema, layer, node_prefix)

        Returns:
            YAML string for pipeline file

        Example:
            >>> tables = [
            ...     {"schema": "dbo", "table": "customers", "primary_key": ["id"]},
            ...     {"schema": "dbo", "table": "orders"}
            ... ]
            >>> yaml = pm.scaffold_sql_pipeline("ingest", "sqldb", "lake", tables)
        """
        from odibi.scaffold import generate_sql_pipeline

        pipeline_yaml = generate_sql_pipeline(
            pipeline_name, source_connection, target_connection, tables, **kwargs
        )
        return pipeline_yaml

    def validate_yaml(self, yaml_content: str) -> Dict[str, Any]:
        """Validate a pipeline YAML document.

        Args:
            yaml_content: YAML string to validate

        Returns:
            Validation result with errors/warnings

        Example:
            >>> result = pm.validate_yaml(yaml_string)
            >>> if not result["valid"]:
            ...     for error in result["errors"]:
            ...         print(f"{error['field_path']}: {error['message']}")
        """
        # Imported lazily; alias avoids shadowing this method's own name.
        from odibi.validate import validate_yaml as run_validation

        return run_validation(yaml_content)

    def doctor(self) -> Dict[str, Any]:
        """Run a diagnostics check on environment and configuration.

        Returns:
            Diagnostic result with status, packages, connections, and issues

        Example:
            >>> result = pm.doctor()
            >>> if result["status"] == "healthy":
            ...     print("All good!")
            >>> else:
            ...     for issue in result["issues"]:
            ...         print(f"{issue['severity']}: {issue['message']}")
        """
        # Imported lazily; alias avoids shadowing this method's own name.
        from odibi.doctor import doctor as run_doctor

        return run_doctor()

    # -------------------------------------------------------------------------
    # Phase 5.2: State Methods
    # -------------------------------------------------------------------------

    def get_state(self, key: str) -> Optional[Dict[str, Any]]:
        """Look up a single state entry (HWM, content hash, etc.).

        Args:
            key: The state key to look up

        Returns:
            Dictionary with state data, or None when the catalog is not
            configured, the key is absent, or the read fails.
        """
        catalog = self.catalog_manager
        if not catalog:
            return None

        try:
            state_df = catalog._read_table(catalog.tables["meta_state"])
            if state_df.empty or "key" not in state_df.columns:
                return None

            matches = state_df[state_df["key"] == key]
            # Best-effort read: any failure is treated as "not found".
            return None if matches.empty else matches.iloc[0].to_dict()
        except Exception:
            return None

    def get_all_state(self, prefix: Optional[str] = None) -> "pd.DataFrame":
        """Return every state entry, optionally restricted to a key prefix.

        Args:
            prefix: Optional key prefix to filter by

        Returns:
            DataFrame of state entries (empty when the catalog is not
            configured or the read fails)
        """
        import pandas as pd

        catalog = self.catalog_manager
        if not catalog:
            return pd.DataFrame()

        try:
            entries = catalog._read_table(catalog.tables["meta_state"])
            if prefix and not entries.empty and "key" in entries.columns:
                entries = entries[entries["key"].str.startswith(prefix)]
            return entries
        except Exception as exc:
            self._ctx.warning(f"Failed to get state: {exc}")
            return pd.DataFrame()

    def clear_state(self, key: str) -> bool:
        """Delete a state entry from the catalog.

        Args:
            key: The state key to remove

        Returns:
            True if deleted; False when the catalog is not configured or
            the delete fails.
        """
        if not self.catalog_manager:
            return False

        try:
            deleted = self.catalog_manager.clear_state_key(key)
        except Exception as exc:
            self._ctx.warning(f"Failed to clear state: {exc}")
            return False
        return deleted

    # -------------------------------------------------------------------------
    # Phase 5.3-5.4: Schema/Lineage and Stats Methods
    # -------------------------------------------------------------------------

    def get_schema_history(
        self,
        table: str,
        limit: int = 5,
    ) -> "pd.DataFrame":
        """Return the recorded schema versions for a table.

        Args:
            table: Table identifier (supports smart path resolution)
            limit: Maximum number of versions to return

        Returns:
            DataFrame of schema versions (empty when the catalog is not
            configured or the lookup fails)
        """
        import pandas as pd

        if not self.catalog_manager:
            return pd.DataFrame()

        try:
            path = self._resolve_table_path(table)
            versions = self.catalog_manager.get_schema_history(path, limit)
            return pd.DataFrame(versions)
        except Exception as exc:
            self._ctx.warning(f"Failed to get schema history: {exc}")
            return pd.DataFrame()

    def get_lineage(
        self,
        table: str,
        direction: str = "both",
    ) -> "pd.DataFrame":
        """Return lineage edges touching a table.

        Args:
            table: Table identifier (supports smart path resolution)
            direction: "upstream", "downstream", or "both"

        Returns:
            DataFrame of lineage relationships tagged with a "direction"
            column (empty when the catalog is not configured or the
            lookup fails)
        """
        import pandas as pd

        if not self.catalog_manager:
            return pd.DataFrame()

        try:
            path = self._resolve_table_path(table)

            edges = []
            if direction in ("upstream", "both"):
                for rel in self.catalog_manager.get_upstream(path):
                    rel["direction"] = "upstream"
                    edges.append(rel)

            if direction in ("downstream", "both"):
                for rel in self.catalog_manager.get_downstream(path):
                    rel["direction"] = "downstream"
                    edges.append(rel)

            return pd.DataFrame(edges)
        except Exception as exc:
            self._ctx.warning(f"Failed to get lineage: {exc}")
            return pd.DataFrame()

    def get_pipeline_status(self, pipeline: str) -> Dict[str, Any]:
        """Summarize the most recent run of a pipeline.

        Args:
            pipeline: Pipeline name

        Returns:
            Dict with the last run's status/timestamp/duration/node, a
            {"status": "never_run"} marker when no runs exist, or {}
            when the catalog is missing or the lookup fails.
        """
        if not self.catalog_manager:
            return {}

        try:
            recent = self.list_runs(pipeline=pipeline, limit=1)
            if recent.empty:
                return {"status": "never_run", "pipeline": pipeline}

            latest = recent.iloc[0].to_dict()
        except Exception as exc:
            self._ctx.warning(f"Failed to get pipeline status: {exc}")
            return {}

        return {
            "pipeline": pipeline,
            "last_status": latest.get("status"),
            "last_run_at": latest.get("timestamp"),
            "last_duration_ms": latest.get("duration_ms"),
            "last_node": latest.get("node_name"),
        }

    def get_node_stats(self, node: str, days: int = 7) -> Dict[str, Any]:
        """Get average duration, row counts, success rate over period.

        Args:
            node: Node name
            days: Number of days to look back

        Returns:
            Dict with node statistics; {"node": ..., "runs": 0} when no
            matching runs are found, {} when the catalog is missing or
            the read fails.
        """
        import pandas as pd

        if not self.catalog_manager:
            return {}

        try:
            # Average duration is computed by the catalog itself.
            avg_duration = self.catalog_manager.get_average_duration(node, days)

            # NOTE(review): this reads via _read_local_table while the state
            # helpers use _read_table — confirm the local copy is intended.
            df = self.catalog_manager._read_local_table(self.catalog_manager.tables["meta_runs"])
            if df.empty:
                return {"node": node, "runs": 0}

            if "timestamp" in df.columns:
                # Restrict to the lookback window; coerce to datetime and
                # localize naive timestamps to UTC so the comparison against
                # the tz-aware cutoff is valid.
                cutoff = pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=days)
                if not pd.api.types.is_datetime64_any_dtype(df["timestamp"]):
                    df["timestamp"] = pd.to_datetime(df["timestamp"])
                if df["timestamp"].dt.tz is None:
                    df["timestamp"] = df["timestamp"].dt.tz_localize("UTC")
                df = df[df["timestamp"] >= cutoff]

            node_runs = df[df["node_name"] == node]
            if node_runs.empty:
                return {"node": node, "runs": 0}

            total = len(node_runs)
            success = len(node_runs[node_runs["status"] == "SUCCESS"])
            # `in` on a DataFrame tests column membership.
            avg_rows = node_runs["rows_processed"].mean() if "rows_processed" in node_runs else None

            return {
                "node": node,
                "runs": total,
                "success_rate": success / total if total > 0 else 0,
                "avg_duration_s": avg_duration,
                "avg_rows": avg_rows,
                "period_days": days,
            }
        except Exception as e:
            self._ctx.warning(f"Failed to get node stats: {e}")
            return {}

    # -------------------------------------------------------------------------
    # Phase 6: Smart Path Resolution
    # -------------------------------------------------------------------------

    def _resolve_table_path(self, identifier: str) -> str:
        """Resolve a user-friendly identifier to a full table path.

        Accepts:
        - Relative path: "bronze/project/test"
        - Registered table: "test.test"
        - Node name: "project_test"
        - Full path: "abfss://..." (used as-is)

        Args:
            identifier: User-friendly table identifier

        Returns:
            Full table path
        """
        if self._is_full_path(identifier):
            return identifier

        if self.catalog_manager:
            resolved = self._lookup_in_catalog(identifier)
            if resolved:
                return resolved

        for pipeline in self._pipelines.values():
            for node in pipeline.config.nodes:
                if node.name == identifier and node.write:
                    conn = self.connections.get(node.write.connection)
                    if conn:
                        return conn.get_path(node.write.path or node.write.table)

        sys_conn_name = (
            self.project_config.system.connection if self.project_config.system else None
        )
        if sys_conn_name:
            sys_conn = self.connections.get(sys_conn_name)
            if sys_conn:
                return sys_conn.get_path(identifier)

        return identifier

    def _is_full_path(self, identifier: str) -> bool:
        """Check if identifier is already a full path."""
        full_path_prefixes = ("abfss://", "s3://", "gs://", "hdfs://", "/", "C:", "D:")
        return identifier.startswith(full_path_prefixes)

    def _lookup_in_catalog(self, identifier: str) -> Optional[str]:
        """Look up identifier in meta_tables catalog."""
        if not self.catalog_manager:
            return None

        try:
            df = self.catalog_manager._read_local_table(self.catalog_manager.tables["meta_tables"])
            if df.empty or "table_name" not in df.columns:
                return None

            match = df[df["table_name"] == identifier]
            if not match.empty and "path" in match.columns:
                return match.iloc[0]["path"]

            if "." in identifier:
                parts = identifier.split(".", 1)
                if len(parts) == 2:
                    match = df[df["table_name"] == parts[1]]
                    if not match.empty and "path" in match.columns:
                        return match.iloc[0]["path"]

        except Exception:
            pass

        return None

__init__(project_config, connections)

Initialize pipeline manager.

Parameters:

Name Type Description Default
project_config ProjectConfig

Validated project configuration

required
connections Dict[str, Any]

Connection objects (already instantiated)

required
Source code in odibi/pipeline.py
def __init__(
    self,
    project_config: ProjectConfig,
    connections: Dict[str, Any],
):
    """Initialize pipeline manager.

    Args:
        project_config: Validated project configuration
        connections: Connection objects (already instantiated)
    """
    self.project_config = project_config
    self.connections = connections
    self._pipelines: Dict[str, Pipeline] = {}
    self.catalog_manager = None
    self.lineage_adapter = None

    # Configure logging
    configure_logging(
        structured=project_config.logging.structured, level=project_config.logging.level.value
    )

    # Create manager-level logging context
    self._ctx = create_logging_context(engine=project_config.engine)

    self._ctx.info(
        "Initializing PipelineManager",
        project=project_config.project,
        engine=project_config.engine,
        pipeline_count=len(project_config.pipelines),
        connection_count=len(connections),
    )

    # Initialize Lineage Adapter
    self.lineage_adapter = OpenLineageAdapter(project_config.lineage)

    # Initialize CatalogManager if configured
    if project_config.system:
        from odibi.catalog import CatalogManager

        spark = None
        engine_instance = None

        if project_config.engine == "spark":
            try:
                from odibi.engine.spark_engine import SparkEngine

                temp_engine = SparkEngine(connections=connections, config={})
                spark = temp_engine.spark
                self._ctx.debug("Spark session initialized for System Catalog")
            except Exception as e:
                self._ctx.warning(
                    f"Failed to initialize Spark for System Catalog: {e}",
                    suggestion="Check Spark configuration",
                )

        sys_conn = connections.get(project_config.system.connection)
        if sys_conn:
            base_path = sys_conn.get_path(project_config.system.path)

            if not spark:
                try:
                    from odibi.engine.pandas_engine import PandasEngine

                    engine_instance = PandasEngine(config={})
                    self._ctx.debug("PandasEngine initialized for System Catalog")
                except Exception as e:
                    self._ctx.warning(
                        f"Failed to initialize PandasEngine for System Catalog: {e}"
                    )

            if spark or engine_instance:
                self.catalog_manager = CatalogManager(
                    spark=spark,
                    config=project_config.system,
                    base_path=base_path,
                    engine=engine_instance,
                    connection=sys_conn,
                )
                # Set project name for tagging all catalog records
                self.catalog_manager.project = project_config.project

                # Skip bootstrap if catalog writes are disabled
                skip_catalog = (
                    getattr(project_config.performance, "skip_catalog_writes", False)
                    if project_config.performance
                    else False
                )
                if not skip_catalog:
                    self.catalog_manager.bootstrap()
                    self._ctx.info(
                        "System Catalog initialized",
                        path=base_path,
                        project=project_config.project,
                    )
                else:
                    self._ctx.debug(
                        "System Catalog bootstrap skipped (skip_catalog_writes=true)"
                    )
        else:
            self._ctx.warning(
                f"System connection '{project_config.system.connection}' not found",
                suggestion="Configure the system connection in your config",
            )

    # Get story configuration
    story_config = self._get_story_config()

    # Create all pipeline instances
    self._ctx.debug(
        "Creating pipeline instances",
        pipelines=[p.pipeline for p in project_config.pipelines],
    )
    for pipeline_config in project_config.pipelines:
        pipeline_name = pipeline_config.pipeline

        if pipeline_name in self._pipelines:
            self._ctx.warning(
                f"Pipeline '{pipeline_name}' defined multiple times, "
                f"using last definition (environment override)",
                pipeline=pipeline_name,
            )

        self._pipelines[pipeline_name] = Pipeline(
            pipeline_config=pipeline_config,
            engine=project_config.engine,
            connections=connections,
            generate_story=story_config.get("auto_generate", True),
            story_config=story_config,
            retry_config=project_config.retry,
            alerts=project_config.alerts,
            performance_config=project_config.performance,
            catalog_manager=self.catalog_manager,
            lineage_adapter=self.lineage_adapter,
        )
        self._pipelines[pipeline_name].project_config = project_config

    self._ctx.info(
        "PipelineManager ready",
        pipelines=list(self._pipelines.keys()),
    )

clear_state(key)

Remove a state entry.

Parameters:

Name Type Description Default
key str

The state key to remove

required

Returns:

Type Description
bool

True if deleted, False otherwise

Source code in odibi/pipeline.py
def clear_state(self, key: str) -> bool:
    """Remove a state entry.

    Args:
        key: The state key to remove

    Returns:
        True if deleted, False otherwise
    """
    if not self.catalog_manager:
        return False

    try:
        return self.catalog_manager.clear_state_key(key)
    except Exception as e:
        self._ctx.warning(f"Failed to clear state: {e}")
        return False

deploy(pipelines=None)

Deploy pipeline definitions to the System Catalog.

This registers pipeline and node configurations in the catalog, enabling drift detection and governance features.

Parameters:

Name Type Description Default
pipelines Optional[Union[str, List[str]]]

Optional pipeline name(s) to deploy. If None, deploys all.

None

Returns:

Type Description
bool

True if deployment succeeded, False otherwise.

Example

manager = PipelineManager.from_yaml("odibi.yaml")
manager.deploy()               # Deploy all pipelines
manager.deploy("sales_daily")  # Deploy a specific pipeline

Source code in odibi/pipeline.py
def deploy(self, pipelines: Optional[Union[str, List[str]]] = None) -> bool:
    """Deploy pipeline definitions to the System Catalog.

    This registers pipeline and node configurations in the catalog,
    enabling drift detection and governance features.

    Args:
        pipelines: Optional pipeline name(s) to deploy. If None, deploys all.

    Returns:
        True if deployment succeeded, False otherwise.

    Example:
        >>> manager = PipelineManager.from_yaml("odibi.yaml")
        >>> manager.deploy()  # Deploy all pipelines
        >>> manager.deploy("sales_daily")  # Deploy specific pipeline
    """
    if not self.catalog_manager:
        self._ctx.warning(
            "System Catalog not configured. Cannot deploy.",
            suggestion="Configure system catalog in your YAML config",
        )
        return False

    if pipelines is None:
        to_deploy = self.project_config.pipelines
    elif isinstance(pipelines, str):
        to_deploy = [p for p in self.project_config.pipelines if p.pipeline == pipelines]
    else:
        to_deploy = [p for p in self.project_config.pipelines if p.pipeline in pipelines]

    if not to_deploy:
        self._ctx.warning("No matching pipelines found to deploy.")
        return False

    self._ctx.info(
        f"Deploying {len(to_deploy)} pipeline(s) to System Catalog",
        pipelines=[p.pipeline for p in to_deploy],
    )

    try:
        # Skip bootstrap if catalog writes are disabled
        skip_catalog = (
            getattr(self.project_config.performance, "skip_catalog_writes", False)
            if self.project_config.performance
            else False
        )
        if not skip_catalog:
            self.catalog_manager.bootstrap()

        for pipeline_config in to_deploy:
            self._ctx.debug(
                f"Deploying pipeline: {pipeline_config.pipeline}",
                node_count=len(pipeline_config.nodes),
            )
            self.catalog_manager.register_pipeline(pipeline_config, self.project_config)

            for node in pipeline_config.nodes:
                self.catalog_manager.register_node(pipeline_config.pipeline, node)

        self._ctx.info(
            f"Deployment complete: {len(to_deploy)} pipeline(s)",
            deployed=[p.pipeline for p in to_deploy],
        )
        return True

    except Exception as e:
        self._ctx.error(
            f"Deployment failed: {e}",
            error_type=type(e).__name__,
            suggestion="Check catalog configuration and permissions",
        )
        return False

discover(connection_name=None, dataset=None, include_schema=False, include_stats=False, profile=False, sample_rows=1000, **kwargs)

Discover data sources using connection's discovery API.

Convenience method that delegates to connection.discover_catalog(), connection.get_schema(), or connection.profile().

When called without a connection_name, discovers across ALL connections.

Parameters:

Name Type Description Default
connection_name Optional[str]

Name of connection from YAML config (None = all connections)

None
dataset Optional[str]

Optional specific dataset to inspect (table name or file path)

None
include_schema bool

Include column schemas (catalog mode)

False
include_stats bool

Include row counts/stats (catalog mode)

False
profile bool

Run detailed profiling (requires dataset)

False
sample_rows int

Rows to sample for profiling

1000
**kwargs

Additional connection-specific options:
- limit: Max datasets per connection (default: 200)
- pattern: Filter datasets by pattern (e.g. "*.csv", "fact_*")
- columns: Specific columns for profiling

{}

Returns:

Type Description
Dict[str, Any]

Discovery result dict (CatalogSummary, Schema, or TableProfile)

Examples:

Discover ALL connections at once

pm.discover()

List all tables in a database

pm.discover("crm_db")

Get schema for specific table

pm.discover("crm_db", dataset="dbo.Orders", include_schema=True)

Profile a table

pm.discover("crm_db", dataset="dbo.Orders", profile=True, sample_rows=5000)

Discover ADLS folder

pm.discover("raw_adls", dataset="sales/2024/")

Search across all connections with a pattern

pm.discover(pattern="fact_*")

Source code in odibi/pipeline.py
def discover(
    self,
    connection_name: Optional[str] = None,
    dataset: Optional[str] = None,
    include_schema: bool = False,
    include_stats: bool = False,
    profile: bool = False,
    sample_rows: int = 1000,
    **kwargs,
) -> Dict[str, Any]:
    """Discover data sources using connection's discovery API.

    Convenience method that delegates to connection.discover_catalog(),
    connection.get_schema(), or connection.profile().

    When called without a connection_name, discovers across ALL connections.

    Args:
        connection_name: Name of connection from YAML config (None = all connections)
        dataset: Optional specific dataset to inspect (table name or file path)
        include_schema: Include column schemas (catalog mode)
        include_stats: Include row counts/stats (catalog mode)
        profile: Run detailed profiling (requires dataset)
        sample_rows: Rows to sample for profiling
        **kwargs: Additional connection-specific options
            - limit: Max datasets per connection (default: 200)
            - pattern: Filter datasets by pattern (e.g. "*.csv", "fact_*")
            - columns: Specific columns for profiling

    Returns:
        Discovery result dict (CatalogSummary, Schema, or TableProfile)

    Examples:
        # Discover ALL connections at once
        pm.discover()

        # List all tables in a database
        pm.discover("crm_db")

        # Get schema for specific table
        pm.discover("crm_db", dataset="dbo.Orders", include_schema=True)

        # Profile a table
        pm.discover("crm_db", dataset="dbo.Orders", profile=True, sample_rows=5000)

        # Discover ADLS folder
        pm.discover("raw_adls", dataset="sales/2024/")

        # Search across all connections with a pattern
        pm.discover(pattern="fact_*")
    """
    # Cross-connection discovery when no connection specified
    if connection_name is None:
        results = {}
        pattern = kwargs.get("pattern", "")
        limit = kwargs.get("limit", 200)
        for name, conn in self.connections.items():
            try:
                results[name] = conn.discover_catalog(
                    include_schema=include_schema,
                    include_stats=include_stats,
                    limit=limit,
                    pattern=pattern,
                )
            except NotImplementedError:
                results[name] = {
                    "connection_type": type(conn).__name__,
                    "note": "Discovery not supported",
                }
            except Exception as e:
                results[name] = {"error": str(e)}
        return {
            "connections_scanned": len(results),
            "results": results,
        }

    # Single connection discovery
    conn = self.connections.get(connection_name)

    if not conn:
        available = list(self.connections.keys())
        return {
            "error": {
                "code": "CONNECTION_NOT_FOUND",
                "message": f"Connection '{connection_name}' not found",
                "available_connections": available,
                "fix": f"Use one of: {', '.join(available)}",
            }
        }

    try:
        # Profile specific dataset
        if profile and dataset:
            return conn.profile(
                dataset=dataset, sample_rows=sample_rows, columns=kwargs.get("columns")
            )

        # Get schema for specific dataset
        elif dataset and (include_schema or not profile):
            schema_result = conn.get_table_info(dataset)
            return schema_result

        # Discover catalog
        else:
            return conn.discover_catalog(
                include_schema=include_schema,
                include_stats=include_stats,
                limit=kwargs.get("limit", 200),
                pattern=kwargs.get("pattern", ""),
            )

    except NotImplementedError as e:
        return {
            "error": {
                "code": "NOT_SUPPORTED",
                "message": str(e),
                "connection_type": type(conn).__name__,
                "fix": "This connection type does not support discovery yet",
            }
        }
    except Exception as e:
        self._ctx.error(
            f"Discovery failed for {connection_name}", error=str(e), dataset=dataset
        )
        return {
            "error": {
                "code": "DISCOVERY_FAILED",
                "message": str(e),
                "connection": connection_name,
                "dataset": dataset,
            }
        }

doctor()

Run diagnostics check on environment and configuration.

Returns:

Type Description
Dict[str, Any]

Diagnostic result with status, packages, connections, and issues

Example

result = pm.doctor()
if result["status"] == "healthy":
    print("All good!")
else:
    for issue in result["issues"]:
        print(f"{issue['severity']}: {issue['message']}")

Source code in odibi/pipeline.py
def doctor(self) -> Dict[str, Any]:
    """Run diagnostics check on environment and configuration.

    Returns:
        Diagnostic result with status, packages, connections, and issues

    Example:
        >>> result = pm.doctor()
        >>> if result["status"] == "healthy":
        ...     print("All good!")
        >>> else:
        ...     for issue in result["issues"]:
        ...         print(f"{issue['severity']}: {issue['message']}")
    """
    from odibi.doctor import doctor

    return doctor()

flush_stories(timeout=60.0)

Wait for all pending async story generation to complete.

Call this before operations that need story files to be written, such as lineage generation with SemanticLayerRunner.

Parameters:

Name Type Description Default
timeout float

Maximum seconds to wait per pipeline

60.0

Returns:

Type Description
Dict[str, Optional[str]]

Dict mapping pipeline name to story path (or None if no pending story)

Example

manager.run(pipelines=['bronze', 'silver', 'gold'])
manager.flush_stories()  # Wait for all stories to be written
semantic_runner.run()    # Now lineage can read the stories

Source code in odibi/pipeline.py
def flush_stories(self, timeout: float = 60.0) -> Dict[str, Optional[str]]:
    """Wait for all pending async story generation to complete.

    Call this before operations that need story files to be written,
    such as lineage generation with SemanticLayerRunner.

    Args:
        timeout: Maximum seconds to wait per pipeline

    Returns:
        Dict mapping pipeline name to story path (or None if no pending story)

    Example:
        >>> manager.run(pipelines=['bronze', 'silver', 'gold'])
        >>> manager.flush_stories()  # Wait for all stories to be written
        >>> semantic_runner.run()    # Now lineage can read the stories
    """
    results = {}
    for name, pipeline in self._pipelines.items():
        story_path = pipeline.flush_stories(timeout=timeout)
        if story_path:
            results[name] = story_path
            self._ctx.debug(f"Story flushed for {name}", path=story_path)
    if results:
        self._ctx.info(f"Flushed {len(results)} pending story writes")
    return results

freshness(connection_name, dataset, timestamp_column=None)

Check data freshness for a dataset.

Parameters:

Name Type Description Default
connection_name str

Name of connection from YAML config

required
dataset str

Table name or file path

required
timestamp_column Optional[str]

Column to check for max timestamp (SQL only)

None

Returns:

Type Description
Dict[str, Any]

FreshnessResult dict with last_updated, age_hours, etc.

Examples:

Check freshness using table metadata

pm.freshness("crm_db", "dbo.Orders")

Check freshness using a specific timestamp column

pm.freshness("crm_db", "dbo.Orders", timestamp_column="order_date")

Check file freshness

pm.freshness("raw_data", "sales/2024.csv")

Source code in odibi/pipeline.py
def freshness(
    self,
    connection_name: str,
    dataset: str,
    timestamp_column: Optional[str] = None,
) -> Dict[str, Any]:
    """Check data freshness for a dataset.

    Args:
        connection_name: Name of connection from YAML config
        dataset: Table name or file path
        timestamp_column: Column to check for max timestamp (SQL only)

    Returns:
        FreshnessResult dict with last_updated, age_hours, etc.

    Examples:
        # Check freshness using table metadata
        pm.freshness("crm_db", "dbo.Orders")

        # Check freshness using a specific timestamp column
        pm.freshness("crm_db", "dbo.Orders", timestamp_column="order_date")

        # Check file freshness
        pm.freshness("raw_data", "sales/2024.csv")
    """
    conn = self.connections.get(connection_name)
    if not conn:
        available = list(self.connections.keys())
        return {
            "error": {
                "code": "CONNECTION_NOT_FOUND",
                "message": f"Connection '{connection_name}' not found",
                "available_connections": available,
            }
        }

    try:
        return conn.get_freshness(dataset=dataset, timestamp_column=timestamp_column)
    except TypeError:
        # LocalConnection/ADLS don't accept timestamp_column
        return conn.get_freshness(dataset=dataset)
    except NotImplementedError as e:
        return {
            "error": {
                "code": "NOT_SUPPORTED",
                "message": str(e),
                "fix": "This connection type does not support freshness checks",
            }
        }
    except Exception as e:
        self._ctx.error(
            f"Freshness check failed for {connection_name}", error=str(e), dataset=dataset
        )
        return {"error": {"code": "FRESHNESS_FAILED", "message": str(e)}}

from_yaml(yaml_path, env=None) classmethod

Create PipelineManager from YAML file.

Parameters:

Name Type Description Default
yaml_path str

Path to YAML configuration file

required
env str

Environment name to apply overrides (e.g. 'prod')

None

Returns:

Type Description
PipelineManager

PipelineManager instance ready to run pipelines

Example

manager = PipelineManager.from_yaml("config.yaml", env="prod")
results = manager.run()  # Run all pipelines

Source code in odibi/pipeline.py
@classmethod
def from_yaml(
    cls: Type["PipelineManager"], yaml_path: str, env: Optional[str] = None
) -> "PipelineManager":
    """Create PipelineManager from YAML file.

    Args:
        yaml_path: Path to YAML configuration file
        env: Environment name to apply overrides (e.g. 'prod')

    Returns:
        PipelineManager instance ready to run pipelines

    Raises:
        FileNotFoundError: If the YAML configuration file does not exist

    Example:
        >>> manager = PipelineManager.from_yaml("config.yaml", env="prod")
        >>> results = manager.run()  # Run all pipelines
    """
    logger.info(f"Loading configuration from: {yaml_path}")

    register_standard_library()

    yaml_path_obj = Path(yaml_path)
    config_dir = yaml_path_obj.parent.absolute()

    import importlib.util
    import os
    import sys

    # Load .env file from config directory if it exists (best-effort; missing
    # python-dotenv only produces a warning, not a failure).
    env_file = config_dir / ".env"
    if env_file.exists():
        try:
            from dotenv import load_dotenv

            load_dotenv(env_file, override=True)
            logger.debug(f"Loaded environment from: {env_file}")
        except ImportError:
            logger.warning("python-dotenv not installed, skipping .env file")

    def load_transforms_module(path: str) -> None:
        # Auto-discover user transform functions from a transforms.py file.
        # Failures are logged and swallowed so a broken transforms file does
        # not prevent the configuration from loading.
        if os.path.exists(path):
            try:
                spec = importlib.util.spec_from_file_location("transforms_autodiscovered", path)
                if spec and spec.loader:
                    module = importlib.util.module_from_spec(spec)
                    sys.modules["transforms_autodiscovered"] = module
                    spec.loader.exec_module(module)
                    logger.info(f"Auto-loaded transforms from: {path}")
            except Exception as e:
                logger.warning(f"Failed to auto-load transforms from {path}: {e}")

    # Look next to the YAML file first, then in the current working directory
    # (skipped when they are the same directory to avoid double-loading).
    load_transforms_module(os.path.join(config_dir, "transforms.py"))

    cwd = os.getcwd()
    if os.path.abspath(cwd) != str(config_dir):
        load_transforms_module(os.path.join(cwd, "transforms.py"))

    try:
        config = load_yaml_with_env(str(yaml_path_obj), env=env)
        logger.debug("Configuration loaded successfully")
    except FileNotFoundError:
        logger.error(f"YAML file not found: {yaml_path}")
        raise FileNotFoundError(
            f"YAML file not found: {yaml_path}. "
            f"Verify the file exists and consider using an absolute path."
        )

    # Pydantic-style validation of the raw config dict.
    project_config = ProjectConfig(**config)
    logger.debug(
        "Project config validated",
        project=project_config.project,
        pipelines=len(project_config.pipelines),
    )

    connections = cls._build_connections(project_config.connections)

    return cls(
        project_config=project_config,
        connections=connections,
    )

get_all_state(prefix=None)

Get all state entries, optionally filtered by key prefix.

Parameters:

Name Type Description Default
prefix Optional[str]

Optional key prefix to filter by

None

Returns:

Type Description
DataFrame

DataFrame with state entries

Source code in odibi/pipeline.py
def get_all_state(self, prefix: Optional[str] = None) -> "pd.DataFrame":
    """Return every state entry, optionally restricted to a key prefix.

    Args:
        prefix: Optional key prefix to filter by

    Returns:
        DataFrame with state entries (empty on error or when no catalog
        manager is configured)
    """
    import pandas as pd

    catalog = self.catalog_manager
    if not catalog:
        return pd.DataFrame()

    try:
        entries = catalog._read_table(catalog.tables["meta_state"])
        # Only filter when there is data, a prefix was given, and the
        # expected "key" column is present.
        keep_all = entries.empty or not prefix or "key" not in entries.columns
        return entries if keep_all else entries[entries["key"].str.startswith(prefix)]
    except Exception as e:
        self._ctx.warning(f"Failed to get state: {e}")
        return pd.DataFrame()

get_lineage(table, direction='both')

Get lineage for a table.

Parameters:

Name Type Description Default
table str

Table identifier (supports smart path resolution)

required
direction str

"upstream", "downstream", or "both"

'both'

Returns:

Type Description
DataFrame

DataFrame with lineage relationships

Source code in odibi/pipeline.py
def get_lineage(
    self,
    table: str,
    direction: str = "both",
) -> "pd.DataFrame":
    """Return lineage relationships for a table as a DataFrame.

    Args:
        table: Table identifier (supports smart path resolution)
        direction: "upstream", "downstream", or "both"

    Returns:
        DataFrame with lineage relationships; each row is tagged with a
        "direction" column
    """
    import pandas as pd

    if not self.catalog_manager:
        return pd.DataFrame()

    try:
        path = self._resolve_table_path(table)

        # Upstream first, then downstream, matching the requested direction.
        fetchers = (
            ("upstream", self.catalog_manager.get_upstream),
            ("downstream", self.catalog_manager.get_downstream),
        )
        rows = []
        for label, fetch in fetchers:
            if direction not in (label, "both"):
                continue
            for relationship in fetch(path):
                relationship["direction"] = label
                rows.append(relationship)

        return pd.DataFrame(rows)
    except Exception as e:
        self._ctx.warning(f"Failed to get lineage: {e}")
        return pd.DataFrame()

get_node_stats(node, days=7)

Get average duration, row counts, success rate over period.

Parameters:

Name Type Description Default
node str

Node name

required
days int

Number of days to look back

7

Returns:

Type Description
Dict[str, Any]

Dict with node statistics

Source code in odibi/pipeline.py
def get_node_stats(self, node: str, days: int = 7) -> Dict[str, Any]:
    """Summarize a node's recent runs: duration, row counts, success rate.

    Args:
        node: Node name
        days: Number of days to look back

    Returns:
        Dict with node statistics ({} when no catalog manager or on error)
    """
    import pandas as pd

    if not self.catalog_manager:
        return {}

    try:
        mean_duration = self.catalog_manager.get_average_duration(node, days)

        runs = self.catalog_manager._read_local_table(self.catalog_manager.tables["meta_runs"])
        if runs.empty:
            return {"node": node, "runs": 0}

        # Restrict to the lookback window, coercing timestamps to tz-aware
        # UTC first so the comparison is well-defined.
        if "timestamp" in runs.columns:
            window_start = pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=days)
            if not pd.api.types.is_datetime64_any_dtype(runs["timestamp"]):
                runs["timestamp"] = pd.to_datetime(runs["timestamp"])
            if runs["timestamp"].dt.tz is None:
                runs["timestamp"] = runs["timestamp"].dt.tz_localize("UTC")
            runs = runs[runs["timestamp"] >= window_start]

        node_runs = runs[runs["node_name"] == node]
        if node_runs.empty:
            return {"node": node, "runs": 0}

        n_total = len(node_runs)
        n_success = len(node_runs[node_runs["status"] == "SUCCESS"])
        mean_rows = node_runs["rows_processed"].mean() if "rows_processed" in node_runs else None

        return {
            "node": node,
            "runs": n_total,
            "success_rate": n_success / n_total if n_total > 0 else 0,
            "avg_duration_s": mean_duration,
            "avg_rows": mean_rows,
            "period_days": days,
        }
    except Exception as e:
        self._ctx.warning(f"Failed to get node stats: {e}")
        return {}

get_pipeline(name)

Get a specific pipeline instance.

Parameters:

Name Type Description Default
name str

Pipeline name

required

Returns:

Type Description
Pipeline

Pipeline instance

Raises:

Type Description
ValueError

If pipeline not found

Source code in odibi/pipeline.py
def get_pipeline(self, name: str) -> Pipeline:
    """Look up a configured pipeline instance by name.

    Args:
        name: Pipeline name

    Returns:
        Pipeline instance

    Raises:
        ValueError: If pipeline not found
    """
    # Guard clause: return early on the happy path.
    if name in self._pipelines:
        return self._pipelines[name]
    available = ", ".join(self._pipelines.keys())
    raise ValueError(f"Pipeline '{name}' not found. Available: {available}")

get_pipeline_status(pipeline)

Get last run status, duration, timestamp for a pipeline.

Parameters:

Name Type Description Default
pipeline str

Pipeline name

required

Returns:

Type Description
Dict[str, Any]

Dict with status info

Source code in odibi/pipeline.py
def get_pipeline_status(self, pipeline: str) -> Dict[str, Any]:
    """Report the most recent run's status, duration, and timestamp.

    Args:
        pipeline: Pipeline name

    Returns:
        Dict with status info ({} when no catalog manager or on error;
        a "never_run" marker when the pipeline has no recorded runs)
    """
    if not self.catalog_manager:
        return {}

    try:
        history = self.list_runs(pipeline=pipeline, limit=1)
        if history.empty:
            return {"status": "never_run", "pipeline": pipeline}

        latest = history.iloc[0].to_dict()
        # Map output fields onto the run-record column names.
        field_map = {
            "last_status": "status",
            "last_run_at": "timestamp",
            "last_duration_ms": "duration_ms",
            "last_node": "node_name",
        }
        info: Dict[str, Any] = {"pipeline": pipeline}
        info.update({out: latest.get(src) for out, src in field_map.items()})
        return info
    except Exception as e:
        self._ctx.warning(f"Failed to get pipeline status: {e}")
        return {}

get_schema_history(table, limit=5)

Get schema version history for a table.

Parameters:

Name Type Description Default
table str

Table identifier (supports smart path resolution)

required
limit int

Maximum number of versions to return

5

Returns:

Type Description
DataFrame

DataFrame with schema history

Source code in odibi/pipeline.py
def get_schema_history(
    self,
    table: str,
    limit: int = 5,
) -> "pd.DataFrame":
    """Fetch schema version history for a table.

    Args:
        table: Table identifier (supports smart path resolution)
        limit: Maximum number of versions to return

    Returns:
        DataFrame with schema history (empty on error or when no catalog
        manager is configured)
    """
    import pandas as pd

    catalog = self.catalog_manager
    if not catalog:
        return pd.DataFrame()

    try:
        versions = catalog.get_schema_history(self._resolve_table_path(table), limit)
        return pd.DataFrame(versions)
    except Exception as e:
        self._ctx.warning(f"Failed to get schema history: {e}")
        return pd.DataFrame()

get_state(key)

Get a specific state entry (HWM, content hash, etc.).

Parameters:

Name Type Description Default
key str

The state key to look up

required

Returns:

Type Description
Optional[Dict[str, Any]]

Dictionary with state data or None if not found

Source code in odibi/pipeline.py
def get_state(self, key: str) -> Optional[Dict[str, Any]]:
    """Fetch one state entry (HWM, content hash, etc.) by key.

    Args:
        key: The state key to look up

    Returns:
        Dictionary with state data or None if not found
    """
    catalog = self.catalog_manager
    if not catalog:
        return None

    try:
        entries = catalog._read_table(catalog.tables["meta_state"])
        if entries.empty or "key" not in entries.columns:
            return None

        matches = entries[entries["key"] == key]
        # First matching row wins; misses return None.
        return None if matches.empty else matches.iloc[0].to_dict()
    except Exception:
        return None

list_pipelines()

Get list of available pipeline names.

Returns:

Type Description
List[str]

List of pipeline names

Source code in odibi/pipeline.py
def list_pipelines(self) -> List[str]:
    """Return the names of all configured pipelines.

    Returns:
        List of pipeline names
    """
    # Dict iteration yields keys in insertion (configuration) order.
    return [name for name in self._pipelines]

list_registered_nodes(pipeline=None)

List nodes from the system catalog.

Parameters:

Name Type Description Default
pipeline Optional[str]

Optional pipeline name to filter by

None

Returns:

Type Description
DataFrame

DataFrame with node metadata from meta_nodes

Source code in odibi/pipeline.py
def list_registered_nodes(self, pipeline: Optional[str] = None) -> "pd.DataFrame":
    """List nodes recorded in the system catalog (meta_nodes).

    Args:
        pipeline: Optional pipeline name to filter by

    Returns:
        DataFrame with node metadata from meta_nodes (empty on error or
        when no catalog manager is configured)
    """
    import pandas as pd

    catalog = self.catalog_manager
    if not catalog:
        self._ctx.warning("Catalog manager not configured")
        return pd.DataFrame()

    try:
        nodes = catalog._read_local_table(catalog.tables["meta_nodes"])
        if pipeline and not nodes.empty:
            nodes = nodes[nodes["pipeline_name"] == pipeline]
        return nodes
    except Exception as e:
        self._ctx.warning(f"Failed to list nodes: {e}")
        return pd.DataFrame()

list_registered_pipelines()

List all registered pipelines from the system catalog.

Returns:

Type Description
DataFrame

DataFrame with pipeline metadata from meta_pipelines

Source code in odibi/pipeline.py
def list_registered_pipelines(self) -> "pd.DataFrame":
    """List pipelines recorded in the system catalog (meta_pipelines).

    Returns:
        DataFrame with pipeline metadata from meta_pipelines (empty on
        error or when no catalog manager is configured)
    """
    import pandas as pd

    catalog = self.catalog_manager
    if not catalog:
        self._ctx.warning("Catalog manager not configured")
        return pd.DataFrame()

    try:
        return catalog._read_local_table(catalog.tables["meta_pipelines"])
    except Exception as e:
        self._ctx.warning(f"Failed to list pipelines: {e}")
        return pd.DataFrame()

list_runs(pipeline=None, node=None, status=None, limit=10)

List recent runs with optional filters.

Parameters:

Name Type Description Default
pipeline Optional[str]

Optional pipeline name to filter by

None
node Optional[str]

Optional node name to filter by

None
status Optional[str]

Optional status to filter by (SUCCESS, FAILURE)

None
limit int

Maximum number of runs to return

10

Returns:

Type Description
DataFrame

DataFrame with run history from meta_runs

Source code in odibi/pipeline.py
def list_runs(
    self,
    pipeline: Optional[str] = None,
    node: Optional[str] = None,
    status: Optional[str] = None,
    limit: int = 10,
) -> "pd.DataFrame":
    """Return recent runs from meta_runs, newest first.

    Args:
        pipeline: Optional pipeline name to filter by
        node: Optional node name to filter by
        status: Optional status to filter by (SUCCESS, FAILURE)
        limit: Maximum number of runs to return

    Returns:
        DataFrame with run history from meta_runs (empty on error or
        when no catalog manager is configured)
    """
    import pandas as pd

    catalog = self.catalog_manager
    if not catalog:
        self._ctx.warning("Catalog manager not configured")
        return pd.DataFrame()

    try:
        runs = catalog._read_local_table(catalog.tables["meta_runs"])
        if runs.empty:
            return runs

        # Apply each requested equality filter in turn.
        for column, wanted in (
            ("pipeline_name", pipeline),
            ("node_name", node),
            ("status", status),
        ):
            if wanted:
                runs = runs[runs[column] == wanted]

        if "timestamp" in runs.columns:
            runs = runs.sort_values("timestamp", ascending=False)

        return runs.head(limit)
    except Exception as e:
        self._ctx.warning(f"Failed to list runs: {e}")
        return pd.DataFrame()

list_tables()

List registered assets from meta_tables.

Returns:

Type Description
DataFrame

DataFrame with table/asset metadata

Source code in odibi/pipeline.py
def list_tables(self) -> "pd.DataFrame":
    """List registered assets from meta_tables.

    Returns:
        DataFrame with table/asset metadata (empty on error or when no
        catalog manager is configured)
    """
    import pandas as pd

    catalog = self.catalog_manager
    if not catalog:
        self._ctx.warning("Catalog manager not configured")
        return pd.DataFrame()

    try:
        return catalog._read_local_table(catalog.tables["meta_tables"])
    except Exception as e:
        self._ctx.warning(f"Failed to list tables: {e}")
        return pd.DataFrame()

preview(connection_name, dataset, rows=5, columns=None)

Preview sample rows from a dataset.

Parameters:

Name Type Description Default
connection_name str

Name of connection from YAML config

required
dataset str

Table name or file path to preview

required
rows int

Number of rows to return (default: 5, max: 100)

5
columns Optional[List[str]]

Specific columns to include (None = all)

None

Returns:

Type Description
Dict[str, Any]

PreviewResult dict with sample rows

Examples:

Preview a SQL table

pm.preview("crm_db", "dbo.Orders", rows=10)

Preview specific columns

pm.preview("crm_db", "dbo.Orders", columns=["order_id", "total"])

Preview a CSV file

pm.preview("raw_data", "sales/2024.csv")

Source code in odibi/pipeline.py
def preview(
    self,
    connection_name: str,
    dataset: str,
    rows: int = 5,
    columns: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Fetch a small sample of rows from a dataset on a named connection.

    Args:
        connection_name: Name of connection from YAML config
        dataset: Table name or file path to preview
        rows: Number of rows to return (default: 5, max: 100)
        columns: Specific columns to include (None = all)

    Returns:
        PreviewResult dict with sample rows, or an error dict with a
        machine-readable "code"

    Examples:
        # Preview a SQL table
        pm.preview("crm_db", "dbo.Orders", rows=10)

        # Preview specific columns
        pm.preview("crm_db", "dbo.Orders", columns=["order_id", "total"])

        # Preview a CSV file
        pm.preview("raw_data", "sales/2024.csv")
    """
    conn = self.connections.get(connection_name)
    if not conn:
        # Unknown connection: report what IS available to aid correction.
        return {
            "error": {
                "code": "CONNECTION_NOT_FOUND",
                "message": f"Connection '{connection_name}' not found",
                "available_connections": list(self.connections.keys()),
            }
        }

    try:
        return conn.preview(dataset=dataset, rows=rows, columns=columns)
    except NotImplementedError as e:
        return {
            "error": {
                "code": "NOT_SUPPORTED",
                "message": str(e),
                "fix": "This connection type does not support preview",
            }
        }
    except Exception as e:
        self._ctx.error(f"Preview failed for {connection_name}", error=str(e), dataset=dataset)
        return {"error": {"code": "PREVIEW_FAILED", "message": str(e)}}

register_outputs(pipelines=None)

Pre-register node outputs from pipeline configs without running them.

Scans pipeline nodes for output locations (write blocks, merge/scd2 params) and registers them to meta_outputs. This enables cross-pipeline references without requiring the source pipelines to have run first.

Parameters:

Name Type Description Default
pipelines Optional[Union[str, List[str]]]

Pipeline name(s) to register. If None, registers all pipelines.

None

Returns:

Type Description
Dict[str, int]

Dict mapping pipeline name to number of outputs registered

Example

manager = PipelineManager.from_yaml("pipelines.yaml") counts = manager.register_outputs("silver") # Register just silver counts = manager.register_outputs() # Register all pipelines

Source code in odibi/pipeline.py
def register_outputs(
    self,
    pipelines: Optional[Union[str, List[str]]] = None,
) -> Dict[str, int]:
    """
    Pre-register node outputs from pipeline configs without running them.

    Scans pipeline nodes for output locations (write blocks, merge/scd2 params)
    and registers them to meta_outputs. This enables cross-pipeline references
    without requiring the source pipelines to have run first.

    Args:
        pipelines: Pipeline name(s) to register. If None, registers all pipelines.

    Returns:
        Dict mapping pipeline name to number of outputs registered

    Example:
        >>> manager = PipelineManager.from_yaml("pipelines.yaml")
        >>> counts = manager.register_outputs("silver")  # Register just silver
        >>> counts = manager.register_outputs()  # Register all pipelines
    """
    # Normalize the argument to a list of names (None = all pipelines).
    if pipelines is None:
        selected = list(self._pipelines.keys())
    elif isinstance(pipelines, str):
        selected = [pipelines]
    else:
        selected = pipelines

    counts: Dict[str, int] = {}
    for name in selected:
        target = self._pipelines.get(name)
        if target is None:
            # Unknown names are skipped with a warning, not an error.
            self._ctx.warning(f"Pipeline not found: {name}")
            continue
        counts[name] = target.register_outputs()

    total = sum(counts.values())
    self._ctx.info(f"Pre-registered {total} outputs from {len(counts)} pipelines")
    return counts

relationships(connection_name, schema=None)

Discover foreign key relationships in a SQL database.

Parameters:

Name Type Description Default
connection_name str

Name of SQL connection from YAML config

required
schema Optional[str]

Limit to specific schema (default: all schemas)

None

Returns:

Type Description
Union[List[Dict[str, Any]], Dict[str, Any]]

List of Relationship dicts, or error dict

Examples:

Discover all FKs

pm.relationships("crm_db")

FKs in a specific schema

pm.relationships("crm_db", schema="dbo")

Source code in odibi/pipeline.py
def relationships(
    self,
    connection_name: str,
    schema: Optional[str] = None,
) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
    """Discover foreign key relationships in a SQL database.

    Args:
        connection_name: Name of SQL connection from YAML config
        schema: Limit to specific schema (default: all schemas)

    Returns:
        List of Relationship dicts, or an error dict with a
        machine-readable "code"

    Examples:
        # Discover all FKs
        pm.relationships("crm_db")

        # FKs in a specific schema
        pm.relationships("crm_db", schema="dbo")
    """
    conn = self.connections.get(connection_name)
    if not conn:
        return {
            "error": {
                "code": "CONNECTION_NOT_FOUND",
                "message": f"Connection '{connection_name}' not found",
                "available_connections": list(self.connections.keys()),
            }
        }

    # Only connection types exposing a relationships() API are supported.
    if not hasattr(conn, "relationships"):
        return {
            "error": {
                "code": "NOT_SUPPORTED",
                "message": f"{type(conn).__name__} does not support relationship discovery",
                "fix": "Only SQL connections support FK discovery",
            }
        }

    try:
        return conn.relationships(schema=schema)
    except Exception as e:
        self._ctx.error(f"Relationship discovery failed for {connection_name}", error=str(e))
        return {"error": {"code": "DISCOVERY_FAILED", "message": str(e)}}

run(pipelines=None, dry_run=False, resume_from_failure=False, parallel=False, max_workers=4, on_error=None, tag=None, node=None, console=False)

Run one, multiple, or all pipelines.

Parameters:

Name Type Description Default
pipelines Optional[Union[str, List[str]]]

Pipeline name(s) to run.

None
dry_run bool

Whether to simulate execution.

False
resume_from_failure bool

Whether to skip successfully completed nodes from last run.

False
parallel bool

Whether to run nodes in parallel.

False
max_workers int

Maximum number of worker threads for parallel execution.

4
on_error Optional[str]

Override error handling strategy (fail_fast, fail_later, ignore).

None
tag Optional[str]

Filter nodes by tag (only nodes with this tag will run).

None
node Optional[Union[str, List[str]]]

Run only specific node(s) by name - can be a string or list of strings.

None
console bool

Whether to show rich console output with progress.

False

Returns:

Type Description
Union[PipelineResults, Dict[str, PipelineResults]]

PipelineResults or Dict of results

Source code in odibi/pipeline.py
def run(
    self,
    pipelines: Optional[Union[str, List[str]]] = None,
    dry_run: bool = False,
    resume_from_failure: bool = False,
    parallel: bool = False,
    max_workers: int = 4,
    on_error: Optional[str] = None,
    tag: Optional[str] = None,
    node: Optional[Union[str, List[str]]] = None,
    console: bool = False,
) -> Union[PipelineResults, Dict[str, PipelineResults]]:
    """Run one, multiple, or all pipelines.

    Args:
        pipelines: Pipeline name(s) to run.
        dry_run: Whether to simulate execution.
        resume_from_failure: Whether to skip successfully completed nodes from last run.
        parallel: Whether to run nodes in parallel.
        max_workers: Maximum number of worker threads for parallel execution.
        on_error: Override error handling strategy (fail_fast, fail_later, ignore).
        tag: Filter nodes by tag (only nodes with this tag will run).
        node: Run only specific node(s) by name - can be a string or list of strings.
        console: Whether to show rich console output with progress.

    Returns:
        PipelineResults or Dict of results (a bare PipelineResults when
        exactly one pipeline name was requested, otherwise a dict keyed
        by pipeline name).

    Raises:
        ValueError: If any requested pipeline name is unknown.
    """
    import time

    # Wall-clock bookkeeping that feeds the overhead audit printed at the end.
    t_start = time.time()
    overhead_timings = {}

    # Normalize the `pipelines` argument to a list of names (None = run all).
    if pipelines is None:
        pipeline_names = list(self._pipelines.keys())
    elif isinstance(pipelines, str):
        pipeline_names = [pipelines]
    else:
        pipeline_names = pipelines

    # Validate every requested name up front so we fail before running anything.
    for name in pipeline_names:
        if name not in self._pipelines:
            available = ", ".join(self._pipelines.keys())
            self._ctx.error(
                f"Pipeline not found: {name}",
                available=list(self._pipelines.keys()),
            )
            raise ValueError(f"Pipeline '{name}' not found. Available pipelines: {available}")

    # Phase 2: Auto-register pipelines and nodes before execution
    t_register_start = time.time()
    if self.catalog_manager:
        self._auto_register_pipelines(pipeline_names)
    t_register_end = time.time()
    overhead_timings["auto_register"] = t_register_end - t_register_start

    self._ctx.info(
        f"Running {len(pipeline_names)} pipeline(s)",
        pipelines=pipeline_names,
        dry_run=dry_run,
        parallel=parallel,
    )

    results = {}
    inter_pipeline_gaps = []
    lineage_futures = []  # Track incremental lineage building

    # Execute the requested pipelines sequentially, in order.
    for idx, name in enumerate(pipeline_names):
        t_pre_pipeline = time.time()

        self._ctx.info(
            f"Executing pipeline {idx + 1}/{len(pipeline_names)}: {name}",
            pipeline=name,
            order=idx + 1,
        )

        t_pipeline_start = time.time()
        results[name] = self._pipelines[name].run(
            dry_run=dry_run,
            resume_from_failure=resume_from_failure,
            parallel=parallel,
            max_workers=max_workers,
            on_error=on_error,
            tag=tag,
            node=node,
            console=console,
        )

        result = results[name]
        status = "SUCCESS" if not result.failed else "FAILED"
        self._ctx.info(
            f"Pipeline {status}: {name}",
            status=status,
            duration_s=round(result.duration, 2),
            completed=len(result.completed),
            failed=len(result.failed),
        )

        if result.story_path:
            self._ctx.debug(f"Story generated: {result.story_path}")

        # Start building lineage for this pipeline incrementally (if async enabled)
        has_story = hasattr(self.project_config, "story") and self.project_config.story
        generate_lineage_enabled = has_story and self.project_config.story.generate_lineage
        async_lineage = True
        if self.project_config and self.project_config.system:
            async_lineage = getattr(self.project_config.system, "async_lineage", True)

        if generate_lineage_enabled and async_lineage and result.story_path:
            # Build this pipeline's lineage piece in background
            from concurrent.futures import ThreadPoolExecutor

            # Lazily create a small shared executor on first use.
            if not hasattr(self, "_lineage_executor"):
                self._lineage_executor = ThreadPoolExecutor(
                    max_workers=3, thread_name_prefix="Lineage"
                )

            future = self._lineage_executor.submit(
                self._build_pipeline_lineage_piece, name, result.story_path
            )
            lineage_futures.append((name, future))
            self._ctx.debug(f"Started incremental lineage building for {name}")

        # Track inter-pipeline overhead (time between pipelines excluding actual execution)
        if idx > 0:
            gap = t_pipeline_start - t_pre_pipeline
            inter_pipeline_gaps.append(gap)

    overhead_timings["inter_pipeline_gaps_total"] = sum(inter_pipeline_gaps)

    # Generate combined lineage if configured
    t_lineage_start = time.time()
    has_story = hasattr(self.project_config, "story") and self.project_config.story
    generate_lineage_enabled = has_story and self.project_config.story.generate_lineage
    async_lineage = True
    if self.project_config and self.project_config.system:
        async_lineage = getattr(self.project_config.system, "async_lineage", True)

    self._ctx.debug(
        "Lineage check",
        has_story=has_story,
        generate_lineage_enabled=generate_lineage_enabled,
    )

    if generate_lineage_enabled:
        # Flush any pending async story writes before generating lineage
        self._ctx.info("Generating combined lineage...")
        self.flush_stories()

        try:
            if async_lineage and lineage_futures:
                # Merge incrementally-built lineage pieces (fast!)
                self._ctx.debug(
                    f"Waiting for {len(lineage_futures)} incremental lineage pieces..."
                )
                lineage_pieces = []
                for pipeline_name, future in lineage_futures:
                    try:
                        piece = future.result(timeout=30)  # Should be done already
                        if piece:
                            lineage_pieces.append(piece)
                        self._ctx.debug(f"Got lineage piece for {pipeline_name}")
                    except Exception as e:
                        # A single failed piece is logged but does not abort the merge.
                        self._ctx.warning(
                            f"Failed to get lineage piece for {pipeline_name}: {e}"
                        )

                # Merge the pieces
                if lineage_pieces:
                    lineage_result = self._merge_lineage_pieces(lineage_pieces)
                    if lineage_result:
                        self._ctx.info(
                            "Combined lineage generated from incremental pieces",
                            nodes=len(lineage_result.nodes),
                            edges=len(lineage_result.edges),
                            json_path=lineage_result.json_path,
                        )
                else:
                    self._ctx.warning("No lineage pieces generated")

                # Cleanup executor
                if hasattr(self, "_lineage_executor"):
                    self._lineage_executor.shutdown(wait=False)
            else:
                # Synchronous - build entire lineage at once (original behavior)
                lineage_result = generate_lineage(self.project_config)
                if lineage_result:
                    self._ctx.info(
                        "Combined lineage generated",
                        nodes=len(lineage_result.nodes),
                        edges=len(lineage_result.edges),
                        json_path=lineage_result.json_path,
                    )
                else:
                    self._ctx.warning("Lineage generation returned None")
        except Exception as e:
            self._ctx.warning(f"Failed to generate combined lineage: {e}")
    t_lineage_end = time.time()
    overhead_timings["lineage_generation"] = t_lineage_end - t_lineage_start

    # Wait for any pending async catalog syncs to complete (with reduced timeout)
    t_sync_start = time.time()

    # Check if we should skip waiting in Databricks
    skip_sync_wait = False
    if self.project_config and self.project_config.system:
        skip_in_databricks = getattr(
            self.project_config.system, "skip_sync_wait_in_databricks", True
        )
        if skip_in_databricks and self._is_databricks():
            skip_sync_wait = True
            self._ctx.info(
                "Running in Databricks - skipping sync wait (threads continue in background)"
            )

    if not skip_sync_wait:
        sync_timeout = 30.0  # Default reduced timeout for performance
        if self.project_config and self.project_config.system:
            sync_timeout = getattr(self.project_config.system, "sync_timeout_seconds", 30.0)

        for name in pipeline_names:
            pipeline = self._pipelines[name]
            if hasattr(pipeline, "flush_sync"):
                # Configurable timeout (default 30s, was 300s)
                # Sync is incremental, so incomplete syncs will catch up next run
                pipeline.flush_sync(timeout=sync_timeout)

    t_sync_end = time.time()
    overhead_timings["catalog_sync"] = t_sync_end - t_sync_start

    t_end = time.time()
    total_overhead = t_end - t_start

    # Calculate actual pipeline execution time
    actual_execution = sum(r.duration for r in results.values())
    overhead_timings["total_overhead"] = total_overhead
    overhead_timings["actual_execution"] = actual_execution
    overhead_timings["overhead_delta"] = total_overhead - actual_execution

    # Print overhead audit report
    def _pct(value, total):
        # Guard against divide-by-zero when total elapsed time is 0.
        return 100 * value / total if total > 0 else 0.0

    print("\n" + "=" * 100)
    print("INTER-PIPELINE OVERHEAD AUDIT")
    print("=" * 100)
    print(f"Total wall-clock time:          {total_overhead:>8.2f}s (100.0%)")
    print(
        f"Actual pipeline execution:      {actual_execution:>8.2f}s ({_pct(actual_execution, total_overhead):>5.1f}%)"
    )
    print(
        f"Total overhead:                 {overhead_timings['overhead_delta']:>8.2f}s ({_pct(overhead_timings['overhead_delta'], total_overhead):>5.1f}%)"
    )
    print("-" * 100)
    print("OVERHEAD BREAKDOWN:")
    print(
        f"  Auto-register pipelines:      {overhead_timings.get('auto_register', 0):>8.2f}s ({_pct(overhead_timings.get('auto_register', 0), total_overhead):>5.1f}%)"
    )
    print(
        f"  Inter-pipeline gaps:          {overhead_timings.get('inter_pipeline_gaps_total', 0):>8.2f}s ({_pct(overhead_timings.get('inter_pipeline_gaps_total', 0), total_overhead):>5.1f}%)"
    )
    print(
        f"  Lineage generation:           {overhead_timings.get('lineage_generation', 0):>8.2f}s ({_pct(overhead_timings.get('lineage_generation', 0), total_overhead):>5.1f}%)"
    )
    print(
        f"  Catalog sync flush:           {overhead_timings.get('catalog_sync', 0):>8.2f}s ({_pct(overhead_timings.get('catalog_sync', 0), total_overhead):>5.1f}%)"
    )
    print("=" * 100 + "\n")

    # A single requested pipeline returns its bare result for convenience.
    if len(pipeline_names) == 1:
        return results[pipeline_names[0]]
    else:
        return results

scaffold_project(project_name, connections, **kwargs)

Generate project YAML scaffold.

Parameters:

Name Type Description Default
project_name str

Name of the project

required
connections Dict[str, Dict[str, Any]]

Dict of connection name -> connection config

required
**kwargs

Additional options (imports, story_connection, system_connection)

{}

Returns:

Type Description
str

YAML string for project.yaml

Example

connections = { ... "local": {"type": "local", "base_path": "data/"}, ... "azure": {"type": "azure_blob", "account_name": "myaccount"} ... } yaml = pm.scaffold_project("my_project", connections)

Source code in odibi/pipeline.py
def scaffold_project(
    self, project_name: str, connections: Dict[str, Dict[str, Any]], **kwargs
) -> str:
    """Build a project.yaml scaffold and return it as a YAML string.

    Args:
        project_name: Name of the project
        connections: Mapping of connection name to its connection config
        **kwargs: Extra options (imports, story_connection, system_connection)

    Returns:
        YAML string suitable for writing to project.yaml

    Example:
        >>> connections = {
        ...     "local": {"type": "local", "base_path": "data/"},
        ...     "azure": {"type": "azure_blob", "account_name": "myaccount"}
        ... }
        >>> yaml = pm.scaffold_project("my_project", connections)
    """
    # Imported lazily so scaffold machinery is only loaded when requested.
    from odibi.scaffold import generate_project_yaml

    rendered = generate_project_yaml(project_name, connections, **kwargs)
    return rendered

scaffold_sql_pipeline(pipeline_name, source_connection, target_connection, tables, **kwargs)

Generate SQL ingestion pipeline YAML.

Parameters:

Name Type Description Default
pipeline_name str

Name for the pipeline

required
source_connection str

SQL database connection name

required
target_connection str

Target storage connection name

required
tables List[Dict[str, Any]]

List of table specs (see generate_sql_pipeline for details)

required
**kwargs

Additional options (target_format, target_schema, layer, node_prefix)

{}

Returns:

Type Description
str

YAML string for pipeline file

Example

tables = [ ... {"schema": "dbo", "table": "customers", "primary_key": ["id"]}, ... {"schema": "dbo", "table": "orders"} ... ] yaml = pm.scaffold_sql_pipeline("ingest", "sqldb", "lake", tables)

Source code in odibi/pipeline.py
def scaffold_sql_pipeline(
    self,
    pipeline_name: str,
    source_connection: str,
    target_connection: str,
    tables: List[Dict[str, Any]],
    **kwargs,
) -> str:
    """Produce the YAML for a SQL ingestion pipeline.

    Args:
        pipeline_name: Name for the pipeline
        source_connection: SQL database connection name
        target_connection: Target storage connection name
        tables: Table specs (see generate_sql_pipeline for the full format)
        **kwargs: Extra options (target_format, target_schema, layer, node_prefix)

    Returns:
        YAML string for the pipeline file

    Example:
        >>> tables = [
        ...     {"schema": "dbo", "table": "customers", "primary_key": ["id"]},
        ...     {"schema": "dbo", "table": "orders"}
        ... ]
        >>> yaml = pm.scaffold_sql_pipeline("ingest", "sqldb", "lake", tables)
    """
    # Deferred import keeps scaffold code out of the common execution path.
    from odibi.scaffold import generate_sql_pipeline

    yaml_text = generate_sql_pipeline(
        pipeline_name,
        source_connection,
        target_connection,
        tables,
        **kwargs,
    )
    return yaml_text

validate_yaml(yaml_content)

Validate pipeline YAML.

Parameters:

Name Type Description Default
yaml_content str

YAML string to validate

required

Returns:

Type Description
Dict[str, Any]

Validation result with errors/warnings

Example

result = pm.validate_yaml(yaml_string) if not result["valid"]: ... for error in result["errors"]: ... print(f"{error['field_path']}: {error['message']}")

Source code in odibi/pipeline.py
def validate_yaml(self, yaml_content: str) -> Dict[str, Any]:
    """Validate pipeline YAML.

    Args:
        yaml_content: YAML string to validate

    Returns:
        Validation result with errors/warnings

    Example:
        >>> result = pm.validate_yaml(yaml_string)
        >>> if not result["valid"]:
        ...     for error in result["errors"]:
        ...         print(f"{error['field_path']}: {error['message']}")
    """
    # Alias the lazy import to avoid shadowing this method's own name locally.
    from odibi.validate import validate_yaml as _validate_yaml

    return _validate_yaml(yaml_content)

PipelineResults dataclass

Results from pipeline execution.

Source code in odibi/pipeline.py
@dataclass
class PipelineResults:
    """Results from pipeline execution."""

    # Field order and defaults are part of the positional-init contract.
    pipeline_name: str
    completed: List[str] = field(default_factory=list)
    failed: List[str] = field(default_factory=list)
    skipped: List[str] = field(default_factory=list)
    node_results: Dict[str, NodeResult] = field(default_factory=dict)
    duration: float = 0.0
    start_time: Optional[str] = None
    end_time: Optional[str] = None
    story_path: Optional[str] = None

    def get_node_result(self, name: str) -> Optional[NodeResult]:
        """Get result for specific node.

        Args:
            name: Node name

        Returns:
            NodeResult if available, None otherwise
        """
        try:
            return self.node_results[name]
        except KeyError:
            return None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary.

        Returns:
            Dictionary representation
        """
        return dict(
            pipeline_name=self.pipeline_name,
            completed=self.completed,
            failed=self.failed,
            skipped=self.skipped,
            duration=self.duration,
            start_time=self.start_time,
            end_time=self.end_time,
            node_count=len(self.node_results),
        )

    def debug_summary(self) -> str:
        """Generate a debug summary with next steps for failed pipelines.

        Returns:
            Formatted string with failure details and suggested next steps.
            Returns empty string if pipeline succeeded.
        """
        if not self.failed:
            return ""

        bar = "=" * 60
        out = [f"\n{bar}", f"❌ Pipeline '{self.pipeline_name}' failed", bar]

        # One bullet per failed node, with a truncated error when recorded.
        out.append("\nFailed nodes:")
        for failed_name in self.failed:
            node_res = self.node_results.get(failed_name)
            if node_res and node_res.error:
                out.append(f"  • {failed_name}: {str(node_res.error)[:200]}")
            else:
                out.append(f"  • {failed_name}")

        # Suggest story-based debugging when a story was captured.
        out.append("\n📖 Next Steps:")
        if self.story_path:
            first_failed = self.failed[0] if self.failed else "<node_name>"
            out.extend(
                [
                    "  1. View the execution story:",
                    f"     odibi story show {self.story_path}",
                    "",
                    "  2. Inspect a specific failed node:",
                    f"     odibi story last --node {first_failed}",
                ]
            )
        else:
            out.append("  1. Check the logs for error details")

        out.extend(["", "  3. If this is an environment issue:", "     odibi doctor", ""])

        return "\n".join(out)

debug_summary()

Generate a debug summary with next steps for failed pipelines.

Returns:

Type Description
str

Formatted string with failure details and suggested next steps.

str

Returns empty string if pipeline succeeded.

Source code in odibi/pipeline.py
def debug_summary(self) -> str:
    """Generate a debug summary with next steps for failed pipelines.

    Returns:
        Formatted string with failure details and suggested next steps.
        Returns empty string if pipeline succeeded.
    """
    # Nothing to report when every node succeeded.
    if not self.failed:
        return ""

    rule = "=" * 60
    parts = [f"\n{rule}", f"❌ Pipeline '{self.pipeline_name}' failed", rule]

    # Each failed node gets a bullet; include a truncated error if one exists.
    parts.append("\nFailed nodes:")
    for failed_node in self.failed:
        entry = self.node_results.get(failed_node)
        if entry and entry.error:
            parts.append(f"  • {failed_node}: {str(entry.error)[:200]}")
        else:
            parts.append(f"  • {failed_node}")

    # Point at the execution story when one was captured, else the logs.
    parts.append("\n📖 Next Steps:")
    if self.story_path:
        target = self.failed[0] if self.failed else "<node_name>"
        parts.append("  1. View the execution story:")
        parts.append(f"     odibi story show {self.story_path}")
        parts.append("")
        parts.append("  2. Inspect a specific failed node:")
        parts.append(f"     odibi story last --node {target}")
    else:
        parts.append("  1. Check the logs for error details")

    parts.extend(["", "  3. If this is an environment issue:", "     odibi doctor", ""])

    return "\n".join(parts)

get_node_result(name)

Get result for specific node.

Parameters:

Name Type Description Default
name str

Node name

required

Returns:

Type Description
Optional[NodeResult]

NodeResult if available, None otherwise

Source code in odibi/pipeline.py
def get_node_result(self, name: str) -> Optional[NodeResult]:
    """Get result for specific node.

    Args:
        name: Node name

    Returns:
        NodeResult if available, None otherwise
    """
    # Missing nodes map to None rather than raising.
    try:
        return self.node_results[name]
    except KeyError:
        return None

to_dict()

Convert to dictionary.

Returns:

Type Description
Dict[str, Any]

Dictionary representation

Source code in odibi/pipeline.py
def to_dict(self) -> Dict[str, Any]:
    """Convert to dictionary.

    Returns:
        Dictionary representation
    """
    # node_count summarizes node_results instead of embedding the full map.
    return dict(
        pipeline_name=self.pipeline_name,
        completed=self.completed,
        failed=self.failed,
        skipped=self.skipped,
        duration=self.duration,
        start_time=self.start_time,
        end_time=self.end_time,
        node_count=len(self.node_results),
    )

create_context(engine, spark_session=None)

Factory function to create appropriate context.

Parameters:

Name Type Description Default
engine str

Engine type ('pandas', 'spark', or 'polars')

required
spark_session Optional[Any]

SparkSession (required if engine='spark')

None

Returns:

Type Description
Context

Context instance for the specified engine

Raises:

Type Description
ValueError

If engine is invalid or SparkSession missing for Spark

Source code in odibi/context.py
def create_context(engine: str, spark_session: Optional[Any] = None) -> Context:
    """Factory function to create appropriate context.

    Args:
        engine: Engine type ('pandas', 'spark', or 'polars')
        spark_session: SparkSession (required if engine='spark')

    Returns:
        Context instance for the specified engine

    Raises:
        ValueError: If engine is invalid or SparkSession missing for Spark
    """
    if engine == "pandas":
        return PandasContext()
    elif engine == "spark":
        # Spark is the only engine that needs an externally-supplied session.
        if spark_session is None:
            raise ValueError("SparkSession required for Spark engine")
        return SparkContext(spark_session)
    elif engine == "polars":
        return PolarsContext()
    else:
        # Fix: the error previously omitted 'polars' even though the branch
        # above accepts it, steering users away from a supported engine.
        raise ValueError(
            f"Unsupported engine: {engine}. Use 'pandas', 'spark', or 'polars'"
        )