Skip to content

Commit

Permalink
Move DAG bundle config into config, not db (apache#44924)
Browse files Browse the repository at this point in the history
This moves the DAG bundle config into the Airflow config, instead of
being in the db. This:

- makes it much easier to configure a fresh Airflow instance - no
  api/cli calls required
- avoids some security concerns by ensuring only deployment managers,
  with direct access to the instance, can configure these

The primary downside is this does mean you cannot reconfigure an
existing bundle in a running Airflow instance.
  • Loading branch information
jedcunningham authored Dec 17, 2024
1 parent 947c25d commit 4000905
Show file tree
Hide file tree
Showing 15 changed files with 2,072 additions and 1,765 deletions.
31 changes: 31 additions & 0 deletions airflow/config_templates/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2654,3 +2654,34 @@ usage_data_collection:
example: ~
default: "True"
see_also: ":ref:`Usage data collection FAQ <usage-data-collection>`"
dag_bundles:
description: |
Configuration for the DAG bundles. This allows Airflow to load DAGs from different sources.
Airflow will consume all options added to this section. Below you will see only the default,
``dags_folder``. The option name is the bundle name and the value is a JSON object with the following
keys:
* classpath: The classpath of the bundle class
* kwargs: The keyword arguments to pass to the bundle class
* refresh_interval: The interval in seconds to refresh the bundle from its source.
For example, to add a new bundle named ``hello`` to my Airflow instance, add the following to your
airflow.cfg (this is just an example, the classpath and kwargs are not real):
.. code-block:: ini
[dag_bundles]
hello: {"classpath": "airflow.some.classpath", "kwargs": {"hello": "world"}, "refresh_interval": 60}
options:
dags_folder:
description: |
This is the default DAG bundle that loads DAGs from the traditional ``[core] dags_folder``.
By default, ``refresh_interval`` is set to ``[scheduler] dag_dir_list_interval``, but that can be
overridden here if desired.
Parsing DAGs from the DAG folder can be disabled by setting this option to an empty string.
version_added: ~
type: string
example: ~
default: '{{"classpath": "airflow.dag_processing.bundles.dagfolder.DagsFolderDagBundle",
"kwargs": {{}}}}'
4 changes: 3 additions & 1 deletion airflow/dag_processing/bundles/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,16 @@ class BaseDagBundle(ABC):
multiple versions of the same bundle in use at the same time. The DAG processor will always use the latest version.
:param name: String identifier for the DAG bundle
:param refresh_interval: How often the bundle should be refreshed from the source (in seconds)
:param version: Version of the DAG bundle (Optional)
"""

supports_versioning: bool = False

def __init__(self, *, name: str, version: str | None = None) -> None:
def __init__(self, *, name: str, refresh_interval: int, version: str | None = None) -> None:
self.name = name
self.version = version
self.refresh_interval = refresh_interval

@property
def _dag_bundle_root_storage_path(self) -> Path:
Expand Down
12 changes: 10 additions & 2 deletions airflow/dag_processing/bundles/dagfolder.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,19 @@
from __future__ import annotations

from airflow import settings
from airflow.configuration import conf
from airflow.dag_processing.bundles.local import LocalDagBundle


class DagsFolderDagBundle(LocalDagBundle):
"""A bundle for the DAGs folder."""

def __init__(self, **kwargs):
super().__init__(local_folder=settings.DAGS_FOLDER, **kwargs)
def __init__(self, refresh_interval: int | None = None, **kwargs):
if refresh_interval is None:
refresh_interval = conf.getint("scheduler", "dag_dir_list_interval")

super().__init__(
local_folder=settings.DAGS_FOLDER,
refresh_interval=refresh_interval,
**kwargs,
)
96 changes: 96 additions & 0 deletions airflow/dag_processing/bundles/manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

from typing import TYPE_CHECKING

from airflow.configuration import conf
from airflow.exceptions import AirflowConfigException
from airflow.models.dagbundle import DagBundleModel
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.utils.module_loading import import_string
from airflow.utils.session import NEW_SESSION, provide_session

if TYPE_CHECKING:
from sqlalchemy.orm import Session

from airflow.dag_processing.bundles.base import BaseDagBundle


class DagBundlesManager(LoggingMixin):
    """
    Manager for DAG bundles.

    Bundles are read from the ``[dag_bundles]`` section of the Airflow config, where each
    option name is a bundle name and each value is a JSON object describing the bundle.
    """

    @property
    def bundle_configs(self) -> dict[str, dict]:
        """
        Get all DAG bundle configurations.

        The config section is re-read and re-parsed on every access, so callers that need
        the result more than once should keep it in a local variable.

        :return: Mapping of bundle name to its parsed JSON configuration.
        :raises AirflowConfigException: If a bundle's configuration is not a JSON object.
        """
        configured_bundles = conf.getsection("dag_bundles")

        if not configured_bundles:
            return {}

        # An empty ``dags_folder`` value disables the default bundle, so drop it. ``get``/``pop``
        # are used so that a section without a ``dags_folder`` option doesn't raise ``KeyError``.
        if not configured_bundles.get("dags_folder"):
            configured_bundles.pop("dags_folder", None)

        dict_bundles: dict[str, dict] = {}
        for key in configured_bundles:
            config = conf.getjson("dag_bundles", key)
            if not isinstance(config, dict):
                raise AirflowConfigException(f"Bundle config for {key} is not a dict: {config}")
            dict_bundles[key] = config

        return dict_bundles

    @provide_session
    def sync_bundles_to_db(self, *, session: Session = NEW_SESSION) -> None:
        """
        Reconcile the ``DagBundleModel`` rows with the bundles found in config.

        Configured bundles that already have a row are marked active, configured bundles
        without a row get one added, and rows whose bundle is no longer configured are
        marked inactive.

        :param session: A database session.
        """
        known_bundles = {b.name: b for b in session.query(DagBundleModel).all()}
        # Evaluate the property once: each access re-reads the section and re-parses every value.
        bundle_configs = self.bundle_configs

        for name in bundle_configs:
            if bundle := known_bundles.get(name):
                bundle.active = True
            else:
                session.add(DagBundleModel(name=name))
                self.log.info("Added new DAG bundle %s to the database", name)

        for name, bundle in known_bundles.items():
            if name not in bundle_configs:
                bundle.active = False
                self.log.warning("DAG bundle %s is no longer found in config and has been disabled", name)

    def get_all_dag_bundles(self) -> list[BaseDagBundle]:
        """
        Get all DAG bundles.

        :return: list of DAG bundles, one per configured bundle, each created with ``version=None``.
        """
        return [self.get_bundle(name, version=None) for name in self.bundle_configs]

    def get_bundle(self, name: str, version: str | None = None) -> BaseDagBundle:
        """
        Get a DAG bundle by name.

        :param name: The name of the DAG bundle.
        :param version: The version of the DAG bundle you need (optional). Passed to the bundle
            class unchanged; ``None`` means no specific version is requested.
        :return: The DAG bundle.
        :raises KeyError: If ``name`` is not a configured bundle, or its config has no ``classpath``.
        """
        # TODO: proper validation of the bundle configuration so we have better error messages
        bundle_config = self.bundle_configs[name]
        bundle_class = import_string(bundle_config["classpath"])

        # Copy so the parsed config is never mutated; a missing or null ``kwargs`` means no extra arguments.
        bundle_kwargs = dict(bundle_config.get("kwargs") or {})
        # The ``[dag_bundles]`` config description lists ``refresh_interval`` as a top-level key, so
        # forward it to the bundle class. A value given inside ``kwargs`` still takes precedence.
        if "refresh_interval" in bundle_config:
            bundle_kwargs.setdefault("refresh_interval", bundle_config["refresh_interval"])

        return bundle_class(name=name, version=version, **bundle_kwargs)
4 changes: 4 additions & 0 deletions airflow/dag_processing/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,10 @@ def start(self):
"Checking for new files in %s every %s seconds", self._dag_directory, self.dag_dir_list_interval
)

from airflow.dag_processing.bundles.manager import DagBundlesManager

DagBundlesManager().sync_bundles_to_db()

return self._run_parsing_loop()

def _scan_stale_dags(self):
Expand Down
25 changes: 10 additions & 15 deletions airflow/migrations/versions/0050_3_0_0_add_dagbundlemodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,8 @@

import sqlalchemy as sa
from alembic import op
from sqlalchemy_utils import UUIDType

from airflow.models.base import StringID
from airflow.utils.sqlalchemy import ExtendedJSON, UtcDateTime
from airflow.utils.sqlalchemy import UtcDateTime

revision = "e229247a6cb1"
down_revision = "eed27faa34e3"
Expand All @@ -43,27 +41,24 @@
def upgrade():
op.create_table(
"dag_bundle",
sa.Column("id", UUIDType(binary=False), nullable=False),
sa.Column("name", StringID(), nullable=False),
sa.Column("classpath", sa.String(length=1000), nullable=False),
sa.Column("kwargs", ExtendedJSON(), nullable=True),
sa.Column("refresh_interval", sa.Integer(), nullable=True),
sa.Column("name", sa.String(length=250), nullable=False),
sa.Column("active", sa.Boolean(), nullable=True),
sa.Column("latest_version", sa.String(length=200), nullable=True),
sa.Column("last_refreshed", UtcDateTime(timezone=True), nullable=True),
sa.PrimaryKeyConstraint("id", name=op.f("dag_bundle_pkey")),
sa.UniqueConstraint("name", name=op.f("dag_bundle_name_uq")),
sa.PrimaryKeyConstraint("name", name=op.f("dag_bundle_pkey")),
)
with op.batch_alter_table("dag", schema=None) as batch_op:
batch_op.add_column(sa.Column("bundle_id", UUIDType(binary=False), nullable=True))
batch_op.add_column(sa.Column("bundle_name", sa.String(length=250), nullable=True))
batch_op.add_column(sa.Column("latest_bundle_version", sa.String(length=200), nullable=True))
batch_op.create_foreign_key(batch_op.f("dag_bundle_id_fkey"), "dag_bundle", ["bundle_id"], ["id"])
batch_op.create_foreign_key(
batch_op.f("dag_bundle_name_fkey"), "dag_bundle", ["bundle_name"], ["name"]
)


def downgrade():
"""Unapply Add DagBundleModel."""
with op.batch_alter_table("dag", schema=None) as batch_op:
batch_op.drop_constraint(batch_op.f("dag_bundle_id_fkey"), type_="foreignkey")
batch_op.drop_constraint(batch_op.f("dag_bundle_name_fkey"), type_="foreignkey")
batch_op.drop_column("latest_bundle_version")
batch_op.drop_column("bundle_id")
batch_op.drop_column("bundle_name")

op.drop_table("dag_bundle")
3 changes: 1 addition & 2 deletions airflow/models/dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@
from sqlalchemy.ext.hybrid import hybrid_property
from sqlalchemy.orm import backref, relationship
from sqlalchemy.sql import Select, expression
from sqlalchemy_utils import UUIDType

from airflow import settings, utils
from airflow.configuration import conf as airflow_conf, secrets_backend_list
Expand Down Expand Up @@ -2028,7 +2027,7 @@ class DagModel(Base):
fileloc = Column(String(2000))
# The base directory used by Dag Processor that parsed this dag.
processor_subdir = Column(String(2000), nullable=True)
bundle_id = Column(UUIDType(binary=False), ForeignKey("dag_bundle.id"), nullable=True)
bundle_name = Column(StringID(), ForeignKey("dag_bundle.name"), nullable=True)
# The version of the bundle the last time the DAG was parsed
latest_bundle_version = Column(String(200), nullable=True)
# String representing the owners
Expand Down
58 changes: 14 additions & 44 deletions airflow/models/dagbundle.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,58 +16,28 @@
# under the License.
from __future__ import annotations

from typing import TYPE_CHECKING

import uuid6
from sqlalchemy import Column, Integer, String
from sqlalchemy_utils import UUIDType
from sqlalchemy import Boolean, Column, String

from airflow.models.base import Base, StringID
from airflow.utils.module_loading import import_string
from airflow.utils.session import NEW_SESSION, provide_session
from airflow.utils.sqlalchemy import ExtendedJSON, UtcDateTime

if TYPE_CHECKING:
from sqlalchemy.orm import Session

from airflow.dag_processing.bundles.base import BaseDagBundle
from airflow.utils.sqlalchemy import UtcDateTime


class DagBundleModel(Base):
"""A table for DAG Bundle config."""
"""
A table for storing DAG bundle metadata.
We track the following information about each bundle, as it can be useful for
informational purposes and for debugging:
- active: Is the bundle currently found in configuration?
- latest_version: The latest version Airflow has seen for the bundle.
- last_refreshed: When the bundle was last refreshed.
"""

__tablename__ = "dag_bundle"
id = Column(UUIDType(binary=False), primary_key=True, default=uuid6.uuid7)
name = Column(StringID(), nullable=False, unique=True)
classpath = Column(String(1000), nullable=False)
kwargs = Column(ExtendedJSON, nullable=True)
refresh_interval = Column(Integer, nullable=True)
name = Column(StringID(), primary_key=True)
active = Column(Boolean, default=True)
latest_version = Column(String(200), nullable=True)
last_refreshed = Column(UtcDateTime, nullable=True)

def __init__(self, *, name, classpath, kwargs, refresh_interval):
def __init__(self, *, name: str):
self.name = name
self.classpath = classpath
self.kwargs = kwargs
self.refresh_interval = refresh_interval

@classmethod
@provide_session
def get_all_dag_bundles(
cls, *, session: Session = NEW_SESSION
) -> list[tuple[DagBundleModel, BaseDagBundle]]:
"""
Get all DAG bundles.
:param session: A database session.
:return: list of DAG bundles.
"""
bundle_configs = session.query(cls).all()

bundles = []
for bundle_config in bundle_configs:
bundle_class = import_string(bundle_config.classpath)
bundle = bundle_class(name=bundle_config.name, **bundle_config.kwargs)
bundles.append((bundle_config, bundle))

return bundles
2 changes: 1 addition & 1 deletion docs/apache-airflow/img/airflow_erd.sha256
Original file line number Diff line number Diff line change
@@ -1 +1 @@
ccb8ef5583b2a6b3ee3ab4212139c112b92953675655010a6775fffb4945b206
ba10504bc54d15b2faca37ae9db172848a498e471bbf332e031715f728158ff8
Loading

0 comments on commit 4000905

Please sign in to comment.