Skip to content

Commit c7ea01d

Browse files
ashb authored and jedcunningham committed
Better validation of Dataset URI during dag parse (#26389)
Previously we had the validation on the Dataset model, but we have since moved the "dag"-facing class to a separate one. This adds validation to the public class, and extends the validation to reject whitespace-only strings. (cherry picked from commit bd181da)
1 parent 3871f00 commit c7ea01d

File tree

3 files changed

+62
-47
lines changed

3 files changed

+62
-47
lines changed

airflow/datasets/__init__.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from __future__ import annotations
1818

1919
from typing import Any
20+
from urllib.parse import urlparse
2021

2122
import attr
2223

@@ -25,5 +26,17 @@
2526
class Dataset:
    """A Dataset is used for marking data dependencies between workflows.

    ``uri`` identifies the dataset. It must be between 1 and 3000 characters
    long, must not consist solely of whitespace, must be pure ASCII, and must
    not use the reserved ``airflow`` scheme. ``extra`` is an optional dict with
    string keys; it defaults to ``None``.
    """

    # Length bounds are enforced here; the remaining rules live in ``_check_uri``.
    uri: str = attr.field(validator=[attr.validators.min_len(1), attr.validators.max_len(3000)])
    extra: dict[str, Any] | None = None

    @uri.validator
    def _check_uri(self, attr, uri: str):
        """Reject URIs that are whitespace-only, non-ASCII, or use the ``airflow`` scheme.

        :param attr: the attribute being validated; only its ``name`` is read, to
            build error messages. Inside this method it shadows the ``attr`` module.
        :param uri: the candidate URI string.
        :raises ValueError: if any of the checks fails.
        """
        if uri.isspace():
            raise ValueError(f'{attr.name} cannot be just whitespace')
        # ``str.isascii`` keeps the raise outside of an ``except`` block, so the
        # error is not reported with an unrelated UnicodeEncodeError as context.
        if not uri.isascii():
            raise ValueError(f'{attr.name!r} must be ascii')
        # The comparison is case-insensitive, so e.g. ``AIRFLOW:`` is rejected too.
        if urlparse(uri).scheme.lower() == 'airflow':
            raise ValueError(f'{attr.name!r} scheme `airflow` is reserved')

tests/datasets/test_dataset.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
from __future__ import annotations
19+
20+
import pytest
21+
22+
from airflow.datasets import Dataset
23+
from airflow.operators.empty import EmptyOperator
24+
25+
26+
@pytest.mark.parametrize(
    ["uri"],
    [
        pytest.param("", id="empty"),
        pytest.param("\n\t", id="whitespace"),
        pytest.param("a" * 3001, id="too_long"),
        # Kept well under the 3000-character limit so that only the scheme
        # check (not the length check) can reject this value.
        pytest.param("airflow://xcom/dag/task", id="reserved_scheme"),
        # A single character, so only the ASCII check can reject this value.
        pytest.param("😊", id="non-ascii"),
    ],
)
def test_invalid_uris(uri):
    """Constructing a Dataset with an invalid URI raises ValueError.

    Each case violates exactly one validation rule, so the test fails if the
    corresponding check is ever removed.
    """
    with pytest.raises(ValueError):
        Dataset(uri=uri)
39+
40+
41+
def test_uri_with_scheme():
    """A URI with an explicit scheme builds a Dataset that can be used as a task outlet."""
    EmptyOperator(task_id="task1", outlets=[Dataset(uri="s3://example_dataset")])
44+
45+
46+
def test_uri_without_scheme():
    """A bare name with no scheme builds a Dataset that can be used as a task outlet."""
    EmptyOperator(task_id="task1", outlets=[Dataset(uri="example_dataset")])

tests/models/test_dataset.py

Lines changed: 0 additions & 46 deletions
This file was deleted.

0 commit comments

Comments
 (0)