-
Notifications
You must be signed in to change notification settings - Fork 2k
Fixes #24348: Strip URL scheme from hostPort to prevent ValueError #27191
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 8 commits
83c69c3
4418065
31c851f
8bc7b40
0d4d233
d23d309
456502e
0bc267c
30c6ee4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -18,6 +18,7 @@ | |||||||||||||||||||||||||||||||||||
| import json | ||||||||||||||||||||||||||||||||||||
| import logging | ||||||||||||||||||||||||||||||||||||
| from typing import Any, Callable, Dict, Literal, Optional, Union | ||||||||||||||||||||||||||||||||||||
| from urllib.parse import urlparse | ||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||
| from pydantic import BaseModel as PydanticBaseModel | ||||||||||||||||||||||||||||||||||||
| from pydantic import WrapSerializer, model_validator | ||||||||||||||||||||||||||||||||||||
|
|
@@ -34,6 +35,55 @@ | |||||||||||||||||||||||||||||||||||
| JSON_ENCODERS = "json_encoders" | ||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||
| def _strip_hostport_scheme(raw: str) -> str: | ||||||||||||||||||||||||||||||||||||
| """ | ||||||||||||||||||||||||||||||||||||
| Strip an accidental URL scheme from a hostPort string. | ||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||
| Self-contained helper that depends only on the standard library so | ||||||||||||||||||||||||||||||||||||
| ``model_post_init`` never drags heavy ``metadata.*`` imports into the | ||||||||||||||||||||||||||||||||||||
| bootstrap path of every generated Connection class. | ||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||
| Raises ValueError if the scheme carries a non-numeric port so the user | ||||||||||||||||||||||||||||||||||||
| gets a clear error instead of a silently broken hostPort. | ||||||||||||||||||||||||||||||||||||
| """ | ||||||||||||||||||||||||||||||||||||
| value = raw.strip() | ||||||||||||||||||||||||||||||||||||
| if "://" not in value: | ||||||||||||||||||||||||||||||||||||
| return value | ||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||
| parsed = urlparse(value) | ||||||||||||||||||||||||||||||||||||
| hostname = parsed.hostname or "" | ||||||||||||||||||||||||||||||||||||
| safe_label = ( | ||||||||||||||||||||||||||||||||||||
| f"{parsed.scheme}://{hostname}" | ||||||||||||||||||||||||||||||||||||
| if parsed.scheme and hostname | ||||||||||||||||||||||||||||||||||||
| else "URL with scheme" | ||||||||||||||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||||||||||||||
| logger.warning( | ||||||||||||||||||||||||||||||||||||
| "The hostPort '%s' contains a URL scheme. Expected format is " | ||||||||||||||||||||||||||||||||||||
| "'hostname[:port]' (e.g. 'localhost:3306'). Stripping the scheme prefix.", | ||||||||||||||||||||||||||||||||||||
| safe_label, | ||||||||||||||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||||||||||||||
| try: | ||||||||||||||||||||||||||||||||||||
| port = parsed.port | ||||||||||||||||||||||||||||||||||||
| except ValueError as exc: | ||||||||||||||||||||||||||||||||||||
| raise ValueError( | ||||||||||||||||||||||||||||||||||||
| f"Invalid hostPort '{safe_label}'. Expected format is " | ||||||||||||||||||||||||||||||||||||
| "'hostname[:port]' (e.g. 'localhost:3306')." | ||||||||||||||||||||||||||||||||||||
| ) from exc | ||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||
| if not hostname: | ||||||||||||||||||||||||||||||||||||
| # urlparse couldn't extract a hostname (e.g. 'jdbc:postgresql://host:5432/db') | ||||||||||||||||||||||||||||||||||||
| # Fall back to stripping scheme and any trailing path/query/fragment/userinfo. | ||||||||||||||||||||||||||||||||||||
| tail = value.rsplit("://", 1)[-1] | ||||||||||||||||||||||||||||||||||||
| for sep in ("/", "?", "#"): | ||||||||||||||||||||||||||||||||||||
| tail = tail.split(sep, 1)[0] | ||||||||||||||||||||||||||||||||||||
| if "@" in tail: | ||||||||||||||||||||||||||||||||||||
| tail = tail.rsplit("@", 1)[-1] | ||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||
| tail = tail.rsplit("@", 1)[-1] | |
| tail = tail.rsplit("@", 1)[-1] | |
| fallback_port = None | |
| if tail.startswith("["): | |
| closing_bracket = tail.find("]") | |
| if closing_bracket != -1 and len(tail) > closing_bracket + 1: | |
| if tail[closing_bracket + 1] == ":": | |
| fallback_port = tail[closing_bracket + 2 :] | |
| elif ":" in tail: | |
| _, fallback_port = tail.rsplit(":", 1) | |
| if fallback_port and not fallback_port.isdigit(): | |
| raise ValueError( | |
| f"Invalid hostPort '{safe_label}'. Expected format is " | |
| "'hostname[:port]' (e.g. 'localhost:3306')." | |
| ) |
Copilot
AI
Apr 20, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The PR description includes an auto-generated “Summary by Gitar” referencing UI changes (e.g., fontSize in AutocompleteBase/ComboBox), but this PR’s diff only modifies ingestion framework Python files. Please update/remove that summary to avoid a mismatch between the PR description and the actual changes being reviewed.
Copilot
AI
Apr 14, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The broad try/except Exception around the whole model_post_init will also swallow failures from clean_host_port() (e.g. http://localhost:abc). In that case the model keeps the original invalid hostPort and downstream code can still crash with the same unhelpful parsing errors. Consider narrowing the exception handling to just the FilterPattern conversion, and let clean_host_port() errors propagate (or re-raise with a connection/hostPort-specific message). Also, the current warning text says it's parsing FilterPattern even when the failure is from hostPort sanitization.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,7 @@ | |
| """ | ||
| Helpers module for db sources | ||
| """ | ||
|
|
||
| import time | ||
| import traceback | ||
| from typing import Iterable, List, Union | ||
|
|
@@ -32,6 +33,7 @@ | |
| get_lineage_by_query, | ||
| get_lineage_via_table_entity, | ||
| ) | ||
| from metadata.ingestion.models.custom_pydantic import _strip_hostport_scheme | ||
|
||
| from metadata.ingestion.ometa.ometa_api import OpenMetadata | ||
| from metadata.ingestion.source.models import TableView | ||
| from metadata.utils import fqn | ||
|
|
@@ -43,12 +45,32 @@ | |
| PUBLIC_SCHEMA = "public" | ||
|
|
||
|
|
||
| def clean_host_port(host_port: str) -> str: | ||
| """ | ||
| Strip URL scheme prefixes from a hostPort string. | ||
|
|
||
| Users sometimes enter a full URL (e.g. 'http://localhost:3306') | ||
| instead of just 'localhost:3306'. This strips the scheme to avoid | ||
| ValueError when parsing host and port. | ||
|
|
||
| Delegates to the stdlib-only helper colocated with ``BaseModel`` so the | ||
| behaviour stays in lockstep with Pydantic's ``model_post_init`` hook. | ||
| """ | ||
| value = host_port.strip() | ||
| if "://" not in value: | ||
| return value.rstrip("/") | ||
| return _strip_hostport_scheme(value) | ||
|
|
||
|
|
||
| def get_host_from_host_port(uri: str) -> str: | ||
| """ | ||
| if uri is like "localhost:9000" | ||
| then return the host "localhost" | ||
| """ | ||
| return uri.split(":")[0] | ||
| cleaned = clean_host_port(uri) | ||
| if cleaned.startswith("["): | ||
| return cleaned.split("]")[0] + "]" | ||
| return cleaned.split(":")[0] | ||
|
|
||
|
|
||
| # pylint: disable=too-many-locals | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -12,6 +12,7 @@ | |||||||||
| """ | ||||||||||
| Unit tests for db_utils module | ||||||||||
| """ | ||||||||||
|
|
||||||||||
| import uuid | ||||||||||
| from copy import deepcopy | ||||||||||
| from unittest import TestCase | ||||||||||
|
|
@@ -36,7 +37,11 @@ | |||||||||
| from metadata.ingestion.lineage.models import Dialect | ||||||||||
| from metadata.ingestion.lineage.sql_lineage import search_cache | ||||||||||
| from metadata.ingestion.source.models import TableView | ||||||||||
| from metadata.utils.db_utils import get_host_from_host_port, get_view_lineage | ||||||||||
| from metadata.utils.db_utils import ( | ||||||||||
| clean_host_port, | ||||||||||
| get_host_from_host_port, | ||||||||||
| get_view_lineage, | ||||||||||
| ) | ||||||||||
|
|
||||||||||
|
|
||||||||||
| # Mock LineageTable class to simulate collate_sqllineage.core.models.Table | ||||||||||
|
|
@@ -118,6 +123,84 @@ def test_get_host_from_host_port(self): | |||||||||
| self.assertEqual(get_host_from_host_port("localhost"), "localhost") | ||||||||||
| self.assertEqual(get_host_from_host_port("example.com"), "example.com") | ||||||||||
|
|
||||||||||
| # Test with URL scheme prefixes | ||||||||||
| self.assertEqual(get_host_from_host_port("http://localhost:3306"), "localhost") | ||||||||||
| self.assertEqual( | ||||||||||
| get_host_from_host_port("https://example.com:5432"), "example.com" | ||||||||||
| ) | ||||||||||
| self.assertEqual(get_host_from_host_port("http://localhost"), "localhost") | ||||||||||
|
|
||||||||||
| # Test with IPv6 addresses | ||||||||||
| self.assertEqual(get_host_from_host_port("http://[::1]:3306"), "[::1]") | ||||||||||
| self.assertEqual(get_host_from_host_port("[::1]:3306"), "[::1]") | ||||||||||
|
|
||||||||||
| def test_clean_host_port(self): | ||||||||||
| """Test clean_host_port strips URL scheme prefixes""" | ||||||||||
| # Already-clean values pass through unchanged | ||||||||||
| self.assertEqual(clean_host_port("localhost:3306"), "localhost:3306") | ||||||||||
| self.assertEqual(clean_host_port("127.0.0.1:5432"), "127.0.0.1:5432") | ||||||||||
| self.assertEqual(clean_host_port("example.com"), "example.com") | ||||||||||
|
|
||||||||||
| # HTTP prefix is stripped | ||||||||||
| self.assertEqual(clean_host_port("http://localhost:3306"), "localhost:3306") | ||||||||||
| self.assertEqual(clean_host_port("http://example.com:8080"), "example.com:8080") | ||||||||||
|
||||||||||
| self.assertEqual(clean_host_port("http://example.com:8080"), "example.com:8080") | |
| self.assertEqual( | |
| clean_host_port("http://example.com:8080"), "example.com:8080" | |
| ) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The
_strip_hostport_schemedocstring says it “Raises ValueError if the scheme carries a non-numeric port”, but the JDBC-stylenot hostnamefallback returns the stripped tail without port validation. Please either validate the port in that branch as well, or clarify the docstring to match the behavior.