|
9 | 9 | # See the License for the specific language governing permissions and |
10 | 10 | # limitations under the License. |
11 | 11 | """ |
12 | | -Bigquery Incremental Table processing logic |
| 12 | +Bigquery Incremental Table processing logic. |
| 13 | +
|
| 14 | +Uses Cloud Logging API (entries.list) to detect table changes since last run. |
| 15 | +Optimized around the hard 60 requests/min quota per project: |
| 16 | +- Batches datasets into groups using the indexed field resource.labels.dataset_id |
| 17 | +- Bounded timestamp window [start_date, end_date) for deterministic results |
| 18 | +- Retries with linear backoff on ResourceExhausted (429) |
| 19 | +
|
| 20 | +Memory-optimized: |
| 21 | +- Processes entries page-by-page, releasing each page before fetching the next |
| 22 | +- Stores only (table_name -> is_deleted) per schema, no Pydantic models or timestamps |
13 | 23 | """ |
14 | | -from datetime import datetime |
15 | | -from typing import List |
| 24 | +import time |
| 25 | +from datetime import datetime, timezone |
| 26 | +from typing import Dict, Iterable, List, Optional |
16 | 27 |
|
17 | 28 | import google.cloud.logging |
| 29 | +from google.api_core.exceptions import ResourceExhausted |
18 | 30 | from google.cloud.logging_v2.entries import LogEntry |
19 | 31 |
|
20 | 32 | from metadata.ingestion.source.database.bigquery.models import ( |
21 | | - BigQueryTable, |
22 | 33 | BigQueryTableMap, |
23 | 34 | SchemaName, |
24 | 35 | TableName, |
25 | 36 | ) |
26 | 37 | from metadata.ingestion.source.database.bigquery.queries import ( |
27 | 38 | BIGQUERY_GET_CHANGED_TABLES_FROM_CLOUD_LOGGING, |
28 | 39 | ) |
| 40 | +from metadata.utils.logger import ingestion_logger |
| 41 | + |
| 42 | +logger = ingestion_logger() |
| 43 | + |
# Tuning knobs for Cloud Logging entries.list calls (hard quota: 60 requests/min/project).
MAX_RETRIES = 3  # attempts per batch before flagging the query as failed
RETRY_BASE_WAIT = 60  # Cloud Logging quota resets per minute
PAGE_SIZE = 10000  # max entries requested per API page
DATASET_BATCH_SIZE = 50  # dataset IDs OR'ed together into a single filter/query
| 48 | + |
| 49 | + |
| 50 | +def _batch(items: List[str], batch_size: int) -> Iterable[List[str]]: |
| 51 | + """Yield successive batches from a list.""" |
| 52 | + for i in range(0, len(items), batch_size): |
| 53 | + yield items[i : i + batch_size] |
| 54 | + |
| 55 | + |
| 56 | +def _build_dataset_filter(datasets: List[str]) -> str: |
| 57 | + """Build a Cloud Logging filter clause for a batch of dataset IDs. |
| 58 | +
|
| 59 | + Uses the indexed field resource.labels.dataset_id for efficient |
| 60 | + server-side filtering. |
| 61 | + """ |
| 62 | + if len(datasets) == 1: |
| 63 | + return f'AND resource.labels.dataset_id = "{datasets[0]}"' |
| 64 | + or_clause = " OR ".join(f'resource.labels.dataset_id = "{ds}"' for ds in datasets) |
| 65 | + return f"AND ({or_clause})" |
29 | 66 |
|
30 | 67 |
|
class BigQueryIncrementalTableProcessor:
    """Detect BigQuery table changes since a given timestamp via Cloud Logging.

    Accumulates changed tables into a BigQueryTableMap keyed by schema
    (dataset) and table name, flagging deletions. Queries are batched per
    dataset group to stay within the Cloud Logging entries.list quota; on
    unrecoverable failure, ``query_failed`` is set so callers can fall back
    to full extraction.
    """

    def __init__(self, client: google.cloud.logging.Client):
        # Cloud Logging client used for entries.list calls.
        self._client = client
        # schema -> {table -> is_deleted}; deduplication happens in .update()
        self._changed_tables_map = BigQueryTableMap()
        # Set True on quota exhaustion / unexpected errors; see query_failed.
        self._query_failed = False

    @classmethod
    def from_project(cls, project: str) -> "BigQueryIncrementalTableProcessor":
        """Alternate constructor: build a Cloud Logging client for *project*."""
        client = google.cloud.logging.Client(project=project)
        return cls(client)

    @staticmethod
    def _is_table_deleted(entry: LogEntry) -> bool:
        """Return True if the audit-log entry records a table deletion.

        The "tableDeletion" key is present in the entry's metadata only for
        delete events; a missing/None metadata dict counts as not deleted.
        """
        metadata = entry.payload.get("metadata") or {}
        return "tableDeletion" in metadata

    def _process_entry(self, entry: LogEntry):
        """Extract dataset/table from a single Cloud Logging entry.

        Silently skips entries whose payload is not a dict or whose
        resourceName does not have the expected
        ``projects/<p>/datasets/<d>/tables/<t>`` shape.
        """
        payload = entry.payload
        if not isinstance(payload, dict):
            logger.debug("Skipping non-dict Cloud Logging entry payload: %s", payload)
            return
        resource_name = payload.get("resourceName", "")
        parts = resource_name.split("/")
        # parts[3] = dataset id, parts[5] = table id in the expected shape.
        if len(parts) < 6:
            return

        self._changed_tables_map.update(
            schema_name=parts[3],
            table_name=parts[5],
            deleted=self._is_table_deleted(entry),
        )

    def _fetch_batch(
        self,
        project: str,
        start_date: datetime,
        end_date: datetime,
        dataset_filter: str,
    ):
        """Fetch Cloud Logging entries for a batch of datasets with retry logic.

        Iterates entries one-by-one from the Cloud Logging generator and
        feeds each to _process_entry. On ResourceExhausted (429), retries
        up to MAX_RETRIES times with linear backoff. On retry,
        already-processed entries are deduplicated by BigQueryTableMap.update().
        Any other exception marks the query as failed and stops this batch.
        """
        resource_names = [f"projects/{project}"]
        filters = BIGQUERY_GET_CHANGED_TABLES_FROM_CLOUD_LOGGING.format(
            project=project,
            start_date=start_date.strftime("%Y-%m-%dT%H:%M:%SZ"),
            end_date=end_date.strftime("%Y-%m-%dT%H:%M:%SZ"),
            dataset_filter=dataset_filter,
        )

        for attempt in range(MAX_RETRIES):
            try:
                entries = self._client.list_entries(
                    resource_names=resource_names,
                    filter_=filters,
                    order_by=google.cloud.logging.DESCENDING,
                    page_size=PAGE_SIZE,
                )
                total = 0
                for entry in entries:
                    total += 1
                    self._process_entry(entry)
                    # Progress log once per page worth of entries (was a
                    # hardcoded 10000 duplicating the PAGE_SIZE constant).
                    if total % PAGE_SIZE == 0:
                        logger.info("Processed %d Cloud Logging entries so far", total)
                if total > 0:
                    logger.info("Finished processing %d Cloud Logging entries", total)
                return
            except ResourceExhausted:
                if attempt < MAX_RETRIES - 1:
                    # Linear backoff: quota is per-minute, so wait in
                    # multiples of RETRY_BASE_WAIT.
                    wait = RETRY_BASE_WAIT * (attempt + 1)
                    logger.warning(
                        "Cloud Logging quota exceeded, retrying in %ds "
                        "(attempt %d/%d)",
                        wait,
                        attempt + 1,
                        MAX_RETRIES,
                    )
                    time.sleep(wait)
                else:
                    logger.error(
                        "Cloud Logging quota exceeded after %d retries. "
                        "Falling back to full extraction.",
                        MAX_RETRIES,
                    )
                    self._query_failed = True
            except Exception as exc:
                # Best-effort: flag failure so the caller can fall back,
                # rather than aborting the whole ingestion run.
                logger.error("Failed to query Cloud Logging: %s", exc)
                self._query_failed = True
                return

    def set_tables_map(
        self,
        project: str,
        start_date: datetime,
        datasets: Optional[List[str]] = None,
    ):
        """Fetch changed tables from Cloud Logging, batching datasets for efficiency.

        Batches datasets into groups of DATASET_BATCH_SIZE and queries each batch
        separately. This keeps the indexed field resource.labels.dataset_id in the
        filter while reducing total API calls from N to ceil(N / DATASET_BATCH_SIZE).

        Uses a bounded timestamp window [start_date, end_date) to ensure
        deterministic results and prevent data gaps between runs.

        Args:
            project: GCP project ID
            start_date: Only fetch changes after this timestamp
            datasets: List of dataset IDs to query. If None, queries all datasets
                in the project (no dataset_id filter). An empty list is a no-op.
        """
        end_date = datetime.now(timezone.utc)
        num_datasets = len(datasets) if datasets else 0
        # Ceiling division; report a single batch for the "all datasets" case.
        num_batches = (
            (num_datasets + DATASET_BATCH_SIZE - 1) // DATASET_BATCH_SIZE
            if num_datasets
            else 1
        )

        logger.info(
            "Querying Cloud Logging for project '%s': %d datasets in %d batch(es), "
            "window [%s, %s)",
            project,
            num_datasets,
            num_batches,
            start_date.strftime("%Y-%m-%dT%H:%M:%SZ"),
            end_date.strftime("%Y-%m-%dT%H:%M:%SZ"),
        )

        if datasets is None:
            logger.debug("No dataset filter — querying all datasets in project")
            self._fetch_batch(project, start_date, end_date, dataset_filter="")
        elif datasets:
            for batch_idx, dataset_batch in enumerate(
                _batch(datasets, DATASET_BATCH_SIZE), start=1
            ):
                # Stop early once any batch has failed; partial results are
                # unreliable and the caller will fall back to full extraction.
                if self._query_failed:
                    logger.warning(
                        "Skipping remaining %d batch(es) due to prior failure",
                        num_batches - batch_idx + 1,
                    )
                    return
                logger.debug(
                    "Fetching batch %d/%d (%d datasets)",
                    batch_idx,
                    num_batches,
                    len(dataset_batch),
                )
                dataset_filter = _build_dataset_filter(dataset_batch)
                self._fetch_batch(project, start_date, end_date, dataset_filter)
        else:
            logger.info(
                "No datasets to query after filtering for project '%s'", project
            )

    def get_deleted(self, schema_name: SchemaName) -> List[TableName]:
        """Return tables recorded as deleted for *schema_name*."""
        return self._changed_tables_map.get_deleted(schema_name)

    def get_not_deleted(self, schema_name: SchemaName) -> List[TableName]:
        """Return changed-but-existing tables for *schema_name*."""
        return self._changed_tables_map.get_not_deleted(schema_name)

    def get_all_deleted(self) -> Dict[SchemaName, List[TableName]]:
        """Return all deleted tables grouped by schema."""
        return self._changed_tables_map.get_all_deleted()

    @property
    def query_failed(self) -> bool:
        """True if any Cloud Logging query failed; caller should do a full run."""
        return self._query_failed
0 commit comments