Commit
Merge branch 'datahub-project:master' into master
anshbansal authored Sep 25, 2024
2 parents 02c0a34 + cf1d296 commit 9322be8
Showing 20 changed files with 1,953 additions and 79 deletions.
4 changes: 2 additions & 2 deletions build.gradle
@@ -63,7 +63,7 @@ buildscript {
buildscript.repositories.addAll(project.repositories)
dependencies {
classpath 'com.linkedin.pegasus:gradle-plugins:' + pegasusVersion
- classpath 'com.github.node-gradle:gradle-node-plugin:7.0.1'
+ classpath 'com.github.node-gradle:gradle-node-plugin:7.0.2'
classpath 'io.acryl.gradle.plugin:gradle-avro-plugin:0.2.0'
classpath 'org.springframework.boot:spring-boot-gradle-plugin:' + springBootVersion
classpath "io.codearte.gradle.nexus:gradle-nexus-staging-plugin:0.30.0"
@@ -267,7 +267,7 @@ project.ext.externalDependency = [
'testContainersOpenSearch': 'org.opensearch:opensearch-testcontainers:2.0.0',
'typesafeConfig':'com.typesafe:config:1.4.1',
'wiremock':'com.github.tomakehurst:wiremock:2.10.0',
- 'zookeeper': 'org.apache.zookeeper:zookeeper:3.6.2',
+ 'zookeeper': 'org.apache.zookeeper:zookeeper:3.8.4',
'wire': 'com.squareup.wire:wire-compiler:3.7.1',
'charle': 'com.charleskorn.kaml:kaml:0.53.0',
'common': 'commons-io:commons-io:2.7',
@@ -1,6 +1,7 @@
package com.linkedin.datahub.graphql.resolvers.structuredproperties;

import static com.linkedin.datahub.graphql.resolvers.ResolverUtils.bindArgument;
+ import static com.linkedin.metadata.Constants.SCHEMA_FIELD_ENTITY_NAME;
import static com.linkedin.metadata.Constants.STRUCTURED_PROPERTIES_ASPECT_NAME;

import com.datahub.authentication.Authentication;
@@ -74,7 +75,9 @@ public CompletableFuture<com.linkedin.datahub.graphql.generated.StructuredProper
final AuditStamp auditStamp =
AuditStampUtils.createAuditStamp(authentication.getActor().toUrnStr());

- if (!_entityClient.exists(context.getOperationContext(), assetUrn)) {
+ // schemaField entities often don't exist, create it if upserting on a schema field
+ if (!assetUrn.getEntityType().equals(SCHEMA_FIELD_ENTITY_NAME)
+ && !_entityClient.exists(context.getOperationContext(), assetUrn)) {
throw new RuntimeException(
String.format("Asset with provided urn %s does not exist", assetUrn));
}
4 changes: 2 additions & 2 deletions datahub-web-react/build.gradle
@@ -19,7 +19,7 @@ node {
version = '21.2.0'

// Version of Yarn to use.
- yarnVersion = '1.22.21'
+ yarnVersion = '1.22.22'

// Base URL for fetching node distributions (set nodeDistBaseUrl if you have a mirror).
if (project.hasProperty('nodeDistBaseUrl')) {
@@ -43,7 +43,7 @@ node {
Wrappers around Yarn Tasks.
*/
task yarnInstall(type: YarnTask) {
- args = ['install']
+ args = ['install', '--network-timeout', '300000']

// The node_modules directory can contain built artifacts, so
// it's not really safe to cache it.
6 changes: 3 additions & 3 deletions docs-website/build.gradle
@@ -17,7 +17,7 @@ node {
version = '21.2.0'

// Version of Yarn to use.
- yarnVersion = '1.22.1'
+ yarnVersion = '1.22.22'

// Base URL for fetching node distributions (set nodeDistBaseUrl if you have a mirror).
if (project.hasProperty('nodeDistBaseUrl')) {
@@ -64,9 +64,9 @@ task generateJsonSchema(type: Exec, dependsOn: [':metadata-ingestion:docGen']) {
task yarnInstall(type: YarnTask) {
logger.info('CI = "{}"', System.env.CI)
if (System.env.CI != null && System.env.CI == "true") {
- args = ['install','--frozen-lockfile']
+ args = ['install', '--frozen-lockfile', '--network-timeout', '300000']
} else {
- args = ['install']
+ args = ['install', '--network-timeout', '300000']
}

// The node_modules directory can contain built artifacts, so
8 changes: 7 additions & 1 deletion docs-website/docusaurus.config.js
@@ -65,6 +65,12 @@ module.exports = {
// isCloseable: false,
// },
// }),
+ colorMode: {
+ // Only support light mode.
+ defaultMode: 'light',
+ disableSwitch: true,
+ respectPrefersColorScheme: false,
+ },
navbar: {
title: null,
logo: {
@@ -245,7 +251,7 @@
},
{
label: "Adoption",
to: "docs/#adoption",
href: "/adoption-stories",
},
],
},
@@ -1,3 +1,9 @@
+ .subtitle {
+ font-size: 1.7rem;
+ font-weight: 500;
+ margin: 1.5rem 0;
+ }

.section {
margin: 2rem 3rem 3rem 3rem;
}
@@ -6,7 +6,7 @@ import { CodeTwoTone, HeartTwoTone, SoundTwoTone } from "@ant-design/icons";
const ChampionQualityCardsSection = () => {
return (
<div>
- <h2>Our Champions...</h2>
+ <div class={clsx(styles.subtitle)}>Our Champions...</div>
<div class={clsx("row section", styles.section)}>
<div class={clsx("card col col-4", styles.card)}>
<div class="card-body">
2 changes: 1 addition & 1 deletion docs-website/src/pages/champions/index.js
@@ -372,7 +372,7 @@ function Champion() {
<div className="hero__content">
<div>
<HeroImage />
- <h1>DataHub Champions</h1>
+ <div className="hero__title">DataHub Champions</div>
<p className="hero__subtitle">
Recognizing community members who have made exceptional contributions to further the collective success of DataHub.
</p>
40 changes: 20 additions & 20 deletions docs-website/src/styles/global.scss
@@ -262,26 +262,26 @@ div[class^="announcementBar"] {

/* Hero */

- // .hero {
- // padding: 5vh 0;

- // .hero__subtitle {
- // font-size: 1.25em;
- // max-width: 800px;

- // img {
- // vertical-align: middle;
- // margin-top: -0.3em;
- // }
- // }

- // .hero__content {
- // text-align: center;
- // padding: 2rem 0;
- // height: 100%;
- // display: flex;
- // }
- // }
+ .hero {
+ padding: 5vh 0;

+ .hero__subtitle {
+ font-size: 1.25em;
+ max-width: 800px;
+ display: inline-block;

+ img {
+ vertical-align: middle;
+ margin-top: -0.3em;
+ }
+ }

+ .hero__content {
+ text-align: center;
+ padding: 2rem 0;
+ height: 100%;
+ }
+ }

/* Sidebar Menu */

2 changes: 1 addition & 1 deletion docs/automations/docs-propagation.md
@@ -47,7 +47,7 @@ Notice that the user must have the `Manage Ingestion` permission to view and ena
<img width="20%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/automation/oss/features-settings-link.png"/>
</p>

- 3**Enable Documentation Propagation**: Locate the 'Documentation Propagation' section and toggle the feature to enable it for column-level and asset-level propagation.
+ 3. **Enable Documentation Propagation**: Locate the 'Documentation Propagation' section and toggle the feature to enable it for column-level and asset-level propagation.
Currently, Column Level propagation is supported, with asset level propagation coming soon.

<p align="left">
@@ -12,7 +12,7 @@
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
- from datahub.utilities.lossy_collections import LossyDict, LossyList
+ from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
from datahub.utilities.perf_timer import PerfTimer
from datahub.utilities.stats_collections import TopKDict, int_top_k_dict

@@ -69,6 +69,9 @@ class BigQueryQueriesExtractorReport(Report):
num_total_queries: int = 0
num_unique_queries: int = 0

+ num_discovered_tables: Optional[int] = None
+ inferred_temp_tables: LossySet[str] = field(default_factory=LossySet)


@dataclass
class BigQueryV2Report(
@@ -173,6 +173,9 @@ def __init__(
format_queries=False,
)
self.report.sql_aggregator = self.aggregator.report
+ self.report.num_discovered_tables = (
+ len(self.discovered_tables) if self.discovered_tables else None
+ )

@functools.cached_property
def local_temp_path(self) -> pathlib.Path:
@@ -201,6 +204,7 @@ def is_temp_table(self, name: str) -> bool:
and self.discovered_tables
and str(BigQueryTableRef(table)) not in self.discovered_tables
):
+ self.report.inferred_temp_tables.add(name)
return True

except Exception:
@@ -264,6 +268,8 @@ def get_workunits_internal(
for query in query_instances.values():
if i > 0 and i % 10000 == 0:
logger.info(f"Added {i} query log entries to SQL aggregator")
+ if self.report.sql_aggregator:
+ logger.info(self.report.sql_aggregator.as_string())

self.aggregator.add(query)
i += 1
@@ -415,15 +415,7 @@ def meta_mapping_validator(
if v["operation"] == "add_owner":
owner_category = v["config"].get("owner_category")
if owner_category:
- allowed_categories = [
- value
- for name, value in vars(OwnershipTypeClass).items()
- if not name.startswith("_")
- ]
- if (owner_category.upper()) not in allowed_categories:
- raise ValueError(
- f"Owner category {owner_category} is not one of {allowed_categories}"
- )
+ mce_builder.validate_ownership_type(owner_category)
return meta_mapping

@validator("include_column_lineage")
@@ -380,15 +380,16 @@ def _get_input_fields_from_query(
)
)

- # A query uses fields for filtering and those fields are defined in views, find the views those fields use
+ # A query uses fields for filtering, and those fields are defined in views, find the views those fields use
filters: MutableMapping[str, Any] = (
query.filters if query.filters is not None else {}
)
for field in filters.keys():
if field is None:
continue

- # we haven't loaded in metadata about the explore yet, so we need to wait until explores are populated later to fetch this
+ # we haven't loaded in metadata about the explore yet, so we need to wait until explores are populated
+ # later to fetch this
result.append(
InputFieldElement(
name=field, view_field=None, model=query.model, explore=query.view
Expand Down Expand Up @@ -1486,13 +1487,27 @@ def extract_independent_looks(self) -> Iterable[MetadataWorkUnit]:
)
for look in all_looks:
if look.id in self.reachable_look_registry:
- # This look is reachable from Dashboard
+ # This look is reachable from the Dashboard
continue

if look.query_id is None:
logger.info(f"query_id is None for look {look.title}({look.id})")
continue

+ if self.source_config.skip_personal_folders:
+ if look.folder is not None and (
+ look.folder.is_personal or look.folder.is_personal_descendant
+ ):
+ self.reporter.info(
+ title="Dropped Look",
+ message="Dropped due to being a personal folder",
+ context=f"Look ID: {look.id}",
+ )

+ assert look.id, "Looker id is null"
+ self.reporter.report_charts_dropped(look.id)
+ continue

if look.id is not None:
query: Optional[Query] = self.looker_api.get_look(
look.id, fields=["query"]
@@ -1510,11 +1525,12 @@ def extract_independent_looks(self) -> Iterable[MetadataWorkUnit]:
LookerDashboardElement
] = self._get_looker_dashboard_element(
DashboardElement(
id=f"looks_{look.id}", # to avoid conflict with non-standalone looks (element.id prefixes), we add the "looks_" prefix to look.id.
id=f"looks_{look.id}", # to avoid conflict with non-standalone looks (element.id prefixes),
# we add the "looks_" prefix to look.id.
title=look.title,
subtitle_text=look.description,
look_id=look.id,
- dashboard_id=None, # As this is independent look
+ dashboard_id=None, # As this is an independent look
look=LookWithQuery(
query=query, folder=look.folder, user_id=look.user_id
),
@@ -39,10 +39,12 @@
ColumnRef,
DownstreamColumnRef,
SqlParsingResult,
+ _sqlglot_lineage_cached,
infer_output_schema,
sqlglot_lineage,
)
from datahub.sql_parsing.sqlglot_utils import (
+ _parse_statement,
generate_hash,
get_query_fingerprint,
try_format_query,
@@ -222,6 +224,9 @@ class SqlAggregatorReport(Report):
sql_parsing_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
sql_fingerprinting_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
sql_formatting_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+ sql_parsing_cache_stats: Optional[dict] = dataclasses.field(default=None)
+ parse_statement_cache_stats: Optional[dict] = dataclasses.field(default=None)
+ format_query_cache_stats: Optional[dict] = dataclasses.field(default=None)

# Other lineage loading metrics.
num_known_query_lineage: int = 0
@@ -239,6 +244,7 @@ class SqlAggregatorReport(Report):
queries_with_non_authoritative_session: LossyList[QueryId] = dataclasses.field(
default_factory=LossyList
)
+ make_schema_resolver_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)

# Lineage-related.
schema_resolver_count: Optional[int] = None
@@ -272,6 +278,10 @@ def compute_stats(self) -> None:
self.num_temp_sessions = len(self._aggregator._temp_lineage_map)
self.num_inferred_temp_schemas = len(self._aggregator._inferred_temp_schemas)

+ self.sql_parsing_cache_stats = _sqlglot_lineage_cached.cache_info()._asdict()
+ self.parse_statement_cache_stats = _parse_statement.cache_info()._asdict()
+ self.format_query_cache_stats = try_format_query.cache_info()._asdict()

return super().compute_stats()


@@ -679,11 +689,12 @@ def add_observed_query(
# All queries with no session ID are assumed to be part of the same session.
session_id = observed.session_id or _MISSING_SESSION_ID

- # Load in the temp tables for this session.
- schema_resolver: SchemaResolverInterface = (
- self._make_schema_resolver_for_session(session_id)
- )
- session_has_temp_tables = schema_resolver.includes_temp_tables()
+ with self.report.make_schema_resolver_timer:
+ # Load in the temp tables for this session.
+ schema_resolver: SchemaResolverInterface = (
+ self._make_schema_resolver_for_session(session_id)
+ )
+ session_has_temp_tables = schema_resolver.includes_temp_tables()

# Run the SQL parser.
parsed = self._run_sql_parser(
