Commit
Merge branch 'datahub-project:master' into master
anshbansal authored Sep 25, 2024
2 parents 02c0a34 + cf1d296 commit 9322be8
Showing 20 changed files with 1,953 additions and 79 deletions.
4 changes: 2 additions & 2 deletions build.gradle
@@ -63,7 +63,7 @@ buildscript {
buildscript.repositories.addAll(project.repositories)
dependencies {
classpath 'com.linkedin.pegasus:gradle-plugins:' + pegasusVersion
- classpath 'com.github.node-gradle:gradle-node-plugin:7.0.1'
+ classpath 'com.github.node-gradle:gradle-node-plugin:7.0.2'
classpath 'io.acryl.gradle.plugin:gradle-avro-plugin:0.2.0'
classpath 'org.springframework.boot:spring-boot-gradle-plugin:' + springBootVersion
classpath "io.codearte.gradle.nexus:gradle-nexus-staging-plugin:0.30.0"
@@ -267,7 +267,7 @@ project.ext.externalDependency = [
'testContainersOpenSearch': 'org.opensearch:opensearch-testcontainers:2.0.0',
'typesafeConfig':'com.typesafe:config:1.4.1',
'wiremock':'com.github.tomakehurst:wiremock:2.10.0',
- 'zookeeper': 'org.apache.zookeeper:zookeeper:3.6.2',
+ 'zookeeper': 'org.apache.zookeeper:zookeeper:3.8.4',
'wire': 'com.squareup.wire:wire-compiler:3.7.1',
'charle': 'com.charleskorn.kaml:kaml:0.53.0',
'common': 'commons-io:commons-io:2.7',
@@ -1,6 +1,7 @@
package com.linkedin.datahub.graphql.resolvers.structuredproperties;

import static com.linkedin.datahub.graphql.resolvers.ResolverUtils.bindArgument;
+ import static com.linkedin.metadata.Constants.SCHEMA_FIELD_ENTITY_NAME;
import static com.linkedin.metadata.Constants.STRUCTURED_PROPERTIES_ASPECT_NAME;

import com.datahub.authentication.Authentication;
@@ -74,7 +75,9 @@ public CompletableFuture<com.linkedin.datahub.graphql.generated.StructuredProper
final AuditStamp auditStamp =
AuditStampUtils.createAuditStamp(authentication.getActor().toUrnStr());

- if (!_entityClient.exists(context.getOperationContext(), assetUrn)) {
+ // schemaField entities often don't exist, create it if upserting on a schema field
+ if (!assetUrn.getEntityType().equals(SCHEMA_FIELD_ENTITY_NAME)
+ && !_entityClient.exists(context.getOperationContext(), assetUrn)) {
throw new RuntimeException(
String.format("Asset with provided urn %s does not exist", assetUrn));
}
4 changes: 2 additions & 2 deletions datahub-web-react/build.gradle
@@ -19,7 +19,7 @@ node {
version = '21.2.0'

// Version of Yarn to use.
- yarnVersion = '1.22.21'
+ yarnVersion = '1.22.22'

// Base URL for fetching node distributions (set nodeDistBaseUrl if you have a mirror).
if (project.hasProperty('nodeDistBaseUrl')) {
@@ -43,7 +43,7 @@ node {
Wrappers around Yarn Tasks.
*/
task yarnInstall(type: YarnTask) {
- args = ['install']
+ args = ['install', '--network-timeout', '300000']

// The node_modules directory can contain built artifacts, so
// it's not really safe to cache it.
6 changes: 3 additions & 3 deletions docs-website/build.gradle
@@ -17,7 +17,7 @@ node {
version = '21.2.0'

// Version of Yarn to use.
- yarnVersion = '1.22.1'
+ yarnVersion = '1.22.22'

// Base URL for fetching node distributions (set nodeDistBaseUrl if you have a mirror).
if (project.hasProperty('nodeDistBaseUrl')) {
@@ -64,9 +64,9 @@ task generateJsonSchema(type: Exec, dependsOn: [':metadata-ingestion:docGen']) {
task yarnInstall(type: YarnTask) {
logger.info('CI = "{}"', System.env.CI)
if (System.env.CI != null && System.env.CI == "true") {
- args = ['install','--frozen-lockfile']
+ args = ['install', '--frozen-lockfile', '--network-timeout', '300000']
} else {
- args = ['install']
+ args = ['install', '--network-timeout', '300000']
}

// The node_modules directory can contain built artifacts, so
8 changes: 7 additions & 1 deletion docs-website/docusaurus.config.js
@@ -65,6 +65,12 @@ module.exports = {
// isCloseable: false,
// },
// }),
+ colorMode: {
+ // Only support light mode.
+ defaultMode: 'light',
+ disableSwitch: true,
+ respectPrefersColorScheme: false,
+ },
navbar: {
title: null,
logo: {
@@ -245,7 +251,7 @@
},
{
label: "Adoption",
to: "docs/#adoption",
href: "/adoption-stories",
},
],
},
@@ -1,3 +1,9 @@
+ .subtitle {
+ font-size: 1.7rem;
+ font-weight: 500;
+ margin: 1.5rem 0;
+ }

.section {
margin: 2rem 3rem 3rem 3rem;
}
@@ -6,7 +6,7 @@ import { CodeTwoTone, HeartTwoTone, SoundTwoTone } from "@ant-design/icons";
const ChampionQualityCardsSection = () => {
return (
<div>
- <h2>Our Champions...</h2>
+ <div class={clsx(styles.subtitle)}>Our Champions...</div>
<div class={clsx("row section", styles.section)}>
<div class={clsx("card col col-4", styles.card)}>
<div class="card-body">
2 changes: 1 addition & 1 deletion docs-website/src/pages/champions/index.js
@@ -372,7 +372,7 @@ function Champion() {
<div className="hero__content">
<div>
<HeroImage />
- <h1>DataHub Champions</h1>
+ <div className="hero__title">DataHub Champions</div>
<p className="hero__subtitle">
Recognizing community members who have made exceptional contributions to further the collective success of DataHub.
</p>
40 changes: 20 additions & 20 deletions docs-website/src/styles/global.scss
@@ -262,26 +262,26 @@ div[class^="announcementBar"] {

/* Hero */

- // .hero {
- // padding: 5vh 0;

- // .hero__subtitle {
- // font-size: 1.25em;
- // max-width: 800px;

- // img {
- // vertical-align: middle;
- // margin-top: -0.3em;
- // }
- // }

- // .hero__content {
- // text-align: center;
- // padding: 2rem 0;
- // height: 100%;
- // display: flex;
- // }
- // }
+ .hero {
+ padding: 5vh 0;

+ .hero__subtitle {
+ font-size: 1.25em;
+ max-width: 800px;
+ display: inline-block;

+ img {
+ vertical-align: middle;
+ margin-top: -0.3em;
+ }
+ }

+ .hero__content {
+ text-align: center;
+ padding: 2rem 0;
+ height: 100%;
+ }
+ }

/* Sidebar Menu */

2 changes: 1 addition & 1 deletion docs/automations/docs-propagation.md
@@ -47,7 +47,7 @@ Notice that the user must have the `Manage Ingestion` permission to view and ena
<img width="20%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/automation/oss/features-settings-link.png"/>
</p>

- 3**Enable Documentation Propagation**: Locate the 'Documentation Propagation' section and toggle the feature to enable it for column-level and asset-level propagation.
+ 3. **Enable Documentation Propagation**: Locate the 'Documentation Propagation' section and toggle the feature to enable it for column-level and asset-level propagation.
Currently, Column Level propagation is supported, with asset level propagation coming soon.

<p align="left">
@@ -12,7 +12,7 @@
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
- from datahub.utilities.lossy_collections import LossyDict, LossyList
+ from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
from datahub.utilities.perf_timer import PerfTimer
from datahub.utilities.stats_collections import TopKDict, int_top_k_dict

@@ -69,6 +69,9 @@ class BigQueryQueriesExtractorReport(Report):
num_total_queries: int = 0
num_unique_queries: int = 0

+ num_discovered_tables: Optional[int] = None
+ inferred_temp_tables: LossySet[str] = field(default_factory=LossySet)


@dataclass
class BigQueryV2Report(
@@ -173,6 +173,9 @@ def __init__(
format_queries=False,
)
self.report.sql_aggregator = self.aggregator.report
+ self.report.num_discovered_tables = (
+ len(self.discovered_tables) if self.discovered_tables else None
+ )

@functools.cached_property
def local_temp_path(self) -> pathlib.Path:
@@ -201,6 +204,7 @@ def is_temp_table(self, name: str) -> bool:
and self.discovered_tables
and str(BigQueryTableRef(table)) not in self.discovered_tables
):
+ self.report.inferred_temp_tables.add(name)
return True

except Exception:
@@ -264,6 +268,8 @@ def get_workunits_internal(
for query in query_instances.values():
if i > 0 and i % 10000 == 0:
logger.info(f"Added {i} query log entries to SQL aggregator")
+ if self.report.sql_aggregator:
+ logger.info(self.report.sql_aggregator.as_string())

self.aggregator.add(query)
i += 1
@@ -415,15 +415,7 @@ def meta_mapping_validator(
if v["operation"] == "add_owner":
owner_category = v["config"].get("owner_category")
if owner_category:
- allowed_categories = [
- value
- for name, value in vars(OwnershipTypeClass).items()
- if not name.startswith("_")
- ]
- if (owner_category.upper()) not in allowed_categories:
- raise ValueError(
- f"Owner category {owner_category} is not one of {allowed_categories}"
- )
+ mce_builder.validate_ownership_type(owner_category)
return meta_mapping

@validator("include_column_lineage")
@@ -380,15 +380,16 @@ def _get_input_fields_from_query(
)
)

- # A query uses fields for filtering and those fields are defined in views, find the views those fields use
+ # A query uses fields for filtering, and those fields are defined in views, find the views those fields use
filters: MutableMapping[str, Any] = (
query.filters if query.filters is not None else {}
)
for field in filters.keys():
if field is None:
continue

- # we haven't loaded in metadata about the explore yet, so we need to wait until explores are populated later to fetch this
+ # we haven't loaded in metadata about the explore yet, so we need to wait until explores are populated
+ # later to fetch this
result.append(
InputFieldElement(
name=field, view_field=None, model=query.model, explore=query.view
Expand Down Expand Up @@ -1486,13 +1487,27 @@ def extract_independent_looks(self) -> Iterable[MetadataWorkUnit]:
)
for look in all_looks:
if look.id in self.reachable_look_registry:
- # This look is reachable from Dashboard
+ # This look is reachable from the Dashboard
continue

if look.query_id is None:
logger.info(f"query_id is None for look {look.title}({look.id})")
continue

+ if self.source_config.skip_personal_folders:
+ if look.folder is not None and (
+ look.folder.is_personal or look.folder.is_personal_descendant
+ ):
+ self.reporter.info(
+ title="Dropped Look",
+ message="Dropped due to being a personal folder",
+ context=f"Look ID: {look.id}",
+ )

+ assert look.id, "Looker id is null"
+ self.reporter.report_charts_dropped(look.id)
+ continue

if look.id is not None:
query: Optional[Query] = self.looker_api.get_look(
look.id, fields=["query"]
@@ -1510,11 +1525,12 @@ def extract_independent_looks(self) -> Iterable[MetadataWorkUnit]:
LookerDashboardElement
] = self._get_looker_dashboard_element(
DashboardElement(
id=f"looks_{look.id}", # to avoid conflict with non-standalone looks (element.id prefixes), we add the "looks_" prefix to look.id.
id=f"looks_{look.id}", # to avoid conflict with non-standalone looks (element.id prefixes),
# we add the "looks_" prefix to look.id.
title=look.title,
subtitle_text=look.description,
look_id=look.id,
- dashboard_id=None, # As this is independent look
+ dashboard_id=None, # As this is an independent look
look=LookWithQuery(
query=query, folder=look.folder, user_id=look.user_id
),
@@ -39,10 +39,12 @@
ColumnRef,
DownstreamColumnRef,
SqlParsingResult,
+ _sqlglot_lineage_cached,
infer_output_schema,
sqlglot_lineage,
)
from datahub.sql_parsing.sqlglot_utils import (
+ _parse_statement,
generate_hash,
get_query_fingerprint,
try_format_query,
@@ -222,6 +224,9 @@ class SqlAggregatorReport(Report):
sql_parsing_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
sql_fingerprinting_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
sql_formatting_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+ sql_parsing_cache_stats: Optional[dict] = dataclasses.field(default=None)
+ parse_statement_cache_stats: Optional[dict] = dataclasses.field(default=None)
+ format_query_cache_stats: Optional[dict] = dataclasses.field(default=None)

# Other lineage loading metrics.
num_known_query_lineage: int = 0
@@ -239,6 +244,7 @@ class SqlAggregatorReport(Report):
queries_with_non_authoritative_session: LossyList[QueryId] = dataclasses.field(
default_factory=LossyList
)
+ make_schema_resolver_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)

# Lineage-related.
schema_resolver_count: Optional[int] = None
@@ -272,6 +278,10 @@ def compute_stats(self) -> None:
self.num_temp_sessions = len(self._aggregator._temp_lineage_map)
self.num_inferred_temp_schemas = len(self._aggregator._inferred_temp_schemas)

+ self.sql_parsing_cache_stats = _sqlglot_lineage_cached.cache_info()._asdict()
+ self.parse_statement_cache_stats = _parse_statement.cache_info()._asdict()
+ self.format_query_cache_stats = try_format_query.cache_info()._asdict()

return super().compute_stats()


@@ -679,11 +689,12 @@ def add_observed_query(
# All queries with no session ID are assumed to be part of the same session.
session_id = observed.session_id or _MISSING_SESSION_ID

- # Load in the temp tables for this session.
- schema_resolver: SchemaResolverInterface = (
- self._make_schema_resolver_for_session(session_id)
- )
- session_has_temp_tables = schema_resolver.includes_temp_tables()
+ with self.report.make_schema_resolver_timer:
+ # Load in the temp tables for this session.
+ schema_resolver: SchemaResolverInterface = (
+ self._make_schema_resolver_for_session(session_id)
+ )
+ session_has_temp_tables = schema_resolver.includes_temp_tables()

# Run the SQL parser.
parsed = self._run_sql_parser(
