-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[sqlserver] Refactor sqlserver database metrics (#18883)
* migrate dynamic metrics to database metrics * add changelog * update changelog * restore to original database after use * rename to restore_current_database_context * migrate tempdb space usage metrics * migrate index usage metrics * update changelog * migrate database fragmentation metrics * remove deprecated metrics * migrate os tasks metrics * migrate master files metrics * migrate database files metrics * calculate total page size using extra transformers * migrate database stats and backup metrics * migrate os schedulers metrics * migrate ao metrics * fix tests * increase database backup metrics collection interval * fix repr * increase index fragmentation metrics collection interval to 5 mins * config to skip tempdb for index fragmentation metrics * fix tests * fix changelog * fix lint
- Loading branch information
1 parent
87acd78
commit b84e4d8
Showing
24 changed files
with
2,326 additions
and
1,285 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
Migrate the following dynamic metrics to database_metrics for better maintainability and testability. | ||
- SQLServer AlwaysOn metrics | ||
- SQLServer FCI metrics | ||
- SQLServer file stats metrics | ||
- SQLServer primary log shipping metrics | ||
- SQLServer secondary log shipping metrics | ||
- SQLServer server state metrics | ||
- SQLServer tempdb file space usage metrics | ||
- SQLServer index usage metrics | ||
- SQLServer database index fragmentation metrics | ||
- SQLServer os tasks metrics | ||
- SQLServer master files metrics | ||
- SQLServer database files metrics | ||
- SQLServer database stats metrics | ||
- SQLServer database backup metrics | ||
- SQLServer os schedulers metrics | ||
- SQLServer database replication stats metrics | ||
- SQLServer availability replicas metrics | ||
- SQLServer availability groups metrics | ||
Increase database backup metrics and index fragmentation metrics collection interval to 5 minutes. |
17 changes: 16 additions & 1 deletion
17
sqlserver/datadog_checks/sqlserver/database_metrics/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,23 @@ | ||
# (C) Datadog, Inc. 2024-present | ||
# All rights reserved | ||
# Licensed under a 3-clause BSD style license (see LICENSE) | ||
from .ao_metrics import SqlserverAoMetrics | ||
from .availability_groups_metrics import SqlserverAvailabilityGroupsMetrics | ||
from .availability_replicas_metrics import SqlserverAvailabilityReplicasMetrics | ||
from .database_agent_metrics import SqlserverAgentMetrics | ||
from .database_backup_metrics import SqlserverDatabaseBackupMetrics | ||
from .database_files_metrics import SqlserverDatabaseFilesMetrics | ||
from .database_replication_stats_metrics import SqlserverDatabaseReplicationStatsMetrics | ||
from .database_stats_metrics import SqlserverDatabaseStatsMetrics | ||
from .db_fragmentation_metrics import SqlserverDBFragmentationMetrics | ||
from .fci_metrics import SqlserverFciMetrics | ||
from .file_stats_metrics import SqlserverFileStatsMetrics | ||
from .index_usage_metrics import SqlserverIndexUsageMetrics | ||
from .database_agent_metrics import SqlserverAgentMetrics | ||
from .master_files_metrics import SqlserverMasterFilesMetrics | ||
from .os_schedulers_metrics import SqlserverOsSchedulersMetrics | ||
from .os_tasks_metrics import SqlserverOsTasksMetrics | ||
from .primary_log_shipping_metrics import SqlserverPrimaryLogShippingMetrics | ||
from .secondary_log_shipping_metrics import SqlserverSecondaryLogShippingMetrics | ||
from .server_state_metrics import SqlserverServerStateMetrics | ||
from .tempdb_file_space_usage_metrics import SqlserverTempDBFileSpaceUsageMetrics | ||
from .xe_session_metrics import SQLServerXESessionMetrics |
221 changes: 221 additions & 0 deletions
221
sqlserver/datadog_checks/sqlserver/database_metrics/ao_metrics.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,221 @@ | ||
# (C) Datadog, Inc. 2024-present | ||
# All rights reserved | ||
# Licensed under a 3-clause BSD style license (see LICENSE) | ||
|
||
from typing import List | ||
|
||
from datadog_checks.base.config import is_affirmative | ||
from datadog_checks.sqlserver.utils import is_azure_database | ||
|
||
from .base import SqlserverDatabaseMetricsBase | ||
|
||
# Query config for cluster-level quorum information read from sys.dm_hadr_cluster.
# The "columns" entries are listed in the same order as the SELECT list: three
# tag columns followed by two constant `1` columns reported as gauges.
QUERY_AO_FAILOVER_CLUSTER = {
    "name": "sys.dm_hadr_cluster",
    "query": """
        SELECT
            LOWER(quorum_type_desc) AS quorum_type_desc,
            LOWER(quorum_state_desc) AS quorum_state_desc,
            cluster_name,
            1,
            1
        FROM sys.dm_hadr_cluster
    """.strip(),
    "columns": [
        {"name": column_name, "type": column_type}
        for column_name, column_type in (
            ("quorum_type", "tag"),
            ("quorum_state", "tag"),
            ("failover_cluster", "tag"),
            ("ao.quorum_type", "gauge"),
            ("ao.quorum_state", "gauge"),
        )
    ],
}
|
||
# Query config for per-member state read from sys.dm_hadr_cluster_members.
# sys.dm_hadr_cluster has no column to join on, so the CROSS JOIN against a
# single-row subquery (TOP 1) attaches cluster_name to every member row.
# The "columns" entries are listed in the same order as the SELECT list.
QUERY_AO_FAILOVER_CLUSTER_MEMBER = {
    "name": "sys.dm_hadr_cluster_members",
    "query": """
        SELECT
            member_name,
            LOWER(member_type_desc) AS member_type_desc,
            LOWER(member_state_desc) AS member_state_desc,
            FC.cluster_name,
            1,
            1,
            number_of_quorum_votes
        FROM sys.dm_hadr_cluster_members
        CROSS JOIN (SELECT TOP 1 cluster_name FROM sys.dm_hadr_cluster) AS FC
    """.strip(),
    "columns": [
        {"name": column_name, "type": column_type}
        for column_name, column_type in (
            ("member_name", "tag"),
            ("member_type", "tag"),
            ("member_state", "tag"),
            ("failover_cluster", "tag"),
            ("ao.member.type", "gauge"),
            ("ao.member.state", "gauge"),
            ("ao.member.number_of_quorum_votes", "gauge"),
        )
    ],
}
|
||
|
||
class SqlserverAoMetrics(SqlserverDatabaseMetricsBase):
    """Database-metrics definition for AlwaysOn (AO) availability groups.

    Exposes three query configs: a version-dependent query over
    sys.availability_groups and its related views, plus the two static
    failover-cluster queries defined at module level.
    """

    @property
    def include_ao_metrics(self) -> bool:
        """Whether ``include_ao_metrics`` is affirmative in the instance config (default: off)."""
        return is_affirmative(self.instance_config.get('include_ao_metrics', False))

    @property
    def enabled(self) -> bool:
        """Whether AO metrics should be collected.

        True when ``include_ao_metrics`` is set and either the engine edition
        is an Azure database (per ``is_azure_database``) or the major version
        is newer than 2012.
        """
        if not self.include_ao_metrics:
            return False
        # Check the engine edition first: the major version may be unknown
        # (None) on Azure, and comparing None with an int raises TypeError.
        if is_azure_database(self.engine_edition):
            return True
        return bool(self.major_version) and self.major_version > 2012

    @property
    def queries(self) -> List[dict]:
        """Return the query configs to run, in execution order."""
        return [
            self.__get_query_ao_availability_groups(),
            QUERY_AO_FAILOVER_CLUSTER,
            QUERY_AO_FAILOVER_CLUSTER_MEMBER,
        ]

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}("
            f"enabled={self.enabled}, "
            f"major_version={self.major_version}, "
            f"engine_edition={self.engine_edition}, "
            f"include_ao_metrics={self.include_ao_metrics})"
        )

    def __get_query_ao_availability_groups(self) -> dict:
        """
        Construct the sys.availability_groups QueryExecutor configuration.

        The selected columns depend on ``self.major_version`` (e.g. 2012, 2019):
        ``secondary_lag_seconds`` is added from 2016 and ``is_primary_replica``
        (plus the derived ``replica_role`` tag) from 2014.

        :return: a QueryExecutor query config object
        """
        # `enabled` does not require a major version on Azure, so it may be
        # unknown here. Treat it as 0 so that only the baseline columns are
        # selected instead of raising TypeError on the comparisons below.
        major_version = self.major_version or 0

        column_definitions_tags = {
            # AG - sys.availability_groups
            "AG.group_id AS availability_group": {
                "name": "availability_group",
                "type": "tag",
            },
            "AG.name AS availability_group_name": {
                "name": "availability_group_name",
                "type": "tag",
            },
            # AR - sys.availability_replicas
            "AR.replica_server_name": {"name": "replica_server_name", "type": "tag"},
            "LOWER(AR.failover_mode_desc) AS failover_mode_desc": {
                "name": "failover_mode",
                "type": "tag",
            },
            "LOWER(AR.availability_mode_desc) AS availability_mode_desc": {
                "name": "availability_mode",
                "type": "tag",
            },
            # ADC - sys.availability_databases_cluster
            "ADC.database_name": {"name": "database_name", "type": "tag"},
            # DRS - sys.dm_hadr_database_replica_states
            "DRS.replica_id": {"name": "replica_id", "type": "tag"},
            "DRS.database_id": {"name": "database_id", "type": "tag"},
            "LOWER(DRS.database_state_desc) AS database_state_desc": {
                "name": "database_state",
                "type": "tag",
            },
            "LOWER(DRS.synchronization_state_desc) AS synchronization_state_desc": {
                "name": "synchronization_state",
                "type": "tag",
            },
            # FC - sys.dm_hadr_cluster
            "FC.cluster_name": {
                "name": "failover_cluster",
                "type": "tag",
            },
        }
        column_definitions_metrics = {
            "(DRS.log_send_queue_size * 1024) AS log_send_queue_size": {
                "name": "ao.log_send_queue_size",
                "type": "gauge",
            },
            "(DRS.log_send_rate * 1024) AS log_send_rate": {
                "name": "ao.log_send_rate",
                "type": "gauge",
            },
            "(DRS.redo_queue_size * 1024) AS redo_queue_size": {
                "name": "ao.redo_queue_size",
                "type": "gauge",
            },
            "(DRS.redo_rate * 1024) AS redo_rate": {
                "name": "ao.redo_rate",
                "type": "gauge",
            },
            "DRS.low_water_mark_for_ghosts": {
                "name": "ao.low_water_mark_for_ghosts",
                "type": "gauge",
            },
            "(DRS.filestream_send_rate * 1024) AS filestream_send_rate": {
                "name": "ao.filestream_send_rate",
                "type": "gauge",
            },
            # Other
            "1 AS replica_sync_topology_indicator": {
                "name": "ao.replica_status",
                "type": "gauge",
            },
        }

        # Include metrics based on version
        if major_version >= 2016:
            column_definitions_metrics["DRS.secondary_lag_seconds"] = {
                "name": "ao.secondary_lag_seconds",
                "type": "gauge",
            }
        if major_version >= 2014:
            column_definitions_metrics["DRS.is_primary_replica"] = {
                "name": "ao.is_primary_replica",
                "type": "gauge",
            }
            column_definitions_tags[
                """
                CASE
                    WHEN DRS.is_primary_replica = 1 THEN 'primary'
                    WHEN DRS.is_primary_replica = 0 THEN 'secondary'
                END AS replica_role_desc
                """
            ] = {"name": "replica_role", "type": "tag"}

        # Sort columns to ensure a static column order: all tags first, then
        # all metrics, each group ordered by its SQL expression. The SQL
        # select list and the column definitions are built in lockstep.
        sql_columns = []
        metric_columns = []
        for definitions in (column_definitions_tags, column_definitions_metrics):
            for column in sorted(definitions):
                sql_columns.append(column)
                metric_columns.append(definitions[column])

        return {
            "name": "sys.availability_groups",
            "query": """
            SELECT
                {sql_columns}
            FROM
                sys.availability_groups AS AG
                INNER JOIN sys.availability_replicas AS AR ON AG.group_id = AR.group_id
                INNER JOIN sys.availability_databases_cluster AS ADC ON AG.group_id = ADC.group_id
                INNER JOIN sys.dm_hadr_database_replica_states AS DRS ON AG.group_id = DRS.group_id
                    AND ADC.group_database_id = DRS.group_database_id
                    AND AR.replica_id = DRS.replica_id
                -- `sys.dm_hadr_cluster` does not have a related column to join on, this cross join will add the
                -- `cluster_name` column to every row by multiplying all the rows in the left table against
                -- all the rows in the right table. Note, there will only be one row from `sys.dm_hadr_cluster`.
                CROSS JOIN (SELECT TOP 1 cluster_name FROM sys.dm_hadr_cluster) AS FC
        """.strip().format(
                sql_columns=", ".join(sql_columns),
            ),
            "columns": metric_columns,
        }
66 changes: 66 additions & 0 deletions
66
sqlserver/datadog_checks/sqlserver/database_metrics/availability_groups_metrics.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# (C) Datadog, Inc. 2024-present | ||
# All rights reserved | ||
# Licensed under a 3-clause BSD style license (see LICENSE) | ||
|
||
from datadog_checks.base.config import is_affirmative | ||
|
||
from .base import SqlserverDatabaseMetricsBase | ||
|
||
# Query config joining sys.dm_hadr_availability_group_states to
# sys.availability_groups on group_id. The "columns" entries are listed in the
# same order as the SELECT list: three tags followed by three health gauges.
AVAILABILITY_GROUPS_METRICS_QUERY = {
    "name": "sys.dm_hadr_availability_group_states",
    "query": """SELECT
        resource_group_id,
        name,
        synchronization_health_desc,
        synchronization_health,
        primary_recovery_health,
        secondary_recovery_health
        from sys.dm_hadr_availability_group_states as dhdrcs
        inner join sys.availability_groups as ag
            on ag.group_id = dhdrcs.group_id
    """.strip(),
    "columns": [
        {"name": column_name, "type": column_type}
        for column_name, column_type in (
            ("availability_group", "tag"),
            ("availability_group_name", "tag"),
            ("synchronization_health_desc", "tag"),
            ("ao.ag_sync_health", "gauge"),
            ("ao.primary_replica_health", "gauge"),
            ("ao.secondary_replica_health", "gauge"),
        )
    ],
}
|
||
|
||
class SqlserverAvailabilityGroupsMetrics(SqlserverDatabaseMetricsBase):
    """Database-metrics definition for availability group health states.

    Runs a single query over sys.dm_hadr_availability_group_states, optionally
    restricted to the availability group named in the instance config.
    """

    # sys.dm_hadr_availability_group_states
    # Returns a row for each Always On availability group that possesses an availability replica on the local instance
    # of SQL Server. Each row displays the states that define the health of a given availability group.
    #
    # https://docs.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-hadr-availability-group-states-transact-sql?view=sql-server-ver15
    @property
    def include_ao_metrics(self) -> bool:
        """Whether ``include_ao_metrics`` is affirmative in the instance config (default: off)."""
        return is_affirmative(self.instance_config.get('include_ao_metrics', False))

    @property
    def availability_group(self):
        """The ``availability_group`` value from the instance config, or None when unset."""
        return self.instance_config.get('availability_group')

    @property
    def enabled(self) -> bool:
        """Collection is enabled exactly when ``include_ao_metrics`` is set."""
        return self.include_ao_metrics

    @property
    def queries(self) -> list:
        """Return the query config, filtered by ``availability_group`` when one is configured."""
        # Shallow copy so the appended WHERE clause never mutates the shared
        # module-level query definition.
        query = AVAILABILITY_GROUPS_METRICS_QUERY.copy()
        availability_group = self.availability_group
        if availability_group:
            # The value is embedded in a T-SQL string literal; double any
            # single quotes so a stray quote cannot terminate the literal.
            escaped_group = str(availability_group).replace("'", "''")
            query['query'] += f" where resource_group_id = '{escaped_group}'"
        return [query]

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}("
            f"enabled={self.enabled}, "
            f"include_ao_metrics={self.include_ao_metrics}, "
            f"availability_group={self.availability_group})"
        )
Oops, something went wrong.