From 9e2c17d44972f493cefdac027ceeaf791b4e7f35 Mon Sep 17 00:00:00 2001 From: 0fatal <72899968+0fatal@users.noreply.github.com> Date: Tue, 22 Aug 2023 15:26:00 +0800 Subject: [PATCH] feat(server): support resource monitor (#1468) * feat(server): support resource monitor * test(server): add monitor test * fix(server): get app namespace by GetApplicationNamespaceByAppId * chore: change prometheus conf * chore(build): add prometheus * fix: build * chore --- .vscode/settings.json | 3 +- .../laf-server/templates/deployment.yaml | 2 + build/charts/minio/values.yaml | 6 +- build/prometheus-helm.yaml | 25 +++ build/start.sh | 27 ++- e2e/.gitignore | 1 + e2e/.prettierrc | 5 + e2e/2-monitor/00-query.test.ts | 104 ++++++++++ e2e/config.ts | 20 +- e2e/package-lock.json | 12 ++ e2e/package.json | 1 + lerna.json | 3 +- server/src/app.module.ts | 2 + server/src/constants.ts | 8 + server/src/initializer/initializer.service.ts | 3 + server/src/monitor/dto/query-metrics.dto.ts | 22 ++ server/src/monitor/monitor.controller.ts | 33 +++ server/src/monitor/monitor.module.ts | 13 ++ server/src/monitor/monitor.service.ts | 190 ++++++++++++++++++ server/src/region/entities/region.ts | 5 + 20 files changed, 475 insertions(+), 10 deletions(-) create mode 100644 build/prometheus-helm.yaml create mode 100644 e2e/.gitignore create mode 100644 e2e/.prettierrc create mode 100644 e2e/2-monitor/00-query.test.ts create mode 100644 server/src/monitor/dto/query-metrics.dto.ts create mode 100644 server/src/monitor/monitor.controller.ts create mode 100644 server/src/monitor/monitor.module.ts create mode 100644 server/src/monitor/monitor.service.ts diff --git a/.vscode/settings.json b/.vscode/settings.json index 7a03c82b31..59c76625fa 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -134,5 +134,6 @@ "i18n-ally.keystyle": "nested", "i18n-ally.keysInUse": [ "description.part2_whatever" - ] + ], + "jest.rootPath": "e2e", } diff --git a/build/charts/laf-server/templates/deployment.yaml b/build/charts/laf-server/templates/deployment.yaml index 23e57414a7..0aec7e2f33 100644 --- a/build/charts/laf-server/templates/deployment.yaml +++ b/build/charts/laf-server/templates/deployment.yaml @@ -89,6 +89,8 @@ spec: value: {{ .Values.default_region.log_server_secret }} - name: DEFAULT_REGION_LOG_SERVER_DATABASE_URL value: {{ .Values.default_region.log_server_database_url }} + - name: DEFAULT_REGION_PROMETHEUS_URL + value: {{ .Values.default_region.prometheus_url }} - name: SITE_NAME value: {{ .Values.siteName | quote}} {{- with .Values.nodeSelector }} diff --git a/build/charts/minio/values.yaml b/build/charts/minio/values.yaml index 0a9b608a04..db0ed7f5a8 100644 --- a/build/charts/minio/values.yaml +++ b/build/charts/minio/values.yaml @@ -510,7 +510,7 @@ metrics: serviceMonitor: enabled: false # scrape each node/pod individually for additional metrics - includeNode: false + includeNode: false public: true additionalLabels: {} # for node metrics @@ -521,8 +521,8 @@ metrics: # - regex: (server|pod) # action: labeldrop # namespace: monitoring - # interval: 30s - # scrapeTimeout: 10s + interval: 30s + scrapeTimeout: 10s ## ETCD settings: https://github.com/minio/minio/blob/master/docs/sts/etcd.md ## Define endpoints to enable this section. diff --git a/build/prometheus-helm.yaml b/build/prometheus-helm.yaml new file mode 100644 index 0000000000..6ce92b9b3c --- /dev/null +++ b/build/prometheus-helm.yaml @@ -0,0 +1,25 @@ +fullnameOverride: prometheus +alertmanager: + enabled: false +grafana: + enabled: false +coreDns: + enabled: false +nodeExporter: + enabled: false +kubeApiServer: + enabled: false +kubeScheduler: + enabled: false +kubeControllerManager: + enabled: false +kubeEtcd: + enabled: false +kubeProxy: + enabled: false +kubeStateMetrics: + enabled: false +prometheus: + networkPolicy: + enabled: true + ingress: [] \ No newline at end of file diff --git a/build/start.sh b/build/start.sh index bd5efb43c6..67ff5a45bb 100644 --- a/build/start.sh +++ b/build/start.sh @@ -18,6 +18,8 @@ ENABLE_APISIX_HOST_NETWORK=${ENABLE_APISIX_HOST_NETWORK:-true} NAMESPACE=${NAMESPACE:-laf-system} PASSWD_OR_SECRET=$(tr -cd 'a-z0-9' prometheus-helm-with-values.yaml + + helm install prometheus -n ${NAMESPACE} \ + -f ./prometheus-helm-with-values.yaml \ + prometheus-community/kube-prometheus-stack + + helm install prometheus-mongodb-exporter -n ${NAMESPACE} \ + --set mongodb.uri=${DATABASE_URL} \ + --set serviceMonitor.enabled=true \ + --set serviceMonitor.additionalLabels.release=prometheus \ + --set serviceMonitor.additionalLabels.namespace=${NAMESPACE} \ + prometheus-community/prometheus-mongodb-exporter +fi -## 4. install laf-server +## 5. install laf-server SERVER_JWT_SECRET=$PASSWD_OR_SECRET LOG_SERVER_URL="http://log-server.${NAMESPACE}.svc.cluster.local:5060" LOG_SERVER_DATABASE_URL="mongodb://${DB_USERNAME:-admin}:${PASSWD_OR_SECRET}@mongodb-0.mongo.${NAMESPACE}.svc.cluster.local:27017/function-logs?authSource=admin&replicaSet=rs0&w=majority" @@ -100,6 +124,7 @@ helm install server -n ${NAMESPACE} \ --set default_region.log_server_url=${LOG_SERVER_URL} \ --set default_region.log_server_secret=${LOG_SERVER_SECRET} \ --set default_region.log_server_database_url=${LOG_SERVER_DATABASE_URL} \ + $( [[ $ENABLE_MONITOR ]] && echo "--set default_region.prometheus_url=${PROMETHEUS_URL}" ) \ ./charts/laf-server ## 6. install laf-web diff --git a/e2e/.gitignore b/e2e/.gitignore new file mode 100644 index 0000000000..4c49bd78f1 --- /dev/null +++ b/e2e/.gitignore @@ -0,0 +1 @@ +.env diff --git a/e2e/.prettierrc b/e2e/.prettierrc new file mode 100644 index 0000000000..5a6fd01584 --- /dev/null +++ b/e2e/.prettierrc @@ -0,0 +1,5 @@ +{ + "singleQuote": true, + "trailingComma": "all", + "semi": false +} \ No newline at end of file diff --git a/e2e/2-monitor/00-query.test.ts b/e2e/2-monitor/00-query.test.ts new file mode 100644 index 0000000000..8fa4fd83b3 --- /dev/null +++ b/e2e/2-monitor/00-query.test.ts @@ -0,0 +1,104 @@ +import { describe, expect, test } from '@jest/globals' +import { api, EnsureTestToken, GetRegion, GetRuntime } from '../api' +import { Config } from '../config' + +describe('query monitor metrics normally', () => { + let appid = Config.TEST_APPID + let token = null + beforeAll(async () => { + token = await EnsureTestToken() + expect(token).toBeTruthy() + }) + + test( + 'query monitor metrics', + async () => { + const metrics = [ + 'cpuUsage', + 'memoryUsage', + 'storageUsage', + 'databaseUsage', + ] + + const query = { + q: metrics, + step: 300, + } + + const res = await api.get(`/v1/monitor/${appid}/metrics`, { + params: query, + headers: { Authorization: `Bearer ${token}` }, + }) + + expect(res.status).toBe(200) + + const data = res.data?.data + expect(Object.keys(data)).toEqual(expect.arrayContaining(metrics)) + + Object.values(data).forEach((v: []) => { + expect(Array.isArray(v)).toBeTruthy() + v.forEach((item) => { + expect([ + ['values', 'metric'], + ['value', 'metric'], + ]).toContainEqual(expect.arrayContaining(Object.keys(item))) + }) + }) + }, + 10 * 1000, + ) +}) + +describe('query monitor metrics with invalid inputs', () => { + let appid = Config.TEST_APPID + let token = null + beforeAll(async () => { + token = await EnsureTestToken() + expect(token).toBeTruthy() + }) + + test( + 'query monitor metrics with invalid step', + async () => { + const metrics = [ + 'cpuUsage', + 'memoryUsage', + 'storageUsage', + 'databaseUsage', + ] + + const query = { + q: metrics, + step: 50, + } + + const res = await api.get(`/v1/monitor/${appid}/metrics`, { + params: query, + headers: { Authorization: `Bearer ${token}` }, + }) + + expect(res.status).toBe(400) + }, + 10 * 1000, + ) + + test( + 'query monitor metrics with invalid metrics', + async () => { + const metrics = ['cpuUsage', 'memoryUsage', 'storageUsage', 'invalid'] + + const query = { + q: metrics, + step: 50, + } + + const res = await api.get(`/v1/monitor/${appid}/metrics`, { + params: query, + headers: { Authorization: `Bearer ${token}` }, + }) + + expect(res.status).toBe(400) + }, + 10 * 1000, + ) +}) diff --git a/e2e/config.ts b/e2e/config.ts index 0e691c33e8..abd8562184 100644 --- a/e2e/config.ts +++ b/e2e/config.ts @@ -1,4 +1,6 @@ +import * as dotenv from 'dotenv' +dotenv.config() export class Config { static get DOMAIN() { @@ -16,12 +18,17 @@ export class Config { } static get APISIX_ADMIN_URL() { - if (!process.env.APISIX_ADMIN_URL) throw new Error('APISIX_ADMIN_URL is not set') - return process.env.APISIX_ADMIN_URL || `http://${Config.DOMAIN}:9180/apisix/admin` + if (!process.env.APISIX_ADMIN_URL) + throw new Error('APISIX_ADMIN_URL is not set') + return ( + process.env.APISIX_ADMIN_URL || + `http://${Config.DOMAIN}:9180/apisix/admin` + ) } static get APISIX_ADMIN_KEY() { - if (!process.env.APISIX_ADMIN_KEY) throw new Error('APISIX_ADMIN_KEY is not set') + if (!process.env.APISIX_ADMIN_KEY) + throw new Error('APISIX_ADMIN_KEY is not set') return process.env.APISIX_ADMIN_KEY } @@ -36,4 +43,9 @@ export class Config { static get TEST_APP_NAME() { return process.env.TEST_APP_NAME || 'testing-e2e-application-name' } -} \ No newline at end of file + + static get TEST_APPID() { + if (!process.env.TEST_APPID) throw new Error('TEST_APPID is not set') + return process.env.TEST_APPID + } +} diff --git a/e2e/package-lock.json b/e2e/package-lock.json index dd61630eab..da73dfb84d 100644 --- a/e2e/package-lock.json +++ b/e2e/package-lock.json @@ -10,6 +10,7 @@ "license": "ISC", "dependencies": { "axios": "^1.4.0", + "dotenv": "^16.3.1", "mongodb": "^5.7.0" }, "devDependencies": { @@ -1637,6 +1638,17 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, + "node_modules/dotenv": { + "version": "16.3.1", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.3.1.tgz", + "integrity": "sha512-IPzF4w4/Rd94bA9imS68tZBaYyBWSCE47V1RGuMrB94iyTOIEwRmVL2x/4An+6mETpLrKJ5hQkB8W4kFAadeIQ==", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/motdotla/dotenv?sponsor=1" + } + }, "node_modules/electron-to-chromium": { "version": "1.4.485", "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.485.tgz", diff --git a/e2e/package.json b/e2e/package.json index 503ed898a3..e74becf1e5 100644 --- a/e2e/package.json +++ b/e2e/package.json @@ -16,6 +16,7 @@ }, "dependencies": { "axios": "^1.4.0", + "dotenv": "^16.3.1", "mongodb": "^5.7.0" } } diff --git a/lerna.json b/lerna.json index 1cbede8bdf..ba15ac22ab 100644 --- a/lerna.json +++ b/lerna.json @@ -6,7 +6,8 @@ "./web", "./runtimes/nodejs", "./cli", - "./services/*" + "./services/*", + "./e2e" ], "version": "1.0.0-beta.10", "command": { diff --git a/server/src/app.module.ts b/server/src/app.module.ts index d326171ece..3b50568680 100644 --- a/server/src/app.module.ts +++ b/server/src/app.module.ts @@ -29,6 +29,7 @@ import { GroupModule } from './group/group.module' import { APP_INTERCEPTOR } from '@nestjs/core' import { AppInterceptor } from './app.interceptor' import { InterceptorModule } from './interceptor/interceptor.module' +import { MonitorModule } from './monitor/monitor.module' @Module({ imports: [ @@ -74,6 +75,7 @@ import { InterceptorModule } from './interceptor/interceptor.module' RecycleBinModule, GroupModule, InterceptorModule, + MonitorModule, ], controllers: [AppController], providers: [ diff --git a/server/src/constants.ts b/server/src/constants.ts index 655b8d2be9..83c8316d27 100644 --- a/server/src/constants.ts +++ b/server/src/constants.ts @@ -10,6 +10,10 @@ export class ServerConfig { return process.env.DATABASE_URL } + static get PROMETHEUS_URL() { + return process.env.PROMETHEUS_URL + } + static get METERING_DATABASE_URL() { if (!process.env.METERING_DATABASE_URL) { throw new Error('METERING_DATABASE_URL is not defined') @@ -172,6 +176,10 @@ export class ServerConfig { return process.env.DEFAULT_REGION_LOG_SERVER_DATABASE_URL } + static get DEFAULT_REGION_PROMETHEUS_URL() { + return process.env.DEFAULT_REGION_PROMETHEUS_URL + } + // HTTP interceptor static get HTTP_INTERCEPTOR_URL() { return process.env.HTTP_INTERCEPTOR_URL diff --git a/server/src/initializer/initializer.service.ts b/server/src/initializer/initializer.service.ts index 6f161bca05..ab984d0fea 100644 --- a/server/src/initializer/initializer.service.ts +++ b/server/src/initializer/initializer.service.ts @@ -71,6 +71,9 @@ export class InitializerService { secret: ServerConfig.DEFAULT_REGION_LOG_SERVER_SECRET, databaseUrl: ServerConfig.DEFAULT_REGION_LOG_SERVER_DATABASE_URL, }, + prometheusConf: { + apiUrl: ServerConfig.DEFAULT_REGION_PROMETHEUS_URL, + }, updatedAt: new Date(), createdAt: new Date(), state: 'Active', diff --git a/server/src/monitor/dto/query-metrics.dto.ts b/server/src/monitor/dto/query-metrics.dto.ts new file mode 100644 index 0000000000..8b56ada79b --- /dev/null +++ b/server/src/monitor/dto/query-metrics.dto.ts @@ -0,0 +1,22 @@ +import { ApiProperty } from '@nestjs/swagger' +import { IsArray, IsEnum, IsNumber, Max, Min } from 'class-validator' +import { MonitorMetric } from '../monitor.service' +import { Transform } from 'class-transformer' + +export class QueryMetricsDto { + @ApiProperty({ isArray: true, enum: MonitorMetric }) + @IsEnum(MonitorMetric, { each: true }) + @IsArray() + q: MonitorMetric[] + + @ApiProperty({ + minimum: 60, + maximum: 3600, + description: 'Query step in seconds', + }) + @IsNumber() + @Min(60) + @Max(3600) + @Transform(({ value }) => Number(value)) + step: number +} diff --git a/server/src/monitor/monitor.controller.ts b/server/src/monitor/monitor.controller.ts new file mode 100644 index 0000000000..56b082a70e --- /dev/null +++ b/server/src/monitor/monitor.controller.ts @@ -0,0 +1,33 @@ +import { Controller, Get, Param, Query, UseGuards } from '@nestjs/common' +import { MonitorService } from './monitor.service' +import { ResponseUtil } from 'src/utils/response' +import { JwtAuthGuard } from 'src/authentication/jwt.auth.guard' +import { ApplicationAuthGuard } from 'src/authentication/application.auth.guard' +import { + ApiBearerAuth, + ApiOperation, + ApiResponse, + ApiTags, +} from '@nestjs/swagger' +import { QueryMetricsDto } from './dto/query-metrics.dto' + +@ApiTags('Monitor') +@ApiBearerAuth('Authorization') +@Controller('monitor') +export class MonitorController { + constructor(private readonly monitorService: MonitorService) {} + + @ApiOperation({ summary: 'Get monitor metrics data' }) + @ApiResponse({ type: ResponseUtil }) + @UseGuards(JwtAuthGuard, ApplicationAuthGuard) + @Get(':appid/metrics') + async getData(@Param('appid') appid: string, @Query() dto: QueryMetricsDto) { + const { q: metrics, step } = dto + + const res = await this.monitorService.getData(appid, metrics, { + step, + }) + + return ResponseUtil.ok(res) + } +} diff --git a/server/src/monitor/monitor.module.ts b/server/src/monitor/monitor.module.ts new file mode 100644 index 0000000000..60a12882fb --- /dev/null +++ b/server/src/monitor/monitor.module.ts @@ -0,0 +1,13 @@ +import { HttpModule } from '@nestjs/axios' +import { Module } from '@nestjs/common' +import { MonitorController } from './monitor.controller' +import { MonitorService } from './monitor.service' +import { ApplicationModule } from 'src/application/application.module' + +@Module({ + imports: [HttpModule, ApplicationModule], + controllers: [MonitorController], + providers: [MonitorService], + exports: [MonitorService], +}) +export class MonitorModule {} diff --git a/server/src/monitor/monitor.service.ts b/server/src/monitor/monitor.service.ts new file mode 100644 index 0000000000..a292356194 --- /dev/null +++ b/server/src/monitor/monitor.service.ts @@ -0,0 +1,190 @@ +import { HttpService } from '@nestjs/axios' +import { Injectable, Logger } from '@nestjs/common' +import { ApplicationService } from 'src/application/application.service' +import { PrometheusConf } from 'src/region/entities/region' +import { RegionService } from 'src/region/region.service' +import { GetApplicationNamespaceByAppId } from 'src/utils/getter' + +const requestConfig = { + retryAttempts: 5, + retryDelayBase: 300, + rateAccuracy: '1m', +} + +export const getQuery = + ({ rateAccuracy }: { rateAccuracy: string }) => + (opts: Record, metric: MonitorMetric) => { + switch (metric) { + case MonitorMetric.cpuUsage: + return { + instant: false, + query: `sum(rate(container_cpu_usage_seconds_total{pod=~"${opts.pods}",namespace="${opts.namespace}"}[${rateAccuracy}])) by (${opts.selector})`, + } + // case MonitorMetric.cpuRequests: + // return { + // instant: false, + // query: `sum(kube_pod_container_resource_requests{pod=~"${opts.pods}",resource="cpu",namespace="${opts.namespace}"}) by (${opts.selector})`, + // } + // case MonitorMetric.cpuLimits: + // return { + // instant: false, + // query: `sum(kube_pod_container_resource_limits{pod=~"${opts.pods}",resource="cpu",namespace="${opts.namespace}"}) by (${opts.selector})`, + // } + case MonitorMetric.memoryUsage: + return { + instant: false, + query: `sum(container_memory_working_set_bytes{pod=~"${opts.pods}",namespace="${opts.namespace}"}) by (${opts.selector})`, + } + // case MonitorMetric.memoryRequests: + // return { + // instant: false, + // query: `sum(kube_pod_container_resource_requests{pod=~"${opts.pods}",resource="memory",namespace="${opts.namespace}"}) by (${opts.selector})`, + // } + // case MonitorMetric.memoryLimits: + // return { + // instant: false, + // query: `sum(kube_pod_container_resource_limits{pod=~"${opts.pods}",resource="memory",namespace="${opts.namespace}"}) by (${opts.selector})`, + // } + // case MonitorMetric.networkReceive: + // return { + // instant: false, + // query: `sum(rate(container_network_receive_bytes_total{pod=~"${opts.pods}",namespace="${opts.namespace}"}[${rateAccuracy}])) by (${opts.selector})`, + // } + // case MonitorMetric.networkTransmit: + // return { + // instant: false, + // query: `sum(rate(container_network_transmit_bytes_total{pod=~"${opts.pods}",namespace="${opts.namespace}"}[${rateAccuracy}])) by (${opts.selector})`, + // } + case MonitorMetric.databaseUsage: + return { + instant: true, + query: `sum(mongodb_dbstats_dataSize{database="${opts.appid}"})`, + } + case MonitorMetric.storageUsage: + return { + instant: true, + query: `sum(minio_bucket_usage_total_bytes{bucket=~"${opts.appid}.+"})`, + } + } + } + +export enum MonitorMetric { + cpuUsage = 'cpuUsage', // + // cpuRequests = 'cpuRequests', + // cpuLimits = 'cpuLimits', + memoryUsage = 'memoryUsage', // + // memoryRequests = 'memoryRequests', + // memoryLimits = 'memoryLimits', + // networkReceive = 'networkReceive', // + // networkTransmit = 'networkTransmit', // + databaseUsage = 'databaseUsage', // + storageUsage = 'storageUsage', // +} + +@Injectable() +export class MonitorService { + constructor( + private readonly httpService: HttpService, + private readonly applicationService: ApplicationService, + private readonly regionService: RegionService, + ) {} + private readonly logger = new Logger(MonitorService.name) + + private async getPrometheusConf(appid: string) { + const app = await this.applicationService.findOne(appid) + const region = await this.regionService.findOne(app.regionId) + return region?.prometheusConf + } + + async getData( + appid: string, + metrics: MonitorMetric[], + queryParams: Record, + ) { + const conf = await this.getPrometheusConf(appid) + if (!conf?.apiUrl) { + this.logger.warn('Metrics not available for no endpoint') + return [] + } + + const opts = { + appid, + selector: 'pod', + namespace: GetApplicationNamespaceByAppId(appid), + pods: appid + '.+', + } + const data = {} + const res = metrics.map(async (metric) => { + const { query, instant } = getQuery({ + rateAccuracy: requestConfig.rateAccuracy, + })(opts, metric) + + data[metric] = instant + ? await this.query(conf, query) + : await this.queryRange(conf, query, queryParams) + }) + + await Promise.all(res) + return data + } + + private async query(conf: PrometheusConf, query: string) { + const endpoint = `${conf.apiUrl}/api/v1/query` + + return await this.queryInternal(endpoint, { query }) + } + + private async queryRange( + conf: PrometheusConf, + query: string, + queryParams: Record, + ) { + const range = 3600 // 1 hour + const now = Math.floor(Date.now() / 1000) + const start = now - range + const end = now + + queryParams = { + range, + step: 60, + start, + end, + ...queryParams, + } + + const endpoint = `${conf.apiUrl}/api/v1/query_range` + + return await this.queryInternal(endpoint, { + query, + ...queryParams, + }) + } + + private async queryInternal( + endpoint: string, + query: Record, + ) { + for (let attempt = 1; attempt <= requestConfig.retryAttempts; attempt++) { + try { + const res = await this.httpService + .post(endpoint, query, { + headers: { + 'Content-Type': 'application/x-www-form-urlencoded', + }, + }) + .toPromise() + + return res.data.data.result + } catch (error) { + if (attempt >= requestConfig.retryAttempts) { + this.logger.error('Metrics not available', error.message) + return [] + } + + await new Promise((resolve) => + setTimeout(resolve, attempt * requestConfig.retryDelayBase), + ) + } + } + } +} diff --git a/server/src/region/entities/region.ts b/server/src/region/entities/region.ts index c076ad95b9..6bb93cfc89 100644 --- a/server/src/region/entities/region.ts +++ b/server/src/region/entities/region.ts @@ -38,6 +38,10 @@ export type LogServerConf = { databaseUrl: string } +export type PrometheusConf = { + apiUrl: string +} + export class Region { @ApiProperty({ type: String }) _id?: ObjectId @@ -53,6 +57,7 @@ export class Region { gatewayConf: RegionGatewayConf storageConf: RegionStorageConf logServerConf: LogServerConf + prometheusConf: PrometheusConf @ApiProperty() tls: boolean