diff --git a/localenv/telemetry/grafana/provisioning/dashboards/example.json b/localenv/telemetry/grafana/provisioning/dashboards/example.json index a4a59f4abd..513c1b1448 100644 --- a/localenv/telemetry/grafana/provisioning/dashboards/example.json +++ b/localenv/telemetry/grafana/provisioning/dashboards/example.json @@ -106,7 +106,7 @@ "uid": "PBFA97CFB590B2093" }, "editorMode": "code", - "expr": "(increase(transactions_amount_total[30s]) * 0.5 * 0.0001)", + "expr": "(increase(packet_amount_fulfill_total[30s]) * 0.5 * 0.0001)", "hide": false, "instant": false, "interval": "", diff --git a/packages/backend/src/open_payments/payment/outgoing/service.test.ts b/packages/backend/src/open_payments/payment/outgoing/service.test.ts index d2b09e516d..3b7ab0ef1a 100644 --- a/packages/backend/src/open_payments/payment/outgoing/service.test.ts +++ b/packages/backend/src/open_payments/payment/outgoing/service.test.ts @@ -1161,6 +1161,79 @@ describe('OutgoingPaymentService', (): void => { return payment } + test('Telemetry Transaction Counter increments for COMPLETED transactions', async (): Promise => { + const incrementTrxCounterSpy = jest + .spyOn(telemetryService!, 'incrementCounter') + .mockImplementation(() => Promise.resolve()) + + incomingPayment = await createIncomingPayment(deps, { + walletAddressId: receiverWalletAddress.id, + incomingAmount: { + value: receiveAmount.value, + assetCode: receiverWalletAddress.asset.code, + assetScale: receiverWalletAddress.asset.scale + } + }) + assert.ok(incomingPayment.walletAddress) + + const createdPayment = await setup({ + receiver: incomingPayment.getUrl(incomingPayment.walletAddress), + receiveAmount, + method: 'ilp' + }) + + mockPaymentHandlerPay(createdPayment, incomingPayment) + const payment = await processNext( + createdPayment.id, + OutgoingPaymentState.Completed + ) + await expectOutcome(payment, { + accountBalance: 0n, + amountSent: payment.debitAmount.value, + amountDelivered: payment.receiveAmount.value, + incomingPaymentReceived: payment.receiveAmount.value, + withdrawAmount: 0n + }) + + expect(incrementTrxCounterSpy).toHaveBeenCalledTimes(1) + }) + + test('Telemetry Transaction Counter does not increments for FAILED transactions', async (): Promise => { + const incrementTrxCounterSpy = jest + .spyOn(telemetryService!, 'incrementCounter') + .mockImplementation(() => Promise.resolve()) + const createdPayment = await setup({ + receiver, + debitAmount, + method: 'ilp' + }) + + mockPaymentHandlerPay( + createdPayment, + incomingPayment, + new PaymentMethodHandlerError('Failed payment', { + description: '', + retryable: false + }) + ) + + const payment = await processNext( + createdPayment.id, + OutgoingPaymentState.Failed, + 'Failed payment' + ) + + await expectOutcome(payment, { + accountBalance: payment.debitAmount.value, + amountSent: 0n, + amountDelivered: 0n, + incomingPaymentReceived: incomingPayment.receivedAmount.value, + withdrawAmount: 0n + }) + + expect(incrementTrxCounterSpy).not.toHaveBeenCalled() + }) + test('COMPLETED with Telemetry Fee Counter (receiveAmount < incomingPayment.incomingAmount)', async (): Promise => { const spyTelFeeAmount = jest.spyOn( telemetryService, diff --git a/packages/backend/src/payment-method/ilp/connector/core/README.md b/packages/backend/src/payment-method/ilp/connector/core/README.md index 539024ff30..612cce1acc 100644 --- a/packages/backend/src/payment-method/ilp/connector/core/README.md +++ b/packages/backend/src/payment-method/ilp/connector/core/README.md @@ -1,3 +1,43 @@ # `core` > Toolkit comprised of core services and middleware needed to develop ILP applications. + +## Telemetry Collection + +Currently, we collect packet count and packet amount metrics directly within the connector core. These metrics are captured at the Interledger layer to track packet activity while ensuring privacy by collecting amounts at the packet level, rather than at the transaction level. This approach helps to preserve privacy, as we do not expose entire transaction amounts, while also incorporating privacy-preserving measures into the collected amounts. You can read more about privacy [here](https://rafiki.dev/telemetry/privacy/). + +### Why We Collect on the Sending Side + +The first decision in collecting this data was whether to do so on the sender's side or the receiver's side. We opted for the sender’s side to maintain consistency across our metrics, particularly for calculating average transaction amounts and times, which are tied to the outgoing payment flow in the Open Payments (OP) layer. By collecting metrics from the same perspective, we ensure alignment with how other metrics (like transaction completion times) are captured. + +We also considered collecting metrics on both the sending and receiving sides, capturing metrics when prepare packets were received by the receiver and when fulfill or reject responses were received by the sender. However, this could lead to unreliable metrics, as telemetry is optional and might not be enabled on all nodes. + +### Why We Chose handleIlpData + +Given these considerations, we decided to place our packet count and amount metrics in the `handleIlpData` function within the Rafiki Connector Core. This function plays a crucial role in processing ILP packets, handling both outgoing payments and quotes. + +The specific metrics we collect here include: + +- packet_count_prepare: Counts the prepare packets sent, collected before the middleware routes are executed. +- packet_count_fulfill: Counts the fulfill packets received, collected after receiving a reply from the receiver. +- packet_count_reject: Counts the reject packets received. +- packet_amount_fulfill: Records the amount sent in fulfill packets. + +These metrics provide valuable insights, including potential packet loss. + +### Challenges with the Current Setup + +While `handleIlpData` is an effective location for telemetry collection, it has some limitations: + +Sender-Side Limitation: Currently, metrics are only collected on the sender side. This is adequate for now but does not capture data from connecting nodes, which we plan to address in future implementations when multi-hop support is added. + +### Other Considered Locations for Metrics Collection + +We explored several alternative locations for collecting telemetry metrics within the Rafiki Connector Core: + +- Dedicated Middleware: Initially, we implemented a dedicated telemetry middleware. However, this approach resulted in data being duplicated on both the sender and receiver sides, leading to inaccurate metrics. To address this, we would need to filter the data to ensure metrics are only collected on the sender side. Additionally, the telemetry middleware would need to effectively handle errors thrown in the middleware chain. To achieve this, it might be necessary to place the telemetry middleware right before the error-handling middleware, allowing it to catch and reflect any errors that occur. This would involve collecting the prepare packet count, wrapping the `next()` function in a try-catch block to capture any new errors, and then collecting reply and amount metrics after `next()` has resolved. +- `ilpHandler` on the Receiving Side: We also considered adding telemetry to the `ilpHandler` function, which processes middleware on receiving and connecting nodes. However, this would lead to a fragmented metric collection, with some metrics gathered on the receiver side and others on the sender side. This fragmentation could complicate the handling of concepts like transaction count, which would require dual collection on both sender and receiver sides. We'd also have to watch for the possibility of data duplication on the connectors because their middleware might trigger in each direction, as they receive prepares and again as they receive responses. + +### Moving Forward + +As we implement multi-hop capabilities and further refine the connector architecture, we will likely identify better entry and exit points for telemetry collection. This will allow us to capture a more comprehensive set of metrics across both sender and receiver nodes, ensuring a complete and accurate understanding of the network’s behavior. diff --git a/packages/backend/src/payment-method/ilp/connector/core/middleware/telemetry.ts b/packages/backend/src/payment-method/ilp/connector/core/middleware/telemetry.ts deleted file mode 100644 index 8b28cf5cf6..0000000000 --- a/packages/backend/src/payment-method/ilp/connector/core/middleware/telemetry.ts +++ /dev/null @@ -1,30 +0,0 @@ -import { ValueType } from '@opentelemetry/api' -import { ILPContext, ILPMiddleware } from '../rafiki' - -export function createTelemetryMiddleware(): ILPMiddleware { - return async ( - { request, services, accounts, response }: ILPContext, - next: () => Promise - ): Promise => { - await next() - - const value = BigInt(request.prepare.amount) - - if (services.telemetry && value && response.fulfill) { - const { code: assetCode, scale: assetScale } = accounts.outgoing.asset - - await services.telemetry.incrementCounterWithTransactionAmount( - 'transactions_amount', - { - value, - assetCode, - assetScale - }, - { - description: 'Amount sent through the network', - valueType: ValueType.DOUBLE - } - ) - } - } -} diff --git a/packages/backend/src/payment-method/ilp/connector/core/rafiki.ts b/packages/backend/src/payment-method/ilp/connector/core/rafiki.ts index 16e58f8e69..d23c19377f 100644 --- a/packages/backend/src/payment-method/ilp/connector/core/rafiki.ts +++ b/packages/backend/src/payment-method/ilp/connector/core/rafiki.ts @@ -21,6 +21,11 @@ import { createIlpPacketMiddleware } from './middleware/ilp-packet' import { TelemetryService } from '../../../../telemetry/service' +import { + incrementPreparePacketCount, + incrementFulfillOrRejectPacketCount, + incrementAmount +} from './telemetry' // Model classes that represent an Interledger sender, receiver, or // connector SHOULD implement this ConnectorAccount interface. @@ -155,6 +160,11 @@ export class Rafiki { ): Promise { const prepare = new ZeroCopyIlpPrepare(rawPrepare) const response = new IlpResponse() + const telemetry = this.publicServer.context.services.telemetry + + if (telemetry) { + incrementPreparePacketCount(unfulfillable, prepare.amount, telemetry) + } await this.routes( { @@ -183,6 +193,24 @@ export class Rafiki { } ) if (!response.rawReply) throw new Error('error generating reply') + + if (telemetry) { + const { code, scale } = sourceAccount.asset + incrementFulfillOrRejectPacketCount( + unfulfillable, + prepare.amount, + response, + telemetry + ) + incrementAmount( + unfulfillable, + prepare.amount, + response, + code, + scale, + telemetry + ) + } return response.rawReply } diff --git a/packages/backend/src/payment-method/ilp/connector/core/telemetry.ts b/packages/backend/src/payment-method/ilp/connector/core/telemetry.ts new file mode 100644 index 0000000000..95ba2f0cb0 --- /dev/null +++ b/packages/backend/src/payment-method/ilp/connector/core/telemetry.ts @@ -0,0 +1,61 @@ +import { TelemetryService } from '../../../../telemetry/service' +import { IlpResponse } from './middleware/ilp-packet' +import { ValueType } from '@opentelemetry/api' + +export function incrementPreparePacketCount( + unfulfillable: boolean, + prepareAmount: string, + telemetry: TelemetryService +): void { + if (!unfulfillable && Number(prepareAmount)) { + telemetry.incrementCounter('packet_count_prepare', 1, { + description: 'Count of prepare packets that are sent' + }) + } +} + +export function incrementFulfillOrRejectPacketCount( + unfulfillable: boolean, + prepareAmount: string, + response: IlpResponse, + telemetry: TelemetryService +): void { + if (!unfulfillable && Number(prepareAmount)) { + if (response.fulfill) { + telemetry.incrementCounter('packet_count_fulfill', 1, { + description: 'Count of fulfill packets' + }) + } else if (response.reject) { + telemetry.incrementCounter('packet_count_reject', 1, { + description: 'Count of reject packets' + }) + } + } +} + +export async function incrementAmount( + unfulfillable: boolean, + prepareAmount: string, + response: IlpResponse, + code: string, + scale: number, + telemetry: TelemetryService +): Promise { + if (!unfulfillable && Number(prepareAmount)) { + if (response.fulfill) { + const value = BigInt(prepareAmount) + await telemetry.incrementCounterWithTransactionAmount( + 'packet_amount_fulfill', + { + value, + assetCode: code, + assetScale: scale + }, + { + description: 'Amount sent through the network', + valueType: ValueType.DOUBLE + } + ) + } + } +} diff --git a/packages/backend/src/payment-method/ilp/connector/core/test/middleware/telemetry.test.ts b/packages/backend/src/payment-method/ilp/connector/core/test/middleware/telemetry.test.ts deleted file mode 100644 index e33e21afa9..0000000000 --- a/packages/backend/src/payment-method/ilp/connector/core/test/middleware/telemetry.test.ts +++ /dev/null @@ -1,113 +0,0 @@ -import assert from 'assert' -import { IlpResponse, OutgoingAccount, ZeroCopyIlpPrepare } from '../..' -import { IncomingAccountFactory, RafikiServicesFactory } from '../../factories' -import { createTelemetryMiddleware } from '../../middleware/telemetry' -import { createILPContext } from '../../utils' -import { IlpFulfill } from 'ilp-packet' - -const incomingAccount = IncomingAccountFactory.build({ id: 'alice' }) - -assert.ok(incomingAccount.id) -const services = RafikiServicesFactory.build({}) - -const ctx = createILPContext({ - services, - request: { - prepare: { - amount: 100n - } as unknown as ZeroCopyIlpPrepare, - rawPrepare: Buffer.from('') - }, - accounts: { - incoming: incomingAccount, - outgoing: { asset: { code: 'USD', scale: 2 } } as OutgoingAccount - }, - state: { - unfulfillable: false, - incomingAccount: { - quote: 'exists' - } - }, - response: { - fulfill: 'exists' as unknown as IlpFulfill - } as IlpResponse -}) - -const middleware = createTelemetryMiddleware() -const next = jest.fn().mockImplementation(() => Promise.resolve()) - -beforeEach(async () => { - incomingAccount.balance = 100n - incomingAccount.asset.scale = 2 - incomingAccount.asset.code = 'USD' -}) - -describe('Telemetry Middleware', function () { - it('should not gather telemetry if telemetry is not enabled (service is undefined)', async () => { - const incrementCounterWithTransactionAmountSpy = jest - .spyOn(services.telemetry, 'incrementCounterWithTransactionAmount') - .mockImplementation(() => Promise.resolve()) - - await middleware( - { ...ctx, services: { ...ctx.services, telemetry: undefined } }, - next - ) - - expect(incrementCounterWithTransactionAmountSpy).not.toHaveBeenCalled() - expect(next).toHaveBeenCalled() - }) - - it('should not gather telemetry if response.fulfill undefined', async () => { - const incrementCounterWithTransactionAmountSpy = jest - .spyOn(services.telemetry, 'incrementCounterWithTransactionAmount') - .mockImplementation(() => Promise.resolve()) - - await middleware( - { ...ctx, response: { fulfill: undefined } as IlpResponse }, - next - ) - - expect(incrementCounterWithTransactionAmountSpy).not.toHaveBeenCalled() - expect(next).toHaveBeenCalled() - }) - - it('should not gather telemetry if request.prepare.amount is 0', async () => { - const incrementCounterWithTransactionAmountSpy = jest - .spyOn(services.telemetry, 'incrementCounterWithTransactionAmount') - .mockImplementation(() => Promise.resolve()) - - await middleware( - { - ...ctx, - request: { - ...ctx.request, - prepare: { amount: '0' } as ZeroCopyIlpPrepare - } - }, - next - ) - - expect(incrementCounterWithTransactionAmountSpy).not.toHaveBeenCalled() - expect(next).toHaveBeenCalled() - }) - - it('should gather telemetry without blocking middleware chain', async () => { - let nextCalled = false - const next = jest.fn().mockImplementation(() => { - nextCalled = true - return Promise.resolve() - }) - - const incrementCounterWithTransactionAmountSpy = jest - .spyOn(services.telemetry, 'incrementCounterWithTransactionAmount') - .mockImplementation(() => { - expect(nextCalled).toBe(true) - return Promise.resolve() - }) - - await middleware(ctx, next) - - expect(incrementCounterWithTransactionAmountSpy).toHaveBeenCalled() - expect(next).toHaveBeenCalled() - }) -}) diff --git a/packages/backend/src/payment-method/ilp/connector/core/test/telemetry.test.ts b/packages/backend/src/payment-method/ilp/connector/core/test/telemetry.test.ts new file mode 100644 index 0000000000..98988018f9 --- /dev/null +++ b/packages/backend/src/payment-method/ilp/connector/core/test/telemetry.test.ts @@ -0,0 +1,125 @@ +import { TelemetryService } from '../../../../../telemetry/service' +import { initIocContainer } from '../../../../../index' +import { Config } from '../../../../../config/app' +import { IocContract } from '@adonisjs/fold' +import { AppServices } from '../../../../../app' +import { + incrementPreparePacketCount, + incrementFulfillOrRejectPacketCount, + incrementAmount +} from '../telemetry' +import { IlpResponse } from '../middleware/ilp-packet' +import { IlpFulfillFactory, IlpRejectFactory } from '../factories' +import { ValueType } from '@opentelemetry/api' + +describe('Connector Core Telemetry', () => { + let deps: IocContract + let telemetryService: TelemetryService + let unfulfillable: boolean + let prepareAmount: string + + beforeAll(async (): Promise => { + unfulfillable = false + prepareAmount = '100' + deps = initIocContainer({ + ...Config, + enableTelemetry: true + }) + + telemetryService = await deps.use('telemetry')! + }) + + afterAll(async (): Promise => { + jest.restoreAllMocks() + }) + + it('incrementPreparePacketCount should create a packet_count_prepare counter and increment it by one', () => { + const incrementCounterSpy = jest + .spyOn(telemetryService!, 'incrementCounter') + .mockImplementation(() => Promise.resolve()) + const amount = 1 + const name = 'packet_count_prepare' + const attributes = { + description: 'Count of prepare packets that are sent' + } + + incrementPreparePacketCount(unfulfillable, prepareAmount, telemetryService) + + expect(incrementCounterSpy).toHaveBeenCalledWith(name, amount, attributes) + }) + + it('incrementFulfillOrRejectPacketCount should create a packet_count_fulfill counter and increment it by one when there is a fulfill response', () => { + const incrementCounterSpy = jest + .spyOn(telemetryService!, 'incrementCounter') + .mockImplementation(() => Promise.resolve()) + const amount = 1 + const name = 'packet_count_fulfill' + const prepareAmount = '100' + const attributes = { + description: 'Count of fulfill packets' + } + const response = new IlpResponse() + response.fulfill = IlpFulfillFactory.build() + + incrementFulfillOrRejectPacketCount( + unfulfillable, + prepareAmount, + response, + telemetryService + ) + + expect(incrementCounterSpy).toHaveBeenCalledWith(name, amount, attributes) + }) + + it('incrementFulfillOrRejectPacketCount should create a packet_count_reject counter and increment it by one when there is a reject response', () => { + const incrementCounterSpy = jest + .spyOn(telemetryService!, 'incrementCounter') + .mockImplementation(() => Promise.resolve()) + const amount = 1 + const name = 'packet_count_reject' + const prepareAmount = '100' + const attributes = { + description: 'Count of reject packets' + } + const response = new IlpResponse() + response.reject = IlpRejectFactory.build() + + incrementFulfillOrRejectPacketCount( + unfulfillable, + prepareAmount, + response, + telemetryService + ) + + expect(incrementCounterSpy).toHaveBeenCalledWith(name, amount, attributes) + }) + + it('incrementAmount should create a packet_amount_fulfill counter and increment it by one', () => { + const incrementCounterSpy = jest + .spyOn(telemetryService!, 'incrementCounterWithTransactionAmount') + .mockImplementation(() => Promise.resolve()) + + const amount = { value: 100n, assetCode: 'USD', assetScale: 2 } + const name = 'packet_amount_fulfill' + const code = 'USD' + const scale = 2 + const prepareAmount = '100' + const attributes = { + description: 'Amount sent through the network', + valueType: ValueType.DOUBLE + } + const response = new IlpResponse() + response.fulfill = IlpFulfillFactory.build() + + incrementAmount( + unfulfillable, + prepareAmount, + response, + code, + scale, + telemetryService + ) + + expect(incrementCounterSpy).toHaveBeenCalledWith(name, amount, attributes) + }) +}) diff --git a/packages/backend/src/payment-method/ilp/connector/index.ts b/packages/backend/src/payment-method/ilp/connector/index.ts index c97f3349d2..9f3ecc3f95 100644 --- a/packages/backend/src/payment-method/ilp/connector/index.ts +++ b/packages/backend/src/payment-method/ilp/connector/index.ts @@ -27,20 +27,18 @@ import { createStreamAddressMiddleware, createStreamController } from './core' - import { TelemetryService } from '../../../telemetry/service' -import { createTelemetryMiddleware } from './core/middleware/telemetry' interface ServiceDependencies extends BaseService { redis: Redis ratesService: RatesService accountingService: AccountingService - telemetry?: TelemetryService walletAddressService: WalletAddressService incomingPaymentService: IncomingPaymentService peerService: PeerService streamServer: StreamServer ilpAddress: string + telemetry?: TelemetryService } export async function createConnectorService({ @@ -48,12 +46,12 @@ export async function createConnectorService({ redis, ratesService, accountingService, - telemetry, walletAddressService, incomingPaymentService, peerService, streamServer, - ilpAddress + ilpAddress, + telemetry }: ServiceDependencies): Promise { return createApp( { @@ -62,13 +60,13 @@ export async function createConnectorService({ service: 'ConnectorService' }), accounting: accountingService, - telemetry, walletAddresses: walletAddressService, incomingPayments: incomingPaymentService, peers: peerService, redis, rates: ratesService, - streamServer + streamServer, + telemetry }, compose([ // Incoming Rules @@ -82,7 +80,6 @@ export async function createConnectorService({ // Local pay createBalanceMiddleware(), - createTelemetryMiddleware(), // Outgoing Rules createStreamController(), diff --git a/packages/documentation/src/content/docs/telemetry/overview.md b/packages/documentation/src/content/docs/telemetry/overview.md index 71590422ae..16b436581b 100644 --- a/packages/documentation/src/content/docs/telemetry/overview.md +++ b/packages/documentation/src/content/docs/telemetry/overview.md @@ -7,8 +7,10 @@ title: Overview The objective of the telemetry feature is to gather metrics and establish an infrastructure for visualizing valuable network insights. The metrics we at the Interledger Foundation collect include: - The total amount of money transferred via packet data within a specified time frame (daily, weekly, monthly). -- The number of transactions from outgoing payments that have been at least partially successful. +- The number of transactions that have been at least partially successful. +- The number of ILP packets flowing through the network. - The average amount of money held within the network per transaction. +- The avergae time it takes for an outgoing payment to complete. We aim to track the growth of the network in terms of transaction sizes and the number of transactions processed. Our goal is to use these data for our own insights and to enable [Account Servicing Entities](/reference/glossary#account-servicing-entity) (ASEs) to gain their own insights. @@ -60,21 +62,28 @@ If an ASE does not provide the necessary exchange rate for a transaction, the te ## Instrumentation -Rafiki currently has three counter metrics. All data points (counter increases and histogram records) are exported to collection endpoints at a configurable interval (default recommended to 15s). +Data points for all metrics (e.g. counter increases) are exported to collection endpoints at a configurable interval (default recommended to 15s). Currently collected metrics: - `transactions_total` - Counter metric - - Description: “Count of funded transactions” - - This counter metric increases by 1 for each successfully sent transaction. -- `transactions_amount` - Counter metric - - Description: “Amount sent through the network”. + - Description: “Count of funded outgoing transactions” + - This counter metric increases by 1 for each successfully funded outgoing payment resource. +- `packet_count_prepare` - Counter metric + - Description: “Count of prepare packets that are sent” + - This counter metric increases by 1 for each prepare packet that is sent. +- `packet_count_fulfill` - Counter metric + - Description: “Count of fulfill packets” + - This counter metric increases by 1 for each fulfill packet that is received. +- `packet_count_reject` - Counter metric + - Description: “Count of reject packets” + - This counter metric increases by 1 for each reject packet that is received. +- `packet_amount_fulfill` - Counter metric + - Description: “Amount sent through the network” - This amount metric increases by the amount sent in each ILP packet. - `transaction_fee_amounts` - Counter metric - Description: “Fee amount sent through the network”. - This fee amount metric increases by the (amount sent minus amount received) for an outgoing payment. - `ilp_pay_time_ms` - Histogram metric - - Description: “Time to complete an ILP payment” + - Description: “Time to complete an outgoing ILP payment” - This histogram metric records the time taken to make an ILP payment. - -**Note**: The current implementation only collects metrics on the SENDING side of a transaction. Metrics for external open-payments transactions RECEIVED by a Rafiki instance in the network are not collected.