forked from influxdata/telegraf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
vsphere.go
385 lines (347 loc) · 13.2 KB
/
vsphere.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
package vsphere
import (
"context"
"sync"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/plugins/common/tls"
"github.com/influxdata/telegraf/plugins/inputs"
"github.com/vmware/govmomi/vim25/soap"
)
// VSphere is the top level type for the vSphere input plugin. It contains all the configuration
// and a list of connected vSphere endpoints
type VSphere struct {
// Vcenters lists the vCenter URLs to be monitored. Start creates one endpoint per entry.
Vcenters []string
// Username and Password are configured together with the vCenter URLs (see sampleConfig).
Username string
Password string
// Per-resource settings. Each resource type (datacenter, cluster, host, VM, datastore)
// has the same five options:
//   - <Type>Instances:     boolean flag; hosts and VMs default to true, the rest to false.
//   - <Type>MetricInclude: metrics to collect; if omitted or empty, all metrics are collected.
//   - <Type>MetricExclude: metrics to skip; nothing is excluded by default.
//   - <Type>Include:       inventory paths of the objects to collect.
//   - <Type>Exclude:       inventory paths to exclude.
DatacenterInstances bool
DatacenterMetricInclude []string
DatacenterMetricExclude []string
DatacenterInclude []string
DatacenterExclude []string
ClusterInstances bool
ClusterMetricInclude []string
ClusterMetricExclude []string
ClusterInclude []string
ClusterExclude []string
HostInstances bool
HostMetricInclude []string
HostMetricExclude []string
HostInclude []string
HostExclude []string
// The VM options carry explicit TOML tags ("vm_...").
VMInstances bool `toml:"vm_instances"`
VMMetricInclude []string `toml:"vm_metric_include"`
VMMetricExclude []string `toml:"vm_metric_exclude"`
VMInclude []string `toml:"vm_include"`
VMExclude []string `toml:"vm_exclude"`
DatastoreInstances bool
DatastoreMetricInclude []string
DatastoreMetricExclude []string
DatastoreInclude []string
DatastoreExclude []string
// Separator is the character used for measurement and field names (default "_").
Separator string
// CustomAttributeInclude/Exclude select which vCenter custom attributes are used.
// The default exclude of "*" disables them.
CustomAttributeInclude []string
CustomAttributeExclude []string
// UseIntSamples, when true, sends all samples as integers; when false, values are
// sent as floats to preserve precision when averaging takes place.
UseIntSamples bool
// IPAddresses selects which IP addresses to collect; valid values are "ipv4" and "ipv6".
IPAddresses []string
// MetricLookback is the number of vSphere 5 minute collection cycles to look back
// for non-realtime metrics (default 3).
MetricLookback int
// MaxQueryObjects is the number of objects to retrieve per query for realtime
// resources (VMs and hosts); default 256.
MaxQueryObjects int
// MaxQueryMetrics is the number of metrics to retrieve per query for non-realtime
// resources (clusters and datastores); default 256.
MaxQueryMetrics int
// CollectConcurrency and DiscoverConcurrency are the number of goroutines to use
// for collection and for discovery of objects and metrics (default 1 each).
CollectConcurrency int
DiscoverConcurrency int
// ForceDiscoverOnInit is deprecated: Start logs a warning when it is false and
// states that setting it to false has no effect.
ForceDiscoverOnInit bool
// ObjectDiscoveryInterval is the interval before (re)discovering objects subject
// to metrics collection (default 300s).
ObjectDiscoveryInterval config.Duration
// Timeout applies to any of the API requests made to vCenter (default 60s).
Timeout config.Duration
// HistoricalInterval must match the "Interval Duration" configured on the vCenter
// server (default 5m).
HistoricalInterval config.Duration
// endpoints holds one Endpoint per configured vCenter; populated by Start.
endpoints []*Endpoint
// cancel cancels the context created in Start; it is invoked by Stop.
cancel context.CancelFunc
// Mix in the TLS/SSL goodness from core
tls.ClientConfig
// Log is the logger used by the plugin and passed on to each endpoint.
Log telegraf.Logger
}
// sampleConfig is the TOML configuration template returned by SampleConfig.
// Commented-out settings show the plugin defaults; the uncommented ones are
// the values a user is expected to edit or keep.
var sampleConfig = `
## List of vCenter URLs to be monitored. These three lines must be uncommented
## and edited for the plugin to work.
vcenters = [ "https://vcenter.local/sdk" ]
username = "user@corp.local"
password = "secret"
## VMs
## Typical VM metrics (if omitted or empty, all metrics are collected)
# vm_include = [ "/*/vm/**"] # Inventory path to VMs to collect (by default all are collected)
# vm_exclude = [] # Inventory paths to exclude
vm_metric_include = [
"cpu.demand.average",
"cpu.idle.summation",
"cpu.latency.average",
"cpu.readiness.average",
"cpu.ready.summation",
"cpu.run.summation",
"cpu.usagemhz.average",
"cpu.used.summation",
"cpu.wait.summation",
"mem.active.average",
"mem.granted.average",
"mem.latency.average",
"mem.swapin.average",
"mem.swapinRate.average",
"mem.swapout.average",
"mem.swapoutRate.average",
"mem.usage.average",
"mem.vmmemctl.average",
"net.bytesRx.average",
"net.bytesTx.average",
"net.droppedRx.summation",
"net.droppedTx.summation",
"net.usage.average",
"power.power.average",
"virtualDisk.numberReadAveraged.average",
"virtualDisk.numberWriteAveraged.average",
"virtualDisk.read.average",
"virtualDisk.readOIO.latest",
"virtualDisk.throughput.usage.average",
"virtualDisk.totalReadLatency.average",
"virtualDisk.totalWriteLatency.average",
"virtualDisk.write.average",
"virtualDisk.writeOIO.latest",
"sys.uptime.latest",
]
# vm_metric_exclude = [] ## Nothing is excluded by default
# vm_instances = true ## true by default
## Hosts
## Typical host metrics (if omitted or empty, all metrics are collected)
# host_include = [ "/*/host/**"] # Inventory path to hosts to collect (by default all are collected)
# host_exclude = [] # Inventory paths to exclude
host_metric_include = [
"cpu.coreUtilization.average",
"cpu.costop.summation",
"cpu.demand.average",
"cpu.idle.summation",
"cpu.latency.average",
"cpu.readiness.average",
"cpu.ready.summation",
"cpu.swapwait.summation",
"cpu.usage.average",
"cpu.usagemhz.average",
"cpu.used.summation",
"cpu.utilization.average",
"cpu.wait.summation",
"disk.deviceReadLatency.average",
"disk.deviceWriteLatency.average",
"disk.kernelReadLatency.average",
"disk.kernelWriteLatency.average",
"disk.numberReadAveraged.average",
"disk.numberWriteAveraged.average",
"disk.read.average",
"disk.totalReadLatency.average",
"disk.totalWriteLatency.average",
"disk.write.average",
"mem.active.average",
"mem.latency.average",
"mem.state.latest",
"mem.swapin.average",
"mem.swapinRate.average",
"mem.swapout.average",
"mem.swapoutRate.average",
"mem.totalCapacity.average",
"mem.usage.average",
"mem.vmmemctl.average",
"net.bytesRx.average",
"net.bytesTx.average",
"net.droppedRx.summation",
"net.droppedTx.summation",
"net.errorsRx.summation",
"net.errorsTx.summation",
"net.usage.average",
"power.power.average",
"storageAdapter.numberReadAveraged.average",
"storageAdapter.numberWriteAveraged.average",
"storageAdapter.read.average",
"storageAdapter.write.average",
"sys.uptime.latest",
]
## Collect IP addresses? Valid values are "ipv4" and "ipv6"
# ip_addresses = ["ipv6", "ipv4" ]
# host_metric_exclude = [] ## Nothing excluded by default
# host_instances = true ## true by default
## Clusters
# cluster_include = [ "/*/host/**"] # Inventory path to clusters to collect (by default all are collected)
# cluster_exclude = [] # Inventory paths to exclude
# cluster_metric_include = [] ## if omitted or empty, all metrics are collected
# cluster_metric_exclude = [] ## Nothing excluded by default
# cluster_instances = false ## false by default
## Datastores
# datastore_include = [ "/*/datastore/**"] # Inventory path to datastores to collect (by default all are collected)
# datastore_exclude = [] # Inventory paths to exclude
# datastore_metric_include = [] ## if omitted or empty, all metrics are collected
# datastore_metric_exclude = [] ## Nothing excluded by default
# datastore_instances = false ## false by default
## Datacenters
# datacenter_include = [ "/*" ] # Inventory path to datacenters to collect (by default all are collected)
# datacenter_exclude = [] # Inventory paths to exclude
datacenter_metric_include = [] ## if omitted or empty, all metrics are collected
datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
# datacenter_instances = false ## false by default
## Plugin Settings
## separator character to use for measurement and field names (default: "_")
# separator = "_"
## number of objects to retrieve per query for realtime resources (vms and hosts)
## set to 64 for vCenter 5.5 and 6.0 (default: 256)
# max_query_objects = 256
## number of metrics to retrieve per query for non-realtime resources (clusters and datastores)
## set to 64 for vCenter 5.5 and 6.0 (default: 256)
# max_query_metrics = 256
## number of go routines to use for collection and discovery of objects and metrics
# collect_concurrency = 1
# discover_concurrency = 1
## the interval before (re)discovering objects subject to metrics collection (default: 300s)
# object_discovery_interval = "300s"
## timeout applies to any of the api request made to vcenter
# timeout = "60s"
## When set to true, all samples are sent as integers. This makes the output
## data types backwards compatible with Telegraf 1.9 or lower. Normally all
## samples from vCenter, with the exception of percentages, are integer
## values, but under some conditions, some averaging takes place internally in
## the plugin. Setting this flag to "false" will send values as floats to
## preserve the full precision when averaging takes place.
# use_int_samples = true
## Custom attributes from vCenter can be very useful for queries in order to slice the
## metrics along different dimension and for forming ad-hoc relationships. They are disabled
## by default, since they can add a considerable amount of tags to the resulting metrics. To
## enable, simply set custom_attribute_exclude to [] (empty set) and use custom_attribute_include
## to select the attributes you want to include.
# custom_attribute_include = []
# custom_attribute_exclude = ["*"]
## The number of vSphere 5 minute metric collection cycles to look back for non-realtime metrics. In
## some versions (6.7, 7.0 and possibly more), certain metrics, such as cluster metrics, may be reported
## with a significant delay (>30min). If this happens, try increasing this number. Please note that increasing
## it too much may cause performance issues.
# metric_lookback = 3
## Optional SSL Config
# ssl_ca = "/path/to/cafile"
# ssl_cert = "/path/to/certfile"
# ssl_key = "/path/to/keyfile"
## Use SSL but skip chain & host verification
# insecure_skip_verify = false
## The Historical Interval value must match EXACTLY the interval in the daily
## "Interval Duration" found on the VCenter server under Configure > General > Statistics > Statistic intervals
# historical_interval = "5m"
`
// SampleConfig hands back the plugin's TOML configuration template, meant to
// serve as a starting point when configuring Telegraf.
func (*VSphere) SampleConfig() string {
	return sampleConfig
}
// Description gives a one-line, human-readable summary of what the plugin does.
func (*VSphere) Description() string {
	const summary = "Read metrics from VMware vCenter"
	return summary
}
// Start is called from telegraf core when a plugin is started and allows it to
// perform initialization tasks.
//
// It creates a cancellable context (released by Stop) and one Endpoint per
// configured vCenter URL. If a URL cannot be parsed or an endpoint cannot be
// created, the context is cancelled before the error is returned so that
// anything already started with it is told to shut down. v.endpoints then
// holds only the endpoints that were successfully created (never nil entries).
func (v *VSphere) Start(_ telegraf.Accumulator) error {
	v.Log.Info("Starting plugin")
	ctx, cancel := context.WithCancel(context.Background())
	v.cancel = cancel

	// Check for deprecated settings
	if !v.ForceDiscoverOnInit {
		v.Log.Warn("The 'force_discover_on_init' configuration parameter has been deprecated. Setting it to 'false' has no effect")
	}

	// Create endpoints, one for each vCenter we're monitoring. The slice is
	// grown with append rather than pre-sized so that an early return never
	// leaves nil placeholders behind for Stop to trip over.
	v.endpoints = make([]*Endpoint, 0, len(v.Vcenters))
	for _, rawURL := range v.Vcenters {
		u, err := soap.ParseURL(rawURL)
		if err != nil {
			cancel() // do not leak the context on a failed start
			return err
		}
		ep, err := NewEndpoint(ctx, v, u, v.Log)
		if err != nil {
			cancel() // signal already-created endpoints to stop
			return err
		}
		v.endpoints = append(v.endpoints, ep)
	}
	return nil
}
// Stop is called from telegraf core when a plugin is stopped and allows it to
// perform shutdown tasks.
//
// It cancels the context created by Start and closes every endpoint. It is
// safe to call even if Start never ran or failed part-way: a missing cancel
// function and nil endpoint entries are skipped instead of causing a panic.
func (v *VSphere) Stop() {
	v.Log.Info("Stopping plugin")
	if v.cancel != nil {
		v.cancel()
	}

	// Wait for all endpoints to finish. No need to wait for
	// Gather() to finish here, since Stop() will only be called
	// after the last Gather() has finished. We do, however, need to
	// wait for any discovery to complete by trying to grab the
	// "busy" mutex.
	for _, ep := range v.endpoints {
		if ep == nil {
			// Left behind by a Start that returned early.
			continue
		}
		v.Log.Debugf("Waiting for endpoint %q to finish", ep.URL.Host)
		// The closure scopes the deferred Unlock to this single endpoint.
		func() {
			ep.busy.Lock() // Wait until discovery is finished
			defer ep.busy.Unlock()
			ep.Close()
		}()
	}
}
// Gather is Telegraf's entry point for a collection cycle. Every endpoint is
// collected in its own goroutine and the call returns once all of them are
// done. Collection errors are reported through the Accumulator rather than the
// return value, which is always nil; a context.Canceled result is not reported.
func (v *VSphere) Gather(acc telegraf.Accumulator) error {
	var pending sync.WaitGroup
	pending.Add(len(v.endpoints))

	for i := range v.endpoints {
		go func(ep *Endpoint) {
			defer pending.Done()

			switch err := ep.Collect(context.Background(), acc); err {
			case nil, context.Canceled:
				// Success, or merely canceled: nothing to signal.
			default:
				acc.AddError(err)
			}
		}(v.endpoints[i])
	}

	pending.Wait()
	return nil
}
func init() {
inputs.Add("vsphere", func() telegraf.Input {
return &VSphere{
Vcenters: []string{},
DatacenterInstances: false,
DatacenterMetricInclude: nil,
DatacenterMetricExclude: nil,
DatacenterInclude: []string{"/*"},
ClusterInstances: false,
ClusterMetricInclude: nil,
ClusterMetricExclude: nil,
ClusterInclude: []string{"/*/host/**"},
HostInstances: true,
HostMetricInclude: nil,
HostMetricExclude: nil,
HostInclude: []string{"/*/host/**"},
VMInstances: true,
VMMetricInclude: nil,
VMMetricExclude: nil,
VMInclude: []string{"/*/vm/**"},
DatastoreInstances: false,
DatastoreMetricInclude: nil,
DatastoreMetricExclude: nil,
DatastoreInclude: []string{"/*/datastore/**"},
Separator: "_",
CustomAttributeInclude: []string{},
CustomAttributeExclude: []string{"*"},
UseIntSamples: true,
IPAddresses: []string{},
MaxQueryObjects: 256,
MaxQueryMetrics: 256,
CollectConcurrency: 1,
DiscoverConcurrency: 1,
MetricLookback: 3,
ForceDiscoverOnInit: true,
ObjectDiscoveryInterval: config.Duration(time.Second * 300),
Timeout: config.Duration(time.Second * 60),
HistoricalInterval: config.Duration(time.Second * 300),
}
})
}