Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add long-lived stress test #213

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion .github/workflows/stress.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,15 @@ jobs:
matrix:
python-version: [3.7]
go-version: [1.18]
suite: ["network-forming", "commissioning", "connectivity", "network-latency", "multicast-performance", "otns-performance", "network-limits"]
suite:
- "network-forming"
- "commissioning"
- "connectivity"
- "network-latency"
- "multicast-performance"
- "otns-performance"
- "network-limits"
- "long-duration"
runs-on: ubuntu-20.04
env:
HOMEBREW_NO_AUTO_UPDATE: 1
Expand Down
7 changes: 4 additions & 3 deletions dispatcher/Node.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,15 +187,16 @@ func (node *Node) onPingReply(timestamp uint64, dstaddr string, datasize int, ho
// if datasize < 4, timestamp is 0, these ping replies are ignored
return
}
const maxPingDelayUs uint64 = 10 * 1000000

pingTimeout := node.D.cfg.PingTimeout
var leftPingRequests []*pingRequest
for _, req := range node.pendingPings {
if req.Timestamp == timestamp && req.Dst == dstaddr {
// ping replied
node.addPingResult(req.Dst, req.DataSize, node.D.CurTime-req.Timestamp)
} else if req.Timestamp+maxPingDelayUs < node.D.CurTime {
} else if req.Timestamp+pingTimeout < node.D.CurTime {
// ping timeout
node.addPingResult(req.Dst, req.DataSize, maxPingDelayUs)
node.addPingResult(req.Dst, req.DataSize, pingTimeout)
} else {
leftPingRequests = append(leftPingRequests, req)
}
Expand Down
2 changes: 2 additions & 0 deletions dispatcher/dispatcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ type Config struct {
Port int
DumpPackets bool
NoPcap bool
PingTimeout uint64
}

func DefaultConfig() *Config {
Expand All @@ -81,6 +82,7 @@ func DefaultConfig() *Config {
Host: "localhost",
Port: threadconst.InitialDispatcherPort,
DumpPackets: false,
PingTimeout: 10 * 1000000,
}
}

Expand Down
3 changes: 3 additions & 0 deletions otns_main/otns_main.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ type MainArgs struct {
DumpPackets bool
NoPcap bool
NoReplay bool
PingTimeout float64
}

var (
Expand All @@ -101,6 +102,7 @@ func parseArgs() {
flag.BoolVar(&args.DumpPackets, "dump-packets", false, "dump packets")
flag.BoolVar(&args.NoPcap, "no-pcap", false, "do not generate Pcap")
flag.BoolVar(&args.NoReplay, "no-replay", false, "do not generate Replay")
flag.Float64Var(&args.PingTimeout, "ping-timeout", 10, "set ping timeout")

flag.Parse()
}
Expand Down Expand Up @@ -254,6 +256,7 @@ func createSimulation(ctx *progctx.ProgCtx) *simulation.Simulation {

dispatcherCfg := dispatcher.DefaultConfig()
dispatcherCfg.NoPcap = args.NoPcap
dispatcherCfg.PingTimeout = uint64(args.PingTimeout * 1000000) // Dispatcher uses microseconds

sim, err := simulation.NewSimulation(ctx, simcfg, dispatcherCfg)
simplelogger.FatalIfError(err)
Expand Down
7 changes: 7 additions & 0 deletions pylibs/otns/cli/OTNS.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,13 @@ def get_poll_period(self, nodeid: int) -> float:
ms = self._expect_int(self.node_cmd(nodeid, 'pollperiod'))
return ms / 1000.0

def set_child_timeout(self, nodeid: int, timeout: int) -> None:
    """
    Set the child timeout of a node.

    :param nodeid: ID of the node to configure
    :param timeout: timeout value, forwarded unchanged to the node's
        `childtimeout` command (presumably seconds, as in the OpenThread CLI - confirm)
    """
    command = f'childtimeout {timeout}'
    self.node_cmd(nodeid, command)

def get_child_timeout(self, nodeid: int) -> int:
    """
    Get the child timeout of a node.

    :param nodeid: ID of the node to query
    :return: the integer reported by the node's `childtimeout` command
        (presumably seconds, as in the OpenThread CLI - confirm)
    """
    return self._expect_int(self.node_cmd(nodeid, 'childtimeout'))

@staticmethod
def _detect_otns_path() -> str:
env_otns_path = os.getenv('OTNS')
Expand Down
7 changes: 5 additions & 2 deletions pylibs/stress_tests/BaseStressTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@
import time
import traceback
from functools import wraps
from typing import Collection

from otns.cli import OTNS
from otns.cli.errors import UnexpectedError
from typing import Collection

from StressTestResult import StressTestResult
from errors import UnexpectedNodeAddr
Expand Down Expand Up @@ -66,11 +67,13 @@ def run_wrapper(self: 'BaseStressTest', report=True):


class BaseStressTest(object, metaclass=StressTestMetaclass):
def __init__(self, name, headers, raw=False):
def __init__(self, name, headers, raw=False, ping_timeout: float = None):
self.name = name
self._otns_args = []
if raw:
self._otns_args.append('-raw')
if ping_timeout is not None:
self._otns_args += ['-ping-timeout', str(ping_timeout)]
self.ns = OTNS(otns_args=self._otns_args)
self.ns.speed = float('inf')
self.ns.web()
Expand Down
126 changes: 126 additions & 0 deletions pylibs/stress_tests/long_duration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/usr/bin/env python3
#
# Copyright (c) 2022, The OTNS Authors.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# OTNS Long Duration Stress test:
# Simulate 4 nodes for a long duration (10 days per STRESS_LEVEL unit, e.g. 100 days at STRESS_LEVEL=10).
# OpenThread uses MilliTimers with `uint32_t` as the underlying value representation.
# These timers wrap in about 50 days, so STRESS_LEVEL must be greater than 5 to simulate past the wrap.
# This test tries to make sure OpenThread is functioning properly after a long duration.
# Topology:
# Router x2, MED x1, SED x1
# Fault Injections:
# 10% packet loss ratio
# Pass Criteria:
# All nodes are pinging successfully after running for a long duration.
#
import os
import random
import time

from BaseStressTest import BaseStressTest

# Radio range given to every node, and the upper bounds of the area in which
# nodes are randomly placed and moved.
RADIO_RANGE = 200
XMAX = 300
YMAX = 300

# Packet loss ratio applied to the simulation (fault injection).
PACKET_LOSS_RATIO = 0.1
# Total simulated time in seconds: 10 days (86400 s each) per STRESS_LEVEL unit.
TOTAL_SIMULATION_TIME = 10 * 86400 * int(os.getenv("STRESS_LEVEL", "1"))
# Simulated seconds advanced between two rounds of node relocation.
MOVE_INTERVAL = 3600
# Interval between pings and size of each ping payload.
PING_INTERVAL = 300
PING_DATA_SIZE = 64

# Ping timeout handed to OTNS via `-ping-timeout` (seconds); a ping counts as
# failed once a full ping interval has elapsed without a reply.
PING_TIMEOUT = PING_INTERVAL

# STRESS_LEVEL comes from the environment, so validate with an explicit raise
# rather than `assert`, which is stripped when Python runs with -O.
# The 65535 cap is presumably the ping command's 16-bit count limit - confirm.
if TOTAL_SIMULATION_TIME // PING_INTERVAL > 65535:
    raise ValueError("too many ping count")


class LongDurationStressTest(BaseStressTest):
    """Long-running stress test with two routers, one MED and one SED.

    Both end devices ping the first router at a fixed interval while every
    node is moved to a new random position once per MOVE_INTERVAL.  The test
    fails when the simulation speed-up is below 3000 or when an end device
    has no successful ping recorded within the last 86400 simulated seconds.
    """
    SUITE = 'long-duration'

    def __init__(self):
        """Start the base test with this test's ping timeout and reset bookkeeping."""
        super(LongDurationStressTest, self).__init__(
            "Long-Duration stress test",
            ['Simulation Time', 'Execution Time', 'Speed Up'],
            ping_timeout=PING_TIMEOUT,
        )
        # Simulated seconds elapsed so far.
        self._cur_time = 0
        # Node ID -> value of `_cur_time` when a successful ping was last collected.
        self._last_ping_succ_time = {}

    def rand_pos(self):
        """Return a random (x, y) position with 0 <= x <= XMAX and 0 <= y <= YMAX."""
        x = random.randint(0, XMAX)
        y = random.randint(0, YMAX)
        return x, y

    def _add_node(self, node_type):
        """Add a node of `node_type` at a random position and return what `ns.add` returns."""
        return self.ns.add(node_type, *self.rand_pos(), radio_range=RADIO_RANGE)

    def run(self):
        """Build the topology, run the whole simulation and record the result."""
        ns = self.ns
        ns.packet_loss_ratio = PACKET_LOSS_RATIO

        router1 = self._add_node("router")
        router1_addr = self.expect_node_mleid(router1, 10)

        router2 = self._add_node("router")

        med = self._add_node("med")
        ns.set_child_timeout(med, PING_INTERVAL * 3)

        sed = self._add_node("sed")
        ns.set_poll_period(sed, 60)
        ns.set_child_timeout(sed, PING_INTERVAL * 3)

        # Schedule all pings up front: one per PING_INTERVAL for the whole run.
        ping_count = TOTAL_SIMULATION_TIME // PING_INTERVAL
        for end_device in (med, sed):
            self._last_ping_succ_time[end_device] = 0
            ns.ping(end_device, router1_addr,
                    datasize=PING_DATA_SIZE,
                    count=ping_count,
                    interval=PING_INTERVAL)

        all_nodes = (router1, router2, med, sed)
        rounds = TOTAL_SIMULATION_TIME // MOVE_INTERVAL
        started_at = time.time()

        for _ in range(rounds):
            self.ns.go(MOVE_INTERVAL)
            self._cur_time += MOVE_INTERVAL

            self._collect_pings()

            for nodeid in all_nodes:
                self.ns.move(nodeid, *self.rand_pos())

        elapsed = time.time() - started_at
        speed_up = TOTAL_SIMULATION_TIME / elapsed

        self.result.append_row(f'{TOTAL_SIMULATION_TIME}s',
                               f'{int(elapsed)}s',
                               str(int(speed_up)))
        self.result.fail_if(speed_up < 3000, "Speed Up < 3000")

        # An end device must have a successful ping recorded within the last simulated day.
        connectivity_checks = (
            (med, "MED not connected for a long time"),
            (sed, "SED not connected for a long time"),
        )
        for nodeid, message in connectivity_checks:
            self.result.fail_if(self._last_ping_succ_time[nodeid] < self._cur_time - 86400,
                                message)

    def _collect_pings(self):
        """Stamp the current simulated time on every source node with a non-failed ping."""
        # NOTE(review): `delay` is compared with PING_TIMEOUT (seconds) as-is;
        # confirm that `ns.pings()` reports delays in the same unit.
        for src, _dst, _size, delay in self.ns.pings():
            if delay >= PING_TIMEOUT:
                # ignore failed pings
                continue

            self._last_ping_succ_time[src] = self._cur_time


# Run the stress test when this file is executed as a script.
if __name__ == '__main__':
    stress_test = LongDurationStressTest()
    stress_test.run()
1 change: 1 addition & 0 deletions script/test
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ build_openthread()
"-DOT_SERVICE=ON"
"-DOT_COAP=ON"
"-DOT_THREAD_VERSION=${THREAD_VERSION:-1.2}"
"-DOT_UPTIME=ON"
)

local COVERAGE=${COVERAGE:-0}
Expand Down