From bbfca91e27f141abec23633a98a08c8dc0258b7b Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Sun, 20 Feb 2022 00:28:09 +0800 Subject: [PATCH] add long-lived stress test --- .github/workflows/stress.yml | 10 +- dispatcher/Node.go | 7 +- dispatcher/dispatcher.go | 2 + otns_main/otns_main.go | 3 + pylibs/otns/cli/OTNS.py | 7 ++ pylibs/stress_tests/BaseStressTest.py | 7 +- pylibs/stress_tests/long_duration.py | 126 ++++++++++++++++++++++++++ script/test | 1 + 8 files changed, 157 insertions(+), 6 deletions(-) create mode 100644 pylibs/stress_tests/long_duration.py diff --git a/.github/workflows/stress.yml b/.github/workflows/stress.yml index e8e6f7d1..ce4fd390 100644 --- a/.github/workflows/stress.yml +++ b/.github/workflows/stress.yml @@ -47,7 +47,15 @@ jobs: matrix: python-version: [3.7] go-version: [1.18] - suite: ["network-forming", "commissioning", "connectivity", "network-latency", "multicast-performance", "otns-performance", "network-limits"] + suite: + - "network-forming" + - "commissioning" + - "connectivity" + - "network-latency" + - "multicast-performance" + - "otns-performance" + - "network-limits" + - "long-duration" runs-on: ubuntu-20.04 env: HOMEBREW_NO_AUTO_UPDATE: 1 diff --git a/dispatcher/Node.go b/dispatcher/Node.go index a30941b2..c084c655 100644 --- a/dispatcher/Node.go +++ b/dispatcher/Node.go @@ -187,15 +187,16 @@ func (node *Node) onPingReply(timestamp uint64, dstaddr string, datasize int, ho // if datasize < 4, timestamp is 0, these ping replies are ignored return } - const maxPingDelayUs uint64 = 10 * 1000000 + + pingTimeout := node.D.cfg.PingTimeout var leftPingRequests []*pingRequest for _, req := range node.pendingPings { if req.Timestamp == timestamp && req.Dst == dstaddr { // ping replied node.addPingResult(req.Dst, req.DataSize, node.D.CurTime-req.Timestamp) - } else if req.Timestamp+maxPingDelayUs < node.D.CurTime { + } else if req.Timestamp+pingTimeout < node.D.CurTime { // ping timeout - node.addPingResult(req.Dst, req.DataSize, 
maxPingDelayUs) + node.addPingResult(req.Dst, req.DataSize, pingTimeout) } else { leftPingRequests = append(leftPingRequests, req) } diff --git a/dispatcher/dispatcher.go b/dispatcher/dispatcher.go index 7ad75145..dd1722ca 100644 --- a/dispatcher/dispatcher.go +++ b/dispatcher/dispatcher.go @@ -72,6 +72,7 @@ type Config struct { Port int DumpPackets bool NoPcap bool + PingTimeout uint64 } func DefaultConfig() *Config { @@ -81,6 +82,7 @@ func DefaultConfig() *Config { Host: "localhost", Port: threadconst.InitialDispatcherPort, DumpPackets: false, + PingTimeout: 10 * 1000000, } } diff --git a/otns_main/otns_main.go b/otns_main/otns_main.go index 1991440f..b731851a 100644 --- a/otns_main/otns_main.go +++ b/otns_main/otns_main.go @@ -77,6 +77,7 @@ type MainArgs struct { DumpPackets bool NoPcap bool NoReplay bool + PingTimeout float64 } var ( @@ -101,6 +102,7 @@ func parseArgs() { flag.BoolVar(&args.DumpPackets, "dump-packets", false, "dump packets") flag.BoolVar(&args.NoPcap, "no-pcap", false, "do not generate Pcap") flag.BoolVar(&args.NoReplay, "no-replay", false, "do not generate Replay") + flag.Float64Var(&args.PingTimeout, "ping-timeout", 10, "set ping timeout") flag.Parse() } @@ -254,6 +256,7 @@ func createSimulation(ctx *progctx.ProgCtx) *simulation.Simulation { dispatcherCfg := dispatcher.DefaultConfig() dispatcherCfg.NoPcap = args.NoPcap + dispatcherCfg.PingTimeout = uint64(args.PingTimeout * 1000000) // Dispatcher uses microseconds sim, err := simulation.NewSimulation(ctx, simcfg, dispatcherCfg) simplelogger.FatalIfError(err) diff --git a/pylibs/otns/cli/OTNS.py b/pylibs/otns/cli/OTNS.py index bd5c1b97..b833c671 100644 --- a/pylibs/otns/cli/OTNS.py +++ b/pylibs/otns/cli/OTNS.py @@ -131,6 +131,13 @@ def get_poll_period(self, nodeid: int) -> float: ms = self._expect_int(self.node_cmd(nodeid, 'pollperiod')) return ms / 1000.0 + def set_child_timeout(self, nodeid: int, timeout:int) -> None: + self.node_cmd(nodeid, f'childtimeout {timeout}') + + def 
get_child_timeout(self, nodeid: int) -> int: + timeout = self._expect_int(self.node_cmd(nodeid, 'childtimeout')) + return timeout + @staticmethod def _detect_otns_path() -> str: env_otns_path = os.getenv('OTNS') diff --git a/pylibs/stress_tests/BaseStressTest.py b/pylibs/stress_tests/BaseStressTest.py index af13580b..a244b16b 100644 --- a/pylibs/stress_tests/BaseStressTest.py +++ b/pylibs/stress_tests/BaseStressTest.py @@ -32,9 +32,10 @@ import time import traceback from functools import wraps +from typing import Collection + from otns.cli import OTNS from otns.cli.errors import UnexpectedError -from typing import Collection from StressTestResult import StressTestResult from errors import UnexpectedNodeAddr @@ -66,11 +67,13 @@ def run_wrapper(self: 'BaseStressTest', report=True): class BaseStressTest(object, metaclass=StressTestMetaclass): - def __init__(self, name, headers, raw=False): + def __init__(self, name, headers, raw=False, ping_timeout: float = None): self.name = name self._otns_args = [] if raw: self._otns_args.append('-raw') + if ping_timeout is not None: + self._otns_args += ['-ping-timeout', str(ping_timeout)] self.ns = OTNS(otns_args=self._otns_args) self.ns.speed = float('inf') self.ns.web() diff --git a/pylibs/stress_tests/long_duration.py b/pylibs/stress_tests/long_duration.py new file mode 100644 index 00000000..3255c3b4 --- /dev/null +++ b/pylibs/stress_tests/long_duration.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2022, The OTNS Authors. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. 
Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# OTNS Long Duration Stress test: +# Simulate 4 nodes for a long duration (10 days x STRESS_LEVEL, e.g. 100 days at STRESS_LEVEL=10). +# OpenThread uses MilliTimers with `uint32_t` as the underlying value representation. +# These timers wrap after about 50 days. +# This test tries to make sure OpenThread is still functioning properly after a long duration. +# Topology: +# Router x2, MED x1, SED x1 +# Fault Injections: +# 10% packet loss ratio +# Pass Criteria: +# The MED and the SED have each pinged the first router successfully within the last simulated day, and the speed-up is at least 3000. 
+# +import os +import random +import time + +from BaseStressTest import BaseStressTest + +RADIO_RANGE = 200 +XMAX = 300 +YMAX = 300 + +PACKET_LOSS_RATIO = 0.1 +TOTAL_SIMULATION_TIME = 10 * 86400 * int(os.getenv("STRESS_LEVEL", "1")) +MOVE_INTERVAL = 3600 +PING_INTERVAL = 300 +PING_DATA_SIZE = 64 + +PING_TIMEOUT = PING_INTERVAL + +assert TOTAL_SIMULATION_TIME // PING_INTERVAL <= 65535, "too many ping count" + + +class LongDurationStressTest(BaseStressTest): + SUITE = 'long-duration' + + def __init__(self): + super(LongDurationStressTest, self).__init__("Long-Duration stress test", + ['Simulation Time', 'Execution Time', 'Speed Up'], + ping_timeout=PING_TIMEOUT) + self._cur_time = 0 + self._last_ping_succ_time = {} + + def rand_pos(self): + return random.randint(0, XMAX), random.randint(0, YMAX) + + def run(self): + ns = self.ns + ns.packet_loss_ratio = PACKET_LOSS_RATIO + + router1 = ns.add("router", *self.rand_pos(), radio_range=RADIO_RANGE) + router1_addr = self.expect_node_mleid(router1, 10) + + router2 = ns.add("router", *self.rand_pos(), radio_range=RADIO_RANGE) + med = ns.add("med", *self.rand_pos(), radio_range=RADIO_RANGE) + ns.set_child_timeout(med, PING_INTERVAL * 3) + + sed = ns.add("sed", *self.rand_pos(), radio_range=RADIO_RANGE) + ns.set_poll_period(sed, 60) + ns.set_child_timeout(sed, PING_INTERVAL * 3) + + for nodeid in (med, sed): + self._last_ping_succ_time[nodeid] = 0 + ns.ping(nodeid, router1_addr, datasize=PING_DATA_SIZE, count=TOTAL_SIMULATION_TIME // PING_INTERVAL, + interval=PING_INTERVAL) + + t0 = time.time() + + for _ in range(TOTAL_SIMULATION_TIME // MOVE_INTERVAL): + self.ns.go(MOVE_INTERVAL) + self._cur_time += MOVE_INTERVAL + + self._collect_pings() + + for nodeid in (router1, router2, med, sed): + self.ns.move(nodeid, *self.rand_pos()) + + duration = time.time() - t0 + + self.result.append_row('%ds' % TOTAL_SIMULATION_TIME, '%ds' % duration, + '%d' % (TOTAL_SIMULATION_TIME / duration)) + self.result.fail_if(TOTAL_SIMULATION_TIME / 
duration < 3000, "Speed Up < 3000") + self.result.fail_if(self._last_ping_succ_time[med] < self._cur_time - 86400, + "MED not connected for a long time") + self.result.fail_if(self._last_ping_succ_time[sed] < self._cur_time - 86400, + "SED not connected for a long time") + + def _collect_pings(self): + for srcid, dstaddr, _, delay in self.ns.pings(): + if delay >= PING_TIMEOUT: + # ignore failed pings + continue + + self._last_ping_succ_time[srcid] = self._cur_time + + +if __name__ == '__main__': + LongDurationStressTest().run() diff --git a/script/test b/script/test index cf56ecf6..bc7d07a6 100755 --- a/script/test +++ b/script/test @@ -99,6 +99,7 @@ build_openthread() "-DOT_SERVICE=ON" "-DOT_COAP=ON" "-DOT_THREAD_VERSION=${THREAD_VERSION:-1.2}" + "-DOT_UPTIME=ON" ) local COVERAGE=${COVERAGE:-0}