Skip to content

Commit

Permalink
partial syncScope livelock (#121)
Browse files Browse the repository at this point in the history
* Fix syncScope livelock #119

* Alternative fix by ensuring dispatch to idle workers

* stash - this seems hopeless
  • Loading branch information
mratsim authored May 4, 2020
1 parent 943d04a commit 6dbf0e5
Show file tree
Hide file tree
Showing 9 changed files with 72 additions and 93 deletions.
16 changes: 8 additions & 8 deletions benchmarks/matmul_gemm_blas/test_gemm_output.nim
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,14 @@ proc testVsReference*(M, N, K: int) =
0, result_blas[0].addr, N
)

block weave:
gemm_strided(
M, N, K,
1'f32, a[0].unsafeaddr, K, 1, # stride row, stride col
b[0].unsafeAddr, N, 1,
0'f32, result_weave[0].addr, N, 1
)
syncRoot(Weave)
# block weave:
# gemm_strided(
# M, N, K,
# 1'f32, a[0].unsafeaddr, K, 1, # stride row, stride col
# b[0].unsafeAddr, N, 1,
# 0'f32, result_weave[0].addr, N, 1
# )
# syncRoot(Weave)

block weave_nestable:
gemm_strided_nestable(
Expand Down
2 changes: 1 addition & 1 deletion weave/cross_thread_com/pledges.nim
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ proc delayedUntilIter(taskNode: TaskNode, curTask: Task): bool =
discard taskNode.pledge.p.union.iter.impls[taskNode.bucketID].deferredOut.fetchAdd(1, moRelaxed)
return true

proc delayedUntil*(taskNode: TaskNode, curTask: Task): bool =
proc delayedUntil(taskNode: TaskNode, curTask: Task): bool =
## Redelay a task that depends on multiple pledges (in the `taskNode` linked list)
## with 1 or more pledge fulfilled but still some unfulfilled.
##
Expand Down
9 changes: 2 additions & 7 deletions weave/scheduler.nim
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import
./contexts, ./config,
./victims,
./random/rng,
./state_machines/event_loop
./state_machines/[event_loop, dispatch_events]

# Local context
# ----------------------------------------------------------------------------------
Expand Down Expand Up @@ -189,11 +189,6 @@ proc schedule*(task: sink Task) =
log(">>> Worker %2d resumes execution after barrier <<<\n", myID())
localCtx.runtimeIsQuiescent = false

shareWork()

# Check if someone requested a steal
var req: StealRequest
while recv(req):
dispatchElseDecline(req)
dispatchToChildrenAndThieves()

profile_start(enq_deq_task)
6 changes: 6 additions & 0 deletions weave/state_machines/dispatch_events.nim
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,9 @@ proc declineAll*() =
decline(req)

profile_start(idle)

proc dispatchToChildrenAndThieves*() =
shareWork()
var req: StealRequest
while recv(req):
dispatchElseDecline(req)
36 changes: 14 additions & 22 deletions weave/state_machines/sync.nim
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@ import
../contexts, ../config,
../victims,
../thieves, ../workers,
./recv_task_else_steal, ./handle_thieves
./recv_task_else_steal, ./dispatch_events

# Await/sync future - Finite Automaton rewrite
# ----------------------------------------------------------------------------------

type AwaitState = enum
AW_CheckTask
AW_OutOfChildTasks
AW_OutOfDirectChildTasks
AW_Steal
AW_SuccessfulTheft

Expand Down Expand Up @@ -53,7 +53,7 @@ implEvent(awaitFSA, AWE_FutureReady):
behavior(awaitFSA):
# In AW_Steal we might recv tasks and steal requests which get stuck in our queues
# when we exit once the future is ready.
ini: [AW_CheckTask, AW_OutOfChildTasks, AW_Steal]
ini: [AW_CheckTask, AW_OutOfDirectChildTasks, AW_Steal]
interrupt: AWE_FutureReady
transition: discard
fin: AW_Exit
Expand All @@ -62,18 +62,7 @@ implEvent(awaitFSA, AWE_HasChildTask):
not task.isNil

onEntry(awaitFSA, AW_CheckTask):
task = myWorker().deque.popFirstIfChild(myTask())

when WV_StealEarly > 0:
if not task.isNil:
# If we have a big loop should we allow early thefts?
stealEarly()

shareWork()
# Check if someone requested to steal from us
# Send them extra tasks if we have them
# or split our popped task if possible
handleThieves(task)
task = nextTask(childTask = true)

behavior(awaitFSA):
ini: AW_CheckTask
Expand All @@ -94,15 +83,19 @@ behavior(awaitFSA):
# 2. Run out-of-task, become a thief and help other threads
# to reach children faster
debug: log("Worker %2d: forcefut 2 - becoming a thief\n", myID())
fin: AW_OutOfChildTasks
fin: AW_OutOfDirectChildTasks

# -------------------------------------------
# These states are interrupted when future is ready

behavior(awaitFSA):
ini: AW_OutOfChildTasks
ini: AW_OutOfDirectChildTasks
transition:
# Steal and hope to advance towards the child tasks in other workers' queues.
trySteal(isOutOfTasks = false)
# If someone wants our non-direct child tasks, let's oblige
# Note that we might have grandchildren tasks stuck in our own queue.
dispatchToChildrenAndThieves()
profile_start(idle)
fin: AW_Steal

Expand All @@ -125,10 +118,9 @@ behavior(awaitFSA):
# dispatchElseDecline so resteal
profile_stop(idle)
trySteal(isOutOfTasks = false)
# If someone wants our non-child tasks, let's oblige
var req: StealRequest
while recv(req):
dispatchElseDecline(req)
# If someone wants our non-direct child tasks, let's oblige
# Note that we might have grandchildren tasks stuck in our own queue.
dispatchToChildrenAndThieves()
profile_start(idle)

# -------------------------------------------
Expand Down Expand Up @@ -164,7 +156,7 @@ behavior(awaitFSA):
profile(enq_deq_task):
# The memory is reused but not zero-ed
localCtx.taskCache.add(task)
fin: AW_OutOfChildTasks
fin: AW_OutOfDirectChildTasks

# -------------------------------------------

Expand Down
15 changes: 2 additions & 13 deletions weave/state_machines/sync_root.nim
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import
../cross_thread_com/channels_spsc_single_ptr,
../memory/lookaside_lists,
../workers, ../thieves, ../victims,
./handle_thieves, ./recv_task_else_steal,
./recv_task_else_steal,
./dispatch_events

# Sync Root - Global runtime barrier
Expand Down Expand Up @@ -78,18 +78,7 @@ implEvent(syncRootFSA, SYE_SoleWorker):
# -------------------------------------------

onEntry(syncRootFSA, SY_CheckTask):
task = myWorker().deque.popFirst()

when WV_StealEarly > 0:
if not task.isNil:
# If we have a big loop should we allow early thefts?
stealEarly()

shareWork()
# Check if someone requested to steal from us
# Send them extra tasks if we have them
# or split our popped task if possible
handleThieves(task)
task = nextTask(childTask = false)

behavior(syncRootFSA):
ini: SY_CheckTask
Expand Down
24 changes: 12 additions & 12 deletions weave/state_machines/sync_scope.dot
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
digraph syncScopeFSA{
splines=ortho;
node [shape = doublecircle]; InitialState SB_Exit;
node [shape = circle, fontcolor=white, fillcolor=darkslategrey, style="filled"]; SB_CheckTask SB_Steal SB_SuccessfulTheft SB_OutOfChildTasks;
node [shape = circle, fontcolor=white, fillcolor=darkslategrey, style="filled"]; SB_OutOfTasks SB_CheckTask SB_Steal SB_SuccessfulTheft;
InitialState -> SB_CheckTask [color="black:invis:black", xlabel="entry point"];
node [shape = octagon, fontcolor=black, fillcolor=lightsteelblue, style="rounded,filled"]; SB_CheckTask_SBE_HasChildTask SB_Steal_SBE_ReceivedTask ;
node [shape = diamond, fontcolor=black, fillcolor=coral, style="rounded,filled"]; SB_CheckTask_SBE_NoDescendants SB_Steal_SBE_NoDescendants SB_OutOfChildTasks_SBE_NoDescendants ;
SB_CheckTask_SBE_HasChildTask [label="SBE_HasChildTask\nnot isNil(task)"];
node [shape = octagon, fontcolor=black, fillcolor=lightsteelblue, style="rounded,filled"]; SB_CheckTask_SBE_HasTask SB_Steal_SBE_ReceivedTask ;
node [shape = diamond, fontcolor=black, fillcolor=coral, style="rounded,filled"]; SB_OutOfTasks_SBE_NoDescendants SB_CheckTask_SBE_NoDescendants SB_Steal_SBE_NoDescendants ;
SB_CheckTask_SBE_HasTask [label="SBE_HasTask\nnot isNil(task)"];
SB_Steal_SBE_ReceivedTask [label="SBE_ReceivedTask\nlootedTask"];
SB_OutOfTasks_SBE_NoDescendants [label="SBE_NoDescendants\nhasDescendantTasks(scopedBarrier)"];
SB_CheckTask_SBE_NoDescendants [label="SBE_NoDescendants\nhasDescendantTasks(scopedBarrier)"];
SB_Steal_SBE_NoDescendants [label="SBE_NoDescendants\nhasDescendantTasks(scopedBarrier)"];
SB_OutOfChildTasks_SBE_NoDescendants [label="SBE_NoDescendants\nhasDescendantTasks(scopedBarrier)"];
SB_OutOfTasks -> SB_OutOfTasks_SBE_NoDescendants[style=bold, xlabel="always"];
SB_OutOfTasks_SBE_NoDescendants -> SB_Exit [color="coral", fontcolor="coral", xlabel="interrupted"];
SB_OutOfTasks_SBE_NoDescendants -> SB_Steal [xlabel="default"];
SB_CheckTask -> SB_CheckTask_SBE_NoDescendants[style=bold, xlabel="always"];
SB_CheckTask_SBE_NoDescendants -> SB_Exit [color="coral", fontcolor="coral", xlabel="interrupted"];
SB_CheckTask_SBE_NoDescendants -> SB_CheckTask_SBE_HasChildTask[xlabel="normal flow"];
SB_CheckTask_SBE_HasChildTask -> SB_CheckTask [style=dashed, xlabel="true"];
SB_CheckTask_SBE_HasChildTask -> SB_OutOfChildTasks [xlabel="default"];
SB_CheckTask_SBE_NoDescendants -> SB_CheckTask_SBE_HasTask[xlabel="normal flow"];
SB_CheckTask_SBE_HasTask -> SB_CheckTask [style=dashed, xlabel="true"];
SB_CheckTask_SBE_HasTask -> SB_OutOfTasks [xlabel="default"];
SB_Steal -> SB_Steal_SBE_NoDescendants[style=bold, xlabel="always"];
SB_Steal_SBE_NoDescendants -> SB_Exit [color="coral", fontcolor="coral", xlabel="interrupted"];
SB_Steal_SBE_NoDescendants -> SB_Steal_SBE_ReceivedTask[xlabel="normal flow"];
SB_Steal_SBE_ReceivedTask -> SB_SuccessfulTheft [style=dashed, xlabel="true"];
SB_Steal_SBE_ReceivedTask -> SB_Steal [xlabel="default"];
SB_SuccessfulTheft -> SB_OutOfChildTasks [xlabel="default"];
SB_OutOfChildTasks -> SB_OutOfChildTasks_SBE_NoDescendants[style=bold, xlabel="always"];
SB_OutOfChildTasks_SBE_NoDescendants -> SB_Exit [color="coral", fontcolor="coral", xlabel="interrupted"];
SB_OutOfChildTasks_SBE_NoDescendants -> SB_Steal [xlabel="default"];
SB_SuccessfulTheft -> SB_OutOfTasks [xlabel="default"];
}
57 changes: 27 additions & 30 deletions weave/state_machines/sync_scope.nim
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import
../contexts, ../config,
../victims,
../thieves, ../workers,
./recv_task_else_steal, ./handle_thieves
./recv_task_else_steal, ./dispatch_events

# Scoped Barrier - Finite state machine
# ----------------------------------------------------------------------------------
Expand All @@ -30,15 +30,28 @@ import
# The awaiting thread continues to help the runtime
# by sharing the workload.

# Note: we can focus on the child tasks of the current task
# by calling `myWorker().deque.popFirstIfChild(myTask())`
# instead of `myWorker().deque.popFirst(myTask())`
#
# but this seems to livelock the root thread from time to time
# i.e. the task in deque could be a grandchildren task
# and so is not cleared and then the root thread is stuck trying to steal
# from idle threads.
#
# Furthermore while `sync_scope` guarantees only waiting for tasks created in the scope
# and not emptying all tasks, doing so satistfies the "greedy" scheduler requirement
# to have the asymptotically optimal speedup. (i.e. as long as there are tasks, workers are progressing on them)

type ScopedBarrierState = enum
SB_CheckTask
SB_OutOfChildTasks
SB_OutOfTasks
SB_Steal
SB_SuccessfulTheft

type ScopedBarrierEvent = enum
SBE_NoDescendants
SBE_HasChildTask
SBE_HasTask
SBE_ReceivedTask

declareAutomaton(syncScopeFSA, ScopedBarrierState, ScopedBarrierEvent)
Expand All @@ -58,31 +71,20 @@ implEvent(syncScopeFSA, SBE_NoDescendants):
behavior(syncScopeFSA):
# In SB_Steal state we might recv tasks and steal requests which get stuck
# in our queues when we exit once we have no descendant left.
ini: [SB_CheckTask, SB_OutOfChildTasks, SB_Steal]
ini: [SB_CheckTask, SB_OutOfTasks, SB_Steal]
interrupt: SBE_NoDescendants
transition: discard
fin: SB_Exit

implEvent(syncScopeFSA, SBE_HasChildTask):
implEvent(syncScopeFSA, SBE_HasTask):
not task.isNil

onEntry(syncScopeFSA, SB_CheckTask):
task = myWorker().deque.popFirstIfChild(myTask())

when WV_StealEarly > 0:
if not task.isNil:
# If we have a big loop should we allow early theft?
stealEarly()

shareWork()
# Check if someone requested to steal from us
# send them extra tasks if we have them
# or split our popped task if possible
handleThieves(task)
task = nextTask(childTask = false)

behavior(syncScopeFSA):
ini: SB_CheckTask
event: SBE_HasChildTask
event: SBE_HasTask
transition:
profile(run_task):
execute(task)
Expand All @@ -96,20 +98,21 @@ behavior(syncScopeFSA):
# 2. Run out-of-task, become a thief and help other threads
# to reach their children faster
debug: log("Worker %2d: syncScope 2 - becoming a thief\n", myID())
fin: SB_OutOfChildTasks
fin: SB_OutOfTasks

# -------------------------------------------
# These states are interrupted when the scope has no more descendant

behavior(syncScopeFSA):
ini: SB_OutOfChildTasks
ini: SB_OutOfTasks
transition:
trySteal(isOutOfTasks = false)
# Steal and hope to advance towards the child tasks in other workers' queues.
trySteal(isOutOfTasks = false) # Don't sleep here or we might stall the runtime
profile_start(idle)
fin: SB_Steal

onEntry(syncScopeFSA, SB_Steal):
let lootedTask = recvElseSteal(task, isOutOfTasks = false)
let lootedTask = recvElseSteal(task, isOutOfTasks = false) # Don't sleep here or we might stall the runtime

implEvent(syncScopeFSA, SBE_ReceivedTask):
lootedTask
Expand All @@ -123,14 +126,8 @@ behavior(syncScopeFSA):
behavior(syncScopeFSA):
steady: SB_Steal
transition:
# We might inadvertently remove our own steal request in
# dispatchElseDecline so resteal
profile_stop(idle)
trySteal(isOutOfTasks = false)
# If someone wants our non-child tasks, let's oblige
var req: StealRequest
while recv(req):
dispatchElseDecline(req)
dispatchToChildrenAndThieves()
profile_start(idle)

# -------------------------------------------
Expand Down Expand Up @@ -166,7 +163,7 @@ behavior(syncScopeFSA):
profile(enq_deq_task):
# The memory is re-used but not zero-ed
localCtx.taskCache.add(task)
fin: SB_OutOfChildTasks
fin: SB_OutOfTasks

# -------------------------------------------

Expand Down
Binary file modified weave/state_machines/sync_scope.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 6dbf0e5

Please sign in to comment.