-
Notifications
You must be signed in to change notification settings - Fork 11
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Ctrl-C when master process is waiting for crashed workers #29
Comments
Somehow related question: would it be possible to modify |
Can you post your code that results in this behavior? |
It was not as simple as I described: in very simple examples Julia behaves normally. Here is a more involved example that reproduces the problem:
addprocs(1)
@everywhere module TestTake
# channel carrying the candidate messages passed between the workers and the master
# (the channel element type is Float64)
typealias WorkerChannel Channel{Float64}
# remote reference to a WorkerChannel
typealias WorkerChannelRef RemoteRef{WorkerChannel}
# Master-side state of the parallel optimizer: the worker process IDs together
# with the remote references used to communicate with those workers.
type ParallelPopulationOptimizer
    # IDs of worker processes
    worker_procs::Vector{Int}

    # references to the `@spawnat ID run_worker()` calls, one per worker
    final_fitnesses::Vector{RemoteRef{Channel{Any}}}

    # inbound channel of candidates from all workers
    from_workers::WorkerChannelRef

    # outgoing channels, one to each worker
    to_workers::Vector{WorkerChannelRef}

    # start flag: put by the master on the first `step!`, fetched by each
    # worker before it begins its work
    is_started::RemoteRef{Channel{Bool}}
end
# Number of worker processes managed by `ppopt`.
function nworkers(ppopt::ParallelPopulationOptimizer)
    return length(ppopt.worker_procs)
end
# Outer constructor of the parallel population optimizer. It selects the worker
# processes and allocates the channels used to talk to them; the worker tasks
# themselves are spawned later by `setup!`.
#
# Arguments:
#   NWorkers                   -- maximum number of worker processes to use
#   ArchiveCapacity            -- not used by this constructor; kept so that
#                                 existing positional calls stay valid
#   ToWorkerChannelCapacity    -- capacity of each master-to-worker channel
#   FromWorkersChannelCapacity -- capacity of the shared workers-to-master channel
function ParallelPopulationOptimizer(NWorkers::Int = 2,
                                     ArchiveCapacity::Int = 10,
                                     ToWorkerChannelCapacity::Int = 1000,
                                     FromWorkersChannelCapacity::Int = 10000)
    # take the first NWorkers workers, or all of them when fewer are available
    # (previously NWorkers was ignored and every available worker was used)
    available = workers()
    Workers = available[1:min(NWorkers, length(available))]
    ParallelPopulationOptimizer(
        Workers,
        # slots are filled in by `setup!` when the workers are spawned
        Vector{RemoteRef{Channel{Any}}}(length(Workers)),
        RemoteRef(() -> WorkerChannel(FromWorkersChannelCapacity)),
        # each outgoing channel is created on the process of its worker
        WorkerChannelRef[RemoteRef(() -> WorkerChannel(ToWorkerChannelCapacity), id)
                         for id in Workers],
        RemoteRef(() -> Channel{Bool}(1)))
end
# Spawn `run_worker` on every worker process of `ppopt` and block until each
# worker has reported its ID through a temporary "ready" channel.
# Returns `ppopt`.
function setup!(ppopt::ParallelPopulationOptimizer)
    info("Initializing parallel workers...")
    # FIXME do we need to wait for the worker?
    workers_ready = RemoteRef(() -> Channel{Int}(nworkers(ppopt)))
    # the start flag must not have been raised before the workers exist
    @assert !isready(ppopt.is_started)

    for (i, procid) in enumerate(ppopt.worker_procs)
        info(" Spawning worker #$i at process #$procid...")
        ppopt.final_fitnesses[i] =
            @spawnat procid run_worker(i, workers_ready, ppopt.is_started)
    end

    info("Waiting for the workers to be ready...")
    # one take! per worker: each worker puts its ID exactly once
    for k in 1:nworkers(ppopt)
        worker_id = take!(workers_ready)
        info(" Worker #$worker_id is ready")
    end
    info("All workers ready")

    return ppopt
end
# Perform one step on the master: raise the `is_started` flag if it has not
# been raised yet (i.e. on the first iteration), then block until a candidate
# arrives on the inbound channel. The received candidate is not used further.
function step!(ppopt::ParallelPopulationOptimizer)
    # only the first iteration finds the flag unset
    isready(ppopt.is_started) || put!(ppopt.is_started, true)

    info("take!...")
    candidate = take!(ppopt.from_workers)
    info("take!")
end
# Worker entry point: report readiness to the master, wait for the start flag,
# then fail with an error (the failure is deliberate in this example).
#
#   id           -- index of this worker, sent back through `worker_ready`
#   worker_ready -- channel on which the worker announces that it is running
#   is_started   -- start flag the worker waits on before doing anything else
function run_worker(id::Int,
                    worker_ready::RemoteRef{Channel{Int}},
                    is_started::RemoteRef{Channel{Bool}})
    # announce that this worker is up
    put!(worker_ready, id)
    # wait until the master is started
    fetch(is_started)

    try
        error("just error")
    catch err
        # propagate the failure to whoever fetches this worker's result
        rethrow(err)
    end
end
end
# build the optimizer state and its communication channels on the master
ppopt = TestTake.ParallelPopulationOptimizer()
# spawn the workers and wait until every one of them has reported ready
TestTake.setup!(ppopt)
# first step: raises the start flag (the workers then throw) and blocks in take!()
TestTake.step!(ppopt) |
When the master Julia process is waiting in `take!()` on its own `RemoteRef` and the worker processes have all thrown exceptions, pressing Ctrl-C in the REPL results in [output omitted] but doesn't bring the master out of the waiting state.
The text was updated successfully, but these errors were encountered: