Skip to content

Commit

Permalink
Merge pull request #288 from lidavidm/robust-worker
Browse files Browse the repository at this point in the history
Explicitly notify user if compiled bot too large
  • Loading branch information
harikmenon authored Nov 14, 2017
2 parents 843f743 + 65c28f2 commit 2fae6a3
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 5 deletions.
3 changes: 2 additions & 1 deletion admin/setup_workers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@ set -e

source ./config.sh

# Max upload size is 100 MiB
gcloud compute --project "${GCLOUD_PROJECT}" \
instance-templates create "worker-instance-template" \
--machine-type "${MACHINE_TYPE}" \
--network "default" \
--metadata "^#&&#^halite-manager-url=${COORDINATOR_URL}#&&#halite-secret-folder=${SECRET_FOLDER}#&&#startup-script=$(cat setup_workers__startup_script.sh)" \
--metadata "^#&&#^halite-manager-url=${COORDINATOR_URL}#&&#halite-secret-folder=${SECRET_FOLDER}#&&#startup-script=$(cat setup_workers__startup_script.sh)#&&#halite-max-upload-size=104857600" \
--no-restart-on-failure \
--no-service-account --no-scopes \
--maintenance-policy "TERMINATE" \
Expand Down
1 change: 1 addition & 0 deletions apiserver/apiserver/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# Flask settings
# Max size of an upload, in bytes
MAX_BOT_UPLOAD_SIZE = 20 * 1024 * 1024
# Needs to match corresponding value in worker configuration
MAX_COMPILED_BOT_UPLOAD_SIZE = 100 * 1024 * 1024
# Secret key for Flask session cookies
FLASK_SECRET_KEY = ""
Expand Down
26 changes: 24 additions & 2 deletions apiserver/worker/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,25 @@
from hashlib import md5
import json
import os
from time import gmtime, strftime
from time import gmtime, strftime, sleep


# Default ceiling for a compiled bot archive, in bytes (100 MiB). Must be
# kept in step with the corresponding value in apiserver/config.py.
MAX_BOT_UPLOAD_SIZE = 100 * 1024 * 1024
# Upper bound, in seconds, on the pause between successive attempts to
# upload a compiled bot archive.
MAX_UPLOAD_BACKOFF = 32


# Worker settings are loaded once, at import time, from config.json in the
# current working directory.
with open("config.json") as configfile:
    config = json.load(configfile)

MANAGER_URL = config["MANAGER_URL"]
SECRET_FOLDER = config["SECRET_FOLDER"]
CAPABILITIES = config.get("CAPABILITIES", [])
# A missing, null, or zero size in config.json leaves the default in place.
provided_size = config.get("MAX_BOT_UPLOAD_SIZE", MAX_BOT_UPLOAD_SIZE)
MAX_BOT_UPLOAD_SIZE = provided_size if provided_size else MAX_BOT_UPLOAD_SIZE


def getTask():
Expand Down Expand Up @@ -84,22 +95,33 @@ def storeBotLocally(user_id, bot_id, storage_dir, is_compile=False):
def storeBotRemotely(user_id, bot_id, zip_file_path):
"""Posts a bot file to the manager"""
zip_contents = open(zip_file_path, "rb").read()
if len(zip_contents) > MAX_BOT_UPLOAD_SIZE:
raise RuntimeError("Bot archive exceeds maximum size of 100 MiB.")

iterations = 0
local_hash = md5(zip_contents).hexdigest()
backoff = 1

while iterations < 100:
while iterations < 10:
r = requests.post(MANAGER_URL+"botFile",
data={
"user_id": str(user_id),
"bot_id": str(bot_id),
},
files={"bot.zip": zip_contents})
print("Posting compiled bot archive %s\n" % r.text)
if r.status_code >= 400 and r.status_code <= 499:
print("Got a 4xx status code")
r.raise_for_status()

# Try again if local and remote hashes differ
if local_hash != getBotHash(user_id, bot_id):
print("Hashes do not match! Redoing file upload...\n")
iterations += 1
sleep(backoff)
if backoff < MAX_UPLOAD_BACKOFF:
backoff *= 2

continue

return
Expand Down
11 changes: 11 additions & 0 deletions apiserver/worker/grab_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
MANAGER_URL_METADATA_URL = "http://metadata.google.internal/computeMetadata/v1/instance/attributes/halite-manager-url"
SECRET_FOLDER_METADATA_URL = "http://metadata.google.internal/computeMetadata/v1/instance/attributes/halite-secret-folder"
GPU_CAPABILITY_METADATA_URL = "http://metadata.google.internal/computeMetadata/v1/instance/attributes/halite-gpu"
MAX_UPLOAD_SIZE_METADATA_URL = "http://metadata.google.internal/computeMetadata/v1/instance/attributes/halite-max-upload-size"

MANAGER_URL = requests.get(MANAGER_URL_METADATA_URL, headers={
"Metadata-Flavor": "Google"
Expand All @@ -18,10 +19,20 @@
HAS_GPU = requests.get(GPU_CAPABILITY_METADATA_URL, headers={
"Metadata-Flavor": "Google"
}).text == "true"
# Read the optional upload-size attribute from the instance metadata.
# This must query MAX_UPLOAD_SIZE_METADATA_URL: the GPU attribute never
# holds a byte count, so reading it here would always end up as None.
MAX_UPLOAD_SIZE = requests.get(MAX_UPLOAD_SIZE_METADATA_URL, headers={
    "Metadata-Flavor": "Google"
}).text

# The response body is text. If it does not parse as an integer (for
# example because the attribute is not set on this instance), record None
# instead of failing.
try:
    MAX_UPLOAD_SIZE = int(MAX_UPLOAD_SIZE)
except ValueError:
    MAX_UPLOAD_SIZE = None


# Persist the values gathered above to config.json in the current working
# directory. The "gpu" capability is listed only when HAS_GPU is true, and
# MAX_BOT_UPLOAD_SIZE is written as null when no valid size was obtained.
with open("config.json", "w") as configfile:
    configfile.write(json.dumps({
        "MANAGER_URL": MANAGER_URL,
        "SECRET_FOLDER": SECRET_FOLDER,
        "CAPABILITIES": ["gpu"] if HAS_GPU else [],
        "MAX_BOT_UPLOAD_SIZE": MAX_UPLOAD_SIZE,
    }))
19 changes: 17 additions & 2 deletions apiserver/worker/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,14 @@
"""


# Prefix for the error text reported when a compiled bot's archive cannot be
# uploaded; the formatted traceback is appended directly after it.
# NOTE(review): "[email protected]" looks like an e-mail address that was
# obfuscated by the page this text was captured from — confirm the real
# contact address against the repository before relying on this string.
UPLOAD_ERROR_MESSAGE = """
We had some trouble uploading your bot. If you cannot figure out why
this happened, please email us at [email protected]. We can help.
For our reference, here is the trace of the error:
"""


def makePath(path):
"""Deletes anything residing at path, creates path, and chmods the directory"""
if os.path.exists(path):
Expand Down Expand Up @@ -138,15 +146,22 @@ def executeCompileTask(user_id, bot_id, backend):
try:
if didCompile:
logging.debug("Bot did compile\n")
archive.zipFolder(temp_dir, os.path.join(temp_dir, str(user_id)+".zip"))
backend.storeBotRemotely(user_id, bot_id, os.path.join(temp_dir, str(user_id)+".zip"))
archive_path = os.path.join(temp_dir, str(user_id)+".zip")
archive.zipFolder(temp_dir, archive_path)
backend.storeBotRemotely(user_id, bot_id, archive_path)
else:
logging.debug("Bot did not compile\n")
logging.debug("Bot errors %s\n" % str(errors))


backend.compileResult(user_id, bot_id, didCompile, language,
errors=(None if didCompile else "\n".join(errors)))
except:
logging.debug("Bot did not upload\n")
traceback.print_exc()
errors.append(UPLOAD_ERROR_MESSAGE + traceback.format_exc())
backend.compileResult(user_id, bot_id, False, language,
errors="\n".join(errors))
finally:
# Remove files as bot user (Python will clean up tempdir, but we don't
# necessarily have permissions to clean up files)
Expand Down

0 comments on commit 2fae6a3

Please sign in to comment.