Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for custom memory and cpu limit #3

Merged
merged 8 commits into from
Jan 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 17 additions & 6 deletions src/controllers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@ import Router from "koa-router"
import { CustomContext, CustomState } from "./types"
import multer from "@koa/multer"
import gateKeeper, {
INSTANCES,
CPU_CORES_IN_SYSTEM,
getBusyInstances,
freeInstance,
getReservedMemory,
TOTAL_SYSTEM_MEMORY_GB,
} from "./middleware/gatekeeper"
import { BadRequestError } from "./util/error"
import handleSubmission, { RunResult } from "./sandbox"
import Axios from "axios"
import { SupportedMimeTypes } from "./util/file_extractor"
import extractResourceLimitsFromRequest from "./util/extractResourceLimitsFromRequest"

const upload = multer({ dest: "uploads/" })
export const ALLOWED_ALTERNATIVE_DOCKER_IMAGES = ["nygrenh/sandbox-next"]
Expand All @@ -19,7 +22,12 @@ const api = new Router<CustomState, CustomContext>()
.get("/status.json", async (ctx) => {
ctx.body = {
busy_instances: getBusyInstances(),
total_instances: INSTANCES,
// This is intentionally the same as busy instances, this is more descriptive name but we're keeping busy_instances for backwards compatibility
reserved_cpu_cores: getBusyInstances(),

total_instances: CPU_CORES_IN_SYSTEM,
reserved_memory: getReservedMemory(),
total_memory: TOTAL_SYSTEM_MEMORY_GB,
}
})

Expand All @@ -29,11 +37,13 @@ const api = new Router<CustomState, CustomContext>()
// concurrent tasks in a middleware because we want to do it before receiving
// the uploaded file.

const resourceLimits = extractResourceLimitsFromRequest(ctx.request.body)

if (
ctx.file.mimetype !== "application/x-tar" &&
ctx.file.mimetype !== "application/zstd"
) {
freeInstance()
freeInstance(resourceLimits)
throw new BadRequestError(
`Uploaded file type is not supported! Mimetype was: ${ctx.file.mimetype}}. Supported types are application/x-tar and application/zstd.`,
)
Expand All @@ -47,7 +57,7 @@ const api = new Router<CustomState, CustomContext>()
ALLOWED_ALTERNATIVE_DOCKER_IMAGES.indexOf(dockerImage) !== -1
)
) {
freeInstance()
freeInstance(resourceLimits)
throw new BadRequestError("Docker image was not whitelisted.")
}

Expand All @@ -66,12 +76,13 @@ const api = new Router<CustomState, CustomContext>()
dockerImage,
ctx.log.child({ async: true }),
ctx.file.mimetype as SupportedMimeTypes,
resourceLimits,
)
} catch (reason1) {
ctx.log.error("Handling submission failed.", { reason: reason1 })
return
} finally {
freeInstance()
freeInstance(resourceLimits)
}

ctx.log.info(`Notifying ${ctx.request.body.notify}...`, {
Expand All @@ -89,7 +100,7 @@ const api = new Router<CustomState, CustomContext>()
exit_code: output.exit_code,
})
} catch (reason2) {
ctx.log.error("Notifying failed", { error: reason2.message })
ctx.log.error("Notifying failed", { error: (reason2 as Error).message })
}
})

Expand Down
39 changes: 29 additions & 10 deletions src/middleware/gatekeeper.ts
Original file line number Diff line number Diff line change
@@ -1,31 +1,50 @@
import { CustomContext } from "../types"
import { cpus } from "os"
import { cpus, totalmem } from "os"
import { SandboxBusyError } from "../util/error"
import extractResourceLimitsFromRequest, {
ResourceLimits,
} from "../util/extractResourceLimitsFromRequest"

export const INSTANCES = cpus().length
let busyInstances = 0
export const CPU_CORES_IN_SYSTEM = cpus().length
export const TOTAL_SYSTEM_MEMORY_GB = totalmem() / 1024 ** 3

let reservedCPUCores = 0
let reservedMemory = 0

export function getBusyInstances(): number {
return busyInstances
return reservedCPUCores
}

/**
 * Returns the current value of the module-level `reservedMemory` counter,
 * i.e. the sum of the `memoryGB` limits that have been reserved and not yet freed.
 */
export function getReservedMemory(): number {
return reservedMemory
}

export function freeInstance(): void {
busyInstances--
export function freeInstance(limits: ResourceLimits): void {
reservedCPUCores -= limits.cpus
reservedMemory -= limits.memoryGB
}

function reserveInstance() {
busyInstances++
function reserveInstance(limits: ResourceLimits): void {
reservedCPUCores += limits.cpus
reservedMemory += limits.memoryGB
}

// Enforces the server is not processing too many submissions at once.
const gateKeeper = async (
ctx: CustomContext,
next: () => Promise<unknown>,
): Promise<void> => {
if (busyInstances >= INSTANCES) {
const limits = extractResourceLimitsFromRequest(ctx.request.body)
console.info(
`Sandbox sumbission requesting ${limits.memoryGB}GB of memory and ${limits.cpus} CPUs`,
)
if (reservedCPUCores + limits.cpus > CPU_CORES_IN_SYSTEM) {
throw new SandboxBusyError()
}
if (reservedMemory + limits.memoryGB > TOTAL_SYSTEM_MEMORY_GB) {
throw new SandboxBusyError()
}
reserveInstance()
reserveInstance(limits)
await next()
}

Expand Down
25 changes: 21 additions & 4 deletions src/sandbox.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import winston from "winston"
import { exec as origExec } from "child_process"
import { readFile as origReadFile, unlink as origUnlink } from "fs"
import extract, { SupportedMimeTypes } from "./util/file_extractor"
import { ResourceLimits } from "./util/extractResourceLimitsFromRequest"
const exec = promisify(origExec)
const readFile = promisify(origReadFile)
const unlink = promisify(origUnlink)
Expand All @@ -29,6 +30,7 @@ const handleSubmission = async (
dockerImage: string | undefined,
log: winston.Logger,
mimetype: SupportedMimeTypes,
resourceLimits: ResourceLimits,
): Promise<RunResult> => {
log.info("Handling submission")
const outputPath = join("work", id)
Expand All @@ -38,7 +40,13 @@ const handleSubmission = async (
await exec(`chmod -R 777 ${outputPath}`)
try {
await exec(`chmod -R 777 ${outputPath}`)
const results = await runTests(outputPath, id, dockerImage, log)
const results = await runTests(
outputPath,
id,
dockerImage,
log,
resourceLimits,
)
return results
} catch (e) {
log.error(`Error while running: ${e}`)
Expand All @@ -62,6 +70,7 @@ async function runTests(
submission_id: string,
dockerImage: string | undefined,
log: winston.Logger,
resourceLimits: ResourceLimits,
): Promise<RunResult> {
const id = `sandbox-submission-${submission_id}`
let status = "failed"
Expand All @@ -79,11 +88,19 @@ async function runTests(
const image = dockerImage || "nygrenh/sandbox-next"
let command
if (SUPERDEBUG) {
command = `docker create --name '${id}' --memory 2G --kernel-memory=50M --pids-limit=200 --ulimit nproc=10000:10000 --cpus 1 --mount type=bind,source=${resolve(
command = `docker create --name '${id}' --memory '${
resourceLimits.memoryGB
}G' --kernel-memory=50M --pids-limit=200 --ulimit nproc=10000:10000 --cpus '${
resourceLimits.cpus
}' --mount type=bind,source=${resolve(
path,
)},target=/app -it '${image}' /bin/sleep infinity `
} else {
command = `docker create --name '${id}' --network none --memory 2G --kernel-memory=50M --pids-limit=200 --ulimit nproc=10000:10000 --cpus 1 --cap-drop SETPCAP --cap-drop SETFCAP --cap-drop AUDIT_WRITE --cap-drop SETGID --cap-drop SETUID --cap-drop NET_BIND_SERVICE --cap-drop SYS_CHROOT --cap-drop NET_RAW --mount type=bind,source=${resolve(
command = `docker create --name '${id}' --network none --memory '${
resourceLimits.memoryGB
}G' --kernel-memory=50M --pids-limit=200 --ulimit nproc=10000:10000 --cpus '${
resourceLimits.cpus
}' --cap-drop SETPCAP --cap-drop SETFCAP --cap-drop AUDIT_WRITE --cap-drop SETGID --cap-drop SETUID --cap-drop NET_BIND_SERVICE --cap-drop SYS_CHROOT --cap-drop NET_RAW --mount type=bind,source=${resolve(
path,
)},target=/app -it '${image}' /app/init`
}
Expand Down Expand Up @@ -114,7 +131,7 @@ async function runTests(
} catch (e) {
const executionEndTime = new Date().getTime()
const durationMs = executionEndTime - executionStartTime
log.error("Running tests failed", { error: e.message })
log.error("Running tests failed", { error: (e as Error).message })
// If the process died within the last 5 seconds before timeout, it was
// likely a timeout.
if (durationMs > timeout_ms - 5000) {
Expand Down
23 changes: 23 additions & 0 deletions src/util/extractResourceLimitsFromRequest.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/** Resource limits requested for a single sandbox submission. */
export interface ResourceLimits {
  /** Memory limit in gigabytes. */
  memoryGB: number
  /** CPU limit, in cores. */
  cpus: number
}

const MAX_MEMORY_REQUEST_GB = 4
const MAX_CPUS_REQUEST = 2
const DEFAULT_MEMORY_REQUEST_GB = 1
const DEFAULT_CPUS_REQUEST = 1

/**
 * Converts one raw request field into a usable limit.
 *
 * Missing, non-numeric, zero and negative values fall back to `fallback`;
 * values above `max` are reduced to `max`.
 */
function sanitizeLimit(raw: unknown, fallback: number, max: number): number {
  const value = Number(raw ?? fallback)
  // NaN slips past every `>` comparison and a non-positive value would shrink
  // (or zero out) a reservation, so neither may leave this function.
  if (Number.isNaN(value) || value <= 0) {
    return fallback
  }
  return Math.min(value, max)
}

/**
 * Extracts and validates the cpu and memory requests from a request body.
 *
 * Reads `memory_limit_gb` and `cpu_limit`. Each defaults to 1 when absent or
 * invalid (non-numeric, zero or negative), and requests larger than the
 * allowed maximum are made smaller instead of being rejected. A missing
 * request body is treated like an empty one.
 *
 * @param requestBody - Parsed request body; may be undefined or lack the fields.
 * @returns Limits that are always positive numbers within the allowed maximums.
 */
export default function extractResourceLimitsFromRequest(
  // eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/explicit-module-boundary-types
  requestBody: any,
): ResourceLimits {
  const memoryGB = sanitizeLimit(
    requestBody?.memory_limit_gb,
    DEFAULT_MEMORY_REQUEST_GB,
    MAX_MEMORY_REQUEST_GB,
  )
  const cpus = sanitizeLimit(
    requestBody?.cpu_limit,
    DEFAULT_CPUS_REQUEST,
    MAX_CPUS_REQUEST,
  )
  return { memoryGB, cpus }
}
53 changes: 43 additions & 10 deletions tests/submissions.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,47 @@ test("POST /tasks.json works", async () => {
"docker_image",
"eu.gcr.io/moocfi-public/tmc-sandbox-tmc-langs-rust",
)
.field("token", "SUPER_SECERET")
.field("token", "SUPER_SECRET")
.field("notify", notifyAddress)
.set("Accept", "application/json")
.expect("Content-Type", /json/)
.expect(200)
},
)
expect(notifyResult.token).toBe("SUPER_SECERET")
expect(notifyResult.token).toBe("SUPER_SECRET")
expect(notifyResult.exit_code).toBe("0")
expect(notifyResult.status).toBe("finished")
expect(notifyResult.vm_log.length).toBeGreaterThan(5)
const testOutput = JSON.parse(notifyResult.test_output)
expect(testOutput.status).toBe("PASSED")
expect(testOutput.testResults.length).toBe(1)
})

test("POST /tasks.json with higher resource limits works", async () => {
jest.setTimeout(60000)
const notifyResult: NotifyResult = await new Promise(
async (resolve, _reject) => {
const notifyAddress = createResultServer((res) => {
resolve(res)
})

await request(server)
.post("/tasks.json")
.attach("file", "tests/data/submission.tar")
.field(
"docker_image",
"eu.gcr.io/moocfi-public/tmc-sandbox-tmc-langs-rust",
)
.field("memory_limit_gb", "3")
.field("cpu_limit", "2")
.field("token", "SUPER_SECRET")
.field("notify", notifyAddress)
.set("Accept", "application/json")
.expect("Content-Type", /json/)
.expect(200)
},
)
expect(notifyResult.token).toBe("SUPER_SECRET")
expect(notifyResult.exit_code).toBe("0")
expect(notifyResult.status).toBe("finished")
expect(notifyResult.vm_log.length).toBeGreaterThan(5)
Expand All @@ -83,15 +116,15 @@ test("POST /tasks.json works with .tar.zst files", async () => {
"docker_image",
"eu.gcr.io/moocfi-public/tmc-sandbox-tmc-langs-rust",
)
.field("token", "SUPER_SECERET")
.field("token", "SUPER_SECRET")
.field("notify", notifyAddress)
.set("Accept", "application/json")
.expect("Content-Type", /json/)
.expect(200)
},
)

expect(notifyResult.token).toBe("SUPER_SECERET")
expect(notifyResult.token).toBe("SUPER_SECRET")
expect(notifyResult.exit_code).toBe("0")
expect(notifyResult.status).toBe("finished")
expect(notifyResult.vm_log.length).toBeGreaterThan(5)
Expand All @@ -117,15 +150,15 @@ testSkipOnCi("POST /tasks.json does not crash with fork bombs", async () => {
"docker_image",
"eu.gcr.io/moocfi-public/tmc-sandbox-tmc-langs-rust",
)
.field("token", "SUPER_SECERET")
.field("token", "SUPER_SECRET")
.field("notify", notifyAddress)
.set("Accept", "application/json")
.expect("Content-Type", /json/)
.expect(200)
},
)

expect(notifyResult.token).toBe("SUPER_SECERET")
expect(notifyResult.token).toBe("SUPER_SECRET")

// hard to predict what happens in this case
const case1 =
Expand Down Expand Up @@ -155,14 +188,14 @@ test("POST /tasks.json works when submission uses too much memory", async () =>
"docker_image",
"eu.gcr.io/moocfi-public/tmc-sandbox-tmc-langs-rust",
)
.field("token", "SUPER_SECERET")
.field("token", "SUPER_SECRET")
.field("notify", notifyAddress)
.set("Accept", "application/json")
.expect("Content-Type", /json/)
.expect(200)
},
)
expect(notifyResult.token).toBe("SUPER_SECERET")
expect(notifyResult.token).toBe("SUPER_SECRET")
expect(notifyResult.status).toBe("out-of-memory")
})

Expand All @@ -178,14 +211,14 @@ test("POST /tasks.json works with java", async () => {
.post("/tasks.json")
.attach("file", "tests/data/java.tar")
.field("docker_image", "eu.gcr.io/moocfi-public/tmc-sandbox-java")
.field("token", "SUPER_SECERET")
.field("token", "SUPER_SECRET")
.field("notify", notifyAddress)
.set("Accept", "application/json")
.expect("Content-Type", /json/)
.expect(200)
},
)
expect(notifyResult.token).toBe("SUPER_SECERET")
expect(notifyResult.token).toBe("SUPER_SECRET")
expect(notifyResult.exit_code).toBe("0")
expect(notifyResult.status).toBe("finished")
expect(notifyResult.vm_log.length).toBeGreaterThan(5)
Expand Down
Loading