-
Notifications
You must be signed in to change notification settings - Fork 694
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use io_uring to batch handle clients pending writes to reduce SYSCALL count. #112
base: unstable
Are you sure you want to change the base?
Changes from all commits
cbe6361
1b53184
f6c6dd7
97243aa
2664c48
efc4fe4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
#include "io_uring.h" | ||
|
||
#ifdef HAVE_LIBURING | ||
#include <errno.h>
#include <liburing.h>
#include <string.h>
#include "zmalloc.h"
|
||
/* io_uring instance queue depth. */ | ||
#define IO_URING_DEPTH 256 | ||
|
||
static struct io_uring *_io_uring; | ||
static size_t io_uring_write_queue_len = 0; | ||
|
||
/* Initialize io_uring at server startup if io_uring enabled, | ||
* setup io_uring submission and completion. */ | ||
int initIOUring(void) { | ||
struct io_uring_params params; | ||
_io_uring = zmalloc(sizeof(struct io_uring)); | ||
memset(¶ms, 0, sizeof(params)); | ||
/* On success, io_uring_queue_init_params(3) returns 0 and _io_uring will | ||
* point to the shared memory containing the io_uring queues. | ||
* On failure -errno is returned. */ | ||
if (io_uring_queue_init_params(IO_URING_DEPTH, _io_uring, ¶ms) < 0) return IO_URING_ERR; | ||
return IO_URING_OK; | ||
} | ||
|
||
/* Use io_uring to handle the client write request. */ | ||
int ioUringPrepWrite(void *data, int fd, const void *buf, size_t len) { | ||
struct io_uring_sqe *sqe = io_uring_get_sqe(_io_uring); | ||
if (sqe == NULL) return IO_URING_ERR; | ||
io_uring_prep_send(sqe, fd, buf, len, MSG_DONTWAIT); | ||
io_uring_sqe_set_data(sqe, data); | ||
io_uring_write_queue_len++; | ||
return IO_URING_OK; | ||
} | ||
|
||
/* Submit requests to the submission queue and wait for completion. */ | ||
int ioUringWaitWriteBarrier(io_uring_cqe_handler cqe_handler) { | ||
if (io_uring_submit(_io_uring) < 0) return IO_URING_ERR; | ||
while (io_uring_write_queue_len) { | ||
struct io_uring_cqe *cqe; | ||
int ret = io_uring_wait_cqe(_io_uring, &cqe); | ||
if (ret == 0) { | ||
if (cqe_handler) { | ||
void *data = io_uring_cqe_get_data(cqe); | ||
cqe_handler(data, cqe->res); | ||
} | ||
io_uring_cqe_seen(_io_uring, cqe); | ||
io_uring_write_queue_len--; | ||
} else { | ||
return IO_URING_ERR; | ||
} | ||
} | ||
return IO_URING_OK; | ||
} | ||
|
||
/* Free io_uring. */ | ||
void freeIOUring(void) { | ||
io_uring_queue_exit(_io_uring); | ||
zfree(_io_uring); | ||
_io_uring = NULL; | ||
} | ||
#else | ||
#ifndef UNUSED | ||
#define UNUSED(V) ((void)V) | ||
#endif | ||
|
||
int initIOUring(void) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function is called if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Got your point, maybe I should simply return C_ERR in the dummy initIOUring, error will be printed and exit. void InitServerLast(void) {
bioInit();
initIOThreads();
if (server.io_uring_enabled) {
if (initIOUring() == IO_URING_ERR) {
serverLog(LL_WARNING, "Failed to initialize io_uring.");
exit(1);
}
}
set_jemalloc_bg_thread(server.jemalloc_bg_thread);
server.initial_memory_usage = zmalloc_used_memory();
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @lipzhu
Once |
||
return 0; | ||
} | ||
|
||
/* Placeholder compiled when HAVE_LIBURING is not defined: every argument is
 * ignored, nothing is queued and 0 is returned. */
int ioUringPrepWrite(void *data, int fd, const void *buf, size_t len) {
    (void)data;
    (void)fd;
    (void)buf;
    (void)len;
    return 0;
}
|
||
/* Placeholder compiled when HAVE_LIBURING is not defined: the handler is
 * ignored, nothing is waited for and 0 is returned. */
int ioUringWaitWriteBarrier(io_uring_cqe_handler cqe_handler) {
    (void)cqe_handler;
    return 0;
}
|
||
/* Placeholder compiled when HAVE_LIBURING is not defined. */
void freeIOUring(void) {
    /* Nothing was allocated in this build, so there is nothing to release. */
}
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These dummy stubs are never called, right? They're defined just to make it compile for when we don't have liburing? Should we mark them as dead code in some way? Assert that they're never called? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. assert sounds like a great idea. I also like @pizhenwei's suggestion of failing the dummy There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, they are unused, just to make sure it can compile.
How about adding the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @zuiderkwast
|
||
|
||
#endif |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
#ifndef IO_URING_H | ||
#define IO_URING_H | ||
#include <stddef.h> | ||
|
||
#define IO_URING_OK 0 | ||
#define IO_URING_ERR -1 | ||
|
||
typedef void (*io_uring_cqe_handler)(void *, int); | ||
|
||
int initIOUring(void); | ||
int ioUringPrepWrite(void *data, int fd, const void *buf, size_t len); | ||
int ioUringWaitWriteBarrier(io_uring_cqe_handler cqe_handler); | ||
void freeIOUring(void); | ||
|
||
#endif /* IO_URING_H */ |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,6 +35,7 @@ | |
#include "fpconv_dtoa.h" | ||
#include "fmtargs.h" | ||
#include "io_threads.h" | ||
#include "io_uring.h" | ||
#include <strings.h> | ||
#include <sys/socket.h> | ||
#include <sys/uio.h> | ||
|
@@ -2448,6 +2449,83 @@ int processIOThreadsWriteDone(void) { | |
return processed; | ||
} | ||
|
||
/* If client is suitable to use io_uring to handle the write request. */ | ||
static inline int _canWriteUsingIOUring(client *c) { | ||
if (server.io_uring_enabled && server.io_threads_num == 1) { | ||
/* Currently, we only use io_uring to handle the static buffer write requests. | ||
* If io-threads or tls is enabled, skip the io_uring. */ | ||
return connIsTLS(c->conn) == 0 && getClientType(c) != CLIENT_TYPE_REPLICA && listLength(c->reply) == 0 && | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These conditions don't cover RDMA. Does it work or should we exclude that too? What about other fake clients, like the fake client used from Lua? Rather than defining a negated condition for skipping it, like "not TLS", it's usually better to have a positive condition for when it's known to work. In the future, we may add more connection types. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah I think we can safely say RDMA and io-uring are "mutually exclusive" or "non-overlapping". |
||
c->bufpos > 0; | ||
} | ||
return 0; | ||
} | ||
|
||
/* Check the completed io_uring event and update the state. */ | ||
static int _checkPendingIOUringWriteState(client *c) { | ||
/* Note that where synchronous system calls will return -1 on | ||
* failure and set errno to the actual error value, | ||
* io_uring never uses errno. Instead it returns the negated | ||
* errno directly in the CQE res field. */ | ||
if (c->nwritten <= 0) { | ||
if (c->nwritten != -EAGAIN) { | ||
c->conn->last_errno = -(c->nwritten); | ||
/* Don't overwrite the state of a connection that is not already | ||
* connected, not to mess with handler callbacks. */ | ||
if (c->nwritten != -EINTR && c->conn->state == CONN_STATE_CONNECTED) c->conn->state = CONN_STATE_ERROR; | ||
} | ||
if (connGetState(c->conn) != CONN_STATE_CONNECTED) { | ||
serverLog(LL_VERBOSE, "Error writing to client: %s", connGetLastError(c->conn)); | ||
freeClientAsync(c); | ||
} | ||
return IO_URING_ERR; | ||
} | ||
|
||
c->sentlen += c->nwritten; | ||
/* If the buffer was sent, set bufpos to zero to continue with | ||
* the remainder of the reply. */ | ||
if ((int)c->sentlen == c->bufpos) { | ||
c->bufpos = 0; | ||
c->sentlen = 0; | ||
} | ||
server.stat_net_output_bytes += c->nwritten; | ||
c->net_output_bytes += c->nwritten; | ||
|
||
/* For clients representing masters we don't count sending data | ||
* as an interaction, since we always send REPLCONF ACK commands | ||
* that take some time to just fill the socket output buffer. | ||
* We just rely on data / pings received for timeout detection. */ | ||
if (!c->flag.primary) c->last_interaction = server.unixtime; | ||
|
||
return IO_URING_OK; | ||
} | ||
|
||
static void _postIOUringWrite(void) { | ||
listIter li; | ||
listNode *ln; | ||
listRewind(server.clients_pending_write, &li); | ||
while ((ln = listNext(&li))) { | ||
client *c = listNodeValue(ln); | ||
listUnlinkNode(server.clients_pending_write, ln); | ||
|
||
if (_checkPendingIOUringWriteState(c) == IO_URING_ERR) continue; | ||
if (!clientHasPendingReplies(c)) { | ||
c->sentlen = 0; | ||
/* Close connection after entire reply has been sent. */ | ||
if (c->flag.close_after_reply) { | ||
freeClientAsync(c); | ||
continue; | ||
} | ||
} | ||
/* Update client's memory usage after writing.*/ | ||
updateClientMemUsageAndBucket(c); | ||
} | ||
} | ||
|
||
void setClientLastWritten(void *data, int res) { | ||
client *c = data; | ||
c->nwritten = res; | ||
} | ||
|
||
/* This function is called just before entering the event loop, in the hope | ||
* we can just write the replies to the client output buffer without any | ||
* need to use a syscall in order to install the writable event handler, | ||
|
@@ -2467,34 +2545,66 @@ int handleClientsWithPendingWrites(void) { | |
while ((ln = listNext(&li))) { | ||
client *c = listNodeValue(ln); | ||
c->flag.pending_write = 0; | ||
listUnlinkNode(server.clients_pending_write, ln); | ||
|
||
/* If a client is protected, don't do anything, | ||
* that may trigger write error or recreate handler. */ | ||
if (c->flag.protected) continue; | ||
if (c->flag.protected) { | ||
listUnlinkNode(server.clients_pending_write, ln); | ||
continue; | ||
} | ||
|
||
/* Don't write to clients that are going to be closed anyway. */ | ||
if (c->flag.close_asap) continue; | ||
if (c->flag.close_asap) { | ||
listUnlinkNode(server.clients_pending_write, ln); | ||
continue; | ||
} | ||
|
||
if (!clientHasPendingReplies(c)) continue; | ||
if (!clientHasPendingReplies(c)) { | ||
listUnlinkNode(server.clients_pending_write, ln); | ||
continue; | ||
} | ||
|
||
/* If we can send the client to the I/O thread, let it handle the write. */ | ||
if (trySendWriteToIOThreads(c) == C_OK) continue; | ||
if (server.io_threads_num > 1) { | ||
listUnlinkNode(server.clients_pending_write, ln); | ||
if (trySendWriteToIOThreads(c) == C_OK) { | ||
continue; | ||
} | ||
} | ||
|
||
/* We can't write to the client while IO operation is in progress. */ | ||
if (c->io_write_state != CLIENT_IDLE || c->io_read_state != CLIENT_IDLE) continue; | ||
if (c->io_write_state != CLIENT_IDLE || c->io_read_state != CLIENT_IDLE) { | ||
if (server.io_threads_num == 1) { | ||
listUnlinkNode(server.clients_pending_write, ln); | ||
} | ||
continue; | ||
} | ||
|
||
processed++; | ||
if (_canWriteUsingIOUring(c)) { | ||
if (ioUringPrepWrite(c, c->conn->fd, c->buf + c->sentlen, c->bufpos - c->sentlen) == IO_URING_ERR) { | ||
listUnlinkNode(server.clients_pending_write, ln); | ||
continue; | ||
} | ||
} else { | ||
if (server.io_threads_num == 1) { | ||
listUnlinkNode(server.clients_pending_write, ln); | ||
} | ||
/* Try to write buffers to the client socket. */ | ||
if (writeToClient(c) == C_ERR) continue; | ||
|
||
/* Try to write buffers to the client socket. */ | ||
if (writeToClient(c) == C_ERR) continue; | ||
|
||
/* If after the synchronous writes above we still have data to | ||
* output to the client, we need to install the writable handler. */ | ||
if (clientHasPendingReplies(c)) { | ||
installClientWriteHandler(c); | ||
/* If after the synchronous writes above we still have data to | ||
* output to the client, we need to install the writable handler. */ | ||
if (clientHasPendingReplies(c)) { | ||
installClientWriteHandler(c); | ||
} | ||
} | ||
} | ||
|
||
if (server.io_uring_enabled && server.io_threads_num == 1 && listLength(server.clients_pending_write) > 0) { | ||
ioUringWaitWriteBarrier(setClientLastWritten); | ||
_postIOUringWrite(); | ||
} | ||
return processed; | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Deal with `IO_URING_OK` in `io_uring.c` only; code outside of it should not handle io_uring related code any more. Please convert `IO_URING_OK` to `C_OK`, and do the same for `IO_URING_ERR`.