Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extremely Bad Latency on TCP Connection for receiving Data #842

Open
winstonzhao opened this issue Sep 23, 2024 · 0 comments
Open

Extremely Bad Latency on TCP Connection for receiving Data #842

winstonzhao opened this issue Sep 23, 2024 · 0 comments

Comments

@winstonzhao
Copy link

Hello, my use case for F-Stack is optimizing TCP packet read speed.

I've set up a simple test in which a Python server sends out timestamp packets:

#! /usr/bin/python
# A simple TCP server: after receiving the client's first message it streams
# one million payloads, each the current time_ns() timestamp repeated 200 times.
import os
import socket
import time

sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# SO_REUSEADDR only influences bind() when it is set beforehand, so configure
# the socket options before binding and listening.
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE, b"ens7")
sock.bind(('0.0.0.0', 12373))
sock.listen(5)

while True:
    connection, address = sock.accept()
    try:
        buf = connection.recv(1024)
        print(buf)
        for i in range(int(1e6)):
            # sendall() keeps writing until the whole payload is out; a bare
            # send() may write only part of it.
            connection.sendall((str(time.time_ns()) * 200).encode('utf-8'))
    finally:
        # Release the client socket even if the peer disconnects mid-stream.
        connection.close()

Then I set up a simple receiver using native Linux sockets and another using F-Stack, to compare how long it takes to process each of 5M messages.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <netdb.h>

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/epoll.h>
#include <arpa/inet.h>
#include <errno.h>
#include <assert.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <pthread.h>
#include <signal.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <netdb.h>

/* Not referenced anywhere in the code shown here; presumably socket-type
 * flags selecting the F-Stack or kernel stack — TODO confirm. */
#define SOCK_FSTACK 0x01000000
#define SOCK_KERNEL 0x02000000

/* Capacity of the events array handed to epoll_wait(). */
#define MAX_EVENTS 10
/* Size of the receive buffer; recv() is asked for one byte less so a NUL can be appended. */
#define BUFFER_SIZE 4096 

/*
 * Compute the age of the newest timestamp in a received chunk.
 *
 * buffer     - NUL-terminated receive buffer.
 * bytes_read - number of valid bytes in buffer INCLUDING the trailing NUL
 *              (the caller passes the recv() count plus one).
 *
 * The 19 characters preceding the NUL are parsed as a decimal nanosecond
 * wall-clock timestamp and subtracted from the current wall-clock time.
 *
 * Returns the difference in nanoseconds, or -1 when buffer is NULL, the chunk
 * is too short to contain a full timestamp, or the clock cannot be read.
 *
 * NOTE(review): this assumes the chunk ends exactly on a timestamp boundary;
 * a recv() that splits a timestamp would yield a meaningless value — confirm.
 */
int64_t get_time_difference(const char *buffer, size_t bytes_read) {
    /* 19 digits + NUL: the parse starts this many bytes before the end. */
    enum { TAIL_LEN = 20 };

    /* The guard must match the offset used below, otherwise lengths between
     * 16 and 19 would make the pointer underflow past the buffer start. */
    if (buffer == NULL || bytes_read < TAIL_LEN) {
        return -1;
    }

    int64_t sent_ns = strtoll(buffer + (bytes_read - TAIL_LEN), NULL, 10);

    /* timespec_get(TIME_UTC) is the ISO C11 way to read the wall clock. */
    struct timespec now;
    if (timespec_get(&now, TIME_UTC) != TIME_UTC) {
        return -1;
    }

    /* Widen before multiplying so the product cannot overflow a 32-bit time_t. */
    return (int64_t)now.tv_sec * 1000000000 + now.tv_nsec - sent_ns;
}

/*
 * Switch a descriptor to non-blocking mode with the FIONBIO ioctl.
 *
 * Returns 0 on success. On failure the error is reported through perror()
 * and -1 is returned.
 */
int set_nonblocking(int sockfd) {
    int enable = 1;
    const int rc = ioctl(sockfd, FIONBIO, &enable);

    if (rc == 0) {
        return 0;
    }

    perror("ioctl FIONBIO");
    return -1;
}

/*
 * Run one benchmark pass over native Linux sockets.
 *
 * Opens a non-blocking TCP connection to the hard-coded server, sends a single
 * HTTP-style request once the socket becomes writable, then drains everything
 * the server sends until it closes the connection. For every chunk received,
 * get_time_difference() is applied to the chunk's tail and the result recorded.
 * Throughput and latency statistics are printed at the end.
 *
 * Returns 0 after the stream ended (peer close, or a send/recv/epoll error that
 * was reported with perror()). Setup failures terminate the process with
 * EXIT_FAILURE.
 */
int do_req() {
    const char *hostname = "10.0.3.122";
    const char *port = "12373";
    const char *path = "/";

    // Create a socket
    int sockfd = socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
        perror("socket");
        exit(EXIT_FAILURE);
    }

    // Resolve hostname
    struct addrinfo hints, *res;
    memset(&hints, 0, sizeof(hints));
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;

    // getaddrinfo() reports failure through its return value, not errno,
    // so the message has to come from gai_strerror() rather than perror().
    int gai_rc = getaddrinfo(hostname, port, &hints, &res);
    if (gai_rc != 0) {
        fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(gai_rc));
        close(sockfd);
        exit(EXIT_FAILURE);
    }

    // Set the socket to non-blocking mode
    if (set_nonblocking(sockfd) == -1) {
        close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    // Start connecting; EINPROGRESS is the expected result on a non-blocking socket
    int connect_status = connect(sockfd, res->ai_addr, res->ai_addrlen);
    if (connect_status == -1 && errno != EINPROGRESS) {
        perror("connect");
        close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    // Set up epoll
    int epoll_fd = epoll_create1(0);
    if (epoll_fd == -1) {
        perror("epoll_create1");
        close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    struct epoll_event ev, events[MAX_EVENTS];
    ev.events = EPOLLOUT | EPOLLIN | EPOLLET;  // Wait for the socket to be writable (connect completion) and readable
    ev.data.fd = sockfd;

    if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, sockfd, &ev) == -1) {
        perror("epoll_ctl");
        close(sockfd);
        close(epoll_fd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    // HTTP GET request to send
    char request[512];
    snprintf(request, sizeof(request), "GET %s HTTP/1.1\r\nHost: %s\r\nConnection: keep-alive\r\n\r\n", path, hostname);
    const size_t request_len = strlen(request);

    int64_t br = 0;            // total payload bytes received
    int64_t buffer_reads = 0;  // number of successful recv() calls
    // Kept as 64-bit: get_time_difference() returns nanoseconds in an int64_t,
    // which does not fit in an int once it exceeds roughly 2.1 seconds.
    int64_t last_latency = 0;
    int64_t start_lat = 0;

    int done = 0;
    while (!done) {
        int n = epoll_wait(epoll_fd, events, MAX_EVENTS, -1);
        if (n == -1) {
            if (errno == EINTR) {
                continue;  // interrupted by a signal: simply wait again
            }
            perror("epoll_wait");
            break;
        }

        for (int i = 0; i < n; i++) {
            if (events[i].events & EPOLLOUT) {
                // Ready to write (send request)
                ssize_t sent = send(sockfd, request, request_len, 0);
                if (sent == -1) {
                    perror("send");
                    done = 1;
                    break;
                }
                ev.events = EPOLLIN | EPOLLET;  // Now, wait for the response
                if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, sockfd, &ev) == -1) {
                    perror("epoll_ctl");
                    done = 1;
                    break;
                }
            } else if (events[i].events & EPOLLIN) {
                // Ready to read: edge-triggered, so drain until recv() stops returning data
                char buffer[BUFFER_SIZE];
                ssize_t got;
                while ((got = recv(sockfd, buffer, sizeof(buffer) - 1, 0)) > 0) {
                    br += got;
                    ++buffer_reads;
                    buffer[got] = '\0';
                    // The helper expects the length to include the trailing NUL.
                    last_latency = get_time_difference(buffer, (size_t)got + 1);
                    // Keep replacing start_lat until it holds a value inside (10, 10000000] ns.
                    if (start_lat < 10 || start_lat > 10000000)
                        start_lat = last_latency;
                }

                if (got == 0) {
                    // Connection closed by the server
                    done = 1;
                } else if (got == -1 && errno != EAGAIN) {
                    perror("recv");
                    done = 1;
                }
            }
        }
    }

    // Avoid 0/0 when the peer closed before sending anything.
    float avg_bytes = buffer_reads > 0 ? (float)br / (float)buffer_reads : 0.0f;
    printf("avg bytes: %.6f\n", avg_bytes);
    printf("buffer_reads: %lld\n", (long long)buffer_reads);
    printf("bytes_read: %lld\n", (long long)br);
    printf("last_latency: %lld\n", (long long)last_latency);
    printf("first_latency: %lld\n", (long long)start_lat);
    printf("lat_diff: %lld\n", (long long)(last_latency - start_lat));

    // Clean up
    close(sockfd);
    close(epoll_fd);
    freeaddrinfo(res);

    return 0;
}

/* Entry point: performs the benchmark request exactly once and exits with status 0. */
int main() {
    const int runs = 1;
    int completed = 0;

    while (completed < runs) {
        do_req();
        ++completed;
    }

    return 0;
}

Then I wrote something similar using F-Stack:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <netdb.h>

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/epoll.h>
#include <arpa/inet.h>
#include <errno.h>
#include <assert.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <pthread.h>
#include <signal.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <netdb.h>

#include "ff_config.h"
#include "ff_api.h"
#include "ff_epoll.h"

/* Not referenced anywhere in the code shown here; presumably socket-type
 * flags selecting the F-Stack or kernel stack — TODO confirm. */
#define SOCK_FSTACK 0x01000000
#define SOCK_KERNEL 0x02000000

/* Capacity of the events array handed to ff_epoll_wait(). */
#define MAX_EVENTS 10
/* Size of the receive buffer; ff_recv() is asked for one byte less so a NUL can be appended. */
#define BUFFER_SIZE 4096 

/* State shared between do_req() (setup) and loop() (the ff_run() callback). */

/* Descriptors: epfd is assigned from ff_epoll_create(), sockfd from ff_socket(). */
int epfd;
int sockfd;

/* Event registration record and the array filled by ff_epoll_wait(). */
struct epoll_event ev;
struct epoll_event events[MAX_EVENTS];

/* HTTP GET request to send; formatted in do_req(), transmitted in loop(). */
char request[512];

/* Receive statistics accumulated by loop(). */
int64_t br = 0;            /* total bytes received */
int64_t buffer_reads = 0;  /* number of successful receive calls */
int last_latency = 0;      /* most recent get_time_difference() result */
int start_lat = 0;         /* replaced by last_latency while outside [10, 10000000] */


/*
 * Compute the age of the newest timestamp in a received chunk.
 *
 * buffer     - NUL-terminated receive buffer.
 * bytes_read - number of valid bytes in buffer INCLUDING the trailing NUL
 *              (the caller passes the receive count plus one).
 *
 * The 19 characters preceding the NUL are parsed as a decimal nanosecond
 * wall-clock timestamp and subtracted from the current wall-clock time.
 *
 * Returns the difference in nanoseconds, or -1 when buffer is NULL, the chunk
 * is too short to contain a full timestamp, or the clock cannot be read.
 *
 * NOTE(review): this assumes the chunk ends exactly on a timestamp boundary;
 * a receive that splits a timestamp would yield a meaningless value — confirm.
 */
int64_t get_time_difference(const char *buffer, size_t bytes_read) {
    /* 19 digits + NUL: the parse starts this many bytes before the end. */
    enum { TAIL_LEN = 20 };

    /* The guard must match the offset used below, otherwise lengths between
     * 16 and 19 would make the pointer underflow past the buffer start. */
    if (buffer == NULL || bytes_read < TAIL_LEN) {
        return -1;
    }

    int64_t sent_ns = strtoll(buffer + (bytes_read - TAIL_LEN), NULL, 10);

    /* timespec_get(TIME_UTC) is the ISO C11 way to read the wall clock. */
    struct timespec now;
    if (timespec_get(&now, TIME_UTC) != TIME_UTC) {
        return -1;
    }

    /* Widen before multiplying so the product cannot overflow a 32-bit time_t. */
    return (int64_t)now.tv_sec * 1000000000 + now.tv_nsec - sent_ns;
}

/*
 * Switch an F-Stack descriptor to non-blocking mode with the FIONBIO request
 * issued through ff_ioctl().
 *
 * Returns 0 on success. On failure the error is reported through perror()
 * and -1 is returned.
 */
int set_nonblocking(int sockfd) {
    int enable = 1;
    const int rc = ff_ioctl(sockfd, FIONBIO, &enable);

    if (rc == 0) {
        return 0;
    }

    perror("ioctl FIONBIO");
    return -1;
}

/*
 * Callback handed to ff_run(): performs one poll of the epoll descriptor and
 * services whatever events are pending.
 *
 *  - EPOLLOUT: sends the prepared request, then re-registers the socket for
 *    EPOLLIN only.
 *  - EPOLLIN:  drains the socket, updating the byte/read counters and the
 *    latency samples; when the peer closes, prints the statistics and exits.
 *
 * arg is unused. Returns 0 after each pass. The process exits with status 0
 * once the server has closed the connection, and with EXIT_FAILURE on a send
 * or receive error.
 *
 * NOTE: last_latency and start_lat are declared as int at file scope, so the
 * int64_t nanosecond value returned by get_time_difference() is truncated on
 * assignment; widening them requires changing those declarations.
 */
int loop(void* arg) {
    (void)arg;

    int n = ff_epoll_wait(epfd, events, MAX_EVENTS, -1);
    for (int i = 0; i < n; i++) {
        if (events[i].events & EPOLLOUT) {
            // Ready to write (send request)
            ssize_t sent = ff_send(sockfd, request, strlen(request), 0);
            if (sent == -1) {
                // Without the request the server never answers, so there is
                // nothing left to wait for: stop instead of polling forever.
                perror("send");
                exit(EXIT_FAILURE);
            }
            ev.events = EPOLLIN | EPOLLET;  // Now, wait for the response
            ev.data.fd = sockfd;            // keep the registration record complete
            ff_epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev);
        } else if (events[i].events & EPOLLIN) {
            // Ready to read: edge-triggered, so drain until ff_recv() stops returning data
            char buffer[BUFFER_SIZE];
            ssize_t got;
            while ((got = ff_recv(sockfd, buffer, sizeof(buffer) - 1, 0)) > 0) {
                br += got;
                ++buffer_reads;
                buffer[got] = '\0';
                // The helper expects the length to include the trailing NUL.
                last_latency = get_time_difference(buffer, (size_t)got + 1);
                if (start_lat < 10 || start_lat > 10000000)
                    start_lat = last_latency;
            }

            if (got == 0) {
                // Connection closed by the server: report and finish.
                // Avoid 0/0 when the peer closed before sending anything.
                float avg_bytes = buffer_reads > 0 ? (float)br / (float)buffer_reads : 0.0f;
                printf("avg bytes: %.6f\n", avg_bytes);
                printf("buffer_reads: %lld\n", (long long)buffer_reads);
                printf("bytes_read: %lld\n", (long long)br);
                printf("last_latency: %d\n", last_latency);
                printf("first_latency: %d\n", start_lat);
                printf("lat_diff: %d\n", last_latency - start_lat);
                exit(0);
            } else if (got == -1 && errno != EAGAIN) {
                perror("recv");
                exit(EXIT_FAILURE);
            }
        }
    }

    // The function is declared to return int, so always return a value.
    return 0;
}
 
int do_req() {
    const char *hostname = "10.0.3.122";
    const char *port = "12373";
    const char *path = "/";

    // Create a socket
    sockfd = ff_socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
        perror("socket");
        exit(EXIT_FAILURE);
    }

    // Resolve hostname
    struct addrinfo hints, *res;
    memset(&hints, 0, sizeof(hints));
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;

    if (getaddrinfo(hostname, port, &hints, &res) != 0) {
        perror("getaddrinfo");
        ff_close(sockfd);
        exit(EXIT_FAILURE);
    }

    // Set the socket to non-blocking mode
    if (set_nonblocking(sockfd) == -1) {
        ff_close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    // Start connecting
    int connect_status = ff_connect(sockfd, (struct linux_sockaddr *)res->ai_addr, res->ai_addrlen);
    if (connect_status == -1 && errno != EINPROGRESS) {
        perror("connect");  
        ff_close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    // Set up epoll
    epfd = ff_epoll_create(0);
    if (epfd == -1) {
        perror("epoll_create1");
        ff_close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    struct epoll_event ev, events[MAX_EVENTS];
    ev.events = EPOLLOUT | EPOLLIN | EPOLLET;  // Wait for the socket to be writable (connect completion) and readable
    ev.data.fd = sockfd;

    if (ff_epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev) == -1) {
        perror("epoll_ctl");
        ff_close(sockfd);
        ff_close(epfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    snprintf(request, sizeof(request), "GET %s HTTP/1.1\r\nHost: %s\r\nConnection: keep-alive\r\n\r\n", path, hostname);
    ff_run(loop, NULL);

    // Clean up
    ff_close(sockfd);
    ff_close(epfd);
    freeaddrinfo(res);

    return 0;
}

/*
 * Entry point: passes the command line to ff_init(), then runs the benchmark
 * via do_req(). Always returns 0; the results of both calls are ignored.
 */
int main(int argc, char * argv[])
{
    ff_init(argc, argv);

    (void)do_req();

    return 0;
}

My config is as follows:

[dpdk]
# Hexadecimal bitmask of cores to run on.
lcore_mask=2

# Number of memory channels.
channel=10

# Specify base virtual address to map.
#base_virtaddr=0x7f0000000000

# Promiscuous mode of NIC, default: enabled.
promiscuous=1
numa_on=1

# TX checksum offload skip, default: disabled.
# We need this switch enabled in the following cases:
# -> The application wants to enforce a wrong checksum for testing purposes
# -> Some cards advertise the offload capability but do not actually calculate the checksum.
tx_csum_offoad_skip=0

# TCP segment offload, default: disabled.
tso=0

# HW vlan strip, default: enabled.
vlan_strip=1

# Sleep when no pkts are incoming
# unit: microseconds
idle_sleep=0

# Packet send delay (0-100), applied when fewer than 32 pkts are being sent.
# Default: 100 us.
# If set to 0, pkts are sent immediately.
# If set to >100, the delay is capped at 100 us.
# unit: microseconds
pkt_tx_delay=0

# use symmetric Receive-side Scaling(RSS) key, default: disabled.
symmetric_rss=0

# PCI device enable list.
# And driver options
#allow=02:00.0
# for multiple PCI devices
#allow=02:00.0,03:00.0

# enabled port list
#
# EBNF grammar:
#
#    exp      ::= num_list {"," num_list}
#    num_list ::= <num> | <range>
#    range    ::= <num>"-"<num>
#    num      ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
#
# examples
#    0-3       ports 0, 1,2,3 are enabled
#    1-3,4,7   ports 1,2,3,4,7 are enabled
#
# If bonding is used, configure the bonding port id in port_list
# and do not configure the slave port ids in port_list.
# For example, if port 0 and port 1 are trunked into bonding port 2,
# set `port_list=2` and configure the `[port2]` section.

port_list=0

# Number of vdev.
nb_vdev=0

# Number of bond.
nb_bond=0

# log level for dpdk, optional
# log_level=0

# Each core writes into its own pcap file, which is opened once and closed once it is full.
# Supports dumping the first snaplen bytes of each packet.
# If a pcap file grows larger than savelen bytes, it is closed and dumping continues in the next file.
[pcap]
enable=0
snaplen=96
savelen=16777216
savepath=.

# Port config section
# Correspond to dpdk.port_list's index: port0, port1...
[port0]
addr=10.0.3.234
netmask=255.255.255.0
broadcast=10.0.3.255
gateway=10.0.3.1
# set interface name, Optional parameter.
if_name=ens6

# IPv6 net addr, Optional parameters.
#addr6=ff::02
#prefix_len=64
#gateway6=ff::01

# Multi virtual IPv4/IPv6 net addr, Optional parameters.
#	`vip_ifname`: default `f-stack-x`
#	`vip_addr`: Separated by semicolons, MAX number 64;
#		    Only support netmask 255.255.255.255, broadcast x.x.x.255 now, hard code in `ff_veth_setvaddr`.
#	`vip_addr6`: Separated by semicolons, MAX number 64.
#	`vip_prefix_len`: All addr6 use the same prefix now, default 64.
#vip_ifname=lo0
#vip_addr=192.168.1.3;192.168.1.4;192.168.1.5;192.168.1.6
#vip_addr6=ff::03;ff::04;ff::05;ff::06;ff::07
#vip_prefix_len=64

# lcore list used to handle this port
# the format is same as port_list
#lcore_list=0

# bonding slave port list used to handle this port
# need to config while this port is a bonding port
# the format is same as port_list
#slave_port_list=0,1

# Vdev config section
# Correspond to dpdk.nb_vdev's index: vdev0, vdev1...
#    iface : Shouldn't set always.
#    path : The vuser device path in container. Required.
#    queues : The max queues of vuser. Optional, default 1, greater or equal to the number of processes.
#    queue_size : Queue size.Optional, default 256.
#    mac : The mac address of vuser. Optional, default random, if vhost use phy NIC, it should be set to the phy NIC's mac.
#    cq : Optional, if queues = 1, default 0; if queues > 1 default 1.
#[vdev0]
##iface=/usr/local/var/run/openvswitch/vhost-user0
#path=/var/run/openvswitch/vhost-user0
#queues=1
#queue_size=256
#mac=00:00:00:00:00:01
#cq=0

# bond config section
# See http://doc.dpdk.org/guides/prog_guide/link_bonding_poll_mode_drv_lib.html
#[bond0]
#mode=4
#slave=0000:0a:00.0,slave=0000:0a:00.1
#primary=0000:0a:00.0
#mac=f0:98:38:xx:xx:xx
## opt argument
#socket_id=0
#xmit_policy=l23
#lsc_poll_period_ms=0
#up_delay=0
#down_delay=0

# Kni config: if enabled and method=reject,
# all packets that do not belong to the following tcp_port and udp_port
# will transmit to kernel; if method=accept, all packets that belong to
# the following tcp_port and udp_port will transmit to kernel.
#[kni]
#enable=1
#method=reject
## The format is same as port_list
#tcp_port=80,443
#udp_port=53

# FreeBSD network performance tuning configurations.
# Most native FreeBSD configurations are supported.
[freebsd.boot]
# If you use rack/bbr, which depend on HPTS, you should set a larger value of hz; e.g. 1000000 means a tick is 1us.
hz=100

# Block out a range of descriptors to avoid overlap
# with the kernel's descriptor space.
# You can increase this value according to your app.
fd_reserve=1024

kern.ipc.maxsockets=262144

net.inet.tcp.syncache.hashsize=4096
net.inet.tcp.syncache.bucketlimit=100

net.inet.tcp.tcbhashsize=65536

kern.ncallout=262144

kern.features.inet6=1

[freebsd.sysctl]
kern.ipc.somaxconn=32768
kern.ipc.maxsockbuf=16777216

net.link.ether.inet.maxhold=5

net.inet.tcp.fast_finwait2_recycle=1
net.inet.tcp.sendspace=1677721
net.inet.tcp.recvspace=1677721
net.inet.tcp.nolocaltimewait=1
net.inet.tcp.cc.algorithm=bbr
net.inet.tcp.sendbuf_max=16777216
net.inet.tcp.recvbuf_max=16777216
net.inet.tcp.sendbuf_auto=1
net.inet.tcp.recvbuf_auto=1
net.inet.tcp.sendbuf_inc=16384
net.inet.tcp.recvbuf_inc=524288
net.inet.tcp.sack.enable=1
net.inet.tcp.blackhole=1
net.inet.tcp.msl=2000
net.inet.tcp.delayed_ack=0
net.inet.tcp.rfc1323=1

net.inet.udp.blackhole=1
net.inet.ip.redirect=0
net.inet.ip.forwarding=0

net.inet6.ip6.auto_linklocal=1
net.inet6.ip6.accept_rtadv=2
net.inet6.icmp6.rediraccept=1
net.inet6.ip6.forwarding=0

# Set the default stack: freebsd, rack or bbr. You may need to increase the value of parameter 'freebsd.boot.hz' when using rack or bbr.
net.inet.tcp.functions_default=freebsd
# Needed by bbr; should be enabled.
net.inet.tcp.hpts.skip_swi=1
# Interval between calls to hpts_timeout_dir. default min 250us, max 256-512ms, default 512ms.
net.inet.tcp.hpts.minsleep=0
# [25600-51200]
net.inet.tcp.hpts.maxsleep=51200

The native Linux approach takes 2 s; however, the F-Stack approach consistently takes around 7.5 s.

Can anyone provide some advice on why F-stack is so much slower here?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant