From a3ef555ede0441509dfb6870c4edfb397f6a3a52 Mon Sep 17 00:00:00 2001 From: Struan Bartlett Date: Sun, 15 Oct 2023 14:53:42 +0000 Subject: [PATCH] Support for containers connected to multiple networks - In the container entrypoint: - Itemise network interfaces and save their configuration - Save any additional non-link-scope non-default routes to support networks with multiple subnets - Create a bridge to join each interface to a tap device in the VM - Restore network and gateway routes via each bridge - In the VM: - Identify each interface by MAC address and rename devices to match the original configuration in the underlying container - Restore device configuration and routes --- README.md | 10 ++- runcvm-scripts/runcvm-ctr-defaults | 29 ++++--- runcvm-scripts/runcvm-ctr-entrypoint | 113 +++++++++++++++------------ runcvm-scripts/runcvm-ctr-qemu | 37 +++++++-- runcvm-scripts/runcvm-ctr-qemu-ifup | 9 ++- runcvm-scripts/runcvm-ip-functions | 40 ++++++++++ runcvm-scripts/runcvm-runtime | 6 +- runcvm-scripts/runcvm-vm-init | 48 ++++++++++-- 8 files changed, 204 insertions(+), 88 deletions(-) create mode 100644 runcvm-scripts/runcvm-ip-functions diff --git a/README.md b/README.md index 32a1ddd..3c185cc 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,7 @@ RunCVM is free and open-source, licensed under the Apache Licence, Version 2.0. 
- Run any standard container workload in a VM using `docker run` with no need to customise images or the command line (except adding `--runtime=runcvm`) - Run unusual container workloads, like `dockerd` and `systemd` that will not run in standard container runtimes -- Maintain a similar experience within a RunCVM VM as within a container: process table, network interfaces, stdio, exit code handling should broadly similar to maximise compatibility +- Maintain a similar experience within a RunCVM VM as within a container: process table, network interfaces, stdio, exit code handling should be broadly similar to maximise compatibility - Container start/stop/kill semantics respected, where possible providing clean VM shutdown on stop - VM console accessible as one would expect using `docker run -it`, `docker start -ai` and `docker attach` (and so on), generally good support for other `docker container` subcommands - Efficient container startup, by using virtiofs to serve a container's filesystem directly to a VM (instead of unpacking an image into a backing file) @@ -106,7 +106,7 @@ RunCVM is free and open-source, licensed under the Apache Licence, Version 2.0. 
## Project ambitions -- Support multiple network interfaces, when attached to a created (but not yet running) container using `docker network connect` +- Support multiple network interfaces, when attached to a created (but not yet running) container using `docker network connect` (COMPLETE - excluding IPv6) - Support running foreign-architecture VMs by using QEMU dynamic CPU emulation for the entire VM (instead of the approach used by [https://github.com/multiarch/qemu-user-static](https://github.com/multiarch/qemu-user-static) which uses dynamic CPU emulation for each individual binary) - Support for QEMU [microvm](https://qemu.readthedocs.io/en/latest/system/i386/microvm.html) or Amazon Firecracker - More natural console support with independent stdout and stderr channels for `docker run -it` @@ -185,11 +185,12 @@ In the below summary of RunCVM's current main features and limitations, [+] is u - Networking - [+] The default bridge network is supported - [+] Custom/user-defined networks specified using `--network` are supported, including Docker DNS resolution of container names and respect for custom network MTU + - [+] Multiple network interfaces - when attached via `docker run --network` or `docker network connect` (but only to a created and not yet running container) - are supported (including `scope=overlay` networks and those with multiple subnets) - [+] `--publish` (or `-p`) is supported - [+] `--dns`, `--dns-option`, `--dns-search` are supported - [+] `--ip` is supported - [+] `--hostname` (or `-h`) is supported - - [-] Only one network (that which is assigned during `docker run`) is supported per container. There is no support for `docker network connect`. 
+ - [-] `docker network connect` on a running container is not supported - [-] `--network=host` and `--network=container:name|id` are not supported - [-] IPv6 is not supported - Execution environment @@ -408,7 +409,7 @@ In more detail, the RunCVM runtime `create` process: The `runcvm-ctr-entrypoint`: - Is always launched as PID1 within the standard Docker container. - Saves the container's originally-intended entrypoint and command line, environment variables and network configuration to files inside `/.runcvm`. -- Creates a bridge for the primary container network interface, that will be joined to a VM network interface. +- Creates a bridge (acting as a hub) for each container network interface, to join that interface to a VM tap network interface. - Launches `virtiofsd` to serve the container's root filesystem. - Configures `/etc/resolv.conf` in the container. - Adds container firewall rules, launches `dnsmasq` and modifies `/vm/etc/resolv.conf` to proxy DNS requests from the VM to Docker's DNS. @@ -421,6 +422,7 @@ The `runcvm-init` process: The `runcvm-ctr-qemu` script: - Prepares disk backing files as specified by `--env=RUNCVM_DISKS=` +- Prepares network configuration as saved from the container (modifying the MAC address of each container interface) - Launches [QEMU](https://www.qemu.org/) with the required kernel, network interfaces, disks, display, and with a root filesystem mounted via virtiofs from the container and with `runcvm-vm-init` as the VM's init process. 
The `runcvm-vm-init` process: diff --git a/runcvm-scripts/runcvm-ctr-defaults b/runcvm-scripts/runcvm-ctr-defaults index da30292..9e14186 100755 --- a/runcvm-scripts/runcvm-ctr-defaults +++ b/runcvm-scripts/runcvm-ctr-defaults @@ -1,28 +1,27 @@ +#!/bin/bash + RUNCVM=/opt/runcvm RUNCVM_PATH=$RUNCVM/usr/sbin:$RUNCVM/usr/bin:$RUNCVM/sbin:$RUNCVM/bin:$RUNCVM/usr/lib/qemu -QEMU_IFUP="$RUNCVM/scripts/runcvm-ctr-qemu-ifup" -QEMU_IFDOWN="$RUNCVM/scripts/runcvm-ctr-qemu-ifdown" -QEMU_BRIDGE='q0' - QEMU_VIRTIOFSD_SOCKET=/run/.virtiofs.sock QEMU_GUEST_AGENT=/run/.qemu-guest-agent QEMU_MONITOR_SOCKET=/run/.qemu-monitor-socket clean_env() { - export -n \ - RUNCVM_BREAK RUNCVM_INIT \ - RUNCVM_RUNTIME_DEBUG RUNCVM_BIOS_DEBUG RUNCVM_KERNEL_DEBUG \ - RUNCVM_KERNEL RUNCVM_KERNEL_ROOT RUNCVM_KERNEL_APPEND RUNCVM_KERNEL_INITRAMFS_PATH RUNCVM_KERNEL_PATH RUNCVM_DISKS \ - RUNCVM_UIDGID RUNCVM_VM_MOUNTPOINT RUNCVM_TMPFS \ - RUNCVM_CPUS RUNCVM_MEM_SIZE RUNCVM_HAS_HOME + export -n \ + RUNCVM_BREAK RUNCVM_INIT \ + RUNCVM_RUNTIME_DEBUG RUNCVM_BIOS_DEBUG RUNCVM_KERNEL_DEBUG \ + RUNCVM_KERNEL RUNCVM_KERNEL_ROOT RUNCVM_KERNEL_APPEND RUNCVM_KERNEL_INITRAMFS_PATH RUNCVM_KERNEL_PATH RUNCVM_DISKS \ + RUNCVM_UIDGID RUNCVM_VM_MOUNTPOINT RUNCVM_TMPFS \ + RUNCVM_CPUS RUNCVM_MEM_SIZE RUNCVM_HAS_HOME - # May be set in VM by busybox init process - export -n USER + # May be set in VM by busybox init process + export -n USER } load_network() { - [ -s /.runcvm/network ] || return 1 - read -r DOCKER_IF DOCKER_IF_MAC DOCKER_IF_MTU DOCKER_IF_IP DOCKER_IF_IP_NETPREFIX DOCKER_IF_IP_GW /.runcvm/config # NOW LOAD DEFAULT ENV AND PATH . /opt/runcvm/scripts/runcvm-ctr-defaults && PATH="$RUNCVM_PATH" +# LOAD IP MANIPULATION FUNCTIONS +. 
$RUNCVM/scripts/runcvm-ip-functions + # SAVE PWD busybox pwd >/.runcvm/pwd # DEBUG if [[ "$RUNCVM_BREAK" =~ prenet ]]; then bash; fi -# SAVE NETWORKING CONFIG +# SAVE NETWORKING CONFIG AND CONFIGURE BRIDGES -cidr_to_netmask() { - local value=$(( 0xffffffff ^ ((1 << (32 - $1)) - 1) )) - echo "$(( (value >> 24) & 0xff )).$(( (value >> 16) & 0xff )).$(( (value >> 8) & 0xff )).$(( value & 0xff ))" -} +# Identify default gateway device and IP address +IFS=$'\n' read -d '' -r DOCKER_GW_IF DOCKER_GW_IF_IP <<< \ + "$(ip -json route show | jq -r '.[] | (select(.dst == "default") | .dev, .gateway)')" +# e.g. eth0 172.25.10.1 (substitution is quoted: bash older than 4.4 word-splits an unquoted here-string, collapsing both fields into DOCKER_GW_IF) -ip_prefix_to_network() { - local IFS i1 i2 i3 i4 m1 m2 m3 m4 - IFS=. read -r i1 i2 i3 i4 <<< "$1" +QEMU_BRIDGE_IP=169.254.1.1 +RUNCVM_DNS_IP=169.254.169.254 - local mask=$(cidr_to_netmask "$2") - IFS=. read -r m1 m2 m3 m4 <<< "$mask" +mkdir -p /.runcvm/network/devices - printf "%d.%d.%d.%d\n" "$((i1 & m1))" "$((i2 & m2))" "$((i3 & m3))" "$((i4 & m4))" -} +# Save non-link-scope non-default gateway routes for later restoration in the running VM (routes without a gateway are skipped, as jq would otherwise record the literal string "null"). +ip -json route show | jq -r '.[] | select(.scope != "link" and .dst != "default" and .gateway != null) | "\(.dst) \(.gateway) \(.dev) \(.prefsrc)"' >/.runcvm/network/routes -read -r DOCKER_IF DOCKER_IF_IP_GW <<< \ - $(ip -json route show | jq -j '.[] | (select(.dst == "default") | .dev, " ", .gateway)') -# e.g. eth0 172.25.10.1 172.25.10.0/24 +for if in $(ip -json link show | jq -r '.[] | .ifname') +do -read -r DOCKER_IF_IP DOCKER_IF_IP_NETPREFIX DOCKER_IF_MAC DOCKER_IF_MTU <<< \ - $(ip -json addr show eth0 | jq -j '.[0] | .addr_info[0].local, " ", .addr_info[0].prefixlen, " ", .address, " ", .mtu') -# e.g.
172.25.10.2 24 52:54:00:b7:0b:b6 1500 + [ "$if" = "lo" ] && continue -# Save container network parameters -echo "$DOCKER_IF $DOCKER_IF_MAC $DOCKER_IF_MTU $DOCKER_IF_IP $DOCKER_IF_IP_NETPREFIX $DOCKER_IF_IP_GW" >/.runcvm/network + IFS=$'\n' read -d '' -r DOCKER_IF_IP DOCKER_IF_IP_NETPREFIX DOCKER_IF_MAC DOCKER_IF_MTU <<< \ + "$(ip -json addr show "$if" | jq -r '.[0] | .addr_info[0].local, .addr_info[0].prefixlen, .address, .mtu')" + # e.g. 172.25.10.2 24 52:54:00:b7:0b:b6 1500 (substitution is quoted so bash older than 4.4 does not word-split the here-string) -QEMU_BRIDGE_IP=169.254.1.1 -RUNCVM_DNS_IP=169.254.169.254 + # Save container network parameters (ln -sf replaces any existing 'default' link instead of failing) + if [ "$if" = "$DOCKER_GW_IF" ]; then + echo "$if $DOCKER_IF_MAC $DOCKER_IF_MTU $DOCKER_IF_IP $DOCKER_IF_IP_NETPREFIX $DOCKER_GW_IF_IP" >/.runcvm/network/devices/$if + ln -sf "$if" /.runcvm/network/devices/default + else + echo "$if $DOCKER_IF_MAC $DOCKER_IF_MTU $DOCKER_IF_IP $DOCKER_IF_IP_NETPREFIX" >/.runcvm/network/devices/$if + fi + + # RECONFIGURE CONTAINER NETWORK + ip addr flush dev $if + + QEMU_BRIDGE="br-$if" + + # Create the container bridge + # See https://bugs.launchpad.net/neutron/+bug/1738659 + ip link add $QEMU_BRIDGE type bridge forward_delay 0 ageing 0 + # Add the original container interface to the bridge and bring it up. + ip link set dev "$if" master $QEMU_BRIDGE + ip link set dev "$if" up -# RECONFIGURE CONTAINER NETWORK -ip addr flush dev $DOCKER_IF + # Bring the bridge up. + ip link set dev $QEMU_BRIDGE up -# Create the container bridge -# See https://bugs.launchpad.net/neutron/+bug/1738659 -ip link add $QEMU_BRIDGE type bridge forward_delay 0 ageing 0 + # Restore network route via this bridge + DOCKER_NET=$(ip_prefix_to_network $DOCKER_IF_IP $DOCKER_IF_IP_NETPREFIX)/$DOCKER_IF_IP_NETPREFIX + ip route add $DOCKER_NET dev $QEMU_BRIDGE -# Add a private IP to the bridge. -# We need it so the bridge can receive traffic, but the IP won't ever see the light of day.
-ip addr add $QEMU_BRIDGE_IP dev $QEMU_BRIDGE + # If this interface is the default gateway interface, perform additional special steps. + if [ "$if" = "$DOCKER_GW_IF" ]; then -# Add the original container interface to the bridge and bring it up. -ip link set dev $DOCKER_IF master $QEMU_BRIDGE -ip link set dev $DOCKER_IF up + # Add a private IP to this bridge. + # We need it so the bridge can receive traffic, but the IP won't ever see the light of day. + ip addr add $QEMU_BRIDGE_IP dev $QEMU_BRIDGE -# Bring the bridge up! -ip link set dev $QEMU_BRIDGE up + # Restore default gateway route via this bridge. + ip route add default via $DOCKER_GW_IF_IP dev $QEMU_BRIDGE -# Restore routes needed for the bridge -DOCKER_NET=$(ip_prefix_to_network $DOCKER_IF_IP $DOCKER_IF_IP_NETPREFIX)/$DOCKER_IF_IP_NETPREFIX -ip route add $DOCKER_NET dev $QEMU_BRIDGE -ip route add default via $DOCKER_IF_IP_GW dev $QEMU_BRIDGE + # Accept DNS requests for $RUNCVM_DNS_IP; these will be passed to dnsmasq + XTABLES_LIBDIR=/opt/runcvm/lib64/usr/lib/xtables/ /opt/runcvm/sbin/xtables-nft-multi iptables -t nat -A PREROUTING -d $RUNCVM_DNS_IP/32 -p udp -m udp --dport 53 -j REDIRECT -# Accept DNS requests for $RUNCVM_DNS_IP; these will be passed to dnsmasq -XTABLES_LIBDIR=/opt/runcvm/lib64/usr/lib/xtables/ /opt/runcvm/sbin/xtables-nft-multi iptables -t nat -A PREROUTING -d $RUNCVM_DNS_IP/32 -p udp -m udp --dport 53 -j REDIRECT + # Match UDP port 53 traffic, outgoing via the QEMU bridge, from the bridge's own IP: + # -> Masquerade as if from the VM's IP. + # This allows outgoing DNS requests from the VM to be received by dnsmasq running in the container. 
+ XTABLES_LIBDIR=/opt/runcvm/lib64/usr/lib/xtables/ /opt/runcvm/sbin/xtables-nft-multi iptables -t nat -A POSTROUTING -o $QEMU_BRIDGE -s $QEMU_BRIDGE_IP/32 -p udp -m udp --sport 53 -j SNAT --to-source $DOCKER_IF_IP + XTABLES_LIBDIR=/opt/runcvm/lib64/usr/lib/xtables/ /opt/runcvm/sbin/xtables-nft-multi iptables -t nat -A POSTROUTING -o $QEMU_BRIDGE -s $QEMU_BRIDGE_IP/32 -p udp -m udp --dport 53 -j SNAT --to-source $DOCKER_IF_IP -# Match UDP port 53 traffic, outgoing via the QEMU bridge, from the bridge's own IP: -# -> Masquerade as if from the VM's IP. -# This allows outgoing DNS requests from the VM to be received by dnsmasq running in the container. -XTABLES_LIBDIR=/opt/runcvm/lib64/usr/lib/xtables/ /opt/runcvm/sbin/xtables-nft-multi iptables -t nat -A POSTROUTING -o $QEMU_BRIDGE -s $QEMU_BRIDGE_IP/32 -p udp -m udp --sport 53 -j SNAT --to-source $DOCKER_IF_IP -XTABLES_LIBDIR=/opt/runcvm/lib64/usr/lib/xtables/ /opt/runcvm/sbin/xtables-nft-multi iptables -t nat -A POSTROUTING -o $QEMU_BRIDGE -s $QEMU_BRIDGE_IP/32 -p udp -m udp --dport 53 -j SNAT --to-source $DOCKER_IF_IP + # Match TCP port 22 traffic, outgoing via the QEMU bridge, from the bridge's own IP: + # -> Masquerade it as if from the DNS_IP. + # This is necessary to allow SSH from within the container to the VM. + XTABLES_LIBDIR=/opt/runcvm/lib64/usr/lib/xtables/ /opt/runcvm/sbin/xtables-nft-multi iptables -t nat -A POSTROUTING -o $QEMU_BRIDGE -s $QEMU_BRIDGE_IP/32 -p tcp -m tcp --dport 22 -j SNAT --to-source $RUNCVM_DNS_IP + fi -# Match TCP port 22 traffic, outgoing via the QEMU bridge, from the bridge's own IP: -# -> Masquerade it as if from the DNS_IP. -# This is necessary to allow SSH from within the container to the VM. 
-XTABLES_LIBDIR=/opt/runcvm/lib64/usr/lib/xtables/ /opt/runcvm/sbin/xtables-nft-multi iptables -t nat -A POSTROUTING -o $QEMU_BRIDGE -s $QEMU_BRIDGE_IP/32 -p tcp -m tcp --dport 22 -j SNAT --to-source $RUNCVM_DNS_IP +done # FIXME: Bind-mount /etc/resolv.conf as well as /vm/etc/resolv.conf to prevent them showing in 'docker diff' cat /vm/etc/resolv.conf >/etc/resolv.conf diff --git a/runcvm-scripts/runcvm-ctr-qemu b/runcvm-scripts/runcvm-ctr-qemu index c05a059..206e33a 100755 --- a/runcvm-scripts/runcvm-ctr-qemu +++ b/runcvm-scripts/runcvm-ctr-qemu @@ -6,8 +6,8 @@ # Load defaults after exports (so that PATH is overridden) . /opt/runcvm/scripts/runcvm-ctr-defaults && PATH="$RUNCVM_PATH" -# Load network config -load_network +QEMU_IFUP="$RUNCVM/scripts/runcvm-ctr-qemu-ifup" +QEMU_IFDOWN="$RUNCVM/scripts/runcvm-ctr-qemu-ifdown" # BREAK="break=mountroot" PANIC="panic=-1" @@ -17,8 +17,6 @@ INIT="init=/opt/runcvm/scripts/runcvm-vm-init" SERIAL="mon:stdio" # SERIAL="stdio" -MAC=$(busybox sed -r 's/^..:..:../52:54:00/' <<<$DOCKER_IF_MAC) - error() { echo "$1" >&2 exit 1 @@ -58,7 +56,7 @@ do_disk() { mke2fs -F -t "$fs" "$src" >&2 || error "Error: disk spec '$spec' invalid: mke2fs on '$src' with fs '$fs' failed" fi - UUID=$(blkid -o value "$src" | head -n 1) + local UUID=$(blkid -o value "$src" | head -n 1) mkdir -p "$RUNCVM_VM_MOUNTPOINT/$dst" >&2 echo "UUID=$UUID $dst $fs defaults,noatime 0 0" >>/.runcvm/fstab DISKS+=("-drive file=$src,format=raw,if=virtio,media=disk,cache=directsync,aio=native") @@ -67,17 +65,44 @@ do_disk() { # Argument e.g. 
/disk1,/home,ext4,5G;/disk2,/var,ext4,1G do_disks() { local IFS=';' + local disk for disk in $1 do do_disk "$disk" done } +do_networks() { + local id=0 ifpath if mac + local DOCKER_IF DOCKER_IF_MAC DOCKER_IF_MTU DOCKER_IF_IP DOCKER_IF_IP_NETPREFIX DOCKER_IF_IP_GW + + for ifpath in /.runcvm/network/devices/* + do + if=$(busybox basename "$ifpath") + + [ "$if" = "default" ] && continue + + load_network "$if" + + mac=$(busybox sed -r 's/^..:..:../52:54:00/' <<<$DOCKER_IF_MAC) + + IFACES+=( + -netdev tap,id=qemu$id,ifname=tap-$DOCKER_IF,script=$QEMU_IFUP,downscript=$QEMU_IFDOWN + -device virtio-net-pci,netdev=qemu$id,mac=$mac,rombar=$id + ) + + id=$((id+1)) + done +} + DISKS=() if [ -n "$RUNCVM_DISKS" ]; then do_disks "$RUNCVM_DISKS" fi +IFACES=() +do_networks + if [ -n "$RUNCVM_TMPFS" ]; then echo "$RUNCVM_TMPFS" >>/.runcvm/fstab fi @@ -132,7 +157,7 @@ ARGS=( -numa node,memdev=mem -smp $RUNCVM_CPUS # Configure host/container tap device with PXE roms disabled - -netdev tap,id=qemu0,script=$QEMU_IFUP,downscript=$QEMU_IFDOWN -device virtio-net-pci,netdev=qemu0,mac=$MAC,rombar=0 + ${IFACES[@]} -no-reboot ${DISKS[@]} -action panic=none -action reboot=shutdown diff --git a/runcvm-scripts/runcvm-ctr-qemu-ifup b/runcvm-scripts/runcvm-ctr-qemu-ifup index 6d85f30..53ebfc6 100755 --- a/runcvm-scripts/runcvm-ctr-qemu-ifup +++ b/runcvm-scripts/runcvm-ctr-qemu-ifup @@ -2,7 +2,12 @@ . 
/opt/runcvm/scripts/runcvm-ctr-defaults && PATH="$RUNCVM_PATH" -load_network -ip link set dev "$1" up mtu "${DOCKER_IF_MTU:=1500}" master "$QEMU_BRIDGE" +tap="$1" +if="$(busybox sed 's/tap-//' <<<$tap)" +bri="$(busybox sed 's/tap-/br-/' <<<$tap)" + +load_network "$if" + +ip link set dev "$tap" up mtu "${DOCKER_IF_MTU:=1500}" master "$bri" exit 0 \ No newline at end of file diff --git a/runcvm-scripts/runcvm-ip-functions b/runcvm-scripts/runcvm-ip-functions new file mode 100644 index 0000000..628a4c8 --- /dev/null +++ b/runcvm-scripts/runcvm-ip-functions @@ -0,0 +1,40 @@ +#!/bin/bash + +cidr_to_int() { + echo "$(( 0xffffffff ^ ((1 << (32 - $1)) - 1) ))" +} + +int_to_ip() { + local value="$1" + echo "$(( ($1 >> 24) & 0xff )).$(( ($1 >> 16) & 0xff )).$(( ($1 >> 8) & 0xff )).$(( $1 & 0xff ))" +} + +cidr_to_netmask() { + local value=$(cidr_to_int "$1") + int_to_ip "$value" +} + +ip_prefix_to_network() { + local IFS i1 i2 i3 i4 m1 m2 m3 m4 + IFS=. read -r i1 i2 i3 i4 <<< "$1" + + local mask=$(cidr_to_netmask "$2") + IFS=. read -r m1 m2 m3 m4 <<< "$mask" + + printf "%d.%d.%d.%d\n" "$((i1 & m1))" "$((i2 & m2))" "$((i3 & m3))" "$((i4 & m4))" +} + +cidr_to_bcastmask() { + local value=$(( (1 << 32) - $(cidr_to_int "$1") - 1 )) + int_to_ip "$value" +} + +ip_prefix_to_bcast() { + local IFS i1 i2 i3 i4 m1 m2 m3 m4 + IFS=. read -r i1 i2 i3 i4 <<< "$1" + + local mask=$(cidr_to_bcastmask "$2") + IFS=. 
read -r m1 m2 m3 m4 <<< "$mask" + + printf "%d.%d.%d.%d\n" "$((i1 | m1))" "$((i2 | m2))" "$((i3 | m3))" "$((i4 | m4))" +} \ No newline at end of file diff --git a/runcvm-scripts/runcvm-runtime b/runcvm-scripts/runcvm-runtime index 0c24be2..c093117 100755 --- a/runcvm-scripts/runcvm-runtime +++ b/runcvm-scripts/runcvm-runtime @@ -349,15 +349,15 @@ if [ "$COMMAND" = "create" ]; then case "$RUNCVM_KERNEL_ID" in debian) RUNCVM_KERNEL_OS_KERNEL_PATH="/vmlinuz" RUNCVM_KERNEL_OS_INITRAMFS_PATH="/initrd.img" - RUNCVM_KERNEL_ROOT="rootfstype=virtiofs root=myfs noresume nomodeset" + RUNCVM_KERNEL_ROOT="rootfstype=virtiofs root=myfs noresume nomodeset net.ifnames=1" ;; ubuntu) RUNCVM_KERNEL_OS_KERNEL_PATH="/boot/vmlinuz" RUNCVM_KERNEL_OS_INITRAMFS_PATH="/boot/initrd.img" - RUNCVM_KERNEL_ROOT="rootfstype=virtiofs root=myfs noresume nomodeset" + RUNCVM_KERNEL_ROOT="rootfstype=virtiofs root=myfs noresume nomodeset net.ifnames=1" ;; ol) RUNCVM_KERNEL_OS_KERNEL_PATH="/boot/vmlinuz" RUNCVM_KERNEL_OS_INITRAMFS_PATH="/boot/initramfs" - RUNCVM_KERNEL_ROOT="root=virtiofs:myfs noresume nomodeset" + RUNCVM_KERNEL_ROOT="root=virtiofs:myfs noresume nomodeset net.ifnames=1" ;; alpine) RUNCVM_KERNEL_OS_KERNEL_PATH="/boot/vmlinuz-virt" RUNCVM_KERNEL_OS_INITRAMFS_PATH="/boot/initramfs-virt" diff --git a/runcvm-scripts/runcvm-vm-init b/runcvm-scripts/runcvm-vm-init index feff53e..aac45b0 100755 --- a/runcvm-scripts/runcvm-vm-init +++ b/runcvm-scripts/runcvm-vm-init @@ -33,14 +33,46 @@ done # Bring up local interface ip link set lo up -# Rename the first deterministically named interface to eth0 and configure it -load_network -ip link set $IF name $DOCKER_IF -ip addr add $DOCKER_IF_IP/$DOCKER_IF_IP_NETPREFIX dev $DOCKER_IF -ip link set $DOCKER_IF up mtu "${DOCKER_IF_MTU:=1500}" - -# Establish default gateway -ip route add default via $DOCKER_IF_IP_GW +# Identify each interface by MAC address, then give each a temporary name +# (as we might ultimately need to rename e.g. 
eth0->eth1 and eth1->eth0). +for ifpath in /.runcvm/network/devices/* +do + if=$(busybox basename "$ifpath") + + [ "$if" = "default" ] && continue + + load_network "$if" + + # Locate the actual network device by its MAC address. + mac=$(busybox sed -r 's/^..:..:../52:54:00/' <<<$DOCKER_IF_MAC) + device=$(ip -json link show | jq -r --arg mac "$mac" '.[] | select(.address == $mac) | .ifname') + + ip link set $device name $DOCKER_IF-tmp +done + +# Configure, rename and bring up all interfaces. +for ifpath in /.runcvm/network/devices/* +do + if=$(busybox basename "$ifpath") + + [ "$if" = "default" ] && continue + + load_network "$if" + + ip link set $DOCKER_IF-tmp name $DOCKER_IF + ip addr add $DOCKER_IF_IP/$DOCKER_IF_IP_NETPREFIX broadcast + dev $DOCKER_IF + ip link set $DOCKER_IF up mtu "${DOCKER_IF_MTU:=1500}" + + # If this is the default gateway interface, establish the default gateway + [ -n "$DOCKER_IF_IP_GW" ] && ip route add default via $DOCKER_IF_IP_GW +done + +# Read and install any supplementary routes. +while read -r DOCKER_RT_NET DOCKER_RT_GW DOCKER_RT_DEV DOCKER_RT_PREFSRC +do + [ -n "$DOCKER_RT_NET" ] && [ -n "$DOCKER_RT_GW" ] && [ -n "$DOCKER_RT_DEV" ] && \ + ip route add "$DOCKER_RT_NET" via "$DOCKER_RT_GW" dev "$DOCKER_RT_DEV" +done