From 4d1b1e7c41d04adad4c93d4a01c43551e100c0b1 Mon Sep 17 00:00:00 2001
From: Yin Congmin
Date: Tue, 3 Jan 2023 16:51:04 +0800
Subject: [PATCH 01/10] control/discovery: add discovery controller

The discovery controller implements the basic functionality. Use the
command "python3 -m control.discovery" to start the discovery
controller. Clients can use the command "nvme discover -t tcp -a ip
-s port" to get log pages. The configuration is in the [discovery]
section of ceph-nvmeof.conf.

feature: https://github.com/ceph/ceph-nvmeof/issues/108

Signed-off-by: Yin Congmin
---
 ceph-nvmeof.conf     |    6 +
 control/discovery.py | 1077 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 1083 insertions(+)
 create mode 100644 control/discovery.py

diff --git a/ceph-nvmeof.conf b/ceph-nvmeof.conf
index 078e3b9b..c895d6fc 100644
--- a/ceph-nvmeof.conf
+++ b/ceph-nvmeof.conf
@@ -19,6 +19,12 @@ state_update_interval_sec = 5
 #max_controller_id = 65519
 enable_discovery_controller = false
 
+[discovery]
+
+addr = 0.0.0.0
+port = 8009
+debug = 20
+
 [ceph]
 pool = rbd
 config_file = /etc/ceph/ceph.conf
diff --git a/control/discovery.py b/control/discovery.py
new file mode 100644
index 00000000..ff875720
--- /dev/null
+++ b/control/discovery.py
@@ -0,0 +1,1077 @@
+#
+# Copyright (c) 2021 International Business Machines
+# All rights reserved.
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+# Authors: congmin.yin@intel.com
+#
+
+import argparse
+import grpc
+import json
+import logging
+from .config import GatewayConfig
+from .state import GatewayState, LocalGatewayState, OmapGatewayState, GatewayStateHandler
+
+import rados
+from typing import Dict, Optional
+
+import socket
+import threading
+import time
+import enum
+import uuid
+import struct
+import selectors
+from dataclasses import dataclass, field
+from ctypes import Structure, LittleEndianStructure, c_bool, c_ubyte, c_uint8, c_uint16, c_uint32, c_uint64, c_float
+
+# NVMe tcp pdu type
+class NVME_TCP_PDU(enum.IntFlag):
+    ICREQ = 0x0
+    ICRESP = 0x1
+    H2C_TERM = 0x2
+    C2H_TERM = 0x3
+    CMD = 0x4
+    RSP = 0x5
+    H2C_DATA = 0x6
+    C2H_DATA = 0x7
+    TCP_R2T = 0x9
+
+# NVMe tcp opcode
+class NVME_TCP_OPC(enum.IntFlag):
+    DELETE_SQ = 0x0
+    CREATE_SQ = 0x1
+    GET_LOG_PAGE = 0x2
+    DELETE_CQ = 0x4
+    CREATE_CQ = 0x5
+    IDENTIFY = 0x6
+    ABORT = 0x8
+    SET_FEATURES = 0x9
+    GET_FEATURES = 0xa
+    ASYNC_EVE_REQ = 0xc
+    NS_MGMT = 0xd
+    FW_COMMIT = 0x10
+    FW_IMG_DOWNLOAD = 0x11
+    NS_ATTACH = 0x15
+    KEEP_ALIVE = 0x18
+    FABRIC_TYPE = 0x7F
+
+# NVMe tcp fabric command type
+class NVME_TCP_FCTYPE(enum.IntFlag):
+    PROP_SET = 0x0
+    CONNECT = 0x1
+    PROP_GET = 0x4
+    AUTH_SEND = 0x5
+    AUTH_RECV = 0x6
+    DISCONNECT = 0x8
+
+# NVMe controller register space offsets
+class NVME_CTL(enum.IntFlag):
+    CAPABILITIES = 0x0
+    VERSION = 0x08
+    CONFIGURATION = 0x14
+    STATUS = 0x1c
+
+
+# NVM subsystem types
+class NVMF_SUBTYPE(enum.IntFlag):
+    # Discovery type for NVM subsystem
+    DISCOVERY = 0x1
+    # NVMe type for NVM subsystem
+    NVME = 0x2
+
+# NVMe over Fabrics transport types
+class TRANSPORT_TYPES(enum.IntFlag):
+    RDMA = 0x1
+    FC = 0x2
+    TCP = 0x3
+    INTRA_HOST = 0xfe
+
+# Address family types
+class ADRFAM_TYPES(enum.IntFlag):
+    ipv4 = 0x1
+    ipv6 = 0x2
+    ib = 0x3
+    fc = 0x4
+    intra_host = 0xfe
+
+# Transport requirement, secure channel requirements
+# Connections shall be made over a fabric secure channel
+class NVMF_TREQ_SECURE_CHANNEL(enum.IntFlag):
+    NOT_SPECIFIED = 0x0
+    REQUIRED = 0x1
+    NOT_REQUIRED = 0x2
+
+# maximum number of connections
+MAX_CONNECTION = 10240
+
+# NVMe tcp package length, refer: MTU = 1500 bytes
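+# For reference: one discovery log record (DiscoveryLogEntry below) is
+# also 1024 bytes, so a 1024-byte unit plus the 24-byte PDU headers and
+# TCP/IP overhead still fits within a single 1500-byte MTU frame.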
+NVME_TCP_PDU_UNIT = 1024 + +# Max SQ head pointer +SQ_HEAD_MAX = 128 + +@dataclass +class Connection: + """Data used multiple times in each connection.""" + + connection: socket.socket = None + allow_listeners: list = field(default_factory=list) + log_page: bytearray = field(default_factory=bytearray) + recv_buffer: bytearray = field(default_factory=bytearray) + nvmeof_connect_data_hostid: str = field(default_factory=str) + nvmeof_connect_data_cntlid: int = 0 + nvmeof_connect_data_subnqn: str = field(default_factory=str) + nvmeof_connect_data_hostnqn: str = field(default_factory=str) + sq_head_ptr: int = 0 + unsent_log_page_len: int = 0 + property_data: str = field(default_factory=str) + property_set: bool = False + shutdown_now: bool = False + controller_id: uuid = None + gen_cnt: int = 0 + recv_async: bool = False + async_cmd_id: int = 0 + keep_alive_time: float = 0.0 + keep_alive_timeout: int = 0 + +class AutoSerializableStructure(LittleEndianStructure): + def __add__(self, other): + if isinstance(other, LittleEndianStructure): + return bytes(self) + bytes(other) + elif isinstance(other, bytes): + return bytes(self) + other + else: + raise ValueError("error message format.") + +class Pdu(AutoSerializableStructure): + _fields_ = [ + ("type", c_uint8), + ("specical_flag", c_uint8), + ("header_length", c_uint8), + ("data_offset", c_uint8), + ("packet_length", c_uint32), + ] + +class ICResp(AutoSerializableStructure): + _fields_ = [ + # pdu version format + ("version_format", c_uint16), + # controller Pdu data alignment + ("data_alignment", c_uint8), + # digest types enabled + ("digest_types", c_uint8), + # Maximum data capsules per r2t supported + ("maximum_data_capsules", c_uint32) + ] + +class CqeConnect(AutoSerializableStructure): + _fields_ = [ + ("controller_id", c_uint16), + ("authentication", c_uint16), + ("reserved", c_uint32), + ("sq_head_ptr", c_uint16), + ("sq_id", c_uint16), + ("cmd_id", c_uint16), + ("status", c_uint16) + ] + +class CqePropertyGetSet(AutoSerializableStructure): + _fields_ = [ + # property data for property get, reserved for property set + ("property_data", c_ubyte * 8), + ("sq_head_ptr", c_uint16), + ("sq_id", c_uint16), + ("cmd_id", c_uint16), + ("status", c_uint16) + ] + +class NVMeTcpDataPdu(AutoSerializableStructure): + _fields_ = [ + ("cmd_id", c_uint16), + ("transfer_tag", c_uint16), + ("data_offset", c_uint32), + ("data_length", c_uint32), + ("reserved", c_uint32) + ] + +class NVMeIdentify(AutoSerializableStructure): + _fields_ = [ + # skip some fields, include VID, SSVID, SN, MN + ("todo_fields1", c_ubyte * 64), + ("firmware_revision", c_ubyte * 8), + # RAB, IEEE, CMIC + ("todo_fields2", c_ubyte * 5), + # maximum data transfer size + ("mdts", c_uint8), + ("controller_id", c_uint16), + ("version", c_uint8 * 4), + # RTD3R, RTD3E + ("todo_fields3", c_ubyte * 8), + # optional asynchronous events supported + ("oaes", c_ubyte * 4), + # CTRATT, RRLS, CNTRLTYPE, FGUID, NVMe Management Interface, OACS, ACL + ("todo_fields4", c_ubyte * 163), + # asynchronous events request limit + ("aerl", c_uint8), + ("firmware_updates", c_uint8), + # log page attributes + ("lpa", c_uint8), + # error log page entries(ELPE) + ("elpe", c_uint8), + # NPSS, AVSCC, APSTA, WCTEMP, CCTEMP, MTFA, HMPRE, HMIN, TNVMCAP... + # TODO: keep alive support - timer value(KAS)? 
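+        # Note: per NVM Express 1.4, the Keep Alive Support (KAS) value is
+        # the 16-bit field at bytes 321:320 of the Identify Controller data,
+        # so it falls inside the todo_fields5 block below.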
+ ("todo_fields5", c_ubyte * 251), + # maximum outstanding commands + ("max_cmd", c_uint16), + # number of namespace, optional NVM command support + ("todo_fields6", c_uint8 * 6), + # fused operation support + ("fused_operation", c_uint16), + # FNA, VWC, AWUN, AWUPF, NVSCC, NWPC + ("todo_fields7", c_uint8 * 8), + # atomic compare & write unit + ("acwu", c_uint16), + ("reserved1", c_uint16), + # SGL support + ("sgls", c_uint8 * 4), + # maxinum number of allowed namespaces + ("mnan", c_uint32), + ("reserved2", c_ubyte * 224), + ("subnqn", c_ubyte * 256), + ("reserved3", c_ubyte * 768), + ("nvmeof_attributes", c_ubyte * 256), + ("power_state_attributes", c_ubyte * 1024), + ("vendor_specific", c_ubyte * 1024) + ] + +# for set feature, keep alive and async +class CqeNVMe(AutoSerializableStructure): + _fields_ = [ + ("dword0", c_uint32), + ("dword1", c_uint32), + ("sq_head_ptr", c_uint16), + ("sq_id", c_uint16), + ("cmd_id", c_uint16), + ("status", c_uint16) + ] + +class NVMeGetLogPage(AutoSerializableStructure): + _fields_ = [ + # generation counter + ("genctr", c_uint64), + # number of records + ("numrec", c_uint64), + #record format + ("recfmt", c_uint16), + ("reserved", c_ubyte * 1006) + ] + +class DiscoveryLogEntry(AutoSerializableStructure): + _fields_ = [ + ("trtype", c_uint8), + ("adrfam", c_uint8), + ("subtype", c_uint8), + ("treq", c_uint8), + ("port_id", c_uint16), + ("controller_id", c_uint16), + # admin max SQ size + ("asqsz", c_uint16), + ("reserved1", c_ubyte * 22), + ("trsvcid", c_ubyte * 32), + ("reserved2", c_ubyte * 192), + ("subnqn", c_ubyte * 256), + ("traddr", c_ubyte * 256), + # Transport specific address subtype + ("tsas", c_ubyte * 256) + ] + +class DiscoveryService: + """Implements discovery controller. + + Response discover request from initiator. 
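+
+    The controller accepts NVMe/TCP connections from initiators and answers
+    the ICReq, Fabrics Connect, Property Get/Set, Identify, Get Log Page,
+    Keep Alive and Asynchronous Event commands needed for discovery, building
+    the discovery log page from the listener, host and subsystem records
+    stored in the OMAP object.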
+ + Instance attributes: + version: Discovery controller version + config: Basic gateway parameters + logger: Logger instance to track discovery controller events + omap_name: OMAP object name + ioctx: I/O context which allows OMAP access + discovery_addr: Discovery controller addr which allows initiator send command + discovery_port: Discovery controller's listening port + """ + + BDEV_PREFIX = "bdev_" + NAMESPACE_PREFIX = "namespace_" + SUBSYSTEM_PREFIX = "subsystem_" + HOST_PREFIX = "host_" + LISTENER_PREFIX = "listener_" + + def __init__(self, config): + self.version = 1 + self.config = config + self.lock = threading.Lock() + + self.logger = logging.getLogger(__name__) + log_level = self.config.get("discovery", "debug") + self.logger.setLevel(level=int(log_level)) + + gateway_group = self.config.get("gateway", "group") + self.omap_name = f"nvmeof.{gateway_group}.state" \ + if gateway_group else "nvmeof.state" + self.logger.info(f"log pages info from omap: {self.omap_name}") + + ceph_pool = self.config.get("ceph", "pool") + ceph_conf = self.config.get("ceph", "config_file") + conn = rados.Rados(conffile=ceph_conf) + conn.connect() + self.ioctx = conn.open_ioctx(ceph_pool) + + self.discovery_addr = self.config.get("discovery", "addr") + self.discovery_port = self.config.get("discovery", "port") + if self.discovery_addr == '' or self.discovery_port == '': + self.logger.error("discovery addr/port are empty.") + assert 0 + self.logger.info(f"discovery addr: {self.discovery_addr} port: {self.discovery_port}") + + self.conn_vals = {} + self.connection_counter = 1 + self.selector = selectors.DefaultSelector() + + def _read_all(self) -> Dict[str, str]: + """Reads OMAP and returns dict of all keys and values.""" + + with rados.ReadOpCtx() as read_op: + iter, _ = self.ioctx.get_omap_vals(read_op, "", "", -1) + self.ioctx.operate_read_op(read_op, self.omap_name) + omap_dict = dict(iter) + return omap_dict + + def _get_vals(self, omap_dict, prefix): + """Read values from the OMAP dict.""" + + return [json.loads(val.decode('utf-8')) for (key, val) in omap_dict.items() + if key.startswith(prefix)] + + def reply_initialize(self, conn): + """Reply initialize request.""" + + self.logger.debug("handle ICreq.") + pdu_reply = Pdu() + pdu_reply.type = NVME_TCP_PDU.ICRESP + pdu_reply.header_length = 128 + pdu_reply.packet_length = 128 + + icresp_reply = ICResp() + icresp_reply.maximum_data_capsules = 131072 + + try: + conn.sendall(pdu_reply + icresp_reply + bytes(112)) + except BrokenPipeError: + self.logger.error("client disconnected unexpectedly.") + return -1 + self.logger.debug("reply initialize connection request.") + return 0 + + def reply_fc_cmd_connect(self, conn, data, cmd_id): + """Reply connect request.""" + + self.logger.debug("handle connect request.") + self_conn = self.conn_vals[conn.fileno()] + hf_nvmeof_cmd_connect_rsvd1 = struct.unpack_from('<19B', data, 13) + SIGL1 = struct.unpack_from('> 6) & 0x3 + if shutdown_notification == 0: + if self_conn.property_set is True: + # controller status: ready + property_get.property_data = (c_ubyte * 8)(0x01, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) + else: + property_get.property_data = (c_ubyte * 8)(0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) + else: + # here shutdown_notification should be 0x1 + property_get.property_data = (c_ubyte * 8)(0x09, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) + self_conn.shutdown_now = True + elif NVME_CTL(nvmeof_prop_get_set_offset) == NVME_CTL.VERSION: + # nvme version: 1.3 + property_get.property_data = 
(c_ubyte * 8)(0x00, 0x03, \ + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00) + else: + self.logger.error("unsupported type for property getting.") + property_get.sq_head_ptr = self_conn.sq_head_ptr + property_get.cmd_id = cmd_id + + try: + conn.sendall(pdu_reply + property_get) + except BrokenPipeError: + self.logger.error("client disconnected unexpectedly.") + return -1 + self.logger.debug("reply property get request.") + return 0 + + def reply_fc_cmd_prop_set(self, conn, data, cmd_id): + """Reply property set request.""" + + self.logger.debug("handle property set request.") + self_conn = self.conn_vals[conn.fileno()] + nvmeof_prop_get_set_rsvd0 = struct.unpack_from('<35B', data, 13) + nvmeof_prop_get_set_attrib = struct.unpack_from('<1B', data, 48)[0] + nvmeof_prop_get_set_rsvd1 = struct.unpack_from('<3B', data, 49) + nvmeof_prop_get_set_offset = struct.unpack_from('> 8) & 0x1F + get_logpage_lsi = nvme_get_logpage_dword11 >> 16 + get_logpage_uid_idx = nvme_get_logpage_dword14 & 0x3F + + if get_logpage_lid != 0x70: + self.logger.error("request type error, not discovery request.") + return -1 + + # Filter listeners based on host access permissions + allow_listeners = self_conn.allow_listeners + if len(allow_listeners) == 0: + for host in hosts: + a = host["host_nqn"] + if host["host_nqn"] == '*' or host["host_nqn"] == hostnqn: + for listener in listeners: + # TODO: It is better to change nqn in the "listener" + # to subsystem_nqn to avoid confusion + if host["subsystem_nqn"] == listener["nqn"]: + allow_listeners += [listener,] + self_conn.allow_listeners = allow_listeners + + # Prepare all log page data segments + if self_conn.unsent_log_page_len == 0 and nvme_data_len > 16: + self_conn.unsent_log_page_len = 1024 * (len(allow_listeners) + 1) + self_conn.log_page = bytearray(self_conn.unsent_log_page_len) + + nvme_get_log_page_reply = NVMeGetLogPage() + nvme_get_log_page_reply.genctr = self_conn.gen_cnt + nvme_get_log_page_reply.numrec = len(allow_listeners) + self_conn.log_page[0:1024] = nvme_get_log_page_reply + + # log entries + log_entry_counter = 0 + while log_entry_counter < len(allow_listeners): + log_entry = DiscoveryLogEntry() + trtype = TRANSPORT_TYPES[allow_listeners[log_entry_counter]["trtype"]] + if trtype is None: + self.logger.error("unsupported transport type") + else: + log_entry.trtype = trtype + adrfam = ADRFAM_TYPES[allow_listeners[log_entry_counter]["adrfam"]] + if adrfam is None: + self.logger.error("unsupported adress family") + else: + log_entry.adrfam = adrfam + log_entry.subtype = NVMF_SUBTYPE.NVME + log_entry.treq = NVMF_TREQ_SECURE_CHANNEL.NOT_REQUIRED + log_entry.port_id = log_entry_counter + log_entry.controller_id = 0xffff + log_entry.asqsz = 128 + # transport service indentifier + log_entry.trsvcid = (c_ubyte * 32)(*[c_ubyte(x) for x \ + in allow_listeners[log_entry_counter]["trsvcid"].encode()]) + log_entry.trsvcid[len(allow_listeners[log_entry_counter]["trsvcid"]):] = \ + [c_ubyte(0x20)] * (32 - len(allow_listeners[log_entry_counter]["trsvcid"])) + # NVM subsystem qualified name + log_entry.subnqn = (c_ubyte * 256)(*[c_ubyte(x) for x \ + in allow_listeners[log_entry_counter]["nqn"].encode()]) + log_entry.subnqn[len(allow_listeners[log_entry_counter]["nqn"]):] = \ + [c_ubyte(0x00)] * (256 - len(allow_listeners[log_entry_counter]["nqn"])) + # Transport address + log_entry.traddr = (c_ubyte * 256)(*[c_ubyte(x) for x \ + in allow_listeners[log_entry_counter]["traddr"].encode()]) + log_entry.traddr[len(allow_listeners[log_entry_counter]["traddr"]):] = \ + [c_ubyte(0x20)] * 
(256 - len(allow_listeners[log_entry_counter]["traddr"])) + + self_conn.log_page[1024*(log_entry_counter+1): \ + 1024*(log_entry_counter+2)] = log_entry + log_entry_counter += 1 + else: + self.logger.debug("in the process of sending log pages...") + + reply = b'' + pdu_and_nvme_pdu_len = 8 + 16 + pdu_reply = Pdu() + pdu_reply.type = NVME_TCP_PDU.C2H_DATA + pdu_reply.specical_flag = 0x0c + pdu_reply.header_length = pdu_and_nvme_pdu_len + pdu_reply.data_offset = pdu_and_nvme_pdu_len + pdu_reply.packet_length = pdu_and_nvme_pdu_len + nvme_data_len + + nvme_tcp_data_pdu = NVMeTcpDataPdu() + nvme_tcp_data_pdu.cmd_id = cmd_id + nvme_tcp_data_pdu.data_length = nvme_data_len + + # reply based on the received get log page request packet(length) + if nvme_data_len == 16: + # nvme cli version: 1.x + nvme_get_log_page_reply = NVMeGetLogPage() + nvme_get_log_page_reply.genctr = self_conn.gen_cnt + nvme_get_log_page_reply.numrec = len(listeners) + + reply = pdu_reply + nvme_tcp_data_pdu + bytes(nvme_get_log_page_reply)[:16] + elif nvme_data_len == 1024 and nvme_logpage_offset == 0: + # nvme cli version: 2.x + nvme_get_log_page_reply = NVMeGetLogPage() + nvme_get_log_page_reply.genctr = self_conn.gen_cnt + nvme_get_log_page_reply.numrec = len(listeners) + + reply = pdu_reply + nvme_tcp_data_pdu+ nvme_get_log_page_reply + elif nvme_data_len % 1024 == 0: + # reply log pages + reply = pdu_reply + nvme_tcp_data_pdu + \ + self_conn.log_page[nvme_logpage_offset:nvme_logpage_offset+nvme_data_len] + self_conn.unsent_log_page_len -= nvme_data_len + if self_conn.unsent_log_page_len == 0: + self_conn.log_page = b'' + self_conn.property_set = False + self_conn.allow_listeners = [] + else: + self.logger.error("request log page lenghth error. It need to be 16 or n*1024") + return -1 + try: + conn.sendall(reply) + except BrokenPipeError: + self.logger.error("client disconnected unexpectedly.") + return -1 + self.logger.debug("reply get log page request.") + return 0 + + def reply_keep_alive(self, conn, data, cmd_id): + """Reply keep alive request.""" + + self.logger.debug("handle keep alive request.") + self_conn = self.conn_vals[conn.fileno()] + nvme_sgl = struct.unpack_from('<16B', data, 32) + nvme_sgl_desc_type = nvme_sgl[15] & 0xF0 + nvme_sgl_desc_sub_type = nvme_sgl[15] & 0x0F + nvme_keep_alive_dword10 = struct.unpack_from('= \ + self.conn_vals[key].keep_alive_timeout / 1000: + # Adding locks to prevent another thread from processing sudden requests. + # Is there a better way? 
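+                # Note: this keep-alive scan runs in its own thread while the
+                # selector loop services requests, so the lock is what prevents
+                # both sides from touching a Connection while it is being closed.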
+ with self.lock: + self.logger.debug(f"discover request from {self.conn_vals[key].connection} timeout.") + self.selector.unregister(self.conn_vals[key].connection) + self.conn_vals[key].connection.close() + del self.conn_vals[key] + + time.sleep(1) + + def reply_fabric_request(self, conn, data, cmd_id): + """Reply fabric request.""" + + fabric_type = struct.unpack_from(' SQ_HEAD_MAX: + self_conn.sq_head_ptr = 1 + + if NVME_TCP_PDU(pdu_type) == NVME_TCP_PDU.ICREQ: + err = self.reply_initialize(conn) + + elif NVME_TCP_PDU(pdu_type) == NVME_TCP_PDU.CMD: + CMD1 = struct.unpack_from(' Date: Tue, 5 Sep 2023 17:02:33 +0800 Subject: [PATCH 02/10] update README for discovery service add the method of starting discovery service Signed-off-by: Yin Congmin --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index d3e6d3b9..f8b8441c 100644 --- a/README.md +++ b/README.md @@ -238,6 +238,15 @@ Once the NVMe-oF target is Hello NVMe-oF ``` +### Start Discovery Service(Optional) + +The discovery service can provide all the targets that the current user can access, and these target information is sourced from ceph omap. These targets may be running or just a record. + +1. Start Discovery Service + ```bash + $ python3 -m control.discovery + ``` + ## Advanced ### Configuration From deffe91c784fcc1515b1260ef455f7aa1f82670a Mon Sep 17 00:00:00 2001 From: Alexander Indenbaum Date: Fri, 25 Aug 2023 09:23:04 +0300 Subject: [PATCH 03/10] initial discovery - container - CI test Signed-off-by: Alexander Indenbaum --- .env | 8 +- .github/workflows/build-container.yml | 196 +++++++++++++++++++++++--- Makefile | 4 +- ceph-nvmeof.conf | 1 - docker-compose.yaml | 17 ++- mk/demo.mk | 2 + 6 files changed, 200 insertions(+), 28 deletions(-) diff --git a/.env b/.env index 787c843f..a7dbc404 100644 --- a/.env +++ b/.env @@ -24,7 +24,6 @@ NVMEOF_DESCRIPTION="Service to provide block storage on top of Ceph for platform NVMEOF_URL="https://github.com/ceph/ceph-nvmeof" NVMEOF_TAGS="ceph,nvme-of,nvme-of gateway,rbd,block storage" NVMEOF_WANTS="ceph,rbd" -NVMEOF_HOSTNAME="nvmeof" NVMEOF_IP_ADDRESS="192.168.13.3" NVMEOF_IO_PORT=4420 NVMEOF_GW_PORT=5500 @@ -61,4 +60,9 @@ RBD_IMAGE_NAME="demo_image" RBD_IMAGE_SIZE="10M" BDEV_NAME="demo_bdev" NQN="nqn.2016-06.io.spdk:cnode1" -NVMEOF_FIRST_GATEWAY_NAME="gw-1" +SERIAL="SPDK00000000000001" + +# Container names in docker compose environent +DISC1="ceph-nvmeof_discovery_1" +GW1="ceph-nvmeof_nvmeof_1" +GW2="ceph-nvmeof_nvmeof_2" diff --git a/.github/workflows/build-container.yml b/.github/workflows/build-container.yml index 5287c912..ebd58bed 100644 --- a/.github/workflows/build-container.yml +++ b/.github/workflows/build-container.yml @@ -181,7 +181,7 @@ jobs: run: | . .env - echo using gateway $NVMEOF_IP_ADDRESS port $NVMEOF_GW_PORT timeout ${{ env.WAIT_TIMEOUT_MINS }} + echo using gateway $NVMEOF_IP_ADDRESS port $NVMEOF_GW_PORT until nc -z $NVMEOF_IP_ADDRESS $NVMEOF_GW_PORT; do echo -n . sleep ${{ env.WAIT_INTERVAL_SECS }} @@ -243,21 +243,6 @@ jobs: path: | /tmp/coredump/core.* - #- name: Test mounting nvmeof device locally - # run: | - # . 
.env - # sudo modprobe nvme-fabrics - # sudo nvme list - # sudo nvme discover -t tcp -a $NVMEOF_IP_ADDRESS -s $NVMEOF_IO_PORT - # sudo nvme connect -t tcp --traddr $NVMEOF_IP_ADDRESS -s $NVMEOF_IO_PORT -n nqn.2016-06.io.spdk:cnode1 - # sudo nvme list - # NVMEOF_DEVICE=$(sudo nvme list -o json | jq '.Devices[] | select(.ModelNumber=="SPDK bdev Controller").DevicePath') - # sudo mkfs $NVMEOF_DEVICE - # MOUNT_POINT=$(mktemp -d) - # sudo mount $NVMEOF_DEVICE $MOUNT_POINT - # cd $MOUNT_POINT - # touch test - # For debugging purposes (provides an SSH connection to the runner) #- name: Setup tmate session # uses: mxschmitt/action-tmate@v3 @@ -274,18 +259,188 @@ jobs: make down make clean - push-to-registry: - if: github.event_name == 'release' && startsWith(github.ref, 'refs/tags/v') - needs: [pytest, demo] + discovery: + needs: build runs-on: ubuntu-latest + env: + HUGEPAGES: 768 # 3 spdk instances + steps: - name: Checkout code uses: actions/checkout@v3 + - name: Setup huge-pages + run: make setup HUGEPAGES=$HUGEPAGES + - name: Download container images uses: actions/download-artifact@v3 with: - name: images + name: ceph_nvmeof_container_images-${{ github.run_number }} + + - name: Load container images + run: | + docker load < nvmeof.tar + docker load < nvmeof-cli.tar + docker load < vstart-cluster.tar + docker load < bdevperf.tar + + - name: Start discovery controller + run: | + docker-compose up --detach discovery + + - name: Wait for discovery controller to be listening + timeout-minutes: 3 + run: | + . .env + container_ip() { + docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$1" + } + + ip=$(container_ip $DISC1) + echo using discovery controller $ip $NVMEOF_DISC_PORT + until nc -z $ip $NVMEOF_DISC_PORT; do + echo -n . + sleep ${{ env.WAIT_INTERVAL_SECS }} + done + + - name: Start gateway with scale=2 + run: | + docker-compose up --detach --scale nvmeof=2 nvmeof + + - name: Wait for gateways to be listening + timeout-minutes: 3 + run: | + . .env + container_ip() { + docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$1" + } + + for gw in $GW1 $GW2; do + ip=$(container_ip $gw) + echo using gateway $ip $NVMEOF_GW_PORT + until nc -z $ip $NVMEOF_GW_PORT; do + echo -n . + sleep ${{ env.WAIT_INTERVAL_SECS }} + done + echo + done + + - name: List containers + if: success() || failure() + run: | + docker-compose ps + + - name: List processes + if: success() || failure() + run: | + docker-compose top + + - name: Create RBD image + run: | + make rbd OPTS=-T + + - name: Set up target + run: | + . .env + + container_ip() { + docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$1" + } + # container id is the default hostname in docker environent + # i.e. 
default gateway name + container_id() { + docker ps -q -f name=$1 + } + cli_gw() { + gw=$1 + shift + docker-compose run --rm nvmeof-cli --server-address $gw --server-port $NVMEOF_GW_PORT "$@" + } + + gw1=$(container_ip $GW1) + echo ℹ️ Using GW RPC $GW1 address $gw1 port $NVMEOF_GW_PORT + cli_gw $gw1 get_subsystems + cli_gw $gw1 create_bdev --pool $RBD_POOL --image $RBD_IMAGE_NAME --bdev $BDEV_NAME + cli_gw $gw1 create_subsystem --subnqn $NQN --serial $SERIAL + cli_gw $gw1 add_namespace --subnqn $NQN --bdev $BDEV_NAME + for gw in $GW1 $GW2; do + ip=$(container_ip $gw) + name=$(container_id $gw) # default hostname - container id + echo ℹ️ Create listener address $ip gateway $name + cli_gw $ip create_listener --subnqn $NQN --gateway-name $name --traddr $ip --trsvcid $NVMEOF_IO_PORT + done + cli_gw $gw1 add_host --subnqn $NQN --host "*" + for gw in $GW1 $GW2; do + ip=$(container_ip $gw) + echo ℹ️ Subsystems for name $gw ip $ip + cli_gw $ip get_subsystems + done + + - name: Run bdevperf discovery + run: | + # See + # - https://github.com/spdk/spdk/blob/master/doc/jsonrpc.md + # - https://spdk.io/doc/nvmf_multipath_howto.html + . .env + container_ip() { + docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$1" + } + + echo -n "ℹ️ Starting bdevperf container" + make up SVC=bdevperf OPTS="--detach" + sleep 10 + echo "ℹ️ bdevperf start up logs" + make logs SVC=bdevperf + eval $(make run SVC=bdevperf OPTS="--entrypoint=env" | grep BDEVPERF_SOCKET | tr -d '\n\r' ) + ip=$(container_ip $DISC1) + rpc="/usr/libexec/spdk/scripts/rpc.py" + echo "ℹ️ bdevperf bdev_nvme_set_options" + make exec SVC=bdevperf OPTS=-T CMD="$rpc -v -s $BDEVPERF_SOCKET bdev_nvme_set_options -r -1" + echo "ℹ️ bdevperf start discovery ip: $ip port: $NVMEOF_DISC_PORT" + # -l -1 -o 10 + make exec SVC=bdevperf OPTS=-T CMD="$rpc -v -s $BDEVPERF_SOCKET bdev_nvme_start_discovery -b Nvme0 -t tcp -a $ip -s $NVMEOF_DISC_PORT -f ipv4 -w" + echo "ℹ️ bdevperf bdev_nvme_get_discovery_info" + make exec SVC=bdevperf OPTS=-T CMD="$rpc -v -s $BDEVPERF_SOCKET bdev_nvme_get_discovery_info" + echo "ℹ️ bdevperf perform_tests" + eval $(make run SVC=bdevperf OPTS="--entrypoint=env" | grep BDEVPERF_TEST_DURATION | tr -d '\n\r' ) + timeout=$(expr $BDEVPERF_TEST_DURATION \* 2) + bdevperf="/usr/libexec/spdk/scripts/bdevperf.py" + make exec SVC=bdevperf OPTS=-T CMD="$bdevperf -v -t $timeout -s $BDEVPERF_SOCKET perform_tests" + + - name: Check coredump existence + if: success() || failure() + id: check_coredumps + uses: andstor/file-existence-action@20b4d2e596410855db8f9ca21e96fbe18e12930b # v2, pinned to SHA for security reasons + with: + files: "/var/lib/systemd/coredump/*" + + - name: Upload demo core dumps + if: steps.check_coredumps.outputs.files_exists == 'true' + uses: actions/upload-artifact@v1 + with: + name: ceph_nvmeof_demo_cores-${{ github.run_number }} + path: /var/lib/systemd/coredump/* + + - name: Display logs + if: success() || failure() + run: make logs OPTS='' + + - name: Tear down + if: success() || failure() + run: | + make down + make clean + + push-to-registry: + if: github.event_name == 'release' && startsWith(github.ref, 'refs/tags/v') + needs: [pytest, demo, discovery] + runs-on: ubuntu-latest + + steps: + - name: Download container images + uses: actions/download-artifact@v3 + with: + name: ceph_nvmeof_container_images-${{ github.run_number }} - name: Load container images run: | @@ -302,4 +457,3 @@ jobs: - name: Publish nvmeof containers when release/tag is created run: | make push - diff --git a/Makefile 
b/Makefile index 1715f3c4..db01c41a 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,7 @@ setup: ## Configure huge-pages (requires sudo/root password) @echo Actual Hugepages allocation: $$(cat $(HUGEPAGES_DIR)) @[ $$(cat $(HUGEPAGES_DIR)) -eq $(HUGEPAGES) ] -build pull logs: SVC ?= spdk bdevperf nvmeof nvmeof-devel nvmeof-cli ceph +build pull logs: SVC ?= spdk bdevperf nvmeof nvmeof-devel nvmeof-cli discovery ceph build: export NVMEOF_GIT_BRANCH != git name-rev --name-only HEAD build: export NVMEOF_GIT_COMMIT != git rev-parse HEAD @@ -34,7 +34,7 @@ build: export BUILD_DATE != date -u +"%Y-%m-%dT%H:%M:%SZ" up: ## Launch services up: SVC ?= ceph nvmeof ## Services up: OPTS ?= --abort-on-container-exit --exit-code-from $(SVC) --remove-orphans -up: override OPTS += --scale nvmeof=$(SCALE) +#up: override OPTS += --scale nvmeof=$(SCALE) clean: $(CLEAN) setup ## Clean-up environment clean: override HUGEPAGES = 0 diff --git a/ceph-nvmeof.conf b/ceph-nvmeof.conf index c895d6fc..f05cd065 100644 --- a/ceph-nvmeof.conf +++ b/ceph-nvmeof.conf @@ -20,7 +20,6 @@ state_update_interval_sec = 5 enable_discovery_controller = false [discovery] - addr = 0.0.0.0 port = 8009 debug = 20 diff --git a/docker-compose.yaml b/docker-compose.yaml index 7c518080..2a6828c2 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -99,16 +99,15 @@ services: NVMEOF_GIT_COMMIT: labels: io.ceph.nvmeof: - hostname: nvmeof volumes: # sudo bash -c 'echo 2048 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages' # https://spdk.io/doc/containers.html # TODO: Pending of https://github.com/spdk/spdk/issues/2973 - /dev/hugepages:/dev/hugepages - /dev/vfio/vfio:/dev/vfio/vfio - - ceph-conf:/etc/ceph:ro - $NVMEOF_CONFIG:/src/ceph-nvmeof.conf - /tmp/coredump:/tmp/coredump # core dump + - ceph-conf:/etc/ceph:ro cap_add: - SYS_ADMIN # huge-pages - CAP_SYS_NICE # RTE @@ -127,9 +126,23 @@ services: extends: service: nvmeof-base image: $QUAY_NVMEOF:$NVMEOF_VERSION + ports: + - "$NVMEOF_IO_PORT" # I/O controllers + - "$NVMEOF_GW_PORT" # Gateway + depends_on: + ceph: + condition: service_healthy + discovery: + extends: + service: nvmeof-base + image: $QUAY_NVMEOF:$NVMEOF_VERSION + ports: + - "$NVMEOF_DISC_PORT" # Discovery depends_on: ceph: condition: service_healthy + entrypoint: >- + python3 -m control.discovery # Used to update lockfile (make update-lockfile) nvmeof-builder-base: extends: diff --git a/mk/demo.mk b/mk/demo.mk index cbeaf12a..4f2f9c80 100644 --- a/mk/demo.mk +++ b/mk/demo.mk @@ -6,6 +6,8 @@ rbd: SVC = ceph rbd: CMD = bash -c "rbd -p $(RBD_POOL) info $(RBD_IMAGE_NAME) || rbd -p $(RBD_POOL) create $(RBD_IMAGE_NAME) --size $(RBD_IMAGE_SIZE)" # demo +# the fist gateway in docker enviroment, hostname defaults to container id +demo: export NVMEOF_HOSTNAME != docker ps -q -f name=ceph-nvmeof_nvmeof_1 demo: rbd ## Expose RBD_IMAGE_NAME as NVMe-oF target $(NVMEOF_CLI) create_bdev --pool $(RBD_POOL) --image $(RBD_IMAGE_NAME) --bdev $(BDEV_NAME) $(NVMEOF_CLI) create_subsystem --subnqn $(NQN) From ca21b78ff93727a506c2da9d9eac477d44d7f2ef Mon Sep 17 00:00:00 2001 From: Alexander Indenbaum Date: Wed, 30 Aug 2023 17:23:19 +0300 Subject: [PATCH 04/10] discovery: refactor configuration property Signed-off-by: Alexander Indenbaum --- control/discovery.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/control/discovery.py b/control/discovery.py index ff875720..b8506b85 100644 --- a/control/discovery.py +++ b/control/discovery.py @@ -127,8 +127,9 @@ class Connection: 
nvmeof_connect_data_hostnqn: str = field(default_factory=str) sq_head_ptr: int = 0 unsent_log_page_len: int = 0 - property_data: str = field(default_factory=str) - property_set: bool = False + # NVM ExpressTM Revision 1.4, page 47 + # see Figure 78: Offset 14h: CC – Controller Configuration + property_configuration: tuple = tuple((c_ubyte *8)()) shutdown_now: bool = False controller_id: uuid = None gen_cnt: int = 0 @@ -458,17 +459,13 @@ def reply_fc_cmd_prop_get(self, conn, data, cmd_id): # b'\x00\x00\x46\x00\x00\x00\x00\x00' # 0x46: IO Submission Queue Entry Size: 0x6 (64 bytes) # IO Completion Queue Entry Size: 0x4 (16 bytes) - property_data = (c_ubyte * 8)(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) - if len(self_conn.property_data) == 8: - property_data = (c_ubyte * 8)(*self_conn.property_data) - property_get.property_data = property_data + property_get.property_data = self_conn.property_configuration elif NVME_CTL(nvmeof_prop_get_set_offset) == NVME_CTL.STATUS: - property_data = (c_ubyte * 8)(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) - if len(self_conn.property_data) == 8: - property_data = (c_ubyte * 8)(*self_conn.property_data) - shutdown_notification = ((property_data[1]) >> 6) & 0x3 + shutdown_notification = (self_conn.property_configuration[1] >> 6) & 0x3 if shutdown_notification == 0: - if self_conn.property_set is True: + # check CC.EN bit + enabled = self_conn.property_configuration[0] & 0x1 + if enabled != 0: # controller status: ready property_get.property_data = (c_ubyte * 8)(0x01, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) @@ -510,8 +507,7 @@ def reply_fc_cmd_prop_set(self, conn, data, cmd_id): if NVME_CTL(nvmeof_prop_get_set_offset) == NVME_CTL.CAPABILITIES: self.logger.error("property setting of capabilities is not supported.") elif NVME_CTL(nvmeof_prop_get_set_offset) == NVME_CTL.CONFIGURATION: - self_conn.property_data = struct.unpack_from('<8B', data, 56) - self_conn.property_set = True + self_conn.property_configuration = struct.unpack_from('<8B', data, 56) elif NVME_CTL(nvmeof_prop_get_set_offset) == NVME_CTL.STATUS: self.logger.error("property setting of status is not supported.") elif NVME_CTL(nvmeof_prop_get_set_offset) == NVME_CTL.VERSION: @@ -811,7 +807,6 @@ def reply_get_log_page(self, conn, data, cmd_id): self_conn.unsent_log_page_len -= nvme_data_len if self_conn.unsent_log_page_len == 0: self_conn.log_page = b'' - self_conn.property_set = False self_conn.allow_listeners = [] else: self.logger.error("request log page lenghth error. 
It need to be 16 or n*1024") From 7bfa6886f5d19537c05764f305f54c1601c99471 Mon Sep 17 00:00:00 2001 From: Alexander Indenbaum Date: Wed, 30 Aug 2023 17:27:38 +0300 Subject: [PATCH 05/10] get log page: spdk/bdevperf compatability Signed-off-by: Alexander Indenbaum --- control/discovery.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/control/discovery.py b/control/discovery.py index b8506b85..6364f968 100644 --- a/control/discovery.py +++ b/control/discovery.py @@ -686,9 +686,12 @@ def reply_get_log_page(self, conn, data, cmd_id): nvme_nsid = struct.unpack_from(' Date: Wed, 30 Aug 2023 18:07:03 +0300 Subject: [PATCH 06/10] discover: get log page unaligned memory access Signed-off-by: Alexander Indenbaum --- control/discovery.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/control/discovery.py b/control/discovery.py index 6364f968..7d8cd6a2 100644 --- a/control/discovery.py +++ b/control/discovery.py @@ -694,7 +694,9 @@ def reply_get_log_page(self, conn, data, cmd_id): nvme_sgl_len = nvme_sgl[8] + (nvme_sgl[9] << 8) + (nvme_sgl[10] << 16) + (nvme_sgl[11] << 24) nvme_get_logpage_dword10 = struct.unpack_from(' Date: Tue, 5 Sep 2023 18:42:54 +0300 Subject: [PATCH 07/10] README.md: discovery service container in docker-compose environment Signed-off-by: Alexander Indenbaum --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index f8b8441c..549e2563 100644 --- a/README.md +++ b/README.md @@ -247,6 +247,11 @@ The discovery service can provide all the targets that the current user can acce $ python3 -m control.discovery ``` +2. To start discovery service container in docker-compose environment + ```bash + $ docker-compose up --detach discovery + ``` + ## Advanced ### Configuration From ebdaa2f92f2d00ad0fcec4369deb21de71c34d12 Mon Sep 17 00:00:00 2001 From: Yin Congmin Date: Wed, 6 Sep 2023 16:27:14 +0800 Subject: [PATCH 08/10] README.md: discover target from discovery service Signed-off-by: Yin Congmin --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 549e2563..4ec10b49 100644 --- a/README.md +++ b/README.md @@ -252,6 +252,11 @@ The discovery service can provide all the targets that the current user can acce $ docker-compose up --detach discovery ``` +3. Discover targets from discovery service. The default port is 8009. 
+   ```bash
+   $ nvme discover -t tcp -a 192.168.13.3 -s 8009
+   ```
+
 ## Advanced
 
 ### Configuration

From ede30654fa05f2edcbd32729adc1d9988d1a0fee Mon Sep 17 00:00:00 2001
From: Yin Congmin
Date: Wed, 6 Sep 2023 16:42:34 +0800
Subject: [PATCH 09/10] discovery: update class Connection data type

Signed-off-by: Yin Congmin
---
 control/discovery.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/control/discovery.py b/control/discovery.py
index 7d8cd6a2..13606376 100644
--- a/control/discovery.py
+++ b/control/discovery.py
@@ -121,10 +121,10 @@ class Connection:
     allow_listeners: list = field(default_factory=list)
     log_page: bytearray = field(default_factory=bytearray)
     recv_buffer: bytearray = field(default_factory=bytearray)
-    nvmeof_connect_data_hostid: str = field(default_factory=str)
+    nvmeof_connect_data_hostid: tuple = tuple((c_ubyte *16)())
     nvmeof_connect_data_cntlid: int = 0
-    nvmeof_connect_data_subnqn: str = field(default_factory=str)
-    nvmeof_connect_data_hostnqn: str = field(default_factory=str)
+    nvmeof_connect_data_subnqn: tuple = tuple((c_ubyte *256)())
+    nvmeof_connect_data_hostnqn: tuple = tuple((c_ubyte *256)())
     sq_head_ptr: int = 0
     unsent_log_page_len: int = 0
     # NVM ExpressTM Revision 1.4, page 47

From d65c6b2e865cc2732a2cea2e594bed4286461718 Mon Sep 17 00:00:00 2001
From: Yin Congmin
Date: Wed, 6 Sep 2023 16:58:35 +0800
Subject: [PATCH 10/10] discovery: merge code block in reply_get_log_page()

Signed-off-by: Yin Congmin
---
 control/discovery.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/control/discovery.py b/control/discovery.py
index 13606376..d91ee23d 100644
--- a/control/discovery.py
+++ b/control/discovery.py
@@ -795,20 +795,12 @@ def reply_get_log_page(self, conn, data, cmd_id):
         nvme_tcp_data_pdu.data_length = nvme_data_len
 
         # reply based on the received get log page request packet(length)
-        if nvme_data_len < 1024:
-            # nvme cli version: 1.x
+        if nvme_data_len <= 1024 and nvme_logpage_offset == 0:
            nvme_get_log_page_reply = NVMeGetLogPage()
            nvme_get_log_page_reply.genctr = self_conn.gen_cnt
            nvme_get_log_page_reply.numrec = len(listeners)

            reply = pdu_reply + nvme_tcp_data_pdu + bytes(nvme_get_log_page_reply)[:nvme_data_len]
-        elif nvme_data_len == 1024 and nvme_logpage_offset == 0:
-            # nvme cli version: 2.x
-            nvme_get_log_page_reply = NVMeGetLogPage()
-            nvme_get_log_page_reply.genctr = self_conn.gen_cnt
-            nvme_get_log_page_reply.numrec = len(listeners)
-
-            reply = pdu_reply + nvme_tcp_data_pdu+ nvme_get_log_page_reply
         elif nvme_data_len % 1024 == 0:
             # reply log pages
             reply = pdu_reply + nvme_tcp_data_pdu + \