diff --git a/nullsound/nss-ssg.s b/nullsound/nss-ssg.s
index 8b86082..14a265c 100644
--- a/nullsound/nss-ssg.s
+++ b/nullsound/nss-ssg.s
@@ -44,7 +44,7 @@
 ;;; ------
         ;; This padding ensures the entire _state_ssg data sticks into
         ;; a single 256 byte boundary to make 16bit arithmetic faster
-        .blkb   90
+        ;; .blkb   90
 
 _state_ssg_start:
 
diff --git a/nullsound/stream.s b/nullsound/stream.s
index 480344a..2ea2a87 100644
--- a/nullsound/stream.s
+++ b/nullsound/stream.s
@@ -1,6 +1,6 @@
 ;;;
 ;;; nullsound - modular sound driver
-;;; Copyright (c) 2023 Damien Ciabrini
+;;; Copyright (c) 2023-2024 Damien Ciabrini
 ;;; This file is part of ngdevkit
 ;;;
 ;;; ngdevkit is free software: you can redistribute it and/or modify
@@ -23,6 +23,13 @@
 
         .include "ym2610.inc"
 
+        .equ    CH_STREAM_SAVED, (state_ch_stream_saved_pos-state_ch_stream)
+        .equ    CH_STREAM_START, (state_ch_stream_start-state_ch_stream)
+        .equ    CH_STREAM_POS, (state_ch_stream_pos-state_ch_stream)
+        .equ    CH_STREAM_SIZE, (state_ch_stream_end-state_ch_stream)
+        .equ    NB_YM2610_CHANNELS, 14
+        .equ    OPCODE_NSS_NOP, 8
+
 
 ;;;
 ;;; Sound stream state tracker
@@ -33,30 +40,158 @@
 ;;;
         .area  DATA
 
-state_stream_in_use::
-        .db     0x00
-
-state_stream_addr::
-        .dw     0x00
+;;; stream playback running
+state_stream_in_use::           .blkb   1
 
-state_stream_current_addr::
-        .dw     0x00
+;;; NSS instrument data used by this stream
+state_stream_instruments::      .blkb   2
 
-state_stream_instruments::
-        .dw     0x00
+;;; number of streams to play
+state_streams::                 .blkb   1
+        
+;;; per-channel context switch function
+;;; ---
+;;; When multiple streams are used, each stream represents a unique
+;;; YM2610 channel. Before evaluating NSS opcode for a stream, the
+;;; player has to set up the right NSS context, by calling the
+;;; right <x>_CTX opcode.
+state_ch_ctx_switch::           .blkb   NB_YM2610_CHANNELS
+
+;;; per-channel wait state
+;;; ---
+;;; Wait for a "number of rows" worth of time until processing further
+;;; opcodes in the stream.
+;;; When multiple streams are used, each YM2610 channel used in
+;;; the NSS data gets a dedicated wait state
+state_ch_wait_rows::           .blkb   NB_YM2610_CHANNELS
+
+;;; per-channel playback state
+;;; ---
+;;; Keep track of positional information for streams.
+;;;  - (absolute) saved caller position in the stream, for ret opcodes
+;;;  - (absolute) current position in the stream
+;;;  - (absolute) stream start for computing offset of jmp/call opcodes
+;;; When multiple streams are used, each YM2610 channel used in
+;;; the NSS data gets a dedicated playback state
+state_ch_stream:
+state_ch_stream_saved_pos::     .blkb   2
+state_ch_stream_start::         .blkb   2
+state_ch_stream_pos::           .blkb   2
+state_ch_stream_end:
+        .blkb   CH_STREAM_SIZE*(NB_YM2610_CHANNELS-1)
+
+;;; addresses/indices that point to the state of the currently processed stream
+state_current_ch_ctx::          .blkb   2
+state_current_ch_wait_rows::    .blkb   2
+state_current_ch_stream::       .blkb   2
+state_stream_idx::              .blkb   1
+
+        ;; FIXME: temporary padding to ensure the next data fits within
+        ;; a single 256 byte boundary to make 16bit arithmetic faster
+        .blkb   70
 
 
         .area  CODE
 
+
 init_stream_state_tracker::
         ld      a, #0
         ld      (state_stream_in_use), a
         ld      bc, #0
-        ld      (state_stream_addr), bc
-        ld      (state_stream_current_addr), bc
         ld      (state_stream_instruments), bc
         ret
 
+;;; subtract one row from every stream's wait state
+;;; ------
+;;; . When called, one row has passed since the last update, so subtract
+;;;   it from the wait state of all streams.
+;;; . Streams whose wait_rows value goes down to 0 become ready
+;;;   for processing NSS opcodes (in process_streams_opcodes)
+;;; . By design, the subtraction should never yield a negative wait
+;;; [a modified - other registers saved]
+update_streams_wait_rows::
+        push    de
+        push    bc
+        push    hl
+        ;; c: streams playing
+        ld      a, (state_streams)
+        ld      c, a
+        ;; hl: streams wait state
+        ld      hl, #state_ch_wait_rows
+_tick_sub_loop:
+        dec     (hl)
+        inc     hl
+        dec     c
+        jr      nz, _tick_sub_loop
+
+        pop     hl
+        pop     bc
+        pop     de
+        ret
+
+
+;;; process opcodes for all the streams that are not waiting for rows
+;;; ------
+;;; [all registers modified]
+process_streams_opcodes::
+        ;; init stream state pointers
+        ld      a, #0
+        ld      (state_stream_idx), a
+        ld      bc, #state_ch_wait_rows
+        ld      (state_current_ch_wait_rows), bc
+        ld      bc, #state_ch_ctx_switch
+        ld      (state_current_ch_ctx), bc
+        ld      bc, #state_ch_stream
+        ld      (state_current_ch_stream), bc
+_loop_chs:
+        ;; [de]: remaining wait rows for current stream
+        ld      de, (state_current_ch_wait_rows)
+        ld      a, (de)
+        ;; loop to next stream if this one is not ready
+        ;; to process more opcodes
+        cp      #0
+        jp      nz, _post_ch_process
+        ;; otherwise setup stream context
+        ;; (by processing the current ctx opcode)
+        ld      hl, (state_current_ch_ctx)
+        call    process_nss_opcode
+
+        ;; process the stream's next opcodes
+
+        ;; hl: current stream's position
+        ld      ix, (state_current_ch_stream)
+        ld      l, CH_STREAM_POS(ix)
+        ld      h, CH_STREAM_POS+1(ix)
+_loop_opcode:
+        call    process_nss_opcode
+        or      a
+        jp      nz, _loop_opcode
+        ;; no more opcodes can be processed, save stream's new pos
+        ld      ix, (state_current_ch_stream)
+        ld      CH_STREAM_POS(ix), l
+        ld      CH_STREAM_POS+1(ix), h
+_post_ch_process:
+        ld      a, (state_streams)
+        ld      b, a
+        ld      a, (state_stream_idx)
+        inc     a
+        cp      b
+        jr      nc, _end_process
+        ld      (state_stream_idx), a
+        ld      hl, (state_current_ch_wait_rows)
+        inc     hl
+        ld      (state_current_ch_wait_rows), hl
+        ld      hl, (state_current_ch_ctx)
+        inc     hl
+        ld      (state_current_ch_ctx), hl
+        ld      hl, (state_current_ch_stream)
+        ld      bc, #CH_STREAM_SIZE
+        add     hl, bc
+        ld      (state_current_ch_stream), hl
+        jr      _loop_chs
+_end_process:
+        ret
+
 
 ;;; Evaluate the opcodes from the current nullsound stream,
 ;;; until an opcode must yield the execution (end of stream, timer wait)
@@ -69,61 +204,162 @@ update_stream_state_tracker::
         ;; check whether stream is in use
         ld      a, (state_stream_in_use)
         or      a
-        jp      z, _no_more_processing
-        ;; check whether we can process the next nss opcodes
-        ld      a, (state_timer_int_b_wait)
+        jp      z, _end_update_stream
+        ;; check whether one row has passed and any stream is ready for
+        ;; processing more NSS opcodes
+        ld      a, (state_timer_ticks_per_row)
         ld      b, a
-        ld      a, (state_timer_int_b_count)
+        ld      a, (state_timer_ticks_count)
         cp      b
         ;; if we can't, check whether we have macros or effects to process
         jp      c, _check_update_macros_and_effects
-        sub     b
-        ld      (state_timer_int_b_count), a
-process_opcodes::
-        push    ix
-        ;; process the next opcodes
-        ld      hl, (state_stream_current_addr)
-_loop_opcode:
-        call    process_nss_opcode
-        or      a
-        jp      nz, _loop_opcode
-        ;; no more opcodes can be processed
-        ld      (state_stream_current_addr), hl
-        pop     ix
-_no_more_processing:
-        pop     bc
-        pop     hl
-        ret
+        call    update_streams_wait_rows
+        call    process_streams_opcodes
+        ;; reset row and tick reached counters and exit
+        ld      a, #0
+        ld      (state_timer_ticks_count), a
+        jp      _reset_tick_reached
 _check_update_macros_and_effects:
-        ld      a, (state_timer_int_b_reached)
-        cp      a, #1
-        jp      nz, _no_macro_update
+        ld      a, (state_timer_tick_reached)
+        cp      a, #0
+        jp      z, _end_update_stream
         call    update_fm_effects
         call    update_ssg_macros_and_effects
+_reset_tick_reached:
+        ;; reset the tick reached marker, next macro/effect processing
+        ;; will take place once a new tick is reached
         ld      a, #0
-        ld      (state_timer_int_b_reached), a
-_no_macro_update:
+        ld      (state_timer_tick_reached), a
+_end_update_stream:
         pop     bc
         pop     hl
         ret
 
 
-;;; Play music or sfx from a pre-compiled stream of sound opcodes
-;;; the data is encoded in the nullsound stream format
+;;; Initialize subsystems' state trackers and stream wait state
+;;; ------
+snd_stream_reset_state::
+        ;; reset state trackers
+        call    init_nss_fm_state_tracker
+        call    init_nss_ssg_state_tracker
+        call    init_nss_adpcm_state_tracker
+        ld      a, #1
+        ld      (state_stream_in_use), a
+
+        ;; init stream wait trackers
+        ld      a, #1
+        ld      hl, #state_ch_wait_rows
+        ld      (hl), a
+        ld      a, (state_streams)
+        dec     a
+        cp      #0
+        jr      z, _post_memset_wait_rows
+        ld      b, #0
+        ld      c, a
+        ld      d, h
+        ld      e, l
+        inc     de
+        ldir
+_post_memset_wait_rows:
+        ret
+
+
+;;; Play music or sfx from a series of NSS sound opcodes stored
+;;; in ROM in inline (1 stream) or compact (multi-stream) format
 ;;; ------
-;;; bc: nullsound instruments to use
-;;; de: nullsound stream to play
+;;; bc: nullsound instruments
+;;; de: nullsound stream (inline or compact format)
 ;;; [a modified - other registers saved]
 snd_stream_play::
+        ;; (de) = 0xff: inline NSS stream
+        ;; (de) > 0: multi-stream NSS
+        ld      a, (de)
+        cp      #0xff
+        jp      nz, snd_multi_stream_play
+
+        ;; prepare stream playback for a single stream
         call    snd_stream_stop
-        ld      (state_stream_addr), de
-        ld      (state_stream_current_addr), de
+
+        ;; setup current instruments
         ld      (state_stream_instruments), bc
+
+        ;; setup playback for a single NSS stream
         ld      a, #1
-        ld      (state_stream_in_use), a
-        call    init_nss_fm_state_tracker
-        call    init_nss_ssg_state_tracker
-        call    init_nss_adpcm_state_tracker
+        ld      (state_streams), a
+
+        ;; for single NSS stream, ctx switch table is not used (nop),
+        ;; context opcodes are part of the stream itself
+        ld      hl, #state_ch_ctx_switch
+        ld      a, #OPCODE_NSS_NOP
+        ld      (hl), a
+
+        ;; init stream state
+        inc     de
+        ld      (state_ch_stream_start), de
+        ld      (state_ch_stream_pos), de
+
+        ;; reset state trackers
+        call    snd_stream_reset_state
+
+        ;; start stream playback, it will get preempted
+        ;; as soon as a wait opcode shows up in the stream
+        call    update_stream_state_tracker
+        ret
+
+
+;;; Play music or sfx from a pre-compiled list of NSS opcodes,
+;;; encoded as multiple NSS streams (compact representation)
+;;; ------
+;;; bc: nullsound instruments
+;;; de: NSS data (compact representation)
+;;; [a modified - other registers saved]
+snd_multi_stream_play::
+        call    snd_stream_stop
+        push    de
+        pop     ix
+
+        ;; setup current instruments
+        ld      (state_stream_instruments), bc
+
+        ;; a: number of streams
+        ld      a, (ix)
+        ld      (state_streams), a
+
+        ;; configure every stream with the right channel ctx opcode
+        ;; hl: stream data from NSS
+        inc     ix
+        push    ix
+        pop     hl
+        ;; de: stream contexts
+        ld      de, #state_ch_ctx_switch
+        ;; bc: number of streams
+        ld      b, #0
+        ld      c, a
+        ldir
+
+        ;; init streams state
+        ld      ix, #state_ch_stream
+        ld      de, #CH_STREAM_SIZE
+        ld      a, (state_streams)
+        ld      c, a
+_stream_play_init_loop:
+        ;; a: stream data LSB
+        ld      a, (hl)
+        ld      CH_STREAM_START(ix), a
+        ld      CH_STREAM_POS(ix), a
+        inc     hl
+        ;; a: stream data MSB
+        ld      a, (hl)
+        ld      CH_STREAM_START+1(ix), a
+        ld      CH_STREAM_POS+1(ix), a
+        inc     hl
+        add     ix, de
+        dec     c
+        jr      nz, _stream_play_init_loop
+
+        ;; reset state trackers
+        call    snd_stream_reset_state
+
         ;; start stream playback, it will get preempted
         ;; as soon as a wait opcode shows up in the stream
         call    update_stream_state_tracker
@@ -139,8 +375,8 @@ snd_stream_stop::
         ;; clear playback state tracker
         ld      a, #0
         ld      (state_stream_in_use), a
-        ld      (state_timer_int_b_count), a
-        ld      (state_timer_int_b_wait), a
+        ld      (state_timer_ticks_count), a
+        ld      (state_timer_ticks_per_row), a
         ret
 
 
@@ -159,14 +395,14 @@ snd_stream_stop::
 nss_opcodes:
         .dw     write_port_a
         .dw     write_port_b
-        .dw     nss_loop
+        .dw     nss_jmp
         .dw     nss_end
-        .dw     run_timer_b
-        .dw     wait_int_b
-        .dw     fm_instrument_ext
-        .dw     fm_note_on_ext
-        .dw     fm_note_off_ext
-        .dw     adpcm_a_instrument_ext
+        .dw     timer_tempo
+        .dw     wait_rows
+        .dw     nss_call
+        .dw     nss_ret
+        .dw     nss_nop
+        .dw     row_speed
         .dw     adpcm_a_on_ext
         .dw     adpcm_a_off_ext
         .dw     adpcm_b_instrument
@@ -266,20 +502,25 @@ write_port_b::
         ret
 
 
-;;; NSS_LOOP
+;;; NSS_JMP
 ;;; jump to a location from the start of the NSS stream
 ;;; ------
 ;;; [ hl ]: offset LSB
 ;;; [hl+1]: offset MSB
-nss_loop::
+nss_jmp::
         push    bc
+        ;; bc: location offset
         ld      c, (hl)
         inc     hl
         ld      b, (hl)
-        inc     hl
-        ld      hl, (state_stream_addr)
+        ld      ix, (state_current_ch_stream)
+        ;; hl: start of stream
+        ld      l, CH_STREAM_START(ix)
+        ld      h, CH_STREAM_START+1(ix)
+        ;; hl: new pos (stream start + jmp offset)
         add     hl, bc
-        ld      (state_stream_current_addr), hl
+        ld      CH_STREAM_POS(ix), l
+        ld      CH_STREAM_POS+1(ix), h
         pop     bc
         ld      a, #1
         ret
@@ -294,55 +535,82 @@ nss_end::
         ret
 
 
-;;; RUN_TIMER_B
-;;; configure YM2610's timer B and start it
-;;; ------
-;;; [hl]: Timer B counter
-run_timer_b::
-        ;; reset all timers
-        ld      b, #REG_TIMER_FLAGS
-        ld      c, #0x30
-        call    ym2610_write_port_a
-        ;; configure timer B
-        ld      b, #REG_TIMER_B_COUNTER
-        ld      c, (hl)
-        inc     hl
-        call    ym2610_write_port_a
-        ;; deconfigure timer A (TODO remove it)
-        ld      b, #REG_TIMER_A_COUNTER_LSB
-        ld      c, #0x0
-        call    ym2610_write_port_a
-        ld      b, #REG_TIMER_A_COUNTER_MSB
-        ld      c, #0x0
-        call    ym2610_write_port_a
-        ;; start timer right away
-        ld      a, #0
-        ld      (state_timer_int_b_count), a
-        ld      b, #REG_TIMER_FLAGS
-        ld      c, #0x3A
-        call    ym2610_write_port_a
-        ei
-        ld      a, #1
-        ret
-
-
-;;; WAIT_INT_B
-;;; Suspend stream playback, resume after a number of Timer B
-;;; interrupts has passed.
+;;; WAIT_ROWS
+;;; Suspend stream playback, resume after a number of rows
+;;; worth of time has passed (Timer B interrupts * speed).
 ;;; ------
 ;;; [hl]: number of interrupts until playback resumes
-wait_int_b::
+wait_rows::
+        push    bc
         ;;  how many interrupts to wait for before moving on
         ld      a, (hl)
         inc     hl
-        ld      (state_timer_int_b_wait), a
-        xor     a
-        ld      (state_timer_int_b_reached), a
-
-        ;; reset playback contexts
+        ;; register the wait for this channel
+        ld      bc, (state_current_ch_wait_rows)
+        ld      (bc), a
+_post_wait_rows:
+        ;; reset playback contexts (only useful for inline stream)
         call    fm_ctx_reset
         call    ssg_ctx_reset
         call    adpcm_a_ctx_reset
-        
+
+        pop     bc
         ld      a, #0
         ret
+
+
+;;; NSS_CALL
+;;; Continue playback from a new position in the stream.
+;;; Remember the current position so that a NSS_RET opcode
+;;; continues execution from there.
+;;; Note: no NSS_CALL can be executed again before a NSS_RET
+;;; ------
+;;; [ hl ]: LSB of the offset (from the start of the stream) to jump to
+;;; [hl+1]: MSB of the offset (from the start of the stream) to jump to
+nss_call::
+        push    bc
+
+        ld      ix, (state_current_ch_stream)
+        ;; bc: offset
+        ld      c, (hl)
+        inc     hl
+        ld      b, (hl)
+        inc     hl
+        ;; save current stream pos
+        ld      CH_STREAM_SAVED(ix), l
+        ld      CH_STREAM_SAVED+1(ix), h
+        ;; hl: start of stream
+        ld      l, CH_STREAM_START(ix)
+        ld      h, CH_STREAM_START+1(ix)
+        ;; hl: new pos (call offset)
+        add     hl, bc
+        ld      CH_STREAM_POS(ix), l
+        ld      CH_STREAM_POS+1(ix), h
+
+        pop     bc
+        ld      a, #1
+        ret
+
+
+;;; NSS_RET
+;;; Continue playback past the previous NSS_CALL statement
+;;; ------
+nss_ret::
+        ld      ix, (state_current_ch_stream)
+        ;; hl: saved current stream pos
+        ld      l, CH_STREAM_SAVED(ix)
+        ld      h, CH_STREAM_SAVED+1(ix)
+        ;; hl: restore new stream pos
+        ld      CH_STREAM_POS(ix), l
+        ld      CH_STREAM_POS+1(ix), h
+
+        ld      a, #1
+        ret
+
+
+;;; NSS_NOP
+;;; Empty operation
+;;; ------
+nss_nop::
+        ld      a, #1
+        ret
diff --git a/nullsound/timer.s b/nullsound/timer.s
index 87c5553..58ab2dd 100644
--- a/nullsound/timer.s
+++ b/nullsound/timer.s
@@ -1,6 +1,6 @@
 ;;;
 ;;; nullsound - modular sound driver
-;;; Copyright (c) 2023 Damien Ciabrini
+;;; Copyright (c) 2023-2024 Damien Ciabrini
 ;;; This file is part of ngdevkit
 ;;;
 ;;; ngdevkit is free software: you can redistribute it and/or modify
@@ -33,40 +33,33 @@
 ;;;
         .area  DATA
 
-state_timer_int_a_count::
+state_timer_tick_reached::
         .db     0
 
-state_timer_int_a_wait::
+state_timer_ticks_count::
         .db     0
 
-state_timer_int_b_count::
+state_timer_ticks_per_row::
         .db     0
 
-state_timer_int_b_wait::
-        .db     0
-
-state_timer_int_b_reached::
-        .db     0
 
         .area  CODE
 
 init_timer_state_tracker::
         ld      a, #0
-        ld      (state_timer_int_a_count), a
-        ld      (state_timer_int_a_wait), a
-        ld      (state_timer_int_b_count), a
-        ld      (state_timer_int_b_wait), a
-        ld      (state_timer_int_b_reached), a
+        ld      (state_timer_tick_reached), a
+        ld      (state_timer_ticks_count), a
+        ld      (state_timer_ticks_per_row), a
         ret
 
 
 update_timer_state_tracker::
         ld      a, #1
-        ld      (state_timer_int_b_reached), a
+        ld      (state_timer_tick_reached), a
         ;; keep track of the new interrupt
-        ld      a, (state_timer_int_b_count)
+        ld      a, (state_timer_ticks_count)
         inc     a
-        ld      (state_timer_int_b_count), a
+        ld      (state_timer_ticks_count), a
 
         ;; update the YM2610 here to reset the interrupt flags
         ;; and rearm the interrupt timer
@@ -90,3 +83,51 @@ update_timer_state_tracker::
         call    ym2610_restore_context_port_a
 
         ret
+
+
+;;;
+;;; NSS opcodes
+;;;
+
+;;; TIMER_TEMPO
+;;; configure YM2610's timer B for a specific tempo and start it
+;;; ------
+;;; [hl]: Timer B counter
+timer_tempo::
+        ;; reset all timers
+        ld      b, #REG_TIMER_FLAGS
+        ld      c, #0x30
+        call    ym2610_write_port_a
+        ;; configure timer B
+        ld      b, #REG_TIMER_B_COUNTER
+        ld      c, (hl)
+        inc     hl
+        call    ym2610_write_port_a
+        ;; deconfigure timer A (TODO remove it)
+        ld      b, #REG_TIMER_A_COUNTER_LSB
+        ld      c, #0x0
+        call    ym2610_write_port_a
+        ld      b, #REG_TIMER_A_COUNTER_MSB
+        ld      c, #0x0
+        call    ym2610_write_port_a
+        ;; start timer right away
+        ld      a, #0
+        ld      (state_timer_ticks_count), a
+        ld      b, #REG_TIMER_FLAGS
+        ld      c, #0x3A
+        call    ym2610_write_port_a
+        ei
+        ld      a, #1
+        ret
+
+
+;;; ROW_SPEED
+;;; number of ticks to wait before processing the next row in the streams
+;;; ------
+;;; [hl]: ticks
+row_speed::
+        ld      a, (hl)
+        inc     hl
+        ld      (state_timer_ticks_per_row), a
+        ld      a, #1
+        ret
diff --git a/tools/nsstool.py b/tools/nsstool.py
index ebdaf18..a32976c 100755
--- a/tools/nsstool.py
+++ b/tools/nsstool.py
@@ -203,15 +203,15 @@ def register_nss_ops():
         # 0x00
         None,
         None,
-        ("nss_loop", ["lsb", "msb"]),
-        ("nss_end", ),
-        ("timer_b" , ["val"]),
-        ("wait_b"  , ["val"]),
-        None,
-        None,
+        ("jmp"     , ["lsb", "msb"]),
+        ("nss_end" , ),
+        ("tempo"   , ["val"]),
+        ("wait"    , ["rows"]),
+        ("call"    , ["lsb", "msb"]),
+        ("nss_ret" , ),
         # 0x08
-        None,
-        None,
+        ("nop"     , ),
+        ("speed"   , ["ticks"]),
         None,
         None,
         ("b_instr" , ["inst"]),
@@ -276,9 +276,10 @@ def register_nss_ops():
 #
 # Furnace module conversion functions
 #
-def convert_fm_row(row, channel, opcodes): 
+def convert_fm_row(row, channel):
     ctx_t = {0: fm_ctx_1, 1: fm_ctx_2, 2: fm_ctx_3, 3: fm_ctx_4}
     jmp_to_order = -1
+    opcodes = []
     if not is_empty(row):
         # context
         opcodes.append(ctx_t[channel]())
@@ -325,12 +326,13 @@ def convert_fm_row(row, channel, opcodes):
                 opcodes.append(fm_stop())
             else:
                 opcodes.append(fm_note(to_nss_note(row.note)))
-    return jmp_to_order
+    return jmp_to_order, opcodes
 
 
-def convert_s_row(row, channel, opcodes): 
+def convert_s_row(row, channel):
     ctx_t = {4: s_ctx_1, 5: s_ctx_2, 6: s_ctx_3}
     jmp_to_order = -1
+    opcodes = []
     if not is_empty(row):
         # context
         opcodes.append(ctx_t[channel]())
@@ -367,12 +369,13 @@ def convert_s_row(row, channel, opcodes):
                 opcodes.append(s_stop())
             else:
                 opcodes.append(make_ssg_note(row.note))
-    return jmp_to_order
+    return jmp_to_order, opcodes
 
 
-def convert_a_row(row, channel, opcodes): 
+def convert_a_row(row, channel):
     ctx_t = {7: a_ctx_1, 8: a_ctx_2, 9: a_ctx_3, 10: a_ctx_4, 11: a_ctx_5, 12: a_ctx_6}
     jmp_to_order = -1
+    opcodes = []
     if not is_empty(row):
         # context
         opcodes.append(ctx_t[channel]())
@@ -398,11 +401,12 @@ def convert_a_row(row, channel, opcodes):
                 opcodes.append(a_stop())
             else:
                 opcodes.append(a_start())
-    return jmp_to_order
+    return jmp_to_order, opcodes
 
 
-def convert_b_row(row, channel, opcodes): 
+def convert_b_row(row, channel):
     jmp_to_order = -1
+    opcodes = []
     if not is_empty(row):
         # instrument
         if row.ins != -1:
@@ -426,24 +430,24 @@ def convert_b_row(row, channel, opcodes):
                 opcodes.append(b_stop())
             else:
                 opcodes.append(b_note(to_nss_b_note(row.note)))
-    return jmp_to_order
+    return jmp_to_order, opcodes
 
 
-def raw_nss(m, p, bs, channels):
+def raw_nss(m, p, bs, channels, compact):
     # unoptimized nss opcodes generated from the Furnace song
     nss = []
-    
-    # channels to consider for conversion
-    chlist = [int(c, 16) for c in sorted(list(channels.lower()))]
-    f_channels = list(filter(lambda x: 0 <= x <= 3, chlist))
-    s_channels = list(filter(lambda x: 4 <= x <= 6, chlist))
-    a_channels = list(filter(lambda x: 7 <= x <= 12, chlist))
-    b_channel = list(filter(lambda x: x == 13, chlist))
+
+    f_channels = list(range(0,3+1))
+    s_channels = list(range(4,6+1))
+    a_channels = list(range(7,12+1))
+    b_channel = list([13])
+    selected_f = [x for x in f_channels if x in channels]
+    selected_s = [x for x in s_channels if x in channels]
+    selected_a = [x for x in a_channels if x in channels]
+    selected_b = [x for x in b_channel if x in channels]
 
     # initialize stream speed from module 
     tick = m.speed
-    tb = round(256 - (4000000 / (1152 * m.frequency)))
-    nss.append(timer_b(tb))
 
     # -- structures
     # a song is composed of a sequence of orders
@@ -463,12 +467,14 @@ def raw_nss(m, p, bs, channels):
     # is essentially equivalent to looping the song playback.
     
     seen_orders=[]
+    seen_patterns=[]
     order=0
 
+    blocks = []
+
     while order < len(m.orders) and order not in seen_orders:
         # recall we've processed this order and set its location in the stream
         seen_orders.append(order)
-        nss.append(nss_label(order))
 
         #  -1: no jump required after row processed
         #   n: jump to order n for the next row to play
@@ -476,13 +482,20 @@ def raw_nss(m, p, bs, channels):
         # 257: jump outside the stream (i.e. stop)
         jmp_to_order = -1
 
+        # get pattern indices for current order
+        pattern_indices = m.orders[order]
         order_patterns = [p[(m.orders[order][f],f)] for f in range(14)]
 
+        # reference start of order
+        jmp_label = nss_label("jmp_%x"%order)
+        nss.append(jmp_label)
+
         # all channels should have the same number of rows
         pattern_length = len(order_patterns[0].rows)
         assert len(set([len(p.rows) for p in order_patterns])) == 1
         assert pattern_length == m.pattern_len
 
+        pattern_opcodes = []
         for index in range(pattern_length):
             # nss opcodes to add at the end of each processed Furnace row
             opcodes = []
@@ -490,29 +503,36 @@ def raw_nss(m, p, bs, channels):
             # FM channels
             for channel in f_channels:
                 row = order_patterns[channel].rows[index]
-                j = convert_fm_row(row, channel, opcodes)
+                j, f_opcodes = convert_fm_row(row, channel)
+                if channel in selected_f:
+                    opcodes.extend(f_opcodes)
                 jmp_to_order = max(jmp_to_order, j)
             # SSG channels
             for channel in s_channels:
                 row = order_patterns[channel].rows[index]
-                j = convert_s_row(row, channel, opcodes)
+                j, s_opcodes = convert_s_row(row, channel)
+                if channel in selected_s:
+                    opcodes.extend(s_opcodes)
                 jmp_to_order = max(jmp_to_order, j)
             # ADPCM-A channels
             for channel in a_channels:
                 row = order_patterns[channel].rows[index]
-                j = convert_a_row(row, channel, opcodes)
+                j, a_opcodes = convert_a_row(row, channel)
+                if channel in selected_a:
+                    opcodes.extend(a_opcodes)
                 jmp_to_order = max(jmp_to_order, j)
             # ADPCM-B channel
             for channel in b_channel:
                 row = order_patterns[channel].rows[index]
-                j = convert_b_row(row, channel, opcodes)
+                j, b_opcodes = convert_b_row(row, channel)
+                if channel in selected_b:
+                    opcodes.extend(b_opcodes)
                 jmp_to_order = max(jmp_to_order, j)
 
             # all channels are processed for this pos.
             # add all generated opcodes plus a time sync
-            nss.extend(opcodes)
-            nss.append(wait_b(tick))
-            
+            pattern_opcodes.extend(opcodes + [wait(1)])
+
             # stop processing further rows if a JMP fx was used
             if jmp_to_order != -1:
                 break
@@ -522,14 +542,40 @@ def raw_nss(m, p, bs, channels):
         else:
             order += 1
 
+        if compact:
+            # if this pattern was already processed, do not remember it twice
+            # NOTE: sometimes a pattern appears in an order where its full playback
+            # is cut short by a jump action from another pattern. In that case
+            # we have to consider it as a new pattern.
+            # To account for that, a pattern is identified by its index _and_
+            # its length.
+            pattern_index = pattern_indices[channels[0]]
+            pattern_waits = [x for x in pattern_opcodes if isinstance(x,wait)]
+            pattern_length = sum([x.rows for x in pattern_waits])
+            pattern_id = "%02x_%x"%(pattern_index,pattern_length)
+            if not pattern_id in seen_patterns:
+                # compact representation: labeled pattern that can be jumped to
+                pattern_label = nss_label(pattern_id)
+                basic_block = [pattern_label] + pattern_opcodes + [nss_ret()]
+                blocks.extend(basic_block)
+                seen_patterns.append(pattern_id)
+            call_op = call(-1, -1)
+            call_op.pat = pattern_id
+            nss.append(call_op)
+        else:
+            nss.extend(pattern_opcodes)
+
     if order in seen_orders:
         # the last order was already processed, the stream will loop
-        nloop = nss_loop(-1, -1)
-        nloop.pat=order
+        nloop = jmp(-1, -1)
+        nloop.pat="jmp_%x"%order
         nss.append(nloop)
     else:
         # orders were processed in sequence, the stream will end
         nss.append(nss_end())
+    # append the pattern blocks (targets of the call opcodes) at the end
+    # of the stream, past the end opcode.
+    nss.extend(blocks)
     return nss
 
 
@@ -537,40 +583,51 @@ def raw_nss(m, p, bs, channels):
 # NSS optimization passes
 #
 
-def compact_wait_b(nss):
+def compact_wait(m, nss):
     compact = []
-    wait = 0
+    cur_wait = 0
     for op in nss:
-        if type(op) == wait_b:
-            wait += op.val
+        if type(op) == wait:
+            cur_wait += op.rows
             # the wait opcode cannot encode more than 255 ticks
-            if wait>255:
-                new_wait = wait_b(255)
+            if cur_wait>255:
+                new_wait = wait(255)
                 compact.append(new_wait)
-                wait -= 255
+                cur_wait -= 255
         else:
-            if wait>0:
-                new_wait = wait_b(wait)
+            if cur_wait>0:
+                new_wait = wait(cur_wait)
                 compact.append(new_wait)
-                wait=0
+                cur_wait=0
             compact.append(op)
     return compact
 
 
 def compact_instr(nss):
-    out = []
     fm_ctx_map = {fm_ctx_1: 0, fm_ctx_2: 1, fm_ctx_3: 2, fm_ctx_4: 3}
     fm_ctx = 0
     s_ctx_map = {s_ctx_1: 0, s_ctx_2: 1, s_ctx_3: 2}
     s_ctx = 0
     a_ctx_map = {a_ctx_1: 0, a_ctx_2: 1, a_ctx_3: 2, a_ctx_4: 3, a_ctx_5: 4, a_ctx_6: 5}
     a_ctx = 0
+
     fm_is = [-1, -1, -1, -1]
     s_is = [-1, -1, -1]
     a_is = [-1, -1, -1, -1, -1, -1]
     b_i = -1
-    for op in nss:
-        if type(op) == fm_instr:
+
+    def compact_instr_pass(op, out):
+        nonlocal fm_ctx
+        nonlocal s_ctx
+        nonlocal a_ctx
+        nonlocal b_i
+        nonlocal fm_is
+        nonlocal s_is
+        nonlocal a_is
+        nonlocal b_i
+        if type(op) == nss_label:
+            out.append(op)
+        elif type(op) == fm_instr:
             if fm_is[fm_ctx] != op.inst:
                 fm_is[fm_ctx] = op.inst
                 out.append(op)
@@ -597,19 +654,32 @@ def compact_instr(nss):
                 out.append(op)
         else:
             out.append(op)
+
+    out = run_control_flow_pass(compact_instr_pass, nss)
+    return out
+
+
+def remove_ctx(nss):
+    # drop every FM/SSG/ADPCM-A context-switch opcode from the stream
+    ctx_types = (fm_ctx_1, fm_ctx_2, fm_ctx_3, fm_ctx_4, s_ctx_1, s_ctx_2, s_ctx_3,
+                 a_ctx_1, a_ctx_2, a_ctx_3, a_ctx_4, a_ctx_5, a_ctx_6)
+    out = [op for op in nss if type(op) not in ctx_types]
     return out
 
 
 def compact_ctx(nss):
-    out = []
     fm_ctx_map = {fm_ctx_1: 0, fm_ctx_2: 1, fm_ctx_3: 2, fm_ctx_4: 3}
     s_ctx_map = {s_ctx_1: 0, s_ctx_2: 1, s_ctx_3: 2}
     a_ctx_map = {a_ctx_1: 0, a_ctx_2: 1, a_ctx_3: 2, a_ctx_4: 3, a_ctx_5: 4, a_ctx_6: 5}
     fm_ctx = 0
     s_ctx = 0
     a_ctx = 0
-    for op in nss:
-        if type(op) == wait_b:
+
+    def compact_ctx_pass(op, out):
+        nonlocal fm_ctx
+        nonlocal s_ctx
+        nonlocal a_ctx
+        if type(op) == wait:
             fm_ctx=0
             s_ctx=0
             a_ctx=0
@@ -621,20 +691,68 @@ def compact_ctx(nss):
             a_ctx+=1
         elif type(op) in fm_ctx_map.keys():
             val = fm_ctx_map[type(op)]
-            if fm_ctx == val: continue
+            if fm_ctx == val: return
             else: fm_ctx = val
         elif type(op) in s_ctx_map.keys():
             val = s_ctx_map[type(op)]
-            if s_ctx == val: continue
+            if s_ctx == val: return
             else: s_ctx = val
         elif type(op) in a_ctx_map.keys():
             val = a_ctx_map[type(op)]
-            if a_ctx == val: continue
+            if a_ctx == val: return
             else: a_ctx = val
         out.append(op)
+
+    out = run_control_flow_pass(compact_ctx_pass, nss)
     return out
 
 
+def stream_from_label(stream, label):  # sub-stream going from label `label` up to (and including) the first ret after it
+    start = next(i for i, v in enumerate(stream) if isinstance(v, nss_label) and v.pat == label)
+    length = 1 + next(i for i, v in enumerate(stream[start:]) if isinstance(v, nss_ret))
+    return stream[start:start+length]
+
+
+def run_control_flow_pass(pass_function, nss):
+    # Run pass_function(op, out) on the opcodes of nss in execution order.
+    # nss is a main sequence optionally followed by blocks that the main
+    # sequence calls. Returns the main sequence, then each called block once.
+    out_main = []
+    out_blocks = []
+    # make sure we dump the block only once in the output
+    seen_blocks = {}
+    # current stream to push output opcodes to
+    out = out_main
+    # a stream can use call/ret opcodes, with a stack that is one call deep.
+    prev_stream = []
+    stream = list(nss)
+
+    while stream:
+        op = stream.pop(0)
+        if type(op) == call:
+            out.append(op)  # the call opcode itself is kept in the caller's output
+            if op.pat not in seen_blocks:
+                seen_blocks[op.pat] = True
+                out = out_blocks
+            else:
+                # evaluate this block to keep context up to date
+                # but do not keep the generated opcodes
+                out = []
+            prev_stream = stream  # resume right after the call once ret is reached
+            stream = stream_from_label(stream, op.pat)  # the block is searched in the opcodes that follow the call
+        elif type(op) == nss_ret:
+            out.append(op)
+            out = out_main
+            stream = prev_stream
+        elif type(op) in [jmp, nss_end]:
+            out.append(op)
+            break  # opcodes past a jmp/end are only evaluated when reached through a call
+        else:
+            pass_function(op, out)  # labels and all other opcodes are handled by the pass
+
+    return out_main + out_blocks
+
+
 def simulate_ssg_autoenv(nss, ins):
     semitones = [ "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B" ]
     freqs = [
@@ -647,14 +765,15 @@ def simulate_ssg_autoenv(nss, ins):
         [2093.0 ,  2217.0 ,  2349.0 ,  2489.0 , 2637.0 , 2794.0 , 2960.0 , 3136.0 , 3322.0 , 3520.0 , 3729.0 , 3951.0 ],
         [4186.0 ,  4435.0 ,  4699.0 ,  4978.0 , 5274.0 , 5588.0 , 5920.0 , 6272.0 , 6645.0 , 7040.0 , 7459.0 , 7902.0 ]
     ]
-    out = []
     s_ctx_map = {s_ctx_1: 0, s_ctx_2: 1, s_ctx_3: 2}
     s_ctx = 0
     s_is = [-1, -1, -1]
     s_autoenv = [False, False, False]
     s_period = [-1, -1, -1]
-    for op in nss:
-        if type(op) == wait_b:
+
+    def autoenv_pass(op, out):
+        nonlocal s_ctx
+        if type(op) == wait:
             s_ctx=0
             out.append(op)
         elif type(op) == s_macro:
@@ -688,67 +807,156 @@ def simulate_ssg_autoenv(nss, ins):
             out.append(op)
         else:
             out.append(op)
+
+    out = run_control_flow_pass(autoenv_pass, nss)
     return out
 
 
 def remove_unreferenced_labels(nss):
-    if isinstance(nss[-1], nss_loop):
-        order = nss[-1].pat
-    else:
-        order = -1
-    out=[]
+    # a label is only useful as the target of a jmp or call: drop all the others
+    callers = [x for x in nss if type(x) in [jmp, call]]
+    refs = {x.pat for x in callers}
+    out = []
     for op in nss:
-        if isinstance(op, nss_label) and op.pat != order:
+        if isinstance(op, nss_label) and op.pat not in refs:
             continue
         else:
             out.append(op)
     return out
 
 
-def compute_loop_offset(nss):
-    if len(nss)==0 or not isinstance(nss[-1], nss_loop):
-        return nss
-    # when this pass is executed, there should be a single label in the nss
-    assert len(list(filter(lambda x: isinstance(x, nss_label), nss))) == 1
-    out = []
-    offset = 0
-    label_offset = -1
-
+def resolve_jmp_and_call_opcodes(nss):  # fill lsb/msb of every jmp/call in place, returns nss itself
+    labels = {}
+    # pass 1: byte offset of each label, counted from the start of this stream
+    pos = 0
     for op in nss:
         if isinstance(op, nss_label):
-            label_offset = offset
+            labels[op.pat] = pos  # a label takes no space: pos is not advanced
         else:
             # 1 byte for opcode, + 1 byte per args, - 1 byte for _opcode arg)
-            offset += 1+len(astuple(op))-1
-            out.append(op)
-    out[-1].lsb = label_offset & 0xff
-    out[-1].msb = (label_offset>>8) & 0xff
-    return out
+            pos += 1+len(astuple(op))-1
 
+    # pass 2: store the target offset into each jmp and call opcode
+    for op in nss:
+        if type(op) in [jmp, call]:
+            label_offset = labels[op.pat]  # KeyError if the target label is not in nss
+            op.lsb = label_offset & 0xff  # low byte of the offset
+            op.msb = (label_offset>>8) & 0xff  # high byte of the offset
 
-def nss_to_asm(nss, m, name, fd):
-    size = sum([len(astuple(op))-1+1 for op in nss])
+    return nss
+
+
+def stream_size(stream):
+    """Return the size in bytes of all the opcodes held in stream."""
+    total = 0
+    for op in stream:
+        # labels do not contribute to the size
+        if isinstance(op, nss_label):
+            continue
+        # size: all fields in the datatype (opcode + args)
+        total += len(astuple(op))
+    return total
+
+
+def asm_header(nss, m, name, size, fd):  # print the ASM preamble (comment banner, CODE area) to fd; nss and name are unused
     print(";;; NSS music data", file=fd)
     print(";;; generated by nsstool.py (ngdevkit)", file=fd)
     print(";;; ---", file=fd)
     print(";;; Song title: %s" % m.name, file=fd)
     print(";;; Song author: %s" % m.author, file=fd)
-    print(";;; NSS size: %d"%size, file=fd)
+    print(";;; NSS size: %d" % size, file=fd)  # size is computed by the caller
     print(";;;", file=fd)
     print("", file=fd)
     print("        .area   CODE", file=fd)
     print("", file=fd)
+
+
+def stream_name(prefix, channel):
+    # ASM label of a stream: <prefix>_<suffix>, the suffix being selected by channel index
+    suffixes = "f1 f2 f3 f4 s1 s2 s3 a1 a2 a3 a4 a5 a6 b".split()
+    return prefix + "_" + suffixes[channel]
+
+
+def nss_compact_header(channels, streams, name, fd):  # print the header of a compact (multi-stream) NSS to fd
+    stream_type = ["FM1", "FM2", "FM3", "FM4", "SSG1", "SSG2", "SSG3",
+                   "ADPCM-A1", "ADPCM-A2", "ADPCM-A3", "ADPCM-A4", "ADPCM-A5", "ADPCM-A6", "ADPCM-B"]
+    ctx_opcodes = [fm_ctx_1, fm_ctx_2, fm_ctx_3, fm_ctx_4, s_ctx_1, s_ctx_2, s_ctx_3,
+                   a_ctx_1 , a_ctx_2 , a_ctx_3 , a_ctx_4 , a_ctx_5, a_ctx_6, nop]  # last entry (ADPCM-B) maps to nop
+    if name:
+        print("%s::" % name, file=fd)
+    print(("        .db     0x%02x"%len(streams)).ljust(40)+" ; number of streams", file=fd)  # 1 byte
+    for i, c in enumerate(channels):  # then 1 byte per stream: the opcode listed for its channel
+        op = ctx_opcodes[c]
+        ch_name = stream_type[c]
+        comment = "stream %i: %s"%(i, ch_name)
+        print(("        .db     0x%02x"%op._opcode).ljust(40)+" ; "+comment, file=fd)
+    for i, c in enumerate(channels):  # then 1 word per stream: reference to the stream's own label
+        comment = "stream %i: NSS data"%i
+        print(("        .dw     %s"%(stream_name(name,c))).ljust(40)+" ; "+comment, file=fd)
+    print("", file=fd)
+
+
+def nss_inline_header(name, fd):  # print the optional label and the 1-byte 0xff marker that starts an inline NSS
+    if name:
+        print("%s::" % name, file=fd)
+    print("        .db     0xff".ljust(40)+" ; inline NSS stream marker", file=fd)
+
+
+def nss_to_asm(nss, m, name, fd):  # print one .db line per opcode of nss to fd (m is unused)
     if name:
         print("%s::" % name, file=fd)
     for op in nss:
+        if isinstance(op, nss_label):  # labels produce no data, at most an ASM comment
+            if "jmp" not in op.pat:  # labels whose name contains "jmp" are not echoed
+                print("        ;; pattern %s"%(op.pat,), file=fd)
+            continue
         opcode = [op._opcode]
         # remove the last _opcode field, it's just a metadata
         args = list(astuple(op)[:-1])
         hexdata = ", ".join(["0x%02x"%(x&0xff,) for x in opcode+args])
         comment = " ; %s"%type(op).__name__.upper()
+        if isinstance(op, call):  # show the called pattern next to the opcode name
+            comment+=" "+op.pat
         print("        .db     "+hexdata.ljust(24)+comment, file=fd)
 
 
+def generate_nss_stream(m, p, bs, ins, channels, stream_idx):  # convert the given channels to NSS opcodes and run all passes
+    compact = stream_idx >= 0  # a negative index means a single inline stream
+
+    dbg("Convert Furnace patterns to unoptimized NSS opcodes")
+    nss = raw_nss(m, p, bs, channels, compact)
+
+    if stream_idx <= 0:  # tempo/speed go in the inline stream, or in the first compact stream only
+        tb = round(256 - (4000000 / (1152 * m.frequency)))  # presumably the YM2610 timer B load value -- TODO confirm
+        nss.insert(0, tempo(tb))
+        nss.insert(0, speed(m.speed))  # inserted last, so speed ends up before tempo
+
+    dbg("Transformation passes:")
+    dbg(" - remove unreference NSS labels")
+    nss = remove_unreferenced_labels(nss)
+
+    dbg(" - merge adjacent WAIT opcodes")
+    nss = compact_wait(m, nss)
+
+    dbg(" - remove successive INSTR opcodes if they keep intrument unchanged")
+    nss = compact_instr(nss)
+
+    if compact:
+        dbg(" - remove CTX opcodes for compact stream")
+        nss = remove_ctx(nss)
+    else:
+        dbg(" - remove CTX opcodes if they keep the current context unchanged")
+        nss = compact_ctx(nss)
+
+    dbg(" - look for SSG autoenv macros and insert opcodes to simulate them")
+    nss = simulate_ssg_autoenv(nss, ins)
+
+    dbg(" - resolve jmp and call opcodes")  # done last: earlier passes add/remove opcodes, which moves the labels
+    nss = resolve_jmp_and_call_opcodes(nss)
+
+    return nss
+
+
 def main():
     global VERBOSE
     parser = argparse.ArgumentParser(
@@ -761,6 +969,7 @@ def main():
                         help="Name of the ASM label for the NSS data. Empty name skips label.")
 
     parser.add_argument("-c", "--channels", help="Process specific channels. One hex digit per channel", default='0123456789abcd')
+    parser.add_argument("-z", "--compact", help="Generate compact NSS stream", action="store_true")
 
     parser.add_argument("-v", "--verbose", dest="verbose", action="store_true",
                         default=False, help="print details of processing")
@@ -790,31 +999,30 @@ def main():
     smp = read_samples(m.samples, bs)
     ins = read_instruments(m.instruments, smp, bs)
     p = read_all_patterns(m, bs)
-    
-    dbg("Convert Furnace patterns to unoptimized sequence of NSS opcodes")
-    nss = raw_nss(m, p, bs, arguments.channels)
-    
-    dbg("Transformation passes:")
-    dbg(" - remove unreference NSS labels")
-    nss = remove_unreferenced_labels(nss)
+    channels = [int(c, 16) for c in sorted(list(arguments.channels.lower()))]
+
+    if arguments.compact:
+        streams = [generate_nss_stream(m, p, bs, ins, [c], i) for i, c in enumerate(channels)]
+        # NSS compact header: 1 byte (number of streams) + per stream: 1 byte (type) + 2 bytes (pointer)
+        size = 1 + (3 * len(streams))
+        # all streams sizes
+        size += sum([stream_size(s) for s in streams])
+        asm_header(streams, m, name, size, outfd)
+        nss_compact_header(channels, streams, name, outfd)
+        for ch, stream in zip(channels, streams):
+            nss_to_asm(stream, m, stream_name(name, ch), outfd)
+    else:
+        stream = generate_nss_stream(m, p, bs, ins, channels, -1)
+        # NSS inline marker + stream size
+        size = 1 + stream_size(stream)
+        asm_header(stream, m, name, size, outfd)
+        nss_inline_header(name, outfd)
+        nss_to_asm(stream, m, False, outfd)  # label already printed by nss_inline_header
 
-    dbg(" - merge adjacent WAIT_B opcodes")
-    nss = compact_wait_b(nss)
 
-    dbg(" - remove successive INSTR opcodes if they keep intrument unchanged")
-    nss = compact_instr(nss)
-
-    dbg(" - remove CTX opcodes if they keep the current context unchanged")
-    nss = compact_ctx(nss)
 
-    dbg(" - look for SSG autoenv macros and insert opcodes to simulate them")
-    nss = simulate_ssg_autoenv(nss, ins)
-
-    dbg(" - compute label offset when LOOP opcode is used")
-    nss = compute_loop_offset(nss)
+if __name__ == "__main__":
+    main()
 
-    nss_to_asm(nss, m, name, outfd)
 
 
-if __name__ == "__main__":
-    main()