diff --git a/Makefile b/Makefile index d5a6ea725..6fb90da29 100644 --- a/Makefile +++ b/Makefile @@ -4,19 +4,6 @@ CFLAGS += -I. CYCLONE_CC ?= gcc CYCLONE_CXX ?= g++ -ifneq ("$(PLATFORM)", "libretro") - CFLAGS += -Wall -g - ifndef DEBUG - CFLAGS += -O3 -DNDEBUG - endif -endif - -# This is actually needed, bevieve me. -# If you really have to disable this, set NO_ALIGN_FUNCTIONS elsewhere. -ifndef NO_ALIGN_FUNCTIONS -CFLAGS += -falign-functions=2 -endif - all: config.mak target_ ifndef NO_CONFIG_MAK @@ -34,6 +21,39 @@ else # NO_CONFIG_MAK config.mak: endif +# This is actually needed, believe me. +# If you really have to disable this, set NO_ALIGN_FUNCTIONS elsewhere. +ifndef NO_ALIGN_FUNCTIONS +CFLAGS += -falign-functions=2 +endif + +# profiling +pprof ?= 0 +gperf ?= 0 + +ifneq ("$(PLATFORM)", "libretro") + CFLAGS += -Wall -g +ifneq ($(findstring gcc,$(CC)),) + CFLAGS += -ffunction-sections -fdata-sections + LDFLAGS += -Wl,--gc-sections +endif + ifndef DEBUG + CFLAGS += -O3 -DNDEBUG + endif + + LD = $(CC) + OBJOUT ?= -o + LINKOUT ?= -o +endif + +ifeq ("$(PLATFORM)",$(filter "$(PLATFORM)","gp2x" "opendingux" "rpi1")) +# very small caches, avoid optimization options making the binary much bigger +CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp -ffast-math +# this gets you about 20% better execution speed on 32bit arm/mips +CFLAGS += -fno-common -fno-stack-protector -fno-guess-branch-probability -fno-caller-saves -fno-tree-loop-if-convert -fno-regmove +endif +#OBJS += align.o + # default settings ifeq "$(ARCH)" "arm" use_cyclone ?= 1 @@ -47,12 +67,28 @@ asm_ym2612 ?= 1 asm_misc ?= 1 asm_cdmemory ?= 1 asm_mix ?= 1 -else # if not arm +asm_32xdraw ?= 1 +asm_32xmemory ?= 1 +else ifneq (,$(findstring 86,$(ARCH))) +use_fame ?= 1 +use_cz80 ?= 1 +use_sh2drc ?= 1 +else ifneq (,$(findstring mips,$(ARCH))) +use_fame ?= 1 +use_cz80 ?= 1 +use_sh2drc ?= 1 +else ifneq (,$(findstring aarch64,$(ARCH))) +use_fame ?= 1 +use_cz80 ?= 1 +use_sh2drc ?= 1 +else ifneq (,$(findstring 
riscv,$(ARCH))) +use_fame ?= 1 +use_cz80 ?= 1 +use_sh2drc ?= 1 +else ifneq (,$(findstring powerpc,$(ARCH))) use_fame ?= 1 use_cz80 ?= 1 -ifneq (,$(findstring 86,$(ARCH))) use_sh2drc ?= 1 -endif endif -include Makefile.local @@ -91,6 +127,7 @@ OBJS += platform/libpicofe/gl_platform.o USE_FRONTEND = 1 endif ifeq "$(PLATFORM)" "generic" +CFLAGS += -DSDL_OVERLAY_2X OBJS += platform/linux/emu.o platform/linux/blit.o # FIXME OBJS += platform/common/plat_sdl.o OBJS += platform/libpicofe/plat_sdl.o platform/libpicofe/in_sdl.o @@ -124,6 +161,8 @@ OBJS += platform/gp2x/vid_pollux.o OBJS += platform/gp2x/warm.o USE_FRONTEND = 1 PLATFORM_MP3 = 1 +PLATFORM_ZLIB = 1 +HAVE_ARMv6 = 0 endif ifeq "$(PLATFORM)" "libretro" OBJS += platform/libretro/libretro.o @@ -135,6 +174,7 @@ OBJS += platform/libretro/libretro-common/streams/file_stream.o OBJS += platform/libretro/libretro-common/streams/file_stream_transforms.o OBJS += platform/libretro/libretro-common/vfs/vfs_implementation.o endif +PLATFORM_ZLIB = 1 endif ifeq "$(USE_FRONTEND)" "1" @@ -169,15 +209,17 @@ endif endif # USE_FRONTEND -OBJS += platform/common/mp3.o +OBJS += platform/common/mp3.o platform/common/mp3_sync.o ifeq "$(PLATFORM_MP3)" "1" +platform/common/mp3_helix.o: CFLAGS += -Iplatform/libpicofe +OBJS += platform/common/mp3_helix.o else ifeq "$(HAVE_LIBAVCODEC)" "1" OBJS += platform/common/mp3_libavcodec.o else OBJS += platform/common/mp3_dummy.o endif -ifeq "$(PLATFORM)" "libretro" +ifeq "$(PLATFORM_ZLIB)" "1" # zlib OBJS += zlib/gzio.o zlib/inffast.o zlib/inflate.o zlib/inftrees.o zlib/trees.o \ zlib/deflate.o zlib/crc32.o zlib/adler32.o zlib/zutil.o zlib/compress.o zlib/uncompr.o @@ -198,10 +240,10 @@ LDFLAGS += -Wl,-Map=$(TARGET).map endif endif -target_: $(TARGET) +target_: pico/pico_int_offs.h $(TARGET) clean: - $(RM) $(TARGET) $(OBJS) + $(RM) $(TARGET) $(OBJS) pico/pico_int_offs.h $(RM) -r .opk_data $(TARGET): $(OBJS) @@ -213,10 +255,10 @@ else endif pprof: platform/linux/pprof.c - $(CC) -O2 -ggdb -DPPROF 
-DPPROF_TOOL -I../../ -I. $^ -o $@ + $(CC) $(CFLAGS) -O2 -ggdb -DPPROF -DPPROF_TOOL -I../../ -I. $^ -o $@ $(LDFLAGS) $(LDLIBS) -tools/textfilter: tools/textfilter.c - make -C tools/ textfilter +pico/pico_int_offs.h: tools/mkoffsets.sh + $(MAKE) -C tools/ XCC="$(CC)" XCFLAGS="$(CFLAGS)" %.o: %.c $(CC) -c $(OBJOUT)$@ $< $(CFLAGS) @@ -236,6 +278,14 @@ pico/cd/cd_file.o: CFLAGS += -fno-strict-aliasing pico/cd/pcm.o: CFLAGS += -fno-strict-aliasing pico/cd/LC89510.o: CFLAGS += -fno-strict-aliasing pico/cd/gfx_cd.o: CFLAGS += -fno-strict-aliasing +ifeq (1,$(use_sh2drc)) +ifneq (,$(findstring -flto,$(CFLAGS))) +# if using the DRC, memory and sh2soc directly use the DRC register for SH2 SR +# to avoid saving and reloading it. However, this collides with the use of LTO. +pico/32x/memory.o: CFLAGS += -fno-lto +pico/32x/sh2soc.o: CFLAGS += -fno-lto +endif +endif # fame needs ~2GB of RAM to compile on gcc 4.8 # on x86, this is reduced by ~300MB when debug info is off (but not on ARM) @@ -254,8 +304,8 @@ pico/carthw_cfg.c: pico/carthw.cfg # random deps pico/carthw/svp/compiler.o : cpu/drc/emit_arm.c -cpu/sh2/compiler.o : cpu/drc/emit_arm.c -cpu/sh2/compiler.o : cpu/drc/emit_x86.c +cpu/sh2/compiler.o : cpu/drc/emit_arm.c cpu/drc/emit_arm64.c cpu/drc/emit_ppc.c +cpu/sh2/compiler.o : cpu/drc/emit_x86.c cpu/drc/emit_mips.c cpu/drc/emit_riscv.c cpu/sh2/mame/sh2pico.o : cpu/sh2/mame/sh2.c pico/pico.o pico/cd/mcd.o pico/32x/32x.o : pico/pico_cmn.c pico/pico_int.h pico/memory.o pico/cd/memory.o pico/32x/memory.o : pico/pico_int.h pico/memory.h diff --git a/Makefile.libretro b/Makefile.libretro index daeb02855..4add450c3 100644 --- a/Makefile.libretro +++ b/Makefile.libretro @@ -41,8 +41,8 @@ CFLAGS += -I platform/libretro/libretro-common/include/vfs STATIC_LINKING:= 0 TARGET_NAME := picodrive LIBM := -lm -GIT_VERSION := " $(shell git rev-parse --short HEAD || echo unknown)" -ifneq ($(GIT_VERSION)," unknown") +GIT_VERSION ?= $(shell git rev-parse --short HEAD || echo unknown) +ifneq 
($(GIT_VERSION),unknown) CFLAGS += -DGIT_VERSION=\"$(GIT_VERSION)\" endif @@ -52,6 +52,8 @@ asm_ym2612 = 0 asm_misc = 0 asm_cdmemory = 0 asm_mix = 0 +asm_32xdraw = 0 +asm_32xmemory = 0 fpic := @@ -97,8 +99,8 @@ else ifeq ($(platform), osx) endif ifeq ($(arch),ppc) CFLAGS += -DBLARGG_BIG_ENDIAN=1 -D__ppc__ -DFAMEC_NO_GOTOS - else - use_sh2drc = 1 + else + use_sh2drc = 1 endif OSXVER = `sw_vers -productVersion | cut -d. -f 2` OSX_LT_MAVERICKS = `(( $(OSXVER) <= 9)) && echo "YES"` @@ -120,14 +122,14 @@ else ifeq ($(platform), staticios) CXX += -miphoneos-version-min=8.0 CC_AS += -miphoneos-version-min=8.0 CFLAGS += -miphoneos-version-min=8.0 - ARCH := arm + ARCH := aarch64 STATIC_LINKING = 1 use_cyclone = 0 use_fame = 1 use_drz80 = 0 use_cz80 = 1 - use_sh2drc = 0 + use_sh2drc = 1 use_svpdrc = 0 # iOS @@ -142,7 +144,7 @@ else ifneq (,$(findstring ios,$(platform))) ifeq ($(platform),ios-arm64) CC = clang -arch arm64 -isysroot $(IOSSDK) CXX = clang++ -arch arm64 -isysroot $(IOSSDK) - CFLAGS += -marm -DARM -D__aarch64__=1 + CFLAGS += -marm -DARM -D__aarch64__=1 else CC = clang -arch armv7 -isysroot $(IOSSDK) CXX = clang++ -arch armv7 -isysroot $(IOSSDK) @@ -157,35 +159,32 @@ ifeq ($(platform),$(filter $(platform),ios9 ios-arm64)) CXX += -miphoneos-version-min=8.0 CC_AS += -miphoneos-version-min=8.0 CFLAGS += -miphoneos-version-min=8.0 + ARCH := aarch64 else CC += -miphoneos-version-min=5.0 CXX += -miphoneos-version-min=5.0 CC_AS += -miphoneos-version-min=5.0 CFLAGS += -miphoneos-version-min=5.0 -endif ARCH := arm +endif use_cyclone = 0 use_fame = 1 use_drz80 = 0 use_cz80 = 1 - ifeq ($(platform),ios-arm64) - use_sh2drc = 0 - use_svpdrc = 0 - else - use_sh2drc = 1 - use_svpdrc = 1 - endif + use_sh2drc = 1 + use_svpdrc = 0 # tvOS else ifeq ($(platform), tvos-arm64) - ARCH := arm + ARCH := aarch64 use_cyclone = 0 use_fame = 1 use_drz80 = 0 use_cz80 = 1 - use_sh2drc = 0 + use_sh2drc = 1 use_svpdrc = 0 + TARGET := $(TARGET_NAME)_libretro_tvos.dylib SHARED := -dynamiclib fpic 
:= -fPIC @@ -194,6 +193,9 @@ else ifeq ($(platform), tvos-arm64) IOSSDK := $(shell xcodebuild -version -sdk appletvos Path) endif CC_AS = perl ./tools/gas-preprocessor.pl $(CC) + CC = clang -arch arm64 -isysroot $(IOSSDK) + CXX = clang++ -arch arm64 -isysroot $(IOSSDK) + CFLAGS += -marm -DARM -D__aarch64__=1 CFLAGS += -DIOS # PS3 @@ -206,17 +208,11 @@ else ifeq ($(platform), ps3) NO_MMAP = 1 DONT_COMPILE_IN_ZLIB = 1 - asm_memory = 0 - asm_render = 0 - asm_ym2612 = 0 - asm_misc = 0 - asm_cdpico = 0 - asm_cdmemory = 0 - asm_mix = 0 use_cyclone = 0 use_fame = 1 use_drz80 = 0 use_cz80 = 1 + use_sh2drc = 1 # sncps3 else ifeq ($(platform), sncps3) @@ -228,17 +224,11 @@ else ifeq ($(platform), sncps3) NO_MMAP = 1 DONT_COMPILE_IN_ZLIB = 1 - asm_memory = 0 - asm_render = 0 - asm_ym2612 = 0 - asm_misc = 0 - asm_cdpico = 0 - asm_cdmemory = 0 - asm_mix = 0 use_cyclone = 0 use_fame = 1 use_drz80 = 0 use_cz80 = 1 + use_sh2drc = 1 # Lightweight PS3 Homebrew SDK else ifeq ($(platform), psl1ght) @@ -250,17 +240,11 @@ else ifeq ($(platform), psl1ght) NO_MMAP = 1 DONT_COMPILE_IN_ZLIB = 1 - asm_memory = 0 - asm_render = 0 - asm_ym2612 = 0 - asm_misc = 0 - asm_cdpico = 0 - asm_cdmemory = 0 - asm_mix = 0 use_cyclone = 0 use_fame = 1 use_drz80 = 0 use_cz80 = 1 + use_sh2drc = 1 # PSP else ifeq ($(platform), psp1) @@ -273,17 +257,11 @@ else ifeq ($(platform), psp1) NO_MMAP = 1 DONT_COMPILE_IN_ZLIB = 1 - asm_memory = 0 asm_render = 1 - asm_ym2612 = 0 - asm_misc = 0 - asm_cdpico = 0 - asm_cdmemory = 0 - asm_mix = 0 - use_cyclone = 0 use_fame = 1 use_drz80 = 0 use_cz80 = 1 + use_sh2drc = 1 # PS2 else ifeq ($(platform), ps2) @@ -297,17 +275,12 @@ else ifeq ($(platform), ps2) NO_MMAP = 1 DONT_COMPILE_IN_ZLIB = 1 - asm_memory = 0 asm_render = 1 - asm_ym2612 = 0 - asm_misc = 0 - asm_cdpico = 0 - asm_cdmemory = 0 - asm_mix = 0 use_cyclone = 0 use_fame = 1 use_drz80 = 0 use_cz80 = 1 + use_sh2drc = 1 OBJS += platform/ps2/asm.o @@ -327,14 +300,6 @@ else ifeq ($(platform), ctr) ARCH = arm ARM_ASM = 1 
- asm_memory = 1 - asm_render = 1 - asm_ym2612 = 1 - asm_misc = 1 - asm_cdpico = 1 - asm_cdmemory = 1 - asm_mix = 1 - use_cyclone = 1 use_fame = 0 use_drz80 = 1 @@ -357,14 +322,6 @@ else ifeq ($(platform), raspberrypi) fpic := -fPIC DONT_COMPILE_IN_ZLIB = 1 - asm_memory = 1 - asm_render = 1 - asm_ym2612 = 1 - asm_misc = 1 - asm_cdpico = 1 - asm_cdmemory = 1 - asm_mix = 1 - use_cyclone = 1 use_fame = 0 use_drz80 = 1 @@ -386,14 +343,8 @@ else ifeq ($(platform), vita) NO_MMAP = 1 DONT_COMPILE_IN_ZLIB = 1 ARCH = arm + ARM_ASM=1 - asm_memory = 1 - asm_render = 1 - asm_ym2612 = 1 - asm_misc = 1 - asm_cdpico = 1 - asm_cdmemory = 1 - asm_mix = 1 use_cyclone = 1 use_fame = 0 use_drz80 = 1 @@ -443,13 +394,14 @@ else ifeq ($(platform), switch) else ifeq ($(platform), libnx) include $(DEVKITPRO)/libnx/switch_rules TARGET := $(TARGET_NAME)_libretro_$(platform).a - ARCH := arm64 + ARCH := aarch64 CFLAGS += -O3 -fomit-frame-pointer -ffast-math -I$(DEVKITPRO)/libnx/include/ -fPIE -Wl,--allow-multiple-definition CFLAGS += -specs=$(DEVKITPRO)/libnx/switch.specs CFLAGS += -D__SWITCH__ -DHAVE_LIBNX CFLAGS += -DARM -D__aarch64__=1 -march=armv8-a -mtune=cortex-a57 -mtp=soft -ffast-math -mcpu=cortex-a57+crc+fp+simd -ffunction-sections CFLAGS += -Ifrontend/switch -ftree-vectorize STATIC_LINKING=1 + use_sh2drc = 1 # QNX else ifeq ($(platform), qnx) @@ -489,7 +441,6 @@ else ifeq ($(platform), classic_armv7_a7) HAVE_NEON = 1 ARCH = arm BUILTIN_GPU = neon - USE_DYNAREC = 1 ifeq ($(shell echo `$(CC) -dumpversion` "< 4.9" | bc -l), 1) CFLAGS += -march=armv7-a else @@ -499,6 +450,9 @@ else ifeq ($(platform), classic_armv7_a7) LDFLAGS += -static-libgcc -static-libstdc++ endif endif + + use_sh2drc = 1 + use_svpdrc = 1 # (armv8 a35, hard point, neon based) ### # Playstation Classic @@ -518,19 +472,14 @@ else ifeq ($(platform), classic_armv8_a35) CPPFLAGS += $(CFLAGS) ASFLAGS += $(CFLAGS) HAVE_NEON = 1 - ARCH = arm + ARCH = aarch64 BUILTIN_GPU = neon CFLAGS += -march=armv8-a LDFLAGS += 
-static-libgcc -static-libstdc++ - asm_memory = 0 - asm_render = 0 - asm_ym2612 = 0 - asm_misc = 0 - asm_cdpico = 0 - asm_cdmemory = 0 - asm_mix = 0 + + use_sh2drc = 1 use_cyclone = 0 - use_fame = 1 + use_fame = 1 use_drz80 = 0 use_cz80 = 1 ####################################### @@ -597,25 +546,19 @@ else ifeq ($(platform), emscripten) # GCW0 else ifeq ($(platform), gcw0) TARGET := $(TARGET_NAME)_libretro.so - CC = /opt/gcw0-toolchain/usr/bin/mipsel-linux-gcc - AR = /opt/gcw0-toolchain/usr/bin/mipsel-linux-ar + CC = mipsel-linux-gcc + AR = mipsel-linux-ar SHARED := -shared -nostdlib fpic := -fPIC LIBM := DONT_COMPILE_IN_ZLIB = 1 CFLAGS += -ffast-math -march=mips32 -mtune=mips32r2 -mhard-float - asm_memory = 0 - asm_render = 0 - asm_ym2612 = 0 - asm_misc = 0 - asm_cdpico = 0 - asm_cdmemory = 0 - asm_mix = 0 use_cyclone = 0 use_fame = 1 use_drz80 = 0 use_cz80 = 1 + use_sh2drc = 1 # Windows MSVC 2017 all architectures else ifneq (,$(findstring windows_msvc2017,$(platform))) @@ -743,6 +686,8 @@ asm_ym2612 = 1 asm_misc = 1 asm_cdmemory = 1 asm_mix = 1 +asm_32xdraw = 1 +asm_32xmemory = 1 endif CFLAGS += $(fpic) @@ -755,6 +700,9 @@ endif SHARED ?= -shared LDFLAGS += $(SHARED) $(fpic) +ifneq ($(ARCH), arm) +ARCH = $(shell $(CC) -dumpmachine | awk -F '-' '{print $$1}') +endif PLATFORM = libretro NO_CONFIG_MAK = yes diff --git a/README.md b/README.md new file mode 100644 index 000000000..a5d0ad3a2 --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +This is my foray into dynamic recompilation using PicoDrive, a +Megadrive / Genesis / Sega CD / Mega CD / 32X / SMS emulator. + +I added support for MIPS (mips32r1), ARM64 (aarch64) and RISC-V (RV64IM) to the +SH2 recompiler, as well as spent much effort to optimize the DRC-generated code. +I also optimized SH2 memory access inside the emulator, and did some work on +M68K/SH2 CPU synchronization to fix some problems and speed up the emulator. + +It got a bit out of hand. 
I ended up doing fixes and optimizations all over the +place, mainly for 32X and CD, 32X graphics handling, and probably some more, +see the commit history. As a result, 32X emulation speed has improved a lot. + +### compiling + +I mainly worked with standalone PicoDrive versions as created by configure/make. +A list of platforms for which this is possible can be obtained with + +> configure --help + +If you want to build an executable for a unixoid platform not listed in the +platform list, just use + +> configure --platform=generic + +If DRC is available for the platform, it should be enabled automatically. + +For other platforms using a cross-compiling toolchain I used this, +assuming $TC points to the appropriate cross compile toolchain directory: + +platform|toolchain|configure command +--------|---------|----------------- +gp2x,wiz,caanoo|open2x|CROSS_COMPILE=arm-open2x-linux- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include" LDFLAGS="--sysroot $TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x +gp2x,wiz,caanoo|open2x with ubuntu arm gcc 4.7|CROSS_COMPILE=arm-linux-gnueabi- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include" LDFLAGS="-B$TC/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x +opendingux|opendingux|CROSS_COMPILE=mipsel-linux- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL" LDFLAGS="--sysroot $TC -L$TC/lib" ./configure --platform=opendingux +opendingux|opendingux with ubuntu mips gcc 5.4|CROSS_COMPILE=mipsel-linux-gnu- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL" LDFLAGS="-B$TC/usr/lib -B$TC/lib -Wl,-rpath-link=$TC/usr/lib -Wl,-rpath-link=$TC/lib" ./configure --platform=opendingux +gcw0|gcw0|CROSS_COMPILE=mipsel-gcw0-linux-uclibc- CFLAGS="-I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include 
-I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/SDL" LDFLAGS="--sysroot $TC/usr/mipsel-gcw0-linux-uclibc/sysroot" ./configure --platform=gcw0 + +For gp2x, wiz, and caanoo you may need to compile libpng first. + +After configure, compile with + +> make opk # for opendingux and gcw0 +> +> make # for anything else + +### helix MP3 decoder + +For 32 bit ARM platforms, there is the possibility to compile the helix MP3 +decoder into a shared library to be able to use MP3 audio files with CD games. +The helix source files aren't supplied because of licensing issues. However, if +you have obtained the sources, put them into the platform/common/helix +directory, set CROSS to your cross compiler prefix (e.g. arm-linux-gnueabi-) +and LIBGCC to your cross compiler's libgcc.a +(e.g. /usr/lib/gcc-cross/arm-linux-gnueabi/4.7/libgcc.a), and compile with + +> make -C platform/common/helix CROSS=$CROSS LIBGCC=$LIBGCC + +Copy the resulting ${CROSS}helix_mp3.so as libhelix.so to the directory where +the PicoDrive binary is. + +### installing + +You need to install the resulting binary onto your device manually. +For opendingux and gcw0, copy the opk to your SD card. +For gp2x, wiz and caanoo, the easiest way is to unpack +[PicoDrive_191.zip](http://notaz.gp2x.de/releases/PicoDrive/PicoDrive_191.zip) +on your SD card and replace the PicoDrive binary. + +Send bug reports, fixes etc to +Kai-Uwe Bloem diff --git a/configure b/configure index 1310ab2c9..c82fe2053 100755 --- a/configure +++ b/configure @@ -22,6 +22,13 @@ compile_binary() $c >> config.log 2>&1 } +check_option() +{ + echo 'void test(void) { }' >$TMPC + compile_object $1 || return 1 + return 0 +} + check_define() { $CC -E -dD $CFLAGS pico/arm_features.h | grep -q $1 || return 1 @@ -31,17 +38,18 @@ check_define() # setting options to "yes" or "no" will make that choice default, # "" means "autodetect". 
-platform_list="generic pandora gp2x opendingux rpi1 rpi2" +platform_list="generic pandora gp2x wiz caanoo opendingux gcw0 rpi1 rpi2" platform="generic" sound_driver_list="oss alsa sdl" sound_drivers="" have_armv5="" have_armv6="" have_armv7="" +have_arm_oabi="" have_arm_neon="" have_libavcodec="" need_sdl="no" -need_xlib="no" +need_zlib="no" # these are for known platforms optimize_cortexa8="no" optimize_cortexa7="no" @@ -54,7 +62,7 @@ CC="${CC-${CROSS_COMPILE}gcc}" CXX="${CXX-${CROSS_COMPILE}g++}" AS="${AS-${CROSS_COMPILE}as}" STRIP="${STRIP-${CROSS_COMPILE}strip}" -test -n "$SDL_CONFIG" || SDL_CONFIG="`$CC --print-sysroot 2> /dev/null || true`/usr/bin/sdl-config" +test -n "$SDL_CONFIG" || SDL_CONFIG="`$CC $CFLAGS $LDFLAGS --print-sysroot 2> /dev/null || true`/usr/bin/sdl-config" MAIN_LDLIBS="$LDLIBS -lm" config_mak="config.mak" @@ -78,23 +86,27 @@ set_platform() ;; generic) ;; - opendingux) + opendingux | gcw0) sound_drivers="sdl" + # both are really an opendingux + platform="opendingux" ;; pandora) sound_drivers="oss alsa" optimize_cortexa8="yes" have_arm_neon="yes" ;; - gp2x) + gp2x | wiz | caanoo) sound_drivers="oss" optimize_arm920="yes" + # compile for OABI if toolchain provides it (faster code on caanoo) + have_arm_oabi="yes" + # always use static linking, since caanoo doesn't have OABI libs. 
Moreover, + # dynamic linking slows Wiz 1-10%, and libm on F100 isn't compatible + LDFLAGS="$LDFLAGS -static" + # unified binary for all of them CFLAGS="$CFLAGS -D__GP2X__" - if [ "$CROSS_COMPILE" = "arm-linux-" ]; then - # still using static, dynamic linking slows Wiz 1-10% - # also libm on F100 is not compatible - MAIN_LDLIBS="$MAIN_LDLIBS -static" - fi + platform="gp2x" ;; *) fail "unsupported platform: $platform" @@ -147,18 +159,11 @@ fi # fi #fi -# basic compiler test -cat > $TMPC < $TMPC < $TMPC <> $config_mak if [ "$have_libavcodec" = "yes" ]; then echo "HAVE_LIBAVCODEC = 1" >> $config_mak fi +if [ "$need_zlib" = "yes" ]; then + echo "PLATFORM_ZLIB = 1" >> $config_mak +fi # GP2X toolchains are too old for UAL asm, # so add this here to not litter main Makefile -if [ "$platform" = "g1p2x" ]; then - echo >> $config_mak - echo "%.o: %.S" >> $config_mak - echo " $(CC) $(CFLAGS) -E -c $^ -o /tmp/$(notdir $@).s" >> $config_mak - echo " $(AS) $(ASFLAGS) /tmp/$(notdir $@).s -o $@" >> $config_mak -fi +#if [ "$platform" = "gp2x" ]; then +# echo >> $config_mak +# echo '%.o: %.S' >> $config_mak +# echo ' $(CC) $(CFLAGS) -E -c $^ -o /tmp/$(notdir $@).s' >> $config_mak +# echo ' $(AS) $(ASFLAGS) /tmp/$(notdir $@).s -o $@' >> $config_mak +#fi # use pandora's skin (for now) test -e skin || ln -s platform/pandora/skin skin diff --git a/cpu/DrZ80/drz80.s b/cpu/DrZ80/drz80.S similarity index 90% rename from cpu/DrZ80/drz80.s rename to cpu/DrZ80/drz80.S index c2a64df3f..4d592b169 100644 --- a/cpu/DrZ80/drz80.s +++ b/cpu/DrZ80/drz80.S @@ -5,6 +5,8 @@ ;@ For commercial use, separate licencing terms must be obtained. 
+#include "../../pico/arm_features.h" + .data .align 4 @@ -102,6 +104,7 @@ DrZ80Ver: .long 0x0001 ;@--------------------------------------- .text + PIC_LDR_INIT() .if DRZ80_XMAP @@ -1370,7 +1373,7 @@ DrZ80Run: blne DoInterrupt .endif - ldr opcodes,MAIN_opcodes_POINTER2 + PIC_LDR(opcodes, r0, MAIN_opcodes) cmp z80_icount,#0 ;@ irq might have used all cycles ldrplb r0,[z80pc],#1 @@ -1382,11 +1385,7 @@ z80_execute_end: stmia cpucontext,{z80pc-z80sp} ;@ save Z80 registers mov r0,z80_icount ldmia sp!,{r4-r12,pc} ;@ restore registers from stack and return to C code - -MAIN_opcodes_POINTER2: .word MAIN_opcodes -.if INTERRUPT_MODE -Interrupt_local: .word Interrupt -.endif +.pool DoInterrupt: .if INTERRUPT_MODE @@ -1395,8 +1394,9 @@ DoInterrupt: ;@ save everything back into DrZ80 context stmia cpucontext,{z80pc-z80sp} ;@ save Z80 registers stmfd sp!,{r3,r4,r5,lr} ;@ save rest of regs on stack + PIC_LDR(r2, r3, Interrupt) mov lr,pc - ldr pc,Interrupt_local + bx r2 ldmfd sp!,{r3,r4,r5,lr} ;@ load regs from stack ;@ reload regs from DrZ80 context ldmia cpucontext,{z80pc-z80sp} ;@ load Z80 registers @@ -4469,7 +4469,6 @@ opcode_2_6: and z80hl,z80hl,#0xFF<<16 orr z80hl,z80hl,r1, lsl #24 fetch 7 -DAATABLE_LOCAL: .word DAATable ;@DAA opcode_2_7: mov r1,z80a, lsr #24 @@ -4479,13 +4478,14 @@ opcode_2_7: orrne r1,r1,#512 tst z80f,#1< #include #endif @@ -277,7 +278,8 @@ INT32 Cz80_Exec(cz80_struc *CPU, INT32 cycles) CPU->ICount -= CPU->ExtraCycles; CPU->ExtraCycles = 0; } - goto Cz80_Exec; + if (!CPU->HaltState) + goto Cz80_Exec; } } else CPU->ICount = 0; @@ -287,6 +289,8 @@ INT32 Cz80_Exec(cz80_struc *CPU, INT32 cycles) #if CZ80_ENCRYPTED_ROM CPU->OPBase = OPBase; #endif + if (CPU->HaltState) + CPU->ICount = 0; cycles -= CPU->ICount; #if !CZ80_EMULATE_R_EXACTLY zR = (zR + (cycles >> 2)) & 0x7f; diff --git a/cpu/cz80/cz80_op.c b/cpu/cz80/cz80_op.c index f84f8e754..b1520088b 100644 --- a/cpu/cz80/cz80_op.c +++ b/cpu/cz80/cz80_op.c @@ -687,13 +687,13 @@ switch (Opcode) OP(0x76): // HALT 
OP_HALT: CPU->HaltState = 1; - CPU->ICount = 0; goto Cz80_Check_Interrupt; OP(0xf3): // DI OP_DI: zIFF = 0; - RET(4) + USE_CYCLES(4) + goto Cz80_Exec_nocheck; OP(0xfb): // EI OP_EI: diff --git a/cpu/drc/cmn.h b/cpu/drc/cmn.h index f5c595f29..9c041e704 100644 --- a/cpu/drc/cmn.h +++ b/cpu/drc/cmn.h @@ -1,16 +1,44 @@ -#ifndef UTYPES_DEFINED -typedef unsigned char u8; -typedef signed char s8; -typedef unsigned short u16; -typedef signed short s16; -typedef unsigned int u32; -typedef signed int s32; -#endif -#define DRC_TCACHE_SIZE (2*1024*1024) +#define DRC_TCACHE_SIZE (4*1024*1024) extern u8 *tcache; void drc_cmn_init(void); void drc_cmn_cleanup(void); +#define BITMASK1(v0) (1 << (v0)) +#define BITMASK2(v0,v1) ((1 << (v0)) | (1 << (v1))) +#define BITMASK3(v0,v1,v2) (BITMASK2(v0,v1) | (1 << (v2))) +#define BITMASK4(v0,v1,v2,v3) (BITMASK3(v0,v1,v2) | (1 << (v3))) +#define BITMASK5(v0,v1,v2,v3,v4) (BITMASK4(v0,v1,v2,v3) | (1 << (v4))) +#define BITMASK6(v0,v1,v2,v3,v4,v5) (BITMASK5(v0,v1,v2,v3,v4) | (1 << (v5))) +#define BITRANGE(v0,v1) (BITMASK1(v1+1)-BITMASK1(v0)) // set with v0..v1 + +// binary search approach, since we don't have CLZ on ARM920T +#define FOR_ALL_BITS_SET_DO(mask, bit, code) { \ + u32 __mask = mask; \ + for (bit = 0; bit < 32 && __mask; bit++, __mask >>= 1) { \ + if (!(__mask & 0xffff)) \ + bit += 16,__mask >>= 16; \ + if (!(__mask & 0xff)) \ + bit += 8, __mask >>= 8; \ + if (!(__mask & 0xf)) \ + bit += 4, __mask >>= 4; \ + if (!(__mask & 0x3)) \ + bit += 2, __mask >>= 2; \ + if (!(__mask & 0x1)) \ + bit += 1, __mask >>= 1; \ + if (__mask & 0x1) { \ + code; \ + } \ + } \ +} + +// inspired by https://graphics.stanford.edu/~seander/bithacks.html +static inline int count_bits(unsigned val) +{ + val = val - ((val >> 1) & 0x55555555); + val = (val & 0x33333333) + ((val >> 2) & 0x33333333); + return (((val + (val >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; +} + diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 91b474024..3f373435b 100644 --- 
a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -1,34 +1,189 @@ /* * Basic macros to emit ARM instructions and some utils * Copyright (C) 2008,2009,2010 notaz + * Copyright (C) 2019 kub * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. */ -#define CONTEXT_REG 11 -#define RET_REG 0 +#define HOST_REGS 16 + +// OABI/EABI: params: r0-r3, return: r0-r1, temp: r12,r14, saved: r4-r8,r10,r11 +// SP,PC: r13,r15 must not be used. saved: r9 (for platform use, e.g. on ios) +#define RET_REG 0 +#define PARAM_REGS { 0, 1, 2, 3 } +#ifndef __MACH__ +#define PRESERVED_REGS { 4, 5, 6, 7, 8, 9, 10, 11 } +#else +#define PRESERVED_REGS { 4, 5, 6, 7, 8, 10, 11 } // no r9.. +#endif +#define TEMPORARY_REGS { 12, 14 } + +#define CONTEXT_REG 11 +#define STATIC_SH2_REGS { SHR_SR,10 , SHR_R(0),8 , SHR_R(1),9 } // XXX: tcache_ptr type for SVP and SH2 compilers differs.. #define EMIT_PTR(ptr, x) \ do { \ *(u32 *)ptr = x; \ ptr = (void *)((u8 *)ptr + sizeof(u32)); \ - COUNT_OP; \ } while (0) -#define EMIT(x) EMIT_PTR(tcache_ptr, x) +// ARM special registers and peephole optimization flags +#define SP 13 // stack pointer +#define LR 14 // link (return address) +#define PC 15 // program counter +#define SR 16 // CPSR, status register +#define MEM 17 // memory access (src=LDR, dst=STR) +#define CYC1 20 // 1 cycle interlock (LDR, reg-cntrld shift) +#define CYC2 (CYC1+1)// 2+ cycles interlock (LDR[BH], MUL/MLA etc) +#define NO 32 // token for "no register" + +// bitmask builders +#define M1(x) (u32)(1ULL<<(x)) // u32 to have NO evaluate to 0 +#define M2(x,y) (M1(x)|M1(y)) +#define M3(x,y,z) (M2(x,y)|M1(z)) +#define M4(x,y,z,a) (M3(x,y,z)|M1(a)) +#define M5(x,y,z,a,b) (M4(x,y,z,a)|M1(b)) +#define M6(x,y,z,a,b,c) (M5(x,y,z,a,b)|M1(c)) +#define M10(a,b,c,d,e,f,g,h,i,j) (M5(a,b,c,d,e)|M5(f,g,h,i,j)) + +// sys_cacheflush always flushes whole pages, and it's rather expensive on ARMs +// hold a list of pending cache updates and merge requests to reduce 
cacheflush +static struct { void *base, *end; } pageflush[4]; +static unsigned pagesize = 4096; + +static void emith_update_cache(void) +{ + int i; + + for (i = 0; i < 4 && pageflush[i].base; i++) { + cache_flush_d_inval_i(pageflush[i].base, pageflush[i].end + pagesize-1); + pageflush[i].base = NULL; + } +} + +static inline void emith_update_add(void *base, void *end) +{ + void *p_base = (void *)((uintptr_t)(base) & ~(pagesize-1)); + void *p_end = (void *)((uintptr_t)(end ) & ~(pagesize-1)); + int i; + + for (i = 0; i < 4 && pageflush[i].base; i++) { + if (p_base <= pageflush[i].end+pagesize && p_end >= pageflush[i].end) { + if (p_base < pageflush[i].base) pageflush[i].base = p_base; + pageflush[i].end = p_end; + return; + } + if (p_base <= pageflush[i].base && p_end >= pageflush[i].base-pagesize) { + if (p_end > pageflush[i].end) pageflush[i].end = p_end; + pageflush[i].base = p_base; + return; + } + } + if (i == 4) { + /* list full and not mergeable -> flush list */ + emith_update_cache(); + i = 0; + } + pageflush[i].base = p_base, pageflush[i].end = p_end; +} + +// peephole optimizer. 
ATM only tries to reduce interlock +#define EMIT_CACHE_SIZE 6 +struct emit_op { + u32 op; + u32 src, dst; +}; + +// peephole cache, last commited insn + cache + next insn = size+2 +static struct emit_op emit_cache[EMIT_CACHE_SIZE+2]; +static int emit_index; +#define emith_insn_ptr() (u8 *)((u32 *)tcache_ptr-emit_index) + +static inline void emith_pool_adjust(int tcache_offs, int move_offs); + +static NOINLINE void EMIT(u32 op, u32 dst, u32 src) +{ + void * emit_ptr = (u32 *)tcache_ptr - emit_index; + struct emit_op *const ptr = emit_cache; + const int n = emit_index+1; + int i, bi, bd = 0; + + // account for new insn in tcache + tcache_ptr = (void *)((u32 *)tcache_ptr + 1); + COUNT_OP; + // for conditional execution SR is always source + if (op < 0xe0000000 /*A_COND_AL << 28*/) + src |= M1(SR); + // put insn on back of queue // mask away the NO token + emit_cache[n] = (struct emit_op) + { .op=op, .src=src & ~M1(NO), .dst=dst & ~M1(NO) }; + // check insns down the queue as long as permitted by dependencies + for (bd = bi = 0, i = emit_index; i > 1 && !(dst & M1(PC)); i--) { + int deps = 0; + // dst deps between i and n must not be swapped, since any deps + // but [i].src & [n].src lead to changed semantics if swapped. 
+ if ((ptr[i].dst & ptr[n].src) || (ptr[n].dst & ptr[i].src) || + (ptr[i].dst & ptr[n].dst)) + break; + // don't swap insns reading PC if it's not a word pool load + // (ptr[i].op&0xf700000) != EOP_C_AM2_IMM(0,0,0,1,0,0,0)) + if ((ptr[i].src & M1(PC)) && (ptr[i].op&0xf700000) != 0x5100000) + break; + + // calculate ARM920T interlock cycles (differences only) +#define D2(x,y) ((ptr[x].dst & ptr[y].src)?((ptr[x].src >> CYC2) & 1):0) +#define D1(x,y) ((ptr[x].dst & ptr[y].src)?((ptr[x].src >> CYC1) & 3):0) + // insn sequence: [..., i-2, i-1, i, i+1, ..., n-2, n-1, n] + deps -= D2(i-2,i)+D2(i-1,i+1)+D2(n-2,n ) + D1(i-1,i)+D1(n-1,n); + deps -= !!(ptr[n].src & M2(CYC1,CYC2));// favour moving LDR down + // insn sequence: [..., i-2, i-1, n, i, i+1, ..., n-2, n-1] + deps += D2(i-2,n)+D2(i-1,i )+D2(n ,i+1) + D1(i-1,n)+D1(n ,i); + deps += !!(ptr[i].src & M2(CYC1,CYC2));// penalize moving LDR up + // remember best match found + if (bd > deps) + bd = deps, bi = i; + } + // swap if fewer dependencies + if (bd < 0) { + // make room for new insn at bi + struct emit_op tmp = ptr[n]; + for (i = n-1; i >= bi; i--) { + ptr[i+1] = ptr[i]; + if (ptr[i].src & M1(PC)) + emith_pool_adjust(n-i+1, 1); + } + // insert new insn at bi + ptr[bi] = tmp; + if (ptr[bi].src & M1(PC)) + emith_pool_adjust(1, bi-n); + } + if (dst & M1(PC)) { + // commit everything if a branch insn is emitted + for (i = 1; i <= emit_index+1; i++) + EMIT_PTR(emit_ptr, emit_cache[i].op); + emit_index = 0; + } else if (emit_index < EMIT_CACHE_SIZE) { + // queue not yet full + emit_index++; + } else { + // commit oldest insn from cache + EMIT_PTR(emit_ptr, emit_cache[1].op); + for (i = 0; i <= emit_index; i++) + emit_cache[i] = emit_cache[i+1]; + } +} + +static void emith_flush(void) +{ + int i; + void *emit_ptr = (u32 *)tcache_ptr - emit_index; -#define A_R4M (1 << 4) -#define A_R5M (1 << 5) -#define A_R6M (1 << 6) -#define A_R7M (1 << 7) -#define A_R8M (1 << 8) -#define A_R9M (1 << 9) -#define A_R10M (1 << 10) -#define 
A_R11M (1 << 11) -#define A_R12M (1 << 12) -#define A_R14M (1 << 14) -#define A_R15M (1 << 15) + for (i = 1; i <= emit_index; i++) + EMIT_PTR(emit_ptr, emit_cache[i].op); + emit_index = 0; +} #define A_COND_AL 0xe #define A_COND_EQ 0x0 @@ -47,6 +202,7 @@ #define A_COND_LE 0xd #define A_COND_CS A_COND_HS #define A_COND_CC A_COND_LO +#define A_COND_NV 0xf // Not Valid (aka NeVer :-) - ATTN: not a real condition! /* unified conditions */ #define DCOND_EQ A_COND_EQ @@ -64,6 +220,9 @@ #define DCOND_VS A_COND_VS #define DCOND_VC A_COND_VC +#define DCOND_CS A_COND_HS +#define DCOND_CC A_COND_LO + /* addressing mode 1 */ #define A_AM1_LSL 0 #define A_AM1_LSR 1 @@ -86,18 +245,26 @@ #define A_OP_TST 0x8 #define A_OP_TEQ 0x9 #define A_OP_CMP 0xa -#define A_OP_CMN 0xa +#define A_OP_CMN 0xb #define A_OP_ORR 0xc #define A_OP_MOV 0xd #define A_OP_BIC 0xe #define A_OP_MVN 0xf -#define EOP_C_DOP_X(cond,op,s,rn,rd,shifter_op) \ - EMIT(((cond)<<28) | ((op)<< 21) | ((s)<<20) | ((rn)<<16) | ((rd)<<12) | (shifter_op)) +// operation specific register usage in DOP +#define A_Rn(op,rn) (((op)&0xd)!=0xd ? rn:NO) // no rn for MOV,MVN +#define A_Rd(op,rd) (((op)&0xc)!=0x8 ? rd:NO) // no rd for TST,TEQ,CMP,CMN +// CSPR is dst if S set, CSPR is src if op is ADC/SBC/RSC or shift is RRX +#define A_Sd(s) ((s) ? SR:NO) +#define A_Sr(op,sop) (((op)>=0x5 && (op)<=0x7) || (sop)>>4==A_AM1_ROR<<1 ? 
SR:NO) -#define EOP_C_DOP_IMM( cond,op,s,rn,rd,ror2,imm8) EOP_C_DOP_X(cond,op,s,rn,rd,A_AM1_IMM(ror2,imm8)) -#define EOP_C_DOP_REG_XIMM(cond,op,s,rn,rd,shift_imm,shift_op,rm) EOP_C_DOP_X(cond,op,s,rn,rd,A_AM1_REG_XIMM(shift_imm,shift_op,rm)) -#define EOP_C_DOP_REG_XREG(cond,op,s,rn,rd,rs, shift_op,rm) EOP_C_DOP_X(cond,op,s,rn,rd,A_AM1_REG_XREG(rs, shift_op,rm)) +#define EOP_C_DOP_X(cond,op,s,rn,rd,sop,rm,rs) \ + EMIT(((cond)<<28) | ((op)<< 21) | ((s)<<20) | ((rn)<<16) | ((rd)<<12) | (sop), \ + M2(A_Rd(op,rd),A_Sd(s)), M5(A_Sr(op,sop),A_Rn(op,rn),rm,rs,rs==NO?NO:CYC1)) + +#define EOP_C_DOP_IMM( cond,op,s,rn,rd,ror2,imm8) EOP_C_DOP_X(cond,op,s,rn,rd,A_AM1_IMM(ror2,imm8), NO, NO) +#define EOP_C_DOP_REG_XIMM(cond,op,s,rn,rd,shift_imm,shift_op,rm) EOP_C_DOP_X(cond,op,s,rn,rd,A_AM1_REG_XIMM(shift_imm,shift_op,rm), rm, NO) +#define EOP_C_DOP_REG_XREG(cond,op,s,rn,rd,rs, shift_op,rm) EOP_C_DOP_X(cond,op,s,rn,rd,A_AM1_REG_XREG(rs, shift_op,rm), rm, rs) #define EOP_MOV_IMM(rd, ror2,imm8) EOP_C_DOP_IMM(A_COND_AL,A_OP_MOV,0, 0,rd,ror2,imm8) #define EOP_MVN_IMM(rd, ror2,imm8) EOP_C_DOP_IMM(A_COND_AL,A_OP_MVN,0, 0,rd,ror2,imm8) @@ -157,153 +324,239 @@ /* addressing mode 2 */ #define EOP_C_AM2_IMM(cond,u,b,l,rn,rd,offset_12) \ - EMIT(((cond)<<28) | 0x05000000 | ((u)<<23) | ((b)<<22) | ((l)<<20) | ((rn)<<16) | ((rd)<<12) | (offset_12)) + EMIT(((cond)<<28) | 0x05000000 | ((u)<<23) | ((b)<<22) | ((l)<<20) | ((rn)<<16) | ((rd)<<12) | \ + ((offset_12) & 0xfff), M1(l?rd:MEM), M3(rn,l?MEM:rd,l?b?CYC2:CYC1:NO)) #define EOP_C_AM2_REG(cond,u,b,l,rn,rd,shift_imm,shift_op,rm) \ EMIT(((cond)<<28) | 0x07000000 | ((u)<<23) | ((b)<<22) | ((l)<<20) | ((rn)<<16) | ((rd)<<12) | \ - ((shift_imm)<<7) | ((shift_op)<<5) | (rm)) + A_AM1_REG_XIMM(shift_imm, shift_op, rm), M1(l?rd:MEM), M4(rn,rm,l?MEM:rd,l?b?CYC2:CYC1:NO)) /* addressing mode 3 */ #define EOP_C_AM3(cond,u,r,l,rn,rd,s,h,immed_reg) \ EMIT(((cond)<<28) | 0x01000090 | ((u)<<23) | ((r)<<22) | ((l)<<20) | ((rn)<<16) | ((rd)<<12) | \ - ((s)<<6) | 
((h)<<5) | (immed_reg)) + ((s)<<6) | ((h)<<5) | (immed_reg), M1(l?rd:MEM), M4(rn,r?NO:immed_reg,l?MEM:rd,l?CYC2:NO)) #define EOP_C_AM3_IMM(cond,u,l,rn,rd,s,h,offset_8) EOP_C_AM3(cond,u,1,l,rn,rd,s,h,(((offset_8)&0xf0)<<4)|((offset_8)&0xf)) #define EOP_C_AM3_REG(cond,u,l,rn,rd,s,h,rm) EOP_C_AM3(cond,u,0,l,rn,rd,s,h,rm) /* ldr and str */ -#define EOP_LDR_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,1,0,1,rn,rd,offset_12) -#define EOP_LDRB_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,1,1,1,rn,rd,offset_12) +#define EOP_LDR_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,(offset_12) >= 0,0,1,rn,rd,abs(offset_12)) +#define EOP_LDRB_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,(offset_12) >= 0,1,1,rn,rd,abs(offset_12)) +#define EOP_STR_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,(offset_12) >= 0,0,0,rn,rd,abs(offset_12)) -#define EOP_LDR_IMM( rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,1,0,1,rn,rd,offset_12) -#define EOP_LDR_NEGIMM(rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,0,0,1,rn,rd,offset_12) +#define EOP_LDR_IMM( rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,(offset_12) >= 0,0,1,rn,rd,abs(offset_12)) #define EOP_LDR_SIMPLE(rd,rn) EOP_C_AM2_IMM(A_COND_AL,1,0,1,rn,rd,0) -#define EOP_STR_IMM( rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,1,0,0,rn,rd,offset_12) +#define EOP_STR_IMM( rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,(offset_12) >= 0,0,0,rn,rd,abs(offset_12)) #define EOP_STR_SIMPLE(rd,rn) EOP_C_AM2_IMM(A_COND_AL,1,0,0,rn,rd,0) #define EOP_LDR_REG_LSL(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,0,1,rn,rd,shift_imm,A_AM1_LSL,rm) +#define EOP_LDR_REG_LSL_WB(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,0,3,rn,rd,shift_imm,A_AM1_LSL,rm) +#define EOP_LDRB_REG_LSL(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,1,1,rn,rd,shift_imm,A_AM1_LSL,rm) +#define EOP_STR_REG_LSL_WB(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,0,2,rn,rd,shift_imm,A_AM1_LSL,rm) -#define EOP_LDRH_IMM2(cond,rd,rn,offset_8) EOP_C_AM3_IMM(cond,1,1,rn,rd,0,1,offset_8) +#define EOP_LDRH_IMM2(cond,rd,rn,offset_8) 
EOP_C_AM3_IMM(cond,(offset_8) >= 0,1,rn,rd,0,1,abs(offset_8)) +#define EOP_LDRH_REG2(cond,rd,rn,rm) EOP_C_AM3_REG(cond,1,1,rn,rd,0,1,rm) -#define EOP_LDRH_IMM( rd,rn,offset_8) EOP_C_AM3_IMM(A_COND_AL,1,1,rn,rd,0,1,offset_8) +#define EOP_LDRH_IMM( rd,rn,offset_8) EOP_C_AM3_IMM(A_COND_AL,(offset_8) >= 0,1,rn,rd,0,1,abs(offset_8)) #define EOP_LDRH_SIMPLE(rd,rn) EOP_C_AM3_IMM(A_COND_AL,1,1,rn,rd,0,1,0) #define EOP_LDRH_REG( rd,rn,rm) EOP_C_AM3_REG(A_COND_AL,1,1,rn,rd,0,1,rm) -#define EOP_STRH_IMM( rd,rn,offset_8) EOP_C_AM3_IMM(A_COND_AL,1,0,rn,rd,0,1,offset_8) +#define EOP_STRH_IMM( rd,rn,offset_8) EOP_C_AM3_IMM(A_COND_AL,(offset_8) >= 0,0,rn,rd,0,1,abs(offset_8)) #define EOP_STRH_SIMPLE(rd,rn) EOP_C_AM3_IMM(A_COND_AL,1,0,rn,rd,0,1,0) #define EOP_STRH_REG( rd,rn,rm) EOP_C_AM3_REG(A_COND_AL,1,0,rn,rd,0,1,rm) +#define EOP_LDRSB_IMM2(cond,rd,rn,offset_8) EOP_C_AM3_IMM(cond,(offset_8) >= 0,1,rn,rd,1,0,abs(offset_8)) +#define EOP_LDRSB_REG2(cond,rd,rn,rm) EOP_C_AM3_REG(cond,1,1,rn,rd,1,0,rm) +#define EOP_LDRSH_IMM2(cond,rd,rn,offset_8) EOP_C_AM3_IMM(cond,(offset_8) >= 0,1,rn,rd,1,1,abs(offset_8)) +#define EOP_LDRSH_REG2(cond,rd,rn,rm) EOP_C_AM3_REG(cond,1,1,rn,rd,1,1,rm) + /* ldm and stm */ #define EOP_XXM(cond,p,u,s,w,l,rn,list) \ - EMIT(((cond)<<28) | (1<<27) | ((p)<<24) | ((u)<<23) | ((s)<<22) | ((w)<<21) | ((l)<<20) | ((rn)<<16) | (list)) + EMIT(((cond)<<28) | (1<<27) | ((p)<<24) | ((u)<<23) | ((s)<<22) | ((w)<<21) | ((l)<<20) | ((rn)<<16) | (list), \ + M2(rn,l?NO:MEM)|(l?list:0), M3(rn,l?MEM:NO,l?CYC2:NO)|(l?0:list)) #define EOP_STMIA(rb,list) EOP_XXM(A_COND_AL,0,1,0,0,0,rb,list) #define EOP_LDMIA(rb,list) EOP_XXM(A_COND_AL,0,1,0,0,1,rb,list) -#define EOP_STMFD_SP(list) EOP_XXM(A_COND_AL,1,0,0,1,0,13,list) -#define EOP_LDMFD_SP(list) EOP_XXM(A_COND_AL,0,1,0,1,1,13,list) +#define EOP_STMFD_SP(list) EOP_XXM(A_COND_AL,1,0,0,1,0,SP,list) +#define EOP_LDMFD_SP(list) EOP_XXM(A_COND_AL,0,1,0,1,1,SP,list) /* branches */ #define EOP_C_BX(cond,rm) \ - EMIT(((cond)<<28) | 
0x012fff10 | (rm)) + EMIT(((cond)<<28) | 0x012fff10 | (rm), M1(PC), M1(rm)) #define EOP_C_B_PTR(ptr,cond,l,signed_immed_24) \ EMIT_PTR(ptr, ((cond)<<28) | 0x0a000000 | ((l)<<24) | (signed_immed_24)) #define EOP_C_B(cond,l,signed_immed_24) \ - EOP_C_B_PTR(tcache_ptr,cond,l,signed_immed_24) + EMIT(((cond)<<28) | 0x0a000000 | ((l)<<24) | (signed_immed_24), M2(PC,l?LR:NO), M1(PC)) #define EOP_B( signed_immed_24) EOP_C_B(A_COND_AL,0,signed_immed_24) #define EOP_BL(signed_immed_24) EOP_C_B(A_COND_AL,1,signed_immed_24) /* misc */ #define EOP_C_MUL(cond,s,rd,rs,rm) \ - EMIT(((cond)<<28) | ((s)<<20) | ((rd)<<16) | ((rs)<<8) | 0x90 | (rm)) + EMIT(((cond)<<28) | ((s)<<20) | ((rd)<<16) | ((rs)<<8) | 0x90 | (rm), M2(rd,s?SR:NO), M3(rs,rm,CYC2)) #define EOP_C_UMULL(cond,s,rdhi,rdlo,rs,rm) \ - EMIT(((cond)<<28) | 0x00800000 | ((s)<<20) | ((rdhi)<<16) | ((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm)) + EMIT(((cond)<<28) | 0x00800000 | ((s)<<20) | ((rdhi)<<16) | ((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm), M3(rdhi,rdlo,s?SR:NO), M4(rs,rm,CYC1,CYC2)) #define EOP_C_SMULL(cond,s,rdhi,rdlo,rs,rm) \ - EMIT(((cond)<<28) | 0x00c00000 | ((s)<<20) | ((rdhi)<<16) | ((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm)) + EMIT(((cond)<<28) | 0x00c00000 | ((s)<<20) | ((rdhi)<<16) | ((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm), M3(rdhi,rdlo,s?SR:NO), M4(rs,rm,CYC1,CYC2)) #define EOP_C_SMLAL(cond,s,rdhi,rdlo,rs,rm) \ - EMIT(((cond)<<28) | 0x00e00000 | ((s)<<20) | ((rdhi)<<16) | ((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm)) + EMIT(((cond)<<28) | 0x00e00000 | ((s)<<20) | ((rdhi)<<16) | ((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm), M3(rdhi,rdlo,s?SR:NO), M6(rs,rm,rdlo,rdhi,CYC1,CYC2)) #define EOP_MUL(rd,rm,rs) EOP_C_MUL(A_COND_AL,0,rd,rs,rm) // note: rd != rm #define EOP_C_MRS(cond,rd) \ - EMIT(((cond)<<28) | 0x010f0000 | ((rd)<<12)) + EMIT(((cond)<<28) | 0x010f0000 | ((rd)<<12), M1(rd), M1(SR)) #define EOP_C_MSR_IMM(cond,ror2,imm) \ - EMIT(((cond)<<28) | 0x0328f000 | ((ror2)<<8) | (imm)) // cpsr_f + EMIT(((cond)<<28) | 0x0328f000 | 
((ror2)<<8) | (imm), M1(SR), 0) // cpsr_f #define EOP_C_MSR_REG(cond,rm) \ - EMIT(((cond)<<28) | 0x0128f000 | (rm)) // cpsr_f + EMIT(((cond)<<28) | 0x0128f000 | (rm), M1(SR), M1(rm)) // cpsr_f #define EOP_MRS(rd) EOP_C_MRS(A_COND_AL,rd) #define EOP_MSR_IMM(ror2,imm) EOP_C_MSR_IMM(A_COND_AL,ror2,imm) #define EOP_MSR_REG(rm) EOP_C_MSR_REG(A_COND_AL,rm) #define EOP_MOVW(rd,imm) \ - EMIT(0xe3000000 | ((rd)<<12) | ((imm)&0xfff) | (((imm)<<4)&0xf0000)) + EMIT(0xe3000000 | ((rd)<<12) | ((imm)&0xfff) | (((imm)<<4)&0xf0000), M1(rd), NO) #define EOP_MOVT(rd,imm) \ - EMIT(0xe3400000 | ((rd)<<12) | (((imm)>>16)&0xfff) | (((imm)>>12)&0xf0000)) + EMIT(0xe3400000 | ((rd)<<12) | (((imm)>>16)&0xfff) | (((imm)>>12)&0xf0000), M1(rd), NO) + +// host literal pool; must be significantly smaller than 1024 (max LDR offset = 4096) +#define MAX_HOST_LITERALS 128 +static u32 literal_pool[MAX_HOST_LITERALS]; +static u32 *literal_insn[MAX_HOST_LITERALS]; +static int literal_pindex, literal_iindex; -// XXX: AND, RSB, *C, will break if 1 insn is not enough +static int emith_pool_literal(u32 imm, int *offs) +{ + int idx = literal_pindex - 8; // max look behind in pool + // see if one of the last literals was the same (or close enough) + for (idx = (idx < 0 ? 
0 : idx); idx < literal_pindex; idx++) + if (abs((int)(imm - literal_pool[idx])) <= 0xff) + break; + if (idx == literal_pindex) // store new literal + literal_pool[literal_pindex++] = imm; + *offs = imm - literal_pool[idx]; + return idx; +} + +// XXX: RSB, *S will break if 1 insn is not enough static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int imm) { int ror2; u32 v; - - switch (op) { - case A_OP_MOV: - rn = 0; - if (~imm < 0x10000) { - imm = ~imm; - op = A_OP_MVN; - } -#ifdef HAVE_ARMV7 - for (v = imm, ror2 = 0; v && !(v & 3); v >>= 2) - ror2--; - if (v >> 8) { - /* 2+ insns needed - prefer movw/movt */ - if (op == A_OP_MVN) + int i; + + if (cond == A_COND_NV) + return; + + do { + u32 u; + // try to get the topmost byte empty to possibly save an insn + for (v = imm, ror2 = 0; (v >> 24) && ror2 < 32/2; ror2++) + v = (v << 2) | (v >> 30); + + switch (op) { + case A_OP_MOV: + case A_OP_MVN: + rn = 0; + // use MVN if more bits 1 than 0 + if (count_bits(imm) > 16) { imm = ~imm; - EOP_MOVW(rd, imm); - if (imm & 0xffff0000) - EOP_MOVT(rd, imm); + op = A_OP_MVN; + ror2 = -1; + break; + } + // count insns needed for mov/orr #imm +#ifdef HAVE_ARMV7 + for (i = 2, u = v; i > 0; i--, u >>= 8) + while (u > 0xff && !(u & 3)) + u >>= 2; + if (u) { // 3+ insns needed... + if (op == A_OP_MVN) + imm = ~imm; + // ...prefer movw/movt + EOP_MOVW(rd, imm); + if (imm & 0xffff0000) + EOP_MOVT(rd, imm); + return; + } +#else + for (i = 2, u = v; i > 0; i--, u >>= 8) + while (u > 0xff && !(u & 3)) + u >>= 2; + if (u) { // 3+ insns needed... 
+ if (op == A_OP_MVN) + imm = ~imm; + // ...emit literal load + int idx, o; + if (literal_iindex >= MAX_HOST_LITERALS) { + elprintf(EL_STATUS|EL_SVP|EL_ANOMALY, + "pool overflow"); + exit(1); + } + idx = emith_pool_literal(imm, &o); + literal_insn[literal_iindex++] = (u32 *)tcache_ptr; + EOP_LDR_IMM2(cond, rd, PC, idx * sizeof(u32)); + if (o > 0) + EOP_C_DOP_IMM(cond, A_OP_ADD, 0,rd,rd,0,o); + else if (o < 0) + EOP_C_DOP_IMM(cond, A_OP_SUB, 0,rd,rd,0,-o); return; - } + } #endif - break; - - case A_OP_EOR: - case A_OP_SUB: - case A_OP_ADD: - case A_OP_ORR: - case A_OP_BIC: - if (s == 0 && imm == 0) - return; - break; - } - - for (v = imm, ror2 = 0; ; ror2 -= 8/2) { - /* shift down to get 'best' rot2 */ - for (; v && !(v & 3); v >>= 2) - ror2--; + break; - EOP_C_DOP_IMM(cond, op, s, rn, rd, ror2 & 0x0f, v & 0xff); + case A_OP_AND: + // AND must fit into 1 insn. if not, use BIC + for (u = v; u > 0xff && !(u & 3); u >>= 2) ; + if (u >> 8) { + imm = ~imm; + op = A_OP_BIC; + ror2 = -1; + } + break; - v >>= 8; - if (v == 0) + case A_OP_SUB: + case A_OP_ADD: + // swap ADD and SUB if more bits 1 than 0 + if (s == 0 && count_bits(imm) > 16) { + imm = -imm; + op ^= (A_OP_ADD^A_OP_SUB); + ror2 = -1; + } + case A_OP_EOR: + case A_OP_ORR: + case A_OP_BIC: + if (s == 0 && imm == 0 && rd == rn) + return; break; - if (op == A_OP_MOV) - op = A_OP_ORR; - if (op == A_OP_MVN) - op = A_OP_BIC; + } + } while (ror2 < 0); + + do { + // shift down to get 'best' rot2 + while (v > 0xff && !(v & 3)) + v >>= 2, ror2--; + EOP_C_DOP_IMM(cond, op, s, rn, rd, ror2 & 0xf, v & 0xff); + + switch (op) { + case A_OP_MOV: op = A_OP_ORR; break; + case A_OP_MVN: op = A_OP_BIC; break; + case A_OP_ADC: op = A_OP_ADD; break; + case A_OP_SBC: op = A_OP_SUB; break; + } rn = rd; - } + + v >>= 8, ror2 -= 8/2; + } while (v); } #define emith_op_imm(cond, s, op, r, imm) \ @@ -326,6 +579,9 @@ static int emith_xbranch(int cond, void *target, int is_call) int direct = is_offset_24(val); u32 *start_ptr = (u32 
*)tcache_ptr; + if (cond == A_COND_NV) + return 0; // never taken + if (direct) { EOP_C_B(cond,is_call,val & 0xffffff); // b, bl target @@ -335,13 +591,13 @@ static int emith_xbranch(int cond, void *target, int is_call) #ifdef __EPOC32__ // elprintf(EL_SVP, "emitting indirect jmp %08x->%08x", tcache_ptr, target); if (is_call) - EOP_ADD_IMM(14,15,0,8); // add lr,pc,#8 - EOP_C_AM2_IMM(cond,1,0,1,15,15,0); // ldrcc pc,[pc] - EOP_MOV_REG_SIMPLE(15,15); // mov pc, pc - EMIT((u32)target); + EOP_ADD_IMM(LR,PC,0,8); // add lr,pc,#8 + EOP_C_AM2_IMM(cond,1,0,1,PC,PC,0); // ldrcc pc,[pc] + EOP_MOV_REG_SIMPLE(PC,PC); // mov pc, pc + EMIT((u32)target,M1(PC),0); #else // should never happen - elprintf(EL_STATUS|EL_SVP|EL_ANOMALY, "indirect jmp %08x->%08x", target, tcache_ptr); + elprintf(EL_STATUS|EL_SVP|EL_ANOMALY, "indirect jmp %8p->%8p", target, tcache_ptr); exit(1); #endif } @@ -349,12 +605,68 @@ static int emith_xbranch(int cond, void *target, int is_call) return (u32 *)tcache_ptr - start_ptr; } -#define JMP_POS(ptr) \ +static void emith_pool_commit(int jumpover) +{ + int i, sz = literal_pindex * sizeof(u32); + u8 *pool = (u8 *)tcache_ptr; + + // nothing to commit if pool is empty + if (sz == 0) + return; + // need branch over pool if not at block end + if (jumpover) { + pool += sizeof(u32); + emith_xbranch(A_COND_AL, (u8 *)pool + sz, 0); + } + emith_flush(); + // safety check - pool must be after insns and reachable + if ((u32)(pool - (u8 *)literal_insn[0] + 8) > 0xfff) { + elprintf(EL_STATUS|EL_SVP|EL_ANOMALY, + "pool offset out of range"); + exit(1); + } + // copy pool and adjust addresses in insns accessing the pool + memcpy(pool, literal_pool, sz); + for (i = 0; i < literal_iindex; i++) { + *literal_insn[i] += (u8 *)pool - ((u8 *)literal_insn[i] + 8); + } + // count pool constants as insns for statistics + for (i = 0; i < literal_pindex; i++) + COUNT_OP; + + tcache_ptr = (void *)((u8 *)pool + sz); + literal_pindex = literal_iindex = 0; +} + +static inline void 
emith_pool_check(void) +{ + // check if pool must be committed + if (literal_iindex > MAX_HOST_LITERALS-4 || (literal_pindex && + (u8 *)tcache_ptr - (u8 *)literal_insn[0] > 0xe00)) + // pool full, or displacement is approaching the limit + emith_pool_commit(1); +} + +static inline void emith_pool_adjust(int tcache_offs, int move_offs) +{ + u32 *ptr = (u32 *)tcache_ptr - tcache_offs; + int i; + + for (i = literal_iindex-1; i >= 0 && literal_insn[i] >= ptr; i--) + if (literal_insn[i] == ptr) + literal_insn[i] += move_offs; +} + +#define EMITH_HINT_COND(cond) /**/ + +#define JMP_POS(ptr) { \ ptr = tcache_ptr; \ - tcache_ptr += sizeof(u32) + EMIT(0,M1(PC),0); \ +} #define JMP_EMIT(cond, ptr) { \ u32 val_ = (u32 *)tcache_ptr - (u32 *)(ptr) - 2; \ + emith_flush(); /* NO insn swapping across jump targets */ \ EOP_C_B_PTR(ptr, cond, 0, val_ & 0xffffff); \ } @@ -370,18 +682,22 @@ static int emith_xbranch(int cond, void *target, int is_call) #define EMITH_NOTHING1(cond) \ (void)(cond) -#define EMITH_SJMP_DECL_() -#define EMITH_SJMP_START_(cond) EMITH_NOTHING1(cond) -#define EMITH_SJMP_END_(cond) EMITH_NOTHING1(cond) #define EMITH_SJMP_START(cond) EMITH_NOTHING1(cond) #define EMITH_SJMP_END(cond) EMITH_NOTHING1(cond) +#define EMITH_SJMP2_START(cond) EMITH_NOTHING1(cond) +#define EMITH_SJMP2_MID(cond) EMITH_JMP_START((cond)^1) // inverse cond +#define EMITH_SJMP2_END(cond) EMITH_JMP_END((cond)^1) #define EMITH_SJMP3_START(cond) EMITH_NOTHING1(cond) #define EMITH_SJMP3_MID(cond) EMITH_NOTHING1(cond) #define EMITH_SJMP3_END() +#define emith_move_r_r_c(cond, d, s) \ + EOP_MOV_REG(cond,0,d,s,A_AM1_LSL,0) #define emith_move_r_r(d, s) \ - EOP_MOV_REG_SIMPLE(d, s) + emith_move_r_r_c(A_COND_AL, d, s) +#define emith_move_r_r_ptr_c(cond, d, s) \ + emith_move_r_r_c(cond, d, s) #define emith_move_r_r_ptr(d, s) \ emith_move_r_r(d, s) @@ -390,40 +706,108 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_add_r_r_r_lsl(d, s1, s2, lslimm) \ 
EOP_ADD_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) +#define emith_add_r_r_r_lsl_ptr(d, s1, s2, lslimm) \ + emith_add_r_r_r_lsl(d, s1, s2, lslimm) + +#define emith_adc_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_ADC_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) + +#define emith_addf_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_ADD_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSL,lslimm) +#define emith_addf_r_r_r_lsr(d, s1, s2, lslimm) \ + EOP_ADD_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSR,lslimm) + +#define emith_adcf_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_ADC_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSL,lslimm) + +#define emith_sub_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_SUB_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) + +#define emith_sbc_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_SBC_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) + +#define emith_subf_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_SUB_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSL,lslimm) + +#define emith_sbcf_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_SBC_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSL,lslimm) #define emith_or_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_ORR_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) +#define emith_or_r_r_r_lsr(d, s1, s2, lsrimm) \ + EOP_ORR_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSR,lsrimm) #define emith_eor_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_EOR_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) - #define emith_eor_r_r_r_lsr(d, s1, s2, lsrimm) \ EOP_EOR_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSR,lsrimm) +#define emith_and_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_AND_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) + #define emith_or_r_r_lsl(d, s, lslimm) \ emith_or_r_r_r_lsl(d, d, s, lslimm) +#define emith_or_r_r_lsr(d, s, lsrimm) \ + emith_or_r_r_r_lsr(d, d, s, lsrimm) +#define emith_eor_r_r_lsl(d, s, lslimm) \ + emith_eor_r_r_r_lsl(d, d, s, lslimm) #define emith_eor_r_r_lsr(d, s, lsrimm) \ emith_eor_r_r_r_lsr(d, d, s, lsrimm) #define emith_add_r_r_r(d, s1, s2) \ emith_add_r_r_r_lsl(d, s1, s2, 0) +#define emith_adc_r_r_r(d, s1, s2) \ + emith_adc_r_r_r_lsl(d, s1, s2, 0) + +#define emith_addf_r_r_r(d, s1, s2) \ + 
emith_addf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_adcf_r_r_r(d, s1, s2) \ + emith_adcf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_sub_r_r_r(d, s1, s2) \ + emith_sub_r_r_r_lsl(d, s1, s2, 0) + +#define emith_sbc_r_r_r(d, s1, s2) \ + emith_sbc_r_r_r_lsl(d, s1, s2, 0) + +#define emith_subf_r_r_r(d, s1, s2) \ + emith_subf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_sbcf_r_r_r(d, s1, s2) \ + emith_sbcf_r_r_r_lsl(d, s1, s2, 0) + #define emith_or_r_r_r(d, s1, s2) \ emith_or_r_r_r_lsl(d, s1, s2, 0) #define emith_eor_r_r_r(d, s1, s2) \ emith_eor_r_r_r_lsl(d, s1, s2, 0) +#define emith_and_r_r_r(d, s1, s2) \ + emith_and_r_r_r_lsl(d, s1, s2, 0) + #define emith_add_r_r(d, s) \ emith_add_r_r_r(d, d, s) -#define emith_sub_r_r(d, s) \ - EOP_SUB_REG(A_COND_AL,0,d,d,s,A_AM1_LSL,0) +#define emith_add_r_r_ptr(d, s) \ + emith_add_r_r_r(d, d, s) #define emith_adc_r_r(d, s) \ - EOP_ADC_REG(A_COND_AL,0,d,d,s,A_AM1_LSL,0) + emith_adc_r_r_r(d, d, s) + +#define emith_sub_r_r(d, s) \ + emith_sub_r_r_r(d, d, s) +#define emith_sbc_r_r(d, s) \ + emith_sbc_r_r_r(d, d, s) + +#define emith_negc_r_r(d, s) \ + EOP_C_DOP_IMM(A_COND_AL,A_OP_RSC,0,s,d,0,0) + +#define emith_and_r_r_c(cond, d, s) \ + EOP_AND_REG(cond,0,d,d,s,A_AM1_LSL,0) #define emith_and_r_r(d, s) \ EOP_AND_REG(A_COND_AL,0,d,d,s,A_AM1_LSL,0) @@ -463,12 +847,18 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_move_r_imm(r, imm) \ emith_op_imm(A_COND_AL, 0, A_OP_MOV, r, imm) +#define emith_move_r_ptr_imm(r, imm) \ + emith_move_r_imm(r, (u32)(imm)) + #define emith_add_r_imm(r, imm) \ emith_op_imm(A_COND_AL, 0, A_OP_ADD, r, imm) #define emith_adc_r_imm(r, imm) \ emith_op_imm(A_COND_AL, 0, A_OP_ADC, r, imm) +#define emith_adcf_r_imm(r, imm) \ + emith_op_imm(A_COND_AL, 1, A_OP_ADC, r, imm) + #define emith_sub_r_imm(r, imm) \ emith_op_imm(A_COND_AL, 0, A_OP_SUB, r, imm) @@ -484,18 +874,21 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_eor_r_imm(r, imm) \ emith_op_imm(A_COND_AL, 0, 
A_OP_EOR, r, imm) +#define emith_eor_r_imm_ptr(r, imm) \ + emith_eor_r_imm(r, imm) + // note: only use 8bit imm for these #define emith_tst_r_imm(r, imm) \ emith_top_imm(A_COND_AL, A_OP_TST, r, imm) -#define emith_cmp_r_imm(r, imm) { \ - u32 op = A_OP_CMP, imm_ = imm; \ - if (~imm_ < 0x100) { \ - imm_ = ~imm_; \ - op = A_OP_CMN; \ +#define emith_cmp_r_imm(r, imm) do { \ + u32 op_ = A_OP_CMP, imm_ = (u8)imm; \ + if ((s8)imm_ < 0) { \ + imm_ = (u8)-imm_; \ + op_ = A_OP_CMN; \ } \ - emith_top_imm(A_COND_AL, op, r, imm); \ -} + emith_top_imm(A_COND_AL, op_, r, imm_); \ +} while (0) #define emith_subf_r_imm(r, imm) \ emith_op_imm(A_COND_AL, 1, A_OP_SUB, r, imm) @@ -515,15 +908,29 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_eor_r_imm_c(cond, r, imm) \ emith_op_imm(cond, 0, A_OP_EOR, r, imm) +#define emith_eor_r_imm_ptr_c(cond, r, imm) \ + emith_eor_r_imm_c(cond, r, imm) + #define emith_bic_r_imm_c(cond, r, imm) \ emith_op_imm(cond, 0, A_OP_BIC, r, imm) -#define emith_move_r_imm_s8(r, imm) { \ - if ((imm) & 0x80) \ - EOP_MVN_IMM(r, 0, ((imm) ^ 0xff)); \ +#define emith_tst_r_imm_c(cond, r, imm) \ + emith_top_imm(cond, A_OP_TST, r, imm) + +#define emith_move_r_imm_s8_patchable(r, imm) do { \ + emith_flush(); /* pin insn at current tcache_ptr for patching */ \ + if ((s8)(imm) < 0) \ + EOP_MVN_IMM(r, 0, (u8)~(imm)); \ else \ - EOP_MOV_IMM(r, 0, imm); \ -} + EOP_MOV_IMM(r, 0, (u8)(imm)); \ +} while (0) +#define emith_move_r_imm_s8_patch(ptr, imm) do { \ + u32 *ptr_ = (u32 *)ptr; u32 op_ = *ptr_ & 0xfe1ff000; \ + if ((s8)(imm) < 0) \ + EMIT_PTR(ptr_, op_ | (A_OP_MVN<<21) | (u8)~(imm));\ + else \ + EMIT_PTR(ptr_, op_ | (A_OP_MOV<<21) | (u8)(imm));\ +} while (0) #define emith_and_r_r_imm(d, s, imm) \ emith_op_imm2(A_COND_AL, 0, A_OP_AND, d, s, imm) @@ -534,9 +941,21 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_add_r_r_ptr_imm(d, s, imm) \ emith_add_r_r_imm(d, s, imm) +#define emith_sub_r_r_imm_c(cond, d, s, imm) 
\ + emith_op_imm2(cond, 0, A_OP_SUB, d, s, (imm)) + #define emith_sub_r_r_imm(d, s, imm) \ emith_op_imm2(A_COND_AL, 0, A_OP_SUB, d, s, imm) +#define emith_subf_r_r_imm(d, s, imm) \ + emith_op_imm2(A_COND_AL, 1, A_OP_SUB, d, s, imm) + +#define emith_or_r_r_imm(d, s, imm) \ + emith_op_imm2(A_COND_AL, 0, A_OP_ORR, d, s, imm) + +#define emith_eor_r_r_imm(d, s, imm) \ + emith_op_imm2(A_COND_AL, 0, A_OP_EOR, d, s, imm) + #define emith_neg_r_r(d, s) \ EOP_RSB_IMM(d, s, 0, 0) @@ -568,30 +987,34 @@ static int emith_xbranch(int cond, void *target, int is_call) EOP_MOV_REG(A_COND_AL,1,d,s,A_AM1_ASR,cnt) // note: only C flag updated correctly -#define emith_rolf(d, s, cnt) { \ +#define emith_rolf(d, s, cnt) do { \ EOP_MOV_REG(A_COND_AL,1,d,s,A_AM1_ROR,32-(cnt)); \ /* we don't have ROL so we shift to get the right carry */ \ EOP_TST_REG(A_COND_AL,d,d,A_AM1_LSR,1); \ -} +} while (0) #define emith_rorf(d, s, cnt) \ EOP_MOV_REG(A_COND_AL,1,d,s,A_AM1_ROR,cnt) #define emith_rolcf(d) \ emith_adcf_r_r(d, d) +#define emith_rolc(d) \ + emith_adc_r_r(d, d) #define emith_rorcf(d) \ EOP_MOV_REG(A_COND_AL,1,d,d,A_AM1_ROR,0) /* ROR #0 -> RRX */ +#define emith_rorc(d) \ + EOP_MOV_REG(A_COND_AL,0,d,d,A_AM1_ROR,0) /* ROR #0 -> RRX */ #define emith_negcf_r_r(d, s) \ EOP_C_DOP_IMM(A_COND_AL,A_OP_RSC,1,s,d,0,0) -#define emith_mul(d, s1, s2) { \ +#define emith_mul(d, s1, s2) do { \ if ((d) != (s1)) /* rd != rm limitation */ \ EOP_MUL(d, s1, s2); \ else \ EOP_MUL(d, s2, s1); \ -} +} while (0) #define emith_mul_u64(dlo, dhi, s1, s2) \ EOP_C_UMULL(A_COND_AL,0,dhi,dlo,s1,s2) @@ -599,30 +1022,74 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_mul_s64(dlo, dhi, s1, s2) \ EOP_C_SMULL(A_COND_AL,0,dhi,dlo,s1,s2) +#define emith_mula_s64_c(cond, dlo, dhi, s1, s2) \ + EOP_C_SMLAL(cond,0,dhi,dlo,s1,s2) #define emith_mula_s64(dlo, dhi, s1, s2) \ EOP_C_SMLAL(A_COND_AL,0,dhi,dlo,s1,s2) // misc #define emith_read_r_r_offs_c(cond, r, rs, offs) \ EOP_LDR_IMM2(cond, r, rs, offs) - 
-#define emith_read8_r_r_offs_c(cond, r, rs, offs) \ - EOP_LDRB_IMM2(cond, r, rs, offs) - -#define emith_read16_r_r_offs_c(cond, r, rs, offs) \ - EOP_LDRH_IMM2(cond, r, rs, offs) - +#define emith_read_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_read_r_r_offs_c(cond, r, rs, offs) +#define emith_read_r_r_r_c(cond, r, rs, rm) \ + EOP_LDR_REG_LSL(cond, r, rs, rm, 0) #define emith_read_r_r_offs(r, rs, offs) \ emith_read_r_r_offs_c(A_COND_AL, r, rs, offs) +#define emith_read_r_r_offs_ptr(r, rs, offs) \ + emith_read_r_r_offs_c(A_COND_AL, r, rs, offs) +#define emith_read_r_r_r(r, rs, rm) \ + EOP_LDR_REG_LSL(A_COND_AL, r, rs, rm, 0) +#define emith_read8_r_r_offs_c(cond, r, rs, offs) \ + EOP_LDRB_IMM2(cond, r, rs, offs) +#define emith_read8_r_r_r_c(cond, r, rs, rm) \ + EOP_LDRB_REG_LSL(cond, r, rs, rm, 0) #define emith_read8_r_r_offs(r, rs, offs) \ emith_read8_r_r_offs_c(A_COND_AL, r, rs, offs) +#define emith_read8_r_r_r(r, rs, rm) \ + emith_read8_r_r_r_c(A_COND_AL, r, rs, rm) +#define emith_read16_r_r_offs_c(cond, r, rs, offs) \ + EOP_LDRH_IMM2(cond, r, rs, offs) +#define emith_read16_r_r_r_c(cond, r, rs, rm) \ + EOP_LDRH_REG2(cond, r, rs, rm) #define emith_read16_r_r_offs(r, rs, offs) \ emith_read16_r_r_offs_c(A_COND_AL, r, rs, offs) - +#define emith_read16_r_r_r(r, rs, rm) \ + emith_read16_r_r_r_c(A_COND_AL, r, rs, rm) + +#define emith_read8s_r_r_offs_c(cond, r, rs, offs) \ + EOP_LDRSB_IMM2(cond, r, rs, offs) +#define emith_read8s_r_r_r_c(cond, r, rs, rm) \ + EOP_LDRSB_REG2(cond, r, rs, rm) +#define emith_read8s_r_r_offs(r, rs, offs) \ + emith_read8s_r_r_offs_c(A_COND_AL, r, rs, offs) +#define emith_read8s_r_r_r(r, rs, rm) \ + emith_read8s_r_r_r_c(A_COND_AL, r, rs, rm) + +#define emith_read16s_r_r_offs_c(cond, r, rs, offs) \ + EOP_LDRSH_IMM2(cond, r, rs, offs) +#define emith_read16s_r_r_r_c(cond, r, rs, rm) \ + EOP_LDRSH_REG2(cond, r, rs, rm) +#define emith_read16s_r_r_offs(r, rs, offs) \ + emith_read16s_r_r_offs_c(A_COND_AL, r, rs, offs) +#define emith_read16s_r_r_r(r, 
rs, rm) \ + emith_read16s_r_r_r_c(A_COND_AL, r, rs, rm) + +#define emith_write_r_r_offs_c(cond, r, rs, offs) \ + EOP_STR_IMM2(cond, r, rs, offs) +#define emith_write_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_write_r_r_offs_c(cond, r, rs, offs) +#define emith_write_r_r_offs(r, rs, offs) \ + emith_write_r_r_offs_c(A_COND_AL, r, rs, offs) +#define emith_write_r_r_offs_ptr(r, rs, offs) \ + emith_write_r_r_offs_c(A_COND_AL, r, rs, offs) + +#define emith_ctx_read_c(cond, r, offs) \ + emith_read_r_r_offs_c(cond, r, CONTEXT_REG, offs) #define emith_ctx_read(r, offs) \ - emith_read_r_r_offs(r, CONTEXT_REG, offs) + emith_ctx_read_c(A_COND_AL, r, offs) #define emith_ctx_read_ptr(r, offs) \ emith_ctx_read(r, offs) @@ -633,13 +1100,13 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_ctx_do_multiple(op, r, offs, count, tmpr) do { \ int v_, r_ = r, c_ = count, b_ = CONTEXT_REG; \ for (v_ = 0; c_; c_--, r_++) \ - v_ |= 1 << r_; \ + v_ |= M1(r_); \ if ((offs) != 0) { \ EOP_ADD_IMM(tmpr,CONTEXT_REG,30/2,(offs)>>2);\ b_ = tmpr; \ } \ op(b_,v_); \ -} while(0) +} while (0) #define emith_ctx_read_multiple(r, offs, count, tmpr) \ emith_ctx_do_multiple(EOP_LDMIA, r, offs, count, tmpr) @@ -647,40 +1114,40 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_ctx_write_multiple(r, offs, count, tmpr) \ emith_ctx_do_multiple(EOP_STMIA, r, offs, count, tmpr) -#define emith_clear_msb_c(cond, d, s, count) { \ +#define emith_clear_msb_c(cond, d, s, count) do { \ u32 t; \ if ((count) <= 8) { \ - t = (count) - 8; \ + t = 8 - (count); \ t = (0xff << t) & 0xff; \ - EOP_BIC_IMM(d,s,8/2,t); \ EOP_C_DOP_IMM(cond,A_OP_BIC,0,s,d,8/2,t); \ } else if ((count) >= 24) { \ t = (count) - 24; \ t = 0xff >> t; \ - EOP_AND_IMM(d,s,0,t); \ EOP_C_DOP_IMM(cond,A_OP_AND,0,s,d,0,t); \ } else { \ EOP_MOV_REG(cond,0,d,s,A_AM1_LSL,count); \ EOP_MOV_REG(cond,0,d,d,A_AM1_LSR,count); \ } \ -} +} while (0) #define emith_clear_msb(d, s, count) \ 
emith_clear_msb_c(A_COND_AL, d, s, count) -#define emith_sext(d, s, bits) { \ +#define emith_sext(d, s, bits) do { \ EOP_MOV_REG_LSL(d,s,32 - (bits)); \ EOP_MOV_REG_ASR(d,d,32 - (bits)); \ -} +} while (0) -#define emith_do_caller_regs(mask, func) { \ +#define emith_uext_ptr(r) /**/ + +#define emith_do_caller_regs(mask, func) do { \ u32 _reg_mask = (mask) & 0x500f; \ if (_reg_mask) { \ if (__builtin_parity(_reg_mask) == 1) \ _reg_mask |= 0x10; /* eabi align */ \ func(_reg_mask); \ } \ -} +} while (0) #define emith_save_caller_regs(mask) \ emith_do_caller_regs(mask, EOP_STMFD_SP) @@ -703,20 +1170,25 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_jump_cond(cond, target) \ emith_xbranch(cond, target, 0) +#define emith_jump_cond_inrange(target) !0 #define emith_jump_cond_patchable(cond, target) \ emith_jump_cond(cond, target) -#define emith_jump_patch(ptr, target) do { \ - u32 *ptr_ = ptr; \ +#define emith_jump_patch(ptr, target, pos) do { \ + u32 *ptr_ = (u32 *)ptr; \ u32 val_ = (u32 *)(target) - ptr_ - 2; \ *ptr_ = (*ptr_ & 0xff000000) | (val_ & 0x00ffffff); \ + if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)ptr; \ } while (0) +#define emith_jump_patch_inrange(ptr, target) !0 +#define emith_jump_patch_size() 4 -#define emith_jump_at(ptr, target) { \ +#define emith_jump_at(ptr, target) do { \ u32 val_ = (u32 *)(target) - (u32 *)(ptr) - 2; \ EOP_C_B_PTR(ptr, A_COND_AL, 0, val_ & 0xffffff); \ -} +} while (0) +#define emith_jump_at_size() 4 #define emith_jump_reg_c(cond, r) \ EOP_C_BX(cond, r) @@ -725,7 +1197,7 @@ static int emith_xbranch(int cond, void *target, int is_call) emith_jump_reg_c(A_COND_AL, r) #define emith_jump_ctx_c(cond, offs) \ - EOP_LDR_IMM2(cond,15,CONTEXT_REG,offs) + EOP_LDR_IMM2(cond,PC,CONTEXT_REG,offs) #define emith_jump_ctx(offs) \ emith_jump_ctx_c(A_COND_AL, offs) @@ -736,48 +1208,76 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_call(target) \ emith_call_cond(A_COND_AL, target) 
-#define emith_call_ctx(offs) { \ - emith_move_r_r(14, 15); \ +#define emith_call_reg(r) do { \ + emith_move_r_r(LR, PC); \ + EOP_C_BX(A_COND_AL, r); \ +} while (0) + +#define emith_call_ctx(offs) do { \ + emith_move_r_r(LR, PC); \ emith_jump_ctx(offs); \ -} +} while (0) + +#define emith_call_cleanup() /**/ #define emith_ret_c(cond) \ - emith_jump_reg_c(cond, 14) + emith_jump_reg_c(cond, LR) #define emith_ret() \ emith_ret_c(A_COND_AL) #define emith_ret_to_ctx(offs) \ - emith_ctx_write(14, offs) + emith_ctx_write(LR, offs) + +#define emith_add_r_ret(r) \ + emith_add_r_r_ptr(r, LR) -#define emith_push_ret() \ - EOP_STMFD_SP(A_R14M) +/* pushes r12 for eabi alignment */ +#define emith_push_ret(r) do { \ + int r_ = (r >= 0 ? r : 12); \ + EOP_STMFD_SP(M2(r_,LR)); \ +} while (0) -#define emith_pop_and_ret() \ - EOP_LDMFD_SP(A_R15M) +#define emith_pop_and_ret(r) do { \ + int r_ = (r >= 0 ? r : 12); \ + EOP_LDMFD_SP(M2(r_,PC)); \ +} while (0) #define host_instructions_updated(base, end) \ - cache_flush_d_inval_i(base, end) + emith_update_add(base, end) #define host_arg2reg(rd, arg) \ rd = arg +#define emith_rw_offs_max() 0xff + /* SH2 drc specific */ /* pushes r12 for eabi alignment */ #define emith_sh2_drc_entry() \ - EOP_STMFD_SP(A_R4M|A_R5M|A_R6M|A_R7M|A_R8M|A_R9M|A_R10M|A_R11M|A_R12M|A_R14M) + EOP_STMFD_SP(M10(4,5,6,7,8,9,10,11,12,LR)) #define emith_sh2_drc_exit() \ - EOP_LDMFD_SP(A_R4M|A_R5M|A_R6M|A_R7M|A_R8M|A_R9M|A_R10M|A_R11M|A_R12M|A_R15M) + EOP_LDMFD_SP(M10(4,5,6,7,8,9,10,11,12,PC)) + +// assumes a is in arg0, tab, func and mask are temp +#define emith_sh2_rcall(a, tab, func, mask) do { \ + emith_lsr(mask, a, SH2_READ_SHIFT); \ + EOP_ADD_REG_LSL(tab, tab, mask, 3); \ + if (func < mask) EOP_LDMIA(tab, M2(func,mask)); /* ldm if possible */ \ + else { emith_read_r_r_offs(func, tab, 0); \ + emith_read_r_r_offs(mask, tab, 4); } \ + emith_addf_r_r_r(func,func,func); \ +} while (0) -#define emith_sh2_wcall(a, tab) { \ - emith_lsr(12, a, SH2_WRITE_SHIFT); \ - 
EOP_LDR_REG_LSL(A_COND_AL,12,tab,12,2); \ - emith_move_r_r(2, CONTEXT_REG); \ - emith_jump_reg(12); \ -} +// assumes a, val are in arg0 and arg1, tab and func are temp +#define emith_sh2_wcall(a, val, tab, func) do { \ + emith_lsr(func, a, SH2_WRITE_SHIFT); \ + EOP_LDR_REG_LSL(A_COND_AL,func,tab,func,2); \ + emith_move_r_r(2, CONTEXT_REG); /* arg2 */ \ + emith_jump_reg(func); \ +} while (0) -#define emith_sh2_dtbf_loop() { \ +#define emith_sh2_dtbf_loop() do { \ int cr, rn; \ int tmp_ = rcache_get_tmp(); \ cr = rcache_get_reg(SHR_SR, RC_GR_RMW); \ @@ -796,15 +1296,47 @@ static int emith_xbranch(int cond, void *target, int is_call) EOP_ORR_IMM_C(A_COND_LS,cr,cr,0,1); /* orrls cr, #1 */ \ EOP_MOV_IMM_C(A_COND_LS,rn,0,0); /* movls rn, #0 */ \ rcache_free_tmp(tmp_); \ -} +} while (0) + +#define emith_sh2_delay_loop(cycles, reg) do { \ + int sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); \ + int t1 = rcache_get_tmp(); \ + int t2 = rcache_get_tmp(); \ + int t3 = rcache_get_tmp(); \ + /* if (sr < 0) return */ \ + emith_asrf(t2, sr, 12); \ + EMITH_JMP_START(DCOND_LE); \ + /* turns = sr.cycles / cycles */ \ + emith_move_r_imm(t3, (u32)((1ULL<<32) / (cycles)) + 1); \ + emith_mul_u64(t1, t2, t2, t3); /* multiply by 1/x */ \ + rcache_free_tmp(t3); \ + if (reg >= 0) { \ + /* if (reg <= turns) turns = reg-1 */ \ + t3 = rcache_get_reg(reg, RC_GR_RMW, NULL); \ + emith_cmp_r_r(t3, t2); \ + emith_sub_r_r_imm_c(DCOND_LS, t2, t3, 1); \ + /* if (reg <= 1) turns = 0 */ \ + emith_cmp_r_imm(t3, 1); \ + emith_move_r_imm_c(DCOND_LS, t2, 0); \ + /* reg -= turns */ \ + emith_sub_r_r(t3, t2); \ + } \ + /* sr.cycles -= turns * cycles; */ \ + emith_move_r_imm(t1, cycles); \ + emith_mul(t1, t2, t1); \ + emith_sub_r_r_r_lsl(sr, sr, t1, 12); \ + EMITH_JMP_END(DCOND_LE); \ + rcache_free_tmp(t1); \ + rcache_free_tmp(t2); \ +} while (0) -#define emith_write_sr(sr, srcr) { \ +#define emith_write_sr(sr, srcr) do { \ emith_lsr(sr, sr, 10); \ emith_or_r_r_r_lsl(sr, sr, srcr, 22); \ emith_ror(sr, sr, 22); \ 
-} +} while (0) -#define emith_carry_to_t(srr, is_sub) { \ +#define emith_carry_to_t(srr, is_sub) do { \ if (is_sub) { /* has inverted C on ARM */ \ emith_or_r_imm_c(A_COND_CC, srr, 1); \ emith_bic_r_imm_c(A_COND_CS, srr, 1); \ @@ -812,37 +1344,148 @@ static int emith_xbranch(int cond, void *target, int is_call) emith_or_r_imm_c(A_COND_CS, srr, 1); \ emith_bic_r_imm_c(A_COND_CC, srr, 1); \ } \ -} +} while (0) + +#define emith_t_to_carry(srr, is_sub) do { \ + if (is_sub) { \ + int t_ = rcache_get_tmp(); \ + emith_eor_r_r_imm(t_, srr, 1); \ + emith_rorf(t_, t_, 1); \ + rcache_free_tmp(t_); \ + } else { \ + emith_rorf(srr, srr, 1); \ + emith_rol(srr, srr, 1); \ + } \ +} while (0) -#define emith_tpop_carry(sr, is_sub) { \ +#define emith_tpop_carry(sr, is_sub) do { \ if (is_sub) \ emith_eor_r_imm(sr, 1); \ emith_lsrf(sr, sr, 1); \ -} +} while (0) -#define emith_tpush_carry(sr, is_sub) { \ +#define emith_tpush_carry(sr, is_sub) do { \ emith_adc_r_r(sr, sr); \ if (is_sub) \ emith_eor_r_imm(sr, 1); \ -} +} while (0) /* + * T = carry(Rn = (Rn << 1) | T) * if Q - * t = carry(Rn += Rm) + * T ^= !carry(Rn += Rm) * else - * t = carry(Rn -= Rm) - * T ^= t + * T ^= !carry(Rn -= Rm) */ -#define emith_sh2_div1_step(rn, rm, sr) { \ +#define emith_sh2_div1_step(rn, rm, sr) do { \ void *jmp0, *jmp1; \ + emith_tpop_carry(sr, 0); /* Rn = 2*Rn+T */\ + emith_adcf_r_r_r(rn, rn, rn); \ + emith_tpush_carry(sr, 0); \ emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ JMP_POS(jmp0); /* beq do_sub */ \ - emith_addf_r_r(rn, rm); \ - emith_eor_r_imm_c(A_COND_CS, sr, T); \ + emith_addf_r_r(rn, rm); /* Rn += Rm */ \ + emith_eor_r_imm_c(A_COND_CC, sr, T); \ JMP_POS(jmp1); /* b done */ \ JMP_EMIT(A_COND_EQ, jmp0); /* do_sub: */ \ - emith_subf_r_r(rn, rm); \ - emith_eor_r_imm_c(A_COND_CC, sr, T); \ + emith_subf_r_r(rn, rm); /* Rn -= Rm */ \ + emith_eor_r_imm_c(A_COND_CS, sr, T); \ JMP_EMIT(A_COND_AL, jmp1); /* done: */ \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. 
rn, rm must be TEMP */ +#define emith_sh2_macl(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP2_START(DCOND_NE); \ + emith_mula_s64_c(DCOND_EQ, ml, mh, rn, rm); \ + EMITH_SJMP2_MID(DCOND_NE); \ + /* MACH top 16 bits unused if saturated. sign ext for overfl detect */ \ + emith_sext(mh, mh, 16); \ + emith_mula_s64(ml, mh, rn, rm); \ + /* overflow if top 17 bits of MACH aren't all 1 or 0 */ \ + /* to check: add MACH[15] to MACH[31:16]. this is 0 if no overflow */ \ + emith_asrf(rn, mh, 16); /* sum = (MACH>>16) + ((MACH>>15)&1) */ \ + emith_adcf_r_imm(rn, 0); /* (MACH>>15) is in carry after shift */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> ov */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x0000); /* -overflow */ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x8000); \ + EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> +ovl */ \ + emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0xffffffff */ \ + emith_sub_r_imm_c(DCOND_GT, mh, 1); /* 0x00007fff */ \ + EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP2_END(DCOND_NE); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macw(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP2_START(DCOND_NE); \ + emith_mula_s64_c(DCOND_EQ, ml, mh, rn, rm); \ + EMITH_SJMP2_MID(DCOND_NE); \ + /* XXX: MACH should be untouched when S is set? */ \ + emith_asr(mh, ml, 31); /* sign ext MACL to MACH for ovrfl check */ \ + emith_mula_s64(ml, mh, rn, rm); \ + /* overflow if top 33 bits of MACH:MACL aren't all 1 or 0 */ \ + /* to check: add MACL[31] to MACH. this is 0 if no overflow */ \ + emith_addf_r_r_r_lsr(mh, mh, ml, 31); /* sum = MACH + ((MACL>>31)&1) */\ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> overflow */ \ + /* XXX: LSB signalling only in SH1, or in SH2 too? 
*/ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x00000001); /* LSB of MACH */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x80000000); /* negative ovrfl */ \ + EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> positive ovrfl */ \ + emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0x7fffffff */ \ + EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP2_END(DCOND_NE); \ +} while (0) + +#ifdef T +// T bit handling +static int tcond = -1; + +#define emith_invert_cond(cond) \ + ((cond) ^ 1) + +#define emith_clr_t_cond(sr) \ + (void)sr + +#define emith_set_t_cond(sr, cond) \ + tcond = cond + +#define emith_get_t_cond() \ + tcond + +#define emith_invalidate_t() \ + tcond = -1 + +#define emith_set_t(sr, val) \ + tcond = ((val) ? A_COND_AL: A_COND_NV) + +static void emith_sync_t(int sr) +{ + if (tcond == A_COND_AL) + emith_or_r_imm(sr, T); + else if (tcond == A_COND_NV) + emith_bic_r_imm(sr, T); + else if (tcond >= 0) { + emith_bic_r_imm_c(emith_invert_cond(tcond),sr, T); + emith_or_r_imm_c(tcond, sr, T); + } + tcond = -1; } +static int emith_tst_t(int sr, int tf) +{ + if (tcond < 0) { + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; + } else if (tcond >= A_COND_AL) { + // MUST sync because A_COND_NV isn't a real condition + emith_sync_t(sr); + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; + } else + return tf ? tcond : emith_invert_cond(tcond); +} +#endif diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c new file mode 100644 index 000000000..ae7077a01 --- /dev/null +++ b/cpu/drc/emit_arm64.c @@ -0,0 +1,1416 @@ +/* + * Basic macros to emit ARM A64 instructions and some utils + * Copyright (C) 2019 kub + * + * This work is licensed under the terms of MAME license. + * See COPYING file in the top-level directory. 
+ */ +#define HOST_REGS 32 + +// AAPCS64: params: r0-r7, return: r0-r1, temp: r8-r17, saved: r19-r29 +// reserved: r18 (for platform use) +#define RET_REG 0 +#define PARAM_REGS { 0, 1, 2, 3, 4, 5, 6, 7 } +#define PRESERVED_REGS { 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 } +#define TEMPORARY_REGS { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 } + +#define CONTEXT_REG 29 +#define STATIC_SH2_REGS { SHR_SR,28 , SHR_R(0),27 , SHR_R(1),26 } + +// R31 doesn't exist, it aliases either with zero or SP +#define SP 31 // stack pointer +#define Z0 31 // zero register +#define LR 30 // link register +#define FP 29 // frame pointer +#define PR 18 // platform register + +// All operations but ptr ops are using the lower 32 bits of the A64 registers. +// The upper 32 bits are only used in ptr ops and are zeroed by A64 32 bit ops. + + +#define A64_COND_EQ 0x0 +#define A64_COND_NE 0x1 +#define A64_COND_HS 0x2 +#define A64_COND_LO 0x3 +#define A64_COND_MI 0x4 +#define A64_COND_PL 0x5 +#define A64_COND_VS 0x6 +#define A64_COND_VC 0x7 +#define A64_COND_HI 0x8 +#define A64_COND_LS 0x9 +#define A64_COND_GE 0xa +#define A64_COND_LT 0xb +#define A64_COND_GT 0xc +#define A64_COND_LE 0xd +#define A64_COND_CS A64_COND_HS +#define A64_COND_CC A64_COND_LO +// "fake" conditions for T bit handling +#define A64_COND_AL 0xe +#define A64_COND_NV 0xf + +// DRC conditions +#define DCOND_EQ A64_COND_EQ +#define DCOND_NE A64_COND_NE +#define DCOND_MI A64_COND_MI +#define DCOND_PL A64_COND_PL +#define DCOND_HI A64_COND_HI +#define DCOND_HS A64_COND_HS +#define DCOND_LO A64_COND_LO +#define DCOND_GE A64_COND_GE +#define DCOND_GT A64_COND_GT +#define DCOND_LT A64_COND_LT +#define DCOND_LS A64_COND_LS +#define DCOND_LE A64_COND_LE +#define DCOND_VS A64_COND_VS +#define DCOND_VC A64_COND_VC + +#define DCOND_CS A64_COND_HS +#define DCOND_CC A64_COND_LO + + +// unified insn +#define A64_INSN(op, b29, b22, b21, b16, b12, b10, b5, b0) \ + 
(((op)<<25)|((b29)<<29)|((b22)<<22)|((b21)<<21)|((b16)<<16)|((b12)<<12)|((b10)<<10)|((b5)<<5)|((b0)<<0)) + +#define _ 0 // marker for "field unused" + +#define A64_NOP \ + A64_INSN(0xa,0x6,0x4,_,0x3,0x2,_,0,0x1f) // 0xd503201f + +// arithmetic/logical + +enum { OP_AND, OP_OR, OP_EOR, OP_ANDS, OP_ADD, OP_ADDS, OP_SUB, OP_SUBS }; +enum { ST_LSL, ST_LSR, ST_ASR, ST_ROR }; +enum { XT_UXTW=0x4, XT_UXTX=0x6, XT_LSL=0x7, XT_SXTW=0xc, XT_SXTX=0xe }; +#define OP_SZ64 (1 << 31) // bit for 64 bit op selection +#define OP_N64 (1 << 22) // N-bit for 64 bit logical immediate ops + +#define A64_OP_REG(op, n, rd, rn, rm, stype, simm) /* arith+logical, ST_ */ \ + A64_INSN(0x5,(op)&3,((op)&4)|stype,n,rm,_,simm,rn,rd) +#define A64_OP_XREG(op, rd, rn, rm, xtopt, simm) /* arith, XT_ */ \ + A64_INSN(0x5,(op)&3,0x4,1,rm,xtopt,simm,rn,rd) +#define A64_OP_IMM12(op, rd, rn, imm, lsl12) /* arith */ \ + A64_INSN(0x8,(op)&3,((op)&4)|lsl12,_,_,_,(imm)&0xfff,rn,rd) +#define A64_OP_IMMBM(op, rd, rn, immr, imms) /* logical */ \ + A64_INSN(0x9,(op)&3,0x0,_,immr,_,(imms)&0x3f,rn,rd) + +// rd = rn OP (rm SHIFT simm) +#define A64_ADD_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_ADD,0,rd,rn,rm,stype,simm) +#define A64_ADDS_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_ADDS,0,rd,rn,rm,stype,simm) +#define A64_SUB_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_SUB,0,rd,rn,rm,stype,simm) +#define A64_SUBS_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_SUBS,0,rd,rn,rm,stype,simm) + +#define A64_NEG_REG(rd, rm, stype, simm) \ + A64_SUB_REG(rd,Z0,rm,stype,simm) +#define A64_NEGS_REG(rd, rm, stype, simm) \ + A64_SUBS_REG(rd,Z0,rm,stype,simm) +#define A64_NEGC_REG(rd, rm) \ + A64_SBC_REG(rd,Z0,rm) +#define A64_NEGCS_REG(rd, rm) \ + A64_SBCS_REG(rd,Z0,rm) +#define A64_CMP_REG(rn, rm, stype, simm) \ + A64_SUBS_REG(Z0, rn, rm, stype, simm) +#define A64_CMN_REG(rn, rm, stype, simm) \ + A64_ADDS_REG(Z0, rn, rm, stype, simm) + +#define A64_EOR_REG(rd, rn, rm, stype, simm) \ + 
A64_OP_REG(OP_EOR,0,rd,rn,rm,stype,simm) +#define A64_OR_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_OR,0,rd,rn,rm,stype,simm) +#define A64_ORN_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_OR,1,rd,rn,rm,stype,simm) +#define A64_AND_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_AND,0,rd,rn,rm,stype,simm) +#define A64_ANDS_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_ANDS,0,rd,rn,rm,stype,simm) +#define A64_BIC_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_AND,1,rd,rn,rm,stype,simm) +#define A64_BICS_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_ANDS,1,rd,rn,rm,stype,simm) + +#define A64_TST_REG(rn, rm, stype, simm) \ + A64_ANDS_REG(Z0, rn, rm, stype, simm) +#define A64_MOV_REG(rd, rm, stype, simm) \ + A64_OR_REG(rd, Z0, rm, stype, simm) +#define A64_MVN_REG(rd, rm, stype, simm) \ + A64_ORN_REG(rd, Z0, rm, stype, simm) + +// rd = rn OP (rm EXTEND simm); xtopt is one of the XT_ extend options +#define A64_ADD_XREG(rd, rn, rm, xtopt, simm) \ + A64_OP_XREG(OP_ADD,rd,rn,rm,xtopt,simm) +#define A64_ADDS_XREG(rd, rn, rm, xtopt, simm) \ + A64_OP_XREG(OP_ADDS,rd,rn,rm,xtopt,simm) +#define A64_SUB_XREG(rd, rn, rm, xtopt, simm) \ + A64_OP_XREG(OP_SUB,rd,rn,rm,xtopt,simm) +#define A64_SUBS_XREG(rd, rn, rm, xtopt, simm) \ + A64_OP_XREG(OP_SUBS,rd,rn,rm,xtopt,simm) + +// rd = rn OP rm OP carry +#define A64_ADC_REG(rd, rn, rm) \ + A64_INSN(0xd,OP_ADD &3,0x0,_,rm,_,_,rn,rd) +#define A64_ADCS_REG(rd, rn, rm) \ + A64_INSN(0xd,OP_ADDS&3,0x0,_,rm,_,_,rn,rd) +#define A64_SBC_REG(rd, rn, rm) \ + A64_INSN(0xd,OP_SUB &3,0x0,_,rm,_,_,rn,rd) +#define A64_SBCS_REG(rd, rn, rm) \ + A64_INSN(0xd,OP_SUBS&3,0x0,_,rm,_,_,rn,rd) + +// rd = rn SHIFT rm (opcode field: LSLV=0x8, LSRV=0x9, ASRV=0xa, RORV=0xb) +#define A64_LSL_REG(rd, rn, rm) \ + A64_INSN(0xd,0x0,0x3,_,rm,_,0x8,rn,rd) +#define A64_LSR_REG(rd, rn, rm) \ + A64_INSN(0xd,0x0,0x3,_,rm,_,0x9,rn,rd) +#define A64_ASR_REG(rd, rn, rm) \ + A64_INSN(0xd,0x0,0x3,_,rm,_,0xa,rn,rd) +#define A64_ROR_REG(rd, rn, rm) \ + A64_INSN(0xd,0x0,0x3,_,rm,_,0xb,rn,rd) + +// rd = REVERSE(rn) +#define A64_RBIT_REG(rd, rn) \ + 
A64_INSN(0xd,0x2,0x3,_,_,_,_,rn,rd) + +// rd = rn OP (imm12 << (0|12)) +#define A64_ADD_IMM(rd, rn, imm12, lsl12) \ + A64_OP_IMM12(OP_ADD, rd, rn, imm12, lsl12) +#define A64_ADDS_IMM(rd, rn, imm12, lsl12) \ + A64_OP_IMM12(OP_ADDS, rd, rn, imm12, lsl12) +#define A64_SUB_IMM(rd, rn, imm12, lsl12) \ + A64_OP_IMM12(OP_SUB, rd, rn, imm12, lsl12) +#define A64_SUBS_IMM(rd, rn, imm12, lsl12) \ + A64_OP_IMM12(OP_SUBS, rd, rn, imm12, lsl12) + +#define A64_CMP_IMM(rn, imm12, lsl12) \ + A64_SUBS_IMM(Z0,rn,imm12,lsl12) +#define A64_CMN_IMM(rn, imm12, lsl12) \ + A64_ADDS_IMM(Z0,rn,imm12,lsl12) + +// rd = rn OP immbm; immbm is a repeated special pattern of 2^n bits length +#define A64_EOR_IMM(rd, rn, immr, imms) \ + A64_OP_IMMBM(OP_EOR,rd,rn,immr,imms) +#define A64_OR_IMM(rd, rn, immr, imms) \ + A64_OP_IMMBM(OP_OR,rd,rn,immr,imms) +#define A64_AND_IMM(rd, rn, immr, imms) \ + A64_OP_IMMBM(OP_AND,rd,rn,immr,imms) +#define A64_ANDS_IMM(rd, rn, immr, imms) \ + A64_OP_IMMBM(OP_ANDS,rd,rn,immr,imms) +#define A64_TST_IMM(rn, immr, imms) \ + A64_OP_IMMBM(OP_ANDS,Z0,rn,immr,imms) +#define A64_MOV_IMM(rd, rn, immr, imms) \ + A64_OP_IMMBM(OP_OR,rd,Z0,immr,imms) + +// rd = (imm16 << (0|16|32|48)) +#define A64_MOVN_IMM(rd, imm16, lsl16) \ + A64_INSN(0x9,0x0,0x2,lsl16,_,_,_,(imm16)&0xffff,rd) +#define A64_MOVZ_IMM(rd, imm16, lsl16) \ + A64_INSN(0x9,0x2,0x2,lsl16,_,_,_,(imm16)&0xffff,rd) +#define A64_MOVK_IMM(rd, imm16, lsl16) \ + A64_INSN(0x9,0x3,0x2,lsl16,_,_,_,(imm16)&0xffff,rd) +#define A64_MOVT_IMM(rd, imm16, lsl16) \ + A64_INSN(0x9,0x3,0x2,lsl16,_,_,_,(imm16)&0xffff,rd) + +// rd = rn SHIFT imm5/imm6 (for Wn/Xn) +#define A64_LSL_IMM(rd, rn, bits) /* UBFM */ \ + A64_INSN(0x9,0x2,0x4,_,32-(bits),_,31-(bits),rn,rd) +#define A64_LSR_IMM(rd, rn, bits) /* UBFM */ \ + A64_INSN(0x9,0x2,0x4,_,bits,_,31,rn,rd) +#define A64_ASR_IMM(rd, rn, bits) /* SBFM */ \ + A64_INSN(0x9,0x0,0x4,_,bits,_,31,rn,rd) +#define A64_ROR_IMM(rd, rn, bits) /* EXTR */ \ + A64_INSN(0x9,0x0,0x6,_,rn,_,bits,rn,rd) + +#define 
A64_SXT_IMM(rd, rn, bits) /* SBFM */ \ + A64_INSN(0x9,0x0,0x4,_,0,_,bits-1,rn,rd) +#define A64_UXT_IMM(rd, rn, bits) /* UBFM */ \ + A64_INSN(0x9,0x2,0x4,_,0,_,bits-1,rn,rd) + +#define A64_BFX_IMM(rd, rn, lsb, bits) /* UBFM */ \ + A64_INSN(0x9,0x2,0x4,_,lsb,_,bits-1,rn,rd) +#define A64_BFI_IMM(rd, rn, lsb, bits) /* BFM */ \ + A64_INSN(0x9,0x1,0x4,_,-(lsb)&0x1f,_,bits-1,rn,rd) + +// multiplication + +#define A64_SMULL(rd, rn, rm) /* Xd = Wn*Wm (+ Xa) */ \ + A64_INSN(0xd,0x4,0x4,1,rm,_,Z0,rn,rd) +#define A64_SMADDL(rd, rn, rm, ra) \ + A64_INSN(0xd,0x4,0x4,1,rm,_,ra,rn,rd) +#define A64_UMULL(rd, rn, rm) \ + A64_INSN(0xd,0x4,0x6,1,rm,_,Z0,rn,rd) +#define A64_UMADDL(rd, rn, rm, ra) \ + A64_INSN(0xd,0x4,0x6,1,rm,_,ra,rn,rd) +#define A64_MUL(rd, rn, rm) /* Wd = Wn*Wm (+ Wa) */ \ + A64_INSN(0xd,0x0,0x4,0,rm,_,Z0,rn,rd) +#define A64_MADD(rd, rn, rm, ra) \ + A64_INSN(0xd,0x0,0x4,0,rm,_,ra,rn,rd) + +// branching + +#define A64_B(offs26) \ + A64_INSN(0xa,0x0,_,_,_,_,_,_,(offs26) >> 2) +#define A64_BL(offs26) \ + A64_INSN(0xa,0x4,_,_,_,_,_,_,(offs26) >> 2) +#define A64_BR(rn) \ + A64_INSN(0xb,0x6,_,_,0x1f,_,_,rn,_) +#define A64_BLR(rn) \ + A64_INSN(0xb,0x6,_,_,0x3f,_,_,rn,_) +#define A64_RET(rn) /* same as BR, but hint for cpu */ \ + A64_INSN(0xb,0x6,_,_,0x5f,_,_,rn,_) +#define A64_BCOND(cond, offs19) \ + A64_INSN(0xa,0x2,_,_,_,_,_,(offs19) >> 2,(cond)) + +// conditional select + +#define A64_CINC(cond, rn, rm) \ + A64_INSN(0xd,0x0,0x2,0,rm,(cond)^1,0x1,rm,rn) /* CSINC */ +#define A64_CSET(cond, rn) \ + A64_CINC(cond, rn, Z0) + +// load pc-relative + +#define A64_LDRLIT_IMM(rd, offs19) \ + A64_INSN(0xc,0x0,0x0,_,_,_,_,(offs19) >> 2,rd) +#define A64_LDRXLIT_IMM(rd, offs19) \ + A64_INSN(0xc,0x2,0x0,_,_,_,_,(offs19) >> 2,rd) +#define A64_ADRXLIT_IMM(rd, offs21) \ + A64_INSN(0x8,(offs21)&3,0x0,_,_,_,_,(offs21) >> 2,rd) + +// load/store indexed base. Only the signed unscaled variant is used here. 
+ +enum { LT_ST, LT_LD, LT_LDSX, LT_LDS }; +enum { AM_B=0x1, AM_H=0x3, AM_W=0x5, AM_X=0x7 }; +enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; +#define A64_LDST_AM(ir,rm,optimm) (((ir)<<9)|((rm)<<4)|((optimm)&0x1ff)) +#define A64_OP_LDST(sz, op, am, mode, rm, rd) \ + A64_INSN(0xc,sz,op,_,_,am,mode,rm,rd) + +#define A64_LDSTX_IMM(rd, rn, offs9, ld, mode) \ + A64_OP_LDST(AM_X,ld,A64_LDST_AM(0,_,offs9),mode,rn,rd) +#define A64_LDST_IMM(rd, rn, offs9, ld, mode) \ + A64_OP_LDST(AM_W,ld,A64_LDST_AM(0,_,offs9),mode,rn,rd) +#define A64_LDSTH_IMM(rd, rn, offs9, ld, mode) \ + A64_OP_LDST(AM_H,ld,A64_LDST_AM(0,_,offs9),mode,rn,rd) +#define A64_LDSTB_IMM(rd, rn, offs9, ld, mode) \ + A64_OP_LDST(AM_B,ld,A64_LDST_AM(0,_,offs9),mode,rn,rd) + +// NB: pre/postindex isn't available with register offset +#define A64_LDSTX_REG(rd, rn, rm, ld, opt) \ + A64_OP_LDST(AM_X,ld,A64_LDST_AM(1,rm,opt),AM_IDXREG,rn,rd) +#define A64_LDST_REG(rd, rn, rm, ld, opt) \ + A64_OP_LDST(AM_W,ld,A64_LDST_AM(1,rm,opt),AM_IDXREG,rn,rd) +#define A64_LDSTH_REG(rd, rn, rm, ld, opt) \ + A64_OP_LDST(AM_H,ld,A64_LDST_AM(1,rm,opt),AM_IDXREG,rn,rd) +#define A64_LDSTB_REG(rd, rn, rm, ld, opt) \ + A64_OP_LDST(AM_B,ld,A64_LDST_AM(1,rm,opt),AM_IDXREG,rn,rd) + +#define A64_LDSTPX_IMM(rn, r1, r2, offs7, ld, mode) \ + A64_INSN(0x4,0x5,(mode<<1)|ld,_,_,(offs7)&0x3f8,r2,rn,r1) + +// 64 bit stuff for pointer handling + +#define A64_ADDX_XREG(rd, rn, rm, xtopt, simm) \ + OP_SZ64|A64_OP_XREG(OP_ADD,rd,rn,rm,xtopt,simm) +#define A64_ADDX_REG(rd, rn, rm, stype, simm) \ + OP_SZ64|A64_ADD_REG(rd, rn, rm, stype, simm) +#define A64_ADDXS_REG(rd, rn, rm, stype, simm) \ + OP_SZ64|A64_ADDS_REG(rd, rn, rm, stype, simm) +#define A64_ORX_REG(rd, rn, rm, stype, simm) \ + OP_SZ64|A64_OR_REG(rd, rn, rm, stype, simm) +#define A64_TSTX_REG(rn, rm, stype, simm) \ + OP_SZ64|A64_TST_REG(rn, rm, stype, simm) +#define A64_MOVX_REG(rd, rm, stype, simm) \ + OP_SZ64|A64_MOV_REG(rd, rm, stype, simm) +#define A64_ADDX_IMM(rd, rn, imm12) \ + 
OP_SZ64|A64_ADD_IMM(rd, rn, imm12, 0) +#define A64_EORX_IMM(rd, rn, immr, imms) \ + OP_SZ64|OP_N64|A64_EOR_IMM(rd, rn, immr, imms) +#define A64_UXTX_IMM(rd, rn, bits) \ + OP_SZ64|OP_N64|A64_UXT_IMM(rd, rn, bits) +#define A64_LSRX_IMM(rd, rn, bits) \ + OP_SZ64|OP_N64|A64_LSR_IMM(rd, rn, bits)|(63<<10) + + +// XXX: tcache_ptr type for SVP and SH2 compilers differs.. +#define EMIT_PTR(ptr, x) \ + do { \ + *(u32 *)(ptr) = x; \ + ptr = (void *)((u8 *)(ptr) + sizeof(u32)); \ + } while (0) + +#define EMIT(op) \ + do { \ + EMIT_PTR(tcache_ptr, op); \ + COUNT_OP; \ + } while (0) + + +// if-then-else conditional execution helpers +#define JMP_POS(ptr) { \ + ptr = tcache_ptr; \ + EMIT(A64_B(0)); \ +} + +#define JMP_EMIT(cond, ptr) { \ + u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr); \ + EMIT_PTR(ptr, A64_BCOND(cond, val_ & 0x001fffff)); \ +} + +#define JMP_EMIT_NC(ptr) { \ + u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr); \ + EMIT_PTR(ptr, A64_B(val_ & 0x0fffffff)); \ +} + +#define EMITH_JMP_START(cond) { \ + u8 *cond_ptr; \ + JMP_POS(cond_ptr) + +#define EMITH_JMP_END(cond) \ + JMP_EMIT(cond, cond_ptr); \ +} + +#define EMITH_JMP3_START(cond) { \ + u8 *cond_ptr, *else_ptr; \ + JMP_POS(cond_ptr) + +#define EMITH_JMP3_MID(cond) \ + JMP_POS(else_ptr); \ + JMP_EMIT(cond, cond_ptr); + +#define EMITH_JMP3_END() \ + JMP_EMIT_NC(else_ptr); \ +} + +#define EMITH_HINT_COND(cond) /**/ + +// "simple" jump (no more than a few insns) +// ARM32 will use conditional instructions here +#define EMITH_SJMP_START EMITH_JMP_START +#define EMITH_SJMP_END EMITH_JMP_END + +#define EMITH_SJMP3_START EMITH_JMP3_START +#define EMITH_SJMP3_MID EMITH_JMP3_MID +#define EMITH_SJMP3_END EMITH_JMP3_END + +#define EMITH_SJMP2_START(cond) \ + EMITH_SJMP3_START(cond) +#define EMITH_SJMP2_MID(cond) \ + EMITH_SJMP3_MID(cond) +#define EMITH_SJMP2_END(cond) \ + EMITH_SJMP3_END() + + +// data processing, register +#define emith_move_r_r_ptr(d, s) \ + EMIT(A64_MOVX_REG(d, s, ST_LSL, 0)) +#define emith_move_r_r_ptr_c(cond, d, 
s) \ + emith_move_r_r_ptr(d, s) + +#define emith_move_r_r(d, s) \ + EMIT(A64_MOV_REG(d, s, ST_LSL, 0)) +#define emith_move_r_r_c(cond, d, s) \ + emith_move_r_r(d, s) + +#define emith_mvn_r_r(d, s) \ + EMIT(A64_MVN_REG(d, s, ST_LSL, 0)) + +#define emith_add_r_r_r_lsl_ptr(d, s1, s2, simm) do { \ + if (simm < 4) EMIT(A64_ADDX_XREG(d, s1, s2, XT_SXTW, simm)); \ + else EMIT(A64_ADDX_REG(d, s1, s2, ST_LSL, simm)); \ +} while (0) +#define emith_add_r_r_r_lsl(d, s1, s2, simm) \ + EMIT(A64_ADD_REG(d, s1, s2, ST_LSL, simm)) + +#define emith_addf_r_r_r_lsl(d, s1, s2, simm) \ + EMIT(A64_ADDS_REG(d, s1, s2, ST_LSL, simm)) + +#define emith_addf_r_r_r_lsr(d, s1, s2, simm) \ + EMIT(A64_ADDS_REG(d, s1, s2, ST_LSR, simm)) + +#define emith_adc_r_r_r_lsl(d, s1, s2, simm) do { /* adc has no shifted operand: shift s2 into a temp first */ \ + if (simm) { int _t = rcache_get_tmp(); \ + emith_lsl(_t, s2, simm); \ + emith_adc_r_r_r(d, s1, _t); \ + rcache_free_tmp(_t); \ + } else \ + emith_adc_r_r_r(d, s1, s2); \ +} while (0) + +#define emith_sbc_r_r_r_lsl(d, s1, s2, simm) do { /* sbc has no shifted operand: shift s2 into a temp first */ \ + if (simm) { int _t = rcache_get_tmp(); \ + emith_lsl(_t, s2, simm); \ + emith_sbc_r_r_r(d, s1, _t); \ + rcache_free_tmp(_t); \ + } else \ + emith_sbc_r_r_r(d, s1, s2); \ +} while (0) + +#define emith_sub_r_r_r_lsl(d, s1, s2, simm) \ + EMIT(A64_SUB_REG(d, s1, s2, ST_LSL, simm)) + +#define emith_subf_r_r_r_lsl(d, s1, s2, simm) \ + EMIT(A64_SUBS_REG(d, s1, s2, ST_LSL, simm)) + +#define emith_or_r_r_r_lsl(d, s1, s2, simm) \ + EMIT(A64_OR_REG(d, s1, s2, ST_LSL, simm)) +#define emith_or_r_r_r_lsr(d, s1, s2, simm) \ + EMIT(A64_OR_REG(d, s1, s2, ST_LSR, simm)) + +#define emith_eor_r_r_r_lsl(d, s1, s2, simm) \ + EMIT(A64_EOR_REG(d, s1, s2, ST_LSL, simm)) +#define emith_eor_r_r_r_lsr(d, s1, s2, simm) \ + EMIT(A64_EOR_REG(d, s1, s2, ST_LSR, simm)) + +#define emith_and_r_r_r_lsl(d, s1, s2, simm) \ + EMIT(A64_AND_REG(d, s1, s2, ST_LSL, simm)) + +#define emith_or_r_r_lsl(d, s, lslimm) \ + emith_or_r_r_r_lsl(d, d, s, lslimm) +#define emith_or_r_r_lsr(d, s, lsrimm) \ + emith_or_r_r_r_lsr(d, d, s, 
lsrimm) + +#define emith_eor_r_r_lsl(d, s, lslimm) \ + emith_eor_r_r_r_lsl(d, d, s, lslimm) +#define emith_eor_r_r_lsr(d, s, lsrimm) \ + emith_eor_r_r_r_lsr(d, d, s, lsrimm) + +#define emith_add_r_r_r(d, s1, s2) \ + emith_add_r_r_r_lsl(d, s1, s2, 0) + +#define emith_addf_r_r_r(d, s1, s2) \ + emith_addf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_sub_r_r_r(d, s1, s2) \ + emith_sub_r_r_r_lsl(d, s1, s2, 0) + +#define emith_subf_r_r_r(d, s1, s2) \ + emith_subf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_or_r_r_r(d, s1, s2) \ + emith_or_r_r_r_lsl(d, s1, s2, 0) + +#define emith_eor_r_r_r(d, s1, s2) \ + emith_eor_r_r_r_lsl(d, s1, s2, 0) + +#define emith_add_r_r_r_ptr(d, s1, s2) \ + emith_add_r_r_r_lsl_ptr(d, s1, s2, 0) +#define emith_and_r_r_r(d, s1, s2) \ + emith_and_r_r_r_lsl(d, s1, s2, 0) + +#define emith_add_r_r_ptr(d, s) \ + emith_add_r_r_r_lsl_ptr(d, d, s, 0) +#define emith_add_r_r(d, s) \ + emith_add_r_r_r(d, d, s) + +#define emith_sub_r_r(d, s) \ + emith_sub_r_r_r(d, d, s) + +#define emith_neg_r_r(d, s) \ + EMIT(A64_NEG_REG(d, s, ST_LSL, 0)) + +#define emith_negc_r_r(d, s) \ + EMIT(A64_NEGC_REG(d, s)) + +#define emith_adc_r_r_r(d, s1, s2) \ + EMIT(A64_ADC_REG(d, s1, s2)) + +#define emith_adc_r_r(d, s) \ + EMIT(A64_ADC_REG(d, d, s)) + +#define emith_adcf_r_r_r(d, s1, s2) \ + EMIT(A64_ADCS_REG(d, s1, s2)) + +#define emith_sbc_r_r_r(d, s1, s2) \ + EMIT(A64_SBC_REG(d, s1, s2)) + +#define emith_sbcf_r_r_r(d, s1, s2) \ + EMIT(A64_SBCS_REG(d, s1, s2)) + +#define emith_and_r_r(d, s) \ + emith_and_r_r_r(d, d, s) +#define emith_and_r_r_c(cond, d, s) \ + emith_and_r_r(d, s) + +#define emith_or_r_r(d, s) \ + emith_or_r_r_r(d, d, s) + +#define emith_eor_r_r(d, s) \ + emith_eor_r_r_r(d, d, s) + +#define emith_tst_r_r_ptr(d, s) \ + EMIT(A64_TSTX_REG(d, s, ST_LSL, 0)) +#define emith_tst_r_r(d, s) \ + EMIT(A64_TST_REG(d, s, ST_LSL, 0)) + +#define emith_teq_r_r(d, s) do { \ + int _t = rcache_get_tmp(); \ + emith_eor_r_r_r(_t, d, s); \ + emith_cmp_r_imm(_t, 0); \ + rcache_free_tmp(_t); \ 
+} while (0) + +#define emith_cmp_r_r(d, s) \ + EMIT(A64_CMP_REG(d, s, ST_LSL, 0)) + +#define emith_addf_r_r(d, s) \ + emith_addf_r_r_r(d, d, s) + +#define emith_subf_r_r(d, s) \ + emith_subf_r_r_r(d, d, s) + +#define emith_adcf_r_r(d, s) \ + emith_adcf_r_r_r(d, d, s) + +#define emith_sbcf_r_r(d, s) \ + emith_sbcf_r_r_r(d, d, s) + +#define emith_negcf_r_r(d, s) \ + emith_sbcf_r_r_r(d, Z0, s) + + +// move immediate + +static void emith_move_imm64(int r, int wx, int64_t imm) +{ + int sz64 = wx ? OP_SZ64:0; + int c, s; + + if (!imm) { + EMIT(sz64|A64_MOVZ_IMM(r, imm, 0)); + return; + } + if (imm && -imm == (u16)-imm) { + EMIT(sz64|A64_MOVN_IMM(r, ~imm, 0)); + return; + } + + for (c = s = 0; s < (wx ? 4:2) && imm; s++, imm >>= 16) + if ((u16)(imm)) { + if (c++) EMIT(sz64|A64_MOVK_IMM(r, imm, s)); + else EMIT(sz64|A64_MOVZ_IMM(r, imm, s)); + } +} + +#define emith_move_r_ptr_imm(r, imm) \ + emith_move_imm64(r, 1, (intptr_t)(imm)) + +#define emith_move_r_imm(r, imm) \ + emith_move_imm64(r, 0, (s32)(imm)) +#define emith_move_r_imm_c(cond, r, imm) \ + emith_move_r_imm(r, imm) + +#define emith_move_r_imm_s8_patchable(r, imm) do { \ + if ((s8)(imm) < 0) \ + EMIT(A64_MOVN_IMM(r, ~(s8)(imm), 0)); \ + else \ + EMIT(A64_MOVZ_IMM(r, (s8)(imm), 0)); \ +} while (0) +#define emith_move_r_imm_s8_patch(ptr, imm) do { \ + u32 *ptr_ = (u32 *)ptr; \ + int r_ = *ptr_ & 0x1f; \ + if ((s8)(imm) < 0) \ + EMIT_PTR(ptr_, A64_MOVN_IMM(r_, ~(s8)(imm), 0)); \ + else \ + EMIT_PTR(ptr_, A64_MOVZ_IMM(r_, (s8)(imm), 0)); \ +} while (0) + +// arithmetic, immediate +static void emith_arith_imm(int op, int wx, int rd, int rn, s32 imm) +{ + u32 sz64 = wx ? 
OP_SZ64:0; + + if (imm < 0) { + op ^= (OP_ADD ^ OP_SUB); + imm = -imm; + } + if (imm == 0) { + // value 0, must emit if op is *S or source isn't dest + if ((op & 1) || rd != rn) + EMIT(sz64|A64_OP_IMM12(op, rd, rn, 0, 0)); + } else if (imm >> 24) { + // value too large + int _t = rcache_get_tmp(); + emith_move_r_imm(_t, imm); + EMIT(sz64|A64_OP_REG(op, 0, rd, rn, _t, ST_LSL, 0)); + rcache_free_tmp(_t); + } else { + int rs = rn; + if ((imm) & 0x000fff) { + EMIT(sz64|A64_OP_IMM12(op, rd, rs, imm, 0)); rs = rd; + } + if ((imm) & 0xfff000) { + EMIT(sz64|A64_OP_IMM12(op, rd, rs, imm >>12, 1)); + } + } +} + +#define emith_add_r_imm(r, imm) \ + emith_arith_imm(OP_ADD, 0, r, r, imm) +#define emith_add_r_imm_c(cond, r, imm) \ + emith_add_r_imm(r, imm) + +#define emith_addf_r_imm(r, imm) \ + emith_arith_imm(OP_ADDS, 0, r, r, imm) + +#define emith_sub_r_imm(r, imm) \ + emith_arith_imm(OP_SUB, 0, r, r, imm) +#define emith_sub_r_imm_c(cond, r, imm) \ + emith_sub_r_imm(r, imm) + +#define emith_subf_r_imm(r, imm) \ + emith_arith_imm(OP_SUBS, 0, r, r, imm) + + +#define emith_adc_r_imm(r, imm) do { \ + int _t = rcache_get_tmp(); \ + emith_move_r_imm(_t, imm); \ + emith_adc_r_r(r, _t); \ + rcache_free_tmp(_t); \ +} while (0) + +#define emith_adcf_r_imm(r, imm) do { \ + int _t = rcache_get_tmp(); \ + emith_move_r_imm(_t, imm); \ + emith_adcf_r_r(r, _t); \ + rcache_free_tmp(_t); \ +} while (0) + +#define emith_cmp_r_imm(r, imm) do { \ + u32 op_ = OP_SUBS, imm_ = (u8)imm; \ + if ((s8)imm_ < 0) { \ + imm_ = (u8)-imm_; \ + op_ = OP_ADDS; \ + } \ + EMIT(A64_OP_IMM12(op_, Z0, r, imm_, 0)); \ +} while (0) + + +#define emith_add_r_r_ptr_imm(d, s, imm) \ + emith_arith_imm(OP_ADD, 1, d, s, imm) + +#define emith_add_r_r_imm(d, s, imm) \ + emith_arith_imm(OP_ADD, 0, d, s, imm) + +#define emith_sub_r_r_imm(d, s, imm) \ + emith_arith_imm(OP_SUB, 0, d, s, imm) +#define emith_sub_r_r_imm_c(cond, d, s, imm) \ + emith_sub_r_r_imm(d, s, imm) + +#define emith_subf_r_r_imm(d, s, imm) \ + 
emith_arith_imm(OP_SUBS, 0, d, s, imm) + + +// logical, immediate; the value describes a bitmask, see ARMv8 ArchRefMan +// NB: deal only with simple masks 0{n}1{m}0{o} or 1{n}0{m}1{o}, 0 16) { + emith_move_r_imm(_t, ~imm); + EMIT(sz64|A64_OP_REG(op, 1, rd, rn, _t, ST_LSL, 0)); + } else { + emith_move_r_imm(_t, imm); + EMIT(sz64|A64_OP_REG(op, 0, rd, rn, _t, ST_LSL, 0)); + } + rcache_free_tmp(_t); + } +} + +#define emith_and_r_imm(r, imm) \ + emith_log_imm(OP_AND, 0, r, r, imm) + +#define emith_or_r_imm(r, imm) \ + emith_log_imm(OP_OR, 0, r, r, imm) +#define emith_or_r_imm_c(cond, r, imm) \ + emith_or_r_imm(r, imm) + +#define emith_eor_r_imm_ptr(r, imm) \ + emith_log_imm(OP_EOR, 1, r, r, imm) +#define emith_eor_r_imm_ptr_c(cond, r, imm) \ + emith_eor_r_imm_ptr(r, imm) + +#define emith_eor_r_imm(r, imm) \ + emith_log_imm(OP_EOR, 0, r, r, imm) +#define emith_eor_r_imm_c(cond, r, imm) \ + emith_eor_r_imm(r, imm) + +/* NB: BIC #imm not available in A64; use AND #~imm instead */ +#define emith_bic_r_imm(r, imm) \ + emith_log_imm(OP_AND, 0, r, r, ~(imm)) +#define emith_bic_r_imm_c(cond, r, imm) \ + emith_bic_r_imm(r, imm) + +#define emith_tst_r_imm(r, imm) \ + emith_log_imm(OP_ANDS, 0, Z0, r, imm) +#define emith_tst_r_imm_c(cond, r, imm) \ + emith_tst_r_imm(r, imm) + +#define emith_and_r_r_imm(d, s, imm) \ + emith_log_imm(OP_AND, 0, d, s, imm) + +#define emith_or_r_r_imm(d, s, imm) \ + emith_log_imm(OP_OR, 0, d, s, imm) + +#define emith_eor_r_r_imm(d, s, imm) \ + emith_log_imm(OP_EOR, 0, d, s, imm) + + +// shift +#define emith_lsl(d, s, cnt) \ + EMIT(A64_LSL_IMM(d, s, cnt)) + +#define emith_lsr(d, s, cnt) \ + EMIT(A64_LSR_IMM(d, s, cnt)) + +#define emith_asr(d, s, cnt) \ + EMIT(A64_ASR_IMM(d, s, cnt)) + +#define emith_ror(d, s, cnt) \ + EMIT(A64_ROR_IMM(d, s, cnt)) +#define emith_ror_c(cond, d, s, cnt) \ + emith_ror(d, s, cnt) + +#define emith_rol(d, s, cnt) \ + EMIT(A64_ROR_IMM(d, s, 32-(cnt))) + +// NB: shift with carry not directly supported in A64 :-|. 
+#define emith_lslf(d, s, cnt) do { \ + if ((cnt) > 1) { \ + emith_lsl(d, s, cnt-1); \ + emith_addf_r_r_r(d, d, d); \ + } else if ((cnt) > 0) \ + emith_addf_r_r_r(d, s, s); \ +} while (0) + +#define emith_lsrf(d, s, cnt) do { \ + EMIT(A64_RBIT_REG(d, s)); \ + emith_lslf(d, d, cnt); \ + EMIT(A64_RBIT_REG(d, d)); \ +} while (0) + +#define emith_asrf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_asr(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_addf_r_r_r(Z0, _s, _s); \ + EMIT(A64_RBIT_REG(d, _s)); \ + emith_adcf_r_r_r(d, d, d); \ + EMIT(A64_RBIT_REG(d, d)); \ + } \ +} while (0) + +#define emith_rolf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_rol(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_addf_r_r_r(d, _s, _s); \ + emith_adc_r_r_r(d, d, Z0); \ + } \ +} while (0) + +#define emith_rorf(d, s, cnt) do { \ + if ((cnt) > 0) { \ + emith_ror(d, s, cnt); \ + emith_addf_r_r_r(Z0, d, d); \ + } \ +} while (0) + +#define emith_rolcf(d) \ + emith_adcf_r_r(d, d) +#define emith_rolc(d) \ + emith_adc_r_r(d, d) + +#define emith_rorcf(d) do { \ + EMIT(A64_RBIT_REG(d, d)); \ + emith_adcf_r_r(d, d); \ + EMIT(A64_RBIT_REG(d, d)); \ +} while (0) +#define emith_rorc(d) do { \ + EMIT(A64_RBIT_REG(d, d)); \ + emith_adc_r_r(d, d); \ + EMIT(A64_RBIT_REG(d, d)); \ +} while (0) + +// signed/unsigned extend +#define emith_clear_msb(d, s, count) /* bits to clear */ \ + EMIT(A64_UXT_IMM(d, s, 32-(count))) +#define emith_clear_msb_c(cond, d, s, count) \ + emith_clear_msb(d, s, count) + +#define emith_sext(d, s, count) /* bits to keep */ \ + EMIT(A64_SXT_IMM(d, s, count)) + +// multiply Rd = Rn*Rm (+ Ra) +#define emith_mul(d, s1, s2) \ + EMIT(A64_MUL(d, s1, s2)) + +// NB: must combine/split Xd from/into 2 Wd's; play safe and clear upper bits +#define emith_combine64(dlo, dhi) \ + EMIT(A64_UXTX_IMM(dlo, dlo, 32)); \ + EMIT(A64_ORX_REG(dlo, dlo, dhi, ST_LSL, 32)); + +#define emith_split64(dlo, dhi) \ + EMIT(A64_LSRX_IMM(dhi, dlo, 
32)); \ + EMIT(A64_UXTX_IMM(dlo, dlo, 32)); + +#define emith_mul_u64(dlo, dhi, s1, s2) do { \ + EMIT(A64_UMULL(dlo, s1, s2)); \ + emith_split64(dlo, dhi); \ +} while (0) + +#define emith_mul_s64(dlo, dhi, s1, s2) do { \ + EMIT(A64_SMULL(dlo, s1, s2)); \ + emith_split64(dlo, dhi); \ +} while (0) + +#define emith_mula_s64(dlo, dhi, s1, s2) do { \ + emith_combine64(dlo, dhi); \ + EMIT(A64_SMADDL(dlo, s1, s2, dlo)); \ + emith_split64(dlo, dhi); \ +} while (0) +#define emith_mula_s64_c(cond, dlo, dhi, s1, s2) \ + emith_mula_s64(dlo, dhi, s1, s2) + +// load/store. offs has 9 bits signed, hence larger offs may use a temp +static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) +{ + if (o9 >= -256 && o9 < 256) { + EMIT(A64_OP_LDST(sz, ld, A64_LDST_AM(0,_,o9), mode, rn, rd)); + } else if (mode == AM_IDXPRE) { + emith_add_r_r_ptr_imm(rn, rn, o9); + EMIT(A64_OP_LDST(sz, ld, A64_LDST_AM(0,_,0), AM_IDX, rn, rd)); + } else if (mode == AM_IDXPOST) { + EMIT(A64_OP_LDST(sz, ld, A64_LDST_AM(0,_,0), AM_IDX, rn, rd)); + emith_add_r_r_ptr_imm(rn, rn, o9); + } else { + int _t = rcache_get_tmp(); + emith_add_r_r_ptr_imm(_t, rn, o9); + EMIT(A64_OP_LDST(sz, ld, A64_LDST_AM(0,_,0), AM_IDX, _t, rd)); + rcache_free_tmp(_t); + } +} + +#define emith_read_r_r_offs_ptr(r, rs, offs) \ + emith_ldst_offs(AM_X, r, rs, offs, LT_LD, AM_IDX) +#define emith_read_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_read_r_r_offs_ptr(r, rs, offs) + +#define emith_read_r_r_offs(r, rs, offs) \ + emith_ldst_offs(AM_W, r, rs, offs, LT_LD, AM_IDX) +#define emith_read_r_r_offs_c(cond, r, rs, offs) \ + emith_read_r_r_offs(r, rs, offs) + +#define emith_read_r_r_r_ptr(r, rs, rm) \ + EMIT(A64_LDSTX_REG(r, rs, rm, LT_LD, XT_SXTW)) + +#define emith_read_r_r_r(r, rs, rm) \ + EMIT(A64_LDST_REG(r, rs, rm, LT_LD, XT_SXTW)) +#define emith_read_r_r_r_c(cond, r, rs, rm) \ + emith_read_r_r_r(r, rs, rm) + +#define emith_read8_r_r_offs(r, rs, offs) \ + emith_ldst_offs(AM_B, r, rs, offs, LT_LD, AM_IDX) +#define 
emith_read8_r_r_offs_c(cond, r, rs, offs) \ + emith_read8_r_r_offs(r, rs, offs) + +#define emith_read8_r_r_r(r, rs, rm) \ + EMIT(A64_LDSTB_REG(r, rs, rm, LT_LD, XT_SXTW)) +#define emith_read8_r_r_r_c(cond, r, rs, rm) \ + emith_read8_r_r_r(r, rs, rm) + +#define emith_read16_r_r_offs(r, rs, offs) \ + emith_ldst_offs(AM_H, r, rs, offs, LT_LD, AM_IDX) +#define emith_read16_r_r_offs_c(cond, r, rs, offs) \ + emith_read16_r_r_offs(r, rs, offs) + +#define emith_read16_r_r_r(r, rs, rm) \ + EMIT(A64_LDSTH_REG(r, rs, rm, LT_LD, XT_SXTW)) +#define emith_read16_r_r_r_c(cond, r, rs, rm) \ + emith_read16_r_r_r(r, rs, rm) + +#define emith_read8s_r_r_offs(r, rs, offs) \ + emith_ldst_offs(AM_B, r, rs, offs, LT_LDS, AM_IDX) +#define emith_read8s_r_r_offs_c(cond, r, rs, offs) \ + emith_read8s_r_r_offs(r, rs, offs) + +#define emith_read8s_r_r_r(r, rs, rm) \ + EMIT(A64_LDSTB_REG(r, rs, rm, LT_LDS, XT_SXTW)) +#define emith_read8s_r_r_r_c(cond, r, rs, rm) \ + emith_read8s_r_r_r(r, rs, rm) + +#define emith_read16s_r_r_offs(r, rs, offs) \ + emith_ldst_offs(AM_H, r, rs, offs, LT_LDS, AM_IDX) +#define emith_read16s_r_r_offs_c(cond, r, rs, offs) \ + emith_read16s_r_r_offs(r, rs, offs) + +#define emith_read16s_r_r_r(r, rs, rm) \ + EMIT(A64_LDSTH_REG(r, rs, rm, LT_LDS, XT_SXTW)) +#define emith_read16s_r_r_r_c(cond, r, rs, rm) \ + emith_read16s_r_r_r(r, rs, rm) + + +#define emith_write_r_r_offs_ptr(r, rs, offs) \ + emith_ldst_offs(AM_X, r, rs, offs, LT_ST, AM_IDX) +#define emith_write_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_write_r_r_offs_ptr(r, rs, offs) + +#define emith_write_r_r_r_ptr(r, rs, rm) \ + EMIT(A64_LDSTX_REG(r, rs, rm, LT_ST, XT_SXTW)) +#define emith_write_r_r_r_ptr_c(cond, r, rs, rm) \ + emith_write_r_r_r_ptr(r, rs, rm) + +#define emith_write_r_r_offs(r, rs, offs) \ + emith_ldst_offs(AM_W, r, rs, offs, LT_ST, AM_IDX) +#define emith_write_r_r_offs_c(cond, r, rs, offs) \ + emith_write_r_r_offs(r, rs, offs) + +#define emith_write_r_r_r(r, rs, rm) \ + EMIT(A64_LDST_REG(r, rs, rm, 
LT_ST, XT_SXTW)) +#define emith_write_r_r_r_c(cond, r, rs, rm) \ + emith_write_r_r_r(r, rs, rm) + +#define emith_ctx_read_ptr(r, offs) \ + emith_read_r_r_offs_ptr(r, CONTEXT_REG, offs) + +#define emith_ctx_read(r, offs) \ + emith_read_r_r_offs(r, CONTEXT_REG, offs) +#define emith_ctx_read_c(cond, r, offs) \ + emith_ctx_read(r, offs) + +#define emith_ctx_write_ptr(r, offs) \ + emith_write_r_r_offs_ptr(r, CONTEXT_REG, offs) + +#define emith_ctx_write(r, offs) \ + emith_write_r_r_offs(r, CONTEXT_REG, offs) + +#define emith_ctx_read_multiple(r, offs, cnt, tmpr) do { \ + int r_ = r, offs_ = offs, cnt_ = cnt; \ + for (; cnt_ > 0; r_++, offs_ += 4, cnt_--) \ + emith_ctx_read(r_, offs_); \ +} while (0) + +#define emith_ctx_write_multiple(r, offs, cnt, tmpr) do { \ + int r_ = r, offs_ = offs, cnt_ = cnt; \ + for (; cnt_ > 0; r_++, offs_ += 4, cnt_--) \ + emith_ctx_write(r_, offs_); \ +} while (0) + +// push pairs; NB: SP must be 16 byte aligned (HW requirement!) +#define emith_push2(r1, r2) \ + EMIT(A64_LDSTPX_IMM(SP, r1, r2, -2*8, LT_ST, AM_IDXPRE)) +#define emith_pop2(r1, r2) \ + EMIT(A64_LDSTPX_IMM(SP, r1, r2, 2*8, LT_LD, AM_IDXPOST)) + +// function call handling +#define emith_save_caller_regs(mask) do { \ + int _c, _r1, _r2; u32 _m = (mask) & 0x3ffff; \ + if (__builtin_parity(_m) == 1) _m |= 0x40000; /* hardware align */ \ + for (_c = HOST_REGS-1, _r1 = -1; _m && _c >= 0; _m &= ~(1 << _c), _c--)\ + if (_m & (1 << _c)) { \ + _r2 = _r1, _r1 = _c; \ + if (_r2 != -1) { \ + emith_push2(_r1, _r2); \ + _r1 = -1; \ + } \ + } \ +} while (0) + +#define emith_restore_caller_regs(mask) do { \ + int _c, _r1, _r2; u32 _m = (mask) & 0x3ffff; \ + if (__builtin_parity(_m) == 1) _m |= 0x40000; /* hardware align */ \ + for (_c = 0, _r1 = -1; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ + if (_m & (1 << _c)) { \ + _r2 = _r1, _r1 = _c; \ + if (_r2 != -1) { \ + emith_pop2(_r2, _r1); \ + _r1 = -1; \ + } \ + } \ +} while (0) + +#define host_arg2reg(rd, arg) \ + rd = arg + +#define
emith_pass_arg_r(arg, reg) \ + emith_move_r_r(arg, reg) + +#define emith_pass_arg_imm(arg, imm) \ + emith_move_r_imm(arg, imm) + +// branching; NB: A64 B.cond has only +/- 1MB range + +#define emith_jump(target) do {\ + u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \ + EMIT(A64_B(disp_ & 0x0fffffff)); \ +} while (0) + +#define emith_jump_patchable(target) \ + emith_jump(target) + +#define emith_jump_cond(cond, target) do { \ + u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \ + EMIT(A64_BCOND(cond, disp_ & 0x001fffff)); \ +} while (0) + +#define emith_jump_cond_patchable(cond, target) \ + emith_jump_cond(cond, target) + +#define emith_jump_cond_inrange(target) \ + !(((u8 *)target - (u8 *)tcache_ptr + 0x100000) >> 21) + +#define emith_jump_patch(ptr, target, pos) do { \ + u32 *ptr_ = (u32 *)ptr; \ + u32 disp_ = (u8 *)target - (u8 *)ptr, mask_; \ + if ((*ptr_ & 0xff000000) == 0x54000000) \ + mask_ = 0xff00001f, disp_ <<= 5; /* B.cond, range 21 bit */ \ + else mask_ = 0xfc000000; /* B[L], range 28 bit */ \ + EMIT_PTR(ptr_, (*ptr_ & mask_) | ((disp_ >> 2) & ~mask_)); \ + if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)(ptr_-1); \ +} while (0) + +#define emith_jump_patch_inrange(ptr, target) \ + !(((u8 *)target - (u8 *)ptr + 0x100000) >> 21) +#define emith_jump_patch_size() 4 + +#define emith_jump_at(ptr, target) do { \ + u32 disp_ = (u8 *)target - (u8 *)ptr; \ + EMIT_PTR(ptr, A64_B(disp_ & 0x0fffffff)); \ +} while (0) +#define emith_jump_at_size() 4 + +#define emith_jump_reg(r) \ + EMIT(A64_BR(r)) +#define emith_jump_reg_c(cond, r) \ + emith_jump_reg(r) + +#define emith_jump_ctx(offs) do { \ + int _t = rcache_get_tmp(); \ + emith_ctx_read_ptr(_t, offs); \ + emith_jump_reg(_t); \ + rcache_free_tmp(_t); \ +} while (0) +#define emith_jump_ctx_c(cond, offs) \ + emith_jump_ctx(offs) + +#define emith_call(target) do { \ + u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \ + EMIT(A64_BL(disp_ & 0x0fffffff)); \ +} while (0) +#define emith_call_cond(cond, target) \ + 
emith_call(target) + +#define emith_call_reg(r) \ + EMIT(A64_BLR(r)) + +#define emith_call_ctx(offs) do { \ + int _t = rcache_get_tmp(); \ + emith_ctx_read_ptr(_t, offs); \ + emith_call_reg(_t); \ + rcache_free_tmp(_t); \ +} while (0) + +#define emith_call_cleanup() /**/ + +#define emith_ret() \ + EMIT(A64_RET(LR)) +#define emith_ret_c(cond) \ + emith_ret() + +#define emith_ret_to_ctx(offs) \ + emith_ctx_write_ptr(LR, offs) + +#define emith_add_r_ret(r) \ + emith_add_r_r_r_ptr(r, LR, r) + +// NB: pushes r or r18 for SP hardware alignment +#define emith_push_ret(r) do { \ + int r_ = (r >= 0 ? r : 18); \ + emith_push2(r_, LR); \ +} while (0) + +#define emith_pop_and_ret(r) do { \ + int r_ = (r >= 0 ? r : 18); \ + emith_pop2(r_, LR); \ + emith_ret(); \ +} while (0) + + +// emitter ABI stuff +#define emith_pool_check() /**/ +#define emith_pool_commit(j) /**/ +#define emith_insn_ptr() ((u8 *)tcache_ptr) +#define emith_flush() /**/ +#define host_instructions_updated(base, end) __builtin___clear_cache(base, end) +#define emith_update_cache() /**/ +#define emith_rw_offs_max() 0xff +#define emith_uext_ptr(r) /**/ + + +// SH2 drc specific +#define emith_sh2_drc_entry() do { \ + emith_push2(LR, FP); \ + emith_push2(28, 27); \ + emith_push2(26, 25); \ + emith_push2(24, 23); \ + emith_push2(22, 21); \ + emith_push2(20, 19); \ +} while (0) +#define emith_sh2_drc_exit() do { \ + emith_pop2(20, 19); \ + emith_pop2(22, 21); \ + emith_pop2(24, 23); \ + emith_pop2(26, 25); \ + emith_pop2(28, 27); \ + emith_pop2(LR, FP); \ + emith_ret(); \ +} while (0) + +// NB: assumes a is in arg0, tab, func and mask are temp +#define emith_sh2_rcall(a, tab, func, mask) do { \ + emith_lsr(mask, a, SH2_READ_SHIFT); \ + EMIT(A64_ADDX_REG(tab, tab, mask, ST_LSL, 4)); \ + emith_read_r_r_offs_ptr(func, tab, 0); \ + emith_read_r_r_offs(mask, tab, 8); \ + EMIT(A64_ADDXS_REG(func, func, func, ST_LSL, 0)); \ +} while (0) + +// NB: assumes a, val are in arg0 and arg1, tab and func are temp +#define 
emith_sh2_wcall(a, val, tab, func) do { \ + emith_lsr(func, a, SH2_WRITE_SHIFT); \ + emith_lsl(func, func, 3); \ + emith_read_r_r_r_ptr(func, tab, func); \ + emith_move_r_r_ptr(2, CONTEXT_REG); /* arg2 */ \ + emith_jump_reg(func); \ +} while (0) + +#define emith_sh2_delay_loop(cycles, reg) do { \ + int sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); \ + int t1 = rcache_get_tmp(); \ + int t2 = rcache_get_tmp(); \ + int t3 = rcache_get_tmp(); \ + /* if (sr < 0) return */ \ + emith_cmp_r_imm(sr, 0); \ + EMITH_JMP_START(DCOND_LE); \ + /* turns = sr.cycles / cycles */ \ + emith_asr(t2, sr, 12); \ + emith_move_r_imm(t3, (u32)((1ULL<<32) / (cycles)) + 1); \ + emith_mul_u64(t1, t2, t2, t3); /* multiply by 1/x */ \ + rcache_free_tmp(t3); \ + if (reg >= 0) { \ + /* if (reg <= turns) turns = reg-1 */ \ + t3 = rcache_get_reg(reg, RC_GR_RMW, NULL); \ + emith_cmp_r_r(t3, t2); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_sub_r_r_imm_c(DCOND_LS, t2, t3, 1); \ + EMITH_SJMP_END(DCOND_HI); \ + /* if (reg <= 1) turns = 0 */ \ + emith_cmp_r_imm(t3, 1); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_move_r_imm_c(DCOND_LS, t2, 0); \ + EMITH_SJMP_END(DCOND_HI); \ + /* reg -= turns */ \ + emith_sub_r_r(t3, t2); \ + } \ + /* sr.cycles -= turns * cycles; */ \ + emith_move_r_imm(t1, cycles); \ + emith_mul(t1, t2, t1); \ + emith_sub_r_r_r_lsl(sr, sr, t1, 12); \ + EMITH_JMP_END(DCOND_LE); \ + rcache_free_tmp(t1); \ + rcache_free_tmp(t2); \ +} while (0) + +/* + * T = carry(Rn = (Rn << 1) | T) + * if Q + * t = !carry(Rn += Rm) + * else + * t = !carry(Rn -= Rm) + * T ^= t + */ +#define emith_sh2_div1_step(rn, rm, sr) do { \ + int tmp_ = rcache_get_tmp(); \ + emith_tpop_carry(sr, 0); \ + emith_adcf_r_r_r(rn, rn, rn); \ + emith_tpush_carry(sr, 0); \ + emith_tst_r_imm(sr, Q); \ + EMITH_SJMP3_START(DCOND_EQ); \ + emith_addf_r_r(rn, rm); \ + emith_adc_r_r_r(tmp_, Z0, Z0); \ + emith_eor_r_imm(tmp_, 1); \ + EMITH_SJMP3_MID(DCOND_EQ); \ + emith_subf_r_r(rn, rm); \ + emith_adc_r_r_r(tmp_, Z0, Z0); \ + 
EMITH_SJMP3_END(); \ + emith_eor_r_r(sr, tmp_); \ + rcache_free_tmp(tmp_); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macl(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* MACH top 16 bits unused if saturated. sign ext for overfl detect */ \ + emith_sext(mh, mh, 16); \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 17 bits of MACH aren't all 1 or 0 */ \ + /* to check: add MACH >> 31 to MACH >> 15. this is 0 if no overflow */ \ + emith_asr(rn, mh, 15); \ + emith_addf_r_r_r_lsr(rn, rn, mh, 31); \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> ov */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x0000); /* -overflow */ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x8000); \ + EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> +ovl */ \ + emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0xffffffff */ \ + emith_sub_r_imm_c(DCOND_GT, mh, 1); /* 0x00007fff */ \ + EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macw(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* XXX: MACH should be untouched when S is set? */ \ + emith_asr(mh, ml, 31); /* sign ext MACL to MACH for ovrfl check */ \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 33 bits of MACH:MACL aren't all 1 or 0 */ \ + /* to check: add MACL[31] to MACH. this is 0 if no overflow */ \ + emith_addf_r_r_r_lsr(rn, mh, ml, 31); /* sum = MACH + (MACL>>31) */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> overflow */ \ + /* XXX: LSB signalling only in SH1, or in SH2 too? 
*/ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x00000001); /* LSB of MACH */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x80000000); /* negative ovrfl */ \ + EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> positive ovrfl */ \ + emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0x7fffffff */ \ + EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +#define emith_write_sr(sr, srcr) \ + EMIT(A64_BFI_IMM(sr, srcr, 0, 10)) + +#define emith_carry_to_t(srr, is_sub) do { \ + emith_lsr(srr, srr, 1); \ + emith_adc_r_r(srr, srr); \ + if (is_sub) /* SUB has inverted C on ARM */ \ + emith_eor_r_imm(srr, 1); \ +} while (0) + +#define emith_t_to_carry(srr, is_sub) do { \ + if (is_sub) { \ + int t_ = rcache_get_tmp(); \ + emith_eor_r_r_imm(t_, srr, 1); \ + emith_rorf(t_, t_, 1); \ + rcache_free_tmp(t_); \ + } else { \ + emith_rorf(srr, srr, 1); \ + emith_rol(srr, srr, 1); \ + } \ +} while (0) + +#define emith_tpop_carry(sr, is_sub) do { \ + if (is_sub) \ + emith_eor_r_imm(sr, 1); \ + emith_ror(sr, sr, 1); \ + emith_addf_r_r(sr, sr); \ +} while (0) + +#define emith_tpush_carry(sr, is_sub) do { \ + emith_adc_r_r(sr, Z0); \ + if (is_sub) \ + emith_eor_r_imm(sr, 1); \ +} while (0) + +#ifdef T +// T bit handling +static int tcond = -1; + +#define emith_invert_cond(cond) \ + ((cond) ^ 1) + +#define emith_clr_t_cond(sr) \ + (void)sr + +#define emith_set_t_cond(sr, cond) \ + tcond = cond + +#define emith_get_t_cond() \ + tcond + +#define emith_invalidate_t() \ + tcond = -1 + +#define emith_set_t(sr, val) \ + tcond = ((val) ?
A64_COND_AL: A64_COND_NV) + +static void emith_sync_t(int sr) +{ + if (tcond == A64_COND_AL) + emith_or_r_imm(sr, T); + else if (tcond == A64_COND_NV) + emith_bic_r_imm(sr, T); + else if (tcond >= 0) { + int tmp = rcache_get_tmp(); + EMIT(A64_CSET(tcond, tmp)); + EMIT(A64_BFI_IMM(sr, tmp, __builtin_ffs(T)-1, 1)); + rcache_free_tmp(tmp); + } + tcond = -1; +} + +static int emith_tst_t(int sr, int tf) +{ + if (tcond < 0) { + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; + } else if (tcond >= A64_COND_AL) { + // MUST sync because A64_COND_AL/NV isn't a real condition + emith_sync_t(sr); + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; + } else + return tf ? tcond : emith_invert_cond(tcond); +} +#endif diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c new file mode 100644 index 000000000..fb7de3661 --- /dev/null +++ b/cpu/drc/emit_mips.c @@ -0,0 +1,1842 @@ +/* + * Basic macros to emit MIPS32/MIPS64 Release 1 or 2 instructions and some utils + * Copyright (C) 2019 kub + * + * This work is licensed under the terms of MAME license. + * See COPYING file in the top-level directory. 
+ */ +#define HOST_REGS 32 + +// MIPS32 ABI: params: r4-r7, return: r2-r3, temp: r1(at),r8-r15,r24-r25,r31(ra) +// saved: r16-r23,r30, reserved: r0(zero), r26-r27(irq), r28(gp), r29(sp) +// r1,r15,r24,r25(at,t7-t9) are used internally by the code emitter +// MIPSN32/MIPS64 ABI: params: r4-r11, no caller-reserved save area on stack +#define RET_REG 2 // v0 +#define PARAM_REGS { 4, 5, 6, 7 } // a0-a3 +#define PRESERVED_REGS { 16, 17, 18, 19, 20, 21, 22, 23 } // s0-s7 +#define TEMPORARY_REGS { 2, 3, 8, 9, 10, 11, 12, 13, 14 } // v0-v1,t0-t6 + +#define CONTEXT_REG 23 // s7 +#define STATIC_SH2_REGS { SHR_SR,22 , SHR_R(0),21 , SHR_R(1),20 } + +// NB: the ubiquitous JZ74[46]0 uses MIPS32 Release 1, a slight MIPS II superset +#ifndef __mips_isa_rev +#define __mips_isa_rev 1 // surprisingly not always defined +#endif + +// registers usable for user code: r1-r25, others reserved or special +#define Z0 0 // zero register +#define GP 28 // global pointer +#define SP 29 // stack pointer +#define FP 30 // frame pointer +#define LR 31 // link register +// internally used by code emitter: +#define AT 1 // used to hold intermediate results +#define FNZ 15 // emulated processor flags: N (bit 31) ,Z (all bits) +#define FC 24 // emulated processor flags: C (bit 0), others 0 +#define FV 25 // emulated processor flags: Nt^Ns (bit 31). others x + +// All operations but ptr ops are using the lower 32 bits of the registers. +// The upper 32 bits always contain the sign extension from the lower 32 bits. 
+ +// unified conditions; virtual, not corresponding to anything real on MIPS +#define DCOND_EQ 0x0 +#define DCOND_NE 0x1 +#define DCOND_HS 0x2 +#define DCOND_LO 0x3 +#define DCOND_MI 0x4 +#define DCOND_PL 0x5 +#define DCOND_VS 0x6 +#define DCOND_VC 0x7 +#define DCOND_HI 0x8 +#define DCOND_LS 0x9 +#define DCOND_GE 0xa +#define DCOND_LT 0xb +#define DCOND_GT 0xc +#define DCOND_LE 0xd + +#define DCOND_CS DCOND_LO +#define DCOND_CC DCOND_HS + +// unified insn +#define MIPS_INSN(op, rs, rt, rd, sa, fn) \ + (((op)<<26)|((rs)<<21)|((rt)<<16)|((rd)<<11)|((sa)<<6)|((fn)<<0)) + +#define _ 0 // marker for "field unused" +#define __(n) o##n // enum marker for "undefined" + +// opcode field (encoded in op) +enum { OP__FN=000, OP__RT, OP_J, OP_JAL, OP_BEQ, OP_BNE, OP_BLEZ, OP_BGTZ }; +enum { OP_ADDI=010, OP_ADDIU, OP_SLTI, OP_SLTIU, OP_ANDI, OP_ORI, OP_XORI, OP_LUI }; +enum { OP_DADDI=030, OP_DADDIU, OP_LDL, OP_LDR, OP__FN2=034, OP__FN3=037 }; +enum { OP_LB=040, OP_LH, OP_LWL, OP_LW, OP_LBU, OP_LHU, OP_LWR, OP_LWU }; +enum { OP_SB=050, OP_SH, OP_SWL, OP_SW, OP_SDL, OP_SDR, OP_SWR }; +enum { OP_SD=067, OP_LD=077 }; +// function field (encoded in fn if opcode = OP__FN) +enum { FN_SLL=000, __(01), FN_SRL, FN_SRA, FN_SLLV, __(05), FN_SRLV, FN_SRAV }; +enum { FN_JR=010, FN_JALR, FN_MOVZ, FN_MOVN, FN_SYNC=017 }; +enum { FN_MFHI=020, FN_MTHI, FN_MFLO, FN_MTLO, FN_DSSLV, __(25), FN_DSLRV, FN_DSRAV }; +enum { FN_MULT=030, FN_MULTU, FN_DIV, FN_DIVU, FN_DMULT, FN_DMULTU, FN_DDIV, FN_DDIVU }; +enum { FN_ADD=040, FN_ADDU, FN_SUB, FN_SUBU, FN_AND, FN_OR, FN_XOR, FN_NOR }; +enum { FN_SLT=052, FN_SLTU, FN_DADD, FN_DADDU, FN_DSUB, FN_DSUBU }; +enum { FN_DSLL=070, __(71), FN_DSRL, FN_DSRA, FN_DSLL32, __(75), FN_DSRL32, FN_DSRA32 }; +// function field (encoded in fn if opcode = OP__FN2) +enum { FN2_MADD=000, FN2_MADDU, FN2_MUL, __(03), FN2_MSUB, FN2_MSUBU }; +enum { FN2_CLZ=040, FN2_CLO, FN2_DCLZ=044, FN2_DCLO }; +// function field (encoded in fn if opcode = OP__FN3) +enum { FN3_EXT=000, 
FN3_DEXTM, FN3_DEXTU, FN3_DEXT, FN3_INS, FN3_DINSM, FN3_DINSU, FN3_DINS }; +enum { FN3_BSHFL=040, FN3_DBSHFL=044 }; +// rt field (encoded in rt if opcode = OP__RT) +enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; + +// bit shuffle function (encoded in sa if function = FN3_BSHFL) +enum { BS_SBH=002, BS_SHD=005, BS_SEB=020, BS_SEH=030 }; +// r (rotate) bit function (encoded in rs/sa if function = FN_SRL/FN_SRLV) +enum { RB_SRL=0, RB_ROTR=1 }; + +#define MIPS_NOP 000 // null operation: SLL r0, r0, #0 + +// arithmetic/logical + +#define MIPS_OP_REG(op, sa, rd, rs, rt) \ + MIPS_INSN(OP__FN, rs, rt, rd, sa, op) // R-type, SPECIAL +#define MIPS_OP2_REG(op, sa, rd, rs, rt) \ + MIPS_INSN(OP__FN2, rs, rt, rd, sa, op) // R-type, SPECIAL2 +#define MIPS_OP3_REG(op, sa, rd, rs, rt) \ + MIPS_INSN(OP__FN3, rs, rt, rd, sa, op) // R-type, SPECIAL3 +#define MIPS_OP_IMM(op, rt, rs, imm) \ + MIPS_INSN(op, rs, rt, _, _, (u16)(imm)) // I-type + +// rd = rs OP rt +#define MIPS_ADD_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_ADDU,_, rd, rs, rt) +#define MIPS_DADD_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_DADDU,_, rd, rs, rt) +#define MIPS_SUB_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_SUBU,_, rd, rs, rt) +#define MIPS_DSUB_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_DSUBU,_, rd, rs, rt) + +#define MIPS_NEG_REG(rd, rt) \ + MIPS_SUB_REG(rd, Z0, rt) + +#define MIPS_XOR_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_XOR,_, rd, rs, rt) +#define MIPS_OR_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_OR,_, rd, rs, rt) +#define MIPS_AND_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_AND,_, rd, rs, rt) +#define MIPS_NOR_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_NOR,_, rd, rs, rt) + +#define MIPS_MOVE_REG(rd, rs) \ + MIPS_OR_REG(rd, rs, Z0) +#define MIPS_MVN_REG(rd, rs) \ + MIPS_NOR_REG(rd, rs, Z0) + +// rd = rt SHIFT rs +#define MIPS_LSL_REG(rd, rt, rs) \ + MIPS_OP_REG(FN_SLLV,_, rd, rs, rt) +#define MIPS_LSR_REG(rd, rt, rs) \ + MIPS_OP_REG(FN_SRLV,RB_SRL, rd, rs, rt) +#define MIPS_ASR_REG(rd, rt, rs) \ + MIPS_OP_REG(FN_SRAV,_, rd, rs, rt) +#define 
MIPS_ROR_REG(rd, rt, rs) \ + MIPS_OP_REG(FN_SRLV,RB_ROTR, rd, rs, rt) + +#define MIPS_SEB_REG(rd, rt) \ + MIPS_OP3_REG(FN3_BSHFL, BS_SEB, rd, _, rt) +#define MIPS_SEH_REG(rd, rt) \ + MIPS_OP3_REG(FN3_BSHFL, BS_SEH, rd, _, rt) + +#define MIPS_EXT_IMM(rt, rs, lsb, sz) \ + MIPS_OP3_REG(FN3_EXT, lsb, (sz)-1, rs, rt) +#define MIPS_INS_IMM(rt, rs, lsb, sz) \ + MIPS_OP3_REG(FN3_INS, lsb, (lsb)+(sz)-1, rs, rt) + +// rd = (rs < rt) +#define MIPS_SLT_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_SLT,_, rd, rs, rt) +#define MIPS_SLTU_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_SLTU,_, rd, rs, rt) + +// rt = rs OP imm16 +#define MIPS_ADD_IMM(rt, rs, imm16) \ + MIPS_OP_IMM(OP_ADDIU, rt, rs, imm16) +#define MIPS_DADD_IMM(rt, rs, imm16) \ + MIPS_OP_IMM(OP_DADDIU, rt, rs, imm16) + +#define MIPS_XOR_IMM(rt, rs, imm16) \ + MIPS_OP_IMM(OP_XORI, rt, rs, imm16) +#define MIPS_OR_IMM(rt, rs, imm16) \ + MIPS_OP_IMM(OP_ORI, rt, rs, imm16) +#define MIPS_AND_IMM(rt, rs, imm16) \ + MIPS_OP_IMM(OP_ANDI, rt, rs, imm16) + +// rt = (imm16 << (0|16)) +#define MIPS_MOV_IMM(rt, imm16) \ + MIPS_OP_IMM(OP_ORI, rt, Z0, imm16) +#define MIPS_MOVT_IMM(rt, imm16) \ + MIPS_OP_IMM(OP_LUI, rt, _, imm16) + +// rd = rt SHIFT imm5 +#define MIPS_LSL_IMM(rd, rt, bits) \ + MIPS_INSN(OP__FN, _, rt, rd, bits, FN_SLL) +#define MIPS_LSR_IMM(rd, rt, bits) \ + MIPS_INSN(OP__FN, RB_SRL, rt, rd, bits, FN_SRL) +#define MIPS_ASR_IMM(rd, rt, bits) \ + MIPS_INSN(OP__FN, _, rt, rd, bits, FN_SRA) +#define MIPS_ROR_IMM(rd, rt, bits) \ + MIPS_INSN(OP__FN, RB_ROTR, rt, rd, bits, FN_SRL) + +#define MIPS_DLSL_IMM(rd, rt, bits) \ + MIPS_INSN(OP__FN, _, rt, rd, bits, FN_DSLL) +#define MIPS_DLSL32_IMM(rd, rt, bits) \ + MIPS_INSN(OP__FN, _, rt, rd, bits, FN_DSLL32) + +// rt = (rs < imm16) +#define MIPS_SLT_IMM(rt, rs, imm16) \ + MIPS_OP_IMM(OP_SLTI, rt, rs, imm16) +#define MIPS_SLTU_IMM(rt, rs, imm16) \ + MIPS_OP_IMM(OP_SLTIU, rt, rs, imm16) + +// multiplication + +#define MIPS_MULT(rt, rs) \ + MIPS_OP_REG(FN_MULT,_, _, rs, rt) +#define MIPS_MULTU(rt, rs) \ 
+ MIPS_OP_REG(FN_MULTU,_, _, rs, rt) +#define MIPS_MADD(rt, rs) \ + MIPS_OP2_REG(FN_MADD,_, _, rs, rt) +#define MIPS_MADDU(rt, rs) \ + MIPS_OP2_REG(FN_MADDU,_, _, rs, rt) +#define MIPS_MFLO(rd) \ + MIPS_OP_REG(FN_MFLO,_, rd, _, _) +#define MIPS_MFHI(rd) \ + MIPS_OP_REG(FN_MFHI,_, rd, _, _) + +// branching + +#define MIPS_J(abs26) \ + MIPS_INSN(OP_J, _,_,_,_, (abs26) >> 2) // J-type +#define MIPS_JAL(abs26) \ + MIPS_INSN(OP_JAL, _,_,_,_, (abs26) >> 2) +#define MIPS_JR(rs) \ + MIPS_OP_REG(FN_JR,_, _,rs,_) +#define MIPS_JALR(rd, rs) \ + MIPS_OP_REG(FN_JALR,_, rd,rs,_) + +// conditional branches; no condition code, these compare rs against rt or Z0 +#define MIPS_BEQ (OP_BEQ << 5) // rs == rt (rt in lower 5 bits) +#define MIPS_BNE (OP_BNE << 5) // rs != rt (ditto) +#define MIPS_BLE (OP_BLEZ << 5) // rs <= 0 +#define MIPS_BGT (OP_BGTZ << 5) // rs > 0 +#define MIPS_BLT ((OP__RT << 5)|RT_BLTZ) // rs < 0 +#define MIPS_BGE ((OP__RT << 5)|RT_BGEZ) // rs >= 0 +#define MIPS_BGTL ((OP__RT << 5)|RT_BLTZAL) // rs < 0 (BLTZAL, despite the name), link $ra if jumping +#define MIPS_BGEL ((OP__RT << 5)|RT_BGEZAL) // rs >= 0, link $ra if jumping + +#define MIPS_BCOND(cond, rs, rt, offs16) \ + MIPS_OP_IMM(((cond) >> 5), rt, rs, (offs16) >> 2) +#define MIPS_BCONDZ(cond, rs, offs16) \ + MIPS_OP_IMM(((cond) >> 5), ((cond) & 0x1f), rs, (offs16) >> 2) +#define MIPS_B(offs16) \ + MIPS_BCONDZ(MIPS_BEQ, Z0, offs16) +#define MIPS_BL(offs16) \ + MIPS_BCONDZ(MIPS_BGEL, Z0, offs16) + +// load/store indexed base + +#define MIPS_LD(rt, rs, offs16) \ + MIPS_OP_IMM(OP_LD, rt, rs, (u16)(offs16)) +#define MIPS_LW(rt, rs, offs16) \ + MIPS_OP_IMM(OP_LW, rt, rs, (u16)(offs16)) +#define MIPS_LH(rt, rs, offs16) \ + MIPS_OP_IMM(OP_LH, rt, rs, (u16)(offs16)) +#define MIPS_LB(rt, rs, offs16) \ + MIPS_OP_IMM(OP_LB, rt, rs, (u16)(offs16)) +#define MIPS_LHU(rt, rs, offs16) \ + MIPS_OP_IMM(OP_LHU, rt, rs, (u16)(offs16)) +#define MIPS_LBU(rt, rs, offs16) \ + MIPS_OP_IMM(OP_LBU, rt, rs, (u16)(offs16)) + +#define MIPS_SD(rt, rs, offs16) \ +
MIPS_OP_IMM(OP_SD, rt, rs, (u16)(offs16)) +#define MIPS_SW(rt, rs, offs16) \ + MIPS_OP_IMM(OP_SW, rt, rs, (u16)(offs16)) +#define MIPS_SH(rt, rs, offs16) \ + MIPS_OP_IMM(OP_SH, rt, rs, (u16)(offs16)) +#define MIPS_SB(rt, rs, offs16) \ + MIPS_OP_IMM(OP_SB, rt, rs, (u16)(offs16)) + +// pointer operations + +#if _MIPS_SZPTR == 64 +#define OP_LP OP_LD +#define OP_SP OP_SD +#define OP_PADDIU OP_DADDIU +#define FN_PADDU FN_DADDU +#define FN_PSUBU FN_DSUBU +#define PTR_SCALE 3 +#else +#define OP_LP OP_LW +#define OP_SP OP_SW +#define OP_PADDIU OP_ADDIU +#define FN_PADDU FN_ADDU +#define FN_PSUBU FN_SUBU +#define PTR_SCALE 2 +#endif +#define PTR_SIZE (1< 0) { \ + u32 *p = (u32 *)tcache_ptr - emith_last_cnt; \ + int idx = (emith_last_idx - emith_last_cnt+1) %FSZ; \ + EMIT_PTR(p, emith_last_insns[idx]);\ + emith_last_cnt --; \ + } \ + } while (0) + +#define EMIT(op) \ + do { \ + if (emith_last_cnt >= FSZ) EMIT_PUSHOP(); \ + tcache_ptr = (void *)((u32 *)tcache_ptr + 1); \ + emith_last_idx = (emith_last_idx+1) %FSZ; \ + emith_last_insns[emith_last_idx] = op; \ + emith_last_cnt ++; \ + COUNT_OP; \ + } while (0) + +#define emith_flush() \ + do { \ + while (emith_last_cnt) EMIT_PUSHOP(); \ + emith_flg_hint = _FHV|_FHC; \ + } while (0) + +#define emith_insn_ptr() (u8 *)((u32 *)tcache_ptr - emith_last_cnt) + +// delay slot stuff +static int emith_is_j(u32 op) // J, JAL + { return ((op>>26) & 076) == OP_J; } +static int emith_is_jr(u32 op) // JR, JALR + { return (op>>26) == OP__FN && (op & 076) == FN_JR; } +static int emith_is_b(u32 op) // B + { return ((op>>26) & 074) == OP_BEQ || + ((op>>26) == OP__RT && ((op>>16) & 036) == RT_BLTZ); } +// register usage for dependency evaluation XXX better do this as in emit_arm? 
+static uint64_t emith_has_rs[5] = // OP__FN1-3, OP__RT, others + { 0x005ffcffffda0fd2ULL, 0x0000003300000037ULL, 0x00000000000000ffULL, + 0x800f5f0fUL, 0xf7ffffff0ff07ff0ULL }; +static uint64_t emith_has_rt[5] = // OP__FN1-3, OP__RT, others + { 0xdd5ffcffffd00cddULL, 0x0000000000000037ULL, 0x0000001100000000ULL, + 0x00000000UL, 0x80007f440c300030ULL }; +static uint64_t emith_has_rd[5] = // OP__FN1-3, OP__RT, others(rt instead of rd) + { 0xdd00fcff00d50edfULL, 0x0000003300000004ULL, 0x08000011000000ffULL, + 0x00000000UL, 0x119100ff0f00ff00ULL }; +#define emith_has_(rx,ix,op,sa,m) \ + (emith_has_##rx[ix] & (1ULL << (((op)>>(sa)) & (m)))) +static int emith_rs(u32 op) + { if ((op>>26) == OP__FN) + return emith_has_(rs,0,op, 0,0x3f) ? (op>>21)&0x1f : 0; + if ((op>>26) == OP__FN2) + return emith_has_(rs,1,op, 0,0x3f) ? (op>>21)&0x1f : 0; + if ((op>>26) == OP__FN3) + return emith_has_(rs,2,op, 0,0x3f) ? (op>>21)&0x1f : 0; + if ((op>>26) == OP__RT) + return emith_has_(rs,3,op,16,0x1f) ? (op>>21)&0x1f : 0; + return emith_has_(rs,4,op,26,0x3f) ? (op>>21)&0x1f : 0; + } +static int emith_rt(u32 op) + { if ((op>>26) == OP__FN) + return emith_has_(rt,0,op, 0,0x3f) ? (op>>16)&0x1f : 0; + if ((op>>26) == OP__FN2) + return emith_has_(rt,1,op, 0,0x3f) ? (op>>16)&0x1f : 0; + if ((op>>26) == OP__FN3) + return emith_has_(rt,2,op, 0,0x3f) ? (op>>16)&0x1f : 0; + if ((op>>26) == OP__RT) + return 0; + return emith_has_(rt,4,op,26,0x3f) ? (op>>16)&0x1f : 0; + } +static int emith_rd(u32 op) + { int ret = emith_has_(rd,4,op,26,0x3f) ? (op>>16)&0x1f :-1; + if ((op>>26) == OP__FN) + ret = emith_has_(rd,0,op, 0,0x3f) ? (op>>11)&0x1f :-1; + if ((op>>26) == OP__FN2) + ret = emith_has_(rd,1,op, 0,0x3f) ? (op>>11)&0x1f :-1; + if ((op>>26) == OP__FN3 && (op&0x3f) == FN3_BSHFL) + ret = emith_has_(rd,2,op, 0,0x3f) ? (op>>11)&0x1f :-1; + if ((op>>26) == OP__FN3 && (op&0x3f) != FN3_BSHFL) + ret = emith_has_(rd,2,op, 0,0x3f) ? 
(op>>16)&0x1f :-1; + if ((op>>26) == OP__RT) + ret = -1; + return (ret ?: -1); // Z0 doesn't have dependencies + } + +static int emith_b_isswap(u32 bop, u32 lop) +{ + if (emith_is_j(bop)) + return bop; + else if (emith_is_jr(bop) && emith_rd(lop) != emith_rs(bop)) + return bop; + else if (emith_is_b(bop) && emith_rd(lop) != emith_rs(bop) && + emith_rd(lop) != emith_rt(bop)) + if ((bop & 0xffff) != 0x7fff) // displacement overflow? + return (bop & 0xffff0000) | ((bop+1) & 0x0000ffff); + return 0; +} + +static int emith_insn_swappable(u32 op1, u32 op2) +{ + if (emith_rd(op1) != emith_rd(op2) && + emith_rs(op1) != emith_rd(op2) && emith_rt(op1) != emith_rd(op2) && + emith_rs(op2) != emith_rd(op1) && emith_rt(op2) != emith_rd(op1)) + return 1; + return 0; +} + +// emit branch, trying to fill the delay slot with one of the last insns +static void *emith_branch(u32 op) +{ + unsigned idx = emith_last_idx, ds = idx; + u32 bop = 0, sop; + void *bp; + int i, j, s; + + // check for ds insn; older mustn't interact with newer ones to overtake + for (i = 0; i < emith_last_cnt && !bop; i++) { + ds = (idx-i)%FSZ; + sop = emith_last_insns[ds]; + for (j = i, s = 1; j > 0 && s; j--) + s = emith_insn_swappable(emith_last_insns[(ds+j)%FSZ], sop); + if (s) + bop = emith_b_isswap(op, sop); + } + + // flush FIFO, but omit delay slot insn + tcache_ptr = (void *)((u32 *)tcache_ptr - emith_last_cnt); + idx = (idx-emith_last_cnt+1)%FSZ; + for (i = emith_last_cnt; i > 0; i--, idx = (idx+1)%FSZ) + if (!bop || idx != ds) + EMIT_PTR(tcache_ptr, emith_last_insns[idx]); + emith_last_cnt = 0; + // emit branch and delay slot + bp = tcache_ptr; + if (bop) { // can swap + EMIT_PTR(tcache_ptr, bop); COUNT_OP; + EMIT_PTR(tcache_ptr, emith_last_insns[ds]); + } else { // can't swap + EMIT_PTR(tcache_ptr, op); COUNT_OP; + EMIT_PTR(tcache_ptr, MIPS_NOP); COUNT_OP; + } + return bp; +} + +// if-then-else conditional execution helpers +#define JMP_POS(ptr) \ + ptr = emith_branch(MIPS_BCONDZ(cond_m, cond_r, 0)); 
+ +#define JMP_EMIT(cond, ptr) { \ + u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr) - 4; \ + emith_flush(); /* prohibit delay slot switching across jump targets */ \ + EMIT_PTR(ptr, MIPS_BCONDZ(cond_m, cond_r, val_ & 0x0003ffff)); \ +} + +#define JMP_EMIT_NC(ptr) { \ + u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr) - 4; \ + emith_flush(); \ + EMIT_PTR(ptr, MIPS_B(val_ & 0x0003ffff)); \ +} + +#define EMITH_JMP_START(cond) { \ + int cond_r, cond_m = emith_cond_check(cond, &cond_r); \ + u8 *cond_ptr; \ + JMP_POS(cond_ptr) + +#define EMITH_JMP_END(cond) \ + JMP_EMIT(cond, cond_ptr); \ +} + +#define EMITH_JMP3_START(cond) { \ + int cond_r, cond_m = emith_cond_check(cond, &cond_r); \ + u8 *cond_ptr, *else_ptr; \ + JMP_POS(cond_ptr) + +#define EMITH_JMP3_MID(cond) \ + JMP_POS(else_ptr); \ + JMP_EMIT(cond, cond_ptr); + +#define EMITH_JMP3_END() \ + JMP_EMIT_NC(else_ptr); \ +} + +// "simple" jump (no more than a few insns) +// ARM32 will use conditional instructions here +#define EMITH_SJMP_START EMITH_JMP_START +#define EMITH_SJMP_END EMITH_JMP_END + +#define EMITH_SJMP3_START EMITH_JMP3_START +#define EMITH_SJMP3_MID EMITH_JMP3_MID +#define EMITH_SJMP3_END EMITH_JMP3_END + +#define EMITH_SJMP2_START(cond) \ + EMITH_SJMP3_START(cond) +#define EMITH_SJMP2_MID(cond) \ + EMITH_SJMP3_MID(cond) +#define EMITH_SJMP2_END(cond) \ + EMITH_SJMP3_END() + + +// flag register emulation. this is modelled after arm/x86. +// the FNZ register stores the result of the last flag setting operation for +// N and Z flag, used for EQ,NE,MI,PL branches. +// the FC register stores the C flag (used for HI,HS,LO,LS,CC,CS). +// the FV register stores information for V flag calculation (used for +// GT,GE,LT,LE,VC,VS). V flag is costly and only fully calculated when needed. +// the core registers may be temp registers, since the condition after calls +// is undefined anyway. + +// flag emulation creates 2 (ie cmp #0/beq) up to 9 (ie adcf/ble) extra insns. 
+// flag handling shortcuts may reduce this by 1-4 insns, see emith_cond_check() +static int emith_cmp_rs, emith_cmp_rt; // registers used in cmp_r_r/cmp_r_imm +static s32 emith_cmp_imm; // immediate value used in cmp_r_imm +enum { _FHC=1, _FHV=2 } emith_flg_hint; // C/V flag usage hinted by compiler +static int emith_flg_noV; // V flag known not to be set + +/* sets emith_flg_hint to the flag set (_FHV and/or _FHC) needed for evaluating cond */ #define EMITH_HINT_COND(cond) do { \ + /* only need to check cond>>1 since the lowest bit inverts the cond */ \ + unsigned _mv = BITMASK3(DCOND_VS>>1,DCOND_GE>>1,DCOND_GT>>1); \ + unsigned _mc = _mv | BITMASK2(DCOND_HS>>1,DCOND_HI>>1); \ + emith_flg_hint = (_mv & BITMASK1(cond >> 1) ? _FHV : 0); \ + emith_flg_hint |= (_mc & BITMASK1(cond >> 1) ? _FHC : 0); \ +} while (0) + +// store minimal cc information: rd, rt^rs, carry +// NB: the result *must* first go to FNZ, in case rd == rs or rd == rt. +// NB: for adcf and sbcf, carry-in must be dealt with separately (see there) +/* NOTE(review): in this copy a piece of the function text seems to be lost between "C = sub:rt" and "Z0)" (unbalanced parenthesis, no carry calculation visible) - compare with the original source before relying on it. The visible tail copies FNZ to rd if rd is neither 0 nor FNZ, and invalidates the recorded compare operands */ static void emith_set_arith_flags(int rd, int rs, int rt, s32 imm, int sub) +{ + if (emith_flg_hint & _FHC) { + if (sub) // C = sub:rt Z0) // Nt^Ns in FV, bit 31 + EMIT(MIPS_XOR_REG(FV, rs, rt)); + else if (rt == Z0 || imm == 0) + emith_flg_noV = 1; // imm #0 can't overflow + else if ((imm < 0) == !sub) + EMIT(MIPS_NOR_REG(FV, rs, Z0)); + else if ((imm > 0) == !sub) + EMIT(MIPS_XOR_REG(FV, rs, Z0)); + } + // full V = Nd^Nt^Ns^C calculation is deferred until really needed + + if (rd && rd != FNZ) + EMIT(MIPS_MOVE_REG(rd, FNZ)); // N,Z via result value in FNZ + emith_cmp_rs = emith_cmp_rt = -1; +} + +// since MIPS has less-than and compare-branch insns, handle cmp separately by +// storing the involved regs for later use in one of those MIPS insns. +// This works for all conditions but VC/VS, but this is fortunately never used.
+/* only records the operands, no code is emitted here */ static void emith_set_compare_flags(int rs, int rt, s32 imm) +{ + emith_cmp_rt = rt; + emith_cmp_rs = rs; + emith_cmp_imm = imm; +} + +// data processing, register +/* NB: the _c variants in this file ignore their cond argument and expand to the unconditional operation */ #define emith_move_r_r_ptr(d, s) \ + EMIT(MIPS_MOVE_REG(d, s)) +#define emith_move_r_r_ptr_c(cond, d, s) \ + emith_move_r_r_ptr(d, s) + +#define emith_move_r_r(d, s) \ + emith_move_r_r_ptr(d, s) +#define emith_move_r_r_c(cond, d, s) \ + emith_move_r_r(d, s) + +#define emith_mvn_r_r(d, s) \ + EMIT(MIPS_MVN_REG(d, s)) + +/* d = s1 + (s2 << simm) using FN_PADDU; AT is used as scratch if simm != 0 */ #define emith_add_r_r_r_lsl_ptr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_OP_REG(FN_PADDU,_, d, s1, AT)); \ + } else EMIT(MIPS_OP_REG(FN_PADDU,_, d, s1, s2)); \ +} while (0) +/* d = s1 + (s2 << simm); AT is used as scratch if simm != 0 */ #define emith_add_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_ADD_REG(d, s1, AT)); \ + } else EMIT(MIPS_ADD_REG(d, s1, s2)); \ +} while (0) + +/* d = s1 + (s2 >> simm), logical shift; AT is used as scratch if simm != 0 */ #define emith_add_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSR_IMM(AT, s2, simm)); \ + EMIT(MIPS_ADD_REG(d, s1, AT)); \ + } else EMIT(MIPS_ADD_REG(d, s1, s2)); \ +} while (0) + +/* flag setting adds: the sum is computed into FNZ, then handed to emith_set_arith_flags() together with the operand registers */ #define emith_addf_r_r_r_lsl_ptr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_OP_REG(FN_PADDU,_, FNZ, s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 0); \ + } else { \ + EMIT(MIPS_OP_REG(FN_PADDU,_, FNZ, s1, s2)); \ + emith_set_arith_flags(d, s1, s2, 0, 0); \ + } \ +} while (0) +#define emith_addf_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_ADD_REG(FNZ, s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 0); \ + } else { \ + EMIT(MIPS_ADD_REG(FNZ, s1, s2)); \ + emith_set_arith_flags(d, s1, s2, 0, 0); \ + } \ +} while (0) + +#define emith_addf_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSR_IMM(AT, s2, simm)); \ + EMIT(MIPS_ADD_REG(FNZ, s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 0); \ + } else { \ + EMIT(MIPS_ADD_REG(FNZ, s1, s2)); \ +
emith_set_arith_flags(d, s1, s2, 0, 0); \ + } \ +} while (0) + +/* d = s1 - (s2 << simm); AT is used as scratch if simm != 0 */ #define emith_sub_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_SUB_REG(d, s1, AT)); \ + } else EMIT(MIPS_SUB_REG(d, s1, s2)); \ +} while (0) + +/* as above, but the difference goes to FNZ first and is passed to emith_set_arith_flags() with sub=1 */ #define emith_subf_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_SUB_REG(FNZ, s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 1); \ + } else { \ + EMIT(MIPS_SUB_REG(FNZ, s1, s2)); \ + emith_set_arith_flags(d, s1, s2, 0, 1); \ + } \ +} while (0) + +/* logical operations with a shifted 2nd operand: d = s1 op (s2 shifted by simm); AT is used as scratch if simm != 0 */ #define emith_or_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_OR_REG(d, s1, AT)); \ + } else EMIT(MIPS_OR_REG(d, s1, s2)); \ +} while (0) + +#define emith_or_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSR_IMM(AT, s2, simm)); \ + EMIT(MIPS_OR_REG(d, s1, AT)); \ + } else EMIT(MIPS_OR_REG(d, s1, s2)); \ +} while (0) + +#define emith_eor_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_XOR_REG(d, s1, AT)); \ + } else EMIT(MIPS_XOR_REG(d, s1, s2)); \ +} while (0) + +#define emith_eor_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSR_IMM(AT, s2, simm)); \ + EMIT(MIPS_XOR_REG(d, s1, AT)); \ + } else EMIT(MIPS_XOR_REG(d, s1, s2)); \ +} while (0) + +#define emith_and_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_AND_REG(d, s1, AT)); \ + } else EMIT(MIPS_AND_REG(d, s1, s2)); \ +} while (0) + +/* 2 operand forms: d = d op (s shifted by the given amount) */ #define emith_or_r_r_lsl(d, s, lslimm) \ + emith_or_r_r_r_lsl(d, d, s, lslimm) +#define emith_or_r_r_lsr(d, s, lsrimm) \ + emith_or_r_r_r_lsr(d, d, s, lsrimm) + +#define emith_eor_r_r_lsl(d, s, lslimm) \ + emith_eor_r_r_r_lsl(d, d, s, lslimm) +#define emith_eor_r_r_lsr(d, s, lsrimm) \ + emith_eor_r_r_r_lsr(d, d, s, lsrimm) + +/* unshifted 3 operand forms are the _lsl forms with a shift amount of 0 */ #define emith_add_r_r_r(d, s1, s2) \ + emith_add_r_r_r_lsl(d, s1, s2, 0) + +#define emith_addf_r_r_r_ptr(d, s1, s2)
\ + emith_addf_r_r_r_lsl_ptr(d, s1, s2, 0) +#define emith_addf_r_r_r(d, s1, s2) \ + emith_addf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_sub_r_r_r(d, s1, s2) \ + emith_sub_r_r_r_lsl(d, s1, s2, 0) + +#define emith_subf_r_r_r(d, s1, s2) \ + emith_subf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_or_r_r_r(d, s1, s2) \ + emith_or_r_r_r_lsl(d, s1, s2, 0) + +#define emith_eor_r_r_r(d, s1, s2) \ + emith_eor_r_r_r_lsl(d, s1, s2, 0) + +#define emith_and_r_r_r(d, s1, s2) \ + emith_and_r_r_r_lsl(d, s1, s2, 0) + +#define emith_add_r_r_ptr(d, s) \ + emith_add_r_r_r_lsl_ptr(d, d, s, 0) +#define emith_add_r_r(d, s) \ + emith_add_r_r_r(d, d, s) + +#define emith_sub_r_r(d, s) \ + emith_sub_r_r_r(d, d, s) + +#define emith_neg_r_r(d, s) \ + EMIT(MIPS_NEG_REG(d, s)) + +/* add/subtract with carry-in taken from FC: d = s1 +/- (s2 + FC); AT is used as scratch, flags are not updated */ #define emith_adc_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(AT, s2, FC); \ + emith_add_r_r_r(d, s1, AT); \ +} while (0) + +#define emith_sbc_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(AT, s2, FC); \ + emith_sub_r_r_r(d, s1, AT); \ +} while (0) + +#define emith_adc_r_r(d, s) \ + emith_adc_r_r_r(d, d, s) + +/* d = 0 - (s + FC) */ #define emith_negc_r_r(d, s) \ + emith_sbc_r_r_r(d, Z0, s) + +// NB: the incoming carry Cin can cause Cout if s2+Cin=0 (or s1+Cin=0 FWIW) +// moreover, if s2+Cin=0 caused Cout, s1+s2+Cin=s1+0 can't cause another Cout +/* AT holds the carry out of s2+FC and is ORed into FC after emith_set_arith_flags() */ #define emith_adcf_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(FNZ, s2, FC); \ + EMIT(MIPS_SLTU_REG(AT, FNZ, FC)); \ + emith_add_r_r_r(FNZ, s1, FNZ); \ + emith_set_arith_flags(d, s1, s2, 0, 0); \ + emith_or_r_r(FC, AT); \ +} while (0) + +#define emith_sbcf_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(FNZ, s2, FC); \ + EMIT(MIPS_SLTU_REG(AT, FNZ, FC)); \ + emith_sub_r_r_r(FNZ, s1, FNZ); \ + emith_set_arith_flags(d, s1, s2, 0, 1); \ + emith_or_r_r(FC, AT); \ +} while (0) + +#define emith_and_r_r(d, s) \ + emith_and_r_r_r(d, d, s) +#define emith_and_r_r_c(cond, d, s) \ + emith_and_r_r(d, s) + +#define emith_or_r_r(d, s) \ + emith_or_r_r_r(d, d, s) + +#define emith_eor_r_r(d, s) \ + emith_eor_r_r_r(d, d, s) + +#define
emith_tst_r_r_ptr(d, s) do { \ + if (d != s) { \ + emith_and_r_r_r(FNZ, d, s); \ + emith_cmp_rs = emith_cmp_rt = -1; \ + } else emith_cmp_rs = s, emith_cmp_rt = Z0; \ +} while (0) +#define emith_tst_r_r(d, s) \ + emith_tst_r_r_ptr(d, s) + +#define emith_teq_r_r(d, s) do { \ + emith_eor_r_r_r(FNZ, d, s); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +/* emits nothing, the operands are only recorded for a later emith_cond_check() */ #define emith_cmp_r_r(d, s) \ + emith_set_compare_flags(d, s, 0) +// emith_subf_r_r_r(FNZ, d, s) + +#define emith_addf_r_r(d, s) \ + emith_addf_r_r_r(d, d, s) + +#define emith_subf_r_r(d, s) \ + emith_subf_r_r_r(d, d, s) + +#define emith_adcf_r_r(d, s) \ + emith_adcf_r_r_r(d, d, s) + +#define emith_sbcf_r_r(d, s) \ + emith_sbcf_r_r_r(d, d, s) + +#define emith_negcf_r_r(d, s) \ + emith_sbcf_r_r_r(d, Z0, s) + + +// move immediate +/* loads imm into r with the shortest sequence of the cases below: a single ADD_IMM or OR_IMM from Z0 if imm fits 16 bit signed resp. unsigned, else MOVT_IMM for the upper half plus OR_IMM if the lower half is non-zero. With 64 bit pointers, values not representable as s32 are built recursively from the upper 32 bit */ static void emith_move_imm(int r, uintptr_t imm) +{ +#if _MIPS_SZPTR == 64 + if ((s32)imm != imm) { + emith_move_imm(r, imm >> 32); + if (imm & 0xffff0000) { + EMIT(MIPS_DLSL_IMM(r, r, 16)); + EMIT(MIPS_OR_IMM(r, r, (imm >> 16) & 0xffff)); + EMIT(MIPS_DLSL_IMM(r, r, 16)); + } else EMIT(MIPS_DLSL32_IMM(r, r, 0)); + if (imm & 0x0000ffff) + EMIT(MIPS_OR_IMM(r, r, imm & 0xffff)); + } else +#endif + if ((s16)imm == imm) { + EMIT(MIPS_ADD_IMM(r, Z0, imm)); + } else if (!((u32)imm >> 16)) { + EMIT(MIPS_OR_IMM(r, Z0, imm)); + } else { + int s = Z0; + if ((u32)imm >> 16) { + EMIT(MIPS_MOVT_IMM(r, (u32)imm >> 16)); + s = r; + } + if ((u16)imm) + EMIT(MIPS_OR_IMM(r, s, (u16)imm)); + } +} + +#define emith_move_r_ptr_imm(r, imm) \ + emith_move_imm(r, (uintptr_t)(imm)) + +#define emith_move_r_imm(r, imm) \ + emith_move_imm(r, (u32)(imm)) +#define emith_move_r_imm_c(cond, r, imm) \ + emith_move_r_imm(r, imm) + +#define emith_move_r_imm_s8_patchable(r, imm) \ + EMIT(MIPS_ADD_IMM(r, Z0, (s8)(imm))) +/* searches forward from ptr for the next insn with opcode OP_ADDIU and replaces its 16 bit immediate with the sign extended s8 imm */ #define emith_move_r_imm_s8_patch(ptr, imm) do { \ + u32 *ptr_ = (u32 *)ptr; \ + while (*ptr_ >> 26 != OP_ADDIU) ptr_++; \ + EMIT_PTR(ptr_, (*ptr_ & 0xffff0000) | (u16)(s8)(imm)); \ +} while (0) + +//
arithmetic, immediate - can only be ADDI[U], since SUBI[U] doesn't exist +/* emits rd = rs + imm; ptr != 0 selects the OP_PADDIU/FN_PADDU/FN_PSUBU forms. An imm not fitting 16 bit signed is loaded to AT (negated and subtracted if negative as s32). Nothing is emitted for imm 0 with rd == rs */ static void emith_add_imm(int ptr, int rd, int rs, u32 imm) +{ + if ((s16)imm == imm) { + if (imm || rd != rs) + EMIT(MIPS_OP_IMM(ptr ? OP_PADDIU:OP_ADDIU, rd,rs,imm)); + } else if ((s32)imm < 0) { + emith_move_r_imm(AT, -imm); + EMIT(MIPS_OP_REG((ptr ? FN_PSUBU:FN_SUBU),_, rd,rs,AT)); + } else { + emith_move_r_imm(AT, imm); + EMIT(MIPS_OP_REG((ptr ? FN_PADDU:FN_ADDU),_, rd,rs,AT)); + } +} + +#define emith_add_r_imm(r, imm) \ + emith_add_r_r_imm(r, r, imm) +#define emith_add_r_imm_c(cond, r, imm) \ + emith_add_r_imm(r, imm) + +/* 2 operand form r = r + imm with flags; emith_addf_r_r_imm takes (d, s, imm), so r must be passed as both destination and source */ #define emith_addf_r_imm(r, imm) \ + emith_addf_r_r_imm(r, r, imm) + +#define emith_sub_r_imm(r, imm) \ + emith_sub_r_r_imm(r, r, imm) +#define emith_sub_r_imm_c(cond, r, imm) \ + emith_sub_r_imm(r, imm) + +#define emith_subf_r_imm(r, imm) \ + emith_subf_r_r_imm(r, r, imm) + +#define emith_adc_r_imm(r, imm) \ + emith_adc_r_r_imm(r, r, imm) + +#define emith_adcf_r_imm(r, imm) \ + emith_adcf_r_r_imm(r, r, imm) + +/* emits nothing, the operands are only recorded (rt = -1 marks the immediate form) */ #define emith_cmp_r_imm(r, imm) \ + emith_set_compare_flags(r, -1, imm) +// emith_subf_r_r_imm(FNZ, r, (s16)imm) + +#define emith_add_r_r_ptr_imm(d, s, imm) \ + emith_add_imm(1, d, s, imm) + +#define emith_add_r_r_imm(d, s, imm) \ + emith_add_imm(0, d, s, imm) + +/* the sum goes to FNZ first, emith_set_arith_flags() then records the flags for it */ #define emith_addf_r_r_imm(d, s, imm) do { \ + emith_add_r_r_imm(FNZ, s, imm); \ + emith_set_arith_flags(d, s, -1, imm, 0); \ +} while (0) + +#define emith_adc_r_r_imm(d, s, imm) do { \ + emith_add_r_r_r(AT, s, FC); \ + emith_add_r_r_imm(d, AT, imm); \ +} while (0) + +/* for imm 0 only the carry-in is added, which is passed on to emith_set_arith_flags() as if it were an immediate of 1 */ #define emith_adcf_r_r_imm(d, s, imm) do { \ + if (imm == 0) { \ + emith_add_r_r_r(FNZ, s, FC); \ + emith_set_arith_flags(d, s, -1, 1, 0); \ + } else { \ + emith_add_r_r_r(FNZ, s, FC); \ + EMIT(MIPS_SLTU_REG(AT, FNZ, FC)); \ + emith_add_r_r_imm(FNZ, FNZ, imm); \ + emith_set_arith_flags(d, s, -1, imm, 0); \ + emith_or_r_r(FC, AT); \ + } \ +} while (0) + +// NB: no SUBI in MIPS II, since ADDI takes a signed imm +#define emith_sub_r_r_imm(d, s,
imm) \ + emith_add_r_r_imm(d, s, -(imm)) +#define emith_sub_r_r_imm_c(cond, d, s, imm) \ + emith_sub_r_r_imm(d, s, imm) + +#define emith_subf_r_r_imm(d, s, imm) do { \ + emith_sub_r_r_imm(FNZ, s, imm); \ + emith_set_arith_flags(d, s, -1, imm, 1); \ +} while (0) + +// logical, immediate +/* emits rd = rs <op> imm for op one of OP_ANDI/OP_ORI/OP_XORI. An imm wider than 16 bit is loaded to AT and the register insn at the same offset from FN_AND is used. ORI/XORI with imm 0 and rd == rs emit nothing */ static void emith_log_imm(int op, int rd, int rs, u32 imm) +{ + if (imm >> 16) { + emith_move_r_imm(AT, imm); + EMIT(MIPS_OP_REG(FN_AND + (op-OP_ANDI),_, rd, rs, AT)); + } else if (op == OP_ANDI || imm || rd != rs) + EMIT(MIPS_OP_IMM(op, rd, rs, imm)); +} + +#define emith_and_r_imm(r, imm) \ + emith_log_imm(OP_ANDI, r, r, imm) + +#define emith_or_r_imm(r, imm) \ + emith_log_imm(OP_ORI, r, r, imm) +#define emith_or_r_imm_c(cond, r, imm) \ + emith_or_r_imm(r, imm) + +#define emith_eor_r_imm_ptr(r, imm) \ + emith_log_imm(OP_XORI, r, r, imm) +#define emith_eor_r_imm_ptr_c(cond, r, imm) \ + emith_eor_r_imm_ptr(r, imm) + +#define emith_eor_r_imm(r, imm) \ + emith_eor_r_imm_ptr(r, imm) +#define emith_eor_r_imm_c(cond, r, imm) \ + emith_eor_r_imm(r, imm) + +/* NB: BIC #imm not available in MIPS; use AND #~imm instead */ +#define emith_bic_r_imm(r, imm) \ + emith_log_imm(OP_ANDI, r, r, ~(imm)) +#define emith_bic_r_imm_c(cond, r, imm) \ + emith_bic_r_imm(r, imm) + +/* r & imm goes to FNZ only, r itself is left alone; recorded compare operands are invalidated */ #define emith_tst_r_imm(r, imm) do { \ + emith_log_imm(OP_ANDI, FNZ, r, imm); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) +#define emith_tst_r_imm_c(cond, r, imm) \ + emith_tst_r_imm(r, imm) + +#define emith_and_r_r_imm(d, s, imm) \ + emith_log_imm(OP_ANDI, d, s, imm) + +#define emith_or_r_r_imm(d, s, imm) \ + emith_log_imm(OP_ORI, d, s, imm) + +#define emith_eor_r_r_imm(d, s, imm) \ + emith_log_imm(OP_XORI, d, s, imm) + +// shift +#define emith_lsl(d, s, cnt) \ + EMIT(MIPS_LSL_IMM(d, s, cnt)) + +#define emith_lsr(d, s, cnt) \ + EMIT(MIPS_LSR_IMM(d, s, cnt)) + +#define emith_asr(d, s, cnt) \ + EMIT(MIPS_ASR_IMM(d, s, cnt)) + +/* rotate right by cnt: a shift/shift/or sequence with AT as scratch if __mips_isa_rev < 2, else a single MIPS_ROR_IMM */ #define emith_ror(d, s, cnt) do { \ + if (__mips_isa_rev < 2) { \ + EMIT(MIPS_LSL_IMM(AT, s,
32-(cnt))); \ + EMIT(MIPS_LSR_IMM(d, s, cnt)); \ + EMIT(MIPS_OR_REG(d, d, AT)); \ + } else EMIT(MIPS_ROR_IMM(d, s, cnt)); \ +} while (0) +#define emith_ror_c(cond, d, s, cnt) \ + emith_ror(d, s, cnt) + +/* rotate left by cnt; done as a rotate right by 32-cnt where MIPS_ROR_IMM is used */ #define emith_rol(d, s, cnt) do { \ + if (__mips_isa_rev < 2) { \ + EMIT(MIPS_LSR_IMM(AT, s, 32-(cnt))); \ + EMIT(MIPS_LSL_IMM(d, s, cnt)); \ + EMIT(MIPS_OR_REG(d, d, AT)); \ + } else EMIT(MIPS_ROR_IMM(d, s, 32-(cnt))); \ +} while (0) + +/* shift d right by 1 and insert FC as the new bit 31; FC is not updated, AT is used as scratch */ #define emith_rorc(d) do { \ + emith_lsr(d, d, 1); \ + emith_lsl(AT, FC, 31); \ + emith_or_r_r(d, AT); \ +} while (0) + +/* shift d left by 1 and OR FC into it; FC is not updated */ #define emith_rolc(d) do { \ + emith_lsl(d, d, 1); \ + emith_or_r_r(d, FC); \ +} while (0) + +// NB: all flag setting shifts make V undefined +/* the shift is split into cnt-1 and 1 so that for cnt > 0 FC receives the last bit shifted out; FNZ gets a copy of the result and recorded compare operands are invalidated */ #define emith_lslf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_lsl(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_lsr(FC, _s, 31); \ + emith_lsl(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +#define emith_lsrf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_lsr(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_and_r_r_imm(FC, _s, 1); \ + emith_lsr(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +#define emith_asrf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_asr(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_and_r_r_imm(FC, _s, 1); \ + emith_asr(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +/* flag setting rotates: FC is taken from the result, bit 0 after a left and bit 31 after a right rotate */ #define emith_rolf(d, s, cnt) do { \ + emith_rol(d, s, cnt); \ + emith_and_r_r_imm(FC, d, 1); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +#define emith_rorf(d, s, cnt) do { \ + emith_ror(d, s, cnt); \ + emith_lsr(FC, d, 31); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +/* rotate left by 1 through FC: old bit 31 is kept in AT and becomes the new FC */ #define emith_rolcf(d) do { \ + emith_lsr(AT, d, 31); \ + emith_lsl(d, d, 1); \ +
emith_or_r_r(d, FC); \ + emith_move_r_r(FC, AT); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +/* rotate right by 1 through FC: old bit 0 is kept in AT and becomes the new FC */ #define emith_rorcf(d) do { \ + emith_and_r_r_imm(AT, d, 1); \ + emith_lsr(d, d, 1); \ + emith_lsl(FC, FC, 31); \ + emith_or_r_r(d, FC); \ + emith_move_r_r(FC, AT); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +// signed/unsigned extend +#define emith_clear_msb(d, s, count) /* bits to clear */ do { \ + u32 t; \ + if (__mips_isa_rev >= 2) \ + EMIT(MIPS_EXT_IMM(d, s, 0, 32-(count))); \ + else if ((count) >= 16) { \ + t = (count) - 16; \ + t = 0xffff >> t; \ + emith_and_r_r_imm(d, s, t); \ + } else { \ + emith_lsl(d, s, count); \ + emith_lsr(d, d, count); \ + } \ +} while (0) +#define emith_clear_msb_c(cond, d, s, count) \ + emith_clear_msb(d, s, count) + +#define emith_sext(d, s, count) /* bits to keep */ do { \ + if (__mips_isa_rev >= 2 && count == 8) \ + EMIT(MIPS_SEB_REG(d, s)); \ + else if (__mips_isa_rev >= 2 && count == 16) \ + EMIT(MIPS_SEH_REG(d, s)); \ + else { \ + emith_lsl(d, s, 32-(count)); \ + emith_asr(d, d, 32-(count)); \ + } \ +} while (0) + +// multiply Rd = Rn*Rm (+ Ra); NB: next 2 insns after MFLO/MFHI mustn't be MULT +/* tcache position right behind the most recently emitted MFLO/MFHI */ static u8 *last_lohi; +/* emits NOPs until the current position is at least 8 bytes (2 insns) behind last_lohi. d is unsigned, so a current position in front of last_lohi gives a large distance and emits nothing; no separate sign check is needed */ static void emith_lohi_nops(void) +{ + u32 d; + while ((d = (u8 *)tcache_ptr - last_lohi) < 8) EMIT(MIPS_NOP); +} + +/* d = lower 32 bit of s1*s2 */ #define emith_mul(d, s1, s2) do { \ + emith_lohi_nops(); \ + EMIT(MIPS_MULTU(s1, s2)); \ + EMIT(MIPS_MFLO(d)); \ + last_lohi = (u8 *)tcache_ptr; \ +} while (0) + +/* dhi:dlo = s1*s2 using MULTU */ #define emith_mul_u64(dlo, dhi, s1, s2) do { \ + emith_lohi_nops(); \ + EMIT(MIPS_MULTU(s1, s2)); \ + EMIT(MIPS_MFLO(dlo)); \ + EMIT(MIPS_MFHI(dhi)); \ + last_lohi = (u8 *)tcache_ptr; \ +} while (0) + +/* dhi:dlo = s1*s2 using MULT */ #define emith_mul_s64(dlo, dhi, s1, s2) do { \ + emith_lohi_nops(); \ + EMIT(MIPS_MULT(s1, s2)); \ + EMIT(MIPS_MFLO(dlo)); \ + EMIT(MIPS_MFHI(dhi)); \ + last_lohi = (u8 *)tcache_ptr; \ +} while (0) + +#define emith_mula_s64(dlo, dhi, s1, s2) do { \ + int t_ =
rcache_get_tmp(); \ + emith_lohi_nops(); \ + EMIT(MIPS_MULT(s1, s2)); \ + EMIT(MIPS_MFLO(AT)); \ + EMIT(MIPS_MFHI(t_)); \ + last_lohi = (u8 *)tcache_ptr; \ + emith_add_r_r(dlo, AT); \ + EMIT(MIPS_SLTU_REG(AT, dlo, AT)); \ + emith_add_r_r(dhi, AT); \ + emith_add_r_r(dhi, t_); \ + rcache_free_tmp(t_); \ +} while (0) +#define emith_mula_s64_c(cond, dlo, dhi, s1, s2) \ + emith_mula_s64(dlo, dhi, s1, s2) + +// load/store. offs has 16 bits signed, which is currently sufficient +/* the _ptr forms use OP_LP instead of LW; the _c forms ignore cond */ #define emith_read_r_r_offs_ptr(r, rs, offs) \ + EMIT(MIPS_OP_IMM(OP_LP, r, rs, offs)) +#define emith_read_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_read_r_r_offs_ptr(r, rs, offs) + +#define emith_read_r_r_offs(r, rs, offs) \ + EMIT(MIPS_LW(r, rs, offs)) +#define emith_read_r_r_offs_c(cond, r, rs, offs) \ + emith_read_r_r_offs(r, rs, offs) + +/* register indexed accesses first compute the address rs+rm into AT */ #define emith_read_r_r_r_ptr(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + EMIT(MIPS_OP_IMM(OP_LP, r, AT, 0)); \ +} while (0) + +#define emith_read_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + EMIT(MIPS_LW(r, AT, 0)); \ +} while (0) +#define emith_read_r_r_r_c(cond, r, rs, rm) \ + emith_read_r_r_r(r, rs, rm) + +/* 8/16 bit loads using LBU/LHU */ #define emith_read8_r_r_offs(r, rs, offs) \ + EMIT(MIPS_LBU(r, rs, offs)) +#define emith_read8_r_r_offs_c(cond, r, rs, offs) \ + emith_read8_r_r_offs(r, rs, offs) + +#define emith_read8_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + EMIT(MIPS_LBU(r, AT, 0)); \ +} while (0) +#define emith_read8_r_r_r_c(cond, r, rs, rm) \ + emith_read8_r_r_r(r, rs, rm) + +#define emith_read16_r_r_offs(r, rs, offs) \ + EMIT(MIPS_LHU(r, rs, offs)) +#define emith_read16_r_r_offs_c(cond, r, rs, offs) \ + emith_read16_r_r_offs(r, rs, offs) + +#define emith_read16_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + EMIT(MIPS_LHU(r, AT, 0)); \ +} while (0) +#define emith_read16_r_r_r_c(cond, r, rs, rm) \ + emith_read16_r_r_r(r, rs, rm) + +/* the "s" forms use LB/LH */ #define emith_read8s_r_r_offs(r, rs, offs) \ + EMIT(MIPS_LB(r, rs, offs)) +#define
emith_read8s_r_r_offs_c(cond, r, rs, offs) \ + emith_read8s_r_r_offs(r, rs, offs) + +#define emith_read8s_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + EMIT(MIPS_LB(r, AT, 0)); \ +} while (0) +#define emith_read8s_r_r_r_c(cond, r, rs, rm) \ + emith_read8s_r_r_r(r, rs, rm) + +#define emith_read16s_r_r_offs(r, rs, offs) \ + EMIT(MIPS_LH(r, rs, offs)) +#define emith_read16s_r_r_offs_c(cond, r, rs, offs) \ + emith_read16s_r_r_offs(r, rs, offs) + +#define emith_read16s_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + EMIT(MIPS_LH(r, AT, 0)); \ +} while (0) +#define emith_read16s_r_r_r_c(cond, r, rs, rm) \ + emith_read16s_r_r_r(r, rs, rm) + + +/* stores: the _ptr forms use OP_SP, the others SW; register indexed forms compute the address into AT */ #define emith_write_r_r_offs_ptr(r, rs, offs) \ + EMIT(MIPS_OP_IMM(OP_SP, r, rs, offs)) +#define emith_write_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_write_r_r_offs_ptr(r, rs, offs) + +#define emith_write_r_r_r_ptr(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + EMIT(MIPS_OP_IMM(OP_SP, r, AT, 0)); \ +} while (0) +#define emith_write_r_r_r_ptr_c(cond, r, rs, rm) \ + emith_write_r_r_r_ptr(r, rs, rm) + +#define emith_write_r_r_offs(r, rs, offs) \ + EMIT(MIPS_SW(r, rs, offs)) +#define emith_write_r_r_offs_c(cond, r, rs, offs) \ + emith_write_r_r_offs(r, rs, offs) + +#define emith_write_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + EMIT(MIPS_SW(r, AT, 0)); \ +} while (0) +#define emith_write_r_r_r_c(cond, r, rs, rm) \ + emith_write_r_r_r(r, rs, rm) + +/* accesses at offs relative to CONTEXT_REG */ #define emith_ctx_read_ptr(r, offs) \ + emith_read_r_r_offs_ptr(r, CONTEXT_REG, offs) + +#define emith_ctx_read(r, offs) \ + emith_read_r_r_offs(r, CONTEXT_REG, offs) +#define emith_ctx_read_c(cond, r, offs) \ + emith_ctx_read(r, offs) + +#define emith_ctx_write_ptr(r, offs) \ + emith_write_r_r_offs_ptr(r, CONTEXT_REG, offs) + +#define emith_ctx_write(r, offs) \ + emith_write_r_r_offs(r, CONTEXT_REG, offs) + +/* loads cnt consecutive registers starting at r from consecutive 4 byte context slots starting at offs; tmpr is not used */ #define emith_ctx_read_multiple(r, offs, cnt, tmpr) do { \ + int r_ = r, offs_ = offs, cnt_ = cnt; \ + for (; cnt_ > 0; r_++, offs_ += 4,
cnt_--) \ + emith_ctx_read(r_, offs_); \ +} while (0) + +/* stores cnt consecutive registers starting at r to consecutive 4 byte context slots starting at offs; tmpr is not used */ #define emith_ctx_write_multiple(r, offs, cnt, tmpr) do { \ + int r_ = r, offs_ = offs, cnt_ = cnt; \ + for (; cnt_ > 0; r_++, offs_ += 4, cnt_--) \ + emith_ctx_write(r_, offs_); \ +} while (0) + +// function call handling +/* pushes the registers selected by mask onto the stack. If the register count is odd, bit 0 is added to the mask as padding: it gets a stack slot but is never stored, keeping the stack adjustment a multiple of 8 */ #define emith_save_caller_regs(mask) do { \ + int _c; u32 _m = mask & 0x300fffc; /* r2-r15,r24-r25 */ \ + if (__builtin_parity(_m) == 1) _m |= 0x1; /* ABI align */ \ + int _s = count_bits(_m) * 4, _o = _s; \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, -_s); \ + for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + if (_m & (1 << _c)) \ + { _o -= 4; if (_c) emith_write_r_r_offs(_c, SP, _o); } \ +} while (0) + +/* counterpart of emith_save_caller_regs for the same mask: reloads the registers from the same slots and releases the stack space */ #define emith_restore_caller_regs(mask) do { \ + int _c; u32 _m = mask & 0x300fffc; \ + if (__builtin_parity(_m) == 1) _m |= 0x1; \ + int _s = count_bits(_m) * 4, _o = 0; \ + for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ + if (_m & (1 << _c)) \ + { if (_c) emith_read_r_r_offs(_c, SP, _o); _o += 4; } \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, _s); \ +} while (0) + +/* argument number arg lives in host register arg+4 */ #define host_arg2reg(rd, arg) \ + rd = (arg+4) + +/* NB: arg is used as the destination register number as is, no host_arg2reg() mapping is applied here */ #define emith_pass_arg_r(arg, reg) \ + emith_move_r_r(arg, reg) + +#define emith_pass_arg_imm(arg, imm) \ + emith_move_r_imm(arg, imm) + +// branching +#define emith_invert_branch(cond) /* inverted conditional branch */ \ + (((cond) >> 5) == OP__RT ?
(cond) ^ 0x01 : (cond) ^ 0x20) + +// evaluate the emulated condition, returns a register/branch type pair +/* register/register compare: returns the MIPS branch type (for EQ/NE with rt ORed in) and stores the register to test in *r. May emit an SLT/SLTU into AT. Returns 0 for conditions not listed in the switch */ static int emith_cmpr_check(int rs, int rt, int cond, int *r) +{ + int b = 0; + + // condition check for comparing 2 registers + switch (cond) { + case DCOND_EQ: *r = rs; b = MIPS_BEQ|rt; break; + case DCOND_NE: *r = rs; b = MIPS_BNE|rt; break; + case DCOND_LO: EMIT(MIPS_SLTU_REG(AT, rs, rt)); + *r = AT, b = MIPS_BNE; break; // s < t unsigned + case DCOND_HS: EMIT(MIPS_SLTU_REG(AT, rs, rt)); + *r = AT, b = MIPS_BEQ; break; // s >= t unsigned + case DCOND_LS: EMIT(MIPS_SLTU_REG(AT, rt, rs)); + *r = AT, b = MIPS_BEQ; break; // s <= t unsigned + case DCOND_HI: EMIT(MIPS_SLTU_REG(AT, rt, rs)); + *r = AT, b = MIPS_BNE; break; // s > t unsigned + case DCOND_LT: if (rt == 0) { *r = rs, b = MIPS_BLT; break; } // s < 0 + EMIT(MIPS_SLT_REG(AT, rs, rt)); + *r = AT, b = MIPS_BNE; break; // s < t + case DCOND_GE: if (rt == 0) { *r = rs, b = MIPS_BGE; break; } // s >= 0 + EMIT(MIPS_SLT_REG(AT, rs, rt)); + *r = AT, b = MIPS_BEQ; break; // s >= t + case DCOND_LE: if (rt == 0) { *r = rs, b = MIPS_BLE; break; } // s <= 0 + EMIT(MIPS_SLT_REG(AT, rt, rs)); + *r = AT, b = MIPS_BEQ; break; // s <= t + case DCOND_GT: if (rt == 0) { *r = rs, b = MIPS_BGT; break; } // s > 0 + EMIT(MIPS_SLT_REG(AT, rt, rs)); + *r = AT, b = MIPS_BNE; break; // s > t + } + + return b; +} + +/* register/immediate compare with the same return convention; imm 0 is handled as a compare against Z0, otherwise AT is used for the immediate or the SLT result */ static int emith_cmpi_check(int rs, s32 imm, int cond, int *r) +{ + int b = 0; + + // condition check for comparing register with immediate + if (imm == 0) return emith_cmpr_check(rs, Z0, cond, r); + switch (cond) { + case DCOND_EQ: emith_move_r_imm(AT, imm); + *r = rs; b = MIPS_BEQ|AT; break; + case DCOND_NE: emith_move_r_imm(AT, imm); + *r = rs; b = MIPS_BNE|AT; break; + case DCOND_LO: EMIT(MIPS_SLTU_IMM(AT, rs, imm)); + *r = AT, b = MIPS_BNE; break; // s < imm unsigned + case DCOND_HS: EMIT(MIPS_SLTU_IMM(AT, rs, imm)); + *r = AT, b = MIPS_BEQ; break; // s >= imm unsigned + case DCOND_LS: emith_move_r_imm(AT, imm);
+ EMIT(MIPS_SLTU_REG(AT, AT, rs)); + *r = AT, b = MIPS_BEQ; break; // s <= imm unsigned + case DCOND_HI: emith_move_r_imm(AT, imm); + EMIT(MIPS_SLTU_REG(AT, AT, rs)); + *r = AT, b = MIPS_BNE; break; // s > imm unsigned + case DCOND_LT: EMIT(MIPS_SLT_IMM(AT, rs, imm)); + *r = AT, b = MIPS_BNE; break; // s < imm + case DCOND_GE: EMIT(MIPS_SLT_IMM(AT, rs, imm)); + *r = AT, b = MIPS_BEQ; break; // s >= imm + case DCOND_LE: emith_move_r_imm(AT, imm); + EMIT(MIPS_SLT_REG(AT, AT, rs)); + *r = AT, b = MIPS_BEQ; break; // s <= imm + case DCOND_GT: emith_move_r_imm(AT, imm); + EMIT(MIPS_SLT_REG(AT, AT, rs)); + *r = AT, b = MIPS_BNE; break; // s > imm + } + return b; +} + +/* translates the emulated condition cond into a MIPS branch type (return value) and the register to test (*r). Tried in this order: the operands recorded by a preceding cmp (emith_cmp_rs >= 0), the shortcuts for V known to be 0, then the full evaluation from FNZ/FC/FV. May emit insns writing AT */ static int emith_cond_check(int cond, int *r) +{ + int b = 0; + + if (emith_cmp_rs >= 0) { + if (emith_cmp_rt != -1) + b = emith_cmpr_check(emith_cmp_rs,emith_cmp_rt, cond,r); + else b = emith_cmpi_check(emith_cmp_rs,emith_cmp_imm,cond,r); + } + + // shortcut for V known to be 0 + if (!b && emith_flg_noV) switch (cond) { + case DCOND_VS: *r = Z0; b = MIPS_BNE; break; // never + case DCOND_VC: *r = Z0; b = MIPS_BEQ; break; // always + case DCOND_LT: *r = FNZ, b = MIPS_BLT; break; // N + case DCOND_GE: *r = FNZ, b = MIPS_BGE; break; // !N + case DCOND_LE: *r = FNZ, b = MIPS_BLE; break; // N || Z + case DCOND_GT: *r = FNZ, b = MIPS_BGT; break; // !N && !Z + } + + // the full monty if no shortcut + if (!b) switch (cond) { + // conditions using NZ + case DCOND_EQ: *r = FNZ; b = MIPS_BEQ; break; // Z + case DCOND_NE: *r = FNZ; b = MIPS_BNE; break; // !Z + case DCOND_MI: *r = FNZ; b = MIPS_BLT; break; // N + case DCOND_PL: *r = FNZ; b = MIPS_BGE; break; // !N + // conditions using C + case DCOND_LO: *r = FC; b = MIPS_BNE; break; // C + case DCOND_HS: *r = FC; b = MIPS_BEQ; break; // !C + // conditions using CZ + case DCOND_LS: // C || Z + case DCOND_HI: // !C && !Z + EMIT(MIPS_ADD_IMM(AT, FC, -1)); // !C && !Z + EMIT(MIPS_AND_REG(AT, FNZ, AT)); + *r = AT, b = (cond == DCOND_HI ?
MIPS_BNE : MIPS_BEQ); + break; + + // conditions using V + case DCOND_VS: // V + case DCOND_VC: // !V + EMIT(MIPS_XOR_REG(AT, FV, FNZ)); // V = Nt^Ns^Nd^C + EMIT(MIPS_LSR_IMM(AT, AT, 31)); + EMIT(MIPS_XOR_REG(AT, AT, FC)); + *r = AT, b = (cond == DCOND_VS ? MIPS_BNE : MIPS_BEQ); + break; + // conditions using VNZ + case DCOND_LT: // N^V + case DCOND_GE: // !(N^V) + EMIT(MIPS_LSR_IMM(AT, FV, 31)); // Nd^V = Nt^Ns^C + EMIT(MIPS_XOR_REG(AT, FC, AT)); + *r = AT, b = (cond == DCOND_LT ? MIPS_BNE : MIPS_BEQ); + break; + case DCOND_LE: // (N^V) || Z + case DCOND_GT: // !(N^V) && !Z + EMIT(MIPS_LSR_IMM(AT, FV, 31)); // Nd^V = Nt^Ns^C + EMIT(MIPS_XOR_REG(AT, FC, AT)); + EMIT(MIPS_ADD_IMM(AT, AT, -1)); // !(Nd^V) && !Z + EMIT(MIPS_AND_REG(AT, FNZ, AT)); + *r = AT, b = (cond == DCOND_GT ? MIPS_BNE : MIPS_BEQ); + break; + } + return b; +} + +// NB: assumes all targets are in the same 256MB segment +/* the _patchable forms emit the same code as the plain ones */ #define emith_jump(target) \ + emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)) +#define emith_jump_patchable(target) \ + emith_jump(target) + +// NB: MIPS conditional branches have only +/- 128KB range +/* NB: the displacement is taken relative to tcache_ptr after emith_cond_check() has emitted its insns */ #define emith_jump_cond(cond, target) do { \ + int r_, mcond_ = emith_cond_check(cond, &r_); \ + u32 disp_ = (u8 *)target - (u8 *)tcache_ptr - 4; \ + emith_branch(MIPS_BCONDZ(mcond_,r_,disp_ & 0x0003ffff)); \ +} while (0) +#define emith_jump_cond_patchable(cond, target) \ + emith_jump_cond(cond, target) + +/* true if the distance from tcache_ptr to target is within the conditional branch range, with 0x10 bytes of margin on the negative side */ #define emith_jump_cond_inrange(target) \ + ((u8 *)target - (u8 *)tcache_ptr - 4 < 0x20000 && \ + (u8 *)target - (u8 *)tcache_ptr - 4 >= -0x20000+0x10) //mind cond_check + +// NB: returns position of patch for cache maintenance +/* searches from one insn before ptr for the first jump or branch insn and replaces its target field: the 16 bit displacement for branches, the 26 bit address field otherwise */ #define emith_jump_patch(ptr, target, pos) do { \ + u32 *ptr_ = (u32 *)ptr-1; /* must skip condition check code */ \ + u32 disp_, mask_; \ + while (!emith_is_j(*ptr_) && !emith_is_b(*ptr_)) ptr_ ++; \ + if (emith_is_b(*ptr_)) \ + mask_ = 0xffff0000, disp_ = (u8 *)target - (u8 *)ptr_ - 4; \ + else mask_ = 0xfc000000, disp_ = (uintptr_t)target; \ +
EMIT_PTR(ptr_, (*ptr_ & mask_) | ((disp_ >> 2) & ~mask_)); \ + if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)(ptr_-1); \ +} while (0) + +#define emith_jump_patch_inrange(ptr, target) \ + ((u8 *)target - (u8 *)ptr - 4 < 0x20000 && \ + (u8 *)target - (u8 *)ptr - 4 >= -0x20000+0x10) // mind cond_check +#define emith_jump_patch_size() 4 + +/* writes a J to target followed by a NOP at ptr (8 bytes, see emith_jump_at_size); this does not go through emith_branch() */ #define emith_jump_at(ptr, target) do { \ + u32 *ptr_ = (u32 *)ptr; \ + EMIT_PTR(ptr_, MIPS_J((uintptr_t)target & 0x0fffffff)); \ + EMIT_PTR(ptr_, MIPS_NOP); \ +} while (0) +#define emith_jump_at_size() 8 + +#define emith_jump_reg(r) \ + emith_branch(MIPS_JR(r)) +#define emith_jump_reg_c(cond, r) \ + emith_jump_reg(r) + +/* jump to the pointer stored in the context at offs; AT is used for the target */ #define emith_jump_ctx(offs) do { \ + emith_ctx_read_ptr(AT, offs); \ + emith_jump_reg(AT); \ +} while (0) +#define emith_jump_ctx_c(cond, offs) \ + emith_jump_ctx(offs) + +/* NB: emith_call_cond ignores cond and always calls */ #define emith_call(target) \ + emith_branch(MIPS_JAL((uintptr_t)target & 0x0fffffff)) +#define emith_call_cond(cond, target) \ + emith_call(target) + +#define emith_call_reg(r) \ + emith_branch(MIPS_JALR(LR, r)) + +#define emith_call_ctx(offs) do { \ + emith_ctx_read_ptr(AT, offs); \ + emith_call_reg(AT); \ +} while (0) + +#define emith_call_cleanup() /**/ + +#define emith_ret() \ + emith_branch(MIPS_JR(LR)) +#define emith_ret_c(cond) \ + emith_ret() + +/* stores LR to the context at offs */ #define emith_ret_to_ctx(offs) \ + emith_ctx_write_ptr(LR, offs) + +#define emith_add_r_ret(r) \ + emith_add_r_r_ptr(r, LR) + +// NB: ABI SP alignment is 8 for 64 bit, O32 has a 16 byte arg save area +/* reserves 24 bytes of stack and stores LR in the topmost pointer sized slot, and r (if > 0) in the slot below it */ #define emith_push_ret(r) do { \ + int offs_ = 8+16 - 2*PTR_SIZE; \ + emith_add_r_r_ptr_imm(SP, SP, -8-16); \ + emith_write_r_r_offs_ptr(LR, SP, offs_ + PTR_SIZE); \ + if ((r) > 0) emith_write_r_r_offs(r, SP, offs_); \ +} while (0) + +/* counterpart of emith_push_ret: reloads r (if > 0) and LR, releases the stack space and returns */ #define emith_pop_and_ret(r) do { \ + int offs_ = 8+16 - 2*PTR_SIZE; \ + if ((r) > 0) emith_read_r_r_offs(r, SP, offs_); \ + emith_read_r_r_offs_ptr(LR, SP, offs_ + PTR_SIZE); \ + emith_add_r_r_ptr_imm(SP, SP, 8+16); \ + emith_ret(); \ +} while (0) + + +// emitter ABI
stuff +#define emith_pool_check() /**/ +#define emith_pool_commit(j) /**/ +// NB: mips32r2 has SYNCI +#define host_instructions_updated(base, end) __builtin___clear_cache(base, end) +#define emith_update_cache() /**/ +#define emith_rw_offs_max() 0x7fff +#define emith_uext_ptr(r) /**/ + +// SH2 drc specific +/* saves the registers in mask 0xd0ff0000 (r16-r23, r28, r30, r31) in pointer sized stack slots above the 16 byte arg save area; bit 0 is added as an unsaved padding slot if the register count is odd */ #define emith_sh2_drc_entry() do { \ + int _c, _z = PTR_SIZE; u32 _m = 0xd0ff0000; \ + if (__builtin_parity(_m) == 1) _m |= 0x1; /* ABI align for SP is 8 */ \ + int _s = count_bits(_m) * _z + 16, _o = _s; /* 16 O32 arg save area */ \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, -_s); \ + for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + if (_m & (1 << _c)) \ + { _o -= _z; if (_c) emith_write_r_r_offs_ptr(_c, SP, _o); } \ +} while (0) +/* counterpart of emith_sh2_drc_entry: reloads the registers, releases the stack frame and returns */ #define emith_sh2_drc_exit() do { \ + int _c, _z = PTR_SIZE; u32 _m = 0xd0ff0000; \ + if (__builtin_parity(_m) == 1) _m |= 0x1; \ + int _s = count_bits(_m) * _z + 16, _o = 16; \ + for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ + if (_m & (1 << _c)) \ + { if (_c) emith_read_r_r_offs_ptr(_c, SP, _o); _o += _z; } \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, _s); \ + emith_ret(); \ +} while (0) + +// NB: assumes a is in arg0, tab, func and mask are temp +/* indexes tab by a >> SH2_READ_SHIFT with entries of 2 pointer sizes: loads a pointer into func and a 32 bit value from the 2nd half into mask, then doubles func with flags set */ #define emith_sh2_rcall(a, tab, func, mask) do { \ + emith_lsr(mask, a, SH2_READ_SHIFT); \ + emith_add_r_r_r_lsl_ptr(tab, tab, mask, PTR_SCALE+1); \ + emith_read_r_r_offs_ptr(func, tab, 0); \ + emith_read_r_r_offs(mask, tab, (1 << PTR_SCALE)); \ + emith_addf_r_r_r_ptr(func, func, func); \ +} while (0) + +// NB: assumes a, val are in arg0 and arg1, tab and func are temp +/* loads the pointer at index a >> SH2_WRITE_SHIFT from tab, copies CONTEXT_REG to register 6 and jumps to the loaded pointer */ #define emith_sh2_wcall(a, val, tab, func) do { \ + emith_lsr(func, a, SH2_WRITE_SHIFT); \ + emith_lsl(func, func, PTR_SCALE); \ + emith_read_r_r_r_ptr(func, tab, func); \ + emith_move_r_r_ptr(6, CONTEXT_REG); /* arg2 */ \ + emith_jump_reg(func); \ +} while (0) + +#define emith_sh2_delay_loop(cycles, reg) do { \ + int sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); \ + int t1 = rcache_get_tmp(); \ +
int t2 = rcache_get_tmp(); \ + int t3 = rcache_get_tmp(); \ + /* if (sr < 0) return */ \ + emith_cmp_r_imm(sr, 0); \ + EMITH_JMP_START(DCOND_LE); \ + /* turns = sr.cycles / cycles */ \ + emith_asr(t2, sr, 12); \ + emith_move_r_imm(t3, (u32)((1ULL<<32) / (cycles)) + 1); \ + emith_mul_u64(t1, t2, t2, t3); /* multiply by 1/x */ \ + rcache_free_tmp(t3); \ + if (reg >= 0) { \ + /* if (reg <= turns) turns = reg-1 */ \ + t3 = rcache_get_reg(reg, RC_GR_RMW, NULL); \ + emith_cmp_r_r(t3, t2); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_sub_r_r_imm_c(DCOND_LS, t2, t3, 1); \ + EMITH_SJMP_END(DCOND_HI); \ + /* if (reg <= 1) turns = 0 */ \ + emith_cmp_r_imm(t3, 1); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_move_r_imm_c(DCOND_LS, t2, 0); \ + EMITH_SJMP_END(DCOND_HI); \ + /* reg -= turns */ \ + emith_sub_r_r(t3, t2); \ + } \ + /* sr.cycles -= turns * cycles; */ \ + emith_move_r_imm(t1, cycles); \ + emith_mul(t1, t2, t1); \ + emith_sub_r_r_r_lsl(sr, sr, t1, 12); \ + EMITH_JMP_END(DCOND_LE); \ + rcache_free_tmp(t1); \ + rcache_free_tmp(t2); \ +} while (0) + +/* + * T = !carry(Rn = (Rn << 1) | T) + * if Q + * C = carry(Rn += Rm) + * else + * C = carry(Rn -= Rm) + * T ^= C + */ +#define emith_sh2_div1_step(rn, rm, sr) do { \ + int t_ = rcache_get_tmp(); \ + emith_and_r_r_imm(AT, sr, T); \ + emith_lsr(FC, rn, 31); /*Rn = (Rn<<1)+T*/ \ + emith_lsl(t_, rn, 1); \ + emith_or_r_r(t_, AT); \ + emith_or_r_imm(sr, T); /* T = !carry */ \ + emith_eor_r_r(sr, FC); \ + emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ + EMITH_JMP3_START(DCOND_EQ); \ + emith_add_r_r_r(rn, t_, rm); \ + EMIT(MIPS_SLTU_REG(FC, rn, t_)); \ + EMITH_JMP3_MID(DCOND_EQ); \ + emith_sub_r_r_r(rn, t_, rm); \ + EMIT(MIPS_SLTU_REG(FC, t_, rn)); \ + EMITH_JMP3_END(); \ + emith_eor_r_r(sr, FC); /* T ^= carry */ \ + rcache_free_tmp(t_); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. 
rn, rm must be TEMP */ +#define emith_sh2_macl(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* MACH top 16 bits unused if saturated. sign ext for overfl detect */ \ + emith_sext(mh, mh, 16); \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 17 bits of MACH aren't all 1 or 0 */ \ + /* to check: add MACH >> 31 to MACH >> 15. this is 0 if no overflow */ \ + emith_asr(rn, mh, 15); \ + emith_add_r_r_r_lsr(rn, rn, mh, 31); /* sum = (MACH>>31)+(MACH>>15) */ \ + emith_teq_r_r(rn, Z0); /* (need only N and Z flags) */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> ov */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x0000); /* -overflow */ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x8000); \ + EMITH_SJMP_START(DCOND_PL); /* sum > 0 -> +ovl */ \ + emith_sub_r_imm_c(DCOND_MI, ml, 1); /* 0xffffffff */ \ + emith_sub_r_imm_c(DCOND_MI, mh, 1); /* 0x00007fff */ \ + EMITH_SJMP_END(DCOND_PL); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macw(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* XXX: MACH should be untouched when S is set? */ \ + emith_asr(mh, ml, 31); /* sign ext MACL to MACH for ovrfl check */ \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 33 bits of MACH:MACL aren't all 1 or 0 */ \ + /* to check: add MACL[31] to MACH. this is 0 if no overflow */ \ + emith_lsr(rn, ml, 31); \ + emith_add_r_r(rn, mh); /* sum = MACH + ((MACL>>31)&1) */ \ + emith_teq_r_r(rn, Z0); /* (need only N and Z flags) */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> overflow */ \ + /* XXX: LSB signalling only in SH1, or in SH2 too? 
*/ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x00000001); /* LSB of MACH */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x80000000); /* negative ovrfl */ \ + EMITH_SJMP_START(DCOND_PL); /* sum > 0 -> positive ovrfl */ \ + emith_sub_r_imm_c(DCOND_MI, ml, 1); /* 0x7fffffff */ \ + EMITH_SJMP_END(DCOND_PL); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +#define emith_write_sr(sr, srcr) do { \ + if (__mips_isa_rev < 2) { \ + emith_lsr(sr, sr , 10); emith_lsl(sr, sr, 10); \ + emith_lsl(AT, srcr, 22); emith_lsr(AT, AT, 22); \ + emith_or_r_r(sr, AT); \ + } else EMIT(MIPS_INS_IMM(sr, srcr, 0, 10)); \ +} while (0) + +#define emith_carry_to_t(sr, is_sub) do { \ + if (__mips_isa_rev < 2) { \ + emith_and_r_imm(sr, 0xfffffffe); \ + emith_or_r_r(sr, FC); \ + } else EMIT(MIPS_INS_IMM(sr, FC, 0, 1)); \ +} while (0) + +#define emith_t_to_carry(sr, is_sub) do { \ + emith_and_r_r_imm(FC, sr, 1); \ +} while (0) + +#define emith_tpop_carry(sr, is_sub) do { \ + emith_and_r_r_imm(FC, sr, 1); \ + emith_eor_r_r(sr, FC); \ +} while (0) + +#define emith_tpush_carry(sr, is_sub) \ + emith_or_r_r(sr, FC) + +#ifdef T +// T bit handling +#define emith_invert_cond(cond) \ + ((cond) ^ 1) + +static void emith_clr_t_cond(int sr) +{ + emith_bic_r_imm(sr, T); +} + +static void emith_set_t_cond(int sr, int cond) +{ + int b, r; + u8 *ptr; + u32 val = 0, inv = 0; + + // try to avoid jumping around if possible + if (emith_cmp_rs >= 0) { + if (emith_cmp_rt >= 0) + b = emith_cmpr_check(emith_cmp_rs, emith_cmp_rt, cond, &r); + else + b = emith_cmpi_check(emith_cmp_rs, emith_cmp_imm, cond, &r); + + // XXX this relies on the inner workings of cmp_check... 
+ if (r == AT) + // result of slt check which returns either 0 or 1 in AT + val++, inv = (b == MIPS_BEQ); + } else { + b = emith_cond_check(cond, &r); + if (r == Z0) { + if (b == MIPS_BEQ || b == MIPS_BLE || b == MIPS_BGE) + emith_or_r_imm(sr, T); + return; + } else if (r == FC) + val++, inv = (b == MIPS_BEQ); + } + + if (!val) switch (b) { // cases: b..z r, aka cmp r,Z0 or cmp r,#0 + case MIPS_BEQ: EMIT(MIPS_SLTU_IMM(AT, r, 1)); r=AT; val++; break; + case MIPS_BNE: EMIT(MIPS_SLTU_REG(AT,Z0, r)); r=AT; val++; break; + case MIPS_BLT: EMIT(MIPS_SLT_REG(AT, r, Z0)); r=AT; val++; break; + case MIPS_BGE: EMIT(MIPS_SLT_REG(AT, r, Z0)); r=AT; val++; inv++; break; + case MIPS_BLE: EMIT(MIPS_SLT_REG(AT, Z0, r)); r=AT; val++; inv++; break; + case MIPS_BGT: EMIT(MIPS_SLT_REG(AT, Z0, r)); r=AT; val++; break; + default: // cases: beq/bne r,s, aka cmp r,s + if ((b>>5) == OP_BEQ) { + EMIT(MIPS_XOR_REG(AT, r, b&0x1f)); + EMIT(MIPS_SLTU_IMM(AT,AT, 1)); r=AT; val++; break; + } else if ((b>>5) == OP_BNE) { + EMIT(MIPS_XOR_REG(AT, r, b&0x1f)); + EMIT(MIPS_SLTU_REG(AT,Z0,AT)); r=AT; val++; break; + } + } + if (val) { + emith_or_r_r(sr, r); + if (inv) + emith_eor_r_imm(sr, T); + return; + } + + // can't obtain result directly, use presumably slower jump !cond + or sr,T + b = emith_invert_branch(b); + ptr = emith_branch(MIPS_BCONDZ(b, r, 0)); + emith_or_r_imm(sr, T); + emith_flush(); // prohibit delay slot switching across jump targets + val = (u8 *)tcache_ptr - (u8 *)(ptr) - 4; + EMIT_PTR(ptr, MIPS_BCONDZ(b, r, val & 0x0003ffff)); +} + +#define emith_get_t_cond() -1 + +#define emith_sync_t(sr) ((void)sr) + +#define emith_invalidate_t() + +static void emith_set_t(int sr, int val) +{ + if (val) + emith_or_r_imm(sr, T); + else + emith_bic_r_imm(sr, T); +} + +static int emith_tst_t(int sr, int tf) +{ + emith_tst_r_imm(sr, T); + return tf ? 
DCOND_NE: DCOND_EQ; +} +#endif diff --git a/cpu/drc/emit_ppc.c b/cpu/drc/emit_ppc.c new file mode 100644 index 000000000..54050bad2 --- /dev/null +++ b/cpu/drc/emit_ppc.c @@ -0,0 +1,1773 @@ +/* + * Basic macros to emit PowerISA 2.03 64 bit instructions and some utils + * Copyright (C) 2020 kub + * + * This work is licensed under the terms of MAME license. + * See COPYING file in the top-level directory. + */ + +// NB bit numbers are reversed in PPC (MSB is bit 0). The emith_* functions and +// macros must take this into account. + +// NB PPC was a 64 bit architecture from the outset, so basically all operations +// are operating on 64 bits. 32 bit arch was only added later on, and there are +// very few 32 bit operations (cmp*, shift/rotate, extract/insert, load/store). +// For most operations the upper bits don't spill into the lower word, for the +// others there is an appropriate 32 bit operation available. + +// NB PowerPC isn't a clean RISC design. Several insns use microcode, which is +// AFAIK notably slower than using some 2-3 non-microcode insns. So, using +// such insns should be avoided if possible. Listed in Cell handbook, App. A: +// - shift/rotate having the amount in a register +// - arithmetic/logical having the RC flag set (except cmp*) +// - load/store algebraic (l?a*), multiple (lmw/stmw), string (ls*/sts*) +// - mtcrf (and some more SPR related, not used here) +// moreover, misaligned load/store crossing a cacheline boundary are microcoded. +// Note also that load/store string isn't available in little endian mode. + +// NB flag handling in PPC differs grossly from the ARM/X86 model. There are 8 +// fields in the condition register, each having 4 condition bits. However, only +// the EQ bit is similar to the Z flag. The CA and OV bits in the XER register +// are similar to the C and V bits, but shifts don't use CA, and cmp* doesn't +// use CA and OV.
+// Moreover, there's no easy possibility to get CA and OV for 32 bit arithmetic +// since all arithmetic/logical insns use 64 bit. +// For now, use the "no flags" code from the RISC-V backend. + +#define HOST_REGS 32 + +// PPC64: params: r3-r10, return: r3, temp: r0,r11-r12, saved: r14-r31 +// reserved: r0(zero), r1(stack), r2(TOC), r13(TID) +#define RET_REG 3 +#define PARAM_REGS { 3, 4, 5, 6, 7, 8, 9, 10 } +#define PRESERVED_REGS { 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 } +#define TEMPORARY_REGS { 11, 12 } + +#define CONTEXT_REG 31 +#define STATIC_SH2_REGS { SHR_SR,30 , SHR_R(0),29 , SHR_R(1),28 } + +// if RA is 0 in non-update memory insns, ADDI/ADDIS, ISEL, it aliases with zero +#define Z0 0 // zero register +#define SP 1 // stack pointer +// SPR registers +#define XER -1 // exception register +#define LR -8 // link register +#define CTR -9 // counter register +// internally used by code emitter: +#define AT 0 // emitter temporary (can't be fully used anyway) +#define FNZ 14 // emulated processor flags: N (bit 31) ,Z (all bits) +#define FC 15 // emulated processor flags: C (bit 0), others 0 +#define FV 16 // emulated processor flags: Nt^Ns (bit 31). 
others x + + +// unified conditions; virtual, not corresponding to anything real on PPC +#define DCOND_EQ 0x0 +#define DCOND_NE 0x1 +#define DCOND_HS 0x2 +#define DCOND_LO 0x3 +#define DCOND_MI 0x4 +#define DCOND_PL 0x5 +#define DCOND_VS 0x6 +#define DCOND_VC 0x7 +#define DCOND_HI 0x8 +#define DCOND_LS 0x9 +#define DCOND_GE 0xa +#define DCOND_LT 0xb +#define DCOND_GT 0xc +#define DCOND_LE 0xd + +#define DCOND_CS DCOND_LO +#define DCOND_CC DCOND_HS + +// unified insn; use right-aligned bit offsets for the bitfields +#define PPC_INSN(op, b10, b15, b20, b31) \ + (((op)<<26)|((b10)<<21)|((b15)<<16)|((b20)<<11)|((b31)<<0)) + +#define _ 0 // marker for "field unused" +#define __(n) o##n // enum marker for "undefined" +#define _CB(v,l,s,d) ((((v)>>(s))&((1<<(l))-1))<<(d)) // copy l bits + +// NB everything privileged or unneeded at 1st sight is left out +// opcode field (encoded in OPCD, bits 0-5) +enum { OP__LMA=004, OP_MULLI=007, + OP_SUBFIC, __(11), OP_CMPLI, OP_CMPI, OP_ADDIC, OP_ADDICF, OP_ADDI, OP_ADDIS, + OP_BC, __(21), OP_B, OP__CR, OP_RLWIMI, OP_RLWINM, __(26), OP_RLWNM, + OP_ORI, OP_ORIS, OP_XORI, OP_XORIS, OP_ANDI, OP_ANDIS, OP__RLD, OP__EXT, + OP_LWZ, OP_LWZU, OP_LBZ, OP_LBZU, OP_STW, OP_STWU, OP_STB, OP_STBU, + OP_LHZ, OP_LHZU, OP_LHA, OP_LHAU, OP_STH, OP_STHU, OP_LMW, OP_STMW, + /*OP_LQ=070,*/ OP__LD=072, OP__ST=076 }; +// CR subops (encoded in bits 21-31) +enum { OPC_MCRF=0, OPC_BCLR=32, OPC_BCCTR=1056 }; +// RLD subops (encoded in XO bits 27-31) +enum { OPR_RLDICL=0, OPR_RLDICR=4, OPR_RLDIC=8, OPR_RLDIMI=12, OPR_RLDCL=16, OPR_RLDCR=18 }; +// EXT subops (encoded in XO bits 21-31) +enum { + // arith/logical + OPE_CMP=0, OPE_SUBFC=16, OPE_ADDC=20, OPE_AND=56, + OPE_CMPL=64, OPE_SUBF=80, OPE_ANDC=120, OPE_NEG=208, OPE_NOR=248, + OPE_SUBFE=272, OPE_ADDE=276, OPE_SUBFZE=400, OPE_ADDZE=404, OPE_SUBFME=464, OPE_ADDME=468, + OPE_ADD=532, OPE_EQV=568, OPE_XOR=632, OPE_ORC=824, OPE_OR=888, OPE_NAND=952, + // shift + OPE_SLW=48, OPE_SLD=54, OPE_SRW=1072, OPE_SRD=1078, 
OPE_SRAW=1584, OPE_SRAD=1588, OPE_SRAWI=1648, OPE_SRADI=1652, + // extend, bitcount + OPE_CNTLZW=52, OPE_CNTLZD=116, OPE_EXTSH=1844, OPE_EXTSB=1908, OPE_EXTSW=1972, + // mult/div + OPE_MULHDU=18, OPE_MULHWU=22, OPE_MULHD=146, OPE_MULHW=150, OPE_MULLD=466, OPE_MULLW=470, + OPE_DIVDU=914, OPE_DIVWU=918, OPE_DIVD=978, OPE_DIVW=982, + // load/store indexed + OPE_LDX=42, OPE_LDUX=106, OPE_STDX=298, OPE_STDUX=362, + OPE_LWZX=46, OPE_LWZUX=110, OPE_LWAX=682, OPE_LWAUX=746, OPE_STWX=302, OPE_STWUX=366, + OPE_LBZX=174, OPE_LBZUX=238, /* no LBAX/LBAUX... */ OPE_STBX=430, OPE_STBUX=494, + OPE_LHZX=558, OPE_LHZUX=622, OPE_LHAX=686, OPE_LHAUX=750, OPE_STHX=814, OPE_STHUX=878, + // SPR, CR related + OPE_ISEL=15, OPE_MFCR=38, OPE_MTCRF=288, OPE_MFSPR=678, OPE_MTSPR=934, OPE_MCRXR=1024, +}; +// LD subops (encoded in XO bits 30-31) +enum { OPL_LD, OPL_LDU, OPL_LWA }; +// ST subops (encoded in XO bits 30-31) +enum { OPS_STD, OPS_STDU /*,OPS_STQ*/ }; + +// X*,M*-forms insns often have overflow detect in b21 and CR0 update in b31 +#define XOE (1<<10) // (31-21) +#define XRC (1<<0) // (31-31) +#define XF (XOE|XRC) +// MB and ME in M*-forms rotate left +#define MM(b,e) (((b)<<6)|((e)<<1)) +#define MD(b,s) (_CB(b,5,0,6)|_CB(b,1,5,5)|_CB(s,5,0,11)|_CB(s,1,5,1)) +// AA and LK in I,B-forms branches +#define BAA (1<<1) +#define BLK (1<<0) +// BO and BI condition codes in B-form, BO0-BO4:BI2-BI4 since we only need CR0 +#define BLT 0x60 +#define BGE 0x20 +#define BGT 0x61 +#define BLE 0x21 +#define BEQ 0x62 +#define BNE 0x22 +#define BXX 0xa0 // unconditional, aka always + +#define PPC_NOP \ + PPC_INSN(OP_ORI, 0, 0, _, 0) // ori r0, r0, 0 + +// arithmetic/logical + +#define PPC_OP_REG(op, xop, rt, ra, rb) /* X*,M*-form */ \ + PPC_INSN((unsigned)op, rt, ra, rb, xop) +#define PPC_OP_IMM(op, rt, ra, imm) /* D,B,I-form */ \ + PPC_INSN((unsigned)op, rt, ra, _, imm) + +// rt = ra OP rb +#define PPC_ADD_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_ADD,rt,ra,rb) +#define PPC_ADDC_REG(rt, ra, rb) \ + 
PPC_OP_REG(OP__EXT,OPE_ADD|XOE,rt,ra,rb) +#define PPC_SUB_REG(rt, rb, ra) /* NB reversed args (rb-ra) */ \ + PPC_OP_REG(OP__EXT,OPE_SUBF,rt,ra,rb) +#define PPC_SUBC_REG(rt, rb, ra) \ + PPC_OP_REG(OP__EXT,OPE_SUBF|XOE,rt,ra,rb) +#define PPC_NEG_REG(rt, ra) \ + PPC_OP_REG(OP__EXT,OPE_NEG,rt,ra,_) +#define PPC_NEGC_REG(rt, ra) \ + PPC_OP_REG(OP__EXT,OPE_NEG|XOE,rt,ra,_) + +#define PPC_CMP_REG(ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_CMP,1,ra,rb) +#define PPC_CMPL_REG(ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_CMPL,1,ra,rb) + +#define PPC_CMPW_REG(ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_CMP,0,ra,rb) +#define PPC_CMPLW_REG(ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_CMPL,0,ra,rb) + +#define PPC_XOR_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_XOR,rt,ra,rb) +#define PPC_OR_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_OR,rt,ra,rb) +#define PPC_ORN_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_ORC,rt,ra,rb) +#define PPC_NOR_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_NOR,rt,ra,rb) +#define PPC_AND_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_AND,rt,ra,rb) +#define PPC_BIC_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_ANDC,rt,ra,rb) + +#define PPC_MOV_REG(rt, ra) \ + PPC_OR_REG(rt, ra, ra) +#define PPC_MVN_REG(rt, ra) \ + PPC_NOR_REG(rt, ra, ra) + +// rt = ra OP rb OP carry +#define PPC_ADC_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_ADDE,rt,ra,rb) +#define PPC_SBC_REG(rt, rb, ra) \ + PPC_OP_REG(OP__EXT,OPE_SUBFE,rt,ra,rb) +#define PPC_NGC_REG(rt, ra) \ + PPC_OP_REG(OP__EXT,OPE_SUBFZE,rt,ra,_) + +// rt = ra SHIFT rb +#define PPC_LSL_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_SLD,rt,ra,rb) +#define PPC_LSR_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_SRD,rt,ra,rb) +#define PPC_ASR_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_SRAD,rt,ra,rb) +#define PPC_ROL_REG(ra, rt, rb) \ + PPC_OP_REG(OP__RLD,OPR_RLDCL,rt,ra,rb,0) + +#define PPC_LSLW_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_SLW,rt,ra,rb) +#define PPC_LSRW_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_SRW,rt,ra,rb) +#define PPC_ASRW_REG(ra, rt, rb) \ + 
PPC_OP_REG(OP__EXT,OPE_SRAW,rt,ra,rb) +#define PPC_ROLW_REG(ra, rt, rb) \ + PPC_OP_REG(OP_RLWNM,MM(0,31),rt,ra,rb) + +// rt = ra OP (imm16 << (0|16)) +#define PPC_ADD_IMM(rt, ra, imm16) \ + PPC_OP_IMM(OP_ADDI, rt, ra, imm16) +#define PPC_ADDT_IMM(rt, ra, imm16) \ + PPC_OP_IMM(OP_ADDIS, rt, ra, imm16) + +#define PPC_XOR_IMM(ra, rt, imm16) \ + PPC_OP_IMM(OP_XORI, rt, ra, imm16) +#define PPC_XORT_IMM(ra, rt, imm16) \ + PPC_OP_IMM(OP_XORIS, rt, ra, imm16) +#define PPC_OR_IMM(ra, rt, imm16) \ + PPC_OP_IMM(OP_ORI, rt, ra, imm16) +#define PPC_ORT_IMM(ra, rt, imm16) \ + PPC_OP_IMM(OP_ORIS, rt, ra, imm16) + +#define PPC_ANDS_IMM(rt, ra, imm16) /* NB 1st arg goes to the RS (source) field, 2nd to RA (target) */ \ + PPC_OP_IMM(OP_ANDI, rt, ra, imm16) +#define PPC_ANDTS_IMM(rt, ra, imm16) \ + PPC_OP_IMM(OP_ANDIS, rt, ra, imm16) +#define PPC_CMP_IMM(ra, imm16) \ + PPC_OP_IMM(OP_CMPI, 1, ra, imm16) +#define PPC_CMPL_IMM(ra, imm16) \ + PPC_OP_IMM(OP_CMPLI, 1, ra, imm16) + +#define PPC_CMPW_IMM(ra, imm16) \ + PPC_OP_IMM(OP_CMPI, 0, ra, imm16) +#define PPC_CMPLW_IMM(ra, imm16) \ + PPC_OP_IMM(OP_CMPLI, 0, ra, imm16) + +#define PPC_TST_IMM(rt, imm16) /* andi. r0,rt,imm16: rt is only read, the result is dropped into r0 (AT) */ \ + PPC_ANDS_IMM(rt,Z0,imm16) + +#define PPC_MOV_IMM(rt, ra, imm16) \ + PPC_ADD_IMM(rt,ra,imm16) +#define PPC_MOVT_IMM(rt, ra, imm16) \ + PPC_ADDT_IMM(rt,ra,imm16) + +// rt = EXTEND ra +#define PPC_EXTSW_REG(ra, rt) \ + PPC_OP_REG(OP__EXT,OPE_EXTSW,rt,ra,_) +#define PPC_EXTSH_REG(ra, rt) \ + PPC_OP_REG(OP__EXT,OPE_EXTSH,rt,ra,_) +#define PPC_EXTSB_REG(ra, rt) \ + PPC_OP_REG(OP__EXT,OPE_EXTSB,rt,ra,_) +#define PPC_EXTUW_REG(ra, rt) \ + PPC_OP_REG(OP__RLD,OPR_RLDICL|MD(32,0),rt,ra,_) +#define PPC_EXTUH_REG(ra, rt) \ + PPC_OP_REG(OP__RLD,OPR_RLDICL|MD(48,0),rt,ra,_) +#define PPC_EXTUB_REG(ra, rt) \ + PPC_OP_REG(OP__RLD,OPR_RLDICL|MD(56,0),rt,ra,_) + +// rt = ra SHIFT imm5/imm6 +#define PPC_LSL_IMM(ra, rt, bits) \ + PPC_OP_REG(OP__RLD,OPR_RLDICR|MD(63-(bits),bits),rt,ra,_) +#define PPC_LSR_IMM(ra, rt, bits) \ + PPC_OP_REG(OP__RLD,OPR_RLDICL|MD(bits,64-(bits)),rt,ra,_) +#define PPC_ASR_IMM(ra, rt, bits) \ +
PPC_OP_REG(OP__EXT,OPE_SRADI|MD(_,bits),rt,ra,_) +#define PPC_ROL_IMM(ra, rt, bits) \ + PPC_OP_REG(OP__RLD,OPR_RLDICL|MD(0,bits),rt,ra,_) + +#define PPC_LSLW_IMM(ra, rt, bits) \ + PPC_OP_REG(OP_RLWINM,MM(0,31-(bits)),rt,ra,bits) +#define PPC_LSRW_IMM(ra, rt, bits) \ + PPC_OP_REG(OP_RLWINM,MM(bits,31),rt,ra,32-(bits)) +#define PPC_ASRW_IMM(ra, rt, bits) \ + PPC_OP_REG(OP__EXT,OPE_SRAWI,rt,ra,bits) +#define PPC_ROLW_IMM(ra, rt, bits) \ + PPC_OP_REG(OP_RLWINM,MM(0,31),rt,ra,bits) + +// rt = EXTRACT/INSERT ra +#define PPC_BFX_IMM(ra, rt, lsb, bits) \ + PPC_OP_REG(OP__RLD,OPR_RLDICL|MD(64-(bits),63&(lsb+bits)),rt,ra,_) +#define PPC_BFXD_IMM(ra, rt, lsb, bits) /* extract to high bits, 64 bit */ \ + PPC_OP_REG(OP__RLD,OPR_RLDICR|MD(bits-1,lsb),rt,ra,_) +#define PPC_BFI_IMM(ra, rt, lsb, bits) \ + PPC_OP_REG(OP__RLD,OPR_RLDIMI|MD(lsb,64-(lsb+bits)),rt,ra,_) + +#define PPC_BFXW_IMM(ra, rt, lsb, bits) \ + PPC_OP_REG(OP_RLWINM,MM(32-(bits),31),rt,ra,31&(lsb+bits)) +#define PPC_BFXT_IMM(ra, rt, lsb, bits) /* extract to high bits, 32 bit */ \ + PPC_OP_REG(OP_RLWINM,MM(0,bits-1),rt,ra,lsb) +#define PPC_BFIW_IMM(ra, rt, lsb, bits) \ + PPC_OP_REG(OP_RLWIMI,MM(lsb,lsb+bits-1),rt,ra,32-(lsb+bits)) + +// multiplication; NB in 32 bit results the topmost 32 bits are undefined +#define PPC_MULL(rt, ra, rb) /* 64 bit */ \ + PPC_OP_REG(OP__EXT,OPE_MULLD,rt,ra,rb) +#define PPC_MUL(rt, ra, rb) /* low 32 bit */ \ + PPC_OP_REG(OP__EXT,OPE_MULLW,rt,ra,rb) +#define PPC_MULHS(rt, ra, rb) /* high 32 bit, signed */ \ + PPC_OP_REG(OP__EXT,OPE_MULHW,rt,ra,rb) +#define PPC_MULHU(rt, ra, rb) /* high 32 bit, unsigned */ \ + PPC_OP_REG(OP__EXT,OPE_MULHWU,rt,ra,rb) +// XXX use MAC* insns from the LMA group? 
+ +// branching (only PC-relative) + +#define PPC_B(offs26) \ + PPC_OP_IMM(OP_B,_,_,(offs26)&~3) +#define PPC_BL(offs26) \ + PPC_OP_IMM(OP_B,_,_,((offs26)&~3)|BLK) +#define PPC_RET() \ + PPC_OP_REG(OP__CR,OPC_BCLR,BXX>>3,_,_) +#define PPC_RETCOND(cond) \ + PPC_OP_REG(OP__CR,OPC_BCLR,(cond)>>3,(cond)&0x7,_) +#define PPC_BCTRCOND(cond) \ + PPC_OP_REG(OP__CR,OPC_BCCTR,(cond)>>3,(cond)&0x7,_) +#define PPC_BLCTRCOND(cond) \ + PPC_OP_REG(OP__CR,OPC_BCCTR|BLK,(cond)>>3,(cond)&0x7,_) +#define PPC_BCOND(cond, offs19) \ + PPC_OP_IMM(OP_BC,(cond)>>3,(cond)&0x7,(offs19)&~3) + +// load/store, offset + +#define PPC_LDX_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP__LD,rt,ra,((u16)(offs16)&~3)|OPL_LD) +#define PPC_LDW_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP_LWZ,rt,ra,(u16)(offs16)) +#define PPC_LDH_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP_LHZ,rt,ra,(u16)(offs16)) +#define PPC_LDB_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP_LBZ,rt,ra,(u16)(offs16)) + +#define PPC_LDSH_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP_LHA,rt,ra,(u16)(offs16)) + +#define PPC_STX_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP__ST,rt,ra,((u16)(offs16)&~3)|OPS_STD) +#define PPC_STW_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP_STW,rt,ra,(u16)(offs16)) +#define PPC_STH_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP_STH,rt,ra,(u16)(offs16)) +#define PPC_STB_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP_STB,rt,ra,(u16)(offs16)) + +// load/store, indexed + +#define PPC_LDX_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_LDX,rt,ra,rb) +#define PPC_LDW_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_LWZX,rt,ra,rb) +#define PPC_LDH_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_LHZX,rt,ra,rb) +#define PPC_LDB_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_LBZX,rt,ra,rb) + +#define PPC_LDSH_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_LHAX,rt,ra,rb) + +#define PPC_STX_REG(rt, ra, rb) /* 64 bit indexed store (stdx), counterpart of PPC_LDX_REG */ \ + PPC_OP_REG(OP__EXT,OPE_STDX,rt,ra,rb) +#define PPC_STW_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_STWX,rt,ra,rb) +#define PPC_STH_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_STHX,rt,ra,rb) +#define
PPC_STB_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_STBX,rt,ra,rb) + +// special regs: LR, CTR, XER, CR + +#define PPC_MFSP_REG(rt, spr) \ + PPC_OP_REG(OP__EXT,OPE_MFSPR,rt,_,_CB(-(spr),5,0,5)|_CB(-(spr),5,5,0)) +#define PPC_MTSP_REG(rs, spr) \ + PPC_OP_REG(OP__EXT,OPE_MTSPR,rs,_,_CB(-(spr),5,0,5)|_CB(-(spr),5,5,0)) + +#define PPC_MFCR_REG(rt) \ + PPC_OP_REG(OP__EXT,OPE_MFCR,rt,_,_) +#define PPC_MTCRF_REG(rs, fm) \ + PPC_OP_REG(OP__EXT,OPE_MTCRF,rs,_,(fm)<<1) +#define PPC_MCRXR_REG(crt) \ + PPC_OP_REG(OP__EXT,OPE_MCRXR,(crt)<<2,_,_) +#define PPC_MCRCR_REG(crt, crf) \ + PPC_OP_REG(OP__CR,OPC_MCRF,(crt)<<2,(crf)<<1,_) + +#ifdef __powerpc64__ +#define PTR_SCALE 3 +#define PPC_LDP_IMM PPC_LDX_IMM +#define PPC_LDP_REG PPC_LDX_REG +#define PPC_STP_IMM PPC_STX_IMM +#define PPC_STP_REG PPC_STX_REG +#define PPC_BFXP_IMM PPC_BFX_IMM + +#define emith_uext_ptr(r) EMIT(PPC_EXTUW_REG(r, r)) + +// "long" multiplication, 32x32 bit = 64 bit +#define EMIT_PPC_MULLU_REG(dlo, dhi, s1, s2) do { \ + EMIT(PPC_EXTUW_REG(s1, s1)); \ + EMIT(PPC_EXTUW_REG(s2, s2)); \ + EMIT(PPC_MULL(dlo, s1, s2)); \ + EMIT(PPC_ASR_IMM(dhi, dlo, 32)); \ +} while (0) + +#define EMIT_PPC_MULLS_REG(dlo, dhi, s1, s2) do { \ + EMIT(PPC_EXTSW_REG(s1, s1)); \ + EMIT(PPC_EXTSW_REG(s2, s2)); \ + EMIT(PPC_MULL(dlo, s1, s2)); \ + EMIT(PPC_ASR_IMM(dhi, dlo, 32)); \ +} while (0) + +#define EMIT_PPC_MACLS_REG(dlo, dhi, s1, s2) do { \ + EMIT(PPC_EXTSW_REG(s1, s1)); \ + EMIT(PPC_EXTSW_REG(s2, s2)); \ + EMIT(PPC_MULL(AT, s1, s2)); \ + EMIT(PPC_BFI_IMM(dlo, dhi, 0, 32)); \ + emith_add_r_r(dlo, AT); \ + EMIT(PPC_ASR_IMM(dhi, dlo, 32)); \ +} while (0) +#else +#define PTR_SCALE 2 +#define PPC_LDP_IMM PPC_LDW_IMM +#define PPC_LDP_REG PPC_LDW_REG +#define PPC_STP_IMM PPC_STW_IMM +#define PPC_STP_REG PPC_STW_REG +#define PPC_BFXP_IMM PPC_BFXW_IMM + +#define emith_uext_ptr(r) /**/ + +// "long" multiplication, 32x32 bit = 64 bit +#define EMIT_PPC_MULLU_REG(dlo, dhi, s1, s2) do { \ + int at = (dlo == s1 || dlo == s2 ? 
AT : dlo); \ + EMIT(PPC_MUL(at, s1, s2)); \ + EMIT(PPC_MULHU(dhi, s1, s2)); \ + if (at != dlo) emith_move_r_r(dlo, at); \ +} while (0) + +#define EMIT_PPC_MULLS_REG(dlo, dhi, s1, s2) do { \ + int at = (dlo == s1 || dlo == s2 ? AT : dlo); \ + EMIT(PPC_MUL(at, s1, s2)); \ + EMIT(PPC_MULHS(dhi, s1, s2)); \ + if (at != dlo) emith_move_r_r(dlo, at); \ +} while (0) + +#define EMIT_PPC_MACLS_REG(dlo, dhi, s1, s2) do { \ + int t_ = rcache_get_tmp(); \ + EMIT_PPC_MULLS_REG(t_, AT, s1, s2); \ + EMIT(PPC_ADDC_REG(dlo, dlo, t_)); \ + EMIT(PPC_ADC_REG(dhi, dhi, AT)); \ + rcache_free_tmp(t_); \ +} while (0) +#endif +#define PTR_SIZE (1<>1 since the lowest bit inverts the cond */ \ + unsigned _mv = BITMASK3(DCOND_VS>>1,DCOND_GE>>1,DCOND_GT>>1); \ + unsigned _mc = _mv | BITMASK2(DCOND_HS>>1,DCOND_HI>>1); \ + emith_flg_hint = (_mv & BITMASK1(cond >> 1) ? _FHV : 0); \ + emith_flg_hint |= (_mc & BITMASK1(cond >> 1) ? _FHC : 0); \ +} while (0) + +// store minimal cc information: rt, rb^ra, carry +// NB: the result *must* first go to FNZ, in case rt == ra or rt == rb. +// NB: for adcf and sbcf, carry-in must be dealt with separately (see there) +static void emith_set_arith_flags(int rt, int ra, int rb, s32 imm, int sub) +{ + if (emith_flg_hint & _FHC) { + if (sub) // C = sub:rb= 0) // Nt^Ns in FV, bit 31 + EMIT(PPC_XOR_REG(FV, ra, rb)); + else if (imm == 0) + emith_flg_noV = 1; // imm #0 can't overflow + else if ((imm < 0) == !sub) + EMIT(PPC_MVN_REG(FV, ra)); + else if ((imm > 0) == !sub) + EMIT(PPC_MOV_REG(FV, ra)); + } + // full V = Nd^Nt^Ns^C calculation is deferred until really needed + + if (rt && rt != FNZ) + EMIT(PPC_MOV_REG(rt, FNZ)); // N,Z via result value in FNZ + emith_cmp_ra = emith_cmp_rb = -1; +} + +// handle cmp separately by storing the involved regs for later use. +// this works for all conditions but VC/VS, but this is fortunately never used. 
+static void emith_set_compare_flags(int ra, int rb, s32 imm) +{ + emith_cmp_rb = rb; + emith_cmp_ra = ra; + emith_cmp_imm = imm; +} + + +// data processing, register + +#define emith_move_r_r_ptr(d, s) \ + EMIT(PPC_MOV_REG(d, s)) +#define emith_move_r_r_ptr_c(cond, d, s) \ + emith_move_r_r_ptr(d, s) + +#define emith_move_r_r(d, s) \ + emith_move_r_r_ptr(d, s) +#define emith_move_r_r_c(cond, d, s) \ + emith_move_r_r(d, s) + +#define emith_mvn_r_r(d, s) \ + EMIT(PPC_MVN_REG(d, s)) + +#define emith_add_r_r_r_lsl_ptr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(PPC_LSLW_IMM(AT, s2, simm)); \ + EMIT(PPC_ADD_REG(d, s1, AT)); \ + } else EMIT(PPC_ADD_REG(d, s1, s2)); \ +} while (0) +#define emith_add_r_r_r_lsl(d, s1, s2, simm) \ + emith_add_r_r_r_lsl_ptr(d, s1, s2, simm) + +#define emith_add_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(PPC_LSRW_IMM(AT, s2, simm)); \ + EMIT(PPC_ADD_REG(d, s1, AT)); \ + } else EMIT(PPC_ADD_REG(d, s1, s2)); \ +} while (0) + +#define emith_addf_r_r_r_lsl_ptr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(PPC_LSLW_IMM(AT, s2, simm)); \ + EMIT(PPC_ADD_REG(FNZ, s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 0); \ + } else { \ + EMIT(PPC_ADD_REG(FNZ, s1, s2)); \ + emith_set_arith_flags(d, s1, s2, 0, 0); \ + } \ +} while (0) +#define emith_addf_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(PPC_LSLW_IMM(AT, s2, simm)); \ + EMIT(PPC_ADD_REG(FNZ, s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 0); \ + } else { \ + EMIT(PPC_ADD_REG(FNZ, s1, s2)); \ + emith_set_arith_flags(d, s1, s2, 0, 0); \ + } \ +} while (0) + +#define emith_addf_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(PPC_LSRW_IMM(AT, s2, simm)); \ + EMIT(PPC_ADD_REG(FNZ, s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 0); \ + } else { \ + EMIT(PPC_ADD_REG(FNZ, s1, s2)); \ + emith_set_arith_flags(d, s1, s2, 0, 0); \ + } \ +} while (0) + +#define emith_sub_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(PPC_LSLW_IMM(AT, s2, simm)); \ + 
EMIT(PPC_SUB_REG(d, s1, AT)); \ + } else EMIT(PPC_SUB_REG(d, s1, s2)); \ +} while (0) + +#define emith_subf_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(PPC_LSLW_IMM(AT, s2, simm)); \ + EMIT(PPC_SUB_REG(FNZ, s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 1); \ + } else { \ + EMIT(PPC_SUB_REG(FNZ, s1, s2)); \ + emith_set_arith_flags(d, s1, s2, 0, 1); \ + } \ +} while (0) + +#define emith_or_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(PPC_LSLW_IMM(AT, s2, simm)); \ + EMIT(PPC_OR_REG(d, s1, AT)); \ + } else EMIT(PPC_OR_REG(d, s1, s2)); \ +} while (0) + +#define emith_or_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(PPC_LSRW_IMM(AT, s2, simm)); \ + EMIT(PPC_OR_REG(d, s1, AT)); \ + } else EMIT(PPC_OR_REG(d, s1, s2)); \ +} while (0) + +#define emith_eor_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(PPC_LSLW_IMM(AT, s2, simm)); \ + EMIT(PPC_XOR_REG(d, s1, AT)); \ + } else EMIT(PPC_XOR_REG(d, s1, s2)); \ +} while (0) + +#define emith_eor_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(PPC_LSRW_IMM(AT, s2, simm)); \ + EMIT(PPC_XOR_REG(d, s1, AT)); \ + } else EMIT(PPC_XOR_REG(d, s1, s2)); \ +} while (0) + +#define emith_and_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(PPC_LSLW_IMM(AT, s2, simm)); \ + EMIT(PPC_AND_REG(d, s1, AT)); \ + } else EMIT(PPC_AND_REG(d, s1, s2)); \ +} while (0) + +#define emith_or_r_r_lsl(d, s, lslimm) \ + emith_or_r_r_r_lsl(d, d, s, lslimm) +#define emith_or_r_r_lsr(d, s, lsrimm) \ + emith_or_r_r_r_lsr(d, d, s, lsrimm) + +#define emith_eor_r_r_lsl(d, s, lslimm) \ + emith_eor_r_r_r_lsl(d, d, s, lslimm) +#define emith_eor_r_r_lsr(d, s, lsrimm) \ + emith_eor_r_r_r_lsr(d, d, s, lsrimm) + +#define emith_add_r_r_r(d, s1, s2) \ + emith_add_r_r_r_lsl(d, s1, s2, 0) + +#define emith_addf_r_r_r_ptr(d, s1, s2) \ + emith_addf_r_r_r_lsl_ptr(d, s1, s2, 0) +#define emith_addf_r_r_r(d, s1, s2) \ + emith_addf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_sub_r_r_r(d, s1, s2) \ + emith_sub_r_r_r_lsl(d, 
s1, s2, 0) + +#define emith_subf_r_r_r(d, s1, s2) \ + emith_subf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_or_r_r_r(d, s1, s2) \ + emith_or_r_r_r_lsl(d, s1, s2, 0) + +#define emith_eor_r_r_r(d, s1, s2) \ + emith_eor_r_r_r_lsl(d, s1, s2, 0) + +#define emith_and_r_r_r(d, s1, s2) \ + emith_and_r_r_r_lsl(d, s1, s2, 0) + +#define emith_add_r_r_ptr(d, s) \ + emith_add_r_r_r_lsl_ptr(d, d, s, 0) +#define emith_add_r_r(d, s) \ + emith_add_r_r_r(d, d, s) + +#define emith_sub_r_r(d, s) \ + emith_sub_r_r_r(d, d, s) + +#define emith_neg_r_r(d, s) \ + EMIT(PPC_NEG_REG(d, s)) + +#define emith_adc_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(AT, s2, FC); \ + emith_add_r_r_r(d, s1, AT); \ +} while (0) + +#define emith_sbc_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(AT, s2, FC); \ + emith_sub_r_r_r(d, s1, AT); \ +} while (0) + +#define emith_adc_r_r(d, s) \ + emith_adc_r_r_r(d, d, s) + +#define emith_negc_r_r(d, s) do { \ + emith_neg_r_r(d, s); \ + emith_sub_r_r(d, FC); \ +} while (0) + +// NB: the incoming carry Cin can cause Cout if s2+Cin=0 (or s1+Cin=0 FWIW) +// moreover, if s2+Cin=0 caused Cout, s1+s2+Cin=s1+0 can't cause another Cout +#define emith_adcf_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(FNZ, s2, FC); \ + EMIT_PPC_SLTWU_REG(AT, FNZ, FC); \ + emith_add_r_r_r(FNZ, s1, FNZ); \ + emith_set_arith_flags(d, s1, s2, 0, 0); \ + emith_or_r_r(FC, AT); \ +} while (0) + +#define emith_sbcf_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(FNZ, s2, FC); \ + EMIT_PPC_SLTWU_REG(AT, FNZ, FC); \ + emith_sub_r_r_r(FNZ, s1, FNZ); \ + emith_set_arith_flags(d, s1, s2, 0, 1); \ + emith_or_r_r(FC, AT); \ +} while (0) + +#define emith_and_r_r(d, s) \ + emith_and_r_r_r(d, d, s) +#define emith_and_r_r_c(cond, d, s) \ + emith_and_r_r(d, s) + +#define emith_or_r_r(d, s) \ + emith_or_r_r_r(d, d, s) + +#define emith_eor_r_r(d, s) \ + emith_eor_r_r_r(d, d, s) + +#define emith_tst_r_r_ptr(d, s) do { \ + if (d != s) { \ + emith_and_r_r_r(FNZ, d, s); \ + emith_cmp_ra = emith_cmp_rb = -1; \ + } else emith_cmp_ra = s, 
emith_cmp_rb = -1, emith_cmp_imm = 0; \ +} while (0) +#define emith_tst_r_r(d, s) \ + emith_tst_r_r_ptr(d, s) + +#define emith_teq_r_r(d, s) do { \ + emith_eor_r_r_r(FNZ, d, s); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) + +#define emith_cmp_r_r(d, s) \ + emith_set_compare_flags(d, s, 0) +// emith_subf_r_r_r(FNZ, d, s) + +#define emith_addf_r_r(d, s) \ + emith_addf_r_r_r(d, d, s) + +#define emith_subf_r_r(d, s) \ + emith_subf_r_r_r(d, d, s) + +#define emith_adcf_r_r(d, s) \ + emith_adcf_r_r_r(d, d, s) + +#define emith_sbcf_r_r(d, s) \ + emith_sbcf_r_r_r(d, d, s) + +#define emith_negcf_r_r(d, s) do { \ + emith_add_r_r_r(FNZ, s, FC); \ + EMIT_PPC_SLTWU_REG(AT, FNZ, FC); \ + emith_neg_r_r(FNZ, FNZ); \ + emith_set_arith_flags(d, Z0, s, 0, 1); \ + emith_or_r_r(FC, AT); \ +} while (0) + +// move immediate + +static void emith_move_imm(int r, int ptr, uintptr_t imm) +{ +#ifdef __powerpc64__ + if (ptr && (s32)imm != imm) { + emith_move_imm(r, 0, imm >> 32); + if (imm >> 32) + EMIT(PPC_LSL_IMM(r, r, 32)); + if (imm & 0x0000ffff) + EMIT(PPC_OR_IMM(r, r, imm & 0x0000ffff)); + if (imm & 0xffff0000) + EMIT(PPC_ORT_IMM(r, r, (imm & 0xffff0000) >> 16)); + } else +#endif + if ((s16)imm != (s32)imm) { + EMIT(PPC_ADDT_IMM(r, Z0, (u16)(imm>>16))); + if ((s16)imm) + EMIT(PPC_OR_IMM(r, r, (u16)(imm))); + } else EMIT(PPC_ADD_IMM(r, Z0, (u16)imm)); +} + +#define emith_move_r_ptr_imm(r, imm) \ + emith_move_imm(r, 1, (uintptr_t)(imm)) + +#define emith_move_r_imm(r, imm) \ + emith_move_imm(r, 0, (u32)(imm)) +#define emith_move_r_imm_c(cond, r, imm) \ + emith_move_r_imm(r, imm) + +#define emith_move_r_imm_s8_patchable(r, imm) \ + EMIT(PPC_ADD_IMM(r, Z0, (s8)(imm))) +#define emith_move_r_imm_s8_patch(ptr, imm) do { \ + u32 *ptr_ = (u32 *)ptr; \ + EMIT_PTR(ptr_, (*ptr_ & 0xffff0000) | (u16)(s8)(imm)); \ +} while (0) + +// arithmetic, immediate - can only be ADDI, since SUBI doesn't exist + +static void emith_add_imm(int rt, int ra, u32 imm) +{ + int s = ra; + if ((u16)imm) { + 
EMIT(PPC_ADD_IMM(rt, s, (u16)imm)); + s = rt; + } + // adjust for sign extension in ADDI + imm = (imm >> 16) + ((s16)imm < 0); + if ((u16)imm || rt != s) + EMIT(PPC_ADDT_IMM(rt, s, (u16)imm)); +} + +// two-operand forms: r is both destination and first source operand. +// NB: the _c variants take cond for interface compatibility only and ignore it +#define emith_add_r_imm(r, imm) \ + emith_add_r_r_imm(r, r, imm) +#define emith_add_r_imm_c(cond, r, imm) \ + emith_add_r_imm(r, imm) + +// NB: emith_addf_r_r_imm takes (d, s, imm), so r has to be passed twice +#define emith_addf_r_imm(r, imm) \ + emith_addf_r_r_imm(r, r, imm) + +#define emith_sub_r_imm(r, imm) \ + emith_sub_r_r_imm(r, r, imm) +#define emith_sub_r_imm_c(cond, r, imm) \ + emith_sub_r_imm(r, imm) + +#define emith_subf_r_imm(r, imm) \ + emith_subf_r_r_imm(r, r, imm) + +#define emith_adc_r_imm(r, imm) \ + emith_adc_r_r_imm(r, r, imm) + +#define emith_adcf_r_imm(r, imm) \ + emith_adcf_r_r_imm(r, r, imm) + +#define emith_cmp_r_imm(r, imm) \ + emith_set_compare_flags(r, -1, imm) +// emith_subf_r_r_imm(FNZ, r, (s16)imm) + +#define emith_add_r_r_ptr_imm(d, s, imm) \ + emith_add_imm(d, s, imm) + +#define emith_add_r_r_imm(d, s, imm) \ + emith_add_r_r_ptr_imm(d, s, imm) + +// flag setting add: the sum goes to FNZ first, the flag helper then gets d +#define emith_addf_r_r_imm(d, s, imm) do { \ + emith_add_r_r_imm(FNZ, s, imm); \ + emith_set_arith_flags(d, s, -1, imm, 0); \ +} while (0) + +// add with carry-in: s+FC is built in AT before imm is added +#define emith_adc_r_r_imm(d, s, imm) do { \ + emith_add_r_r_r(AT, s, FC); \ + emith_add_r_r_imm(d, AT, imm); \ +} while (0) + + +// NB: for imm == 0 only the carry-in FC is added and the flag helper is called +// with an immediate of 1; otherwise AT (set by EMIT_PPC_SLTWU_REG from s+FC) +// is or'ed into FC after the flag helper ran +#define emith_adcf_r_r_imm(d, s, imm) do { \ + if (imm == 0) { \ + emith_add_r_r_r(FNZ, s, FC); \ + emith_set_arith_flags(d, s, -1, 1, 0); \ + } else { \ + emith_add_r_r_r(FNZ, s, FC); \ + EMIT_PPC_SLTWU_REG(AT, FNZ, FC); \ + emith_add_r_r_imm(FNZ, FNZ, imm); \ + emith_set_arith_flags(d, s, -1, imm, 0); \ + emith_or_r_r(FC, AT); \ + } \ +} while (0) + +// NB: no SUBI, since ADDI takes a signed imm +#define emith_sub_r_r_imm(d, s, imm) \ + emith_add_r_r_imm(d, s, -(imm)) +#define emith_sub_r_r_imm_c(cond, d, s, imm) \ + emith_sub_r_r_imm(d, s, imm) + +#define emith_subf_r_r_imm(d, s, imm) do { \ + emith_sub_r_r_imm(FNZ, s, imm); \ + emith_set_arith_flags(d, s, -1, imm, 1); \ +} while (0) + +// logical,
immediate + +// NB: values not fitting 16 bits (and every ANDI) are loaded into AT and use +// the register form; nothing at all is emitted for imm == 0 with rt == ra +#define emith_log_imm2(opi, opr, rt, ra, imm) do { \ + if ((imm) >> 16 || opi == OP_ANDI) { /* too big, or microcoded ANDI */ \ + emith_move_r_imm(AT, imm); \ + EMIT(PPC_OP_REG(OP__EXT, opr, ra, rt, AT)); \ + } else if (/*opi == OP_ANDI ||*/ imm || rt != ra) \ + EMIT(PPC_OP_IMM(opi, ra, rt, imm)); \ +} while (0) +#define emith_log_imm(op, rt, ra, imm) \ + emith_log_imm2(OP_##op##I, OPE_##op, rt, ra, imm) + +// NB: the _c variants below take cond for interface compatibility only +#define emith_and_r_imm(r, imm) \ + emith_log_imm(AND, r, r, imm) + +#define emith_or_r_imm(r, imm) \ + emith_log_imm(OR, r, r, imm) +#define emith_or_r_imm_c(cond, r, imm) \ + emith_or_r_imm(r, imm) + +#define emith_eor_r_imm_ptr(r, imm) \ + emith_log_imm(XOR, r, r, imm) +#define emith_eor_r_imm_ptr_c(cond, r, imm) \ + emith_eor_r_imm_ptr(r, imm) + +#define emith_eor_r_imm(r, imm) \ + emith_eor_r_imm_ptr(r, imm) +#define emith_eor_r_imm_c(cond, r, imm) \ + emith_eor_r_imm(r, imm) + +/* NB: BIC #imm not available; use AND #~imm instead */ +#define emith_bic_r_imm(r, imm) \ + emith_log_imm(AND, r, r, ~(imm)) +#define emith_bic_r_imm_c(cond, r, imm) \ + emith_bic_r_imm(r, imm) + +// the AND result goes to FNZ; the stored compare operands are invalidated +#define emith_tst_r_imm(r, imm) do { \ + emith_log_imm(AND, FNZ, r, imm); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) +#define emith_tst_r_imm_c(cond, r, imm) \ + emith_tst_r_imm(r, imm) + +#define emith_and_r_r_imm(d, s, imm) \ + emith_log_imm(AND, d, s, imm) + +#define emith_or_r_r_imm(d, s, imm) \ + emith_log_imm(OR, d, s, imm) + +#define emith_eor_r_r_imm(d, s, imm) \ + emith_log_imm(XOR, d, s, imm) + +// shift + +#define emith_lsl(d, s, cnt) \ + EMIT(PPC_LSLW_IMM(d, s, cnt)) + +#define emith_lsr(d, s, cnt) \ + EMIT(PPC_LSRW_IMM(d, s, cnt)) + +#define emith_asr(d, s, cnt) \ + EMIT(PPC_ASRW_IMM(d, s, cnt)) + +// rotate right is emitted as a rotate left by 32-cnt +#define emith_ror(d, s, cnt) \ + EMIT(PPC_ROLW_IMM(d, s, 32-(cnt))) +#define emith_ror_c(cond, d, s, cnt) \ + emith_ror(d, s, cnt) + +// NB: no trailing semicolon or continuation, so that this expands to a single +// expression just like emith_ror and stays usable in unbraced if/else +#define emith_rol(d, s, cnt) \ + EMIT(PPC_ROLW_IMM(d, s, cnt)) + +#define emith_rorc(d) do { \ + emith_lsr(d, d, 1); \ +
emith_lsl(AT, FC, 31); \ + emith_or_r_r(d, AT); \ +} while (0) + +#define emith_rolc(d) do { \ + emith_lsl(d, d, 1); \ + emith_or_r_r(d, FC); \ +} while (0) + +// NB: all flag setting shifts make V undefined +#define emith_lslf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_lsl(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_lsr(FC, _s, 31); \ + emith_lsl(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) + +#define emith_lsrf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_lsr(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_and_r_r_imm(FC, _s, 1); \ + emith_lsr(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) + +#define emith_asrf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_asr(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_and_r_r_imm(FC, _s, 1); \ + emith_asr(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) + +#define emith_rolf(d, s, cnt) do { \ + emith_rol(d, s, cnt); \ + emith_and_r_r_imm(FC, d, 1); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) + +#define emith_rorf(d, s, cnt) do { \ + emith_ror(d, s, cnt); \ + emith_lsr(FC, d, 31); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) + +#define emith_rolcf(d) do { \ + emith_lsr(AT, d, 31); \ + emith_lsl(d, d, 1); \ + emith_or_r_r(d, FC); \ + emith_move_r_r(FC, AT); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) + +#define emith_rorcf(d) do { \ + emith_and_r_r_imm(AT, d, 1); \ + emith_lsr(d, d, 1); \ + emith_lsl(FC, FC, 31); \ + emith_or_r_r(d, FC); \ + emith_move_r_r(FC, AT); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) + +// signed/unsigned extend + +#define emith_clear_msb(d, s, count) /* bits to clear */ \ + EMIT(PPC_BFXW_IMM(d, s, count, 
32-(count))) + +#define emith_clear_msb_c(cond, d, s, count) \ + emith_clear_msb(d, s, count) + +#define emith_sext(d, s, count) /* bits to keep */ do { \ + if (count == 8) \ + EMIT(PPC_EXTSB_REG(d, s)); \ + else if (count == 16) \ + EMIT(PPC_EXTSH_REG(d, s)); \ + else { \ + emith_lsl(d, s, 32-(count)); \ + emith_asr(d, d, 32-(count)); \ + } \ +} while (0) + +// multiply Rd = Rn*Rm (+ Ra) + +#define emith_mul(d, s1, s2) \ + EMIT(PPC_MUL(d, s1, s2)) + +#define emith_mul_u64(dlo, dhi, s1, s2) \ + EMIT_PPC_MULLU_REG(dlo, dhi, s1, s2) + +#define emith_mul_s64(dlo, dhi, s1, s2) \ + EMIT_PPC_MULLS_REG(dlo, dhi, s1, s2) + +#define emith_mula_s64(dlo, dhi, s1, s2) \ + EMIT_PPC_MACLS_REG(dlo, dhi, s1, s2) +#define emith_mula_s64_c(cond, dlo, dhi, s1, s2) \ + emith_mula_s64(dlo, dhi, s1, s2) + +// load/store. offs has 16 bits signed, which is currently sufficient +#define emith_read_r_r_offs_ptr(r, ra, offs) \ + EMIT(PPC_LDP_IMM(r, ra, offs)) +#define emith_read_r_r_offs_ptr_c(cond, r, ra, offs) \ + emith_read_r_r_offs_ptr(r, ra, offs) + +#define emith_read_r_r_offs(r, ra, offs) \ + EMIT(PPC_LDW_IMM(r, ra, offs)) +#define emith_read_r_r_offs_c(cond, r, ra, offs) \ + emith_read_r_r_offs(r, ra, offs) + +#define emith_read_r_r_r_ptr(r, ra, rm) \ + EMIT(PPC_LDP_REG(r, ra, rm)) + +#define emith_read_r_r_r(r, ra, rm) \ + EMIT(PPC_LDW_REG(r, ra, rm)) +#define emith_read_r_r_r_c(cond, r, ra, rm) \ + emith_read_r_r_r(r, ra, rm) + +#define emith_read8_r_r_offs(r, ra, offs) \ + EMIT(PPC_LDB_IMM(r, ra, offs)) +#define emith_read8_r_r_offs_c(cond, r, ra, offs) \ + emith_read8_r_r_offs(r, ra, offs) + +#define emith_read8_r_r_r(r, ra, rm) \ + EMIT(PPC_LDB_REG(r, ra, rm)) +#define emith_read8_r_r_r_c(cond, r, ra, rm) \ + emith_read8_r_r_r(r, ra, rm) + +#define emith_read16_r_r_offs(r, ra, offs) \ + EMIT(PPC_LDH_IMM(r, ra, offs)) +#define emith_read16_r_r_offs_c(cond, r, ra, offs) \ + emith_read16_r_r_offs(r, ra, offs) + +#define emith_read16_r_r_r(r, ra, rm) \ + EMIT(PPC_LDH_REG(r, ra, rm)) 
+#define emith_read16_r_r_r_c(cond, r, ra, rm) \ + emith_read16_r_r_r(r, ra, rm) + +#define emith_read8s_r_r_offs(r, ra, offs) do { \ + EMIT(PPC_LDB_IMM(r, ra, offs)); \ + EMIT(PPC_EXTSB_REG(r, r)); \ +} while (0) +#define emith_read8s_r_r_offs_c(cond, r, ra, offs) \ + emith_read8s_r_r_offs(r, ra, offs) + +#define emith_read8s_r_r_r(r, ra, rm) do { \ + EMIT(PPC_LDB_REG(r, ra, rm)); \ + EMIT(PPC_EXTSB_REG(r, r)); \ +} while (0) +#define emith_read8s_r_r_r_c(cond, r, ra, rm) \ + emith_read8s_r_r_r(r, ra, rm) + +#define emith_read16s_r_r_offs(r, ra, offs) do { \ + EMIT(PPC_LDH_IMM(r, ra, offs)); \ + EMIT(PPC_EXTSH_REG(r, r)); \ +} while (0) +#define emith_read16s_r_r_offs_c(cond, r, ra, offs) \ + emith_read16s_r_r_offs(r, ra, offs) + +#define emith_read16s_r_r_r(r, ra, rm) do { \ + EMIT(PPC_LDH_REG(r, ra, rm)); \ + EMIT(PPC_EXTSH_REG(r, r)); \ +} while (0) +#define emith_read16s_r_r_r_c(cond, r, ra, rm) \ + emith_read16s_r_r_r(r, ra, rm) + + +#define emith_write_r_r_offs_ptr(r, ra, offs) \ + EMIT(PPC_STP_IMM(r, ra, offs)) +#define emith_write_r_r_offs_ptr_c(cond, r, ra, offs) \ + emith_write_r_r_offs_ptr(r, ra, offs) + +#define emith_write_r_r_r_ptr(r, ra, rm) \ + EMIT(PPC_STP_REG(r, ra, rm)) +#define emith_write_r_r_r_ptr_c(cond, r, ra, rm) \ + emith_write_r_r_r_ptr(r, ra, rm) + +#define emith_write_r_r_offs(r, ra, offs) \ + EMIT(PPC_STW_IMM(r, ra, offs)) +#define emith_write_r_r_offs_c(cond, r, ra, offs) \ + emith_write_r_r_offs(r, ra, offs) + +#define emith_write_r_r_r(r, ra, rm) \ + EMIT(PPC_STW_REG(r, ra, rm)) +#define emith_write_r_r_r_c(cond, r, ra, rm) \ + emith_write_r_r_r(r, ra, rm) + +#define emith_ctx_read_ptr(r, offs) \ + emith_read_r_r_offs_ptr(r, CONTEXT_REG, offs) + +#define emith_ctx_read(r, offs) \ + emith_read_r_r_offs(r, CONTEXT_REG, offs) +#define emith_ctx_read_c(cond, r, offs) \ + emith_ctx_read(r, offs) + +#define emith_ctx_write_ptr(r, offs) \ + emith_write_r_r_offs_ptr(r, CONTEXT_REG, offs) + +#define emith_ctx_write(r, offs) \ + 
emith_write_r_r_offs(r, CONTEXT_REG, offs) + +// transfer cnt consecutive host registers starting at r from/to the context, +// 4 bytes apart beginning at offs; tmpr is not used by this implementation +#define emith_ctx_read_multiple(r, offs, cnt, tmpr) do { \ + int r_ = r, offs_ = offs, cnt_ = cnt; \ + for (; cnt_ > 0; r_++, offs_ += 4, cnt_--) \ + emith_ctx_read(r_, offs_); \ +} while (0) + +#define emith_ctx_write_multiple(r, offs, cnt, tmpr) do { \ + int r_ = r, offs_ = offs, cnt_ = cnt; \ + for (; cnt_ > 0; r_++, offs_ += 4, cnt_--) \ + emith_ctx_write(r_, offs_); \ +} while (0) + +// function call handling +// store the registers selected by mask (limited to r3-r12) below SP. bit 0 is +// only set to pad an odd register count; its slot is reserved but never +// written, since register 0 is skipped by the if (_c) test. +// NB: mask is parenthesized so that an expression argument is masked as a whole +#define emith_save_caller_regs(mask) do { \ + int _c, _z = PTR_SIZE; u32 _m = (mask) & 0x1ff8; /* r3-r12 */ \ + if (__builtin_parity(_m) == 1) _m |= 0x1; /* ABI align */ \ + int _s = count_bits(_m) * _z, _o = _s; \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, -_s); \ + for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + if (_m & (1 << _c)) \ + { _o -= _z; if (_c) emith_write_r_r_offs_ptr(_c, SP, _o); } \ +} while (0) + +// counterpart of emith_save_caller_regs: reload in ascending register order +// using the same slot layout, then release the stack space +#define emith_restore_caller_regs(mask) do { \ + int _c, _z = PTR_SIZE; u32 _m = (mask) & 0x1ff8; \ + if (__builtin_parity(_m) == 1) _m |= 0x1; \ + int _s = count_bits(_m) * _z, _o = 0; \ + for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ + if (_m & (1 << _c)) \ + { if (_c) emith_read_r_r_offs_ptr(_c, SP, _o); _o += _z; } \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, _s); \ +} while (0) + +// argument number n lives in host register n+3 +#define host_arg2reg(rt, arg) \ + rt = ((arg)+3) + +#define emith_pass_arg_r(arg, reg) \ + emith_move_r_r(arg, reg) + +#define emith_pass_arg_imm(arg, imm) \ + emith_move_r_imm(arg, imm) + +// branching +#define emith_invert_branch(cond) /* inverted conditional branch */ \ + ((cond) ^ 0x40) + +// evaluate the emulated condition, returns a register/branch type pair +static int emith_cmpr_check(int rs, int rt, int cond, u32 *op) +{ + int b = -1; + + // condition check for comparing 2 registers + switch (cond) { + case DCOND_EQ: *op = PPC_CMPW_REG(rs, rt); b = BEQ; break; + case DCOND_NE: *op = PPC_CMPW_REG(rs, rt); b = BNE; break; + case DCOND_LO: *op = PPC_CMPLW_REG(rs, rt); b = BLT; break; + case
DCOND_HS: *op = PPC_CMPLW_REG(rs, rt); b = BGE; break; + case DCOND_LS: *op = PPC_CMPLW_REG(rs, rt); b = BLE; break; + case DCOND_HI: *op = PPC_CMPLW_REG(rs, rt); b = BGT; break; + case DCOND_LT: *op = PPC_CMPW_REG(rs, rt); b = BLT; break; + case DCOND_GE: *op = PPC_CMPW_REG(rs, rt); b = BGE; break; + case DCOND_LE: *op = PPC_CMPW_REG(rs, rt); b = BLE; break; + case DCOND_GT: *op = PPC_CMPW_REG(rs, rt); b = BGT; break; + } + + return b; +} + +static int emith_cmpi_check(int rs, s32 imm, int cond, u32 *op) +{ + int b = -1; + + // condition check for comparing register with immediate + switch (cond) { + case DCOND_EQ: *op = PPC_CMPW_IMM(rs, (u16)imm), b = BEQ; break; + case DCOND_NE: *op = PPC_CMPW_IMM(rs, (u16)imm), b = BNE; break; + case DCOND_LO: *op = PPC_CMPLW_IMM(rs, (u16)imm), b = BLT; break; + case DCOND_HS: *op = PPC_CMPLW_IMM(rs, (u16)imm), b = BGE; break; + case DCOND_LS: *op = PPC_CMPLW_IMM(rs, (u16)imm), b = BLE; break; + case DCOND_HI: *op = PPC_CMPLW_IMM(rs, (u16)imm), b = BGT; break; + case DCOND_LT: *op = PPC_CMPW_IMM(rs, (u16)imm), b = BLT; break; + case DCOND_GE: *op = PPC_CMPW_IMM(rs, (u16)imm), b = BGE; break; + case DCOND_LE: *op = PPC_CMPW_IMM(rs, (u16)imm), b = BLE; break; + case DCOND_GT: *op = PPC_CMPW_IMM(rs, (u16)imm), b = BGT; break; + } + + return b; +} + +static int emith_cond_check(int cond) +{ + int b = -1; + u32 op = 0; + + if (emith_cmp_ra >= 0) { + if (emith_cmp_rb != -1) + b = emith_cmpr_check(emith_cmp_ra,emith_cmp_rb, cond,&op); + else b = emith_cmpi_check(emith_cmp_ra,emith_cmp_imm,cond,&op); + } + + // shortcut for V known to be 0 + if (b < 0 && emith_flg_noV) switch (cond) { + case DCOND_VS: /* no branch */ break; // never + case DCOND_VC: b = BXX; break; // always + case DCOND_LT: op = PPC_CMPW_IMM(FNZ, 0); b = BLT; break; // N + case DCOND_GE: op = PPC_CMPW_IMM(FNZ, 0); b = BGE; break; // !N + case DCOND_LE: op = PPC_CMPW_IMM(FNZ, 0); b = BLE; break; // N || Z + case DCOND_GT: op = PPC_CMPW_IMM(FNZ, 0); b = BGT; break; // 
!N && !Z + } + + // the full monty if no shortcut + if (b < 0) switch (cond) { + // conditions using NZ + case DCOND_EQ: op = PPC_CMPW_IMM(FNZ, 0); b = BEQ; break; // Z + case DCOND_NE: op = PPC_CMPW_IMM(FNZ, 0); b = BNE; break; // !Z + case DCOND_MI: op = PPC_CMPW_IMM(FNZ, 0); b = BLT; break; // N + case DCOND_PL: op = PPC_CMPW_IMM(FNZ, 0); b = BGE; break; // !N + // conditions using C + case DCOND_LO: op = PPC_CMPW_IMM(FC , 0); b = BNE; break; // C + case DCOND_HS: op = PPC_CMPW_IMM(FC , 0); b = BEQ; break; // !C + // conditions using CZ + case DCOND_LS: // C || Z + case DCOND_HI: // !C && !Z + EMIT(PPC_ADD_IMM(AT, FC, -1)); // !C && !Z + EMIT(PPC_AND_REG(AT, FNZ, AT)); + op = PPC_CMPW_IMM(AT , 0); b = (cond == DCOND_HI ? BNE : BEQ); + break; + + // conditions using V + case DCOND_VS: // V + case DCOND_VC: // !V + EMIT(PPC_XOR_REG(AT, FV, FNZ)); // V = Nt^Ns^Nd^C + EMIT(PPC_LSRW_IMM(AT, AT, 31)); + EMIT(PPC_XOR_REG(AT, AT, FC)); + op = PPC_CMPW_IMM(AT , 0); b = (cond == DCOND_VS ? BNE : BEQ); + break; + // conditions using VNZ + case DCOND_LT: // N^V + case DCOND_GE: // !(N^V) + EMIT(PPC_LSRW_IMM(AT, FV, 31)); // Nd^V = Nt^Ns^C + EMIT(PPC_XOR_REG(AT, FC, AT)); + op = PPC_CMPW_IMM(AT , 0); b = (cond == DCOND_LT ? BNE : BEQ); + break; + case DCOND_LE: // (N^V) || Z + case DCOND_GT: // !(N^V) && !Z + EMIT(PPC_LSRW_IMM(AT, FV, 31)); // Nd^V = Nt^Ns^C + EMIT(PPC_XOR_REG(AT, FC, AT)); + EMIT(PPC_ADD_IMM(AT, AT, -1)); // !(Nd^V) && !Z + EMIT(PPC_AND_REG(AT, FNZ, AT)); + op = PPC_CMPW_IMM(AT , 0); b = (cond == DCOND_GT ? 
BNE : BEQ); + break; + } + + if (op) EMIT(op); + return b; +} + +#define emith_jump(target) do { \ + u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \ + EMIT(PPC_B((uintptr_t)disp_ & 0x03ffffff)); \ +} while (0) +#define emith_jump_patchable(target) \ + emith_jump(target) + +// NB: PPC conditional branches have only +/- 64KB range +#define emith_jump_cond(cond, target) do { \ + int mcond_ = emith_cond_check(cond); \ + u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \ + if (mcond_ >= 0) EMIT(PPC_BCOND(mcond_,disp_ & 0x0000ffff)); \ +} while (0) +#define emith_jump_cond_patchable(cond, target) \ + emith_jump_cond(cond, target) + +#define emith_jump_cond_inrange(target) \ + ((u8 *)target - (u8 *)tcache_ptr < 0x8000 && \ + (u8 *)target - (u8 *)tcache_ptr >= -0x8000+0x10) //mind cond_check + +// NB: returns position of patch for cache maintenance +#define emith_jump_patch(ptr, target, pos) do { \ + u32 *ptr_ = (u32 *)ptr; /* must skip condition check code */ \ + u32 disp_, mask_; \ + while (*ptr_>>26 != OP_BC && *ptr_>>26 != OP_B) ptr_ ++; \ + disp_ = (u8 *)target - (u8 *)ptr_; \ + mask_ = (*ptr_>>26 == OP_BC ? 
0xffff0003 : 0xfc000003); \ + EMIT_PTR(ptr_, (*ptr_ & mask_) | (disp_ & ~mask_)); \ + if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)(ptr_-1); \ +} while (0) + +#define emith_jump_patch_inrange(ptr, target) \ + ((u8 *)target - (u8 *)ptr < 0x8000 && \ + (u8 *)target - (u8 *)ptr >= -0x8000+0x10) // mind cond_check +#define emith_jump_patch_size() 4 + +#define emith_jump_at(ptr, target) do { \ + u32 disp_ = (u8 *)target - (u8 *)ptr; \ + u32 *ptr_ = (u32 *)ptr; \ + EMIT_PTR(ptr_, PPC_B((uintptr_t)disp_ & 0x03ffffff)); \ +} while (0) +#define emith_jump_at_size() 4 + +#define emith_jump_reg(r) do { \ + EMIT(PPC_MTSP_REG(r, CTR)); \ + EMIT(PPC_BCTRCOND(BXX)); \ +} while(0) +#define emith_jump_reg_c(cond, r) \ + emith_jump_reg(r) + +#define emith_jump_ctx(offs) do { \ + emith_ctx_read_ptr(AT, offs); \ + emith_jump_reg(AT); \ +} while (0) +#define emith_jump_ctx_c(cond, offs) \ + emith_jump_ctx(offs) + +#define emith_call(target) do { \ + u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \ + EMIT(PPC_BL((uintptr_t)disp_ & 0x03ffffff)); \ +} while(0) +#define emith_call_cond(cond, target) \ + emith_call(target) + +#define emith_call_reg(r) do { \ + EMIT(PPC_MTSP_REG(r, CTR)); \ + EMIT(PPC_BLCTRCOND(BXX)); \ +} while(0) + +#define emith_call_ctx(offs) do { \ + emith_ctx_read_ptr(AT, offs); \ + emith_call_reg(AT); \ +} while (0) + +#define emith_call_cleanup() /**/ + +#define emith_ret() \ + EMIT(PPC_RET()) +#define emith_ret_c(cond) \ + emith_ret() + +#define emith_ret_to_ctx(offs) do { \ + EMIT(PPC_MFSP_REG(AT, LR)); \ + emith_ctx_write_ptr(AT, offs); \ +} while (0) + +#define emith_add_r_ret(r) do { \ + EMIT(PPC_MFSP_REG(AT, LR)); \ + emith_add_r_r_ptr(r, AT); \ +} while (0) + +// NB: ABI SP alignment is 16 in 64 bit mode +#define emith_push_ret(r) do { \ + int offs_ = 16 - 2*PTR_SIZE; \ + emith_add_r_r_ptr_imm(SP, SP, -16); \ + EMIT(PPC_MFSP_REG(AT, LR)); \ + emith_write_r_r_offs_ptr(AT, SP, offs_ + PTR_SIZE); \ + if ((r) > 0) emith_write_r_r_offs(r, SP, offs_); \ +} while 
(0) + +#define emith_pop_and_ret(r) do { \ + int offs_ = 16 - 2*PTR_SIZE; \ + if ((r) > 0) emith_read_r_r_offs(r, SP, offs_); \ + emith_read_r_r_offs_ptr(AT, SP, offs_ + PTR_SIZE); \ + EMIT(PPC_MTSP_REG(AT, LR)); \ + emith_add_r_r_ptr_imm(SP, SP, 16); \ + emith_ret(); \ +} while (0) + + +// emitter ABI stuff +#define emith_pool_check() /**/ +#define emith_pool_commit(j) /**/ +#define emith_insn_ptr() ((u8 *)tcache_ptr) +#define emith_flush() /**/ +#define host_instructions_updated(base, end) __builtin___clear_cache(base, end) +#define emith_update_cache() /**/ +#define emith_rw_offs_max() 0x7fff + +// SH2 drc specific +#define STACK_EXTRA ((8+6)*PTR_SIZE) // Param, ABI (LR,CR,FP etc) save areas +#define emith_sh2_drc_entry() do { \ + int _c, _z = PTR_SIZE; u32 _m = 0xffffc000; /* r14-r31 */ \ + if (__builtin_parity(_m) == 1) _m |= 0x1; /* ABI align for SP is 16 */ \ + int _s = count_bits(_m) * _z, _o = 0; \ + for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + if (_m & (1 << _c)) \ + { _o -= _z; if (_c) emith_write_r_r_offs_ptr(_c, SP, _o); } \ + EMIT(PPC_MFSP_REG(10, LR)); \ + emith_write_r_r_offs_ptr(10, SP, 2*PTR_SIZE); \ + emith_write_r_r_offs_ptr(SP, SP, -_s-STACK_EXTRA); /* XXX stdu */ \ + emith_add_r_r_ptr_imm(SP, SP, -_s-STACK_EXTRA); \ +} while (0) +#define emith_sh2_drc_exit() do { \ + int _c, _z = PTR_SIZE; u32 _m = 0xffffc000; \ + if (__builtin_parity(_m) == 1) _m |= 0x1; \ + int _s = count_bits(_m) * _z, _o = STACK_EXTRA; \ + for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ + if (_m & (1 << _c)) \ + { if (_c) emith_read_r_r_offs_ptr(_c, SP, _o); _o += _z; } \ + emith_add_r_r_ptr_imm(SP, SP, _s+STACK_EXTRA); \ + emith_read_r_r_offs_ptr(10, SP, 2*PTR_SIZE); \ + EMIT(PPC_MTSP_REG(10, LR)); \ + emith_ret(); \ +} while (0) + +// NB: assumes a is in arg0, tab, func and mask are temp +#define emith_sh2_rcall(a, tab, func, mask) do { \ + emith_lsr(mask, a, SH2_READ_SHIFT); \ + emith_add_r_r_r_lsl_ptr(tab, tab, mask, PTR_SCALE+1); \ + 
emith_read_r_r_offs_ptr(func, tab, 0); \ + emith_read_r_r_offs(mask, tab, PTR_SIZE); \ + EMIT(PPC_BFXP_IMM(FC, func, 0, 1)); \ + emith_add_r_r_ptr(func, func); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) + +// NB: assumes a, val are in arg0 and arg1, tab and func are temp +#define emith_sh2_wcall(a, val, tab, func) do { \ + emith_lsr(func, a, SH2_WRITE_SHIFT); \ + emith_lsl(func, func, PTR_SCALE); \ + emith_read_r_r_r_ptr(func, tab, func); \ + emith_move_r_r_ptr(5, CONTEXT_REG); /* arg2 */ \ + emith_jump_reg(func); \ +} while (0) + +#define emith_sh2_delay_loop(cycles, reg) do { \ + int sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); \ + int t1 = rcache_get_tmp(); \ + int t2 = rcache_get_tmp(); \ + int t3 = rcache_get_tmp(); \ + /* if (sr < 0) return */ \ + emith_cmp_r_imm(sr, 0); \ + EMITH_JMP_START(DCOND_LE); \ + /* turns = sr.cycles / cycles */ \ + emith_asr(t2, sr, 12); \ + emith_move_r_imm(t3, (u32)((1ULL<<32) / (cycles)) + 1); \ + emith_mul_u64(t1, t2, t2, t3); /* multiply by 1/x */ \ + rcache_free_tmp(t3); \ + if (reg >= 0) { \ + /* if (reg <= turns) turns = reg-1 */ \ + t3 = rcache_get_reg(reg, RC_GR_RMW, NULL); \ + emith_cmp_r_r(t3, t2); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_sub_r_r_imm_c(DCOND_LS, t2, t3, 1); \ + EMITH_SJMP_END(DCOND_HI); \ + /* if (reg <= 1) turns = 0 */ \ + emith_cmp_r_imm(t3, 1); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_move_r_imm_c(DCOND_LS, t2, 0); \ + EMITH_SJMP_END(DCOND_HI); \ + /* reg -= turns */ \ + emith_sub_r_r(t3, t2); \ + } \ + /* sr.cycles -= turns * cycles; */ \ + emith_move_r_imm(t1, cycles); \ + emith_mul(t1, t2, t1); \ + emith_sub_r_r_r_lsl(sr, sr, t1, 12); \ + EMITH_JMP_END(DCOND_LE); \ + rcache_free_tmp(t1); \ + rcache_free_tmp(t2); \ +} while (0) + +/* + * T = !carry(Rn = (Rn << 1) | T) + * if Q + * C = carry(Rn += Rm) + * else + * C = carry(Rn -= Rm) + * T ^= C + */ +#define emith_sh2_div1_step(rn, rm, sr) do { \ + int t_ = rcache_get_tmp(); \ + emith_and_r_r_imm(AT, sr, T); \ + emith_lsr(FC, rn, 31); 
/*Rn = (Rn<<1)+T*/ \ + emith_lsl(t_, rn, 1); \ + emith_or_r_r(t_, AT); \ + emith_or_r_imm(sr, T); /* T = !carry */ \ + emith_eor_r_r(sr, FC); \ + emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ + EMITH_JMP3_START(DCOND_EQ); \ + emith_add_r_r_r(rn, t_, rm); \ + EMIT(PPC_CMPLW_REG(rn, t_)); \ + EMITH_JMP3_MID(DCOND_EQ); \ + emith_sub_r_r_r(rn, t_, rm); \ + EMIT(PPC_CMPLW_REG(t_, rn)); \ + EMITH_JMP3_END(); \ + EMIT(PPC_MFCR_REG(FC)); \ + EMIT(PPC_BFXW_IMM(FC, FC, 0, 1)); \ + emith_eor_r_r(sr, FC); /* T ^= carry */ \ + rcache_free_tmp(t_); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macl(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* MACH top 16 bits unused if saturated. sign ext for overfl detect */ \ + emith_sext(mh, mh, 16); \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 17 bits of MACH aren't all 1 or 0 */ \ + /* to check: add MACH >> 31 to MACH >> 15. this is 0 if no overflow */ \ + emith_asr(rn, mh, 15); \ + emith_add_r_r_r_lsr(rn, rn, mh, 31); /* sum = (MACH>>31)+(MACH>>15) */ \ + emith_tst_r_r(rn, rn); /* (need only N and Z flags) */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> ov */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x0000); /* -overflow */ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x8000); \ + EMITH_SJMP_START(DCOND_PL); /* sum > 0 -> +ovl */ \ + emith_sub_r_imm_c(DCOND_MI, ml, 1); /* 0xffffffff */ \ + emith_sub_r_imm_c(DCOND_MI, mh, 1); /* 0x00007fff */ \ + EMITH_SJMP_END(DCOND_PL); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macw(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* XXX: MACH should be untouched when S is set? 
*/ \ + emith_asr(mh, ml, 31); /* sign ext MACL to MACH for ovrfl check */ \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 33 bits of MACH:MACL aren't all 1 or 0 */ \ + /* to check: add MACL[31] to MACH. this is 0 if no overflow */ \ + emith_lsr(rn, ml, 31); \ + emith_add_r_r(rn, mh); /* sum = MACH + ((MACL>>31)&1) */ \ + emith_tst_r_r(rn, rn); /* (need only N and Z flags) */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> overflow */ \ + /* XXX: LSB signalling only in SH1, or in SH2 too? */ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x00000001); /* LSB of MACH */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x80000000); /* negative ovrfl */ \ + EMITH_SJMP_START(DCOND_PL); /* sum > 0 -> positive ovrfl */ \ + emith_sub_r_imm_c(DCOND_MI, ml, 1); /* 0x7fffffff */ \ + EMITH_SJMP_END(DCOND_PL); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +#define emith_write_sr(sr, srcr) \ + EMIT(PPC_BFIW_IMM(sr, srcr, 22, 10)) + +#define emith_carry_to_t(sr, is_sub) \ + EMIT(PPC_BFIW_IMM(sr, FC, 32-__builtin_ffs(T), 1)) + +#define emith_t_to_carry(sr, is_sub) \ + emith_and_r_r_imm(FC, sr, 1) + +#define emith_tpop_carry(sr, is_sub) do { \ + emith_and_r_r_imm(FC, sr, 1); \ + emith_eor_r_r(sr, FC); \ +} while (0) + +#define emith_tpush_carry(sr, is_sub) \ + emith_or_r_r(sr, FC) + +#ifdef T +#define emith_invert_cond(cond) \ + ((cond) ^ 1) + +// T bit handling +static void emith_set_t_cond(int sr, int cond) +{ + int b; + + // catch never and always cases + if ((b = emith_cond_check(cond)) < 0) + return; + else if (b == BXX) { + emith_or_r_imm(sr, T); + return; + } + + // extract bit from CR and insert into T + EMIT(PPC_MFCR_REG(AT)); + EMIT(PPC_BFXW_IMM(AT, AT, (b&7), 1)); + if (!(b & 0x40)) EMIT(PPC_XOR_IMM(AT, AT, 1)); + EMIT(PPC_BFIW_IMM(sr, AT, 32-__builtin_ffs(T), 1)); +} + +#define emith_clr_t_cond(sr) ((void)sr) + +#define emith_get_t_cond() -1 + +#define 
emith_sync_t(sr) ((void)sr) + +#define emith_invalidate_t() + +static void emith_set_t(int sr, int val) +{ + if (val) + emith_or_r_imm(sr, T); + else + emith_bic_r_imm(sr, T); +} + +static int emith_tst_t(int sr, int tf) +{ + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; +} +#endif diff --git a/cpu/drc/emit_riscv.c b/cpu/drc/emit_riscv.c new file mode 100644 index 000000000..de99d4fd0 --- /dev/null +++ b/cpu/drc/emit_riscv.c @@ -0,0 +1,1659 @@ +/* + * Basic macros to emit RISC-V RV64IM instructions and some utils + * Copyright (C) 2019 kub + * + * This work is licensed under the terms of MAME license. + * See COPYING file in the top-level directory. + */ +#define HOST_REGS 32 + +// RISC-V ABI: params: x10-x17, return: x10-x11, temp: x1(ra),x5-x7,x28-x31 +// saved: x8(fp),x9,x18-x27, reserved: x0(zero), x4(tp), x3(gp), x2(sp) +// x28-x31(t3-t6) are used internally by the code emitter +#define RET_REG 10 // a0 +#define PARAM_REGS { 10, 11, 12, 13, 14, 15, 16, 17 } // a0-a7 +#define PRESERVED_REGS { 9, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 } // s1-s11 +#define TEMPORARY_REGS { 5, 6, 7 } // t0-t2 + +#define CONTEXT_REG 9 // s1 +#define STATIC_SH2_REGS { SHR_SR,27 , SHR_R(0),26 , SHR_R(1),25 } + +// registers usable for user code: r1-r25, others reserved or special +#define Z0 0 // zero register +#define GP 3 // global pointer +#define SP 2 // stack pointer +#define FP 8 // frame pointer +#define LR 1 // link register +// internally used by code emitter: +#define AT 31 // used to hold intermediate results +#define FNZ 30 // emulated processor flags: N (bit 31) ,Z (all bits) +#define FC 29 // emulated processor flags: C (bit 0), others 0 +#define FV 28 // emulated processor flags: Nt^Ns (bit 31). others x + +// All operations but ptr ops are using the lower 32 bits of the registers. +// The upper 32 bits always contain the sign extension from the lower 32 bits. 
+ +// unified conditions; virtual, not corresponding to anything real on RISC-V +#define DCOND_EQ 0x0 +#define DCOND_NE 0x1 +#define DCOND_HS 0x2 +#define DCOND_LO 0x3 +#define DCOND_MI 0x4 +#define DCOND_PL 0x5 +#define DCOND_VS 0x6 +#define DCOND_VC 0x7 +#define DCOND_HI 0x8 +#define DCOND_LS 0x9 +#define DCOND_GE 0xa +#define DCOND_LT 0xb +#define DCOND_GT 0xc +#define DCOND_LE 0xd + +#define DCOND_CS DCOND_LO +#define DCOND_CC DCOND_HS + +// unified insn +#define R5_INSN(b25, b20, b15, b12, b7, op) \ + (((b25)<<25)|((b20)<<20)|((b15)<<15)|((b12)<<12)|((b7)<<7)|((op)<<0)) + +#define _ 0 //marker for "field unused" +#define _CB(v,l,s,d) ((((v)>>(s))&((1<<(l))-1))<<(d)) // copy l bits + +#define R5_R_INSN(op, f1, f2, rd, rs, rt) \ + R5_INSN(f2, rt, rs, f1, rd, op) +#define R5_I_INSN(op, f1, rd, rs, imm) \ + R5_INSN(_, _CB(imm,12,0,0), rs, f1, rd, op) +#define R5_S_INSN(op, f1, rt, rs, imm) \ + R5_INSN(_CB(imm,7,5,0), rt, rs, f1, _CB(imm,5,0,0), op) +#define R5_U_INSN(op, rd, imm) \ + R5_INSN(_,_,_, _CB(imm,20,12,0), rd, op) +// oy vey... 
R5 immediate encoding in branches is really unwieldy :-/ +#define R5_B_INSN(op, f1, rt, rs, imm) \ + R5_INSN(_CB(imm,1,12,6)|_CB(imm,6,5,0), rt, rs, f1, \ + _CB(imm,4,1,1)|_CB(imm,1,11,0), op) +#define R5_J_INSN(op, rd, imm) \ + R5_INSN(_CB(imm,1,20,6)|_CB(imm,6,5,0), _CB(imm,4,1,1)|_CB(imm,1,11,0),\ + _CB(imm,8,12,0), rd, op) + +// opcode +enum { OP_LUI=0x37, OP_AUIPC=0x17, OP_JAL=0x6f, // 20-bit immediate + OP_JALR=0x67, OP_BCOND=0x63, OP_LD=0x03, OP_ST=0x23, // 12-bit immediate + OP_IMM=0x13, OP_REG=0x33, OP_IMM32=0x1b, OP_REG32=0x3b }; +// func3 +enum { F1_ADD, F1_SL, F1_SLT, F1_SLTU, F1_XOR, F1_SR, F1_OR, F1_AND };// IMM/REG +enum { F1_MUL, F1_MULH, F1_MULHSU, F1_MULHU, F1_DIV, F1_DIVU, F1_REM, F1_REMU }; +enum { F1_BEQ, F1_BNE, F1_BLT=4, F1_BGE, F1_BLTU, F1_BGEU }; // BCOND +enum { F1_B, F1_H, F1_W, F1_D, F1_BU, F1_HU, F1_WU }; // LD/ST +// func7 +enum { F2_ALT=0x20, F2_MULDIV=0x01 }; + +#define R5_NOP R5_I_INSN(OP_IMM, F1_ADD, Z0, Z0, 0) // nop: ADDI r0, r0, #0 + +// arithmetic/logical + +// rd = rs OP rt +#define R5_ADD_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_ADD, _, rd, rs, rt) +#define R5_SUB_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_ADD, F2_ALT, rd, rs, rt) + +#define R5_NEG_REG(rd, rt) \ + R5_SUB_REG(rd, Z0, rt) + +#define R5_XOR_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_XOR, _, rd, rs, rt) +#define R5_OR_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_OR , _, rd, rs, rt) +#define R5_AND_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_AND, _, rd, rs, rt) + +// rd = rs SHIFT rt +#define R5_LSL_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_SL , _, rd, rs, rt) +#define R5_LSR_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_SR , _, rd, rs, rt) +#define R5_ASR_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_SR , F2_ALT, rd, rs, rt) + +// rd = (rs < rt) +#define R5_SLT_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_SLT, _, rd, rs, rt) +#define R5_SLTU_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_SLTU,_, rd, rs, rt) + +// rd = rs OP imm12 +#define R5_ADD_IMM(rd, rs, imm12) \ + R5_I_INSN(OP_IMM, F1_ADD , 
rd, rs, imm12) + +#define R5_XOR_IMM(rd, rs, imm12) \ + R5_I_INSN(OP_IMM, F1_XOR , rd, rs, imm12) +#define R5_OR_IMM(rd, rs, imm12) \ + R5_I_INSN(OP_IMM, F1_OR , rd, rs, imm12) +#define R5_AND_IMM(rd, rs, imm12) \ + R5_I_INSN(OP_IMM, F1_AND , rd, rs, imm12) + +#define R5_MOV_REG(rd, rs) \ + R5_ADD_IMM(rd, rs, 0) +#define R5_MVN_REG(rd, rs) \ + R5_XOR_IMM(rd, rs, -1) + +// rd = (imm12 << (0|12)) +#define R5_MOV_IMM(rd, imm12) \ + R5_OR_IMM(rd, Z0, imm12) +#define R5_MOVT_IMM(rd, imm20) \ + R5_U_INSN(OP_LUI, rd, imm20) +#define R5_MOVA_IMM(rd, imm20) \ + R5_U_INSN(OP_AUIPC, rd, imm20) + +// rd = rs SHIFT imm5/imm6 +#define R5_LSL_IMM(rd, rs, bits) \ + R5_R_INSN(OP_IMM, F1_SL , _, rd, rs, bits) +#define R5_LSR_IMM(rd, rs, bits) \ + R5_R_INSN(OP_IMM, F1_SR , _, rd, rs, bits) +#define R5_ASR_IMM(rd, rs, bits) \ + R5_R_INSN(OP_IMM, F1_SR , F2_ALT, rd, rs, bits) + +// rd = (rs < imm12) +#define R5_SLT_IMM(rd, rs, imm12) \ + R5_I_INSN(OP_IMM, F1_SLT , rd, rs, imm12) +#define R5_SLTU_IMM(rd, rs, imm12) \ + R5_I_INSN(OP_IMM, F1_SLTU, rd, rs, imm12) + +// multiplication + +#define R5_MULHU(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_MULHU, F2_MULDIV, rd, rs, rt) +#define R5_MULHS(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_MULH, F2_MULDIV, rd, rs, rt) +#define R5_MUL(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_MUL, F2_MULDIV, rd, rs, rt) + +// branching + +#define R5_J(imm20) \ + R5_J_INSN(OP_JAL, Z0, imm20) +#define R5_JAL(rd, imm20) \ + R5_J_INSN(OP_JAL, rd, imm20) +#define R5_JR(rs, offs12) \ + R5_I_INSN(OP_JALR, _, Z0, rs, offs12) +#define R5_JALR(rd, rs, offs12) \ + R5_I_INSN(OP_JALR, _, rd, rs, offs12) + +// conditional branches; no condition code, these compare rs against rt +#define R5_BCOND(cond, rs, rt, offs13) \ + R5_B_INSN(OP_BCOND, cond, rt, rs, offs13) +#define R5_BCONDZ(cond, rs, offs13) \ + R5_B_INSN(OP_BCOND, cond, Z0, rs, offs13) +#define R5_B(offs13) \ + R5_BCOND(F1_BEQ, Z0, Z0, offs13) + +// load/store indexed base + +#define R5_LW(rd, rs, offs12) \ + R5_I_INSN(OP_LD, F1_W, 
rd, rs, offs12) +#define R5_LH(rd, rs, offs12) \ + R5_I_INSN(OP_LD, F1_H, rd, rs, offs12) +#define R5_LB(rd, rs, offs12) \ + R5_I_INSN(OP_LD, F1_B, rd, rs, offs12) +#define R5_LHU(rd, rs, offs12) \ + R5_I_INSN(OP_LD, F1_HU, rd, rs, offs12) +#define R5_LBU(rd, rs, offs12) \ + R5_I_INSN(OP_LD, F1_BU, rd, rs, offs12) + +#define R5_SW(rt, rs, offs12) \ + R5_S_INSN(OP_ST, F1_W, rt, rs, offs12) +#define R5_SH(rt, rs, offs12) \ + R5_S_INSN(OP_ST, F1_H, rt, rs, offs12) +#define R5_SB(rt, rs, offs12) \ + R5_S_INSN(OP_ST, F1_B, rt, rs, offs12) + +// pointer operations + +#if __riscv_xlen == 64 +#define R5_OP32 (OP_REG32 ^ OP_REG) +#define F1_P F1_D +#define PTR_SCALE 3 + +// NB: must split 64 bit result into 2 32 bit registers +// NB: expects 32 bit values in s1+s2, correctly sign extended to 64 bits +#define EMIT_R5_MULLU_REG(dlo, dhi, s1, s2) do { \ + EMIT(R5_MUL(dlo, s1, s2)); \ + EMIT(R5_ASR_IMM(dhi, dlo, 32)); \ + EMIT(R5_ADDW_IMM(dlo, dlo, 0)); \ +} while (0) + +#define EMIT_R5_MULLS_REG(dlo, dhi, s1, s2) \ + EMIT_R5_MULLU_REG(dlo, dhi, s1, s2) +#else +#define R5_OP32 0 +#define F1_P F1_W +#define PTR_SCALE 2 + +#define EMIT_R5_MULLU_REG(dlo, dhi, s1, s2) do { \ + int at = (dhi == s1 || dhi == s2 ? AT : dhi); \ + EMIT(R5_MULHU(at, s1, s2)); \ + EMIT(R5_MUL(dlo, s1, s2)); \ + if (at != dhi) emith_move_r_r(dhi, at); \ +} while (0) + +#define EMIT_R5_MULLS_REG(dlo, dhi, s1, s2) do { \ + int at = (dhi == s1 || dhi == s2 ? AT : dhi); \ + EMIT(R5_MULHS(at, s1, s2)); \ + EMIT(R5_MUL(dlo, s1, s2)); \ + if (at != dhi) emith_move_r_r(dhi, at); \ +} while (0) +#endif + +#define PTR_SIZE (1<>1 since the lowest bit inverts the cond */ \ + unsigned _mv = BITMASK3(DCOND_VS>>1,DCOND_GE>>1,DCOND_GT>>1); \ + unsigned _mc = _mv | BITMASK2(DCOND_HS>>1,DCOND_HI>>1); \ + emith_flg_hint = (_mv & BITMASK1(cond >> 1) ? _FHV : 0); \ + emith_flg_hint |= (_mc & BITMASK1(cond >> 1) ? 
_FHC : 0); \ +} while (0) + +// store minimal cc information: rd, rt^rs, carry +// NB: the result *must* first go to FNZ, in case rd == rs or rd == rt. +// NB: for adcf and sbcf, carry-in must be dealt with separately (see there) +static void emith_set_arith_flags(int rd, int rs, int rt, s32 imm, int sub) +{ + if (emith_flg_hint & _FHC) { + if (sub) // C = sub:rt Z0) // Nt^Ns in FV, bit 31 + EMIT(R5_XOR_REG(FV, rs, rt)); + else if (rt == Z0 || imm == 0) + emith_flg_noV = 1; // imm #0 can't overflow + else if ((imm < 0) == !sub) + EMIT(R5_XOR_IMM(FV, rs, -1)); + else if ((imm > 0) == !sub) + EMIT(R5_XOR_REG(FV, rs, Z0)); + } + // full V = Nd^Nt^Ns^C calculation is deferred until really needed + + if (rd && rd != FNZ) + EMIT(R5_MOV_REG(rd, FNZ)); // N,Z via result value in FNZ + emith_cmp_rs = emith_cmp_rt = -1; +} + +// since R5 has less-than and compare-branch insns, handle cmp separately by +// storing the involved regs for later use in one of those R5 insns. +// This works for all conditions but VC/VS, but this is fortunately never used. 
+static void emith_set_compare_flags(int rs, int rt, s32 imm) +{ + emith_cmp_rt = rt; + emith_cmp_rs = rs; + emith_cmp_imm = imm; +} + +// data processing, register +#define emith_move_r_r_ptr(d, s) \ + EMIT(R5_MOV_REG(d, s)) +#define emith_move_r_r_ptr_c(cond, d, s) \ + emith_move_r_r_ptr(d, s) + +#define emith_move_r_r(d, s) \ + emith_move_r_r_ptr(d, s) +#define emith_move_r_r_c(cond, d, s) \ + emith_move_r_r(d, s) + +#define emith_mvn_r_r(d, s) \ + EMIT(R5_MVN_REG(d, s)) + +#define emith_add_r_r_r_lsl_ptr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSL_IMM(AT, s2, simm)); \ + EMIT(R5_ADD_REG(d, s1, AT)); \ + } else EMIT(R5_ADD_REG(d, s1, s2)); \ +} while (0) +#define emith_add_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSLW_IMM(AT, s2, simm)); \ + EMIT(R5_ADDW_REG(d, s1, AT)); \ + } else EMIT(R5_ADDW_REG(d, s1, s2)); \ +} while (0) + +#define emith_add_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSRW_IMM(AT, s2, simm)); \ + EMIT(R5_ADDW_REG(d, s1, AT)); \ + } else EMIT(R5_ADDW_REG(d, s1, s2)); \ +} while (0) + +#define emith_addf_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSLW_IMM(AT, s2, simm)); \ + EMIT(R5_ADDW_REG(FNZ, s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 0); \ + } else { \ + EMIT(R5_ADDW_REG(FNZ, s1, s2)); \ + emith_set_arith_flags(d, s1, s2, 0, 0); \ + } \ +} while (0) + +#define emith_addf_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSRW_IMM(AT, s2, simm)); \ + EMIT(R5_ADDW_REG(FNZ, s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 0); \ + } else { \ + EMIT(R5_ADDW_REG(FNZ, s1, s2)); \ + emith_set_arith_flags(d, s1, s2, 0, 0); \ + } \ +} while (0) + +#define emith_sub_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSLW_IMM(AT, s2, simm)); \ + EMIT(R5_SUBW_REG(d, s1, AT)); \ + } else EMIT(R5_SUBW_REG(d, s1, s2)); \ +} while (0) + +#define emith_subf_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSLW_IMM(AT, s2, simm)); \ + EMIT(R5_SUBW_REG(FNZ, 
s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 1); \ + } else { \ + EMIT(R5_SUBW_REG(FNZ, s1, s2)); \ + emith_set_arith_flags(d, s1, s2, 0, 1); \ + } \ +} while (0) + +#define emith_or_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSLW_IMM(AT, s2, simm)); \ + EMIT(R5_OR_REG(d, s1, AT)); \ + } else EMIT(R5_OR_REG(d, s1, s2)); \ +} while (0) + +#define emith_or_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSRW_IMM(AT, s2, simm)); \ + EMIT(R5_OR_REG(d, s1, AT)); \ + } else EMIT(R5_OR_REG(d, s1, s2)); \ +} while (0) + +#define emith_eor_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSLW_IMM(AT, s2, simm)); \ + EMIT(R5_XOR_REG(d, s1, AT)); \ + } else EMIT(R5_XOR_REG(d, s1, s2)); \ +} while (0) + +#define emith_eor_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSRW_IMM(AT, s2, simm)); \ + EMIT(R5_XOR_REG(d, s1, AT)); \ + } else EMIT(R5_XOR_REG(d, s1, s2)); \ +} while (0) + +#define emith_and_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSLW_IMM(AT, s2, simm)); \ + EMIT(R5_AND_REG(d, s1, AT)); \ + } else EMIT(R5_AND_REG(d, s1, s2)); \ +} while (0) + +#define emith_or_r_r_lsl(d, s, lslimm) \ + emith_or_r_r_r_lsl(d, d, s, lslimm) +#define emith_or_r_r_lsr(d, s, lsrimm) \ + emith_or_r_r_r_lsr(d, d, s, lsrimm) + +#define emith_eor_r_r_lsl(d, s, lslimm) \ + emith_eor_r_r_r_lsl(d, d, s, lslimm) +#define emith_eor_r_r_lsr(d, s, lsrimm) \ + emith_eor_r_r_r_lsr(d, d, s, lsrimm) + +#define emith_add_r_r_r(d, s1, s2) \ + emith_add_r_r_r_lsl(d, s1, s2, 0) + +#define emith_addf_r_r_r_ptr(d, s1, s2) \ + emith_addf_r_r_r_lsl(d, s1, s2, 0) +#define emith_addf_r_r_r(d, s1, s2) \ + emith_addf_r_r_r_ptr(d, s1, s2) + +#define emith_sub_r_r_r(d, s1, s2) \ + emith_sub_r_r_r_lsl(d, s1, s2, 0) + +#define emith_subf_r_r_r(d, s1, s2) \ + emith_subf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_or_r_r_r(d, s1, s2) \ + emith_or_r_r_r_lsl(d, s1, s2, 0) + +#define emith_eor_r_r_r(d, s1, s2) \ + emith_eor_r_r_r_lsl(d, s1, s2, 0) + 
+#define emith_and_r_r_r(d, s1, s2) \ + emith_and_r_r_r_lsl(d, s1, s2, 0) + +#define emith_add_r_r_ptr(d, s) \ + emith_add_r_r_r_lsl_ptr(d, d, s, 0) +#define emith_add_r_r(d, s) \ + emith_add_r_r_r(d, d, s) + +#define emith_sub_r_r(d, s) \ + emith_sub_r_r_r(d, d, s) + +#define emith_neg_r_r(d, s) \ + EMIT(R5_NEGW_REG(d, s)) + +#define emith_adc_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(AT, s2, FC); \ + emith_add_r_r_r(d, s1, AT); \ +} while (0) + +#define emith_sbc_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(AT, s2, FC); \ + emith_sub_r_r_r(d, s1, AT); \ +} while (0) + +#define emith_adc_r_r(d, s) \ + emith_adc_r_r_r(d, d, s) + +#define emith_negc_r_r(d, s) \ + emith_sbc_r_r_r(d, Z0, s) + +// NB: the incoming carry Cin can cause Cout if s2+Cin=0 (or s1+Cin=0 FWIW) +// moreover, if s2+Cin=0 caused Cout, s1+s2+Cin=s1+0 can't cause another Cout +#define emith_adcf_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(FNZ, s2, FC); \ + EMIT(R5_SLTU_REG(AT, FNZ, FC)); \ + emith_add_r_r_r(FNZ, s1, FNZ); \ + emith_set_arith_flags(d, s1, s2, 0, 0); \ + emith_or_r_r(FC, AT); \ +} while (0) + +#define emith_sbcf_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(FNZ, s2, FC); \ + EMIT(R5_SLTU_REG(AT, FNZ, FC)); \ + emith_sub_r_r_r(FNZ, s1, FNZ); \ + emith_set_arith_flags(d, s1, s2, 0, 1); \ + emith_or_r_r(FC, AT); \ +} while (0) + +#define emith_and_r_r(d, s) \ + emith_and_r_r_r(d, d, s) +#define emith_and_r_r_c(cond, d, s) \ + emith_and_r_r(d, s) + +#define emith_or_r_r(d, s) \ + emith_or_r_r_r(d, d, s) + +#define emith_eor_r_r(d, s) \ + emith_eor_r_r_r(d, d, s) + +#define emith_tst_r_r_ptr(d, s) do { \ + if (d != s) { \ + emith_and_r_r_r(FNZ, d, s); \ + emith_cmp_rs = emith_cmp_rt = -1; \ + } else emith_cmp_rs = s, emith_cmp_rt = Z0; \ +} while (0) +#define emith_tst_r_r(d, s) \ + emith_tst_r_r_ptr(d, s) + +#define emith_teq_r_r(d, s) do { \ + emith_eor_r_r_r(FNZ, d, s); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +#define emith_cmp_r_r(d, s) \ + emith_set_compare_flags(d, s, 0) +// 
emith_subf_r_r_r(FNZ, d, s) + +#define emith_addf_r_r(d, s) \ + emith_addf_r_r_r(d, d, s) + +#define emith_subf_r_r(d, s) \ + emith_subf_r_r_r(d, d, s) + +#define emith_adcf_r_r(d, s) \ + emith_adcf_r_r_r(d, d, s) + +#define emith_sbcf_r_r(d, s) \ + emith_sbcf_r_r_r(d, d, s) + +#define emith_negcf_r_r(d, s) \ + emith_sbcf_r_r_r(d, Z0, s) + + +// move immediate +#define MAX_HOST_LITERALS 32 // pool must be smaller than 4 KB +static uintptr_t literal_pool[MAX_HOST_LITERALS]; +static u32 *literal_insn[MAX_HOST_LITERALS]; +static int literal_pindex, literal_iindex; + +static inline int emith_pool_literal(uintptr_t imm) +{ + int idx = literal_pindex - 8; // max look behind in pool + // see if one of the last literals was the same + for (idx = (idx < 0 ? 0 : idx); idx < literal_pindex; idx++) + if (imm == literal_pool[idx]) + break; + if (idx == literal_pindex) // store new literal + literal_pool[literal_pindex++] = imm; + return idx; +} + +static void emith_pool_commit(int jumpover) +{ + int i, sz = literal_pindex * sizeof(uintptr_t); + u8 *pool = (u8 *)tcache_ptr; + + // nothing to commit if pool is empty + if (sz == 0) + return; + // align pool to pointer size + if (jumpover) + pool += sizeof(u32); + i = (uintptr_t)pool & (sizeof(void *)-1); + pool += (i ? 
sizeof(void *)-i : 0); + // need branch over pool if not at block end + if (jumpover) + EMIT(R5_B(sz + (pool-(u8 *)tcache_ptr))); + // safety check - pool must be after insns and reachable + if ((u32)(pool - (u8 *)literal_insn[0] + 8) > 0x7ff) { + elprintf(EL_STATUS|EL_SVP|EL_ANOMALY, + "pool offset out of range"); + exit(1); + } + // copy pool and adjust addresses in insns accessing the pool + memcpy(pool, literal_pool, sz); + for (i = 0; i < literal_iindex; i++) { + *literal_insn[i] += ((u8 *)pool - (u8 *)literal_insn[i]) << 20; + } + // count pool constants as insns for statistics + for (i = 0; i < literal_pindex * sizeof(uintptr_t)/sizeof(u32); i++) + COUNT_OP; + + tcache_ptr = (void *)((u8 *)pool + sz); + literal_pindex = literal_iindex = 0; +} + +static void emith_pool_check(void) +{ + // check if pool must be committed + if (literal_iindex > MAX_HOST_LITERALS-4 || (literal_pindex && + (u8 *)tcache_ptr - (u8 *)literal_insn[0] > 0x700)) + // pool full, or displacement is approaching the limit + emith_pool_commit(1); +} + +static void emith_move_imm(int r, uintptr_t imm) +{ + u32 lui = imm + _CB(imm,1,11,12); // compensate for ADDI sign extension + if (lui >> 12) { + EMIT(R5_MOVT_IMM(r, lui)); + if (imm & 0xfff) + EMIT(R5_ADD_IMM(r, r, imm)); + } else + EMIT(R5_ADD_IMM(r, Z0, imm)); +} + +static void emith_move_ptr_imm(int r, uintptr_t imm) +{ +#if __riscv_xlen == 64 + if ((s32)imm != imm) { + int idx; + if (literal_iindex >= MAX_HOST_LITERALS) + emith_pool_commit(1); + idx = emith_pool_literal(imm); + EMIT(R5_MOVA_IMM(AT, 0)); // loads PC of MOVA insn... 
+ 4 in LD + literal_insn[literal_iindex++] = (u32 *)tcache_ptr; + EMIT(R5_I_INSN(OP_LD, F1_P, r, AT, idx*sizeof(uintptr_t) + 4)); + } else +#endif + emith_move_imm(r, imm); +} + +#define emith_move_r_ptr_imm(r, imm) \ + emith_move_ptr_imm(r, (uintptr_t)(imm)) + +#define emith_move_r_imm(r, imm) \ + emith_move_imm(r, (u32)(imm)) +#define emith_move_r_imm_c(cond, r, imm) \ + emith_move_r_imm(r, imm) + +#define emith_move_r_imm_s8_patchable(r, imm) \ + EMIT(R5_ADD_IMM(r, Z0, (s8)(imm))) +#define emith_move_r_imm_s8_patch(ptr, imm) do { \ + u32 *ptr_ = (u32 *)ptr; \ + EMIT_PTR(ptr_, (*ptr_ & 0x000fffff) | ((u16)(s8)(imm)<<20)); \ +} while (0) + +// arithmetic/logical, immediate - R5 always takes a signed 12 bit immediate + +static void emith_op_imm(int f1, int rd, int rs, u32 imm) +{ + int op32 = (f1 == F1_ADD ? R5_OP32 : 0); + if ((imm + _CB(imm,1,11,12)) >> 12) { + emith_move_r_imm(AT, imm); + EMIT(R5_R_INSN(OP_REG^op32, f1&7,_, rd, rs, AT)); + } else if (imm + (f1 == F1_AND) || rd != rs) + EMIT(R5_I_INSN(OP_IMM^op32, f1&7, rd, rs, imm)); +} + +// arithmetic, immediate - can only be ADDI, since SUBI doesn't exist +// NB: the 2 operand forms use r as both destination and source operand +#define emith_add_r_imm(r, imm) \ + emith_add_r_r_imm(r, r, imm) +#define emith_add_r_imm_c(cond, r, imm) \ + emith_add_r_imm(r, imm) + +#define emith_addf_r_imm(r, imm) \ + emith_addf_r_r_imm(r, r, imm) + +#define emith_sub_r_imm(r, imm) \ + emith_sub_r_r_imm(r, r, imm) +#define emith_sub_r_imm_c(cond, r, imm) \ + emith_sub_r_imm(r, imm) + +#define emith_subf_r_imm(r, imm) \ + emith_subf_r_r_imm(r, r, imm) + +#define emith_adc_r_imm(r, imm) \ + emith_adc_r_r_imm(r, r, imm) + +#define emith_adcf_r_imm(r, imm) \ + emith_adcf_r_r_imm(r, r, imm) + +#define emith_cmp_r_imm(r, imm) \ + emith_set_compare_flags(r, -1, imm) +// emith_subf_r_r_imm(FNZ, r, imm) + +#define emith_add_r_r_ptr_imm(d, s, imm) \ + emith_op_imm(F1_ADD|F2_ALT, d, s, imm) + +#define emith_add_r_r_imm(d, s, imm) \ + emith_op_imm(F1_ADD, d, s, imm) + +#define emith_addf_r_r_imm(d, s, imm) do { \ 
+ emith_add_r_r_imm(FNZ, s, imm); \ + emith_set_arith_flags(d, s, -1, imm, 0); \ +} while (0) + +#define emith_adc_r_r_imm(d, s, imm) do { \ + emith_add_r_r_r(AT, s, FC); \ + emith_add_r_r_imm(d, AT, imm); \ +} while (0) + +#define emith_adcf_r_r_imm(d, s, imm) do { \ + if (imm == 0) { \ + emith_add_r_r_r(FNZ, s, FC); \ + emith_set_arith_flags(d, s, -1, 1, 0); \ + } else { \ + emith_add_r_r_r(FNZ, s, FC); \ + EMIT(R5_SLTU_REG(AT, FNZ, FC)); \ + emith_add_r_r_imm(FNZ, FNZ, imm); \ + emith_set_arith_flags(d, s, -1, imm, 0); \ + emith_or_r_r(FC, AT); \ + } \ +} while (0) + +// NB: no SUBI in R5, since ADDI takes a signed imm +#define emith_sub_r_r_imm(d, s, imm) \ + emith_add_r_r_imm(d, s, -(imm)) +#define emith_sub_r_r_imm_c(cond, d, s, imm) \ + emith_sub_r_r_imm(d, s, imm) + +#define emith_subf_r_r_imm(d, s, imm) do { \ + emith_sub_r_r_imm(FNZ, s, imm); \ + emith_set_arith_flags(d, s, -1, imm, 1); \ +} while (0) + +// logical, immediate +#define emith_and_r_imm(r, imm) \ + emith_op_imm(F1_AND, r, r, imm) + +#define emith_or_r_imm(r, imm) \ + emith_op_imm(F1_OR, r, r, imm) +#define emith_or_r_imm_c(cond, r, imm) \ + emith_or_r_imm(r, imm) + +#define emith_eor_r_imm_ptr(r, imm) \ + emith_op_imm(F1_XOR, r, r, imm) +#define emith_eor_r_imm_ptr_c(cond, r, imm) \ + emith_eor_r_imm_ptr(r, imm) + +#define emith_eor_r_imm(r, imm) \ + emith_eor_r_imm_ptr(r, imm) +#define emith_eor_r_imm_c(cond, r, imm) \ + emith_eor_r_imm(r, imm) + +/* NB: BIC #imm not available in R5; use AND #~imm instead */ +#define emith_bic_r_imm(r, imm) \ + emith_op_imm(F1_AND, r, r, ~(imm)) +#define emith_bic_r_imm_c(cond, r, imm) \ + emith_bic_r_imm(r, imm) + +#define emith_tst_r_imm(r, imm) do { \ + emith_op_imm(F1_AND, FNZ, r, imm); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) +#define emith_tst_r_imm_c(cond, r, imm) \ + emith_tst_r_imm(r, imm) + +#define emith_and_r_r_imm(d, s, imm) \ + emith_op_imm(F1_AND, d, s, imm) + +#define emith_or_r_r_imm(d, s, imm) \ + emith_op_imm(F1_OR, d, s, imm) 
+ +#define emith_eor_r_r_imm(d, s, imm) \ + emith_op_imm(F1_XOR, d, s, imm) + +// shift +#define emith_lsl(d, s, cnt) \ + EMIT(R5_LSLW_IMM(d, s, cnt)) + +#define emith_lsr(d, s, cnt) \ + EMIT(R5_LSRW_IMM(d, s, cnt)) + +#define emith_asr(d, s, cnt) \ + EMIT(R5_ASRW_IMM(d, s, cnt)) + +#define emith_ror(d, s, cnt) do { \ + EMIT(R5_LSLW_IMM(AT, s, 32-(cnt))); \ + EMIT(R5_LSRW_IMM(d, s, cnt)); \ + EMIT(R5_OR_REG(d, d, AT)); \ +} while (0) +#define emith_ror_c(cond, d, s, cnt) \ + emith_ror(d, s, cnt) + +#define emith_rol(d, s, cnt) do { \ + EMIT(R5_LSRW_IMM(AT, s, 32-(cnt))); \ + EMIT(R5_LSLW_IMM(d, s, cnt)); \ + EMIT(R5_OR_REG(d, d, AT)); \ +} while (0) + +#define emith_rorc(d) do { \ + emith_lsr(d, d, 1); \ + emith_lsl(AT, FC, 31); \ + emith_or_r_r(d, AT); \ +} while (0) + +#define emith_rolc(d) do { \ + emith_lsl(d, d, 1); \ + emith_or_r_r(d, FC); \ +} while (0) + +// NB: all flag setting shifts make V undefined +#define emith_lslf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_lsl(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_lsr(FC, _s, 31); \ + emith_lsl(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +#define emith_lsrf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_lsr(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_and_r_r_imm(FC, _s, 1); \ + emith_lsr(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +#define emith_asrf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_asr(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_and_r_r_imm(FC, _s, 1); \ + emith_asr(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +#define emith_rolf(d, s, cnt) do { \ + emith_rol(d, s, cnt); \ + emith_and_r_r_imm(FC, d, 1); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +#define emith_rorf(d, s, cnt) do { \ + 
emith_ror(d, s, cnt); \ + emith_lsr(FC, d, 31); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +#define emith_rolcf(d) do { \ + emith_lsr(AT, d, 31); \ + emith_lsl(d, d, 1); \ + emith_or_r_r(d, FC); \ + emith_move_r_r(FC, AT); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +#define emith_rorcf(d) do { \ + emith_and_r_r_imm(AT, d, 1); \ + emith_lsr(d, d, 1); \ + emith_lsl(FC, FC, 31); \ + emith_or_r_r(d, FC); \ + emith_move_r_r(FC, AT); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +// signed/unsigned extend + +#define emith_clear_msb(d, s, count) /* bits to clear */ do { \ + u32 t; \ + if ((count) >= 21) { \ + t = (count) - 21; \ + t = 0x7ff >> t; \ + emith_and_r_r_imm(d, s, t); \ + } else { \ + emith_lsl(d, s, count); \ + emith_lsr(d, d, count); \ + } \ +} while (0) +#define emith_clear_msb_c(cond, d, s, count) \ + emith_clear_msb(d, s, count) + +#define emith_sext(d, s, count) /* bits to keep */ do { \ + emith_lsl(d, s, 32-(count)); \ + emith_asr(d, d, 32-(count)); \ +} while (0) + +// multiply Rd = Rn*Rm (+ Ra) + +#define emith_mul(d, s1, s2) \ + EMIT(R5_MULW(d, s1, s2)) + +#define emith_mul_u64(dlo, dhi, s1, s2) \ + EMIT_R5_MULLU_REG(dlo, dhi, s1, s2) + +#define emith_mul_s64(dlo, dhi, s1, s2) \ + EMIT_R5_MULLS_REG(dlo, dhi, s1, s2) + +#define emith_mula_s64(dlo, dhi, s1, s2) do { \ + int t_ = rcache_get_tmp(); \ + EMIT_R5_MULLS_REG(t_, AT, s1, s2); \ + emith_add_r_r(dhi, AT); \ + emith_add_r_r(dlo, t_); \ + EMIT(R5_SLTU_REG(AT, dlo, t_)); \ + emith_add_r_r(dhi, AT); \ + rcache_free_tmp(t_); \ +} while (0) +#define emith_mula_s64_c(cond, dlo, dhi, s1, s2) \ + emith_mula_s64(dlo, dhi, s1, s2) + +// load/store. 
offs has 12 bits signed, hence larger offs may use a temp +static void emith_ld_offs(int sz, int rd, int rs, int o12) +{ + if (o12 >= -0x800 && o12 < 0x800) { + EMIT(R5_I_INSN(OP_LD, sz, rd, rs, o12)); + } else { + EMIT(R5_MOVT_IMM(AT, o12 + _CB(o12,1,11,12))); \ + EMIT(R5_R_INSN(OP_REG, F1_ADD,_, AT, rs, AT)); \ + EMIT(R5_I_INSN(OP_LD, sz, rd, AT, o12)); + } +} + +#define emith_read_r_r_offs_ptr(r, rs, offs) \ + emith_ld_offs(F1_P, r, rs, offs) +#define emith_read_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_read_r_r_offs_ptr(r, rs, offs) + +#define emith_read_r_r_offs(r, rs, offs) \ + emith_ld_offs(F1_W, r, rs, offs) +#define emith_read_r_r_offs_c(cond, r, rs, offs) \ + emith_read_r_r_offs(r, rs, offs) + +#define emith_read_r_r_r_ptr(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + emith_ld_offs(F1_P, r, AT, 0); \ +} while (0) +#define emith_read_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + emith_ld_offs(F1_W, r, AT, 0); \ +} while (0) +#define emith_read_r_r_r_c(cond, r, rs, rm) \ + emith_read_r_r_r(r, rs, rm) + +#define emith_read8_r_r_offs(r, rs, offs) \ + emith_ld_offs(F1_BU, r, rs, offs) +#define emith_read8_r_r_offs_c(cond, r, rs, offs) \ + emith_read8_r_r_offs(r, rs, offs) + +#define emith_read8_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + emith_ld_offs(F1_BU, r, AT, 0); \ +} while (0) +#define emith_read8_r_r_r_c(cond, r, rs, rm) \ + emith_read8_r_r_r(r, rs, rm) + +#define emith_read16_r_r_offs(r, rs, offs) \ + emith_ld_offs(F1_HU, r, rs, offs) +#define emith_read16_r_r_offs_c(cond, r, rs, offs) \ + emith_read16_r_r_offs(r, rs, offs) + +#define emith_read16_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + emith_ld_offs(F1_HU, r, AT, 0); \ +} while (0) +#define emith_read16_r_r_r_c(cond, r, rs, rm) \ + emith_read16_r_r_r(r, rs, rm) + +#define emith_read8s_r_r_offs(r, rs, offs) \ + emith_ld_offs(F1_B, r, rs, offs) +#define emith_read8s_r_r_offs_c(cond, r, rs, offs) \ + emith_read8s_r_r_offs(r, rs, offs) + +#define 
emith_read8s_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r_lsl_ptr(AT, rs, rm, 0); \ + emith_ld_offs(F1_B, r, AT, 0); \ +} while (0) +#define emith_read8s_r_r_r_c(cond, r, rs, rm) \ + emith_read8s_r_r_r(r, rs, rm) + +#define emith_read16s_r_r_offs(r, rs, offs) \ + emith_ld_offs(F1_H, r, rs, offs) +#define emith_read16s_r_r_offs_c(cond, r, rs, offs) \ + emith_read16s_r_r_offs(r, rs, offs) + +#define emith_read16s_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r_lsl_ptr(AT, rs, rm, 0); \ + emith_ld_offs(F1_H, r, AT, 0); \ +} while (0) +#define emith_read16s_r_r_r_c(cond, r, rs, rm) \ + emith_read16s_r_r_r(r, rs, rm) + +// store. offs has 12 bits signed (-0x800..0x7ff), hence larger offs may use a temp +static void emith_st_offs(int sz, int rt, int rs, int o12) +{ + if (o12 >= -0x800 && o12 < 0x800) { + EMIT(R5_S_INSN(OP_ST, sz, rt, rs, o12)); + } else { + EMIT(R5_MOVT_IMM(AT, o12 + _CB(o12,1,11,12))); + EMIT(R5_R_INSN(OP_REG, F1_ADD,_, AT, rs, AT)); + EMIT(R5_S_INSN(OP_ST, sz, rt, AT, o12)); + } +} + +#define emith_write_r_r_offs_ptr(r, rs, offs) \ + emith_st_offs(F1_P, r, rs, offs) +#define emith_write_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_write_r_r_offs_ptr(r, rs, offs) + +// NB: indexed stores build the address rs+rm in AT; this must be a pointer width add +#define emith_write_r_r_r_ptr(r, rs, rm) do { \ + emith_add_r_r_r_lsl_ptr(AT, rs, rm, 0); \ + emith_st_offs(F1_P, r, AT, 0); \ +} while (0) +#define emith_write_r_r_r_ptr_c(cond, r, rs, rm) \ + emith_write_r_r_r_ptr(r, rs, rm) + +#define emith_write_r_r_offs(r, rs, offs) \ + emith_st_offs(F1_W, r, rs, offs) +#define emith_write_r_r_offs_c(cond, r, rs, offs) \ + emith_write_r_r_offs(r, rs, offs) + +#define emith_write_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r_lsl_ptr(AT, rs, rm, 0); \ + emith_st_offs(F1_W, r, AT, 0); \ +} while (0) +#define emith_write_r_r_r_c(cond, r, rs, rm) \ + emith_write_r_r_r(r, rs, rm) + +#define emith_ctx_read_ptr(r, offs) \ + emith_read_r_r_offs_ptr(r, CONTEXT_REG, offs) + +#define emith_ctx_read(r, offs) \ + emith_read_r_r_offs(r, CONTEXT_REG, offs) +#define emith_ctx_read_c(cond, r, offs) \ + emith_ctx_read(r, offs) + +#define emith_ctx_write_ptr(r, offs) \ + emith_write_r_r_offs_ptr(r, CONTEXT_REG, 
offs) + +#define emith_ctx_write(r, offs) \ + emith_write_r_r_offs(r, CONTEXT_REG, offs) + +#define emith_ctx_read_multiple(r, offs, cnt, tmpr) do { \ + int r_ = r, offs_ = offs, cnt_ = cnt; \ + for (; cnt_ > 0; r_++, offs_ += 4, cnt_--) \ + emith_ctx_read(r_, offs_); \ +} while (0) + +#define emith_ctx_write_multiple(r, offs, cnt, tmpr) do { \ + int r_ = r, offs_ = offs, cnt_ = cnt; \ + for (; cnt_ > 0; r_++, offs_ += 4, cnt_--) \ + emith_ctx_write(r_, offs_); \ +} while (0) + +// function call handling +#define emith_save_caller_regs(mask) do { \ + int _c, _z = PTR_SIZE; u32 _m = mask & 0x3fce0; /* x5-x7,x10-x17 */ \ + _c = count_bits(_m)&3; _m |= (1<<((4-_c)&3))-1; /* ABI align */ \ + int _s = count_bits(_m) * _z, _o = _s; \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, -_s); \ + for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + if (_m & (1 << _c)) \ + { _o -= _z; if (_c) emith_write_r_r_offs_ptr(_c, SP, _o); } \ +} while (0) + +#define emith_restore_caller_regs(mask) do { \ + int _c, _z = PTR_SIZE; u32 _m = mask & 0x3fce0; \ + _c = count_bits(_m)&3; _m |= (1<<((4-_c)&3))-1; /* ABI align */ \ + int _s = count_bits(_m) * _z, _o = 0; \ + for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ + if (_m & (1 << _c)) \ + { if (_c) emith_read_r_r_offs_ptr(_c, SP, _o); _o += _z; } \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, _s); \ +} while (0) + +#define host_arg2reg(rd, arg) \ + rd = (arg+10) + +#define emith_pass_arg_r(arg, reg) \ + emith_move_r_r(arg, reg) + +#define emith_pass_arg_imm(arg, imm) \ + emith_move_r_imm(arg, imm) + +// branching +#define emith_invert_branch(cond) /* inverted conditional branch */ \ + ((cond) ^ 0x01) + +// evaluate the emulated condition, returns a register/branch type pair +static int emith_cmpr_check(int rs, int rt, int cond, int *r, int *s) +{ + int b = -1; + + // condition check for comparing 2 registers + switch (cond) { + case DCOND_EQ: *r = rs; *s = rt; b = F1_BEQ; break; + case DCOND_NE: *r = rs; *s = rt; b = F1_BNE; 
break; + case DCOND_LO: *r = rs, *s = rt, b = F1_BLTU; break; // s < t, u + case DCOND_HS: *r = rs, *s = rt, b = F1_BGEU; break; // s >= t, u + case DCOND_LS: *r = rt, *s = rs, b = F1_BGEU; break; // s <= t, u + case DCOND_HI: *r = rt, *s = rs, b = F1_BLTU; break; // s > t, u + case DCOND_LT: *r = rs, *s = rt, b = F1_BLT; break; // s < t + case DCOND_GE: *r = rs, *s = rt, b = F1_BGE; break; // s >= t + case DCOND_LE: *r = rt, *s = rs, b = F1_BGE; break; // s <= t + case DCOND_GT: *r = rt, *s = rs, b = F1_BLT; break; // s > t + } + + return b; +} + +static int emith_cmpi_check(int rs, s32 imm, int cond, int *r, int *s) +{ + int b = -1; + + // condition check for comparing register with immediate + if (imm == 0) return emith_cmpr_check(rs, Z0, cond, r, s); + + emith_move_r_imm(AT, imm); + switch (cond) { + case DCOND_EQ: *r = AT, *s = rs, b = F1_BEQ; break; + case DCOND_NE: *r = AT, *s = rs, b = F1_BNE; break; + case DCOND_LO: *r = rs, *s = AT, b = F1_BLTU; break; // s < imm, u + case DCOND_HS: *r = rs, *s = AT, b = F1_BGEU; break; // s >= imm, u + case DCOND_LS: *r = AT, *s = rs, b = F1_BGEU; break; // s <= imm, u + case DCOND_HI: *r = AT, *s = rs, b = F1_BLTU; break; // s > imm, u + case DCOND_LT: *r = rs, *s = AT, b = F1_BLT; break; // s < imm + case DCOND_GE: *r = rs, *s = AT, b = F1_BGE; break; // s >= imm + case DCOND_LE: *r = AT, *s = rs, b = F1_BGE; break; // s <= imm + case DCOND_GT: *r = AT, *s = rs, b = F1_BLT; break; // s > imm + } + return b; +} + +static int emith_cond_check(int cond, int *r, int *s) +{ + int b = -1; + + *s = Z0; + if (emith_cmp_rs >= 0) { + if (emith_cmp_rt != -1) + b = emith_cmpr_check(emith_cmp_rs,emith_cmp_rt, cond,r,s); + else b = emith_cmpi_check(emith_cmp_rs,emith_cmp_imm,cond,r,s); + } + + // shortcut for V known to be 0 + if (b < 0 && emith_flg_noV) switch (cond) { + case DCOND_VS: *r = Z0; b = F1_BNE; break; // never + case DCOND_VC: *r = Z0; b = F1_BEQ; break; // always + case DCOND_LT: *r = FNZ, b = F1_BLT; break; // N + 
case DCOND_GE: *r = FNZ, b = F1_BGE; break; // !N + case DCOND_LE: *r = Z0, *s = FNZ, b = F1_BGE; break; // N || Z + case DCOND_GT: *r = Z0, *s = FNZ, b = F1_BLT; break; // !N && !Z + } + + // the full monty if no shortcut + if (b < 0) switch (cond) { + // conditions using NZ + case DCOND_EQ: *r = FNZ; b = F1_BEQ; break; // Z + case DCOND_NE: *r = FNZ; b = F1_BNE; break; // !Z + case DCOND_MI: *r = FNZ; b = F1_BLT; break; // N + case DCOND_PL: *r = FNZ; b = F1_BGE; break; // !N + // conditions using C + case DCOND_LO: *r = FC; b = F1_BNE; break; // C + case DCOND_HS: *r = FC; b = F1_BEQ; break; // !C + // conditions using CZ + case DCOND_LS: // C || Z + case DCOND_HI: // !C && !Z + EMIT(R5_ADD_IMM(AT, FC, -1)); // !C && !Z + EMIT(R5_AND_REG(AT, FNZ, AT)); + *r = AT, b = (cond == DCOND_HI ? F1_BNE : F1_BEQ); + break; + + // conditions using V + case DCOND_VS: // V + case DCOND_VC: // !V + EMIT(R5_XOR_REG(AT, FV, FNZ)); // V = Nt^Ns^Nd^C + EMIT(R5_LSRW_IMM(AT, AT, 31)); + EMIT(R5_XOR_REG(AT, AT, FC)); + *r = AT, b = (cond == DCOND_VS ? F1_BNE : F1_BEQ); + break; + // conditions using VNZ + case DCOND_LT: // N^V + case DCOND_GE: // !(N^V) + EMIT(R5_LSRW_IMM(AT, FV, 31)); // Nd^V = Nt^Ns^C + EMIT(R5_XOR_REG(AT, FC, AT)); + *r = AT, b = (cond == DCOND_LT ? F1_BNE : F1_BEQ); + break; + case DCOND_LE: // (N^V) || Z + case DCOND_GT: // !(N^V) && !Z + EMIT(R5_LSRW_IMM(AT, FV, 31)); // Nd^V = Nt^Ns^C + EMIT(R5_XOR_REG(AT, FC, AT)); + EMIT(R5_ADD_IMM(AT, AT, -1)); // !(Nd^V) && !Z + EMIT(R5_AND_REG(AT, FNZ, AT)); + *r = AT, b = (cond == DCOND_GT ? 
F1_BNE : F1_BEQ); + break; + } + return b; +} + +// NB: R5 unconditional jumps have only +/- 1MB range, hence use reg jumps +#define emith_jump(target) do { \ + uintptr_t target_ = (uintptr_t)(target); \ + EMIT(R5_MOVT_IMM(AT, target_ + _CB(target_,1,11,12))); \ + EMIT(R5_JR(AT, target_)); \ +} while (0) +#define emith_jump_patchable(target) \ + emith_jump(target) + +// NB: R5 conditional branches have only +/- 4KB range +#define emith_jump_cond(cond, target) do { \ + int r_, s_, mcond_ = emith_cond_check(cond, &r_, &s_); \ + u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \ + EMIT(R5_BCOND(mcond_,r_,s_,disp_ & 0x00001fff)); \ +} while (0) +#define emith_jump_cond_patchable(cond, target) \ + emith_jump_cond(cond, target) + +#define emith_jump_cond_inrange(target) \ + ((u8 *)target - (u8 *)tcache_ptr < 0x1000 && \ + (u8 *)target - (u8 *)tcache_ptr >= -0x1000+0x10) // mind cond_check + +// NB: returns position of patch for cache maintenance +#define emith_jump_patch(ptr, target, pos) do { \ + u32 *ptr_ = (u32 *)ptr; /* must skip condition check code */ \ + while ((*ptr_&0x77) != OP_BCOND && (*ptr_&0x77) != OP_LUI) ptr_ ++; \ + if ((*ptr_&0x77) == OP_BCOND) { \ + u32 *p_ = ptr_, disp_ = (u8 *)target - (u8 *)ptr_; \ + u32 f1_ = _CB(*ptr_,3,12,0); \ + u32 r_ = _CB(*ptr_,5,15,0), s_ = _CB(*ptr_,5,20,0); \ + EMIT_PTR(p_, R5_BCOND(f1_, r_, s_, disp_ & 0x00001fff)); \ + } else { \ + u32 *p_ = ptr_; \ + uintptr_t target_ = (uintptr_t)(target); \ + EMIT_PTR(p_, R5_MOVT_IMM(AT, target_ + _CB(target_,1,11,12))); \ + EMIT_PTR(p_, R5_JR(AT, target_)); \ + } \ + if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)(ptr_); \ +} while (0) + +#define emith_jump_patch_inrange(ptr, target) \ + ((u8 *)target - (u8 *)ptr < 0x1000 && \ + (u8 *)target - (u8 *)ptr >= -0x1000+0x10) // mind cond_check +#define emith_jump_patch_size() 8 + +#define emith_jump_at(ptr, target) do { \ + uintptr_t target_ = (uintptr_t)(target); \ + u32 *ptr_ = (u32 *)ptr; \ + EMIT_PTR(ptr_, R5_MOVT_IMM(AT, target_ + 
_CB(target_,1,11,12))); \ + EMIT_PTR(ptr_, R5_JR(AT, target_)); \ +} while (0) +#define emith_jump_at_size() 8 + +#define emith_jump_reg(r) \ + EMIT(R5_JR(r, 0)) +#define emith_jump_reg_c(cond, r) \ + emith_jump_reg(r) + +#define emith_jump_ctx(offs) do { \ + emith_ctx_read_ptr(AT, offs); \ + emith_jump_reg(AT); \ +} while (0) +#define emith_jump_ctx_c(cond, offs) \ + emith_jump_ctx(offs) + +#define emith_call(target) do { \ + uintptr_t target_ = (uintptr_t)(target); \ + EMIT(R5_MOVT_IMM(AT, target_ + _CB(target_,1,11,12))); \ + EMIT(R5_JALR(LR, AT, target_)); \ +} while (0) +#define emith_call_cond(cond, target) \ + emith_call(target) + +#define emith_call_reg(r) \ + EMIT(R5_JALR(LR, r, 0)) + +#define emith_call_ctx(offs) do { \ + emith_ctx_read_ptr(AT, offs); \ + emith_call_reg(AT); \ +} while (0) + +#define emith_call_cleanup() /**/ + +#define emith_ret() \ + EMIT(R5_JR(LR, 0)) +#define emith_ret_c(cond) \ + emith_ret() + +#define emith_ret_to_ctx(offs) \ + emith_ctx_write_ptr(LR, offs) + +#define emith_add_r_ret(r) \ + emith_add_r_r_ptr(r, LR) + +// push LR (and r, if > 0) to a 16 byte stack frame: r at offset 0, LR at offset 8 +#define emith_push_ret(r) do { \ + emith_add_r_r_ptr_imm(SP, SP, -16); /* ABI requires 16 byte alignment */\ + emith_write_r_r_offs_ptr(LR, SP, 8); \ + if ((r) > 0) emith_write_r_r_offs(r, SP, 0); \ +} while (0) + +#define emith_pop_and_ret(r) do { \ + if ((r) > 0) emith_read_r_r_offs(r, SP, 0); \ + emith_read_r_r_offs_ptr(LR, SP, 8); \ + emith_add_r_r_ptr_imm(SP, SP, 16); \ + emith_ret(); \ +} while (0) + + +// emitter ABI stuff +#define emith_insn_ptr() ((u8 *)tcache_ptr) +#define emith_flush() /**/ +#define host_instructions_updated(base, end) __builtin___clear_cache(base, end) +#define emith_update_cache() /**/ +#define emith_rw_offs_max() 0x7ff +#define emith_uext_ptr(r) /**/ + +// SH2 drc specific +#define emith_sh2_drc_entry() do { \ + int _c, _z = PTR_SIZE; u32 _m = 0x0ffc0202; /* x1,x9,x18-x27 */ \ + _c = count_bits(_m)&3; _m |= (1<<((4-_c)&3))-1; /* ABI align */ \ + int _s = count_bits(_m) * _z, _o = _s; \ + 
if (_s) emith_add_r_r_ptr_imm(SP, SP, -_s); \ + for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + if (_m & (1 << _c)) \ + { _o -= _z; if (_c) emith_write_r_r_offs_ptr(_c, SP, _o); } \ +} while (0) +#define emith_sh2_drc_exit() do { \ + int _c, _z = PTR_SIZE; u32 _m = 0x0ffc0202; \ + _c = count_bits(_m)&3; _m |= (1<<((4-_c)&3))-1; /* ABI align */ \ + int _s = count_bits(_m) * _z, _o = 0; \ + for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ + if (_m & (1 << _c)) \ + { if (_c) emith_read_r_r_offs_ptr(_c, SP, _o); _o += _z; } \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, _s); \ + emith_ret(); \ +} while (0) + +// NB: assumes a is in arg0, tab, func and mask are temp +#define emith_sh2_rcall(a, tab, func, mask) do { \ + emith_lsr(mask, a, SH2_READ_SHIFT); \ + emith_add_r_r_r_lsl_ptr(tab, tab, mask, PTR_SCALE+1); \ + emith_read_r_r_offs_ptr(func, tab, 0); \ + emith_read_r_r_offs(mask, tab, PTR_SIZE); \ + emith_addf_r_r_r_ptr(func, func, func); \ +} while (0) + +// NB: assumes a, val are in arg0 and arg1, tab and func are temp +#define emith_sh2_wcall(a, val, tab, func) do { \ + emith_lsr(func, a, SH2_WRITE_SHIFT); \ + emith_lsl(func, func, PTR_SCALE); \ + emith_read_r_r_r_ptr(func, tab, func); \ + emith_move_r_r_ptr(12, CONTEXT_REG); /* arg2 */ \ + emith_jump_reg(func); \ +} while (0) + +#define emith_sh2_delay_loop(cycles, reg) do { \ + int sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); \ + int t1 = rcache_get_tmp(); \ + int t2 = rcache_get_tmp(); \ + int t3 = rcache_get_tmp(); \ + /* if (sr < 0) return */ \ + emith_cmp_r_imm(sr, 0); \ + EMITH_JMP_START(DCOND_LE); \ + /* turns = sr.cycles / cycles */ \ + emith_asr(t2, sr, 12); \ + emith_move_r_imm(t3, (u32)((1ULL<<32) / (cycles)) + 1); \ + emith_mul_u64(t1, t2, t2, t3); /* multiply by 1/x */ \ + rcache_free_tmp(t3); \ + if (reg >= 0) { \ + /* if (reg <= turns) turns = reg-1 */ \ + t3 = rcache_get_reg(reg, RC_GR_RMW, NULL); \ + emith_cmp_r_r(t3, t2); \ + EMITH_SJMP_START(DCOND_HI); \ + 
emith_sub_r_r_imm_c(DCOND_LS, t2, t3, 1); \ + EMITH_SJMP_END(DCOND_HI); \ + /* if (reg <= 1) turns = 0 */ \ + emith_cmp_r_imm(t3, 1); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_move_r_imm_c(DCOND_LS, t2, 0); \ + EMITH_SJMP_END(DCOND_HI); \ + /* reg -= turns */ \ + emith_sub_r_r(t3, t2); \ + } \ + /* sr.cycles -= turns * cycles; */ \ + emith_move_r_imm(t1, cycles); \ + emith_mul(t1, t2, t1); \ + emith_sub_r_r_r_lsl(sr, sr, t1, 12); \ + EMITH_JMP_END(DCOND_LE); \ + rcache_free_tmp(t1); \ + rcache_free_tmp(t2); \ +} while (0) + +/* + * T = !carry(Rn = (Rn << 1) | T) + * if Q + * C = carry(Rn += Rm) + * else + * C = carry(Rn -= Rm) + * T ^= C + */ +#define emith_sh2_div1_step(rn, rm, sr) do { \ + int t_ = rcache_get_tmp(); \ + emith_and_r_r_imm(AT, sr, T); \ + emith_lsr(FC, rn, 31); /*Rn = (Rn<<1)+T*/ \ + emith_lsl(t_, rn, 1); \ + emith_or_r_r(t_, AT); \ + emith_or_r_imm(sr, T); /* T = !carry */ \ + emith_eor_r_r(sr, FC); \ + emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ + EMITH_JMP3_START(DCOND_EQ); \ + emith_add_r_r_r(rn, t_, rm); \ + EMIT(R5_SLTU_REG(FC, rn, t_)); \ + EMITH_JMP3_MID(DCOND_EQ); \ + emith_sub_r_r_r(rn, t_, rm); \ + EMIT(R5_SLTU_REG(FC, t_, rn)); \ + EMITH_JMP3_END(); \ + emith_eor_r_r(sr, FC); /* T ^= carry */ \ + rcache_free_tmp(t_); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macl(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* MACH top 16 bits unused if saturated. sign ext for overfl detect */ \ + emith_sext(mh, mh, 16); \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 17 bits of MACH aren't all 1 or 0 */ \ + /* to check: add MACH >> 31 to MACH >> 15. 
this is 0 if no overflow */ \ + emith_asr(rn, mh, 15); \ + emith_add_r_r_r_lsr(rn, rn, mh, 31); /* sum = (MACH>>31)+(MACH>>15) */ \ + emith_teq_r_r(rn, Z0); /* (need only N and Z flags) */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> ov */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x0000); /* -overflow */ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x8000); \ + EMITH_SJMP_START(DCOND_PL); /* sum > 0 -> +ovl */ \ + emith_sub_r_imm_c(DCOND_MI, ml, 1); /* 0xffffffff */ \ + emith_sub_r_imm_c(DCOND_MI, mh, 1); /* 0x00007fff */ \ + EMITH_SJMP_END(DCOND_PL); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macw(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* XXX: MACH should be untouched when S is set? */ \ + emith_asr(mh, ml, 31); /* sign ext MACL to MACH for ovrfl check */ \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 33 bits of MACH:MACL aren't all 1 or 0 */ \ + /* to check: add MACL[31] to MACH. this is 0 if no overflow */ \ + emith_lsr(rn, ml, 31); \ + emith_add_r_r(rn, mh); /* sum = MACH + ((MACL>>31)&1) */ \ + emith_teq_r_r(rn, Z0); /* (need only N and Z flags) */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> overflow */ \ + /* XXX: LSB signalling only in SH1, or in SH2 too? 
*/ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x00000001); /* LSB of MACH */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x80000000); /* negative ovrfl */ \ + EMITH_SJMP_START(DCOND_PL); /* sum > 0 -> positive ovrfl */ \ + emith_sub_r_imm_c(DCOND_MI, ml, 1); /* 0x7fffffff */ \ + EMITH_SJMP_END(DCOND_PL); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +#define emith_write_sr(sr, srcr) do { \ + emith_lsr(sr, sr , 10); emith_lsl(sr, sr, 10); \ + emith_lsl(AT, srcr, 22); emith_lsr(AT, AT, 22); \ + emith_or_r_r(sr, AT); \ +} while (0) + +#define emith_carry_to_t(sr, is_sub) do { \ + emith_and_r_imm(sr, 0xfffffffe); \ + emith_or_r_r(sr, FC); \ +} while (0) + +#define emith_t_to_carry(sr, is_sub) do { \ + emith_and_r_r_imm(FC, sr, 1); \ +} while (0) + +#define emith_tpop_carry(sr, is_sub) do { \ + emith_and_r_r_imm(FC, sr, 1); \ + emith_eor_r_r(sr, FC); \ +} while (0) + +#define emith_tpush_carry(sr, is_sub) \ + emith_or_r_r(sr, FC) + +#ifdef T +// T bit handling +#define emith_invert_cond(cond) \ + ((cond) ^ 1) + +static void emith_clr_t_cond(int sr) +{ + emith_bic_r_imm(sr, T); +} + +static void emith_set_t_cond(int sr, int cond) +{ + int b, r, s; + u8 *ptr; + u32 val = 0, inv = 0; + + // try to avoid jumping around if possible + b = emith_cond_check(cond, &r, &s); + if (r == Z0) { + if (b == F1_BEQ || b == F1_BGE || b == F1_BGEU) + emith_or_r_imm(sr, T); + return; + } else if (r == FC) + val++, inv = (b == F1_BEQ); + + if (!val) switch (b) { + case F1_BEQ: if (s == Z0) { EMIT(R5_SLTU_IMM(AT,r ,1)); r=AT; val++; break; } + EMIT(R5_XOR_REG(AT, r, s)); + EMIT(R5_SLTU_IMM(AT,AT, 1)); r=AT; val++; break; + case F1_BNE: if (s == Z0) { EMIT(R5_SLTU_REG(AT,Z0,r)); r=AT; val++; break; } + EMIT(R5_XOR_REG(AT, r, s)); + EMIT(R5_SLTU_REG(AT,Z0,AT)); r=AT; val++; break; + case F1_BLTU: EMIT(R5_SLTU_REG(AT, r, s)); r=AT; val++; break; + case F1_BGEU: EMIT(R5_SLTU_REG(AT, r, s)); r=AT; val++; inv++; break; + case F1_BLT: EMIT(R5_SLT_REG(AT, r, s)); r=AT; val++; 
break; + case F1_BGE: EMIT(R5_SLT_REG(AT, r, s)); r=AT; val++; inv++; break; + } + if (val) { + emith_or_r_r(sr, r); + if (inv) + emith_eor_r_imm(sr, T); + return; + } + + // can't obtain result directly, use presumably slower jump !cond + or sr,T + b = emith_invert_branch(b); + ptr = tcache_ptr; + EMIT(R5_BCOND(b, r, s, 0)); + emith_or_r_imm(sr, T); + val = (u8 *)tcache_ptr - (u8 *)(ptr); + EMIT_PTR(ptr, R5_BCOND(b, r, s, val & 0x00001fff)); +} + +#define emith_get_t_cond() -1 + +#define emith_sync_t(sr) ((void)sr) + +#define emith_invalidate_t() + +static void emith_set_t(int sr, int val) +{ + if (val) + emith_or_r_imm(sr, T); + else + emith_bic_r_imm(sr, T); +} + +static int emith_tst_t(int sr, int tf) +{ + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; +} +#endif diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 865aab4b4..60b2b6a2c 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -1,6 +1,7 @@ /* * Basic macros to emit x86 instructions and some utils * Copyright (C) 2008,2009,2010 notaz + * Copyright (C) 2019 kub * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. 
@@ -13,10 +14,11 @@ */ #include -enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; +enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common + xR8, xR9, xR10, xR11, xR12, xR13, xR14, xR15 }; // x86-64 only -#define CONTEXT_REG xBP -#define RET_REG xAX +#define CONTEXT_REG xBP +#define RET_REG xAX #define ICOND_JO 0x00 #define ICOND_JNO 0x01 @@ -51,6 +53,9 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define DCOND_VS ICOND_JO // oVerflow Set #define DCOND_VC ICOND_JNO // oVerflow Clear +#define DCOND_CS ICOND_JB // carry set +#define DCOND_CC ICOND_JAE // carry clear + #define EMIT_PTR(ptr, val, type) \ *(type *)(ptr) = val @@ -61,7 +66,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define EMIT_OP(op) do { \ COUNT_OP; \ - EMIT(op, u8); \ + if ((op) > 0xff) EMIT((op) >> 8, u8); \ + EMIT((u8)(op), u8); \ } while (0) #define EMIT_MODRM(mod, r, rm) do { \ @@ -106,45 +112,70 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; EMIT_PTR(ptr + 1, (tcache_ptr - (ptr+2)), u8) // _r_r -#define emith_move_r_r(dst, src) \ - EMIT_OP_MODRM(0x8b, 3, dst, src) +#define emith_move_r_r(dst, src) do {\ + EMIT_REX_IF(0, dst, src); \ + EMIT_OP_MODRM64(0x8b, 3, dst, src); \ +} while (0) #define emith_move_r_r_ptr(dst, src) do { \ EMIT_REX_IF(1, dst, src); \ EMIT_OP_MODRM64(0x8b, 3, dst, src); \ } while (0) -#define emith_add_r_r(d, s) \ - EMIT_OP_MODRM(0x01, 3, s, d) +#define emith_add_r_r(d, s) do { \ + EMIT_REX_IF(0, s, d); \ + EMIT_OP_MODRM64(0x01, 3, s, d); \ +} while (0) -#define emith_sub_r_r(d, s) \ - EMIT_OP_MODRM(0x29, 3, s, d) +#define emith_add_r_r_ptr(d, s) do { \ + EMIT_REX_IF(1, s, d); \ + EMIT_OP_MODRM64(0x01, 3, s, d); \ +} while (0) -#define emith_adc_r_r(d, s) \ - EMIT_OP_MODRM(0x11, 3, s, d) +#define emith_sub_r_r(d, s) do {\ + EMIT_REX_IF(0, s, d); \ + EMIT_OP_MODRM64(0x29, 3, s, d); \ +} while (0) -#define emith_sbc_r_r(d, s) \ - EMIT_OP_MODRM(0x19, 3, s, d) /* SBB */ +#define emith_adc_r_r(d, s) do { \ + EMIT_REX_IF(0, s, 
d); \ + EMIT_OP_MODRM64(0x11, 3, s, d); \ +} while (0) -#define emith_or_r_r(d, s) \ - EMIT_OP_MODRM(0x09, 3, s, d) +#define emith_sbc_r_r(d, s) do { \ + EMIT_REX_IF(0, s, d); \ + EMIT_OP_MODRM64(0x19, 3, s, d); /* SBB */ \ +} while (0) -#define emith_and_r_r(d, s) \ - EMIT_OP_MODRM(0x21, 3, s, d) +#define emith_or_r_r(d, s) do { \ + EMIT_REX_IF(0, s, d); \ + EMIT_OP_MODRM64(0x09, 3, s, d); \ +} while (0) -#define emith_eor_r_r(d, s) \ - EMIT_OP_MODRM(0x31, 3, s, d) /* XOR */ +#define emith_and_r_r(d, s) do { \ + EMIT_REX_IF(0, s, d); \ + EMIT_OP_MODRM64(0x21, 3, s, d); \ +} while (0) -#define emith_tst_r_r(d, s) \ - EMIT_OP_MODRM(0x85, 3, s, d) /* TEST */ +#define emith_eor_r_r(d, s) do { \ + EMIT_REX_IF(0, s, d); \ + EMIT_OP_MODRM64(0x31, 3, s, d); /* XOR */ \ +} while (0) + +#define emith_tst_r_r(d, s) do { \ + EMIT_REX_IF(0, s, d); \ + EMIT_OP_MODRM64(0x85, 3, s, d); /* TEST */ \ +} while (0) #define emith_tst_r_r_ptr(d, s) do { \ EMIT_REX_IF(1, s, d); \ EMIT_OP_MODRM64(0x85, 3, s, d); /* TEST */ \ } while (0) -#define emith_cmp_r_r(d, s) \ - EMIT_OP_MODRM(0x39, 3, s, d) +#define emith_cmp_r_r(d, s) do { \ + EMIT_REX_IF(0, s, d); \ + EMIT_OP_MODRM64(0x39, 3, s, d); \ +} while (0) // fake teq - test equivalence - get_flags(d ^ s) #define emith_teq_r_r(d, s) do { \ @@ -156,7 +187,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_mvn_r_r(d, s) do { \ if (d != s) \ emith_move_r_r(d, s); \ - EMIT_OP_MODRM(0xf7, 3, 2, d); /* NOT d */ \ + EMIT_REX_IF(0, 0, d); \ + EMIT_OP_MODRM64(0xf7, 3, 2, d); /* NOT d */ \ } while (0) #define emith_negc_r_r(d, s) do { \ @@ -170,7 +202,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_neg_r_r(d, s) do { \ if (d != s) \ emith_move_r_r(d, s); \ - EMIT_OP_MODRM(0xf7, 3, 3, d); /* NEG d */ \ + EMIT_REX_IF(0, 0, d); \ + EMIT_OP_MODRM64(0xf7, 3, 3, d); /* NEG d */ \ } while (0) // _r_r_r @@ -185,6 +218,72 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; } \ } while (0) +#define 
emith_add_r_r_r_ptr(d, s1, s2) do { \ + if (d == s1) { \ + emith_add_r_r_ptr(d, s2); \ + } else if (d == s2) { \ + emith_add_r_r_ptr(d, s1); \ + } else { \ + emith_move_r_r_ptr(d, s1); \ + emith_add_r_r_ptr(d, s2); \ + } \ +} while (0) + +#define emith_sub_r_r_r(d, s1, s2) do { \ + if (d == s1) { \ + emith_sub_r_r(d, s2); \ + } else if (d == s2) { \ + emith_sub_r_r(d, s1); \ + } else { \ + emith_move_r_r(d, s1); \ + emith_sub_r_r(d, s2); \ + } \ +} while (0) + +#define emith_adc_r_r_r(d, s1, s2) do { \ + if (d == s1) { \ + emith_adc_r_r(d, s2); \ + } else if (d == s2) { \ + emith_adc_r_r(d, s1); \ + } else { \ + emith_move_r_r(d, s1); \ + emith_adc_r_r(d, s2); \ + } \ +} while (0) + +#define emith_sbc_r_r_r(d, s1, s2) do { \ + if (d == s1) { \ + emith_sbc_r_r(d, s2); \ + } else if (d == s2) { \ + emith_sbc_r_r(d, s1); \ + } else { \ + emith_move_r_r(d, s1); \ + emith_sbc_r_r(d, s2); \ + } \ +} while (0) + +#define emith_and_r_r_r(d, s1, s2) do { \ + if (d == s1) { \ + emith_and_r_r(d, s2); \ + } else if (d == s2) { \ + emith_and_r_r(d, s1); \ + } else { \ + emith_move_r_r(d, s1); \ + emith_and_r_r(d, s2); \ + } \ +} while (0) + +#define emith_or_r_r_r(d, s1, s2) do { \ + if (d == s1) { \ + emith_or_r_r(d, s2); \ + } else if (d == s2) { \ + emith_or_r_r(d, s1); \ + } else { \ + emith_move_r_r(d, s1); \ + emith_or_r_r(d, s2); \ + } \ +} while (0) + #define emith_eor_r_r_r(d, s1, s2) do { \ if (d == s1) { \ emith_eor_r_r(d, s2); \ @@ -196,33 +295,114 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; } \ } while (0) -// _r_r_shift -#define emith_or_r_r_lsl(d, s, lslimm) do { \ - int tmp_ = rcache_get_tmp(); \ - emith_lsl(tmp_, s, lslimm); \ - emith_or_r_r(d, tmp_); \ - rcache_free_tmp(tmp_); \ +// _r_r_r_shift +#define emith_add_r_r_r_lsl(d, s1, s2, lslimm) do { \ + if (lslimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsl(tmp_, s2, lslimm); \ + emith_add_r_r_r(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_add_r_r_r(d, s1, s2); \ +} while (0) + 
+#define emith_add_r_r_r_lsl_ptr(d, s1, s2, lslimm) do { \ + if (lslimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsl(tmp_, s2, lslimm); \ + emith_add_r_r_r_ptr(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_add_r_r_r_ptr(d, s1, s2); \ +} while (0) + +#define emith_add_r_r_r_lsr(d, s1, s2, lsrimm) do { \ + if (lsrimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsr(tmp_, s2, lsrimm); \ + emith_add_r_r_r(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_add_r_r_r(d, s1, s2); \ +} while (0) + +#define emith_sub_r_r_r_lsl(d, s1, s2, lslimm) do { \ + if (lslimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsl(tmp_, s2, lslimm); \ + emith_sub_r_r_r(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_sub_r_r_r(d, s1, s2); \ } while (0) -// d != s +#define emith_or_r_r_r_lsl(d, s1, s2, lslimm) do { \ + if (lslimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsl(tmp_, s2, lslimm); \ + emith_or_r_r_r(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_or_r_r_r(d, s1, s2); \ +} while (0) +#define emith_or_r_r_r_lsr(d, s1, s2, lsrimm) do { \ + if (lsrimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsr(tmp_, s2, lsrimm); \ + emith_or_r_r_r(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_or_r_r_r(d, s1, s2); \ +} while (0) + +// _r_r_shift +#define emith_or_r_r_lsl(d, s, lslimm) \ + emith_or_r_r_r_lsl(d, d, s, lslimm) +#define emith_or_r_r_lsr(d, s, lsrimm) \ + emith_or_r_r_r_lsr(d, d, s, lsrimm) + +#define emith_eor_r_r_lsl(d, s, lslimm) do { \ + if (lslimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsl(tmp_, s, lslimm); \ + emith_eor_r_r(d, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_eor_r_r(d, s); \ +} while (0) #define emith_eor_r_r_lsr(d, s, lsrimm) do { \ - emith_push(s); \ - emith_lsr(s, s, lsrimm); \ - emith_eor_r_r(d, s); \ - emith_pop(s); \ + if (lsrimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsr(tmp_, s, lsrimm); \ + emith_eor_r_r(d, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_eor_r_r(d, s); \ } 
while (0) // _r_imm #define emith_move_r_imm(r, imm) do { \ - EMIT_OP(0xb8 + (r)); \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP(0xb8 + ((r)&7)); \ EMIT(imm, u32); \ } while (0) -#define emith_move_r_imm_s8(r, imm) \ - emith_move_r_imm(r, (u32)(signed int)(signed char)(imm)) +#define emith_move_r_ptr_imm(r, imm) do { \ + if ((uintptr_t)(imm) <= UINT32_MAX) \ + emith_move_r_imm(r, (uintptr_t)(imm)); \ + else { \ + EMIT_REX_IF(1, 0, r); \ + EMIT_OP(0xb8 + ((r)&7)); \ + EMIT((uintptr_t)(imm), uint64_t); \ + } \ +} while (0) + +#define emith_move_r_imm_s8_patchable(r, imm) do { \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP(0xb8 + ((r)&7)); \ + EMIT((s8)(imm), u32); \ +} while (0) +#define emith_move_r_imm_s8_patch(ptr, imm) do { \ + u8 *ptr_ = ptr; \ + while ((*ptr_ & 0xf8) != 0xb8) ptr_++; \ + EMIT_PTR(ptr_ + 1, (s8)(imm), u32); \ +} while (0) #define emith_arith_r_imm(op, r, imm) do { \ - EMIT_OP_MODRM(0x81, 3, op, r); \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP_MODRM64(0x81, 3, op, r); \ EMIT(imm, u32); \ } while (0) @@ -241,14 +421,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_and_r_imm(r, imm) \ emith_arith_r_imm(4, r, imm) -/* used for sub cycles after test, so retain flags with lea */ -#define emith_sub_r_imm(r, imm) do { \ - assert(r != xSP); \ - EMIT_OP_MODRM(0x8d, 2, r, r); \ - EMIT(-(s32)(imm), s32); \ -} while (0) - -#define emith_subf_r_imm(r, imm) \ +#define emith_sub_r_imm(r, imm) \ emith_arith_r_imm(5, r, imm) #define emith_eor_r_imm(r, imm) \ @@ -257,8 +430,15 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_cmp_r_imm(r, imm) \ emith_arith_r_imm(7, r, imm) +#define emith_eor_r_imm_ptr(r, imm) do { \ + EMIT_REX_IF(1, 0, r); \ + EMIT_OP_MODRM64(0x81, 3, 6, r); \ + EMIT(imm, u32); \ +} while (0) + #define emith_tst_r_imm(r, imm) do { \ - EMIT_OP_MODRM(0xf7, 3, 0, r); \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP_MODRM64(0xf7, 3, 0, r); \ EMIT(imm, u32); \ } while (0) @@ -267,34 +447,52 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI 
}; emith_arith_r_imm(4, r, ~(imm)) // fake conditionals (using SJMP instead) -#define emith_move_r_imm_c(cond, r, imm) do { \ - (void)(cond); \ - emith_move_r_imm(r, imm); \ -} while (0) - -#define emith_add_r_imm_c(cond, r, imm) do { \ - (void)(cond); \ - emith_add_r_imm(r, imm); \ -} while (0) - -#define emith_sub_r_imm_c(cond, r, imm) do { \ - (void)(cond); \ - emith_sub_r_imm(r, imm); \ -} while (0) - +#define emith_move_r_imm_c(cond, r, imm) \ + emith_move_r_imm(r, imm) +#define emith_add_r_imm_c(cond, r, imm) \ + emith_add_r_imm(r, imm) +#define emith_sub_r_imm_c(cond, r, imm) \ + emith_sub_r_imm(r, imm) #define emith_or_r_imm_c(cond, r, imm) \ emith_or_r_imm(r, imm) #define emith_eor_r_imm_c(cond, r, imm) \ emith_eor_r_imm(r, imm) +#define emith_eor_r_imm_ptr_c(cond, r, imm) \ + emith_eor_r_imm_ptr(r, imm) #define emith_bic_r_imm_c(cond, r, imm) \ emith_bic_r_imm(r, imm) +#define emith_tst_r_imm_c(cond, r, imm) \ + emith_tst_r_imm(r, imm) +#define emith_move_r_r_ptr_c(cond, d, s) \ + emith_move_r_r_ptr(d, s) #define emith_ror_c(cond, d, s, cnt) \ emith_ror(d, s, cnt) +#define emith_and_r_r_c(cond, d, s) \ + emith_and_r_r(d, s) +#define emith_add_r_r_imm_c(cond, d, s, imm) \ + emith_add_r_r_imm(d, s, imm) +#define emith_sub_r_r_imm_c(cond, d, s, imm) \ + emith_sub_r_r_imm(d, s, imm) + +#define emith_read8_r_r_r_c(cond, r, rs, rm) \ + emith_read8_r_r_r(r, rs, rm) +#define emith_read8s_r_r_r_c(cond, r, rs, rm) \ + emith_read8s_r_r_r(r, rs, rm) +#define emith_read16_r_r_r_c(cond, r, rs, rm) \ + emith_read16_r_r_r(r, rs, rm) +#define emith_read16s_r_r_r_c(cond, r, rs, rm) \ + emith_read16s_r_r_r(r, rs, rm) +#define emith_read_r_r_r_c(cond, r, rs, rm) \ + emith_read_r_r_r(r, rs, rm) #define emith_read_r_r_offs_c(cond, r, rs, offs) \ emith_read_r_r_offs(r, rs, offs) +#define emith_read_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_read_r_r_offs_ptr(r, rs, offs) #define emith_write_r_r_offs_c(cond, r, rs, offs) \ emith_write_r_r_offs(r, rs, offs) +#define 
emith_write_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_write_r_r_offs_ptr(r, rs, offs) #define emith_read8_r_r_offs_c(cond, r, rs, offs) \ emith_read8_r_r_offs(r, rs, offs) #define emith_write8_r_r_offs_c(cond, r, rs, offs) \ @@ -312,36 +510,51 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; // _r_r_imm - use lea #define emith_add_r_r_imm(d, s, imm) do { \ - assert(s != xSP); \ - EMIT_OP_MODRM(0x8d, 2, d, s); /* lea */ \ + EMIT_REX_IF(0, d, s); \ + emith_deref_modrm(0x8d, 2, d, s); \ EMIT(imm, s32); \ } while (0) #define emith_add_r_r_ptr_imm(d, s, imm) do { \ - if ((s) != xSP) { \ - EMIT_REX_IF(1, d, s); \ - EMIT_OP_MODRM64(0x8d, 2, d, s); /* lea */ \ - } \ - else { \ - if (d != s) \ - emith_move_r_r_ptr(d, s); \ - EMIT_REX_IF(1, 0, d); \ - EMIT_OP_MODRM64(0x81, 3, 0, d); /* add */ \ - } \ + EMIT_REX_IF(1, d, s); \ + emith_deref_modrm(0x8d, 2, d, s); \ EMIT(imm, s32); \ } while (0) +#define emith_sub_r_r_imm(d, s, imm) do { \ + if (d != s) \ + emith_move_r_r(d, s); \ + if ((s32)(imm) != 0) \ + emith_sub_r_imm(d, imm); \ +} while (0) + #define emith_and_r_r_imm(d, s, imm) do { \ if (d != s) \ emith_move_r_r(d, s); \ - emith_and_r_imm(d, imm); \ + if ((s32)(imm) != -1) \ + emith_and_r_imm(d, imm); \ +} while (0) + +#define emith_or_r_r_imm(d, s, imm) do { \ + if (d != s) \ + emith_move_r_r(d, s); \ + if ((s32)(imm) != 0) \ + emith_or_r_imm(d, imm); \ +} while (0) + +#define emith_eor_r_r_imm(d, s, imm) do { \ + if (d != s) \ + emith_move_r_r(d, s); \ + if ((s32)(imm) != 0) \ + emith_eor_r_imm(d, imm); \ } while (0) // shift #define emith_shift(op, d, s, cnt) do { \ if (d != s) \ emith_move_r_r(d, s); \ - EMIT_OP_MODRM(0xc1, 3, op, d); \ + EMIT_REX_IF(0, 0, d); \ + EMIT_OP_MODRM64(0xc1, 3, op, d); \ EMIT(cnt, u8); \ } while (0) @@ -360,57 +573,69 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_ror(d, s, cnt) \ emith_shift(1, d, s, cnt) -#define emith_rolc(r) \ - EMIT_OP_MODRM(0xd1, 3, 2, r) +#define emith_rolc(r) do { \ + EMIT_REX_IF(0, 
0, r); \ + EMIT_OP_MODRM64(0xd1, 3, 2, r); \ +} while (0) -#define emith_rorc(r) \ - EMIT_OP_MODRM(0xd1, 3, 3, r) +#define emith_rorc(r) do { \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP_MODRM64(0xd1, 3, 3, r); \ +} while (0) // misc -#define emith_push(r) \ - EMIT_OP(0x50 + (r)) +#define emith_push(r) do { \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP(0x50 + ((r)&7)); \ +} while (0) #define emith_push_imm(imm) do { \ EMIT_OP(0x68); \ EMIT(imm, u32); \ } while (0) -#define emith_pop(r) \ - EMIT_OP(0x58 + (r)) +#define emith_pop(r) do { \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP(0x58 + ((r)&7)); \ +} while (0) -#define emith_neg_r(r) \ - EMIT_OP_MODRM(0xf7, 3, 3, r) +#define emith_neg_r(r) do { \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP_MODRM64(0xf7, 3, 3, r); \ +} while (0) -#define emith_clear_msb(d, s, count) { \ +#define emith_clear_msb(d, s, count) do { \ u32 t = (u32)-1; \ t >>= count; \ if (d != s) \ emith_move_r_r(d, s); \ emith_and_r_imm(d, t); \ -} +} while (0) -#define emith_clear_msb_c(cond, d, s, count) { \ +#define emith_clear_msb_c(cond, d, s, count) do { \ (void)(cond); \ emith_clear_msb(d, s, count); \ -} +} while (0) -#define emith_sext(d, s, bits) { \ +#define emith_sext(d, s, bits) do { \ emith_lsl(d, s, 32 - (bits)); \ emith_asr(d, d, 32 - (bits)); \ -} +} while (0) + +#define emith_uext_ptr(r) /**/ #define emith_setc(r) do { \ assert(is_abcdx(r)); \ - EMIT_OP(0x0f); \ - EMIT_OP_MODRM(0x92, 3, 0, r); /* SETC r */ \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP_MODRM64(0x0f92, 3, 0, r); /* SETC r */ \ } while (0) // XXX: stupid mess #define emith_mul_(op, dlo, dhi, s1, s2) do { \ int rmr; \ - if (dlo != xAX && dhi != xAX) \ + if (dlo != xAX && dhi != xAX && rcache_is_hreg_used(xAX)) \ emith_push(xAX); \ - if (dlo != xDX && dhi != xDX) \ + if (dlo != xDX && dhi != xDX && rcache_is_hreg_used(xDX)) \ emith_push(xDX); \ if ((s1) == xAX) \ rmr = s2; \ @@ -420,17 +645,17 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; emith_move_r_r(xAX, s1); \ rmr = s2; \ } \ - 
EMIT_OP_MODRM(0xf7, 3, op, rmr); /* xMUL rmr */ \ - /* XXX: using push/pop for the case of edx->eax; eax->edx */ \ - if (dhi != xDX && dhi != -1) \ - emith_push(xDX); \ - if (dlo != xAX) \ - emith_move_r_r(dlo, xAX); \ - if (dhi != xDX && dhi != -1) \ - emith_pop(dhi); \ - if (dlo != xDX && dhi != xDX) \ + EMIT_REX_IF(0, 0, rmr); \ + EMIT_OP_MODRM64(0xf7, 3, op, rmr); /* xMUL rmr */ \ + if (dlo != xAX) { \ + EMIT_REX_IF(0, 0, dlo); \ + EMIT_OP(0x90 + ((dlo)&7)); /* XCHG eax, dlo */ \ + } \ + if (dhi != xDX && dhi != -1 && !(dhi == xAX && dlo == xDX)) \ + emith_move_r_r(dhi, (dlo == xDX ? xAX : xDX)); \ + if (dlo != xDX && dhi != xDX && rcache_is_hreg_used(xDX)) \ emith_pop(xDX); \ - if (dlo != xAX && dhi != xAX) \ + if (dlo != xAX && dhi != xAX && rcache_is_hreg_used(xAX)) \ emith_pop(xAX); \ } while (0) @@ -440,23 +665,36 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_mul_s64(dlo, dhi, s1, s2) \ emith_mul_(5, dlo, dhi, s1, s2) /* IMUL */ -#define emith_mul(d, s1, s2) \ - emith_mul_(4, d, -1, s1, s2) +#define emith_mul(d, s1, s2) do { \ + if (d == s1) { \ + EMIT_REX_IF(0, d, s2); \ + EMIT_OP_MODRM64(0x0faf, 3, d, s2); \ + } else if (d == s2) { \ + EMIT_REX_IF(0, d, s1); \ + EMIT_OP_MODRM64(0x0faf, 3, d, s1); \ + } else { \ + emith_move_r_r(d, s1); \ + EMIT_REX_IF(0, d, s2); \ + EMIT_OP_MODRM64(0x0faf, 3, d, s2); \ + } \ +} while (0) // (dlo,dhi) += signed(s1) * signed(s2) #define emith_mula_s64(dlo, dhi, s1, s2) do { \ emith_push(dhi); \ emith_push(dlo); \ emith_mul_(5, dlo, dhi, s1, s2); \ - EMIT_OP_MODRM(0x03, 0, dlo, 4); \ - EMIT_SIB(0, 4, 4); /* add dlo, [xsp] */ \ - EMIT_OP_MODRM(0x13, 1, dhi, 4); \ - EMIT_SIB(0, 4, 4); \ - EMIT(sizeof(void *), u8); /* adc dhi, [xsp+{4,8}] */ \ + EMIT_REX_IF(0, dlo, xSP); \ + emith_deref_modrm(0x03, 0, dlo, xSP); /* add dlo, [xsp] */ \ + EMIT_REX_IF(0, dhi, xSP); \ + emith_deref_modrm(0x13, 1, dhi, xSP); /* adc dhi, [xsp+{4,8}] */ \ + EMIT(sizeof(void *), u8); \ emith_add_r_r_ptr_imm(xSP, xSP, 
sizeof(void *) * 2); \ } while (0) // "flag" instructions are the same +#define emith_adcf_r_imm emith_adc_r_imm +#define emith_subf_r_imm emith_sub_r_imm #define emith_addf_r_r emith_add_r_r #define emith_subf_r_r emith_sub_r_r #define emith_adcf_r_r emith_adc_r_r @@ -464,6 +702,14 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_eorf_r_r emith_eor_r_r #define emith_negcf_r_r emith_negc_r_r +#define emith_subf_r_r_imm emith_sub_r_r_imm +#define emith_addf_r_r_r emith_add_r_r_r +#define emith_subf_r_r_r emith_sub_r_r_r +#define emith_adcf_r_r_r emith_adc_r_r_r +#define emith_sbcf_r_r_r emith_sbc_r_r_r +#define emith_eorf_r_r_r emith_eor_r_r_r +#define emith_addf_r_r_r_lsr emith_add_r_r_r_lsr + #define emith_lslf emith_lsl #define emith_lsrf emith_lsr #define emith_asrf emith_asr @@ -472,60 +718,131 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_rolcf emith_rolc #define emith_rorcf emith_rorc +#define emith_deref_modrm(op, m, r, rs) do { \ + if (((rs) & 7) == 5 && m == 0) { /* xBP,xR13 not in mod 0, use mod 1 */\ + EMIT_OP_MODRM64(op, 1, r, rs); \ + EMIT(0, u8); \ + } else if (((rs) & 7) == 4) { /* xSP,xR12 must use SIB */ \ + EMIT_OP_MODRM64(op, m, r, 4); \ + EMIT_SIB64(0, 4, rs); \ + } else \ + EMIT_OP_MODRM64(op, m, r, rs); \ +} while (0) + #define emith_deref_op(op, r, rs, offs) do { \ /* mov r <-> [ebp+#offs] */ \ - if ((offs) >= 0x80) { \ - EMIT_OP_MODRM64(op, 2, r, rs); \ + if ((offs) == 0) { \ + emith_deref_modrm(op, 0, r, rs); \ + } else if ((s32)(offs) < -0x80 || (s32)(offs) >= 0x80) { \ + emith_deref_modrm(op, 2, r, rs); \ EMIT(offs, u32); \ } else { \ - EMIT_OP_MODRM64(op, 1, r, rs); \ - EMIT(offs, u8); \ + emith_deref_modrm(op, 1, r, rs); \ + EMIT((u8)offs, u8); \ } \ } while (0) -#define is_abcdx(r) (xAX <= (r) && (r) <= xDX) +#define is_abcdx(r) !((r) & ~0x3) -#define emith_read_r_r_offs(r, rs, offs) \ - emith_deref_op(0x8b, r, rs, offs) +#define emith_read_r_r_offs(r, rs, offs) do { \ + EMIT_REX_IF(0, r, 
rs); \ + emith_deref_op(0x8b, r, rs, offs); \ +} while (0) +#define emith_read_r_r_offs_ptr(r, rs, offs) do { \ + EMIT_REX_IF(1, r, rs); \ + emith_deref_op(0x8b, r, rs, offs); \ +} while (0) -#define emith_write_r_r_offs(r, rs, offs) \ - emith_deref_op(0x89, r, rs, offs) +#define emith_write_r_r_offs(r, rs, offs) do { \ + EMIT_REX_IF(0, r, rs); \ + emith_deref_op(0x89, r, rs, offs); \ +} while (0) +#define emith_write_r_r_offs_ptr(r, rs, offs) do { \ + EMIT_REX_IF(1, r, rs); \ + emith_deref_op(0x89, r, rs, offs); \ +} while (0) -// note: don't use prefixes on this #define emith_read8_r_r_offs(r, rs, offs) do { \ - int r_ = r; \ - if (!is_abcdx(r)) \ - r_ = rcache_get_tmp(); \ - emith_deref_op(0x8a, r_, rs, offs); \ - if ((r) != r_) { \ - emith_move_r_r(r, r_); \ - rcache_free_tmp(r_); \ - } \ + EMIT_REX_IF(0, r, rs); \ + emith_deref_op(0x0fb6, r, rs, offs); \ +} while (0) + +#define emith_read8s_r_r_offs(r, rs, offs) do { \ + EMIT_REX_IF(0, r, rs); \ + emith_deref_op(0x0fbe, r, rs, offs); \ } while (0) #define emith_write8_r_r_offs(r, rs, offs) do {\ - int r_ = r; \ - if (!is_abcdx(r)) { \ - r_ = rcache_get_tmp(); \ - emith_move_r_r(r_, r); \ - } \ - emith_deref_op(0x88, r_, rs, offs); \ - if ((r) != r_) \ - rcache_free_tmp(r_); \ + EMIT_REX_IF(0, r, rs); \ + emith_deref_op(0x88, r, rs, offs); \ } while (0) #define emith_read16_r_r_offs(r, rs, offs) do { \ - EMIT(0x66, u8); /* operand override */ \ - emith_read_r_r_offs(r, rs, offs); \ + EMIT_REX_IF(0, r, rs); \ + emith_deref_op(0x0fb7, r, rs, offs); \ +} while (0) + +#define emith_read16s_r_r_offs(r, rs, offs) do { \ + EMIT_REX_IF(0, r, rs); \ + emith_deref_op(0x0fbf, r, rs, offs); \ } while (0) #define emith_write16_r_r_offs(r, rs, offs) do { \ - EMIT(0x66, u8); \ - emith_write_r_r_offs(r, rs, offs); \ + EMIT(0x66, u8); /* Intel SDM Vol 2a: REX must be closest to opcode */ \ + EMIT_REX_IF(0, r, rs); \ + emith_deref_op(0x89, r, rs, offs); \ +} while (0) + +#define emith_read8_r_r_r(r, rs, rm) do { \ + 
EMIT_XREX_IF(0, r, rm, rs); \ + EMIT_OP_MODRM64(0x0fb6, 0, r, 4); \ + EMIT_SIB64(0, rs, rm); /* mov r, [rm + rs * 1] */ \ +} while (0) + +#define emith_read8s_r_r_r(r, rs, rm) do { \ + EMIT_XREX_IF(0, r, rm, rs); \ + EMIT_OP_MODRM64(0x0fbe, 0, r, 4); \ + EMIT_SIB64(0, rs, rm); /* mov r, [rm + rs * 1] */ \ +} while (0) + +#define emith_read16_r_r_r(r, rs, rm) do { \ + EMIT_XREX_IF(0, r, rm, rs); \ + EMIT_OP_MODRM64(0x0fb7, 0, r, 4); \ + EMIT_SIB64(0, rs, rm); /* mov r, [rm + rs * 1] */ \ +} while (0) + +#define emith_read16s_r_r_r(r, rs, rm) do { \ + EMIT_XREX_IF(0, r, rm, rs); \ + EMIT_OP_MODRM64(0x0fbf, 0, r, 4); \ + EMIT_SIB64(0, rs, rm); /* mov r, [rm + rs * 1] */ \ +} while (0) + +#define emith_read_r_r_r(r, rs, rm) do { \ + EMIT_XREX_IF(0, r, rm, rs); \ + EMIT_OP_MODRM64(0x8b, 0, r, 4); \ + EMIT_SIB64(0, rs, rm); /* mov r, [rm + rs * 1] */ \ +} while (0) +#define emith_read_r_r_r_ptr(r, rs, rm) do { \ + EMIT_XREX_IF(1, r, rm, rs); \ + EMIT_OP_MODRM64(0x8b, 0, r, 4); \ + EMIT_SIB64(0, rs, rm); /* mov r, [rm + rs * 1] */ \ +} while (0) + +#define emith_write_r_r_r(r, rs, rm) do { \ + EMIT_XREX_IF(0, r, rm, rs); \ + EMIT_OP_MODRM64(0x89, 0, r, 4); \ + EMIT_SIB64(0, rs, rm); /* mov [rm + rs * 1], r */ \ +} while (0) +#define emith_write_r_r_r_ptr(r, rs, rm) do { \ + EMIT_XREX_IF(1, r, rm, rs); \ + EMIT_OP_MODRM64(0x89, 0, r, 4); \ + EMIT_SIB64(0, rs, rm); /* mov [rm + rs * 1], r */ \ } while (0) #define emith_ctx_read(r, offs) \ emith_read_r_r_offs(r, CONTEXT_REG, offs) +#define emith_ctx_read_c(cond, r, offs) \ + emith_ctx_read(r, offs) #define emith_ctx_read_ptr(r, offs) do { \ EMIT_REX_IF(1, r, CONTEXT_REG); \ @@ -547,75 +864,101 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; emith_ctx_write(r_, offs_); \ } while (0) -// assumes EBX is free -#define emith_ret_to_ctx(offs) { \ - emith_pop(xBX); \ - emith_ctx_write(xBX, offs); \ -} +#define emith_ret_to_ctx(offs) do { \ + int tmp_ = rcache_get_tmp(); \ + emith_pop(tmp_); \ + emith_ctx_write(tmp_, offs); 
\ + rcache_free_tmp(tmp_); \ +} while (0) -#define emith_jump(ptr) { \ +#define emith_jump(ptr) do { \ u32 disp = (u8 *)(ptr) - ((u8 *)tcache_ptr + 5); \ EMIT_OP(0xe9); \ EMIT(disp, u32); \ -} +} while (0) #define emith_jump_patchable(target) \ emith_jump(target) #define emith_jump_cond(cond, ptr) do { \ u32 disp = (u8 *)(ptr) - ((u8 *)tcache_ptr + 6); \ - EMIT(0x0f, u8); \ - EMIT_OP(0x80 | (cond)); \ + EMIT_OP(0x0f80 | (cond)); \ EMIT(disp, u32); \ } while (0) +#define emith_jump_cond_inrange(ptr) !0 #define emith_jump_cond_patchable(cond, target) \ emith_jump_cond(cond, target) -#define emith_jump_patch(ptr, target) do { \ +#define emith_jump_patch(ptr, target, pos) do { \ u32 disp_ = (u8 *)(target) - ((u8 *)(ptr) + 4); \ u32 offs_ = (*(u8 *)(ptr) == 0x0f) ? 2 : 1; \ EMIT_PTR((u8 *)(ptr) + offs_, disp_ - offs_, u32); \ + if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)ptr + offs_; \ } while (0) +#define emith_jump_patch_size() 4 +#define emith_jump_patch_inrange(ptr, target) !0 -#define emith_jump_at(ptr, target) { \ +#define emith_jump_at(ptr, target) do { \ u32 disp_ = (u8 *)(target) - ((u8 *)(ptr) + 5); \ EMIT_PTR(ptr, 0xe9, u8); \ EMIT_PTR((u8 *)(ptr) + 1, disp_, u32); \ -} +} while (0) +#define emith_jump_at_size() 5 -#define emith_call(ptr) { \ +#define emith_call(ptr) do { \ u32 disp = (u8 *)(ptr) - ((u8 *)tcache_ptr + 5); \ EMIT_OP(0xe8); \ EMIT(disp, u32); \ -} +} while (0) #define emith_call_cond(cond, ptr) \ emith_call(ptr) -#define emith_call_reg(r) \ - EMIT_OP_MODRM(0xff, 3, 2, r) +#define emith_call_reg(r) do { \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP_MODRM(0xff, 3, 2, (r)&7); \ +} while (0) #define emith_call_ctx(offs) do { \ EMIT_OP_MODRM(0xff, 2, 2, CONTEXT_REG); \ EMIT(offs, u32); \ } while (0) +#define emith_call_cleanup() \ + emith_add_r_r_ptr_imm(xSP, xSP, sizeof(void *)); // remove return addr + #define emith_ret() \ EMIT_OP(0xc3) -#define emith_jump_reg(r) \ - EMIT_OP_MODRM(0xff, 3, 4, r) +#define emith_add_r_ret(r) do { \ + EMIT_REX_IF(1, r, 
xSP); \ + emith_deref_modrm(0x03, 0, r, xSP); /* add r, [xsp] */ \ +} while (0) + +#define emith_jump_reg(r) do { \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP_MODRM(0xff, 3, 4, (r)&7); \ +} while (0) #define emith_jump_ctx(offs) do { \ EMIT_OP_MODRM(0xff, 2, 4, CONTEXT_REG); \ EMIT(offs, u32); \ } while (0) -#define emith_push_ret() +#define emith_push_ret(r) do { \ + int r_ = (r >= 0 ? r : xSI); \ + emith_push(r_); /* always push to align */ \ + emith_add_r_r_ptr_imm(xSP, xSP, -8*4); /* args shadow space */ \ +} while (0) + +#define emith_pop_and_ret(r) do { \ + int r_ = (r >= 0 ? r : xSI); \ + emith_add_r_r_ptr_imm(xSP, xSP, 8*4); /* args shadow space */ \ + emith_pop(r_); \ + emith_ret(); \ +} while (0) -#define emith_pop_and_ret() \ - emith_ret() #define EMITH_JMP_START(cond) { \ u8 *cond_ptr; \ @@ -637,17 +980,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; JMP8_EMIT_NC(else_ptr); \ } -// "simple" jump (no more then a few insns) +// "simple" jump (no more than a few insns) // ARM will use conditional instructions here -#define EMITH_SJMP_DECL_() \ - u8 *cond_ptr - -#define EMITH_SJMP_START_(cond) \ - JMP8_POS(cond_ptr) - -#define EMITH_SJMP_END_(cond) \ - JMP8_EMIT(cond, cond_ptr) - #define EMITH_SJMP_START EMITH_JMP_START #define EMITH_SJMP_END EMITH_JMP_END @@ -655,6 +989,15 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define EMITH_SJMP3_MID EMITH_JMP3_MID #define EMITH_SJMP3_END EMITH_JMP3_END +#define EMITH_SJMP2_START(cond) \ + EMITH_SJMP3_START(cond) +#define EMITH_SJMP2_MID(cond) \ + EMITH_SJMP3_MID(cond) +#define EMITH_SJMP2_END(cond) \ + EMITH_SJMP3_END() + +#define EMITH_HINT_COND(cond) /**/ + #define emith_pass_arg_r(arg, reg) do { \ int rd = 7; \ host_arg2reg(rd, arg); \ @@ -667,132 +1010,202 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; emith_move_r_imm(rd, imm); \ } while (0) -#define host_instructions_updated(base, end) +#define host_instructions_updated(base, end) (void)(base),(void)(end) +#define emith_update_cache() 
/**/ + +// NB this MUST be <0x40000000 to avoid overflow in address calculations +#define emith_rw_offs_max() 0xfffffff // for better perfomance: <0x10000000 #ifdef __x86_64__ +#define HOST_REGS 16 #define PTR_SCALE 3 -#define NA_TMP_REG xAX // non-arg tmp from reg_temp[] -#define EMIT_REX_IF(w, r, rm) do { \ - int r_ = (r) > 7 ? 1 : 0; \ - int rm_ = (rm) > 7 ? 1 : 0; \ - if ((w) | r_ | rm_) \ - EMIT_REX(1, r_, 0, rm_); \ +#define EMIT_XREX_IF(w, r, rm, rs) do { \ + int xr_ = (r) > 7 ? 1 : 0; \ + int xb_ = (rm) > 7 ? 1 : 0; \ + int xx_ = (rs) > 7 ? 1 : 0; \ + if ((w) | xr_ | xx_ | xb_) \ + EMIT_REX(w, xr_, xx_, xb_); \ } while (0) + +#define EMIT_REX_IF(w, r, rm) \ + EMIT_XREX_IF(w, r, rm, 0) #ifndef _WIN32 +// SystemV ABI conventions: +// rbx,rbp,r12-r15 are preserved, rax,rcx,rdx,rsi,rdi,r8-r11 are temporaries +// parameters in rdi,rsi,rdx,rcx,r8,r9, return values in rax,rdx +#define PARAM_REGS { xDI, xSI, xDX, xCX, xR8, xR9 } +#define PRESERVED_REGS { xR12, xR13, xR14, xR15, xBX, xBP } +#define TEMPORARY_REGS { xAX, xR10, xR11 } +#define STATIC_SH2_REGS { SHR_SR,xBX , SHR_R0,xR15 } + #define host_arg2reg(rd, arg) \ switch (arg) { \ case 0: rd = xDI; break; \ case 1: rd = xSI; break; \ case 2: rd = xDX; break; \ + default: rd = xCX; break; \ } -#define emith_sh2_drc_entry() { \ +#define emith_sh2_drc_entry() do { \ emith_push(xBX); \ emith_push(xBP); \ + emith_push(xR12); \ + emith_push(xR13); \ + emith_push(xR14); \ + emith_push(xR15); \ emith_push(xSI); /* to align */ \ -} +} while (0) -#define emith_sh2_drc_exit() { \ +#define emith_sh2_drc_exit() do { \ emith_pop(xSI); \ + emith_pop(xR15); \ + emith_pop(xR14); \ + emith_pop(xR13); \ + emith_pop(xR12); \ emith_pop(xBP); \ emith_pop(xBX); \ emith_ret(); \ -} +} while (0) #else // _WIN32 +// M$ ABI conventions: +// rbx,rbp,rsi,rdi,r12-r15 are preserved, rcx,rdx,rax,r8,r9,r10,r11 temporaries +// parameters in rcx,rdx,r8,r9, return values in rax,rdx +#define PARAM_REGS { xCX, xDX, xR8, xR9 } +#define 
PRESERVED_REGS { xSI, xDI, xR12, xR13, xR14, xR15, xBX, xBP } +#define TEMPORARY_REGS { xAX, xR10, xR11 } +#define STATIC_SH2_REGS { SHR_SR,xBX , SHR_R(0),xR15 , SHR_R(1),xR14 } + #define host_arg2reg(rd, arg) \ switch (arg) { \ case 0: rd = xCX; break; \ case 1: rd = xDX; break; \ - case 2: rd = 8; break; \ + case 2: rd = xR8; break; \ + default: rd = xR9; break; \ } -#define emith_sh2_drc_entry() { \ +#define emith_sh2_drc_entry() do { \ emith_push(xBX); \ emith_push(xBP); \ + emith_push(xR12); \ + emith_push(xR13); \ + emith_push(xR14); \ + emith_push(xR15); \ emith_push(xSI); \ emith_push(xDI); \ - emith_add_r_r_ptr_imm(xSP, xSP, -8*5); \ -} + emith_add_r_r_ptr_imm(xSP, xSP, -8*5); /* align + args shadow space */ \ +} while (0) -#define emith_sh2_drc_exit() { \ +#define emith_sh2_drc_exit() do { \ emith_add_r_r_ptr_imm(xSP, xSP, 8*5); \ emith_pop(xDI); \ emith_pop(xSI); \ + emith_pop(xR15); \ + emith_pop(xR14); \ + emith_pop(xR13); \ + emith_pop(xR12); \ emith_pop(xBP); \ emith_pop(xBX); \ emith_ret(); \ -} +} while (0) #endif // _WIN32 #else // !__x86_64__ +#define HOST_REGS 8 #define PTR_SCALE 2 -#define NA_TMP_REG xBX // non-arg tmp from reg_temp[] #define EMIT_REX_IF(w, r, rm) do { \ assert((u32)(r) < 8u); \ assert((u32)(rm) < 8u); \ } while (0) +#define EMIT_XREX_IF(w, r, rs, rm) do { \ + assert((u32)(r) < 8u); \ + assert((u32)(rs) < 8u); \ + assert((u32)(rm) < 8u); \ +} while (0) + +// MS/SystemV ABI: ebx,esi,edi,ebp are preserved, eax,ecx,edx are temporaries +// DRC uses REGPARM to pass upto 3 parameters in registers eax,ecx,edx. +// To avoid conflicts with param passing ebx must be declared temp here. 
+#define PARAM_REGS { xAX, xDX, xCX } +#define PRESERVED_REGS { xSI, xDI, xBP } +#define TEMPORARY_REGS { xBX } +#define STATIC_SH2_REGS { SHR_SR,xDI , SHR_R0,xSI } #define host_arg2reg(rd, arg) \ switch (arg) { \ case 0: rd = xAX; break; \ case 1: rd = xDX; break; \ case 2: rd = xCX; break; \ + default: rd = xBX; break; \ } -#define emith_sh2_drc_entry() { \ +#define emith_sh2_drc_entry() do { \ emith_push(xBX); \ emith_push(xBP); \ emith_push(xSI); \ emith_push(xDI); \ -} +} while (0) -#define emith_sh2_drc_exit() { \ +#define emith_sh2_drc_exit() do { \ emith_pop(xDI); \ emith_pop(xSI); \ emith_pop(xBP); \ emith_pop(xBX); \ emith_ret(); \ -} +} while (0) #endif #define emith_save_caller_regs(mask) do { \ - if ((mask) & (1 << xAX)) emith_push(xAX); \ - if ((mask) & (1 << xCX)) emith_push(xCX); \ - if ((mask) & (1 << xDX)) emith_push(xDX); \ - if ((mask) & (1 << xSI)) emith_push(xSI); \ - if ((mask) & (1 << xDI)) emith_push(xDI); \ + int _c; u32 _m = mask & 0xfc7; /* AX, CX, DX, SI, DI, 8, 9, 10, 11 */ \ + if (__builtin_parity(_m) == 1) _m |= 0x8; /* BX for ABI align */ \ + for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + if (_m & (1 << _c)) emith_push(_c); \ } while (0) #define emith_restore_caller_regs(mask) do { \ - if ((mask) & (1 << xDI)) emith_pop(xDI); \ - if ((mask) & (1 << xSI)) emith_pop(xSI); \ - if ((mask) & (1 << xDX)) emith_pop(xDX); \ - if ((mask) & (1 << xCX)) emith_pop(xCX); \ - if ((mask) & (1 << xAX)) emith_pop(xAX); \ + int _c; u32 _m = mask & 0xfc7; \ + if (__builtin_parity(_m) == 1) _m |= 0x8; /* BX for ABI align */ \ + for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ + if (_m & (1 << _c)) emith_pop(_c); \ +} while (0) + +#define emith_sh2_rcall(a, tab, func, mask) do { \ + emith_lsr(mask, a, SH2_READ_SHIFT); \ + EMIT_XREX_IF(1, tab, tab, mask); \ + EMIT_OP_MODRM64(0x8d, 0, tab, 4); \ + EMIT_SIB64(PTR_SCALE, mask, tab); /* lea tab, [tab + mask * {4,8}] */ \ + EMIT_XREX_IF(1, tab, tab, mask); \ + 
EMIT_OP_MODRM64(0x8d, 0, tab, 4); \ + EMIT_SIB64(PTR_SCALE, mask, tab); /* lea tab, [tab + mask * {4,8}] */ \ + EMIT_REX_IF(1, func, tab); \ + emith_deref_modrm(0x8b, 0, func, tab); /* mov func, [tab] */ \ + EMIT_REX_IF(0, mask, tab); \ + emith_deref_modrm(0x8b, 1, mask, tab); \ + EMIT(1 << PTR_SCALE, u8); /* mov mask, [tab + {4,8}] */ \ + emith_add_r_r_ptr(func, func); \ } while (0) -#define emith_sh2_wcall(a, tab) { \ +#define emith_sh2_wcall(a, val, tab, func) do { \ int arg2_; \ host_arg2reg(arg2_, 2); \ - emith_lsr(NA_TMP_REG, a, SH2_WRITE_SHIFT); \ - EMIT_REX_IF(1, NA_TMP_REG, tab); \ - EMIT_OP_MODRM64(0x8b, 0, NA_TMP_REG, 4); \ - EMIT_SIB64(PTR_SCALE, NA_TMP_REG, tab); /* mov tmp, [tab + tmp * {4,8}] */ \ + emith_lsr(func, a, SH2_WRITE_SHIFT); /* tmp = a >> WRT_SHIFT */ \ + EMIT_XREX_IF(1, func, tab, func); \ + EMIT_OP_MODRM64(0x8b, 0, func, 4); \ + EMIT_SIB64(PTR_SCALE, func, tab); /* mov tmp, [tab + tmp * {4,8}] */ \ emith_move_r_r_ptr(arg2_, CONTEXT_REG); \ - emith_jump_reg(NA_TMP_REG); \ -} + emith_jump_reg(func); \ +} while (0) -#define emith_sh2_dtbf_loop() { \ +#define emith_sh2_dtbf_loop() do { \ u8 *jmp0; /* negative cycles check */ \ u8 *jmp1; /* unsinged overflow check */ \ int cr, rn; \ @@ -816,15 +1229,63 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; emith_move_r_imm(rn, 0); \ JMP8_EMIT(ICOND_JA, jmp1); \ rcache_free_tmp(tmp_); \ -} +} while (0) -#define emith_write_sr(sr, srcr) { \ +#define emith_sh2_delay_loop(cycles, reg) do { \ + int sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); \ + int t1 = rcache_get_tmp(); \ + int t2 = rcache_get_tmp(); \ + int t3 = rcache_get_tmp(); \ + if (t3 == xAX) { t3 = t1; t1 = xAX; } /* for MUL */ \ + if (t3 == xDX) { t3 = t2; t2 = xDX; } \ + /* if (sr < 0) return */ \ + emith_asrf(t2, sr, 12); \ + EMITH_JMP_START(DCOND_LE); \ + /* turns = sr.cycles / cycles */ \ + emith_move_r_imm(t3, (u32)((1ULL<<32) / (cycles)) + 1); \ + emith_mul_u64(t1, t2, t2, t3); /* multiply by 1/x */ \ + rcache_free_tmp(t3); \ + 
if (reg >= 0) { \ + /* if (reg <= turns) turns = reg-1 */ \ + t3 = rcache_get_reg(reg, RC_GR_RMW, NULL); \ + emith_cmp_r_r(t3, t2); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_sub_r_r_imm_c(DCOND_LS, t2, t3, 1); \ + EMITH_SJMP_END(DCOND_HI); \ + /* if (reg <= 1) turns = 0 */ \ + emith_cmp_r_imm(t3, 1); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_move_r_imm_c(DCOND_LS, t2, 0); \ + EMITH_SJMP_END(DCOND_HI); \ + /* reg -= turns */ \ + emith_sub_r_r(t3, t2); \ + } \ + /* sr.cycles -= turns * cycles; */ \ + emith_move_r_imm(t1, cycles); \ + emith_mul_u64(t1, t2, t1, t2); \ + emith_sub_r_r_r_lsl(sr, sr, t1, 12); \ + EMITH_JMP_END(DCOND_LE); \ + rcache_free_tmp(t1); \ + rcache_free_tmp(t2); \ +} while (0) + +#define emith_write_sr(sr, srcr) do { \ int tmp_ = rcache_get_tmp(); \ emith_clear_msb(tmp_, srcr, 22); \ emith_bic_r_imm(sr, 0x3ff); \ emith_or_r_r(sr, tmp_); \ rcache_free_tmp(tmp_); \ -} +} while (0) + +#define emith_carry_to_t(sr, is_sub) do { \ + emith_rorc(sr); \ + emith_rol(sr, sr, 1); \ +} while (0) + +#define emith_t_to_carry(sr, is_sub) do { \ + emith_ror(sr, sr, 1); \ + emith_rol(sr, sr, 1); \ +} while (0) #define emith_tpop_carry(sr, is_sub) \ emith_lsr(sr, sr, 1) @@ -833,15 +1294,19 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; emith_adc_r_r(sr, sr) /* + * T = carry(Rn = (Rn << 1) | T) * if Q * t = carry(Rn += Rm) * else * t = carry(Rn -= Rm) - * T ^= t + * T = !(T ^ t) */ -#define emith_sh2_div1_step(rn, rm, sr) { \ +#define emith_sh2_div1_step(rn, rm, sr) do { \ u8 *jmp0, *jmp1; \ int tmp_ = rcache_get_tmp(); \ + emith_tpop_carry(sr, 0); /* Rn = 2*Rn+T */\ + emith_adcf_r_r_r(rn, rn, rn); \ + emith_tpush_carry(sr, 0); /* T = C1 */ \ emith_eor_r_r(tmp_, tmp_); \ emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ JMP8_POS(jmp0); /* je do_sub */ \ @@ -851,7 +1316,100 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; emith_sub_r_r(rn, rm); \ JMP8_EMIT_NC(jmp1); /* done: */ \ emith_adc_r_r(tmp_, tmp_); \ - emith_eor_r_r(sr, tmp_); \ + emith_eor_r_r(sr, 
tmp_);/* T = !(C1^C2) */\ + emith_eor_r_imm(sr, T); \ rcache_free_tmp(tmp_); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macl(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* MACH top 16 bits unused if saturated. sign ext for overfl detect */ \ + emith_sext(mh, mh, 16); \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 17 bits of MACH aren't all 1 or 0 */ \ + /* to check: add MACH[15] to MACH[31:16]. this is 0 if no overflow */ \ + emith_asrf(rn, mh, 16); /* sum = (MACH>>16) + ((MACH>>15)&1) */ \ + emith_adcf_r_imm(rn, 0); /* (MACH>>15) is in carry after shift */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> ov */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x0000); /* -overflow */ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x8000); \ + EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> +ovl */ \ + emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0xffffffff */ \ + emith_sub_r_imm_c(DCOND_GT, mh, 1); /* 0x00007fff */ \ + EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macw(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* XXX: MACH should be untouched when S is set? */ \ + emith_asr(mh, ml, 31); /* sign ext MACL to MACH for ovrfl check */ \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 33 bits of MACH:MACL aren't all 1 or 0 */ \ + /* to check: add MACL[31] to MACH. this is 0 if no overflow */ \ + emith_lsr(rn, ml, 31); \ + emith_addf_r_r(rn, mh); /* sum = MACH + ((MACL>>31)&1) */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> overflow */ \ + /* XXX: LSB signalling only in SH1, or in SH2 too? 
*/ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x00000001); /* LSB of MACH */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x80000000); /* negative ovrfl */ \ + EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> positive ovrfl */ \ + emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0x7fffffff */ \ + EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +#define emith_pool_check() /**/ +#define emith_pool_commit(j) /**/ +#define emith_insn_ptr() ((u8 *)tcache_ptr) +#define emith_flush() /**/ + +#ifdef T +// T bit handling +#define emith_invert_cond(cond) \ + ((cond) ^ 1) + +static void emith_clr_t_cond(int sr) +{ + emith_bic_r_imm(sr, T); +} + +static void emith_set_t_cond(int sr, int cond) +{ + EMITH_SJMP_START(emith_invert_cond(cond)); + emith_or_r_imm_c(cond, sr, T); + EMITH_SJMP_END(emith_invert_cond(cond)); } +#define emith_get_t_cond() -1 + +#define emith_sync_t(sr) ((void)sr) + +#define emith_invalidate_t() + +static void emith_set_t(int sr, int val) +{ + if (val) + emith_or_r_imm(sr, T); + else + emith_bic_r_imm(sr, T); +} + +static int emith_tst_t(int sr, int tf) +{ + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; +} +#endif diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index aa41a84df..085a61793 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -1,26 +1,30 @@ /* * SH2 recompiler * (C) notaz, 2009,2010,2013 + * (C) kub, 2018,2019,2020 * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. 
* * notes: - * - tcache, block descriptor, link buffer overflows result in sh2_translate() - * failure, followed by full tcache invalidation for that region + * - tcache, block descriptor, block entry buffer overflows result in oldest + * blocks being deleted until enough space is available + * - link and list element buffer overflows result in failure and exit * - jumps between blocks are tracked for SMC handling (in block_entry->links), - * except jumps between different tcaches + * except jumps from global to CPU-local tcaches * * implemented: * - static register allocation * - remaining register caching and tracking in temporaries * - block-local branch linking - * - block linking (except between tcaches) + * - block linking * - some constant propagation + * - call stack caching for host block entry address + * - delay, poll, and idle loop detection and handling + * - some T/M flag optimizations where the value is known or isn't used * * TODO: * - better constant propagation - * - stack caching? * - bug fixing */ #include @@ -38,14 +42,17 @@ // features #define PROPAGATE_CONSTANTS 1 #define LINK_BRANCHES 1 - -// limits (per block) -#define MAX_BLOCK_SIZE (BLOCK_INSN_LIMIT * 6 * 6) - -// max literal offset from the block end -#define MAX_LITERAL_OFFSET 32*2 -#define MAX_LITERALS (BLOCK_INSN_LIMIT / 4) -#define MAX_LOCAL_BRANCHES 32 +#define BRANCH_CACHE 1 +#define CALL_STACK 1 +#define ALIAS_REGISTERS 1 +#define REMAP_REGISTER 1 +#define LOOP_DETECTION 1 +#define LOOP_OPTIMIZER 1 +#define T_OPTIMIZER 1 + +#define MAX_LITERAL_OFFSET 0x200 // max. 
MOVA, MOV @(PC) offset +#define MAX_LOCAL_TARGETS (BLOCK_INSN_LIMIT / 4) +#define MAX_LOCAL_BRANCHES (BLOCK_INSN_LIMIT / 2) // debug stuff // 01 - warnings/errors @@ -53,9 +60,16 @@ // 04 - asm // 08 - runtime block entry log // 10 - smc self-check +// 20 - runtime block entry counter +// 40 - rcache checking +// 80 - branch cache statistics +// 100 - write trace +// 200 - compare trace +// 400 - block entry backtrace on exit +// 800 - state dump on exit // { #ifndef DRC_DEBUG -#define DRC_DEBUG 0 +#define DRC_DEBUG 0//x847 #endif #if DRC_DEBUG @@ -73,6 +87,7 @@ static int insns_compiled, hash_collisions, host_insn_count; #define dbg(...) #endif + /// #define FETCH_OP(pc) \ dr_pc_base[(pc) / 2] @@ -93,13 +108,21 @@ static int insns_compiled, hash_collisions, host_insn_count; #define GET_Rn() \ ((op >> 8) & 0x0f) -#define BITMASK1(v0) (1 << (v0)) -#define BITMASK2(v0,v1) ((1 << (v0)) | (1 << (v1))) -#define BITMASK3(v0,v1,v2) (BITMASK2(v0,v1) | (1 << (v2))) -#define BITMASK4(v0,v1,v2,v3) (BITMASK3(v0,v1,v2) | (1 << (v3))) -#define BITMASK5(v0,v1,v2,v3,v4) (BITMASK4(v0,v1,v2,v3) | (1 << (v4))) +#define SHR_T 30 // separate T for not-used detection +#define SHR_MEM 31 +#define SHR_TMP -1 + +#define T 0x00000001 +#define S 0x00000002 +#define I 0x000000f0 +#define Q 0x00000100 +#define M 0x00000200 +#define T_save 0x00000800 -#define SHR_T SHR_SR // might make them separate someday +#define I_SHIFT 4 +#define Q_SHIFT 8 +#define M_SHIFT 9 +#define T_SHIFT 11 static struct op_data { u8 op; @@ -115,319 +138,429 @@ static struct op_data { enum op_types { OP_UNHANDLED = 0, OP_BRANCH, + OP_BRANCH_N, // conditional known not to be taken OP_BRANCH_CT, // conditional, branch if T set OP_BRANCH_CF, // conditional, branch if T clear OP_BRANCH_R, // indirect OP_BRANCH_RF, // indirect far (PC + Rm) OP_SETCLRT, // T flag set/clear OP_MOVE, // register move + OP_LOAD_CONST,// load const to register OP_LOAD_POOL, // literal pool load, imm is address - OP_MOVA, - OP_SLEEP, - OP_RTE, + 
OP_MOVA, // MOVA instruction + OP_SLEEP, // SLEEP instruction + OP_RTE, // RTE instruction + OP_TRAPA, // TRAPA instruction + OP_LDC, // LDC instruction + OP_UNDEFINED, }; -#ifdef DRC_SH2 +// XXX consider trap insns: OP_TRAPA, OP_UNDEFINED? +#define OP_ISBRANCH(op) ((BITRANGE(OP_BRANCH, OP_BRANCH_RF)| BITMASK1(OP_RTE)) \ + & BITMASK1(op)) +#define OP_ISBRAUC(op) (BITMASK4(OP_BRANCH, OP_BRANCH_R, OP_BRANCH_RF, OP_RTE) \ + & BITMASK1(op)) +#define OP_ISBRACND(op) (BITMASK2(OP_BRANCH_CT, OP_BRANCH_CF) \ + & BITMASK1(op)) +#define OP_ISBRAIMM(op) (BITMASK3(OP_BRANCH, OP_BRANCH_CT, OP_BRANCH_CF) \ + & BITMASK1(op)) +#define OP_ISBRAIND(op) (BITMASK3(OP_BRANCH_R, OP_BRANCH_RF, OP_RTE) \ + & BITMASK1(op)) -static int literal_disabled_frames; +#ifdef DRC_SH2 #if (DRC_DEBUG & 4) static u8 *tcache_dsm_ptrs[3]; static char sh2dasm_buff[64]; #define do_host_disasm(tcid) \ - host_dasm(tcache_dsm_ptrs[tcid], tcache_ptr - tcache_dsm_ptrs[tcid]); \ - tcache_dsm_ptrs[tcid] = tcache_ptr + host_dasm(tcache_dsm_ptrs[tcid], emith_insn_ptr() - tcache_dsm_ptrs[tcid]); \ + tcache_dsm_ptrs[tcid] = emith_insn_ptr() #else #define do_host_disasm(x) #endif -#if (DRC_DEBUG & 8) || defined(PDB) +#define SH2_DUMP(sh2, reason) { \ + char ms = (sh2)->is_slave ? 
's' : 'm'; \ + printf("%csh2 %s %08x\n", ms, reason, (sh2)->pc); \ + printf("%csh2 r0-7 %08x %08x %08x %08x %08x %08x %08x %08x\n", ms, \ + (sh2)->r[0], (sh2)->r[1], (sh2)->r[2], (sh2)->r[3], \ + (sh2)->r[4], (sh2)->r[5], (sh2)->r[6], (sh2)->r[7]); \ + printf("%csh2 r8-15 %08x %08x %08x %08x %08x %08x %08x %08x\n", ms, \ + (sh2)->r[8], (sh2)->r[9], (sh2)->r[10], (sh2)->r[11], \ + (sh2)->r[12], (sh2)->r[13], (sh2)->r[14], (sh2)->r[15]); \ + printf("%csh2 pc-ml %08x %08x %08x %08x %08x %08x %08x %08x\n", ms, \ + (sh2)->pc, (sh2)->ppc, (sh2)->pr, (sh2)->sr&0xfff, \ + (sh2)->gbr, (sh2)->vbr, (sh2)->mach, (sh2)->macl); \ + printf("%csh2 tmp-p %08x %08x %08x %08x %08x %08x %08x %08x\n", ms, \ + (sh2)->drc_tmp, (sh2)->irq_cycles, \ + (sh2)->pdb_io_csum[0], (sh2)->pdb_io_csum[1], (sh2)->state, \ + (sh2)->poll_addr, (sh2)->poll_cycles, (sh2)->poll_cnt); \ +} + +#if (DRC_DEBUG & (8|256|512|1024)) || defined(PDB) +#if (DRC_DEBUG & (256|512|1024)) +static SH2 csh2[2][8]; +static FILE *trace[2]; +#endif static void REGPARM(3) *sh2_drc_log_entry(void *block, SH2 *sh2, u32 sr) { if (block != NULL) { dbg(8, "= %csh2 enter %08x %p, c=%d", sh2->is_slave ? 
's' : 'm', sh2->pc, block, (signed int)sr >> 12); +#if defined PDB pdb_step(sh2, sh2->pc); +#elif (DRC_DEBUG & 256) + { + int idx = sh2->is_slave; + if (!trace[0]) { + trace[0] = fopen("pico.trace0", "wb"); + trace[1] = fopen("pico.trace1", "wb"); + } + if (csh2[idx][0].pc != sh2->pc) { + fwrite(sh2, offsetof(SH2, read8_map), 1, trace[idx]); + fwrite(&sh2->pdb_io_csum, sizeof(sh2->pdb_io_csum), 1, trace[idx]); + memcpy(&csh2[idx][0], sh2, offsetof(SH2, poll_cnt)+4); + csh2[idx][0].is_slave = idx; + } + } +#elif (DRC_DEBUG & 512) + { + static SH2 fsh2; + int idx = sh2->is_slave; + if (!trace[0]) { + trace[0] = fopen("pico.trace0", "rb"); + trace[1] = fopen("pico.trace1", "rb"); + } + if (csh2[idx][0].pc != sh2->pc) { + if (!fread(&fsh2, offsetof(SH2, read8_map), 1, trace[idx]) || + !fread(&fsh2.pdb_io_csum, sizeof(sh2->pdb_io_csum), 1, trace[idx])) { + printf("trace eof at %08lx\n",ftell(trace[idx])); + exit(1); + } + fsh2.sr = (fsh2.sr & 0xfff) | (sh2->sr & ~0xfff); + fsh2.is_slave = idx; + if (memcmp(&fsh2, sh2, offsetof(SH2, read8_map)) || + 0)//memcmp(&fsh2.pdb_io_csum, &sh2->pdb_io_csum, sizeof(sh2->pdb_io_csum))) + { + printf("difference at %08lx!\n",ftell(trace[idx])); + SH2_DUMP(&fsh2, "file"); + SH2_DUMP(sh2, "current"); + SH2_DUMP(&csh2[idx][0], "previous"); + char *ps = (char *)sh2, *pf = (char *)&fsh2; + for (idx = 0; idx < offsetof(SH2, read8_map); idx += sizeof(u32)) + if (*(u32 *)(ps+idx) != *(u32 *)(pf+idx)) + printf("diff reg %ld\n",idx/sizeof(u32)); + exit(1); + } + csh2[idx][0] = fsh2; + } + } +#elif (DRC_DEBUG & 1024) + { + int x = sh2->is_slave, i; + for (i = 0; i < ARRAY_SIZE(csh2[x])-1; i++) + memcpy(&csh2[x][i], &csh2[x][i+1], offsetof(SH2, poll_cnt)+4); + memcpy(&csh2[x][ARRAY_SIZE(csh2[x])-1], sh2, offsetof(SH2, poll_cnt)+4); + csh2[x][0].is_slave = x; + } +#endif } return block; } #endif -// } debug -#define TCACHE_BUFFERS 3 // we have 3 translation cache buffers, split from one drc/cmn buffer. 
// BIOS shares tcache with data array because it's only used for init // and can be discarded early -// XXX: need to tune sizes -static const int tcache_sizes[TCACHE_BUFFERS] = { - DRC_TCACHE_SIZE * 6 / 8, // ROM (rarely used), DRAM - DRC_TCACHE_SIZE / 8, // BIOS, data array in master sh2 - DRC_TCACHE_SIZE / 8, // ... slave -}; - -static u8 *tcache_bases[TCACHE_BUFFERS]; -static u8 *tcache_ptrs[TCACHE_BUFFERS]; +#define TCACHE_BUFFERS 3 -// ptr for code emiters -static u8 *tcache_ptr; -#define MAX_BLOCK_ENTRIES (BLOCK_INSN_LIMIT / 8) +struct ring_buffer { + u8 *base; // ring buffer memory + unsigned item_sz; // size of one buffer item + unsigned size; // number of itmes in ring + int first, next; // read and write pointers + int used; // number of used items in ring +}; +enum { BL_JMP=1, BL_LDJMP, BL_JCCBLX }; struct block_link { + short tcache_id; + short type; // BL_JMP et al u32 target_pc; void *jump; // insn address - struct block_link *next; // either in block_entry->links or + void *blx; // block link/exit area if any + u8 jdisp[12]; // jump backup buffer + struct block_link *next; // either in block_entry->links or unresolved + struct block_link *o_next; // ...in block_entry->o_links + struct block_link *prev; + struct block_link *o_prev; + struct block_entry *target;// target block this is linked in (be->links) }; struct block_entry { u32 pc; - void *tcache_ptr; // translated block for above PC - struct block_entry *next; // next block in hash_table with same pc hash - struct block_link *links; // links to this entry + u8 *tcache_ptr; // translated block for above PC + struct block_entry *next; // chain in hash_table with same pc hash + struct block_entry *prev; + struct block_link *links; // incoming links to this entry + struct block_link *o_links;// outgoing links from this entry #if (DRC_DEBUG & 2) struct block_desc *block; #endif +#if (DRC_DEBUG & 32) + int entry_count; +#endif }; struct block_desc { u32 addr; // block start SH2 PC address - u16 size; 
// ..of recompiled insns+lit. pool - u16 size_nolit; // same without literals + u32 addr_lit; // block start SH2 literal pool addr + int size; // ..of recompiled insns + int size_lit; // ..of (insns+)literal pool + u8 *tcache_ptr; // start address of block in cache + u16 crc; // crc of insns and literals + u16 active; // actively used or deactivated? + struct block_list *list; #if (DRC_DEBUG & 2) int refcount; #endif int entry_count; - struct block_entry entryp[MAX_BLOCK_ENTRIES]; + struct block_entry *entryp; +}; + +struct block_list { + struct block_desc *block; // block reference + struct block_list *next; // pointers for doubly linked list + struct block_list *prev; + struct block_list **head; // list head (for removing from list) + struct block_list *l_next; }; -static const int block_max_counts[TCACHE_BUFFERS] = { - 4*1024, - 256, - 256, +static u8 *tcache_ptr; // ptr for code emitters + +// XXX: need to tune sizes + +static struct ring_buffer tcache_ring[TCACHE_BUFFERS]; +static const int tcache_sizes[TCACHE_BUFFERS] = { + DRC_TCACHE_SIZE * 30 / 32, // ROM (rarely used), DRAM + DRC_TCACHE_SIZE / 32, // BIOS, data array in master sh2 + DRC_TCACHE_SIZE / 32, // ... slave }; + +#define BLOCK_MAX_COUNT(tcid) ((tcid) ? 256 : 32*256) +static struct ring_buffer block_ring[TCACHE_BUFFERS]; static struct block_desc *block_tables[TCACHE_BUFFERS]; -static int block_counts[TCACHE_BUFFERS]; + +#define ENTRY_MAX_COUNT(tcid) ((tcid) ? 8*512 : 256*512) +static struct ring_buffer entry_ring[TCACHE_BUFFERS]; +static struct block_entry *entry_tables[TCACHE_BUFFERS]; // we have block_link_pool to avoid using mallocs -static const int block_link_pool_max_counts[TCACHE_BUFFERS] = { - 4*1024, - 256, - 256, -}; +#define BLOCK_LINK_MAX_COUNT(tcid) ((tcid) ? 
512 : 32*512) static struct block_link *block_link_pool[TCACHE_BUFFERS]; static int block_link_pool_counts[TCACHE_BUFFERS]; -static struct block_link *unresolved_links[TCACHE_BUFFERS]; +static struct block_link **unresolved_links[TCACHE_BUFFERS]; +static struct block_link *blink_free[TCACHE_BUFFERS]; // used for invalidation -static const int ram_sizes[TCACHE_BUFFERS] = { - 0x40000, - 0x1000, - 0x1000, -}; +#define RAM_SIZE(tcid) ((tcid) ? 0x1000 : 0x40000) #define INVAL_PAGE_SIZE 0x100 -struct block_list { - struct block_desc *block; - struct block_list *next; -}; +static struct block_list *inactive_blocks[TCACHE_BUFFERS]; // array of pointers to block_lists for RAM and 2 data arrays // each array has len: sizeof(mem) / INVAL_PAGE_SIZE static struct block_list **inval_lookup[TCACHE_BUFFERS]; -static const int hash_table_sizes[TCACHE_BUFFERS] = { - 0x1000, - 0x100, - 0x100, -}; +#define HASH_TABLE_SIZE(tcid) ((tcid) ? 512 : 64*512) static struct block_entry **hash_tables[TCACHE_BUFFERS]; #define HASH_FUNC(hash_tab, addr, mask) \ - (hash_tab)[(((addr) >> 20) ^ ((addr) >> 2)) & (mask)] + (hash_tab)[((addr) >> 1) & (mask)] + +#define BLOCK_LIST_MAX_COUNT (64*1024) +static struct block_list *block_list_pool; +static int block_list_pool_count; +static struct block_list *blist_free; + +#if (DRC_DEBUG & 128) +#if BRANCH_CACHE +int bchit, bcmiss; +#endif +#if CALL_STACK +int rchit, rcmiss; +#endif +#endif // host register tracking -enum { - HR_FREE, - HR_CACHED, // 'val' has sh2_reg_e -// HR_CONST, // 'val' has a constant - HR_TEMP, // reg used for temp storage +enum cache_reg_htype { + HRT_TEMP = 1, // is for temps and args + HRT_REG = 2, // is for sh2 regs }; -enum { - HRF_DIRTY = 1 << 0, // reg has "dirty" value to be written to ctx - HRF_LOCKED = 1 << 1, // HR_CACHED can't be evicted +enum cache_reg_flags { + HRF_DIRTY = 1 << 0, // has "dirty" value to be written to ctx + HRF_PINNED = 1 << 1, // has a pinned mapping + HRF_S16 = 1 << 2, // has a sign extended 16 bit 
value + HRF_U16 = 1 << 3, // has a zero extended 16 bit value }; -typedef struct { - u32 hreg:5; // "host" reg - u32 greg:5; // "guest" reg - u32 type:3; - u32 flags:3; - u32 stamp:16; // kind of a timestamp -} temp_reg_t; - -// note: reg_temp[] must have at least the amount of -// registers used by handlers in worst case (currently 4) -#ifdef __arm__ -#include "../drc/emit_arm.c" - -#ifndef __MACH__ +enum cache_reg_type { + HR_FREE, + HR_CACHED, // vreg has sh2_reg_e + HR_TEMP, // reg used for temp storage +}; -static const int reg_map_g2h[] = { - 4, 5, 6, 7, - 8, -1, -1, -1, - -1, -1, -1, -1, - -1, -1, -1, 9, // r12 .. sp - -1, -1, -1, 10, // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, - -1, -1, -1, -1, // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, +typedef struct { + u8 hreg:6; // "host" reg + u8 htype:2; // TEMP or REG? + u8 flags:4; // DIRTY, PINNED? + u8 type:2; // CACHED or TEMP? + u8 locked:2; // LOCKED reference counter + u16 stamp; // kind of a timestamp + u32 gregs; // "guest" reg mask +} cache_reg_t; + +// guest register tracking +enum guest_reg_flags { + GRF_DIRTY = 1 << 0, // reg has "dirty" value to be written to ctx + GRF_CONST = 1 << 1, // reg has a constant + GRF_CDIRTY = 1 << 2, // constant not yet written to ctx + GRF_STATIC = 1 << 3, // reg has static mapping to vreg + GRF_PINNED = 1 << 4, // reg has pinned mapping to vreg }; -#else +typedef struct { + u8 flags; // guest flags: is constant, is dirty? + s8 sreg; // cache reg for static mapping + s8 vreg; // cache_reg this is currently mapped to, -1 if not mapped + s8 cnst; // const index if this is constant +} guest_reg_t; -// no r9.. -static const int reg_map_g2h[] = { - 4, 5, 6, 7, - -1, -1, -1, -1, - -1, -1, -1, -1, - -1, -1, -1, 8, // r12 .. 
sp - -1, -1, -1, 10, // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, - -1, -1, -1, -1, // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, -}; -#endif +// possibly needed in code emitter +static int rcache_get_tmp(void); +static void rcache_free_tmp(int hr); -static temp_reg_t reg_temp[] = { - { 0, }, - { 1, }, - { 12, }, - { 14, }, - { 2, }, - { 3, }, -}; +// Note: Register assignment goes by ABI convention. Caller save registers are +// TEMPORARY, callee save registers are PRESERVED. Unusable regs are omitted. +// there must be at least the free (not context or statically mapped) amount of +// PRESERVED/TEMPORARY registers used by handlers in worst case (currently 4). +// there must be at least 3 PARAM, and PARAM+TEMPORARY must be at least 4. +// SR must and R0 should by all means be statically mapped. +// XXX the static definition of SR MUST match that in compiler.h +#ifdef __arm__ +#include "../drc/emit_arm.c" +#elif defined(__aarch64__) +#include "../drc/emit_arm64.c" +#elif defined(__mips__) +#include "../drc/emit_mips.c" +#elif defined(__riscv__) || defined(__riscv) +#include "../drc/emit_riscv.c" +#elif defined(__powerpc__) +#include "../drc/emit_ppc.c" #elif defined(__i386__) #include "../drc/emit_x86.c" - -static const int reg_map_g2h[] = { - xSI,-1, -1, -1, - -1, -1, -1, -1, - -1, -1, -1, -1, - -1, -1, -1, -1, // r12 .. sp - -1, -1, -1, xDI, // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, - -1, -1, -1, -1, // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, -}; - -// ax, cx, dx are usually temporaries by convention -static temp_reg_t reg_temp[] = { - { xAX, }, - { xBX, }, - { xCX, }, - { xDX, }, -}; - #elif defined(__x86_64__) #include "../drc/emit_x86.c" - -static const int reg_map_g2h[] = { -#ifndef _WIN32 - -1, -1, -1, -1, - -1, -1, -1, -1, - -1, -1, -1, -1, - -1, -1, -1, -1, // r12 .. sp - -1, -1, -1, xBX, // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, - -1, -1, -1, -1, // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, #else - xDI,-1, -1, -1, - -1, -1, -1, -1, - -1, -1, -1, -1, - -1, -1, -1, -1, // r12 .. 
sp - -1, -1, -1, xBX, // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, - -1, -1, -1, -1, // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, +#error unsupported arch #endif -}; -// ax, cx, dx are usually temporaries by convention -static temp_reg_t reg_temp[] = { - { xAX, }, - { xCX, }, - { xDX, }, - { xSI, }, -#ifndef _WIN32 - { xDI, }, -#endif -}; +static const signed char hregs_param[] = PARAM_REGS; +static const signed char hregs_temp [] = TEMPORARY_REGS; +static const signed char hregs_saved[] = PRESERVED_REGS; +static const signed char regs_static[] = STATIC_SH2_REGS; -#else -#error unsupported arch -#endif +#define CACHE_REGS \ + (ARRAY_SIZE(hregs_param)+ARRAY_SIZE(hregs_temp)+ARRAY_SIZE(hregs_saved)-1) +static cache_reg_t cache_regs[CACHE_REGS]; -#define T 0x00000001 -#define S 0x00000002 -#define I 0x000000f0 -#define Q 0x00000100 -#define M 0x00000200 -#define T_save 0x00000800 +static signed char reg_map_host[HOST_REGS]; -#define I_SHIFT 4 -#define Q_SHIFT 8 -#define M_SHIFT 9 +static guest_reg_t guest_regs[SH2_REGS]; static void REGPARM(1) (*sh2_drc_entry)(SH2 *sh2); -static void (*sh2_drc_dispatcher)(void); -static void (*sh2_drc_exit)(void); +static void REGPARM(1) (*sh2_drc_dispatcher)(u32 pc); +#if CALL_STACK +static u32 REGPARM(2) (*sh2_drc_dispatcher_call)(u32 pc); +static void REGPARM(1) (*sh2_drc_dispatcher_return)(u32 pc); +#endif +static void REGPARM(1) (*sh2_drc_exit)(u32 pc); static void (*sh2_drc_test_irq)(void); -static u32 REGPARM(2) (*sh2_drc_read8)(u32 a, SH2 *sh2); -static u32 REGPARM(2) (*sh2_drc_read16)(u32 a, SH2 *sh2); -static u32 REGPARM(2) (*sh2_drc_read32)(u32 a, SH2 *sh2); +static u32 REGPARM(1) (*sh2_drc_read8)(u32 a); +static u32 REGPARM(1) (*sh2_drc_read16)(u32 a); +static u32 REGPARM(1) (*sh2_drc_read32)(u32 a); +static u32 REGPARM(1) (*sh2_drc_read8_poll)(u32 a); +static u32 REGPARM(1) (*sh2_drc_read16_poll)(u32 a); +static u32 REGPARM(1) (*sh2_drc_read32_poll)(u32 a); static void REGPARM(2) (*sh2_drc_write8)(u32 a, u32 d); static void REGPARM(2) 
(*sh2_drc_write16)(u32 a, u32 d); -static void REGPARM(3) (*sh2_drc_write32)(u32 a, u32 d, SH2 *sh2); +static void REGPARM(2) (*sh2_drc_write32)(u32 a, u32 d); + +#ifdef DRC_SR_REG +void REGPARM(1) (*sh2_drc_save_sr)(SH2 *sh2); +void REGPARM(1) (*sh2_drc_restore_sr)(SH2 *sh2); +#endif + +// flags for memory access +#define MF_SIZEMASK 0x03 // size of access +#define MF_POSTINCR 0x10 // post increment (for read_rr) +#define MF_PREDECR MF_POSTINCR // pre decrement (for write_rr) +#define MF_POLLING 0x20 // include polling check in read // address space stuff -static int dr_ctx_get_mem_ptr(u32 a, u32 *mask) +static int dr_is_rom(u32 a) { + // tweak for WWF Raw which writes data to some high ROM addresses + return (a & 0xc6000000) == 0x02000000 && (a & 0x3f0000) < 0x3e0000; +} + +static int dr_ctx_get_mem_ptr(SH2 *sh2, u32 a, u32 *mask) +{ + void *memptr; int poffs = -1; - if ((a & ~0x7ff) == 0) { - // BIOS + // check if region is mapped memory + memptr = p32x_sh2_get_mem_ptr(a, mask, sh2); + if (memptr == NULL) + return poffs; + + if (memptr == sh2->p_bios) // BIOS poffs = offsetof(SH2, p_bios); - *mask = 0x7ff; - } - else if ((a & 0xfffff000) == 0xc0000000) { - // data array - // FIXME: access sh2->data_array instead + else if (memptr == sh2->p_da) // data array poffs = offsetof(SH2, p_da); - *mask = 0xfff; - } - else if ((a & 0xc6000000) == 0x06000000) { - // SDRAM + else if (memptr == sh2->p_sdram) // SDRAM poffs = offsetof(SH2, p_sdram); - *mask = 0x03ffff; - } - else if ((a & 0xc6000000) == 0x02000000) { - // ROM + else if (memptr == sh2->p_rom) // ROM poffs = offsetof(SH2, p_rom); - *mask = 0x3fffff; - } return poffs; } +static int dr_get_tcache_id(u32 pc, int is_slave) +{ + u32 tcid = 0; + + if ((pc & 0xe0000000) == 0xc0000000) + tcid = 1 + is_slave; // data array + if ((pc & ~0xfff) == 0) + tcid = 1 + is_slave; // BIOS + return tcid; +} + static struct block_entry *dr_get_entry(u32 pc, int is_slave, int *tcache_id) { struct block_entry *be; - u32 tcid = 0, 
mask; - - // data arrays have their own caches - if ((pc & 0xe0000000) == 0xc0000000 || (pc & ~0xfff) == 0) - tcid = 1 + is_slave; - - *tcache_id = tcid; + + *tcache_id = dr_get_tcache_id(pc, is_slave); - mask = hash_table_sizes[tcid] - 1; - be = HASH_FUNC(hash_tables[tcid], pc, mask); + be = HASH_FUNC(hash_tables[*tcache_id], pc, HASH_TABLE_SIZE(*tcache_id) - 1); + if (be != NULL) // don't ask... gcc code generation hint for (; be != NULL; be = be->next) if (be->pc == pc) return be; @@ -437,300 +570,749 @@ static struct block_entry *dr_get_entry(u32 pc, int is_slave, int *tcache_id) // --------------------------------------------------------------- -// block management -static void add_to_block_list(struct block_list **blist, struct block_desc *block) -{ - struct block_list *added = malloc(sizeof(*added)); - if (!added) { - elprintf(EL_ANOMALY, "drc OOM (1)"); - return; - } - added->block = block; - added->next = *blist; - *blist = added; -} +// ring buffer management +#define RING_INIT(r,m,n) *(r) = (struct ring_buffer) { .base = (u8 *)m, \ + .item_sz = sizeof(*(m)), .size = n }; -static void rm_from_block_list(struct block_list **blist, struct block_desc *block) +static void *ring_alloc(struct ring_buffer *rb, int count) { - struct block_list *prev = NULL, *current = *blist; - for (; current != NULL; current = current->next) { - if (current->block == block) { - if (prev == NULL) - *blist = current->next; - else - prev->next = current->next; - free(current); - return; - } - prev = current; + // allocate space in ring buffer + void *p; + + p = rb->base + rb->next * rb->item_sz; + if (rb->next+count > rb->size) { + rb->used += rb->size - rb->next; + p = rb->base; // wrap if overflow at end + rb->next = count; + } else { + rb->next += count; + if (rb->next == rb->size) rb->next = 0; } - dbg(1, "can't rm block %p (%08x-%08x)", - block, block->addr, block->addr + block->size); + + rb->used += count; + return p; } -static void rm_block_list(struct block_list **blist) 
+static void ring_wrap(struct ring_buffer *rb) { - struct block_list *tmp, *current = *blist; - while (current != NULL) { - tmp = current; - current = current->next; - free(tmp); - } - *blist = NULL; + // insufficient space at end of buffer memory, wrap around + rb->used += rb->size - rb->next; + rb->next = 0; } -static void REGPARM(1) flush_tcache(int tcid) +static void ring_free(struct ring_buffer *rb, int count) { - int i; + // free oldest space in ring buffer + rb->first += count; + if (rb->first >= rb->size) rb->first -= rb->size; - dbg(1, "tcache #%d flush! (%d/%d, bds %d/%d)", tcid, - tcache_ptrs[tcid] - tcache_bases[tcid], tcache_sizes[tcid], - block_counts[tcid], block_max_counts[tcid]); + rb->used -= count; +} - block_counts[tcid] = 0; - block_link_pool_counts[tcid] = 0; - unresolved_links[tcid] = NULL; - memset(hash_tables[tcid], 0, sizeof(*hash_tables[0]) * hash_table_sizes[tcid]); - tcache_ptrs[tcid] = tcache_bases[tcid]; - if (Pico32xMem != NULL) { - if (tcid == 0) // ROM, RAM - memset(Pico32xMem->drcblk_ram, 0, - sizeof(Pico32xMem->drcblk_ram)); - else - memset(Pico32xMem->drcblk_da[tcid - 1], 0, - sizeof(Pico32xMem->drcblk_da[0])); - } -#if (DRC_DEBUG & 4) - tcache_dsm_ptrs[tcid] = tcache_bases[tcid]; -#endif +static void ring_free_p(struct ring_buffer *rb, void *p) +{ + // free ring buffer space upto given pointer + rb->first = ((u8 *)p - rb->base) / rb->item_sz; - for (i = 0; i < ram_sizes[tcid] / INVAL_PAGE_SIZE; i++) - rm_block_list(&inval_lookup[tcid][i]); + rb->used = rb->next - rb->first; + if (rb->used < 0) rb->used += rb->size; } -static void add_to_hashlist(struct block_entry *be, int tcache_id) +static void *ring_reset(struct ring_buffer *rb) { - u32 tcmask = hash_table_sizes[tcache_id] - 1; + // reset to initial state + rb->first = rb->next = rb->used = 0; + return rb->base + rb->next * rb->item_sz; +} - be->next = HASH_FUNC(hash_tables[tcache_id], be->pc, tcmask); - HASH_FUNC(hash_tables[tcache_id], be->pc, tcmask) = be; +static void 
*ring_first(struct ring_buffer *rb) +{ + return rb->base + rb->first * rb->item_sz; +} -#if (DRC_DEBUG & 2) - if (be->next != NULL) { - printf(" %08x: hash collision with %08x\n", - be->pc, be->next->pc); - hash_collisions++; - } -#endif +static void *ring_next(struct ring_buffer *rb) +{ + return rb->base + rb->next * rb->item_sz; } -static void rm_from_hashlist(struct block_entry *be, int tcache_id) + +// block management +static void add_to_block_list(struct block_list **blist, struct block_desc *block) { - u32 tcmask = hash_table_sizes[tcache_id] - 1; - struct block_entry *cur, *prev; - - cur = HASH_FUNC(hash_tables[tcache_id], be->pc, tcmask); - if (cur == NULL) - goto missing; + struct block_list *added; - if (be == cur) { // first - HASH_FUNC(hash_tables[tcache_id], be->pc, tcmask) = be->next; - return; + if (blist_free) { + added = blist_free; + blist_free = added->next; + } else if (block_list_pool_count >= BLOCK_LIST_MAX_COUNT) { + printf( "block list overflow\n"); + exit(1); + } else { + added = block_list_pool + block_list_pool_count; + block_list_pool_count ++; } - for (prev = cur, cur = cur->next; cur != NULL; cur = cur->next) { - if (cur == be) { - prev->next = cur->next; - return; - } - } + added->block = block; + added->l_next = block->list; + block->list = added; + added->head = blist; -missing: - dbg(1, "rm_from_hashlist: be %p %08x missing?", be, be->pc); + added->prev = NULL; + if (*blist) + (*blist)->prev = added; + added->next = *blist; + *blist = added; } -static void unregister_links(struct block_entry *be, int tcache_id) +static void rm_from_block_lists(struct block_desc *block) { - struct block_link *bl_unresolved = unresolved_links[tcache_id]; - struct block_link *bl, *bl_next; + struct block_list *entry; + + entry = block->list; + while (entry != NULL) { + if (entry->prev != NULL) + entry->prev->next = entry->next; + else + *(entry->head) = entry->next; + if (entry->next != NULL) + entry->next->prev = entry->prev; - for (bl = be->links; 
bl != NULL; ) { - bl_next = bl->next; - bl->next = bl_unresolved; - bl_unresolved = bl; - bl = bl_next; + entry->next = blist_free; + blist_free = entry; + + entry = entry->l_next; } - be->links = NULL; - unresolved_links[tcache_id] = bl_unresolved; + block->list = NULL; } -// unlike sh2_smc_rm_block, the block stays and can still be accessed -// by other already directly linked blocks, just not preferred -static void kill_block_entry(struct block_entry *be, int tcache_id) +static void discard_block_list(struct block_list **blist) { - rm_from_hashlist(be, tcache_id); - unregister_links(be, tcache_id); + struct block_list *next, *current = *blist; + while (current != NULL) { + next = current->next; + current->next = blist_free; + blist_free = current; + current = next; + } + *blist = NULL; } -static struct block_desc *dr_add_block(u32 addr, u16 size_lit, - u16 size_nolit, int is_slave, int *blk_id) +static void add_to_hashlist(struct block_entry *be, int tcache_id) { - struct block_entry *be; - struct block_desc *bd; - int tcache_id; - int *bcount; - - // do a lookup to get tcache_id and override check - be = dr_get_entry(addr, is_slave, &tcache_id); - if (be != NULL) { - dbg(1, "block override for %08x, was %p", addr, be->tcache_ptr); - kill_block_entry(be, tcache_id); - } - - bcount = &block_counts[tcache_id]; - if (*bcount >= block_max_counts[tcache_id]) { - dbg(1, "bd overflow for tcache %d", tcache_id); - return NULL; - } + u32 tcmask = HASH_TABLE_SIZE(tcache_id) - 1; + struct block_entry **head = &HASH_FUNC(hash_tables[tcache_id], be->pc, tcmask); - bd = &block_tables[tcache_id][*bcount]; - bd->addr = addr; - bd->size = size_lit; - bd->size_nolit = size_nolit; + be->prev = NULL; + if (*head) + (*head)->prev = be; + be->next = *head; + *head = be; - bd->entry_count = 1; - bd->entryp[0].pc = addr; - bd->entryp[0].tcache_ptr = tcache_ptr; - bd->entryp[0].links = NULL; #if (DRC_DEBUG & 2) - bd->entryp[0].block = bd; - bd->refcount = 0; + if (be->next != NULL) { + 
printf(" %08x@%p: entry hash collision with %08x@%p\n", + be->pc, be->tcache_ptr, be->next->pc, be->next->tcache_ptr); + hash_collisions++; + } #endif - add_to_hashlist(&bd->entryp[0], tcache_id); +} - *blk_id = *bcount; - (*bcount)++; +static void rm_from_hashlist(struct block_entry *be, int tcache_id) +{ + u32 tcmask = HASH_TABLE_SIZE(tcache_id) - 1; + struct block_entry **head = &HASH_FUNC(hash_tables[tcache_id], be->pc, tcmask); + +#if DRC_DEBUG & 1 + struct block_entry *current = be; + while (current->prev != NULL) + current = current->prev; + if (current != *head) + dbg(1, "rm_from_hashlist @%p: be %p %08x missing?", head, be, be->pc); +#endif - return bd; + if (be->prev != NULL) + be->prev->next = be->next; + else + *head = be->next; + if (be->next != NULL) + be->next->prev = be->prev; } -static void REGPARM(3) *dr_lookup_block(u32 pc, int is_slave, int *tcache_id) + +static void add_to_hashlist_unresolved(struct block_link *bl, int tcache_id) { - struct block_entry *be = NULL; - void *block = NULL; + u32 tcmask = HASH_TABLE_SIZE(tcache_id) - 1; + struct block_link **head = &HASH_FUNC(unresolved_links[tcache_id], bl->target_pc, tcmask); - be = dr_get_entry(pc, is_slave, tcache_id); - if (be != NULL) - block = be->tcache_ptr; +#if DRC_DEBUG & 1 + struct block_link *current = *head; + while (current != NULL && current != bl) + current = current->next; + if (current == bl) + dbg(1, "add_to_hashlist_unresolved @%p: bl %p %p %08x already in?", head, bl, bl->target, bl->target_pc); +#endif -#if (DRC_DEBUG & 2) - if (be != NULL) - be->block->refcount++; + bl->target = NULL; // marker for not resolved + bl->prev = NULL; + if (*head) + (*head)->prev = bl; + bl->next = *head; + *head = bl; +} + +static void rm_from_hashlist_unresolved(struct block_link *bl, int tcache_id) +{ + u32 tcmask = HASH_TABLE_SIZE(tcache_id) - 1; + struct block_link **head = &HASH_FUNC(unresolved_links[tcache_id], bl->target_pc, tcmask); + +#if DRC_DEBUG & 1 + struct block_link *current = bl; 
+ while (current->prev != NULL) + current = current->prev; + if (current != *head) + dbg(1, "rm_from_hashlist_unresolved @%p: bl %p %p %08x missing?", head, bl, bl->target, bl->target_pc); #endif - return block; + + if (bl->prev != NULL) + bl->prev->next = bl->next; + else + *head = bl->next; + if (bl->next != NULL) + bl->next->prev = bl->prev; } -static void *dr_failure(void) +#if LINK_BRANCHES +static void dr_block_link(struct block_entry *be, struct block_link *bl, int emit_jump) { - lprintf("recompilation failed\n"); - exit(1); + dbg(2, "- %slink from %p to pc %08x entry %p", emit_jump ? "":"early ", + bl->jump, bl->target_pc, be->tcache_ptr); + + if (emit_jump) { + u8 *jump = bl->jump; + int jsz = emith_jump_patch_size(); + if (bl->type == BL_JMP) { // patch: jump @entry + // inlined: @jump far jump to target + emith_jump_patch(jump, be->tcache_ptr, &jump); + } else if (bl->type == BL_LDJMP) { // write: jump @entry + // inlined: @jump far jump to target + emith_jump_at(jump, be->tcache_ptr); + jsz = emith_jump_at_size(); + } else if (bl->type == BL_JCCBLX) { // patch: jump cond -> jump @entry + if (emith_jump_patch_inrange(bl->jump, be->tcache_ptr)) { + // inlined: @jump near jumpcc to target + emith_jump_patch(jump, be->tcache_ptr, &jump); + } else { // dispatcher cond immediate + // via blx: @jump near jumpcc to blx; @blx far jump + emith_jump_patch(jump, bl->blx, &jump); + emith_jump_at(bl->blx, be->tcache_ptr); + if ((((uintptr_t)bl->blx & 0x1f) + emith_jump_at_size()-1) > 0x1f) + host_instructions_updated(bl->blx, bl->blx + emith_jump_at_size()-1); + } + } else { + printf("unknown BL type %d\n", bl->type); + exit(1); + } + // only needs sync if patch is possibly crossing cacheline (assume 32 byte) + if ((((uintptr_t)jump & 0x1f) + jsz-1) > 0x1f) + host_instructions_updated(jump, jump + jsz-1); + } + + // move bl to block_entry + bl->target = be; + bl->prev = NULL; + if (be->links) + be->links->prev = bl; + bl->next = be->links; + be->links = bl; +} + 
+static void dr_block_unlink(struct block_link *bl, int emit_jump) +{ + dbg(2,"- unlink from %p to pc %08x", bl->jump, bl->target_pc); + + if (bl->target) { + if (emit_jump) { + u8 *jump = bl->jump; + int jsz = emith_jump_patch_size(); + if (bl->type == BL_JMP) { // jump_patch @dispatcher + // inlined: @jump far jump to dispatcher + emith_jump_patch(jump, sh2_drc_dispatcher, &jump); + } else if (bl->type == BL_LDJMP) { // restore: load pc, jump @dispatcher + // inlined: @jump load target_pc, far jump to dispatcher + memcpy(jump, bl->jdisp, emith_jump_at_size()); + jsz = emith_jump_at_size(); + } else if (bl->type == BL_JCCBLX) { // jump cond @blx; @blx: load pc, jump + // via blx: @jump near jumpcc to blx; @blx load target_pc, far jump + emith_jump_patch(bl->jump, bl->blx, &jump); + memcpy(bl->blx, bl->jdisp, emith_jump_at_size()); + host_instructions_updated(bl->blx, bl->blx + emith_jump_at_size()-1); + } else { + printf("unknown BL type %d\n", bl->type); + exit(1); + } + // update cpu caches since the previous jump target doesn't exist anymore + host_instructions_updated(jump, jump + jsz-1); + } + + if (bl->prev) + bl->prev->next = bl->next; + else + bl->target->links = bl->next; + if (bl->next) + bl->next->prev = bl->prev; + bl->target = NULL; + } } +#endif -static void *dr_prepare_ext_branch(u32 pc, int is_slave, int tcache_id) +static struct block_link *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_slave, int tcache_id) { #if LINK_BRANCHES struct block_link *bl = block_link_pool[tcache_id]; int cnt = block_link_pool_counts[tcache_id]; - struct block_entry *be = NULL; int target_tcache_id; - int i; - be = dr_get_entry(pc, is_slave, &target_tcache_id); - if (target_tcache_id != tcache_id) - return sh2_drc_dispatcher; + // get the target block entry + target_tcache_id = dr_get_tcache_id(pc, is_slave); + if (target_tcache_id && target_tcache_id != tcache_id) + return NULL; - // if pool has been freed, reuse - for (i = cnt - 1; i >= 0; i--) - if 
(bl[i].target_pc != 0) - break; - cnt = i + 1; - if (cnt >= block_link_pool_max_counts[tcache_id]) { + // get a block link + if (blink_free[tcache_id] != NULL) { + bl = blink_free[tcache_id]; + blink_free[tcache_id] = bl->next; + } else if (cnt >= BLOCK_LINK_MAX_COUNT(tcache_id)) { dbg(1, "bl overflow for tcache %d", tcache_id); return NULL; + } else { + bl += cnt; + block_link_pool_counts[tcache_id] = cnt+1; } - bl += cnt; - block_link_pool_counts[tcache_id]++; + // prepare link and add to outgoing list of owner + bl->tcache_id = tcache_id; bl->target_pc = pc; bl->jump = tcache_ptr; + bl->blx = NULL; + bl->o_next = owner->o_links; + owner->o_links = bl; - if (be != NULL) { - dbg(2, "- early link from %p to pc %08x", bl->jump, pc); - bl->next = be->links; - be->links = bl; - return be->tcache_ptr; - } - else { - bl->next = unresolved_links[tcache_id]; - unresolved_links[tcache_id] = bl; - return sh2_drc_dispatcher; - } + add_to_hashlist_unresolved(bl, tcache_id); + return bl; #else - return sh2_drc_dispatcher; + return NULL; #endif } -static void dr_link_blocks(struct block_entry *be, int tcache_id) +static void dr_mark_memory(int mark, struct block_desc *block, int tcache_id, u32 nolit) { -#if LINK_BRANCHES - struct block_link *first = unresolved_links[tcache_id]; - struct block_link *bl, *prev, *tmp; - u32 pc = be->pc; + u8 *drc_ram_blk = NULL, *lit_ram_blk = NULL; + u32 addr, end, mask = 0, shift = 0, idx; - for (bl = prev = first; bl != NULL; ) { - if (bl->target_pc == pc) { - dbg(2, "- link from %p to pc %08x", bl->jump, pc); - emith_jump_patch(bl->jump, tcache_ptr); + // mark memory blocks as containing compiled code + if ((block->addr & 0xc7fc0000) == 0x06000000 + || (block->addr & 0xfffff000) == 0xc0000000) + { + if (tcache_id != 0) { + // data array + drc_ram_blk = Pico32xMem->drcblk_da[tcache_id-1]; + lit_ram_blk = Pico32xMem->drclit_da[tcache_id-1]; + shift = SH2_DRCBLK_DA_SHIFT; + } + else { + // SDRAM + drc_ram_blk = Pico32xMem->drcblk_ram; + 
lit_ram_blk = Pico32xMem->drclit_ram; + shift = SH2_DRCBLK_RAM_SHIFT; + } + mask = RAM_SIZE(tcache_id) - 1; - // move bl from unresolved_links to block_entry - tmp = bl->next; - bl->next = be->links; - be->links = bl; + // mark recompiled insns + addr = block->addr & ~((1 << shift) - 1); + end = block->addr + block->size; + for (idx = (addr & mask) >> shift; addr < end; addr += (1 << shift)) + drc_ram_blk[idx++] += mark; + + // mark literal pool + if (addr < (block->addr_lit & ~((1 << shift) - 1))) + addr = block->addr_lit & ~((1 << shift) - 1); + end = block->addr_lit + block->size_lit; + for (idx = (addr & mask) >> shift; addr < end; addr += (1 << shift)) + drc_ram_blk[idx++] += mark; + + // mark for literals disabled + if (nolit) { + addr = nolit & ~((1 << shift) - 1); + end = block->addr_lit + block->size_lit; + for (idx = (addr & mask) >> shift; addr < end; addr += (1 << shift)) + lit_ram_blk[idx++] = 1; + } - if (bl == first) - first = prev = bl = tmp; - else - prev->next = bl = tmp; - continue; + if (mark < 0) + rm_from_block_lists(block); + else { + // add to invalidation lookup lists + addr = block->addr & ~(INVAL_PAGE_SIZE - 1); + end = block->addr + block->size; + for (idx = (addr & mask) / INVAL_PAGE_SIZE; addr < end; addr += INVAL_PAGE_SIZE) + add_to_block_list(&inval_lookup[tcache_id][idx++], block); + + if (addr < (block->addr_lit & ~(INVAL_PAGE_SIZE - 1))) + addr = block->addr_lit & ~(INVAL_PAGE_SIZE - 1); + end = block->addr_lit + block->size_lit; + for (idx = (addr & mask) / INVAL_PAGE_SIZE; addr < end; addr += INVAL_PAGE_SIZE) + add_to_block_list(&inval_lookup[tcache_id][idx++], block); + } + } +} + +static u32 dr_check_nolit(u32 start, u32 end, int tcache_id) +{ + u8 *lit_ram_blk = NULL; + u32 mask = 0, shift = 0, addr, idx; + + if ((start & 0xc7fc0000) == 0x06000000 + || (start & 0xfffff000) == 0xc0000000) + { + if (tcache_id != 0) { + // data array + lit_ram_blk = Pico32xMem->drclit_da[tcache_id-1]; + shift = SH2_DRCBLK_DA_SHIFT; + } + else { 
+ // SDRAM + lit_ram_blk = Pico32xMem->drclit_ram; + shift = SH2_DRCBLK_RAM_SHIFT; } - prev = bl; - bl = bl->next; + mask = RAM_SIZE(tcache_id) - 1; + + addr = start & ~((1 << shift) - 1); + for (idx = (addr & mask) >> shift; addr < end; addr += (1 << shift)) + if (lit_ram_blk[idx++]) + break; + + return (addr < start ? start : addr > end ? end : addr); + } + + return end; +} + +static void dr_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nolit, int free) +{ + struct block_link *bl; + u32 i; + + free = free || nolit; // block is invalid if literals are overwritten + dbg(2," %sing block %08x-%08x,%08x-%08x, blkid %d,%d", free?"delet":"disabl", + bd->addr, bd->addr + bd->size, bd->addr_lit, bd->addr_lit + bd->size_lit, + tcache_id, bd - block_tables[tcache_id]); + if (bd->addr == 0 || bd->entry_count == 0) { + dbg(1, " killing dead block!? %08x", bd->addr); + return; } - unresolved_links[tcache_id] = first; - // could sync arm caches here, but that's unnecessary +#if LINK_BRANCHES + // remove from hash table, make incoming links unresolved + if (bd->active) { + for (i = 0; i < bd->entry_count; i++) { + rm_from_hashlist(&bd->entryp[i], tcache_id); + + while ((bl = bd->entryp[i].links) != NULL) { + dr_block_unlink(bl, 1); + add_to_hashlist_unresolved(bl, tcache_id); + } + } + + dr_mark_memory(-1, bd, tcache_id, nolit); + add_to_block_list(&inactive_blocks[tcache_id], bd); + } + bd->active = 0; +#endif + + if (free) { +#if LINK_BRANCHES + // revoke outgoing links + for (bl = bd->entryp[0].o_links; bl != NULL; bl = bl->o_next) { + if (bl->target) + dr_block_unlink(bl, 0); + else + rm_from_hashlist_unresolved(bl, tcache_id); + bl->jump = NULL; + bl->next = blink_free[bl->tcache_id]; + blink_free[bl->tcache_id] = bl; + } + bd->entryp[0].o_links = NULL; #endif + // invalidate block + rm_from_block_lists(bd); + bd->addr = bd->size = bd->addr_lit = bd->size_lit = 0; + bd->entry_count = 0; + bd->entryp = NULL; + } + emith_update_cache(); } -#define 
ADD_TO_ARRAY(array, count, item, failcode) \ - if (count >= ARRAY_SIZE(array)) { \ - dbg(1, "warning: " #array " overflow"); \ - failcode; \ - } \ - array[count++] = item; +static struct block_desc *dr_find_inactive_block(int tcache_id, u16 crc, + u32 addr, int size, u32 addr_lit, int size_lit) +{ + struct block_list **head = &inactive_blocks[tcache_id]; + struct block_list *current; -static int find_in_array(u32 *array, size_t size, u32 what) + for (current = *head; current != NULL; current = current->next) { + struct block_desc *block = current->block; + if (block->crc == crc && block->addr == addr && block->size == size && + block->addr_lit == addr_lit && block->size_lit == size_lit) + { + rm_from_block_lists(block); + return block; + } + } + return NULL; +} + +static struct block_desc *dr_add_block(int entries, u32 addr, int size, + u32 addr_lit, int size_lit, u16 crc, int is_slave, int *blk_id) { - size_t i; - for (i = 0; i < size; i++) - if (what == array[i]) - return i; + struct block_entry *be; + struct block_desc *bd; + int tcache_id; - return -1; + // do a lookup to get tcache_id and override check + be = dr_get_entry(addr, is_slave, &tcache_id); + if (be != NULL) + dbg(1, "block override for %08x", addr); + + if (block_ring[tcache_id].used + 1 > block_ring[tcache_id].size || + entry_ring[tcache_id].used + entries > entry_ring[tcache_id].size) { + dbg(1, "bd overflow for tcache %d", tcache_id); + return NULL; + } + + *blk_id = block_ring[tcache_id].next; + bd = ring_alloc(&block_ring[tcache_id], 1); + bd->entryp = ring_alloc(&entry_ring[tcache_id], entries); + + bd->addr = addr; + bd->size = size; + bd->addr_lit = addr_lit; + bd->size_lit = size_lit; + bd->tcache_ptr = tcache_ptr; + bd->crc = crc; + bd->active = 0; + bd->list = NULL; + bd->entry_count = 0; +#if (DRC_DEBUG & 2) + bd->refcount = 0; +#endif + + return bd; +} + +static void dr_link_blocks(struct block_entry *be, int tcache_id) +{ +#if LINK_BRANCHES + u32 tcmask = HASH_TABLE_SIZE(tcache_id) - 
1; + u32 pc = be->pc; + struct block_link **head = &HASH_FUNC(unresolved_links[tcache_id], pc, tcmask); + struct block_link *bl = *head, *next; + + while (bl != NULL) { + next = bl->next; + if (bl->target_pc == pc && (!bl->tcache_id || bl->tcache_id == tcache_id)) { + rm_from_hashlist_unresolved(bl, bl->tcache_id); + dr_block_link(be, bl, 1); + } + bl = next; + } +#endif +} + +static void dr_link_outgoing(struct block_entry *be, int tcache_id, int is_slave) +{ +#if LINK_BRANCHES + struct block_link *bl; + int target_tcache_id; + + for (bl = be->o_links; bl; bl = bl->o_next) { + if (bl->target == NULL) { + be = dr_get_entry(bl->target_pc, is_slave, &target_tcache_id); + if (be != NULL && (!target_tcache_id || target_tcache_id == tcache_id)) { + // remove bl from unresolved_links (must've been since target was NULL) + rm_from_hashlist_unresolved(bl, bl->tcache_id); + dr_block_link(be, bl, 1); + } + } + } +#endif +} + +static void dr_activate_block(struct block_desc *bd, int tcache_id, int is_slave) +{ + int i; + + // connect branches + for (i = 0; i < bd->entry_count; i++) { + struct block_entry *entry = &bd->entryp[i]; + add_to_hashlist(entry, tcache_id); + // incoming branches + dr_link_blocks(entry, tcache_id); + if (!tcache_id) + dr_link_blocks(entry, is_slave?2:1); + // outgoing branches + dr_link_outgoing(entry, tcache_id, is_slave); + } + + // mark memory for overwrite detection + dr_mark_memory(1, bd, tcache_id, 0); + bd->active = 1; +} + +static void REGPARM(3) *dr_lookup_block(u32 pc, SH2 *sh2, int *tcache_id) +{ + struct block_entry *be = NULL; + void *block = NULL; + + be = dr_get_entry(pc, sh2->is_slave, tcache_id); + if (be != NULL) + block = be->tcache_ptr; + +#if (DRC_DEBUG & 2) + if (be != NULL) + be->block->refcount++; +#endif + return block; +} + +static void dr_free_oldest_block(int tcache_id) +{ + struct block_desc *bf; + + bf = ring_first(&block_ring[tcache_id]); + if (bf->addr && bf->entry_count) + dr_rm_block_entry(bf, tcache_id, 0, 1); + 
ring_free(&block_ring[tcache_id], 1); + + if (block_ring[tcache_id].used) { + bf = ring_first(&block_ring[tcache_id]); + ring_free_p(&entry_ring[tcache_id], bf->entryp); + ring_free_p(&tcache_ring[tcache_id], bf->tcache_ptr); + } else { + // reset since size of code block isn't known if no successor block exists + ring_reset(&block_ring[tcache_id]); + ring_reset(&entry_ring[tcache_id]); + ring_reset(&tcache_ring[tcache_id]); + } +} + +static inline void dr_reserve_cache(int tcache_id, struct ring_buffer *rb, int count) +{ + // while not enough space available + if (rb->next + count >= rb->size){ + // not enough space in rest of buffer -> wrap around + while (rb->first >= rb->next && rb->used) + dr_free_oldest_block(tcache_id); + if (rb->first == 0 && rb->used) + dr_free_oldest_block(tcache_id); + ring_wrap(rb); + } + while (rb->first >= rb->next && rb->next + count > rb->first && rb->used) + dr_free_oldest_block(tcache_id); +} + +static u8 *dr_prepare_cache(int tcache_id, int insn_count, int entry_count) +{ + int bf = block_ring[tcache_id].first; + + // reserve one block desc + if (block_ring[tcache_id].used >= block_ring[tcache_id].size) + dr_free_oldest_block(tcache_id); + // reserve block entries + dr_reserve_cache(tcache_id, &entry_ring[tcache_id], entry_count); + // reserve cache space + dr_reserve_cache(tcache_id, &tcache_ring[tcache_id], insn_count*128); + + if (bf != block_ring[tcache_id].first) { + // deleted some block(s), clear branch cache and return stack +#if BRANCH_CACHE + if (tcache_id) + memset32(sh2s[tcache_id-1].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + else { + memset32(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4); + } +#endif +#if CALL_STACK + if (tcache_id) { + memset32(sh2s[tcache_id-1].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4); + sh2s[tcache_id-1].rts_cache_idx = 0; + } else { + memset32(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4); + 
memset32(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)/4); + sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0; + } +#endif + } + + return ring_next(&tcache_ring[tcache_id]); +} + +static void dr_flush_tcache(int tcid) +{ + int i; +#if (DRC_DEBUG & 1) + elprintf(EL_STATUS, "tcache #%d flush! (%d/%d, bds %d/%d bes %d/%d)", tcid, + tcache_ring[tcid].used, tcache_ring[tcid].size, block_ring[tcid].used, + block_ring[tcid].size, entry_ring[tcid].used, entry_ring[tcid].size); +#endif + + ring_reset(&tcache_ring[tcid]); + ring_reset(&block_ring[tcid]); + ring_reset(&entry_ring[tcid]); + + block_link_pool_counts[tcid] = 0; + blink_free[tcid] = NULL; + memset(unresolved_links[tcid], 0, sizeof(*unresolved_links[0]) * HASH_TABLE_SIZE(tcid)); + memset(hash_tables[tcid], 0, sizeof(*hash_tables[0]) * HASH_TABLE_SIZE(tcid)); + + if (tcid == 0) { // ROM, RAM + memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram)); + memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); + memset(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)); + memset(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)); + memset(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)); + memset(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)); + sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0; + } else { + memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram)); + memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); + memset(Pico32xMem->drcblk_da[tcid - 1], 0, sizeof(Pico32xMem->drcblk_da[tcid - 1])); + memset(Pico32xMem->drclit_da[tcid - 1], 0, sizeof(Pico32xMem->drclit_da[tcid - 1])); + memset(sh2s[tcid - 1].branch_cache, -1, sizeof(sh2s[0].branch_cache)); + memset(sh2s[tcid - 1].rts_cache, -1, sizeof(sh2s[0].rts_cache)); + sh2s[tcid - 1].rts_cache_idx = 0; + } +#if (DRC_DEBUG & 4) + tcache_dsm_ptrs[tcid] = tcache_ring[tcid].base; +#endif + + for (i = 0; i < RAM_SIZE(tcid) / INVAL_PAGE_SIZE; i++) + discard_block_list(&inval_lookup[tcid][i]); + 
discard_block_list(&inactive_blocks[tcid]); +} + +static void *dr_failure(void) +{ + printf("recompilation failed\n"); + exit(1); } // --------------------------------------------------------------- +// NB rcache allocation dependencies: +// - get_reg_arg/get_tmp_arg first (might evict other regs just allocated) +// - get_reg(..., NULL) before get_reg(..., &hr) if it might get the same reg +// - get_reg(..., RC_GR_READ/RMW, ...) before WRITE (might evict needed reg) + // register cache / constant propagation stuff typedef enum { RC_GR_READ, @@ -738,397 +1320,1174 @@ typedef enum { RC_GR_RMW, } rc_gr_mode; -static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking); +typedef struct { + u32 gregs; + u32 val; +} gconst_t; + +gconst_t gconsts[ARRAY_SIZE(guest_regs)]; + +static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr); +static inline int rcache_is_cached(sh2_reg_e r); +static void rcache_add_vreg_alias(int x, sh2_reg_e r); +static void rcache_remove_vreg_alias(int x, sh2_reg_e r); +static void rcache_evict_vreg(int x); +static void rcache_remap_vreg(int x); + +static void rcache_set_x16(int hr, int s16_, int u16_) +{ + int x = reg_map_host[hr]; + if (x >= 0) { + cache_regs[x].flags &= ~(HRF_S16|HRF_U16); + if (s16_) cache_regs[x].flags |= HRF_S16; + if (u16_) cache_regs[x].flags |= HRF_U16; + } +} + +static void rcache_copy_x16(int hr, int hr2) +{ + int x = reg_map_host[hr], y = reg_map_host[hr2]; + if (x >= 0 && y >= 0) { + cache_regs[x].flags = (cache_regs[x].flags & ~(HRF_S16|HRF_U16)) | + (cache_regs[y].flags & (HRF_S16|HRF_U16)); + } +} + +static int rcache_is_s16(int hr) +{ + int x = reg_map_host[hr]; + return (x >= 0 ? cache_regs[x].flags & HRF_S16 : 0); +} + +static int rcache_is_u16(int hr) +{ + int x = reg_map_host[hr]; + return (x >= 0 ? 
cache_regs[x].flags & HRF_U16 : 0); +} -// guest regs with constants -static u32 dr_gcregs[24]; -// a mask of constant/dirty regs -static u32 dr_gcregs_mask; -static u32 dr_gcregs_dirty; +#define RCACHE_DUMP(msg) { \ + cache_reg_t *cp; \ + guest_reg_t *gp; \ + int i; \ + printf("cache dump %s:\n",msg); \ + printf(" cache_regs:\n"); \ + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { \ + cp = &cache_regs[i]; \ + if (cp->type != HR_FREE || cp->gregs || cp->locked || cp->flags) \ + printf(" %d: hr=%d t=%d f=%x c=%d m=%x\n", i, cp->hreg, cp->type, cp->flags, cp->locked, cp->gregs); \ + } \ + printf(" guest_regs:\n"); \ + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { \ + gp = &guest_regs[i]; \ + if (gp->vreg != -1 || gp->sreg >= 0 || gp->flags) \ + printf(" %d: v=%d f=%x s=%d c=%d\n", i, gp->vreg, gp->flags, gp->sreg, gp->cnst); \ + } \ + printf(" gconsts:\n"); \ + for (i = 0; i < ARRAY_SIZE(gconsts); i++) { \ + if (gconsts[i].gregs) \ + printf(" %d: m=%x v=%x\n", i, gconsts[i].gregs, gconsts[i].val); \ + } \ +} + +#define RCACHE_CHECK(msg) { \ + cache_reg_t *cp; \ + guest_reg_t *gp; \ + int i, x, m = 0, d = 0; \ + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { \ + cp = &cache_regs[i]; \ + if (cp->flags & HRF_PINNED) m |= (1 << i); \ + if (cp->type == HR_FREE || cp->type == HR_TEMP) continue; \ + /* check connectivity greg->vreg */ \ + FOR_ALL_BITS_SET_DO(cp->gregs, x, \ + if (guest_regs[x].vreg != i) \ + { d = 1; printf("cache check v=%d r=%d not connected?\n",i,x); } \ + ) \ + } \ + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { \ + gp = &guest_regs[i]; \ + if (gp->vreg != -1 && !(cache_regs[gp->vreg].gregs & (1 << i))) \ + { d = 1; printf("cache check r=%d v=%d not connected?\n", i, gp->vreg); }\ + if (gp->vreg != -1 && cache_regs[gp->vreg].type != HR_CACHED) \ + { d = 1; printf("cache check r=%d v=%d wrong type?\n", i, gp->vreg); }\ + if ((gp->flags & GRF_CONST) && !(gconsts[gp->cnst].gregs & (1 << i))) \ + { d = 1; printf("cache check r=%d c=%d not connected?\n", 
i, gp->cnst); }\ + if ((gp->flags & GRF_CDIRTY) && (gp->vreg != -1 || !(gp->flags & GRF_CONST)))\ + { d = 1; printf("cache check r=%d CDIRTY?\n", i); } \ + if (gp->flags & (GRF_STATIC|GRF_PINNED)) { \ + if (gp->sreg == -1 || !(cache_regs[gp->sreg].flags & HRF_PINNED))\ + { d = 1; printf("cache check r=%d v=%d not pinned?\n", i, gp->vreg); } \ + else m &= ~(1 << gp->sreg); \ + } \ + } \ + for (i = 0; i < ARRAY_SIZE(gconsts); i++) { \ + FOR_ALL_BITS_SET_DO(gconsts[i].gregs, x, \ + if (guest_regs[x].cnst != i || !(guest_regs[x].flags & GRF_CONST)) \ + { d = 1; printf("cache check c=%d v=%d not connected?\n",i,x); } \ + ) \ + } \ + if (m) \ + { d = 1; printf("cache check m=%x pinning wrong?\n",m); } \ + if (d) RCACHE_DUMP(msg) \ +/* else { \ + printf("locked regs %s:\n",msg); \ + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { \ + cp = &cache_regs[i]; \ + if (cp->locked) \ + printf(" %d: hr=%d t=%d f=%x c=%d m=%x\n", i, cp->hreg, cp->type, cp->flags, cp->locked, cp->gregs); \ + } \ + } */ \ +} #if PROPAGATE_CONSTANTS -static void gconst_new(sh2_reg_e r, u32 val) +static inline int gconst_alloc(sh2_reg_e r) { - int i; + int i, n = -1; + + for (i = 0; i < ARRAY_SIZE(gconsts); i++) { + gconsts[i].gregs &= ~(1 << r); + if (gconsts[i].gregs == 0 && n < 0) + n = i; + } + if (n >= 0) + gconsts[n].gregs = (1 << r); + else { + printf("all gconst buffers in use, aborting\n"); + exit(1); // cannot happen - more constants than guest regs? 
+ } + return n; +} + +static void gconst_set(sh2_reg_e r, u32 val) +{ + int i = gconst_alloc(r); + + guest_regs[r].flags |= GRF_CONST; + guest_regs[r].cnst = i; + gconsts[i].val = val; +} - dr_gcregs_mask |= 1 << r; - dr_gcregs_dirty |= 1 << r; - dr_gcregs[r] = val; +static void gconst_new(sh2_reg_e r, u32 val) +{ + gconst_set(r, val); + guest_regs[r].flags |= GRF_CDIRTY; // throw away old r that we might have cached - for (i = ARRAY_SIZE(reg_temp) - 1; i >= 0; i--) { - if ((reg_temp[i].type == HR_CACHED) && - reg_temp[i].greg == r) { - reg_temp[i].type = HR_FREE; - reg_temp[i].flags = 0; - } - } + if (guest_regs[r].vreg >= 0) + rcache_remove_vreg_alias(guest_regs[r].vreg, r); } #endif static int gconst_get(sh2_reg_e r, u32 *val) { - if (dr_gcregs_mask & (1 << r)) { - *val = dr_gcregs[r]; + if (guest_regs[r].flags & GRF_CONST) { + *val = gconsts[guest_regs[r].cnst].val; return 1; } + *val = 0; return 0; } static int gconst_check(sh2_reg_e r) { - if ((dr_gcregs_mask | dr_gcregs_dirty) & (1 << r)) + if (guest_regs[r].flags & (GRF_CONST|GRF_CDIRTY)) return 1; return 0; } // update hr if dirty, else do nothing -static int gconst_try_read(int hr, sh2_reg_e r) +static int gconst_try_read(int vreg, sh2_reg_e r) { - if (dr_gcregs_dirty & (1 << r)) { - emith_move_r_imm(hr, dr_gcregs[r]); - dr_gcregs_dirty &= ~(1 << r); + int i, x; + u32 v; + + if (guest_regs[r].flags & GRF_CDIRTY) { + x = guest_regs[r].cnst; + v = gconsts[x].val; + emith_move_r_imm(cache_regs[vreg].hreg, v); + rcache_set_x16(cache_regs[vreg].hreg, v == (s16)v, v == (u16)v); + FOR_ALL_BITS_SET_DO(gconsts[x].gregs, i, + { + if (guest_regs[i].vreg >= 0 && guest_regs[i].vreg != vreg) + rcache_remove_vreg_alias(guest_regs[i].vreg, i); + if (guest_regs[i].vreg < 0) + rcache_add_vreg_alias(vreg, i); + guest_regs[i].flags &= ~GRF_CDIRTY; + guest_regs[i].flags |= GRF_DIRTY; + }); + cache_regs[vreg].type = HR_CACHED; + cache_regs[vreg].flags |= HRF_DIRTY; return 1; } return 0; } -static void 
gconst_check_evict(sh2_reg_e r) +static u32 gconst_dirty_mask(void) { - if (dr_gcregs_mask & (1 << r)) - // no longer cached in reg, make dirty again - dr_gcregs_dirty |= 1 << r; + u32 mask = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) + if (guest_regs[i].flags & GRF_CDIRTY) + mask |= (1 << i); + return mask; } static void gconst_kill(sh2_reg_e r) { - dr_gcregs_mask &= ~(1 << r); - dr_gcregs_dirty &= ~(1 << r); + if (guest_regs[r].flags & (GRF_CONST|GRF_CDIRTY)) + gconsts[guest_regs[r].cnst].gregs &= ~(1 << r); + guest_regs[r].flags &= ~(GRF_CONST|GRF_CDIRTY); +} + +static void gconst_copy(sh2_reg_e rd, sh2_reg_e rs) +{ + gconst_kill(rd); + if (guest_regs[rs].flags & GRF_CONST) { + guest_regs[rd].flags |= GRF_CONST; + if (guest_regs[rd].vreg < 0) + guest_regs[rd].flags |= GRF_CDIRTY; + guest_regs[rd].cnst = guest_regs[rs].cnst; + gconsts[guest_regs[rd].cnst].gregs |= (1 << rd); + } } static void gconst_clean(void) { int i; - for (i = 0; i < ARRAY_SIZE(dr_gcregs); i++) - if (dr_gcregs_dirty & (1 << i)) { + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) + if (guest_regs[i].flags & GRF_CDIRTY) { // using RC_GR_READ here: it will call gconst_try_read, // cache the reg and mark it dirty. 
- rcache_get_reg_(i, RC_GR_READ, 0); + rcache_get_reg_(i, RC_GR_READ, 0, NULL); } } static void gconst_invalidate(void) { - dr_gcregs_mask = dr_gcregs_dirty = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { + if (guest_regs[i].flags & (GRF_CONST|GRF_CDIRTY)) + gconsts[guest_regs[i].cnst].gregs &= ~(1 << i); + guest_regs[i].flags &= ~(GRF_CONST|GRF_CDIRTY); + } } + static u16 rcache_counter; +// SH2 register usage bitmasks +static u32 rcache_vregs_reg; // regs of type HRT_REG (for pinning) +static u32 rcache_regs_static; // statically allocated regs +static u32 rcache_regs_pinned; // pinned regs +static u32 rcache_regs_now; // regs used in current insn +static u32 rcache_regs_soon; // regs used in the next few insns +static u32 rcache_regs_late; // regs used in later insns +static u32 rcache_regs_discard; // regs overwritten without being used +static u32 rcache_regs_clean; // regs needing cleaning + +static void rcache_lock_vreg(int x) +{ + if (x >= 0) { + cache_regs[x].locked ++; +#if DRC_DEBUG & 64 + if (cache_regs[x].type == HR_FREE) { + printf("locking free vreg %x, aborting\n", x); + exit(1); + } + if (!cache_regs[x].locked) { + printf("locking overflow vreg %x, aborting\n", x); + exit(1); + } +#endif + } +} + +static void rcache_unlock_vreg(int x) +{ + if (x >= 0) { +#if DRC_DEBUG & 64 + if (cache_regs[x].type == HR_FREE) { + printf("unlocking free vreg %x, aborting\n", x); + exit(1); + } +#endif + if (cache_regs[x].locked) + cache_regs[x].locked --; + } +} + +static void rcache_free_vreg(int x) +{ + cache_regs[x].type = cache_regs[x].locked ? 
HR_TEMP : HR_FREE; + cache_regs[x].flags &= HRF_PINNED; + cache_regs[x].gregs = 0; +} + +static void rcache_unmap_vreg(int x) +{ + int i; + + FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, i, + if (guest_regs[i].flags & GRF_DIRTY) { + // if a dirty reg is unmapped save its value to context + if ((~rcache_regs_discard | rcache_regs_now) & (1 << i)) + emith_ctx_write(cache_regs[x].hreg, i * 4); + guest_regs[i].flags &= ~GRF_DIRTY; + } + guest_regs[i].vreg = -1); + rcache_free_vreg(x); +} -static temp_reg_t *rcache_evict(void) +static void rcache_move_vreg(int d, int x) { - // evict reg with oldest stamp - int i, oldest = -1; + int i; + + cache_regs[d].type = HR_CACHED; + cache_regs[d].gregs = cache_regs[x].gregs; + cache_regs[d].flags &= HRF_PINNED; + cache_regs[d].flags |= cache_regs[x].flags & ~HRF_PINNED; + cache_regs[d].locked = 0; + cache_regs[d].stamp = cache_regs[x].stamp; + emith_move_r_r(cache_regs[d].hreg, cache_regs[x].hreg); + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) + if (guest_regs[i].vreg == x) + guest_regs[i].vreg = d; + rcache_free_vreg(x); +} + +static void rcache_clean_vreg(int x) +{ + u32 rns = rcache_regs_now | rcache_regs_soon; + int r; + + if (cache_regs[x].flags & HRF_DIRTY) { // writeback + cache_regs[x].flags &= ~HRF_DIRTY; + rcache_lock_vreg(x); + FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, r, + if (guest_regs[r].flags & GRF_DIRTY) { + if (guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) { + if (guest_regs[r].vreg != guest_regs[r].sreg && + !cache_regs[guest_regs[r].sreg].locked && + ((~rcache_regs_discard | rcache_regs_now) & (1 << r)) && + !(rns & cache_regs[guest_regs[r].sreg].gregs)) { + // statically mapped reg not in its sreg. 
move back to sreg + rcache_evict_vreg(guest_regs[r].sreg); + emith_move_r_r(cache_regs[guest_regs[r].sreg].hreg, + cache_regs[guest_regs[r].vreg].hreg); + rcache_copy_x16(cache_regs[guest_regs[r].sreg].hreg, + cache_regs[guest_regs[r].vreg].hreg); + rcache_remove_vreg_alias(x, r); + rcache_add_vreg_alias(guest_regs[r].sreg, r); + cache_regs[guest_regs[r].sreg].flags |= HRF_DIRTY; + } else + // cannot remap. keep dirty for writeback in unmap + cache_regs[x].flags |= HRF_DIRTY; + } else { + if ((~rcache_regs_discard | rcache_regs_now) & (1 << r)) + emith_ctx_write(cache_regs[x].hreg, r * 4); + guest_regs[r].flags &= ~GRF_DIRTY; + } + rcache_regs_clean &= ~(1 << r); + }) + rcache_unlock_vreg(x); + } + +#if DRC_DEBUG & 64 + RCACHE_CHECK("after clean"); +#endif +} + +static void rcache_add_vreg_alias(int x, sh2_reg_e r) +{ + cache_regs[x].gregs |= (1 << r); + guest_regs[r].vreg = x; + cache_regs[x].type = HR_CACHED; +} + +static void rcache_remove_vreg_alias(int x, sh2_reg_e r) +{ + cache_regs[x].gregs &= ~(1 << r); + if (!cache_regs[x].gregs) { + // no reg mapped -> free vreg + if (cache_regs[x].locked) + cache_regs[x].type = HR_TEMP; + else + rcache_free_vreg(x); + } + guest_regs[r].vreg = -1; +} + +static void rcache_evict_vreg(int x) +{ +#if REMAP_REGISTER + rcache_remap_vreg(x); +#else + rcache_clean_vreg(x); +#endif + rcache_unmap_vreg(x); +} + +static void rcache_evict_vreg_aliases(int x, sh2_reg_e r) +{ + rcache_remove_vreg_alias(x, r); + rcache_evict_vreg(x); + rcache_add_vreg_alias(x, r); +} + +static int rcache_allocate(int what, int minprio) +{ + // evict reg with oldest stamp (only for HRT_REG, no temps) + int i, i_prio, oldest = -1, prio = 0; u16 min_stamp = (u16)-1; - for (i = 0; i < ARRAY_SIZE(reg_temp); i++) { - if (reg_temp[i].type == HR_CACHED && !(reg_temp[i].flags & HRF_LOCKED) && - reg_temp[i].stamp <= min_stamp) { - min_stamp = reg_temp[i].stamp; + for (i = ARRAY_SIZE(cache_regs)-1; i >= 0; i--) { + // consider only non-static, unpinned, unlocked 
REG or TEMP + if ((cache_regs[i].flags & HRF_PINNED) || cache_regs[i].locked) + continue; + if ((what > 0 && !(cache_regs[i].htype & HRT_REG)) || // get a REG + (what == 0 && (cache_regs[i].htype & HRT_TEMP)) || // get a non-TEMP + (what < 0 && !(cache_regs[i].htype & HRT_TEMP))) // get a TEMP + continue; + if (cache_regs[i].type == HR_FREE || cache_regs[i].type == HR_TEMP) { + // REG is free + prio = 10; oldest = i; + break; + } + if (cache_regs[i].type == HR_CACHED) { + if (rcache_regs_now & cache_regs[i].gregs) + // REGs needed for the current insn + i_prio = 0; + else if (rcache_regs_soon & cache_regs[i].gregs) + // REGs needed in the next insns + i_prio = 2; + else if (rcache_regs_late & cache_regs[i].gregs) + // REGs needed in some future insn + i_prio = 4; + else if (~rcache_regs_discard & cache_regs[i].gregs) + // REGs not needed in the foreseeable future + i_prio = 6; + else + // REGs soon overwritten anyway + i_prio = 8; + if (!(cache_regs[i].flags & HRF_DIRTY)) i_prio ++; + + if (prio < i_prio || (prio == i_prio && cache_regs[i].stamp < min_stamp)) { + min_stamp = cache_regs[i].stamp; + oldest = i; + prio = i_prio; + } } } - if (oldest == -1) { - printf("no registers to evict, aborting\n"); + + if (prio < minprio || oldest == -1) + return -1; + + if (cache_regs[oldest].type == HR_CACHED) + rcache_evict_vreg(oldest); + else + rcache_free_vreg(oldest); + + return oldest; +} + +static int rcache_allocate_vreg(int needed) +{ + int x; + + x = rcache_allocate(1, needed ? 
0 : 4); + if (x < 0) + x = rcache_allocate(-1, 0); + return x; +} + +static int rcache_allocate_nontemp(void) +{ + int x = rcache_allocate(0, 4); + return x; +} + +static int rcache_allocate_temp(void) +{ + int x = rcache_allocate(-1, 0); + if (x < 0) + x = rcache_allocate(0, 0); + return x; +} + +#if REMAP_REGISTER +// maps a host register to a REG +static int rcache_map_reg(sh2_reg_e r, int hr) +{ + int i; + + gconst_kill(r); + + // lookup the TEMP hr maps to + i = reg_map_host[hr]; + if (i < 0) { + // must not happen + printf("invalid host register %d\n", hr); exit(1); } - i = oldest; - if (reg_temp[i].type == HR_CACHED) { - if (reg_temp[i].flags & HRF_DIRTY) - // writeback - emith_ctx_write(reg_temp[i].hreg, reg_temp[i].greg * 4); - gconst_check_evict(reg_temp[i].greg); + // remove old mappings of r and i if one exists + if (guest_regs[r].vreg >= 0) + rcache_remove_vreg_alias(guest_regs[r].vreg, r); + if (cache_regs[i].type == HR_CACHED) + rcache_evict_vreg(i); + // set new mappping + cache_regs[i].type = HR_CACHED; + cache_regs[i].gregs = 1 << r; + cache_regs[i].locked = 0; + cache_regs[i].stamp = ++rcache_counter; + cache_regs[i].flags |= HRF_DIRTY; + rcache_lock_vreg(i); + guest_regs[r].flags |= GRF_DIRTY; + guest_regs[r].vreg = i; +#if DRC_DEBUG & 64 + RCACHE_CHECK("after map"); +#endif + return cache_regs[i].hreg; +} + +// remap vreg from a TEMP to a REG if it will be used (upcoming TEMP invalidation) +static void rcache_remap_vreg(int x) +{ + u32 rsl_d = rcache_regs_soon | rcache_regs_late; + int d; + + // x must be a cached vreg + if (cache_regs[x].type != HR_CACHED || cache_regs[x].locked) + return; + // don't do it if x isn't used + if (!(rsl_d & cache_regs[x].gregs)) { + // clean here to avoid data loss on invalidation + rcache_clean_vreg(x); + return; + } + + FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, d, + if ((guest_regs[d].flags & (GRF_STATIC|GRF_PINNED)) && + !cache_regs[guest_regs[d].sreg].locked && + !((rsl_d|rcache_regs_now) & 
cache_regs[guest_regs[d].sreg].gregs)) { + // STATIC not in its sreg and sreg is available + rcache_evict_vreg(guest_regs[d].sreg); + rcache_move_vreg(guest_regs[d].sreg, x); + return; + } + ) + + // allocate a non-TEMP vreg + rcache_lock_vreg(x); // lock to avoid evicting x + d = rcache_allocate_nontemp(); + rcache_unlock_vreg(x); + if (d < 0) { + rcache_clean_vreg(x); + return; } - reg_temp[i].type = HR_FREE; - reg_temp[i].flags = 0; - return ®_temp[i]; + // move vreg to new location + rcache_move_vreg(d, x); +#if DRC_DEBUG & 64 + RCACHE_CHECK("after remap"); +#endif } +#endif -static int get_reg_static(sh2_reg_e r, rc_gr_mode mode) +#if ALIAS_REGISTERS +static void rcache_alias_vreg(sh2_reg_e rd, sh2_reg_e rs) { - int i = reg_map_g2h[r]; - if (i != -1) { - if (mode != RC_GR_WRITE) - gconst_try_read(i, r); + int x; + + // if s isn't constant, it must be in cache for aliasing + if (!gconst_check(rs)) + rcache_get_reg_(rs, RC_GR_READ, 0, NULL); + + // if d and s are not already aliased + x = guest_regs[rs].vreg; + if (guest_regs[rd].vreg != x) { + // remove possible old mapping of dst + if (guest_regs[rd].vreg >= 0) + rcache_remove_vreg_alias(guest_regs[rd].vreg, rd); + // make dst an alias of src + if (x >= 0) + rcache_add_vreg_alias(x, rd); + // if d is now in cache, it must be dirty + if (guest_regs[rd].vreg >= 0) { + x = guest_regs[rd].vreg; + cache_regs[x].flags |= HRF_DIRTY; + guest_regs[rd].flags |= GRF_DIRTY; + } } - return i; + + gconst_copy(rd, rs); +#if DRC_DEBUG & 64 + RCACHE_CHECK("after alias"); +#endif } +#endif // note: must not be called when doing conditional code -static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking) -{ - temp_reg_t *tr; - int i, ret; - - // maybe statically mapped? - ret = get_reg_static(r, mode); - if (ret != -1) - goto end; - - rcache_counter++; - - // maybe already cached? 
- // if so, prefer against gconst (they must be in sync) - for (i = ARRAY_SIZE(reg_temp) - 1; i >= 0; i--) { - if (reg_temp[i].type == HR_CACHED && reg_temp[i].greg == r) { - reg_temp[i].stamp = rcache_counter; - if (mode != RC_GR_READ) - reg_temp[i].flags |= HRF_DIRTY; - ret = reg_temp[i].hreg; - goto end; +static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr) +{ + int src, dst, ali; + cache_reg_t *tr; + u32 rsp_d = (rcache_regs_soon | rcache_regs_static | rcache_regs_pinned) & + ~rcache_regs_discard; + + dst = src = guest_regs[r].vreg; + + rcache_lock_vreg(src); // lock to avoid evicting src + // good opportunity to relocate a remapped STATIC? + if ((guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) && + src != guest_regs[r].sreg && (src < 0 || mode != RC_GR_READ) && + !cache_regs[guest_regs[r].sreg].locked && + !((rsp_d|rcache_regs_now) & cache_regs[guest_regs[r].sreg].gregs)) { + dst = guest_regs[r].sreg; + rcache_evict_vreg(dst); + } else if (dst < 0) { + // allocate a cache register + if ((dst = rcache_allocate_vreg(rsp_d & (1 << r))) < 0) { + printf("no registers to evict, aborting\n"); + exit(1); } } - - // use any free reg - for (i = ARRAY_SIZE(reg_temp) - 1; i >= 0; i--) { - if (reg_temp[i].type == HR_FREE) { - tr = ®_temp[i]; - goto do_alloc; + tr = &cache_regs[dst]; + tr->stamp = rcache_counter; + // remove r from src + if (src >= 0 && src != dst) + rcache_remove_vreg_alias(src, r); + rcache_unlock_vreg(src); + + // if r has a constant it may have aliases + if (mode != RC_GR_WRITE && gconst_try_read(dst, r)) + src = dst; + + // if r will be modified, check for aliases being needed rsn + ali = tr->gregs & ~(1 << r); + if (mode != RC_GR_READ && src == dst && ali) { + int x = -1; + if ((rsp_d|rcache_regs_now) & ali) { + if ((guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) && + guest_regs[r].sreg == dst && !tr->locked) { + // split aliases if r is STATIC in sreg and dst isn't already locked + int t; + FOR_ALL_BITS_SET_DO(ali, t, + if 
((guest_regs[t].flags & (GRF_STATIC|GRF_PINNED)) && + !(ali & ~(1 << t)) && + !cache_regs[guest_regs[t].sreg].locked && + !((rsp_d|rcache_regs_now) & cache_regs[guest_regs[t].sreg].gregs)) { + // alias is a single STATIC and its sreg is available + x = guest_regs[t].sreg; + rcache_evict_vreg(x); + } else { + rcache_lock_vreg(dst); // lock to avoid evicting dst + x = rcache_allocate_vreg(rsp_d & ali); + rcache_unlock_vreg(dst); + } + break; + ) + if (x >= 0) { + rcache_remove_vreg_alias(src, r); + src = dst; + rcache_move_vreg(x, dst); + } + } else { + // split r + rcache_lock_vreg(src); // lock to avoid evicting src + x = rcache_allocate_vreg(rsp_d & (1 << r)); + rcache_unlock_vreg(src); + if (x >= 0) { + rcache_remove_vreg_alias(src, r); + dst = x; + tr = &cache_regs[dst]; + tr->stamp = rcache_counter; + } + } } + if (x < 0) + // aliases not needed or no vreg available, remove them + rcache_evict_vreg_aliases(dst, r); } - tr = rcache_evict(); + // assign r to dst + rcache_add_vreg_alias(dst, r); -do_alloc: - tr->type = HR_CACHED; + // handle dst register transfer + if (src < 0 && mode != RC_GR_WRITE) + emith_ctx_read(tr->hreg, r * 4); + if (hr) { + *hr = (src >= 0 ? cache_regs[src].hreg : tr->hreg); + rcache_lock_vreg(src >= 0 ? 
src : dst); + } else if (src >= 0 && mode != RC_GR_WRITE && cache_regs[src].hreg != tr->hreg) + emith_move_r_r(tr->hreg, cache_regs[src].hreg); + + // housekeeping if (do_locking) - tr->flags |= HRF_LOCKED; - if (mode != RC_GR_READ) + rcache_lock_vreg(dst); + if (mode != RC_GR_READ) { tr->flags |= HRF_DIRTY; - tr->greg = r; - tr->stamp = rcache_counter; - ret = tr->hreg; - - if (mode != RC_GR_WRITE) { - if (gconst_check(r)) { - if (gconst_try_read(ret, r)) - tr->flags |= HRF_DIRTY; - } - else - emith_ctx_read(tr->hreg, r * 4); - } - -end: - if (mode != RC_GR_READ) + guest_regs[r].flags |= GRF_DIRTY; gconst_kill(r); + rcache_set_x16(tr->hreg, 0, 0); + } else if (src >= 0 && cache_regs[src].hreg != tr->hreg) + rcache_copy_x16(tr->hreg, cache_regs[src].hreg); +#if DRC_DEBUG & 64 + RCACHE_CHECK("after getreg"); +#endif + return tr->hreg; +} - return ret; +static int rcache_get_reg(sh2_reg_e r, rc_gr_mode mode, int *hr) +{ + return rcache_get_reg_(r, mode, 1, hr); } -static int rcache_get_reg(sh2_reg_e r, rc_gr_mode mode) +static void rcache_pin_reg(sh2_reg_e r) { - return rcache_get_reg_(r, mode, 1); + int hr, x; + + // don't pin if static or already pinned + if (guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) + return; + + rcache_regs_soon |= (1 << r); // kludge to prevent allocation of a temp + hr = rcache_get_reg_(r, RC_GR_RMW, 0, NULL); + x = reg_map_host[hr]; + + // can only pin non-TEMPs + if (!(cache_regs[x].htype & HRT_TEMP)) { + guest_regs[r].flags |= GRF_PINNED; + cache_regs[x].flags |= HRF_PINNED; + guest_regs[r].sreg = x; + rcache_regs_pinned |= (1 << r); + } +#if DRC_DEBUG & 64 + RCACHE_CHECK("after pin"); +#endif } static int rcache_get_tmp(void) { - temp_reg_t *tr; int i; - for (i = 0; i < ARRAY_SIZE(reg_temp); i++) - if (reg_temp[i].type == HR_FREE) { - tr = ®_temp[i]; - goto do_alloc; - } + i = rcache_allocate_temp(); + if (i < 0) { + printf("cannot allocate temp\n"); + exit(1); + } - tr = rcache_evict(); + cache_regs[i].type = HR_TEMP; + 
rcache_lock_vreg(i); -do_alloc: - tr->type = HR_TEMP; - return tr->hreg; + return cache_regs[i].hreg; } -static int rcache_get_hr_id(int hr) +static int rcache_get_vreg_hr(int hr) { int i; - for (i = 0; i < ARRAY_SIZE(reg_temp); i++) - if (reg_temp[i].hreg == hr) - break; - - if (i == ARRAY_SIZE(reg_temp)) // can't happen + i = reg_map_host[hr]; + if (i < 0 || cache_regs[i].locked) { + printf("host register %d is locked\n", hr); exit(1); - - if (reg_temp[i].type == HR_CACHED) { - // writeback - if (reg_temp[i].flags & HRF_DIRTY) - emith_ctx_write(reg_temp[i].hreg, reg_temp[i].greg * 4); - gconst_check_evict(reg_temp[i].greg); } - else if (reg_temp[i].type == HR_TEMP) { + + if (cache_regs[i].type == HR_CACHED) + rcache_evict_vreg(i); + else if (cache_regs[i].type == HR_TEMP && cache_regs[i].locked) { printf("host reg %d already used, aborting\n", hr); exit(1); } - reg_temp[i].type = HR_FREE; - reg_temp[i].flags = 0; - return i; } -static int rcache_get_arg_id(int arg) +static int rcache_get_vreg_arg(int arg) { - int r = 0; - host_arg2reg(r, arg); - return rcache_get_hr_id(r); + int hr = 0; + + host_arg2reg(hr, arg); + return rcache_get_vreg_hr(hr); } // get a reg to be used as function arg static int rcache_get_tmp_arg(int arg) { - int id = rcache_get_arg_id(arg); - reg_temp[id].type = HR_TEMP; + int x = rcache_get_vreg_arg(arg); + cache_regs[x].type = HR_TEMP; + rcache_lock_vreg(x); - return reg_temp[id].hreg; + return cache_regs[x].hreg; } // ... as return value after a call static int rcache_get_tmp_ret(void) { - int id = rcache_get_hr_id(RET_REG); - reg_temp[id].type = HR_TEMP; + int x = rcache_get_vreg_hr(RET_REG); + cache_regs[x].type = HR_TEMP; + rcache_lock_vreg(x); - return reg_temp[id].hreg; + return cache_regs[x].hreg; } -// same but caches a reg. RC_GR_READ only. 
-static int rcache_get_reg_arg(int arg, sh2_reg_e r) +// same but caches a reg if access is readonly (announced by hr being NULL) +static int rcache_get_reg_arg(int arg, sh2_reg_e r, int *hr) { - int i, srcr, dstr, dstid; - int dirty = 0, src_dirty = 0; + int i, srcr, dstr, dstid, keep; + u32 val; + host_arg2reg(dstr, arg); + + i = guest_regs[r].vreg; + if (i >= 0 && cache_regs[i].type == HR_CACHED && cache_regs[i].hreg == dstr) + // r is already in arg, avoid evicting + dstid = i; + else + dstid = rcache_get_vreg_arg(arg); + dstr = cache_regs[dstid].hreg; + + if (rcache_is_cached(r)) { + // r is needed later on anyway + srcr = rcache_get_reg_(r, RC_GR_READ, 0, NULL); + keep = 1; + } else if ((guest_regs[r].flags & GRF_CDIRTY) && gconst_get(r, &val)) { + // r has an uncomitted const - load into arg, but keep constant uncomitted + srcr = dstr; + emith_move_r_imm(srcr, val); + keep = 0; + } else { + // must read from ctx + srcr = dstr; + emith_ctx_read(srcr, r * 4); + keep = 1; + } - dstid = rcache_get_arg_id(arg); - dstr = reg_temp[dstid].hreg; + if (cache_regs[dstid].type == HR_CACHED) + rcache_evict_vreg(dstid); + + cache_regs[dstid].type = HR_TEMP; + if (hr == NULL) { + if (dstr != srcr) + // arg is a copy of cached r + emith_move_r_r(dstr, srcr); + else if (keep && guest_regs[r].vreg < 0) + // keep arg as vreg for r + rcache_add_vreg_alias(dstid, r); + } else { + *hr = srcr; + if (dstr != srcr) // must lock srcr if not copied here + rcache_lock_vreg(reg_map_host[srcr]); + } + + cache_regs[dstid].stamp = ++rcache_counter; + rcache_lock_vreg(dstid); +#if DRC_DEBUG & 64 + RCACHE_CHECK("after getarg"); +#endif + return dstr; +} - // maybe already statically mapped? - srcr = get_reg_static(r, RC_GR_READ); - if (srcr != -1) - goto do_cache; +static void rcache_free_tmp(int hr) +{ + int i = reg_map_host[hr]; - // maybe already cached? 
- for (i = ARRAY_SIZE(reg_temp) - 1; i >= 0; i--) { - if ((reg_temp[i].type == HR_CACHED) && - reg_temp[i].greg == r) - { - srcr = reg_temp[i].hreg; - if (reg_temp[i].flags & HRF_DIRTY) - src_dirty = 1; - goto do_cache; - } + if (i < 0 || cache_regs[i].type != HR_TEMP) { + printf("rcache_free_tmp fail: #%i hr %d, type %d\n", i, hr, cache_regs[i].type); + exit(1); } - // must read - srcr = dstr; - if (gconst_check(r)) { - if (gconst_try_read(srcr, r)) - dirty = 1; + rcache_unlock_vreg(i); +} + +// saves temporary result either in REG or in drctmp +static int rcache_save_tmp(int hr) +{ + int i; + + // find REG, either free or unlocked temp or oldest non-hinted cached + i = rcache_allocate_nontemp(); + if (i < 0) { + // if none is available, store in drctmp + emith_ctx_write(hr, offsetof(SH2, drc_tmp)); + rcache_free_tmp(hr); + return -1; } - else - emith_ctx_read(srcr, r * 4); -do_cache: - if (dstr != srcr) - emith_move_r_r(dstr, srcr); -#if 1 - else - dirty |= src_dirty; + cache_regs[i].type = HR_CACHED; + cache_regs[i].gregs = 0; // not storing any guest register + cache_regs[i].flags &= HRF_PINNED; + cache_regs[i].locked = 0; + cache_regs[i].stamp = ++rcache_counter; + rcache_lock_vreg(i); + emith_move_r_r(cache_regs[i].hreg, hr); + rcache_free_tmp(hr); + return i; +} - if (dirty) - // must clean, callers might want to modify the arg before call - emith_ctx_write(dstr, r * 4); -#else - if (dirty) - reg_temp[dstid].flags |= HRF_DIRTY; -#endif +static int rcache_restore_tmp(int x) +{ + int hr; - reg_temp[dstid].stamp = ++rcache_counter; - reg_temp[dstid].type = HR_CACHED; - reg_temp[dstid].greg = r; - reg_temp[dstid].flags |= HRF_LOCKED; - return dstr; + // find REG with tmp store: cached but with no gregs + if (x >= 0) { + if (cache_regs[x].type != HR_CACHED || cache_regs[x].gregs) { + printf("invalid tmp storage %d\n", x); + exit(1); + } + // found, transform to a TEMP + cache_regs[x].type = HR_TEMP; + return cache_regs[x].hreg; + } + + // if not available, create 
a TEMP store and fetch from drctmp + hr = rcache_get_tmp(); + emith_ctx_read(hr, offsetof(SH2, drc_tmp)); + + return hr; } -static void rcache_free_tmp(int hr) +static void rcache_free(int hr) +{ + int x = reg_map_host[hr]; + rcache_unlock_vreg(x); +} + +static void rcache_unlock(int x) +{ + if (x >= 0) + cache_regs[x].locked = 0; +} + +static void rcache_unlock_all(void) { int i; - for (i = 0; i < ARRAY_SIZE(reg_temp); i++) - if (reg_temp[i].hreg == hr) - break; + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) + cache_regs[i].locked = 0; +} - if (i == ARRAY_SIZE(reg_temp) || reg_temp[i].type != HR_TEMP) { - printf("rcache_free_tmp fail: #%i hr %d, type %d\n", i, hr, reg_temp[i].type); - return; +static void rcache_unpin_all(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { + if (guest_regs[i].flags & GRF_PINNED) { + guest_regs[i].flags &= ~GRF_PINNED; + cache_regs[guest_regs[i].sreg].flags &= ~HRF_PINNED; + guest_regs[i].sreg = -1; + rcache_regs_pinned &= ~(1 << i); + } } +#if DRC_DEBUG & 64 + RCACHE_CHECK("after unpin"); +#endif +} + +static void rcache_save_pinned(void) +{ + int i; + + // save pinned regs to context + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) + if ((guest_regs[i].flags & GRF_PINNED) && guest_regs[i].vreg >= 0) + emith_ctx_write(cache_regs[guest_regs[i].vreg].hreg, i * 4); +} + +static inline void rcache_set_usage_now(u32 mask) +{ + rcache_regs_now = mask; +} + +static inline void rcache_set_usage_soon(u32 mask) +{ + rcache_regs_soon = mask; +} + +static inline void rcache_set_usage_late(u32 mask) +{ + rcache_regs_late = mask; +} - reg_temp[i].type = HR_FREE; - reg_temp[i].flags = 0; +static inline void rcache_set_usage_discard(u32 mask) +{ + rcache_regs_discard = mask; +} + +static inline int rcache_is_cached(sh2_reg_e r) +{ + // is r in cache or needed RSN? 
+ u32 rsc = rcache_regs_soon | rcache_regs_clean; + return (guest_regs[r].vreg >= 0 || (rsc & (1 << r))); } -static void rcache_unlock(int hr) +static inline int rcache_is_hreg_used(int hr) { + int x = reg_map_host[hr]; + // is hr in use? + return cache_regs[x].type != HR_FREE && + (cache_regs[x].type != HR_TEMP || cache_regs[x].locked); +} + +static inline u32 rcache_used_hregs_mask(void) +{ + u32 mask = 0; int i; - for (i = 0; i < ARRAY_SIZE(reg_temp); i++) - if (reg_temp[i].type == HR_CACHED && reg_temp[i].hreg == hr) - reg_temp[i].flags &= ~HRF_LOCKED; + + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) + if ((cache_regs[i].htype & HRT_TEMP) && cache_regs[i].type != HR_FREE && + (cache_regs[i].type != HR_TEMP || cache_regs[i].locked)) + mask |= 1 << cache_regs[i].hreg; + + return mask; } -static void rcache_unlock_all(void) +static inline u32 rcache_dirty_mask(void) { + u32 mask = 0; int i; - for (i = 0; i < ARRAY_SIZE(reg_temp); i++) - reg_temp[i].flags &= ~HRF_LOCKED; + + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) + if (guest_regs[i].flags & GRF_DIRTY) + mask |= 1 << i; + mask |= gconst_dirty_mask(); + + return mask; } -#ifdef DRC_CMP -static u32 rcache_used_hreg_mask(void) +static inline u32 rcache_cached_mask(void) { u32 mask = 0; int i; - for (i = 0; i < ARRAY_SIZE(reg_temp); i++) - if (reg_temp[i].type != HR_FREE) - mask |= 1 << reg_temp[i].hreg; + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) + if (cache_regs[i].type == HR_CACHED) + mask |= cache_regs[i].gregs; return mask; } + +static void rcache_clean_tmp(void) +{ + int i; + + rcache_regs_clean = (1 << ARRAY_SIZE(guest_regs)) - 1; + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) + if (cache_regs[i].type == HR_CACHED && (cache_regs[i].htype & HRT_TEMP)) { + rcache_unlock(i); +#if REMAP_REGISTER + rcache_remap_vreg(i); +#else + rcache_clean_vreg(i); #endif + } + rcache_regs_clean = 0; +} + +static void rcache_clean_masked(u32 mask) +{ + int i, r, hr; + u32 m; + + rcache_regs_clean |= mask; + mask = 
rcache_regs_clean; + + // clean constants where all aliases are covered by the mask, exempt statics + // to avoid flushing them to context if sreg isn't available + m = mask & ~(rcache_regs_static | rcache_regs_pinned); + for (i = 0; i < ARRAY_SIZE(gconsts); i++) + if ((gconsts[i].gregs & m) && !(gconsts[i].gregs & ~mask)) { + FOR_ALL_BITS_SET_DO(gconsts[i].gregs, r, + if (guest_regs[r].flags & GRF_CDIRTY) { + hr = rcache_get_reg_(r, RC_GR_READ, 0, NULL); + rcache_clean_vreg(reg_map_host[hr]); + break; + }); + } + // clean vregs where all aliases are covered by the mask + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) + if (cache_regs[i].type == HR_CACHED && + (cache_regs[i].gregs & mask) && !(cache_regs[i].gregs & ~mask)) + rcache_clean_vreg(i); +} static void rcache_clean(void) { int i; gconst_clean(); - for (i = 0; i < ARRAY_SIZE(reg_temp); i++) - if (reg_temp[i].type == HR_CACHED && (reg_temp[i].flags & HRF_DIRTY)) { - // writeback - emith_ctx_write(reg_temp[i].hreg, reg_temp[i].greg * 4); - reg_temp[i].flags &= ~HRF_DIRTY; + rcache_regs_clean = (1 << ARRAY_SIZE(guest_regs)) - 1; + for (i = ARRAY_SIZE(cache_regs)-1; i >= 0; i--) + if (cache_regs[i].type == HR_CACHED) + rcache_clean_vreg(i); + + // relocate statics to their sregs (necessary before conditional jumps) + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { + if ((guest_regs[i].flags & (GRF_STATIC|GRF_PINNED)) && + guest_regs[i].vreg != guest_regs[i].sreg) { + rcache_lock_vreg(guest_regs[i].vreg); + rcache_evict_vreg(guest_regs[i].sreg); + rcache_unlock_vreg(guest_regs[i].vreg); + if (guest_regs[i].vreg < 0) + emith_ctx_read(cache_regs[guest_regs[i].sreg].hreg, i*4); + else { + emith_move_r_r(cache_regs[guest_regs[i].sreg].hreg, + cache_regs[guest_regs[i].vreg].hreg); + rcache_copy_x16(cache_regs[guest_regs[i].sreg].hreg, + cache_regs[guest_regs[i].vreg].hreg); + rcache_remove_vreg_alias(guest_regs[i].vreg, i); + } + cache_regs[guest_regs[i].sreg].gregs = 1 << i; + cache_regs[guest_regs[i].sreg].type = 
HR_CACHED; + cache_regs[guest_regs[i].sreg].flags |= HRF_DIRTY|HRF_PINNED; + guest_regs[i].flags |= GRF_DIRTY; + guest_regs[i].vreg = guest_regs[i].sreg; + } + } + rcache_regs_clean = 0; +} + +static void rcache_invalidate_tmp(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { + if (cache_regs[i].htype & HRT_TEMP) { + rcache_unlock(i); + if (cache_regs[i].type == HR_CACHED) + rcache_evict_vreg(i); + else + rcache_free_vreg(i); } + } } static void rcache_invalidate(void) { int i; - for (i = 0; i < ARRAY_SIZE(reg_temp); i++) { - reg_temp[i].type = HR_FREE; - reg_temp[i].flags = 0; + gconst_invalidate(); + rcache_unlock_all(); + + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) + rcache_free_vreg(i); + + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { + guest_regs[i].flags &= GRF_STATIC; + if (!(guest_regs[i].flags & GRF_STATIC)) + guest_regs[i].vreg = -1; + else { + cache_regs[guest_regs[i].sreg].gregs = 1 << i; + cache_regs[guest_regs[i].sreg].type = HR_CACHED; + cache_regs[guest_regs[i].sreg].flags |= HRF_DIRTY|HRF_PINNED; + guest_regs[i].flags |= GRF_DIRTY; + guest_regs[i].vreg = guest_regs[i].sreg; + } } - rcache_counter = 0; - gconst_invalidate(); + rcache_counter = 0; + rcache_regs_now = rcache_regs_soon = rcache_regs_late = 0; + rcache_regs_discard = rcache_regs_clean = 0; } static void rcache_flush(void) @@ -1137,249 +2496,472 @@ static void rcache_flush(void) rcache_invalidate(); } +static void rcache_create(void) +{ + int x = 0, i; + + // create cache_regs as host register representation + // RET_REG/params should be first TEMPs to avoid allocation conflicts in calls + cache_regs[x++] = (cache_reg_t) {.hreg = RET_REG, .htype = HRT_TEMP}; + for (i = 0; i < ARRAY_SIZE(hregs_param); i++) + if (hregs_param[i] != RET_REG) + cache_regs[x++] = (cache_reg_t){.hreg = hregs_param[i],.htype = HRT_TEMP}; + + for (i = 0; i < ARRAY_SIZE(hregs_temp); i++) + if (hregs_temp[i] != RET_REG) + cache_regs[x++] = (cache_reg_t){.hreg = hregs_temp[i], .htype = 
HRT_TEMP}; + + for (i = ARRAY_SIZE(hregs_saved)-1; i >= 0; i--) + if (hregs_saved[i] != CONTEXT_REG) + cache_regs[x++] = (cache_reg_t){.hreg = hregs_saved[i], .htype = HRT_REG}; + + if (x != ARRAY_SIZE(cache_regs)) { + printf("rcache_create failed (conflicting register count)\n"); + exit(1); + } + + // mapping from host_register to cache regs index + memset(reg_map_host, -1, sizeof(reg_map_host)); + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { + if (cache_regs[i].htype) + reg_map_host[cache_regs[i].hreg] = i; + if (cache_regs[i].htype == HRT_REG) + rcache_vregs_reg |= (1 << i); + } + + // create static host register mapping for SH2 regs + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { + guest_regs[i] = (guest_reg_t){.sreg = -1}; + } + for (i = 0; i < ARRAY_SIZE(regs_static); i += 2) { + for (x = ARRAY_SIZE(cache_regs)-1; x >= 0; x--) + if (cache_regs[x].hreg == regs_static[i+1]) break; + if (x >= 0) { + guest_regs[regs_static[i]] = (guest_reg_t){.flags = GRF_STATIC,.sreg = x}; + rcache_regs_static |= (1 << regs_static[i]); + rcache_vregs_reg &= ~(1 << x); + } + } + + printf("DRC registers created, %ld host regs (%d REG, %d STATIC, 1 CTX)\n", + CACHE_REGS+1L, count_bits(rcache_vregs_reg),count_bits(rcache_regs_static)); +} + +static void rcache_init(void) +{ + // create DRC data structures + rcache_create(); + + rcache_invalidate(); +#if DRC_DEBUG & 64 + RCACHE_CHECK("after init"); +#endif +} + // --------------------------------------------------------------- -static int emit_get_rbase_and_offs(u32 a, u32 *offs) +// NB may return either REG or TEMP +static int emit_get_rbase_and_offs(SH2 *sh2, sh2_reg_e r, int rmode, u32 *offs) { + uptr omask = emith_rw_offs_max(); // offset mask u32 mask = 0; + u32 a; int poffs; - int hr; + int hr, hr2; + uptr la; - poffs = dr_ctx_get_mem_ptr(a, &mask); + // is r constant and points to a memory region? + if (! 
gconst_get(r, &a)) + return -1; + poffs = dr_ctx_get_mem_ptr(sh2, a, &mask); if (poffs == -1) return -1; - // XXX: could use some related reg - hr = rcache_get_tmp(); - emith_ctx_read_ptr(hr, poffs); - emith_add_r_r_ptr_imm(hr, hr, a & mask & ~0xff); - *offs = a & 0xff; // XXX: ARM oriented.. + if (mask < 0x20000) { + // data array, BIOS, DRAM, can't safely access directly since host addr may + // change (BIOS,da code may run on either core, DRAM may be switched) + hr = rcache_get_tmp(); + a = (a + *offs) & mask; + if (poffs == offsetof(SH2, p_da)) { + // access sh2->data_array directly + a += offsetof(SH2, data_array); + emith_add_r_r_ptr_imm(hr, CONTEXT_REG, a & ~omask); + } else { + emith_ctx_read_ptr(hr, poffs); + if (a & ~omask) + emith_add_r_r_ptr_imm(hr, hr, a & ~omask); + } + *offs = a & omask; + return hr; + } + + // ROM, SDRAM. Host address should be mmapped to be equal to SH2 address. + la = (uptr)*(void **)((char *)sh2 + poffs); + + // if r is in rcache or needed soon anyway, and offs is relative to region, + // and address translation fits in add_ptr_imm (s32), then use rcached const + if (la == (s32)la && !(*offs & ~mask) && rcache_is_cached(r)) { + u32 odd = a & 1; // need to fix odd address for correct byte addressing + la -= (s32)((a & ~mask) - *offs - odd); // diff between reg and memory + hr = hr2 = rcache_get_reg(r, rmode, NULL); + if ((s32)a < 0) emith_uext_ptr(hr2); + if ((la & ~omask) - odd) { + hr = rcache_get_tmp(); + emith_add_r_r_ptr_imm(hr, hr2, (la & ~omask) - odd); + rcache_free(hr2); + } + *offs = (la & omask); + } else { + // known fixed host address + la += (a + *offs) & mask; + hr = rcache_get_tmp(); + emith_move_r_ptr_imm(hr, la & ~omask); + *offs = la & omask; + } return hr; } +// read const data from const ROM address +static int emit_get_rom_data(SH2 *sh2, sh2_reg_e r, u32 offs, int size, u32 *val) +{ + u32 a, mask; + + *val = 0; + if (gconst_get(r, &a)) { + a += offs; + // check if rom is memory mapped (not bank switched), and 
address is in rom + if (dr_is_rom(a) && p32x_sh2_get_mem_ptr(a, &mask, sh2) == sh2->p_rom) { + switch (size & MF_SIZEMASK) { + case 0: *val = (s8)p32x_sh2_read8(a, sh2s); break; // 8 + case 1: *val = (s16)p32x_sh2_read16(a, sh2s); break; // 16 + case 2: *val = p32x_sh2_read32(a, sh2s); break; // 32 + } + return 1; + } + } + return 0; +} + static void emit_move_r_imm32(sh2_reg_e dst, u32 imm) { #if PROPAGATE_CONSTANTS gconst_new(dst, imm); #else - int hr = rcache_get_reg(dst, RC_GR_WRITE); - emith_move_r_imm(hr, imm); + int hr = rcache_get_reg(dst, RC_GR_WRITE, NULL); + emith_move_r_imm(hr, imm); +#endif +} + +static void emit_move_r_r(sh2_reg_e dst, sh2_reg_e src) +{ + if (gconst_check(src) || rcache_is_cached(src)) { +#if ALIAS_REGISTERS + rcache_alias_vreg(dst, src); +#else + int hr_s = rcache_get_reg(src, RC_GR_READ, NULL); + int hr_d = rcache_get_reg(dst, RC_GR_WRITE, NULL); + emith_move_r_r(hr_d, hr_s); + gconst_copy(dst, src); #endif + } else { + int hr_d = rcache_get_reg(dst, RC_GR_WRITE, NULL); + emith_ctx_read(hr_d, src * 4); + } } -static void emit_move_r_r(sh2_reg_e dst, sh2_reg_e src) +static void emit_add_r_imm(sh2_reg_e r, u32 imm) { - int hr_d = rcache_get_reg(dst, RC_GR_WRITE); - int hr_s = rcache_get_reg(src, RC_GR_READ); - - emith_move_r_r(hr_d, hr_s); + u32 val; + int isgc = gconst_get(r, &val); + int hr, hr2; + + if (!isgc || rcache_is_cached(r)) { + // not constant, or r is already in cache + hr = rcache_get_reg(r, RC_GR_RMW, &hr2); + emith_add_r_r_imm(hr, hr2, imm); + rcache_free(hr2); + if (isgc) + gconst_set(r, val + imm); + } else + gconst_new(r, val + imm); } -// T must be clear, and comparison done just before this -static void emit_or_t_if_eq(int srr) +static void emit_sub_r_imm(sh2_reg_e r, u32 imm) { - EMITH_SJMP_START(DCOND_NE); - emith_or_r_imm_c(DCOND_EQ, srr, T); - EMITH_SJMP_END(DCOND_NE); + u32 val; + int isgc = gconst_get(r, &val); + int hr, hr2; + + if (!isgc || rcache_is_cached(r)) { + // not constant, or r is already in cache 
+ hr = rcache_get_reg(r, RC_GR_RMW, &hr2); + emith_sub_r_r_imm(hr, hr2, imm); + rcache_free(hr2); + if (isgc) + gconst_set(r, val - imm); + } else + gconst_new(r, val - imm); } -// arguments must be ready -// reg cache must be clean before call -static int emit_memhandler_read_(int size, int ram_check) +static void emit_sync_t_to_sr(void) { - int arg1; -#if 0 - int arg0; - host_arg2reg(arg0, 0); -#endif + // avoid reloading SR from context if there's nothing to do + if (emith_get_t_cond() >= 0) { + int sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_sync_t(sr); + } +} - rcache_clean(); +// rd = @(arg0) +static int emit_memhandler_read(int size) +{ + int hr; + emit_sync_t_to_sr(); + rcache_clean_tmp(); +#ifndef DRC_SR_REG // must writeback cycles for poll detection stuff - // FIXME: rm - if (reg_map_g2h[SHR_SR] != -1) - emith_ctx_write(reg_map_g2h[SHR_SR], SHR_SR * 4); - - arg1 = rcache_get_tmp_arg(1); - emith_move_r_r_ptr(arg1, CONTEXT_REG); + if (guest_regs[SHR_SR].vreg != -1) + rcache_unmap_vreg(guest_regs[SHR_SR].vreg); +#endif + rcache_invalidate_tmp(); -#if 0 // can't do this because of unmapped reads - // ndef PDB_NET - if (ram_check && Pico.rom == (void *)0x02000000 && Pico32xMem->sdram == (void *)0x06000000) { - int tmp = rcache_get_tmp(); - emith_and_r_r_imm(tmp, arg0, 0xfb000000); - emith_cmp_r_imm(tmp, 0x02000000); - switch (size) { - case 0: // 8 - EMITH_SJMP3_START(DCOND_NE); - emith_eor_r_imm_c(DCOND_EQ, arg0, 1); - emith_read8_r_r_offs_c(DCOND_EQ, arg0, arg0, 0); - EMITH_SJMP3_MID(DCOND_NE); - emith_call_cond(DCOND_NE, sh2_drc_read8); - EMITH_SJMP3_END(); - break; - case 1: // 16 - EMITH_SJMP3_START(DCOND_NE); - emith_read16_r_r_offs_c(DCOND_EQ, arg0, arg0, 0); - EMITH_SJMP3_MID(DCOND_NE); - emith_call_cond(DCOND_NE, sh2_drc_read16); - EMITH_SJMP3_END(); - break; - case 2: // 32 - EMITH_SJMP3_START(DCOND_NE); - emith_read_r_r_offs_c(DCOND_EQ, arg0, arg0, 0); - emith_ror_c(DCOND_EQ, arg0, arg0, 16); - EMITH_SJMP3_MID(DCOND_NE); - 
emith_call_cond(DCOND_NE, sh2_drc_read32); - EMITH_SJMP3_END(); - break; + if (size & MF_POLLING) + switch (size & MF_SIZEMASK) { + case 0: emith_call(sh2_drc_read8_poll); break; // 8 + case 1: emith_call(sh2_drc_read16_poll); break; // 16 + case 2: emith_call(sh2_drc_read32_poll); break; // 32 } - } else -#endif - { - switch (size) { - case 0: // 8 - emith_call(sh2_drc_read8); - break; - case 1: // 16 - emith_call(sh2_drc_read16); - break; - case 2: // 32 - emith_call(sh2_drc_read32); - break; + switch (size & MF_SIZEMASK) { + case 0: emith_call(sh2_drc_read8); break; // 8 + case 1: emith_call(sh2_drc_read16); break; // 16 + case 2: emith_call(sh2_drc_read32); break; // 32 } - } - rcache_invalidate(); - - if (reg_map_g2h[SHR_SR] != -1) - emith_ctx_read(reg_map_g2h[SHR_SR], SHR_SR * 4); - return rcache_get_tmp_ret(); + hr = rcache_get_tmp_ret(); + rcache_set_x16(hr, (size & MF_SIZEMASK) < 2, 0); + return hr; } -static int emit_memhandler_read(int size) +// @(arg0) = arg1 +static void emit_memhandler_write(int size) { - return emit_memhandler_read_(size, 1); + emit_sync_t_to_sr(); + rcache_clean_tmp(); +#ifndef DRC_SR_REG + if (guest_regs[SHR_SR].vreg != -1) + rcache_unmap_vreg(guest_regs[SHR_SR].vreg); +#endif + rcache_invalidate_tmp(); + + switch (size & MF_SIZEMASK) { + case 0: emith_call(sh2_drc_write8); break; // 8 + case 1: emith_call(sh2_drc_write16); break; // 16 + case 2: emith_call(sh2_drc_write32); break; // 32 + } } -static int emit_memhandler_read_rr(sh2_reg_e rd, sh2_reg_e rs, u32 offs, int size) +// rd = @(Rs,#offs); rd < 0 -> return a temp +static int emit_memhandler_read_rr(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rs, u32 offs, int size) { - int hr, hr2, ram_check = 1; - u32 val, offs2; + int hr, hr2; + u32 val; - if (gconst_get(rs, &val)) { - hr = emit_get_rbase_and_offs(val + offs, &offs2); - if (hr != -1) { - hr2 = rcache_get_reg(rd, RC_GR_WRITE); - switch (size) { - case 0: // 8 - emith_read8_r_r_offs(hr2, hr, offs2 ^ 1); - emith_sext(hr2, hr2, 8); - 
break; - case 1: // 16 - emith_read16_r_r_offs(hr2, hr, offs2); - emith_sext(hr2, hr2, 16); - break; - case 2: // 32 - emith_read_r_r_offs(hr2, hr, offs2); - emith_ror(hr2, hr2, 16); - break; - } - rcache_free_tmp(hr); - return hr2; +#if PROPAGATE_CONSTANTS + if (emit_get_rom_data(sh2, rs, offs, size, &val)) { + if (rd == SHR_TMP) { + hr2 = rcache_get_tmp(); + emith_move_r_imm(hr2, val); + } else { + emit_move_r_imm32(rd, val); + hr2 = rcache_get_reg(rd, RC_GR_RMW, NULL); + } + rcache_set_x16(hr2, val == (s16)val, val == (u16)val); + if (size & MF_POSTINCR) + emit_add_r_imm(rs, 1 << (size & MF_SIZEMASK)); + return hr2; + } + + val = size & MF_POSTINCR; + hr = emit_get_rbase_and_offs(sh2, rs, val ? RC_GR_RMW : RC_GR_READ, &offs); + if (hr != -1) { + if (rd == SHR_TMP) + hr2 = rcache_get_tmp(); + else + hr2 = rcache_get_reg(rd, RC_GR_WRITE, NULL); + switch (size & MF_SIZEMASK) { + case 0: emith_read8s_r_r_offs(hr2, hr, offs ^ 1); break; // 8 + case 1: emith_read16s_r_r_offs(hr2, hr, offs); break; // 16 + case 2: emith_read_r_r_offs(hr2, hr, offs); emith_ror(hr2, hr2, 16); break; } + rcache_free(hr); + if (size & MF_POSTINCR) + emit_add_r_imm(rs, 1 << (size & MF_SIZEMASK)); + return hr2; + } +#endif - ram_check = 0; + if (gconst_get(rs, &val) && !rcache_is_cached(rs)) { + hr = rcache_get_tmp_arg(0); + emith_move_r_imm(hr, val + offs); + if (size & MF_POSTINCR) + gconst_new(rs, val + (1 << (size & MF_SIZEMASK))); + } else if (size & MF_POSTINCR) { + hr = rcache_get_tmp_arg(0); + hr2 = rcache_get_reg(rs, RC_GR_RMW, NULL); + emith_add_r_r_imm(hr, hr2, offs); + emith_add_r_imm(hr2, 1 << (size & MF_SIZEMASK)); + if (gconst_get(rs, &val)) + gconst_set(rs, val + (1 << (size & MF_SIZEMASK))); + } else { + hr = rcache_get_reg_arg(0, rs, &hr2); + if (offs || hr != hr2) + emith_add_r_r_imm(hr, hr2, offs); } + hr = emit_memhandler_read(size); - hr = rcache_get_reg_arg(0, rs); - if (offs != 0) - emith_add_r_imm(hr, offs); - hr = emit_memhandler_read_(size, ram_check); - hr2 = 
rcache_get_reg(rd, RC_GR_WRITE); - if (size != 2) { - emith_sext(hr2, hr, (size == 1) ? 16 : 8); - } else - emith_move_r_r(hr2, hr); - rcache_free_tmp(hr); + if (rd == SHR_TMP) + hr2 = hr; + else +#if REMAP_REGISTER + hr2 = rcache_map_reg(rd, hr); +#else + hr2 = rcache_get_reg(rd, RC_GR_WRITE, NULL); +#endif + if (hr != hr2) { + emith_move_r_r(hr2, hr); + rcache_free_tmp(hr); + } return hr2; } -static void emit_memhandler_write(int size) +// @(Rs,#offs) = rd; rd < 0 -> write arg1 +static void emit_memhandler_write_rr(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rs, u32 offs, int size) { - int ctxr; - host_arg2reg(ctxr, 2); - if (reg_map_g2h[SHR_SR] != -1) - emith_ctx_write(reg_map_g2h[SHR_SR], SHR_SR * 4); + int hr, hr2; + u32 val; + + if (rd == SHR_TMP) { + host_arg2reg(hr2, 1); // already locked and prepared by caller + } else if ((size & MF_PREDECR) && rd == rs) { // must avoid caching rd in arg1 + hr2 = rcache_get_reg_arg(1, rd, &hr); + if (hr != hr2) { + emith_move_r_r(hr2, hr); + rcache_free(hr2); + } + } else + hr2 = rcache_get_reg_arg(1, rd, NULL); + if (rd != SHR_TMP) + rcache_unlock(guest_regs[rd].vreg); // unlock in case rd is in arg0 + + if (gconst_get(rs, &val) && !rcache_is_cached(rs)) { + hr = rcache_get_tmp_arg(0); + if (size & MF_PREDECR) { + val -= 1 << (size & MF_SIZEMASK); + gconst_new(rs, val); + } + emith_move_r_imm(hr, val + offs); + } else if (offs || (size & MF_PREDECR)) { + if (size & MF_PREDECR) + emit_sub_r_imm(rs, 1 << (size & MF_SIZEMASK)); + rcache_unlock(guest_regs[rs].vreg); // unlock in case rs is in arg0 + hr = rcache_get_reg_arg(0, rs, &hr2); + if (offs || hr != hr2) + emith_add_r_r_imm(hr, hr2, offs); + } else + hr = rcache_get_reg_arg(0, rs, NULL); - rcache_clean(); + emit_memhandler_write(size); +} - switch (size) { - case 0: // 8 - // XXX: consider inlining sh2_drc_write8 - emith_call(sh2_drc_write8); - break; - case 1: // 16 - emith_call(sh2_drc_write16); - break; - case 2: // 32 - emith_move_r_r_ptr(ctxr, CONTEXT_REG); - 
emith_call(sh2_drc_write32); - break; - } +// rd = @(Rx,Ry); rd < 0 -> return a temp +static int emit_indirect_indexed_read(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rx, sh2_reg_e ry, int size) +{ + int hr, hr2; + int tx, ty; +#if PROPAGATE_CONSTANTS + u32 offs; - rcache_invalidate(); - if (reg_map_g2h[SHR_SR] != -1) - emith_ctx_read(reg_map_g2h[SHR_SR], SHR_SR * 4); + // if offs is larger than 0x01000000, it's most probably the base address part + if (gconst_get(ry, &offs) && offs < 0x01000000) + return emit_memhandler_read_rr(sh2, rd, rx, offs, size); + if (gconst_get(rx, &offs) && offs < 0x01000000) + return emit_memhandler_read_rr(sh2, rd, ry, offs, size); +#endif + hr = rcache_get_reg_arg(0, rx, &tx); + ty = rcache_get_reg(ry, RC_GR_READ, NULL); + emith_add_r_r_r(hr, tx, ty); + hr = emit_memhandler_read(size); + + if (rd == SHR_TMP) + hr2 = hr; + else +#if REMAP_REGISTER + hr2 = rcache_map_reg(rd, hr); +#else + hr2 = rcache_get_reg(rd, RC_GR_WRITE, NULL); +#endif + + if (hr != hr2) { + emith_move_r_r(hr2, hr); + rcache_free_tmp(hr); + } + return hr2; } -// @(Rx,Ry) -static int emit_indirect_indexed_read(int rx, int ry, int size) +// @(Rx,Ry) = rd; rd < 0 -> write arg1 +static void emit_indirect_indexed_write(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rx, sh2_reg_e ry, int size) { - int a0, t; - a0 = rcache_get_reg_arg(0, rx); - t = rcache_get_reg(ry, RC_GR_READ); - emith_add_r_r(a0, t); - return emit_memhandler_read(size); + int hr, tx, ty; +#if PROPAGATE_CONSTANTS + u32 offs; + + // if offs is larger than 0x01000000, it's most probably the base address part + if (gconst_get(ry, &offs) && offs < 0x01000000) + return emit_memhandler_write_rr(sh2, rd, rx, offs, size); + if (gconst_get(rx, &offs) && offs < 0x01000000) + return emit_memhandler_write_rr(sh2, rd, ry, offs, size); +#endif + if (rd != SHR_TMP) + rcache_get_reg_arg(1, rd, NULL); + hr = rcache_get_reg_arg(0, rx, &tx); + ty = rcache_get_reg(ry, RC_GR_READ, NULL); + emith_add_r_r_r(hr, tx, ty); + 
emit_memhandler_write(size); } -// read @Rn, @rm -static void emit_indirect_read_double(u32 *rnr, u32 *rmr, int rn, int rm, int size) +// @Rn+,@Rm+ +static void emit_indirect_read_double(SH2 *sh2, int *rnr, int *rmr, sh2_reg_e rn, sh2_reg_e rm, int size) { int tmp; - rcache_get_reg_arg(0, rn); - tmp = emit_memhandler_read(size); - emith_ctx_write(tmp, offsetof(SH2, drc_tmp)); - rcache_free_tmp(tmp); - tmp = rcache_get_reg(rn, RC_GR_RMW); - emith_add_r_imm(tmp, 1 << size); - rcache_unlock(tmp); - - rcache_get_reg_arg(0, rm); - *rmr = emit_memhandler_read(size); - *rnr = rcache_get_tmp(); - emith_ctx_read(*rnr, offsetof(SH2, drc_tmp)); - tmp = rcache_get_reg(rm, RC_GR_RMW); - emith_add_r_imm(tmp, 1 << size); - rcache_unlock(tmp); + // unlock rn, rm here to avoid REG shortage in MAC operation + tmp = emit_memhandler_read_rr(sh2, SHR_TMP, rn, 0, size | MF_POSTINCR); + rcache_unlock(guest_regs[rn].vreg); + tmp = rcache_save_tmp(tmp); + *rmr = emit_memhandler_read_rr(sh2, SHR_TMP, rm, 0, size | MF_POSTINCR); + rcache_unlock(guest_regs[rm].vreg); + *rnr = rcache_restore_tmp(tmp); } static void emit_do_static_regs(int is_write, int tmpr) { int i, r, count; - for (i = 0; i < ARRAY_SIZE(reg_map_g2h); i++) { - r = reg_map_g2h[i]; - if (r == -1) + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { + if (guest_regs[i].flags & (GRF_STATIC|GRF_PINNED)) + r = cache_regs[guest_regs[i].vreg].hreg; + else continue; - for (count = 1; i < ARRAY_SIZE(reg_map_g2h) - 1; i++, r++) { - if (reg_map_g2h[i + 1] != r + 1) + for (count = 1; i < ARRAY_SIZE(guest_regs) - 1; i++, r++) { + if ((guest_regs[i + 1].flags & (GRF_STATIC|GRF_PINNED)) && + cache_regs[guest_regs[i + 1].vreg].hreg == r + 1) + count++; + else break; - count++; } if (count > 1) { @@ -1397,32 +2979,92 @@ static void emit_do_static_regs(int is_write, int tmpr) } } -/* just after lookup function, jump to address returned */ -static void emit_block_entry(void) +// block local link stuff +struct linkage { + u32 pc; + void *ptr; + 
struct block_link *bl; + u32 mask; +}; + +static inline int find_in_linkage(const struct linkage *array, int size, u32 pc) { -#if (DRC_DEBUG & 8) || defined(PDB) - int arg1, arg2; - host_arg2reg(arg1, 1); - host_arg2reg(arg2, 2); + size_t i; + for (i = 0; i < size; i++) + if (pc == array[i].pc) + return i; - emit_do_static_regs(1, arg2); - emith_move_r_r_ptr(arg1, CONTEXT_REG); - emith_move_r_r(arg2, rcache_get_reg(SHR_SR, RC_GR_READ)); - emith_call(sh2_drc_log_entry); - rcache_invalidate(); -#endif - emith_tst_r_r_ptr(RET_REG, RET_REG); - EMITH_SJMP_START(DCOND_EQ); - emith_jump_reg_c(DCOND_NE, RET_REG); - EMITH_SJMP_END(DCOND_EQ); + return -1; +} + +static int find_in_sorted_linkage(const struct linkage *array, int size, u32 pc) +{ + // binary search in sorted array + int left = 0, right = size-1; + while (left <= right) + { + int middle = (left + right) / 2; + if (array[middle].pc == pc) + return middle; + else if (array[middle].pc < pc) + left = middle + 1; + else + right = middle - 1; + } + return -1; +} + +static void emit_branch_linkage_code(SH2 *sh2, struct block_desc *block, int tcache_id, + const struct linkage *targets, int target_count, + const struct linkage *links, int link_count) +{ + struct block_link *bl; + int u, v, tmp; + + emith_flush(); + for (u = 0; u < link_count; u++) { + emith_pool_check(); + // look up local branch targets + if (links[u].mask & 0x2) { + v = find_in_sorted_linkage(targets, target_count, links[u].pc); + if (v < 0 || ! 
targets[v].ptr) { + // forward branch not yet resolved, prepare external linking + emith_jump_patch(links[u].ptr, tcache_ptr, NULL); + bl = dr_prepare_ext_branch(block->entryp, links[u].pc, sh2->is_slave, tcache_id); + if (bl) + bl->type = BL_LDJMP; + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, links[u].pc); + rcache_free_tmp(tmp); + emith_jump_patchable(sh2_drc_dispatcher); + } else if (emith_jump_patch_inrange(links[u].ptr, targets[v].ptr)) { + // inrange local branch + emith_jump_patch(links[u].ptr, targets[v].ptr, NULL); + } else { + // far local branch + emith_jump_patch(links[u].ptr, tcache_ptr, NULL); + emith_jump(targets[v].ptr); + } + } else { + // external or exit, emit blx area entry + void *target = (links[u].mask & 0x1 ? sh2_drc_exit : sh2_drc_dispatcher); + if (links[u].bl) + links[u].bl->blx = tcache_ptr; + emith_jump_patch(links[u].ptr, tcache_ptr, NULL); + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, links[u].pc & ~1); + rcache_free_tmp(tmp); + emith_jump(target); + } + } } #define DELAY_SAVE_T(sr) { \ + int t_ = rcache_get_tmp(); \ emith_bic_r_imm(sr, T_save); \ - emith_tst_r_imm(sr, T); \ - EMITH_SJMP_START(DCOND_EQ); \ - emith_or_r_imm_c(DCOND_NE, sr, T_save); \ - EMITH_SJMP_END(DCOND_EQ); \ + emith_and_r_r_imm(t_, sr, 1); \ + emith_or_r_r_lsl(sr, t_, T_SHIFT); \ + rcache_free_tmp(t_); \ } #define FLUSH_CYCLES(sr) \ @@ -1431,94 +3073,200 @@ static void emit_block_entry(void) cycles = 0; \ } -static void *dr_get_pc_base(u32 pc, int is_slave); +static void *dr_get_pc_base(u32 pc, SH2 *sh2); static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { - u32 branch_target_pc[MAX_LOCAL_BRANCHES]; - void *branch_target_ptr[MAX_LOCAL_BRANCHES]; + // branch targets in current block + static struct linkage branch_targets[MAX_LOCAL_TARGETS]; int branch_target_count = 0; - void *branch_patch_ptr[MAX_LOCAL_BRANCHES]; - u32 branch_patch_pc[MAX_LOCAL_BRANCHES]; - int branch_patch_count = 0; - u32 literal_addr[MAX_LITERALS]; - int 
literal_addr_count = 0; - u8 op_flags[BLOCK_INSN_LIMIT]; - struct { + // unresolved local or external targets with block link/exit area if needed + static struct linkage blx_targets[MAX_LOCAL_BRANCHES]; + int blx_target_count = 0; + + static u8 op_flags[BLOCK_INSN_LIMIT]; + + enum flg_states { FLG_UNKNOWN, FLG_UNUSED, FLG_0, FLG_1 }; + struct drcf { + int delay_reg:8; + u32 loop_type:8; + u32 polling:8; + u32 pinning:1; u32 test_irq:1; u32 pending_branch_direct:1; u32 pending_branch_indirect:1; - u32 literals_disabled:1; + u32 Tflag:2, Mflag:2; } drcf = { 0, }; +#if LOOP_OPTIMIZER + // loops with pinned registers for optimzation + // pinned regs are like statics and don't need saving/restoring inside a loop + static struct linkage pinned_loops[MAX_LOCAL_TARGETS/16]; + int pinned_loop_count = 0; +#endif + // PC of current, first, last SH2 insn u32 pc, base_pc, end_pc; - u32 end_literals; - void *block_entry_ptr; + u32 base_literals, end_literals; + u8 *block_entry_ptr; struct block_desc *block; + struct block_entry *entry; + struct block_link *bl; u16 *dr_pc_base; struct op_data *opd; int blkid_main = 0; - int skip_op = 0; - u32 tmp, tmp2; + int tmp, tmp2; int cycles; int i, v; + u32 u, m1, m2, m3, m4; int op; + u16 crc; base_pc = sh2->pc; - drcf.literals_disabled = literal_disabled_frames != 0; // get base/validate PC - dr_pc_base = dr_get_pc_base(base_pc, sh2->is_slave); + dr_pc_base = dr_get_pc_base(base_pc, sh2); if (dr_pc_base == (void *)-1) { printf("invalid PC, aborting: %08x\n", base_pc); // FIXME: be less destructive exit(1); } - tcache_ptr = tcache_ptrs[tcache_id]; - - // predict tcache overflow - tmp = tcache_ptr - tcache_bases[tcache_id]; - if (tmp > tcache_sizes[tcache_id] - MAX_BLOCK_SIZE) { - dbg(1, "tcache %d overflow", tcache_id); - return NULL; + // initial passes to disassemble and analyze the block + crc = scan_block(base_pc, sh2->is_slave, op_flags, &end_pc, &base_literals, &end_literals); + end_literals = dr_check_nolit(base_literals, 
end_literals, tcache_id); + if (base_literals == end_literals) // map empty lit section to end of code + base_literals = end_literals = end_pc; + + // if there is already a translated but inactive block, reuse it + block = dr_find_inactive_block(tcache_id, crc, base_pc, end_pc - base_pc, + base_literals, end_literals - base_literals); + + if (block) { + dbg(2, "== %csh2 reuse block %08x-%08x,%08x-%08x -> %p", sh2->is_slave ? 's' : 'm', + base_pc, end_pc, base_literals, end_literals, block->entryp->tcache_ptr); + dr_activate_block(block, tcache_id, sh2->is_slave); + emith_update_cache(); + return block->entryp[0].tcache_ptr; } - // initial passes to disassemble and analyze the block - scan_block(base_pc, sh2->is_slave, op_flags, &end_pc, &end_literals); + // collect branch_targets that don't land on delay slots + m1 = m2 = m3 = m4 = v = op = 0; + for (pc = base_pc, i = 0; pc < end_pc; i++, pc += 2) { + if (op_flags[i] & OF_DELAY_OP) + op_flags[i] &= ~OF_BTARGET; + if (op_flags[i] & OF_BTARGET) { + if (branch_target_count < ARRAY_SIZE(branch_targets)) + branch_targets[branch_target_count++] = (struct linkage) { .pc = pc }; + else { + printf("warning: linkage overflow\n"); + end_pc = pc; + break; + } + } + if (ops[i].op == OP_LDC && (ops[i].dest & BITMASK1(SHR_SR)) && pc+2 < end_pc) + op_flags[i+1] |= OF_BTARGET; // RTE entrypoint in case of SR.IMASK change + // unify T and SR since rcache doesn't know about "virtual" guest regs + if (ops[i].source & BITMASK1(SHR_T)) ops[i].source |= BITMASK1(SHR_SR); + if (ops[i].dest & BITMASK1(SHR_T)) ops[i].source |= BITMASK1(SHR_SR); + if (ops[i].dest & BITMASK1(SHR_T)) ops[i].dest |= BITMASK1(SHR_SR); +#if LOOP_DETECTION + // loop types detected: + // 1. target: ... BRA target -> idle loop + // 2. target: ... delay insn ... BF target -> delay loop + // 3. target: ... poll insn ... BF/BT target -> poll loop + // 4. target: ... poll insn ... BF/BT exit ... BRA target, exit: -> poll + // conditions: + // a. 
no further branch targets between target and back jump. + // b. no unconditional branch insn inside the loop. + // c. exactly one poll or delay insn is allowed inside a delay/poll loop + // (scan_block marks loops only if they meet conditions a through c) + // d. idle loops do not modify anything but PC,SR and contain no branches + // e. delay/poll loops do not modify anything but the concerned reg,PC,SR + // f. loading constants into registers inside the loop is allowed + // g. a delay/poll loop must have a conditional branch somewhere + // h. an idle loop must not have a conditional branch + if (op_flags[i] & OF_BTARGET) { + // possible loop entry point + drcf.loop_type = op_flags[i] & OF_LOOP; + drcf.pending_branch_direct = drcf.pending_branch_indirect = 0; + op = OF_IDLE_LOOP; // loop type + v = i; + m1 = m2 = m3 = m4 = 0; + if (!drcf.loop_type) // reset basic loop it it isn't recognized as loop + op_flags[i] &= ~OF_BASIC_LOOP; + } + if (drcf.loop_type) { + // calculate reg masks for loop pinning + m4 |= ops[i].source & ~m3; + m3 |= ops[i].dest; + // detect loop type, and store poll/delay register + if (op_flags[i] & OF_POLL_INSN) { + op = OF_POLL_LOOP; + m1 |= ops[i].dest; // loop poll/delay regs + } else if (op_flags[i] & OF_DELAY_INSN) { + op = OF_DELAY_LOOP; + m1 |= ops[i].dest; + } else if (ops[i].op != OP_LOAD_POOL && ops[i].op != OP_LOAD_CONST + && (ops[i].op != OP_MOVE || op != OF_POLL_LOOP)) { + // not (MOV @(PC) or MOV # or (MOV reg and poll)), condition f + m2 |= ops[i].dest; // regs modified by other insns + } + // branch detector + if (OP_ISBRAIMM(ops[i].op)) { + if (ops[i].imm == base_pc + 2*v) + drcf.pending_branch_direct = 1; // backward branch detected + else + op_flags[v] &= ~OF_BASIC_LOOP; // no basic loop + } + if (OP_ISBRACND(ops[i].op)) + drcf.pending_branch_indirect = 1; // conditions g,h - cond.branch + // poll/idle loops terminate with their backwards branch to the loop start + if (drcf.pending_branch_direct && !(op_flags[i+1] & 
OF_DELAY_OP)) { + m2 &= ~(m1 | BITMASK3(SHR_PC, SHR_SR, SHR_T)); // conditions d,e + g,h + if (m2 || ((op == OF_IDLE_LOOP) == (drcf.pending_branch_indirect))) + op = 0; // conditions not met + op_flags[v] = (op_flags[v] & ~OF_LOOP) | op; // set loop type + drcf.loop_type = 0; +#if LOOP_OPTIMIZER + if (op_flags[v] & OF_BASIC_LOOP) { + m3 &= ~rcache_regs_static & ~BITMASK5(SHR_PC, SHR_PR, SHR_SR, SHR_T, SHR_MEM); + if (m3 && count_bits(m3) < count_bits(rcache_vregs_reg) && + pinned_loop_count < ARRAY_SIZE(pinned_loops)-1) { + pinned_loops[pinned_loop_count++] = + (struct linkage) { .pc = base_pc + 2*v, .mask = m3 }; + } else + op_flags[v] &= ~OF_BASIC_LOOP; + } +#endif + } + } +#endif + } - if (drcf.literals_disabled) - end_literals = end_pc; + tcache_ptr = dr_prepare_cache(tcache_id, (end_pc - base_pc) / 2, branch_target_count); +#if (DRC_DEBUG & 4) + tcache_dsm_ptrs[tcache_id] = tcache_ptr; +#endif - block = dr_add_block(base_pc, end_literals - base_pc, - end_pc - base_pc, sh2->is_slave, &blkid_main); + block = dr_add_block(branch_target_count, base_pc, end_pc - base_pc, + base_literals, end_literals-base_literals, crc, sh2->is_slave, &blkid_main); if (block == NULL) return NULL; block_entry_ptr = tcache_ptr; - dbg(2, "== %csh2 block #%d,%d %08x-%08x -> %p", sh2->is_slave ? 's' : 'm', - tcache_id, blkid_main, base_pc, end_pc, block_entry_ptr); + dbg(2, "== %csh2 block #%d,%d %08x-%08x,%08x-%08x -> %p", sh2->is_slave ? 
's' : 'm', + tcache_id, blkid_main, base_pc, end_pc, base_literals, end_literals, block_entry_ptr); - dr_link_blocks(&block->entryp[0], tcache_id); - - // collect branch_targets that don't land on delay slots - for (pc = base_pc, i = 0; pc < end_pc; i++, pc += 2) { - if (!(op_flags[i] & OF_BTARGET)) - continue; - if (op_flags[i] & OF_DELAY_OP) { - op_flags[i] &= ~OF_BTARGET; - continue; - } - ADD_TO_ARRAY(branch_target_pc, branch_target_count, pc, break); - } - - if (branch_target_count > 0) { - memset(branch_target_ptr, 0, sizeof(branch_target_ptr[0]) * branch_target_count); - } // clear stale state after compile errors rcache_invalidate(); + emith_invalidate_t(); + drcf = (struct drcf) { 0 }; +#if LOOP_OPTIMIZER + pinned_loops[pinned_loop_count].pc = -1; + pinned_loop_count = 0; +#endif // ------------------------------------------------- // 3rd pass: actual compilation @@ -1527,79 +3275,63 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) for (i = 0; pc < end_pc; i++) { u32 delay_dep_fw = 0, delay_dep_bk = 0; - u32 tmp3, tmp4, sr; - - opd = &ops[i]; - op = FETCH_OP(pc); - -#if (DRC_DEBUG & 2) - insns_compiled++; -#endif -#if (DRC_DEBUG & 4) - DasmSH2(sh2dasm_buff, pc, op); - printf("%c%08x %04x %s\n", (op_flags[i] & OF_BTARGET) ? 
'*' : ' ', - pc, op, sh2dasm_buff); -#endif + int tmp3, tmp4; + int sr; - if ((op_flags[i] & OF_BTARGET) || pc == base_pc) + if (op_flags[i] & OF_BTARGET) { if (pc != base_pc) { - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); + emith_sync_t(sr); + drcf.Mflag = FLG_UNKNOWN; rcache_flush(); + emith_flush(); + } - // make block entry - v = block->entry_count; - if (v < ARRAY_SIZE(block->entryp)) - { - struct block_entry *be_old; - - block->entryp[v].pc = pc; - block->entryp[v].tcache_ptr = tcache_ptr; - block->entryp[v].links = NULL; + // make block entry + v = block->entry_count; + entry = &block->entryp[v]; + if (v < branch_target_count) + { + entry = &block->entryp[v]; + entry->pc = pc; + entry->tcache_ptr = tcache_ptr; + entry->links = entry->o_links = NULL; #if (DRC_DEBUG & 2) - block->entryp[v].block = block; + entry->block = block; #endif - be_old = dr_get_entry(pc, sh2->is_slave, &tcache_id); - if (be_old != NULL) { - dbg(1, "entry override for %08x, was %p", pc, be_old->tcache_ptr); - kill_block_entry(be_old, tcache_id); - } - - add_to_hashlist(&block->entryp[v], tcache_id); - block->entry_count++; - - dbg(2, "-- %csh2 block #%d,%d entry %08x -> %p", - sh2->is_slave ? 's' : 'm', tcache_id, blkid_main, - pc, tcache_ptr); - - // since we made a block entry, link any other blocks - // that jump to current pc - dr_link_blocks(&block->entryp[v], tcache_id); - } - else { - dbg(1, "too many entryp for block #%d,%d pc=%08x", - tcache_id, blkid_main, pc); - } + block->entry_count++; - do_host_disasm(tcache_id); + dbg(2, "-- %csh2 block #%d,%d entry %08x -> %p", + sh2->is_slave ? 
's' : 'm', tcache_id, blkid_main, + pc, tcache_ptr); + } + else { + dbg(1, "too many entryp for block #%d,%d pc=%08x", + tcache_id, blkid_main, pc); + break; } - v = find_in_array(branch_target_pc, branch_target_count, pc); + v = find_in_sorted_linkage(branch_targets, branch_target_count, pc); if (v >= 0) - branch_target_ptr[v] = tcache_ptr; + branch_targets[v].ptr = tcache_ptr; +#if LOOP_DETECTION + drcf.loop_type = op_flags[i] & OF_LOOP; + drcf.delay_reg = -1; + drcf.polling = (drcf.loop_type == OF_POLL_LOOP ? MF_POLLING : 0); +#endif - // must update PC - emit_move_r_imm32(SHR_PC, pc); rcache_clean(); #if (DRC_DEBUG & 0x10) - rcache_get_reg_arg(0, SHR_PC); - tmp = emit_memhandler_read(2); + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, pc); + tmp = emit_memhandler_read(1); tmp2 = rcache_get_tmp(); tmp3 = rcache_get_tmp(); - emith_move_r_imm(tmp2, FETCH32(pc)); + emith_move_r_imm(tmp2, (s16)FETCH_OP(pc)); emith_move_r_imm(tmp3, 0); emith_cmp_r_r(tmp, tmp2); EMITH_SJMP_START(DCOND_EQ); @@ -1611,21 +3343,107 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #endif // check cycles - sr = rcache_get_reg(SHR_SR, RC_GR_READ); - emith_cmp_r_imm(sr, 0); - emith_jump_cond(DCOND_LE, sh2_drc_exit); + sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); + +#if LOOP_OPTIMIZER + if (op_flags[i] & OF_BASIC_LOOP) { + if (pinned_loops[pinned_loop_count].pc == pc) { + // pin needed regs on loop entry + FOR_ALL_BITS_SET_DO(pinned_loops[pinned_loop_count].mask, v, rcache_pin_reg(v)); + emith_flush(); + // store current PC as loop target + pinned_loops[pinned_loop_count].ptr = tcache_ptr; + drcf.pinning = 1; + } else + op_flags[i] &= ~OF_BASIC_LOOP; + } + + if (op_flags[i] & OF_BASIC_LOOP) { + // if exiting a pinned loop pinned regs must be written back to ctx + // since they are reloaded in the loop entry code + emith_cmp_r_imm(sr, 0); + EMITH_JMP_START(DCOND_GT); + rcache_save_pinned(); + + if (blx_target_count < ARRAY_SIZE(blx_targets)) { + // exit via stub in blx 
table (saves some 1-3 insns in the main flow) + blx_targets[blx_target_count++] = + (struct linkage) { .pc = pc, .ptr = tcache_ptr, .mask = 0x1 }; + emith_jump_patchable(tcache_ptr); + } else { + // blx table full, must inline exit code + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, pc); + emith_jump(sh2_drc_exit); + rcache_free_tmp(tmp); + } + EMITH_JMP_END(DCOND_GT); + } else +#endif + { + if (blx_target_count < ARRAY_SIZE(blx_targets)) { + // exit via stub in blx table (saves some 1-3 insns in the main flow) + emith_cmp_r_imm(sr, 0); + blx_targets[blx_target_count++] = + (struct linkage) { .pc = pc, .ptr = tcache_ptr, .mask = 0x1 }; + emith_jump_cond_patchable(DCOND_LE, tcache_ptr); + } else { + // blx table full, must inline exit code + tmp = rcache_get_tmp_arg(0); + emith_cmp_r_imm(sr, 0); + EMITH_SJMP_START(DCOND_GT); + emith_move_r_imm_c(DCOND_LE, tmp, pc); + emith_jump_cond(DCOND_LE, sh2_drc_exit); + EMITH_SJMP_END(DCOND_GT); + rcache_free_tmp(tmp); + } + } + +#if (DRC_DEBUG & 32) + // block hit counter + tmp = rcache_get_tmp_arg(0); + tmp2 = rcache_get_tmp_arg(1); + emith_move_r_ptr_imm(tmp, (uptr)entry); + emith_read_r_r_offs(tmp2, tmp, offsetof(struct block_entry, entry_count)); + emith_add_r_imm(tmp2, 1); + emith_write_r_r_offs(tmp2, tmp, offsetof(struct block_entry, entry_count)); + rcache_free_tmp(tmp); + rcache_free_tmp(tmp2); +#endif + +#if (DRC_DEBUG & (8|256|512|1024)) + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_sync_t(sr); + rcache_clean(); + tmp = rcache_used_hregs_mask(); + emith_save_caller_regs(tmp); + emit_do_static_regs(1, 0); + rcache_get_reg_arg(2, SHR_SR, NULL); + tmp2 = rcache_get_tmp_arg(0); + tmp3 = rcache_get_tmp_arg(1); + tmp4 = rcache_get_tmp(); + emith_move_r_ptr_imm(tmp2, tcache_ptr); + emith_move_r_r_ptr(tmp3, CONTEXT_REG); + emith_move_r_imm(tmp4, pc); + emith_ctx_write(tmp4, SHR_PC * 4); + rcache_invalidate_tmp(); + emith_call(sh2_drc_log_entry); + emith_restore_caller_regs(tmp); +#endif + 
do_host_disasm(tcache_id); rcache_unlock_all(); } #ifdef DRC_CMP if (!(op_flags[i] & OF_DELAY_OP)) { - emit_move_r_imm32(SHR_PC, pc); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); + emith_sync_t(sr); + emit_move_r_imm32(SHR_PC, pc); rcache_clean(); - tmp = rcache_used_hreg_mask(); + tmp = rcache_used_hregs_mask(); emith_save_caller_regs(tmp); emit_do_static_regs(1, 0); emith_pass_arg_r(0, CONTEXT_REG); @@ -1634,20 +3452,49 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } #endif - pc += 2; - - if (skip_op > 0) { - skip_op--; - continue; + // emit blx area if limits are approached + if (blx_target_count && (blx_target_count > ARRAY_SIZE(blx_targets)-4 || + !emith_jump_patch_inrange(blx_targets[0].ptr, tcache_ptr+0x100))) { + u8 *jp; + rcache_invalidate_tmp(); + jp = tcache_ptr; + emith_jump_patchable(tcache_ptr); + emit_branch_linkage_code(sh2, block, tcache_id, branch_targets, + branch_target_count, blx_targets, blx_target_count); + blx_target_count = 0; + do_host_disasm(tcache_id); + emith_jump_patch(jp, tcache_ptr, NULL); } + emith_pool_check(); + + opd = &ops[i]; + op = FETCH_OP(pc); +#if (DRC_DEBUG & 4) + DasmSH2(sh2dasm_buff, pc, op); + if (op_flags[i] & OF_BTARGET) { + if ((op_flags[i] & OF_LOOP) == OF_DELAY_LOOP) tmp3 = '+'; + else if ((op_flags[i] & OF_LOOP) == OF_POLL_LOOP) tmp3 = '='; + else if ((op_flags[i] & OF_LOOP) == OF_IDLE_LOOP) tmp3 = '~'; + else tmp3 = '*'; + } else if (drcf.loop_type) tmp3 = '.'; + else tmp3 = ' '; + printf("%c%08x %04x %s\n", tmp3, pc, op, sh2dasm_buff); +#endif + + pc += 2; +#if (DRC_DEBUG & 2) + insns_compiled++; +#endif + if (op_flags[i] & OF_DELAY_OP) { // handle delay slot dependencies delay_dep_fw = opd->dest & ops[i-1].source; delay_dep_bk = opd->source & ops[i-1].dest; if (delay_dep_fw & BITMASK1(SHR_T)) { - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_sync_t(sr); DELAY_SAVE_T(sr); } if 
(delay_dep_bk & BITMASK1(SHR_PC)) { @@ -1656,8 +3503,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) elprintf_sh2(sh2, EL_ANOMALY, "drc: illegal slot insn %04x @ %08x?", op, pc - 2); } + // store PC for MOVA/MOV @PC address calculation if (opd->imm != 0) - ; // addr already resolved somehow + ; // case OP_BRANCH - addr already resolved in scan_block else { switch (ops[i-1].op) { case OP_BRANCH: @@ -1665,12 +3513,18 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) break; case OP_BRANCH_CT: case OP_BRANCH_CF: - tmp = rcache_get_reg(SHR_PC, RC_GR_WRITE); - sr = rcache_get_reg(SHR_SR, RC_GR_READ); + sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); + tmp = rcache_get_reg(SHR_PC, RC_GR_WRITE, NULL); emith_move_r_imm(tmp, pc); - emith_tst_r_imm(sr, T); - tmp2 = ops[i-1].op == OP_BRANCH_CT ? DCOND_NE : DCOND_EQ; + tmp2 = emith_tst_t(sr, (ops[i-1].op == OP_BRANCH_CT)); + tmp3 = emith_invert_cond(tmp2); + EMITH_SJMP_START(tmp3); emith_move_r_imm_c(tmp2, tmp, ops[i-1].imm); + EMITH_SJMP_END(tmp3); + break; + case OP_BRANCH_N: // BT/BF known not to be taken + // XXX could modify opd->imm instead? + emit_move_r_imm32(SHR_PC, pc); break; // case OP_BRANCH_R OP_BRANCH_RF - PC already loaded } @@ -1682,8 +3536,38 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) dbg(1, "unhandled delay_dep_bk: %x", delay_dep_bk); } + // inform cache about future register usage + u32 late = 0; // regs read by future ops + u32 write = 0; // regs written to (to detect write before read) + u32 soon = 0; // regs read soon + for (v = 1; v <= 9; v++) { + // no sense in looking any further than the next rcache flush + tmp = ((op_flags[i+v] & OF_BTARGET) || (op_flags[i+v-1] & OF_DELAY_OP) || + (OP_ISBRACND(opd[v-1].op) && !(op_flags[i+v] & OF_DELAY_OP))); + // XXX looking behind cond branch to avoid evicting regs used later? 
+ if (pc + 2*v <= end_pc && !tmp) { // (pc already incremented above) + late |= opd[v].source & ~write; + // ignore source regs after they have been written to + write |= opd[v].dest; + // regs needed in the next few instructions + if (v <= 4) + soon = late; + } else + break; + } + rcache_set_usage_now(opd[0].source); // current insn + rcache_set_usage_soon(soon); // insns 1-4 + rcache_set_usage_late(late & ~soon); // insns 5-9 + rcache_set_usage_discard(write & ~(late|soon)); + if (v <= 9) + // upcoming rcache_flush, start writing back unused dirty stuff + rcache_clean_masked(rcache_dirty_mask() & ~(write|opd[0].dest)); + switch (opd->op) { + case OP_BRANCH_N: + // never taken, just use up cycles + goto end_op; case OP_BRANCH: case OP_BRANCH_CT: case OP_BRANCH_CF: @@ -1700,61 +3584,86 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto end_op; case OP_BRANCH_RF: - tmp = rcache_get_reg(SHR_PC, RC_GR_WRITE); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp = rcache_get_reg(SHR_PC, RC_GR_WRITE, NULL); + emith_move_r_imm(tmp, pc + 2); if (opd->dest & BITMASK1(SHR_PR)) { - tmp3 = rcache_get_reg(SHR_PR, RC_GR_WRITE); - emith_move_r_imm(tmp3, pc + 2); - emith_add_r_r_r(tmp, tmp2, tmp3); - } - else { - emith_move_r_r(tmp, tmp2); - emith_add_r_imm(tmp, pc + 2); + tmp3 = rcache_get_reg(SHR_PR, RC_GR_WRITE, NULL); + emith_move_r_r(tmp3, tmp); } + emith_add_r_r(tmp, tmp2); + if (gconst_get(GET_Rn(), &u)) + gconst_set(SHR_PC, pc + 2 + u); drcf.pending_branch_indirect = 1; goto end_op; - case OP_SLEEP: + case OP_SLEEP: // SLEEP 0000000000011011 printf("TODO sleep\n"); goto end_op; - case OP_RTE: + case OP_RTE: // RTE 0000000000101011 + emith_invalidate_t(); // pop PC - emit_memhandler_read_rr(SHR_PC, SHR_SP, 0, 2); + tmp = emit_memhandler_read_rr(sh2, SHR_PC, SHR_SP, 0, 2 | MF_POSTINCR); + rcache_free(tmp); // pop SR - tmp = rcache_get_reg_arg(0, SHR_SP); - emith_add_r_imm(tmp, 4); - tmp = 
emit_memhandler_read(2); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + tmp = emit_memhandler_read_rr(sh2, SHR_TMP, SHR_SP, 0, 2 | MF_POSTINCR); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_write_sr(sr, tmp); rcache_free_tmp(tmp); - tmp = rcache_get_reg(SHR_SP, RC_GR_RMW); - emith_add_r_imm(tmp, 4*2); drcf.test_irq = 1; drcf.pending_branch_indirect = 1; goto end_op; + case OP_UNDEFINED: + elprintf_sh2(sh2, EL_ANOMALY, "drc: unhandled op %04x @ %08x", op, pc-2); + opd->imm = (op_flags[i] & OF_B_IN_DS) ? 6 : 4; + // fallthrough + case OP_TRAPA: // TRAPA #imm 11000011iiiiiiii + // push SR + tmp = rcache_get_reg_arg(1, SHR_SR, &tmp2); + emith_sync_t(tmp2); + emith_clear_msb(tmp, tmp2, 22); + emit_memhandler_write_rr(sh2, SHR_TMP, SHR_SP, 0, 2 | MF_PREDECR); + // push PC + if (opd->op == OP_TRAPA) { + tmp = rcache_get_tmp_arg(1); + emith_move_r_imm(tmp, pc); + } else if (drcf.pending_branch_indirect) { + tmp = rcache_get_reg_arg(1, SHR_PC, NULL); + } else { + tmp = rcache_get_tmp_arg(1); + emith_move_r_imm(tmp, pc - 2); + } + emit_memhandler_write_rr(sh2, SHR_TMP, SHR_SP, 0, 2 | MF_PREDECR); + // obtain new PC + emit_memhandler_read_rr(sh2, SHR_PC, SHR_VBR, opd->imm * 4, 2); + // indirect jump -> back to dispatcher + drcf.pending_branch_indirect = 1; + goto end_op; + case OP_LOAD_POOL: #if PROPAGATE_CONSTANTS - if (opd->imm != 0 && opd->imm < end_literals - && literal_addr_count < MAX_LITERALS) + if ((opd->imm && opd->imm >= base_pc && opd->imm < end_literals) || + dr_is_rom(opd->imm)) { - ADD_TO_ARRAY(literal_addr, literal_addr_count, opd->imm,); if (opd->size == 2) - tmp = FETCH32(opd->imm); + u = FETCH32(opd->imm); else - tmp = (u32)(int)(signed short)FETCH_OP(opd->imm); - gconst_new(GET_Rn(), tmp); + u = (s16)FETCH_OP(opd->imm); + // tweak for Blackthorne: avoid stack overwriting + if (GET_Rn() == SHR_SP && u == 0x0603f800) u = 0x0603f880; + gconst_new(GET_Rn(), u); } else #endif { - tmp = rcache_get_tmp_arg(0); - if (opd->imm != 0) + if (opd->imm != 0) { + 
tmp = rcache_get_tmp_arg(0); emith_move_r_imm(tmp, opd->imm); - else { - // have to calculate read addr from PC - tmp2 = rcache_get_reg(SHR_PC, RC_GR_READ); + } else { + // have to calculate read addr from PC for delay slot + tmp = rcache_get_reg_arg(0, SHR_PC, &tmp2); if (opd->size == 2) { emith_add_r_r_imm(tmp, tmp2, 2 + (op & 0xff) * 4); emith_bic_r_imm(tmp, 3); @@ -1763,21 +3672,25 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_add_r_r_imm(tmp, tmp2, 2 + (op & 0xff) * 2); } tmp2 = emit_memhandler_read(opd->size); - tmp3 = rcache_get_reg(GET_Rn(), RC_GR_WRITE); - if (opd->size == 2) +#if REMAP_REGISTER + tmp3 = rcache_map_reg(GET_Rn(), tmp2); +#else + tmp3 = rcache_get_reg(GET_Rn(), RC_GR_WRITE, NULL); +#endif + if (tmp3 != tmp2) { emith_move_r_r(tmp3, tmp2); - else - emith_sext(tmp3, tmp2, 16); - rcache_free_tmp(tmp2); + rcache_free_tmp(tmp2); + } } goto end_op; - case OP_MOVA: + case OP_MOVA: // MOVA @(disp,PC),R0 11000111dddddddd if (opd->imm != 0) emit_move_r_imm32(SHR_R0, opd->imm); else { - tmp = rcache_get_reg(SHR_R0, RC_GR_WRITE); - tmp2 = rcache_get_reg(SHR_PC, RC_GR_READ); + // have to calculate addr from PC for delay slot + tmp2 = rcache_get_reg(SHR_PC, RC_GR_READ, NULL); + tmp = rcache_get_reg(SHR_R0, RC_GR_WRITE, NULL); emith_add_r_r_imm(tmp, tmp2, 2 + (op & 0xff) * 4); emith_bic_r_imm(tmp, 3); } @@ -1791,7 +3704,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) switch (op & 0x0f) { case 0x02: - tmp = rcache_get_reg(GET_Rn(), RC_GR_WRITE); switch (GET_Fx()) { case 0: // STC SR,Rn 0000nnnn00000010 @@ -1806,38 +3718,41 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) default: goto default_; } - tmp3 = rcache_get_reg(tmp2, RC_GR_READ); - emith_move_r_r(tmp, tmp3); - if (tmp2 == SHR_SR) - emith_clear_msb(tmp, tmp, 22); // reserved bits defined by ISA as 0 + if (tmp2 == SHR_SR) { + sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); + emith_sync_t(sr); + tmp = rcache_get_reg(GET_Rn(), RC_GR_WRITE, 
NULL); + emith_clear_msb(tmp, sr, 22); // reserved bits defined by ISA as 0 + } else + emit_move_r_r(GET_Rn(), tmp2); goto end_op; case 0x04: // MOV.B Rm,@(R0,Rn) 0000nnnnmmmm0100 case 0x05: // MOV.W Rm,@(R0,Rn) 0000nnnnmmmm0101 case 0x06: // MOV.L Rm,@(R0,Rn) 0000nnnnmmmm0110 - rcache_clean(); - tmp = rcache_get_reg_arg(1, GET_Rm()); - tmp2 = rcache_get_reg_arg(0, SHR_R0); - tmp3 = rcache_get_reg(GET_Rn(), RC_GR_READ); - emith_add_r_r(tmp2, tmp3); - emit_memhandler_write(op & 3); + emit_indirect_indexed_write(sh2, GET_Rm(), SHR_R0, GET_Rn(), op & 3); goto end_op; - case 0x07: - // MUL.L Rm,Rn 0000nnnnmmmm0111 - tmp = rcache_get_reg(GET_Rn(), RC_GR_READ); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); - tmp3 = rcache_get_reg(SHR_MACL, RC_GR_WRITE); + case 0x07: // MUL.L Rm,Rn 0000nnnnmmmm0111 + tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp3 = rcache_get_reg(SHR_MACL, RC_GR_WRITE, NULL); emith_mul(tmp3, tmp2, tmp); goto end_op; case 0x08: switch (GET_Fx()) { case 0: // CLRT 0000000000001000 - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - emith_bic_r_imm(sr, T); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); +#if T_OPTIMIZER + if (~rcache_regs_discard & BITMASK1(SHR_T)) +#endif + emith_set_t(sr, 0); break; case 1: // SETT 0000000000011000 - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - emith_or_r_imm(sr, T); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); +#if T_OPTIMIZER + if (~rcache_regs_discard & BITMASK1(SHR_T)) +#endif + emith_set_t(sr, 1); break; case 2: // CLRMAC 0000000000101000 emit_move_r_imm32(SHR_MACL, 0); @@ -1853,12 +3768,15 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0: // NOP 0000000000001001 break; case 1: // DIV0U 0000000000011001 - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_invalidate_t(); emith_bic_r_imm(sr, M|Q|T); + drcf.Mflag = FLG_0; break; case 2: // MOVT Rn 0000nnnn00101001 - sr = rcache_get_reg(SHR_SR, 
RC_GR_READ); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_WRITE); + sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); + emith_sync_t(sr); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_WRITE, NULL); emith_clear_msb(tmp2, sr, 31); break; default: @@ -1866,7 +3784,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } goto end_op; case 0x0a: - tmp = rcache_get_reg(GET_Rn(), RC_GR_WRITE); switch (GET_Fx()) { case 0: // STS MACH,Rn 0000nnnn00001010 @@ -1881,64 +3798,28 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) default: goto default_; } - tmp2 = rcache_get_reg(tmp2, RC_GR_READ); - emith_move_r_r(tmp, tmp2); + emit_move_r_r(GET_Rn(), tmp2); goto end_op; case 0x0c: // MOV.B @(R0,Rm),Rn 0000nnnnmmmm1100 case 0x0d: // MOV.W @(R0,Rm),Rn 0000nnnnmmmm1101 case 0x0e: // MOV.L @(R0,Rm),Rn 0000nnnnmmmm1110 - tmp = emit_indirect_indexed_read(SHR_R0, GET_Rm(), op & 3); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_WRITE); - if ((op & 3) != 2) { - emith_sext(tmp2, tmp, (op & 1) ? 16 : 8); - } else - emith_move_r_r(tmp2, tmp); - rcache_free_tmp(tmp); + emit_indirect_indexed_read(sh2, GET_Rn(), SHR_R0, GET_Rm(), (op & 3) | drcf.polling); goto end_op; case 0x0f: // MAC.L @Rm+,@Rn+ 0000nnnnmmmm1111 - emit_indirect_read_double(&tmp, &tmp2, GET_Rn(), GET_Rm(), 2); - tmp4 = rcache_get_reg(SHR_MACH, RC_GR_RMW); - /* MS 16 MAC bits unused if saturated */ - sr = rcache_get_reg(SHR_SR, RC_GR_READ); - emith_tst_r_imm(sr, S); - EMITH_SJMP_START(DCOND_EQ); - emith_clear_msb_c(DCOND_NE, tmp4, tmp4, 16); - EMITH_SJMP_END(DCOND_EQ); - rcache_unlock(sr); - tmp3 = rcache_get_reg(SHR_MACL, RC_GR_RMW); // might evict SR - emith_mula_s64(tmp3, tmp4, tmp, tmp2); + emit_indirect_read_double(sh2, &tmp, &tmp2, GET_Rn(), GET_Rm(), 2); + sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); + tmp3 = rcache_get_reg(SHR_MACL, RC_GR_RMW, NULL); + tmp4 = rcache_get_reg(SHR_MACH, RC_GR_RMW, NULL); + emith_sh2_macl(tmp3, tmp4, tmp, tmp2, sr); rcache_free_tmp(tmp2); - sr = rcache_get_reg(SHR_SR, 
RC_GR_READ); // reget just in case - emith_tst_r_imm(sr, S); - - EMITH_JMP_START(DCOND_EQ); - emith_asr(tmp, tmp4, 15); - emith_cmp_r_imm(tmp, -1); // negative overflow (0x80000000..0xffff7fff) - EMITH_SJMP_START(DCOND_GE); - emith_move_r_imm_c(DCOND_LT, tmp4, 0x8000); - emith_move_r_imm_c(DCOND_LT, tmp3, 0x0000); - EMITH_SJMP_END(DCOND_GE); - emith_cmp_r_imm(tmp, 0); // positive overflow (0x00008000..0x7fffffff) - EMITH_SJMP_START(DCOND_LE); - emith_move_r_imm_c(DCOND_GT, tmp4, 0x00007fff); - emith_move_r_imm_c(DCOND_GT, tmp3, 0xffffffff); - EMITH_SJMP_END(DCOND_LE); - EMITH_JMP_END(DCOND_EQ); - rcache_free_tmp(tmp); goto end_op; } goto default_; ///////////////////////////////////////////// - case 0x01: - // MOV.L Rm,@(disp,Rn) 0001nnnnmmmmdddd - rcache_clean(); - tmp = rcache_get_reg_arg(0, GET_Rn()); - tmp2 = rcache_get_reg_arg(1, GET_Rm()); - if (op & 0x0f) - emith_add_r_imm(tmp, (op & 0x0f) * 4); - emit_memhandler_write(2); + case 0x01: // MOV.L Rm,@(disp,Rn) 0001nnnnmmmmdddd + emit_memhandler_write_rr(sh2, GET_Rm(), GET_Rn(), (op & 0x0f) * 4, 2); goto end_op; case 0x02: @@ -1947,101 +3828,116 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x00: // MOV.B Rm,@Rn 0010nnnnmmmm0000 case 0x01: // MOV.W Rm,@Rn 0010nnnnmmmm0001 case 0x02: // MOV.L Rm,@Rn 0010nnnnmmmm0010 - rcache_clean(); - rcache_get_reg_arg(0, GET_Rn()); - rcache_get_reg_arg(1, GET_Rm()); - emit_memhandler_write(op & 3); + emit_memhandler_write_rr(sh2, GET_Rm(), GET_Rn(), 0, op & 3); goto end_op; case 0x04: // MOV.B Rm,@-Rn 0010nnnnmmmm0100 case 0x05: // MOV.W Rm,@-Rn 0010nnnnmmmm0101 case 0x06: // MOV.L Rm,@-Rn 0010nnnnmmmm0110 - rcache_get_reg_arg(1, GET_Rm()); // for Rm == Rn - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - emith_sub_r_imm(tmp, (1 << (op & 3))); - rcache_clean(); - rcache_get_reg_arg(0, GET_Rn()); - emit_memhandler_write(op & 3); + emit_memhandler_write_rr(sh2, GET_Rm(), GET_Rn(), 0, (op & 3) | MF_PREDECR); goto end_op; case 0x07: // DIV0S Rm,Rn 
0010nnnnmmmm0111 - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ); - tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_tmp(); + emith_invalidate_t(); emith_bic_r_imm(sr, M|Q|T); - emith_tst_r_imm(tmp2, (1<<31)); - EMITH_SJMP_START(DCOND_EQ); - emith_or_r_imm_c(DCOND_NE, sr, Q); - EMITH_SJMP_END(DCOND_EQ); - emith_tst_r_imm(tmp3, (1<<31)); - EMITH_SJMP_START(DCOND_EQ); - emith_or_r_imm_c(DCOND_NE, sr, M); - EMITH_SJMP_END(DCOND_EQ); - emith_teq_r_r(tmp2, tmp3); - EMITH_SJMP_START(DCOND_PL); - emith_or_r_imm_c(DCOND_MI, sr, T); - EMITH_SJMP_END(DCOND_PL); + emith_lsr(tmp, tmp2, 31); // Q = Nn + emith_or_r_r_lsl(sr, tmp, Q_SHIFT); + emith_lsr(tmp, tmp3, 31); // M = Nm + emith_or_r_r_lsl(sr, tmp, M_SHIFT); + emith_eor_r_r_lsr(tmp, tmp2, 31); + emith_or_r_r(sr, tmp); // T = Q^M + rcache_free(tmp); + drcf.Mflag = FLG_UNKNOWN; goto end_op; case 0x08: // TST Rm,Rn 0010nnnnmmmm1000 - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ); - tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ); - emith_bic_r_imm(sr, T); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + emith_clr_t_cond(sr); emith_tst_r_r(tmp2, tmp3); - emit_or_t_if_eq(sr); + emith_set_t_cond(sr, DCOND_EQ); goto end_op; case 0x09: // AND Rm,Rn 0010nnnnmmmm1001 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); - emith_and_r_r(tmp, tmp2); + if (GET_Rm() != GET_Rn()) { + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); + emith_and_r_r_r(tmp, tmp3, tmp2); + } goto end_op; case 0x0a: // XOR Rm,Rn 0010nnnnmmmm1010 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rm(), 
RC_GR_READ); - emith_eor_r_r(tmp, tmp2); +#if PROPAGATE_CONSTANTS + if (GET_Rn() == GET_Rm()) { + gconst_new(GET_Rn(), 0); + goto end_op; + } +#endif + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); + emith_eor_r_r_r(tmp, tmp3, tmp2); goto end_op; case 0x0b: // OR Rm,Rn 0010nnnnmmmm1011 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); - emith_or_r_r(tmp, tmp2); + if (GET_Rm() != GET_Rn()) { + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); + emith_or_r_r_r(tmp, tmp3, tmp2); + } goto end_op; case 0x0c: // CMP/STR Rm,Rn 0010nnnnmmmm1100 tmp = rcache_get_tmp(); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ); - tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); emith_eor_r_r_r(tmp, tmp2, tmp3); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - emith_bic_r_imm(sr, T); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_clr_t_cond(sr); emith_tst_r_imm(tmp, 0x000000ff); - emit_or_t_if_eq(sr); - emith_tst_r_imm(tmp, 0x0000ff00); - emit_or_t_if_eq(sr); - emith_tst_r_imm(tmp, 0x00ff0000); - emit_or_t_if_eq(sr); - emith_tst_r_imm(tmp, 0xff000000); - emit_or_t_if_eq(sr); + EMITH_SJMP_START(DCOND_EQ); + emith_tst_r_imm_c(DCOND_NE, tmp, 0x0000ff00); + EMITH_SJMP_START(DCOND_EQ); + emith_tst_r_imm_c(DCOND_NE, tmp, 0x00ff0000); + EMITH_SJMP_START(DCOND_EQ); + emith_tst_r_imm_c(DCOND_NE, tmp, 0xff000000); + EMITH_SJMP_END(DCOND_EQ); + EMITH_SJMP_END(DCOND_EQ); + EMITH_SJMP_END(DCOND_EQ); + emith_set_t_cond(sr, DCOND_EQ); rcache_free_tmp(tmp); goto end_op; case 0x0d: // XTRCT Rm,Rn 0010nnnnmmmm1101 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); - emith_lsr(tmp, tmp, 16); + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); + emith_lsr(tmp, 
tmp3, 16); emith_or_r_r_lsl(tmp, tmp2, 16); goto end_op; case 0x0e: // MULU.W Rm,Rn 0010nnnnmmmm1110 case 0x0f: // MULS.W Rm,Rn 0010nnnnmmmm1111 - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ); - tmp = rcache_get_reg(SHR_MACL, RC_GR_WRITE); - if (op & 1) { - emith_sext(tmp, tmp2, 16); - } else - emith_clear_msb(tmp, tmp2, 16); - tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ); - tmp2 = rcache_get_tmp(); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_reg(SHR_MACL, RC_GR_WRITE, NULL); + tmp4 = tmp3; if (op & 1) { - emith_sext(tmp2, tmp3, 16); - } else - emith_clear_msb(tmp2, tmp3, 16); - emith_mul(tmp, tmp, tmp2); - rcache_free_tmp(tmp2); + if (! rcache_is_s16(tmp2)) { + emith_sext(tmp, tmp2, 16); + tmp2 = tmp; + } + if (! rcache_is_s16(tmp3)) { + tmp4 = rcache_get_tmp(); + emith_sext(tmp4, tmp3, 16); + } + } else { + if (! rcache_is_u16(tmp2)) { + emith_clear_msb(tmp, tmp2, 16); + tmp2 = tmp; + } + if (! rcache_is_u16(tmp3)) { + tmp4 = rcache_get_tmp(); + emith_clear_msb(tmp4, tmp3, 16); + } + } + emith_mul(tmp, tmp2, tmp4); + if (tmp4 != tmp3) + rcache_free_tmp(tmp4); goto end_op; } goto default_; @@ -2055,37 +3951,30 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x03: // CMP/GE Rm,Rn 0011nnnnmmmm0011 case 0x06: // CMP/HI Rm,Rn 0011nnnnmmmm0110 case 0x07: // CMP/GT Rm,Rn 0011nnnnmmmm0111 - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ); - tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ); - emith_bic_r_imm(sr, T); - emith_cmp_r_r(tmp2, tmp3); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); switch (op & 0x07) { case 0x00: // CMP/EQ - emit_or_t_if_eq(sr); + tmp = DCOND_EQ; break; case 0x02: // CMP/HS - EMITH_SJMP_START(DCOND_LO); - emith_or_r_imm_c(DCOND_HS, sr, T); - EMITH_SJMP_END(DCOND_LO); + tmp = DCOND_HS; break; case 0x03: // 
CMP/GE - EMITH_SJMP_START(DCOND_LT); - emith_or_r_imm_c(DCOND_GE, sr, T); - EMITH_SJMP_END(DCOND_LT); + tmp = DCOND_GE; break; case 0x06: // CMP/HI - EMITH_SJMP_START(DCOND_LS); - emith_or_r_imm_c(DCOND_HI, sr, T); - EMITH_SJMP_END(DCOND_LS); + tmp = DCOND_HI; break; case 0x07: // CMP/GT - EMITH_SJMP_START(DCOND_LE); - emith_or_r_imm_c(DCOND_GT, sr, T); - EMITH_SJMP_END(DCOND_LE); + tmp = DCOND_GT; break; } + emith_clr_t_cond(sr); + emith_cmp_r_r(tmp2, tmp3); + emith_set_t_cond(sr, tmp); goto end_op; case 0x04: // DIV1 Rm,Rn 0011nnnnmmmm0100 // Q1 = carry(Rn = (Rn << 1) | T) @@ -2095,80 +3984,109 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // Q2 = carry(Rn -= Rm) // Q = M ^ Q1 ^ Q2 // T = (Q == M) = !(Q ^ M) = !(Q1 ^ Q2) - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW); - tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - emith_tpop_carry(sr, 0); - emith_adcf_r_r(tmp2, tmp2); - emith_tpush_carry(sr, 0); // keep Q1 in T for now - tmp4 = rcache_get_tmp(); - emith_and_r_r_imm(tmp4, sr, M); - emith_eor_r_r_lsr(sr, tmp4, M_SHIFT - Q_SHIFT); // Q ^= M - rcache_free_tmp(tmp4); - // add or sub, invert T if carry to get Q1 ^ Q2 - // in: (Q ^ M) passed in Q, Q1 in T + tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW, NULL); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_sync_t(sr); + tmp = rcache_get_tmp(); + if (drcf.Mflag != FLG_0) { + emith_and_r_r_imm(tmp, sr, M); + emith_eor_r_r_lsr(sr, tmp, M_SHIFT - Q_SHIFT); // Q ^= M + } + rcache_free_tmp(tmp); + // shift Rn, add T, add or sub Rm, set T = !(Q1 ^ Q2) + // in: (Q ^ M) passed in Q emith_sh2_div1_step(tmp2, tmp3, sr); - emith_bic_r_imm(sr, Q); - emith_tst_r_imm(sr, M); - EMITH_SJMP_START(DCOND_EQ); - emith_or_r_imm_c(DCOND_NE, sr, Q); // Q = M - EMITH_SJMP_END(DCOND_EQ); - emith_tst_r_imm(sr, T); - EMITH_SJMP_START(DCOND_EQ); - emith_eor_r_imm_c(DCOND_NE, sr, Q); // Q = M ^ Q1 ^ Q2 - EMITH_SJMP_END(DCOND_EQ); - 
emith_eor_r_imm(sr, T); // T = !(Q1 ^ Q2) + tmp = rcache_get_tmp(); + emith_or_r_imm(sr, Q); // Q = !T + emith_and_r_r_imm(tmp, sr, T); + emith_eor_r_r_lsl(sr, tmp, Q_SHIFT); + if (drcf.Mflag != FLG_0) { // Q = M ^ !T = M ^ Q1 ^ Q2 + emith_and_r_r_imm(tmp, sr, M); + emith_eor_r_r_lsr(sr, tmp, M_SHIFT - Q_SHIFT); + } + rcache_free_tmp(tmp); goto end_op; case 0x05: // DMULU.L Rm,Rn 0011nnnnmmmm0101 - tmp = rcache_get_reg(GET_Rn(), RC_GR_READ); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); - tmp3 = rcache_get_reg(SHR_MACL, RC_GR_WRITE); - tmp4 = rcache_get_reg(SHR_MACH, RC_GR_WRITE); + tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp3 = rcache_get_reg(SHR_MACL, RC_GR_WRITE, NULL); + tmp4 = rcache_get_reg(SHR_MACH, RC_GR_WRITE, NULL); emith_mul_u64(tmp3, tmp4, tmp, tmp2); goto end_op; case 0x08: // SUB Rm,Rn 0011nnnnmmmm1000 +#if PROPAGATE_CONSTANTS + if (GET_Rn() == GET_Rm()) { + gconst_new(GET_Rn(), 0); + goto end_op; + } +#endif case 0x0c: // ADD Rm,Rn 0011nnnnmmmm1100 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); if (op & 4) { - emith_add_r_r(tmp, tmp2); + emith_add_r_r_r(tmp, tmp3, tmp2); } else - emith_sub_r_r(tmp, tmp2); + emith_sub_r_r_r(tmp, tmp3, tmp2); goto end_op; case 0x0a: // SUBC Rm,Rn 0011nnnnmmmm1010 case 0x0e: // ADDC Rm,Rn 0011nnnnmmmm1110 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - if (op & 4) { // adc - emith_tpop_carry(sr, 0); - emith_adcf_r_r(tmp, tmp2); - emith_tpush_carry(sr, 0); - } else { - emith_tpop_carry(sr, 1); - emith_sbcf_r_r(tmp, tmp2); - emith_tpush_carry(sr, 1); + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_sync_t(sr); 
+#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) { + if (op & 4) { + emith_t_to_carry(sr, 0); + emith_adc_r_r_r(tmp, tmp3, tmp2); + } else { + emith_t_to_carry(sr, 1); + emith_sbc_r_r_r(tmp, tmp3, tmp2); + } + } else +#endif + { + EMITH_HINT_COND(DCOND_CS); + if (op & 4) { // adc + emith_tpop_carry(sr, 0); + emith_adcf_r_r_r(tmp, tmp3, tmp2); + emith_tpush_carry(sr, 0); + } else { + emith_tpop_carry(sr, 1); + emith_sbcf_r_r_r(tmp, tmp3, tmp2); + emith_tpush_carry(sr, 1); + } } goto end_op; case 0x0b: // SUBV Rm,Rn 0011nnnnmmmm1011 case 0x0f: // ADDV Rm,Rn 0011nnnnmmmm1111 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - emith_bic_r_imm(sr, T); - if (op & 4) { - emith_addf_r_r(tmp, tmp2); + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); +#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) { + if (op & 4) + emith_add_r_r_r(tmp,tmp3,tmp2); + else + emith_sub_r_r_r(tmp,tmp3,tmp2); } else - emith_subf_r_r(tmp, tmp2); - EMITH_SJMP_START(DCOND_VC); - emith_or_r_imm_c(DCOND_VS, sr, T); - EMITH_SJMP_END(DCOND_VC); +#endif + { + emith_clr_t_cond(sr); + EMITH_HINT_COND(DCOND_VS); + if (op & 4) + emith_addf_r_r_r(tmp, tmp3, tmp2); + else + emith_subf_r_r_r(tmp, tmp3, tmp2); + emith_set_t_cond(sr, DCOND_VS); + } goto end_op; case 0x0d: // DMULS.L Rm,Rn 0011nnnnmmmm1101 - tmp = rcache_get_reg(GET_Rn(), RC_GR_READ); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); - tmp3 = rcache_get_reg(SHR_MACL, RC_GR_WRITE); - tmp4 = rcache_get_reg(SHR_MACH, RC_GR_WRITE); + tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp3 = rcache_get_reg(SHR_MACL, RC_GR_WRITE, NULL); + tmp4 = rcache_get_reg(SHR_MACH, RC_GR_WRITE, NULL); emith_mul_s64(tmp3, tmp4, tmp, tmp2); goto end_op; } @@ -2183,32 +4101,34 @@ static void REGPARM(2) 
*sh2_translate(SH2 *sh2, int tcache_id) { case 0: // SHLL Rn 0100nnnn00000000 case 2: // SHAL Rn 0100nnnn00100000 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - emith_tpop_carry(sr, 0); // dummy - emith_lslf(tmp, tmp, 1); - emith_tpush_carry(sr, 0); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); +#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) + emith_lsl(tmp, tmp2, 1); + else +#endif + { + emith_invalidate_t(); + emith_lslf(tmp, tmp2, 1); + emith_carry_to_t(sr, 0); + } goto end_op; case 1: // DT Rn 0100nnnn00010000 - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); -#if 0 // scheduling needs tuning - if (FETCH_OP(pc) == 0x8bfd) { // BF #-2 - if (gconst_get(GET_Rn(), &tmp)) { - // XXX: limit burned cycles - emit_move_r_imm32(GET_Rn(), 0); - emith_or_r_imm(sr, T); - cycles += tmp * 4 + 1; // +1 syncs with noconst version, not sure why - skip_op = 1; - } + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); +#if LOOP_DETECTION + if (drcf.loop_type == OF_DELAY_LOOP) { + if (drcf.delay_reg == -1) + drcf.delay_reg = GET_Rn(); else - emith_sh2_dtbf_loop(); - goto end_op; + drcf.polling = drcf.loop_type = 0; } #endif - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - emith_bic_r_imm(sr, T); - emith_subf_r_imm(tmp, 1); - emit_or_t_if_eq(sr); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); + emith_clr_t_cond(sr); + EMITH_HINT_COND(DCOND_EQ); + emith_subf_r_r_imm(tmp, tmp2, 1); + emith_set_t_cond(sr, DCOND_EQ); goto end_op; } goto default_; @@ -2217,23 +4137,31 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { case 0: // SHLR Rn 0100nnnn00000001 case 2: // SHAR Rn 0100nnnn00100001 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - emith_tpop_carry(sr, 0); // dummy - if (op & 0x20) { - emith_asrf(tmp, tmp, 1); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); +#if T_OPTIMIZER + if 
(rcache_regs_discard & BITMASK1(SHR_T)) { + if (op & 0x20) + emith_asr(tmp,tmp2,1); + else + emith_lsr(tmp,tmp2,1); } else - emith_lsrf(tmp, tmp, 1); - emith_tpush_carry(sr, 0); +#endif + { + emith_invalidate_t(); + if (op & 0x20) { + emith_asrf(tmp, tmp2, 1); + } else + emith_lsrf(tmp, tmp2, 1); + emith_carry_to_t(sr, 0); + } goto end_op; case 1: // CMP/PZ Rn 0100nnnn00010001 - tmp = rcache_get_reg(GET_Rn(), RC_GR_READ); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - emith_bic_r_imm(sr, T); + tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_clr_t_cond(sr); emith_cmp_r_imm(tmp, 0); - EMITH_SJMP_START(DCOND_LT); - emith_or_r_imm_c(DCOND_GE, sr, T); - EMITH_SJMP_END(DCOND_LT); + emith_set_t_cond(sr, DCOND_GE); goto end_op; } goto default_; @@ -2262,14 +4190,13 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) default: goto default_; } - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW); - emith_sub_r_imm(tmp2, 4); - rcache_clean(); - rcache_get_reg_arg(0, GET_Rn()); - tmp3 = rcache_get_reg_arg(1, tmp); - if (tmp == SHR_SR) - emith_clear_msb(tmp3, tmp3, 22); // reserved bits defined by ISA as 0 - emit_memhandler_write(2); + if (tmp == SHR_SR) { + tmp3 = rcache_get_reg_arg(1, tmp, &tmp4); + emith_sync_t(tmp4); + emith_clear_msb(tmp3, tmp4, 22); // reserved bits defined by ISA as 0 + } else + tmp3 = rcache_get_reg_arg(1, tmp, NULL); + emit_memhandler_write_rr(sh2, SHR_TMP, GET_Rn(), 0, 2 | MF_PREDECR); goto end_op; case 0x04: case 0x05: @@ -2277,34 +4204,54 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { case 0x04: // ROTL Rn 0100nnnn00000100 case 0x05: // ROTR Rn 0100nnnn00000101 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - emith_tpop_carry(sr, 0); // dummy - if (op & 1) { - emith_rorf(tmp, tmp, 1); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); +#if T_OPTIMIZER + if (rcache_regs_discard & 
BITMASK1(SHR_T)) { + if (op & 1) + emith_ror(tmp, tmp2, 1); + else + emith_rol(tmp, tmp2, 1); } else - emith_rolf(tmp, tmp, 1); - emith_tpush_carry(sr, 0); +#endif + { + emith_invalidate_t(); + if (op & 1) + emith_rorf(tmp, tmp2, 1); + else + emith_rolf(tmp, tmp2, 1); + emith_carry_to_t(sr, 0); + } goto end_op; case 0x24: // ROTCL Rn 0100nnnn00100100 case 0x25: // ROTCR Rn 0100nnnn00100101 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - emith_tpop_carry(sr, 0); - if (op & 1) { - emith_rorcf(tmp); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, NULL); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_sync_t(sr); +#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) { + emith_t_to_carry(sr, 0); + if (op & 1) + emith_rorc(tmp); + else + emith_rolc(tmp); } else - emith_rolcf(tmp); - emith_tpush_carry(sr, 0); +#endif + { + emith_tpop_carry(sr, 0); + if (op & 1) + emith_rorcf(tmp); + else + emith_rolcf(tmp); + emith_tpush_carry(sr, 0); + } goto end_op; case 0x15: // CMP/PL Rn 0100nnnn00010101 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - emith_bic_r_imm(sr, T); + tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_clr_t_cond(sr); emith_cmp_r_imm(tmp, 0); - EMITH_SJMP_START(DCOND_LE); - emith_or_r_imm_c(DCOND_GT, sr, T); - EMITH_SJMP_END(DCOND_LE); + emith_set_t_cond(sr, DCOND_GT); goto end_op; } goto default_; @@ -2333,47 +4280,40 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) default: goto default_; } - rcache_get_reg_arg(0, GET_Rn()); - tmp2 = emit_memhandler_read(2); if (tmp == SHR_SR) { - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + emith_invalidate_t(); + tmp2 = emit_memhandler_read_rr(sh2, SHR_TMP, GET_Rn(), 0, 2 | MF_POSTINCR); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_write_sr(sr, tmp2); + rcache_free_tmp(tmp2); drcf.test_irq = 1; - } else { - tmp = rcache_get_reg(tmp, RC_GR_WRITE); - 
emith_move_r_r(tmp, tmp2); - } - rcache_free_tmp(tmp2); - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - emith_add_r_imm(tmp, 4); + } else + emit_memhandler_read_rr(sh2, tmp, GET_Rn(), 0, 2 | MF_POSTINCR); goto end_op; case 0x08: case 0x09: switch (GET_Fx()) { - case 0: - // SHLL2 Rn 0100nnnn00001000 - // SHLR2 Rn 0100nnnn00001001 + case 0: // SHLL2 Rn 0100nnnn00001000 + // SHLR2 Rn 0100nnnn00001001 tmp = 2; break; - case 1: - // SHLL8 Rn 0100nnnn00011000 - // SHLR8 Rn 0100nnnn00011001 + case 1: // SHLL8 Rn 0100nnnn00011000 + // SHLR8 Rn 0100nnnn00011001 tmp = 8; break; - case 2: - // SHLL16 Rn 0100nnnn00101000 - // SHLR16 Rn 0100nnnn00101001 + case 2: // SHLL16 Rn 0100nnnn00101000 + // SHLR16 Rn 0100nnnn00101001 tmp = 16; break; default: goto default_; } - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); if (op & 1) { - emith_lsr(tmp2, tmp2, tmp); + emith_lsr(tmp2, tmp3, tmp); } else - emith_lsl(tmp2, tmp2, tmp); + emith_lsl(tmp2, tmp3, tmp); goto end_op; case 0x0a: switch (GET_Fx()) @@ -2397,18 +4337,17 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { case 1: // TAS.B @Rn 0100nnnn00011011 // XXX: is TAS working on 32X? 
- rcache_get_reg_arg(0, GET_Rn()); + rcache_get_reg_arg(0, GET_Rn(), NULL); tmp = emit_memhandler_read(0); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - emith_bic_r_imm(sr, T); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_clr_t_cond(sr); emith_cmp_r_imm(tmp, 0); - emit_or_t_if_eq(sr); - rcache_clean(); + emith_set_t_cond(sr, DCOND_EQ); emith_or_r_imm(tmp, 0x80); tmp2 = rcache_get_tmp_arg(1); // assuming it differs to tmp emith_move_r_r(tmp2, tmp); rcache_free_tmp(tmp); - rcache_get_reg_arg(0, GET_Rn()); + rcache_get_reg_arg(0, GET_Rn(), NULL); emit_memhandler_write(0); break; default: @@ -2416,7 +4355,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } goto end_op; case 0x0e: - tmp = rcache_get_reg(GET_Rn(), RC_GR_READ); switch (GET_Fx()) { case 0: // LDC Rm,SR 0100mmmm00001110 @@ -2432,48 +4370,29 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto default_; } if (tmp2 == SHR_SR) { - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + emith_invalidate_t(); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); emith_write_sr(sr, tmp); drcf.test_irq = 1; - } else { - tmp2 = rcache_get_reg(tmp2, RC_GR_WRITE); - emith_move_r_r(tmp2, tmp); - } + } else + emit_move_r_r(tmp2, GET_Rn()); goto end_op; - case 0x0f: - // MAC.W @Rm+,@Rn+ 0100nnnnmmmm1111 - emit_indirect_read_double(&tmp, &tmp2, GET_Rn(), GET_Rm(), 1); - emith_sext(tmp, tmp, 16); - emith_sext(tmp2, tmp2, 16); - tmp3 = rcache_get_reg(SHR_MACL, RC_GR_RMW); - tmp4 = rcache_get_reg(SHR_MACH, RC_GR_RMW); - emith_mula_s64(tmp3, tmp4, tmp, tmp2); + case 0x0f: // MAC.W @Rm+,@Rn+ 0100nnnnmmmm1111 + emit_indirect_read_double(sh2, &tmp, &tmp2, GET_Rn(), GET_Rm(), 1); + sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); + tmp3 = rcache_get_reg(SHR_MACL, RC_GR_RMW, NULL); + tmp4 = rcache_get_reg(SHR_MACH, RC_GR_RMW, NULL); + emith_sh2_macw(tmp3, tmp4, tmp, tmp2, sr); rcache_free_tmp(tmp2); - // XXX: MACH should be untouched when S is set? 
- sr = rcache_get_reg(SHR_SR, RC_GR_READ); - emith_tst_r_imm(sr, S); - EMITH_JMP_START(DCOND_EQ); - - emith_asr(tmp, tmp3, 31); - emith_eorf_r_r(tmp, tmp4); // tmp = ((signed)macl >> 31) ^ mach - EMITH_JMP_START(DCOND_EQ); - emith_move_r_imm(tmp3, 0x80000000); - emith_tst_r_r(tmp4, tmp4); - EMITH_SJMP_START(DCOND_MI); - emith_sub_r_imm_c(DCOND_PL, tmp3, 1); // positive - EMITH_SJMP_END(DCOND_MI); - EMITH_JMP_END(DCOND_EQ); - - EMITH_JMP_END(DCOND_EQ); rcache_free_tmp(tmp); goto end_op; } goto default_; ///////////////////////////////////////////// - case 0x05: - // MOV.L @(disp,Rm),Rn 0101nnnnmmmmdddd - emit_memhandler_read_rr(GET_Rn(), GET_Rm(), (op & 0x0f) * 4, 2); + case 0x05: // MOV.L @(disp,Rm),Rn 0101nnnnmmmmdddd + emit_memhandler_read_rr(sh2, GET_Rn(), GET_Rm(), (op & 0x0f) * 4, 2 | drcf.polling); goto end_op; ///////////////////////////////////////////// @@ -2486,21 +4405,17 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x04: // MOV.B @Rm+,Rn 0110nnnnmmmm0100 case 0x05: // MOV.W @Rm+,Rn 0110nnnnmmmm0101 case 0x06: // MOV.L @Rm+,Rn 0110nnnnmmmm0110 - emit_memhandler_read_rr(GET_Rn(), GET_Rm(), 0, op & 3); - if ((op & 7) >= 4 && GET_Rn() != GET_Rm()) { - tmp = rcache_get_reg(GET_Rm(), RC_GR_RMW); - emith_add_r_imm(tmp, (1 << (op & 3))); - } + tmp = ((op & 7) >= 4 && GET_Rn() != GET_Rm()) ? MF_POSTINCR : drcf.polling; + emit_memhandler_read_rr(sh2, GET_Rn(), GET_Rm(), 0, (op & 3) | tmp); + goto end_op; + case 0x03: // MOV Rm,Rn 0110nnnnmmmm0011 + emit_move_r_r(GET_Rn(), GET_Rm()); goto end_op; - case 0x03: case 0x07 ... 
0x0f: - tmp = rcache_get_reg(GET_Rm(), RC_GR_READ); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_WRITE); + tmp = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_WRITE, NULL); switch (op & 0x0f) { - case 0x03: // MOV Rm,Rn 0110nnnnmmmm0011 - emith_move_r_r(tmp2, tmp); - break; case 0x07: // NOT Rm,Rn 0110nnnnmmmm0111 emith_mvn_r_r(tmp2, tmp); break; @@ -2522,25 +4437,39 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_rol(tmp2, tmp, 16); break; case 0x0a: // NEGC Rm,Rn 0110nnnnmmmm1010 - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - emith_tpop_carry(sr, 1); - emith_negcf_r_r(tmp2, tmp); - emith_tpush_carry(sr, 1); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_sync_t(sr); +#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) { + emith_t_to_carry(sr, 1); + emith_negc_r_r(tmp2, tmp); + } else +#endif + { + EMITH_HINT_COND(DCOND_CS); + emith_tpop_carry(sr, 1); + emith_negcf_r_r(tmp2, tmp); + emith_tpush_carry(sr, 1); + } break; case 0x0b: // NEG Rm,Rn 0110nnnnmmmm1011 emith_neg_r_r(tmp2, tmp); break; case 0x0c: // EXTU.B Rm,Rn 0110nnnnmmmm1100 emith_clear_msb(tmp2, tmp, 24); + rcache_set_x16(tmp2, 1, 1); break; case 0x0d: // EXTU.W Rm,Rn 0110nnnnmmmm1101 emith_clear_msb(tmp2, tmp, 16); + rcache_set_x16(tmp2, 0, 1); break; case 0x0e: // EXTS.B Rm,Rn 0110nnnnmmmm1110 emith_sext(tmp2, tmp, 8); + rcache_set_x16(tmp2, 1, 0); break; case 0x0f: // EXTS.W Rm,Rn 0110nnnnmmmm1111 emith_sext(tmp2, tmp, 16); + rcache_set_x16(tmp2, 1, 0); break; } goto end_op; @@ -2548,13 +4477,11 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto default_; ///////////////////////////////////////////// - case 0x07: - // ADD #imm,Rn 0111nnnniiiiiiii - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - if (op & 0x80) { // adding negative - emith_sub_r_imm(tmp, -op & 0xff); - } else - emith_add_r_imm(tmp, op & 0xff); + case 0x07: // ADD #imm,Rn 0111nnnniiiiiiii + if (op & 0x80) // adding negative + 
emit_sub_r_imm(GET_Rn(), (u8)-op); + else + emit_add_r_imm(GET_Rn(), (u8)op); goto end_op; ///////////////////////////////////////////// @@ -2563,29 +4490,20 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { case 0x0000: // MOV.B R0,@(disp,Rn) 10000000nnnndddd case 0x0100: // MOV.W R0,@(disp,Rn) 10000001nnnndddd - rcache_clean(); - tmp = rcache_get_reg_arg(0, GET_Rm()); - tmp2 = rcache_get_reg_arg(1, SHR_R0); - tmp3 = (op & 0x100) >> 8; - if (op & 0x0f) - emith_add_r_imm(tmp, (op & 0x0f) << tmp3); - emit_memhandler_write(tmp3); + tmp = (op & 0x100) >> 8; + emit_memhandler_write_rr(sh2, SHR_R0, GET_Rm(), (op & 0x0f) << tmp, tmp); goto end_op; case 0x0400: // MOV.B @(disp,Rm),R0 10000100mmmmdddd case 0x0500: // MOV.W @(disp,Rm),R0 10000101mmmmdddd tmp = (op & 0x100) >> 8; - emit_memhandler_read_rr(SHR_R0, GET_Rm(), (op & 0x0f) << tmp, tmp); + emit_memhandler_read_rr(sh2, SHR_R0, GET_Rm(), (op & 0x0f) << tmp, tmp | drcf.polling); goto end_op; case 0x0800: // CMP/EQ #imm,R0 10001000iiiiiiii - // XXX: could use cmn - tmp = rcache_get_tmp(); - tmp2 = rcache_get_reg(0, RC_GR_READ); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - emith_move_r_imm_s8(tmp, op & 0xff); - emith_bic_r_imm(sr, T); - emith_cmp_r_r(tmp2, tmp); - emit_or_t_if_eq(sr); - rcache_free_tmp(tmp); + tmp2 = rcache_get_reg(SHR_R0, RC_GR_READ, NULL); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_clr_t_cond(sr); + emith_cmp_r_imm(tmp2, (s8)(op & 0xff)); + emith_set_t_cond(sr, DCOND_EQ); goto end_op; } goto default_; @@ -2597,130 +4515,87 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x0000: // MOV.B R0,@(disp,GBR) 11000000dddddddd case 0x0100: // MOV.W R0,@(disp,GBR) 11000001dddddddd case 0x0200: // MOV.L R0,@(disp,GBR) 11000010dddddddd - rcache_clean(); - tmp = rcache_get_reg_arg(0, SHR_GBR); - tmp2 = rcache_get_reg_arg(1, SHR_R0); - tmp3 = (op & 0x300) >> 8; - emith_add_r_imm(tmp, (op & 0xff) << tmp3); - emit_memhandler_write(tmp3); + tmp = (op & 0x300) >> 8; 
+ emit_memhandler_write_rr(sh2, SHR_R0, SHR_GBR, (op & 0xff) << tmp, tmp); goto end_op; case 0x0400: // MOV.B @(disp,GBR),R0 11000100dddddddd case 0x0500: // MOV.W @(disp,GBR),R0 11000101dddddddd case 0x0600: // MOV.L @(disp,GBR),R0 11000110dddddddd tmp = (op & 0x300) >> 8; - emit_memhandler_read_rr(SHR_R0, SHR_GBR, (op & 0xff) << tmp, tmp); - goto end_op; - case 0x0300: // TRAPA #imm 11000011iiiiiiii - tmp = rcache_get_reg(SHR_SP, RC_GR_RMW); - emith_sub_r_imm(tmp, 4*2); - // push SR - tmp = rcache_get_reg_arg(0, SHR_SP); - emith_add_r_imm(tmp, 4); - tmp = rcache_get_reg_arg(1, SHR_SR); - emith_clear_msb(tmp, tmp, 22); - emit_memhandler_write(2); - // push PC - rcache_get_reg_arg(0, SHR_SP); - tmp = rcache_get_tmp_arg(1); - emith_move_r_imm(tmp, pc); - emit_memhandler_write(2); - // obtain new PC - emit_memhandler_read_rr(SHR_PC, SHR_VBR, (op & 0xff) * 4, 2); - // indirect jump -> back to dispatcher - rcache_flush(); - emith_jump(sh2_drc_dispatcher); + emit_memhandler_read_rr(sh2, SHR_R0, SHR_GBR, (op & 0xff) << tmp, tmp | drcf.polling); goto end_op; case 0x0800: // TST #imm,R0 11001000iiiiiiii - tmp = rcache_get_reg(SHR_R0, RC_GR_READ); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - emith_bic_r_imm(sr, T); + tmp = rcache_get_reg(SHR_R0, RC_GR_READ, NULL); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_clr_t_cond(sr); emith_tst_r_imm(tmp, op & 0xff); - emit_or_t_if_eq(sr); + emith_set_t_cond(sr, DCOND_EQ); goto end_op; case 0x0900: // AND #imm,R0 11001001iiiiiiii - tmp = rcache_get_reg(SHR_R0, RC_GR_RMW); - emith_and_r_imm(tmp, op & 0xff); + tmp = rcache_get_reg(SHR_R0, RC_GR_RMW, &tmp2); + emith_and_r_r_imm(tmp, tmp2, (op & 0xff)); goto end_op; case 0x0a00: // XOR #imm,R0 11001010iiiiiiii - tmp = rcache_get_reg(SHR_R0, RC_GR_RMW); - emith_eor_r_imm(tmp, op & 0xff); + if (op & 0xff) { + tmp = rcache_get_reg(SHR_R0, RC_GR_RMW, &tmp2); + emith_eor_r_r_imm(tmp, tmp2, (op & 0xff)); + } goto end_op; case 0x0b00: // OR #imm,R0 11001011iiiiiiii - tmp = 
rcache_get_reg(SHR_R0, RC_GR_RMW); - emith_or_r_imm(tmp, op & 0xff); + if (op & 0xff) { + tmp = rcache_get_reg(SHR_R0, RC_GR_RMW, &tmp2); + emith_or_r_r_imm(tmp, tmp2, (op & 0xff)); + } goto end_op; case 0x0c00: // TST.B #imm,@(R0,GBR) 11001100iiiiiiii - tmp = emit_indirect_indexed_read(SHR_R0, SHR_GBR, 0); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - emith_bic_r_imm(sr, T); + tmp = emit_indirect_indexed_read(sh2, SHR_TMP, SHR_R0, SHR_GBR, 0 | drcf.polling); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_clr_t_cond(sr); emith_tst_r_imm(tmp, op & 0xff); - emit_or_t_if_eq(sr); + emith_set_t_cond(sr, DCOND_EQ); rcache_free_tmp(tmp); goto end_op; case 0x0d00: // AND.B #imm,@(R0,GBR) 11001101iiiiiiii - tmp = emit_indirect_indexed_read(SHR_R0, SHR_GBR, 0); - emith_and_r_imm(tmp, op & 0xff); + tmp = emit_indirect_indexed_read(sh2, SHR_TMP, SHR_R0, SHR_GBR, 0); + tmp2 = rcache_get_tmp_arg(1); + emith_and_r_r_imm(tmp2, tmp, (op & 0xff)); goto end_rmw_op; case 0x0e00: // XOR.B #imm,@(R0,GBR) 11001110iiiiiiii - tmp = emit_indirect_indexed_read(SHR_R0, SHR_GBR, 0); - emith_eor_r_imm(tmp, op & 0xff); + tmp = emit_indirect_indexed_read(sh2, SHR_TMP, SHR_R0, SHR_GBR, 0); + tmp2 = rcache_get_tmp_arg(1); + emith_eor_r_r_imm(tmp2, tmp, (op & 0xff)); goto end_rmw_op; case 0x0f00: // OR.B #imm,@(R0,GBR) 11001111iiiiiiii - tmp = emit_indirect_indexed_read(SHR_R0, SHR_GBR, 0); - emith_or_r_imm(tmp, op & 0xff); - end_rmw_op: + tmp = emit_indirect_indexed_read(sh2, SHR_TMP, SHR_R0, SHR_GBR, 0); tmp2 = rcache_get_tmp_arg(1); - emith_move_r_r(tmp2, tmp); + emith_or_r_r_imm(tmp2, tmp, (op & 0xff)); + end_rmw_op: rcache_free_tmp(tmp); - tmp3 = rcache_get_reg_arg(0, SHR_GBR); - tmp4 = rcache_get_reg(SHR_R0, RC_GR_READ); - emith_add_r_r(tmp3, tmp4); - emit_memhandler_write(0); + emit_indirect_indexed_write(sh2, SHR_TMP, SHR_R0, SHR_GBR, 0); goto end_op; } goto default_; ///////////////////////////////////////////// - case 0x0e: - // MOV #imm,Rn 1110nnnniiiiiiii - 
emit_move_r_imm32(GET_Rn(), (u32)(signed int)(signed char)op); + case 0x0e: // MOV #imm,Rn 1110nnnniiiiiiii + emit_move_r_imm32(GET_Rn(), (s8)op); goto end_op; default: default_: - if (!(op_flags[i] & OF_B_IN_DS)) + if (!(op_flags[i] & OF_B_IN_DS)) { elprintf_sh2(sh2, EL_ANOMALY, "drc: illegal op %04x @ %08x", op, pc - 2); - - tmp = rcache_get_reg(SHR_SP, RC_GR_RMW); - emith_sub_r_imm(tmp, 4*2); - // push SR - tmp = rcache_get_reg_arg(0, SHR_SP); - emith_add_r_imm(tmp, 4); - tmp = rcache_get_reg_arg(1, SHR_SR); - emith_clear_msb(tmp, tmp, 22); - emit_memhandler_write(2); - // push PC - rcache_get_reg_arg(0, SHR_SP); - tmp = rcache_get_tmp_arg(1); - if (drcf.pending_branch_indirect) { - tmp2 = rcache_get_reg(SHR_PC, RC_GR_READ); - emith_move_r_r(tmp, tmp2); + exit(1); } - else - emith_move_r_imm(tmp, pc - 2); - emit_memhandler_write(2); - // obtain new PC - v = (op_flags[i] & OF_B_IN_DS) ? 6 : 4; - emit_memhandler_read_rr(SHR_PC, SHR_VBR, v * 4, 2); - // indirect jump -> back to dispatcher - rcache_flush(); - emith_jump(sh2_drc_dispatcher); - break; } end_op: rcache_unlock_all(); + rcache_set_usage_now(0); +#if DRC_DEBUG & 64 + RCACHE_CHECK("after insn"); +#endif cycles += opd->cycles; @@ -2731,8 +4606,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // test irq? if (drcf.test_irq && !drcf.pending_branch_direct) { - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); + emith_sync_t(sr); if (!drcf.pending_branch_indirect) emit_move_r_imm32(SHR_PC, pc); rcache_flush(); @@ -2740,180 +4616,300 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) drcf.test_irq = 0; } - // branch handling (with/without delay) + // branch handling if (drcf.pending_branch_direct) { - struct op_data *opd_b = - (op_flags[i] & OF_DELAY_OP) ? &ops[i-1] : opd; + struct op_data *opd_b = (op_flags[i] & OF_DELAY_OP) ? 
opd-1 : opd; u32 target_pc = opd_b->imm; - int cond = -1, ncond = -1; + int cond = -1; + int ctaken = 0; void *target = NULL; - EMITH_SJMP_DECL_(); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + if (OP_ISBRACND(opd_b->op)) + ctaken = (op_flags[i] & OF_DELAY_OP) ? 1 : 2; + cycles += ctaken; // assume branch taken + +#if LOOP_OPTIMIZER + if ((drcf.loop_type == OF_IDLE_LOOP || + (drcf.loop_type == OF_DELAY_LOOP && drcf.delay_reg >= 0))) + { + // idle or delay loop + emit_sync_t_to_sr(); + emith_sh2_delay_loop(cycles, drcf.delay_reg); + rcache_unlock_all(); // may lock delay_reg + drcf.polling = drcf.loop_type = drcf.pinning = 0; + } +#endif + +#if CALL_STACK + void *rtsadd = NULL, *rtsret = NULL; + if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) { + // BSR - save rts data + tmp = rcache_get_tmp_arg(1); + rtsadd = tcache_ptr; + emith_move_r_imm_s8_patchable(tmp, 0); + rcache_clean_tmp(); + rcache_invalidate_tmp(); + emith_call(sh2_drc_dispatcher_call); + rtsret = tcache_ptr; + } +#endif + + // XXX move below cond test if not changing host cond (MIPS delay slot)? + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); rcache_clean(); - if (opd_b->op != OP_BRANCH) { + if (OP_ISBRACND(opd_b->op)) { + // BT[S], BF[S] - emit condition test cond = (opd_b->op == OP_BRANCH_CF) ? DCOND_EQ : DCOND_NE; - ncond = (opd_b->op == OP_BRANCH_CF) ? DCOND_NE : DCOND_EQ; - } - if (cond != -1) { - int ctaken = (op_flags[i] & OF_DELAY_OP) ? 
1 : 2; - - if (delay_dep_fw & BITMASK1(SHR_T)) + if (delay_dep_fw & BITMASK1(SHR_T)) { + emith_sync_t(sr); emith_tst_r_imm(sr, T_save); - else - emith_tst_r_imm(sr, T); - - EMITH_SJMP_START_(ncond); - emith_sub_r_imm_c(cond, sr, ctaken<<12); - } + } else { + cond = emith_tst_t(sr, (opd_b->op == OP_BRANCH_CT)); + if (emith_get_t_cond() >= 0) { + if (opd_b->op == OP_BRANCH_CT) + emith_or_r_imm_c(cond, sr, T); + else + emith_bic_r_imm_c(cond, sr, T); + } + } + } else + emith_sync_t(sr); + // no modification of host status/flags between here and branching! -#if LINK_BRANCHES - if (find_in_array(branch_target_pc, branch_target_count, target_pc) >= 0) + v = find_in_sorted_linkage(branch_targets, branch_target_count, target_pc); + if (v >= 0) { // local branch - // XXX: jumps back can be linked already - if (branch_patch_count < MAX_LOCAL_BRANCHES) { + if (branch_targets[v].ptr) { + // local backward jump, link here now since host PC is already known + target = branch_targets[v].ptr; +#if LOOP_OPTIMIZER + if (pinned_loops[pinned_loop_count].pc == target_pc) { + // backward jump at end of optimized loop + rcache_unpin_all(); + target = pinned_loops[pinned_loop_count].ptr; + pinned_loop_count ++; + } +#endif + if (cond != -1) { + if (emith_jump_patch_inrange(tcache_ptr, target)) { + emith_jump_cond(cond, target); + } else { + // not reachable directly, must use far branch + EMITH_JMP_START(emith_invert_cond(cond)); + emith_jump(target); + EMITH_JMP_END(emith_invert_cond(cond)); + } + } else { + emith_jump(target); + rcache_invalidate(); + } + } else if (blx_target_count < MAX_LOCAL_BRANCHES) { + // local forward jump target = tcache_ptr; - branch_patch_pc[branch_patch_count] = target_pc; - branch_patch_ptr[branch_patch_count] = target; - branch_patch_count++; - } - else - dbg(1, "warning: too many local branches"); + blx_targets[blx_target_count++] = + (struct linkage) { .pc = target_pc, .ptr = target, .mask = 0x2 }; + if (cond != -1) + emith_jump_cond_patchable(cond, 
target); + else { + emith_jump_patchable(target); + rcache_invalidate(); + } + } else + // no space for resolving forward branch, handle it as external + dbg(1, "warning: too many unresolved branches"); } if (target == NULL) -#endif { // can't resolve branch locally, make a block exit - emit_move_r_imm32(SHR_PC, target_pc); - rcache_clean(); + bl = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id); + if (cond != -1) { +#if 1 + if (bl && blx_target_count < ARRAY_SIZE(blx_targets)) { + // conditional jumps get a blx stub for the far jump + bl->type = BL_JCCBLX; + target = tcache_ptr; + blx_targets[blx_target_count++] = + (struct linkage) { .pc = target_pc, .ptr = target, .bl = bl }; + emith_jump_cond_patchable(cond, target); + } else { + // not linkable, or blx table full; inline jump @dispatcher + EMITH_JMP_START(emith_invert_cond(cond)); + if (bl) { + bl->jump = tcache_ptr; + emith_flush(); // flush to inhibit insn swapping + bl->type = BL_LDJMP; + } + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, target_pc); + rcache_free_tmp(tmp); + target = sh2_drc_dispatcher; - target = dr_prepare_ext_branch(target_pc, sh2->is_slave, tcache_id); - if (target == NULL) - return NULL; - } + emith_jump_patchable(target); + EMITH_JMP_END(emith_invert_cond(cond)); + } +#else + // jump @dispatcher - ARM 32bit version with conditional execution + EMITH_SJMP_START(emith_invert_cond(cond)); + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm_c(cond, tmp, target_pc); + rcache_free_tmp(tmp); + target = sh2_drc_dispatcher; - if (cond != -1) { - emith_jump_cond_patchable(cond, target); - EMITH_SJMP_END_(ncond); - } - else { - emith_jump_patchable(target); - rcache_invalidate(); + if (bl) { + bl->jump = tcache_ptr; + bl->type = BL_JMP; + } + emith_jump_cond_patchable(cond, target); + EMITH_SJMP_END(emith_invert_cond(cond)); +#endif + } else { + // unconditional, has the far jump inlined + if (bl) { + emith_flush(); // flush to inhibit insn swapping + bl->type = 
BL_LDJMP; + } + + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, target_pc); + rcache_free_tmp(tmp); + target = sh2_drc_dispatcher; + + emith_jump_patchable(target); + rcache_invalidate(); + } } +#if CALL_STACK + if (rtsadd) + emith_move_r_imm_s8_patch(rtsadd, tcache_ptr - (u8 *)rtsret); +#endif + + // branch not taken, correct cycle count + if (ctaken) + cycles -= ctaken; + // set T bit to reflect branch not taken for OP_BRANCH_CT/CF + if (emith_get_t_cond() >= 0) // T is synced for all other cases + emith_set_t(sr, opd_b->op == OP_BRANCH_CF); + drcf.pending_branch_direct = 0; + if (target_pc >= base_pc && target_pc < pc) + drcf.polling = drcf.loop_type = 0; } else if (drcf.pending_branch_indirect) { - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + u32 target_pc; + + tmp = rcache_get_reg_arg(0, SHR_PC, NULL); + +#if CALL_STACK + struct op_data *opd_b = (op_flags[i] & OF_DELAY_OP) ? opd-1 : opd; + void *rtsadd = NULL, *rtsret = NULL; + + if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) { + // JSR, BSRF - save rts data + tmp = rcache_get_tmp_arg(1); + rtsadd = tcache_ptr; + emith_move_r_imm_s8_patchable(tmp, 0); + rcache_clean_tmp(); + rcache_invalidate_tmp(); + emith_call(sh2_drc_dispatcher_call); + rtsret = tcache_ptr; + } +#endif + + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); - rcache_flush(); - emith_jump(sh2_drc_dispatcher); + emith_sync_t(sr); + rcache_clean(); + +#if CALL_STACK + if (opd_b->rm == SHR_PR) { + // RTS - restore rts data, else jump to dispatcher + emith_jump(sh2_drc_dispatcher_return); + } else +#endif + if (gconst_get(SHR_PC, &target_pc)) { + // JMP, JSR, BRAF, BSRF const - treat like unconditional direct branch + bl = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id); + if (bl) // pc already loaded somewhere else, can patch jump only + bl->type = BL_JMP; + emith_jump_patchable(sh2_drc_dispatcher); + } else { + // JMP, JSR, BRAF, BSRF not const + emith_jump(sh2_drc_dispatcher); + } + 
rcache_invalidate(); + +#if CALL_STACK + if (rtsadd) + emith_move_r_imm_s8_patch(rtsadd, tcache_ptr - (u8 *)rtsret); +#endif + drcf.pending_branch_indirect = 0; + drcf.polling = drcf.loop_type = 0; } + rcache_unlock_all(); do_host_disasm(tcache_id); } - tmp = rcache_get_reg(SHR_SR, RC_GR_RMW); - FLUSH_CYCLES(tmp); - rcache_flush(); - // check the last op if (op_flags[i-1] & OF_DELAY_OP) opd = &ops[i-2]; else opd = &ops[i-1]; - if (opd->op != OP_BRANCH && opd->op != OP_BRANCH_R - && opd->op != OP_BRANCH_RF && opd->op != OP_RTE) + if (! OP_ISBRAUC(opd->op)) { - void *target; - - emit_move_r_imm32(SHR_PC, pc); - rcache_flush(); - - target = dr_prepare_ext_branch(pc, sh2->is_slave, tcache_id); - if (target == NULL) - return NULL; - emith_jump_patchable(target); - } - - // link local branches - for (i = 0; i < branch_patch_count; i++) { - void *target; - int t; - t = find_in_array(branch_target_pc, branch_target_count, branch_patch_pc[i]); - target = branch_target_ptr[t]; - if (target == NULL) { - // flush pc and go back to dispatcher (this should no longer happen) - dbg(1, "stray branch to %08x %p", branch_patch_pc[i], tcache_ptr); - target = tcache_ptr; - emit_move_r_imm32(SHR_PC, branch_patch_pc[i]); - rcache_flush(); - emith_jump(sh2_drc_dispatcher); + tmp = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + FLUSH_CYCLES(tmp); + emith_sync_t(tmp); + + rcache_clean(); + bl = dr_prepare_ext_branch(block->entryp, pc, sh2->is_slave, tcache_id); + if (bl) { + emith_flush(); // flush to inhibit insn swapping + bl->type = BL_LDJMP; } - emith_jump_patch(branch_patch_ptr[i], target); - } - - // mark memory blocks as containing compiled code - // override any overlay blocks as they become unreachable anyway - if ((block->addr & 0xc7fc0000) == 0x06000000 - || (block->addr & 0xfffff000) == 0xc0000000) - { - u16 *drc_ram_blk = NULL; - u32 addr, mask = 0, shift = 0; + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, pc); + emith_jump_patchable(sh2_drc_dispatcher); + 
rcache_invalidate(); + } else + rcache_flush(); - if (tcache_id != 0) { - // data array, BIOS - drc_ram_blk = Pico32xMem->drcblk_da[sh2->is_slave]; - shift = SH2_DRCBLK_DA_SHIFT; - mask = 0xfff; - } - else { - // SDRAM - drc_ram_blk = Pico32xMem->drcblk_ram; - shift = SH2_DRCBLK_RAM_SHIFT; - mask = 0x3ffff; - } + // link unresolved branches, emitting blx area entries as needed + emit_branch_linkage_code(sh2, block, tcache_id, branch_targets, + branch_target_count, blx_targets, blx_target_count); - // mark recompiled insns - drc_ram_blk[(base_pc & mask) >> shift] = 1; - for (pc = base_pc; pc < end_pc; pc += 2) - drc_ram_blk[(pc & mask) >> shift] = 1; - - // mark literals - for (i = 0; i < literal_addr_count; i++) { - tmp = literal_addr[i]; - drc_ram_blk[(tmp & mask) >> shift] = 1; - } + emith_flush(); + do_host_disasm(tcache_id); - // add to invalidation lookup lists - addr = base_pc & ~(INVAL_PAGE_SIZE - 1); - for (; addr < end_literals; addr += INVAL_PAGE_SIZE) { - i = (addr & mask) / INVAL_PAGE_SIZE; - add_to_block_list(&inval_lookup[tcache_id][i], block); - } - } + emith_pool_commit(0); - tcache_ptrs[tcache_id] = tcache_ptr; + // fill blx backup; do this last to backup final patched code + for (i = 0; i < block->entry_count; i++) + for (bl = block->entryp[i].o_links; bl; bl = bl->o_next) + memcpy(bl->jdisp, bl->blx ?: bl->jump, emith_jump_at_size()); + ring_alloc(&tcache_ring[tcache_id], tcache_ptr - block_entry_ptr); host_instructions_updated(block_entry_ptr, tcache_ptr); + dr_activate_block(block, tcache_id, sh2->is_slave); + emith_update_cache(); + do_host_disasm(tcache_id); - if (drcf.literals_disabled && literal_addr_count) - dbg(1, "literals_disabled && literal_addr_count?"); - dbg(2, " block #%d,%d tcache %d/%d, insns %d -> %d %.3f", - tcache_id, blkid_main, - tcache_ptr - tcache_bases[tcache_id], tcache_sizes[tcache_id], + dbg(2, " block #%d,%d -> %p tcache %d/%d, insns %d -> %d %.3f", + tcache_id, blkid_main, tcache_ptr, + tcache_ring[tcache_id].used, 
tcache_ring[tcache_id].size, insns_compiled, host_insn_count, (float)host_insn_count / insns_compiled); if ((sh2->pc & 0xc6000000) == 0x02000000) { // ROM - dbg(2, " hash collisions %d/%d", hash_collisions, block_counts[tcache_id]); + dbg(2, " hash collisions %d/%d", hash_collisions, block_ring[tcache_id].used); Pico32x.emu_flags |= P32XF_DRC_ROM_C; } /* @@ -2923,7 +4919,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) printf("~~~\n"); */ -#if (DRC_DEBUG & 4) +#if (DRC_DEBUG) fflush(stdout); #endif @@ -2932,54 +4928,245 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) static void sh2_generate_utils(void) { - int arg0, arg1, arg2, sr, tmp; - - sh2_drc_write32 = p32x_sh2_write32; - sh2_drc_read8 = p32x_sh2_read8; - sh2_drc_read16 = p32x_sh2_read16; - sh2_drc_read32 = p32x_sh2_read32; + int arg0, arg1, arg2, arg3, sr, tmp, tmp2; +#if DRC_DEBUG + int hic = host_insn_count; // don't count utils for insn statistics +#endif host_arg2reg(arg0, 0); host_arg2reg(arg1, 1); host_arg2reg(arg2, 2); + host_arg2reg(arg3, 3); emith_move_r_r(arg0, arg0); // nop + emith_flush(); + + // sh2_drc_write8(u32 a, u32 d) + sh2_drc_write8 = (void *)tcache_ptr; + emith_ctx_read_ptr(arg2, offsetof(SH2, write8_tab)); + emith_sh2_wcall(arg0, arg1, arg2, arg3); + emith_flush(); + + // sh2_drc_write16(u32 a, u32 d) + sh2_drc_write16 = (void *)tcache_ptr; + emith_ctx_read_ptr(arg2, offsetof(SH2, write16_tab)); + emith_sh2_wcall(arg0, arg1, arg2, arg3); + emith_flush(); + + // sh2_drc_write32(u32 a, u32 d) + sh2_drc_write32 = (void *)tcache_ptr; + emith_ctx_read_ptr(arg2, offsetof(SH2, write32_tab)); + emith_sh2_wcall(arg0, arg1, arg2, arg3); + emith_flush(); + + // d = sh2_drc_read8(u32 a) + sh2_drc_read8 = (void *)tcache_ptr; + emith_ctx_read_ptr(arg1, offsetof(SH2, read8_map)); + EMITH_HINT_COND(DCOND_CS); + emith_sh2_rcall(arg0, arg1, arg2, arg3); + EMITH_SJMP_START(DCOND_CS); + emith_and_r_r_c(DCOND_CC, arg0, arg3); + emith_eor_r_imm_ptr_c(DCOND_CC, arg0, 1); 
+ emith_read8s_r_r_r_c(DCOND_CC, RET_REG, arg2, arg0); + emith_ret_c(DCOND_CC); + EMITH_SJMP_END(DCOND_CS); + emith_move_r_r_ptr(arg1, CONTEXT_REG); + emith_jump_reg(arg2); + emith_flush(); + + // d = sh2_drc_read16(u32 a) + sh2_drc_read16 = (void *)tcache_ptr; + emith_ctx_read_ptr(arg1, offsetof(SH2, read16_map)); + EMITH_HINT_COND(DCOND_CS); + emith_sh2_rcall(arg0, arg1, arg2, arg3); + EMITH_SJMP_START(DCOND_CS); + emith_and_r_r_c(DCOND_CC, arg0, arg3); + emith_read16s_r_r_r_c(DCOND_CC, RET_REG, arg2, arg0); + emith_ret_c(DCOND_CC); + EMITH_SJMP_END(DCOND_CS); + emith_move_r_r_ptr(arg1, CONTEXT_REG); + emith_jump_reg(arg2); + emith_flush(); + + // d = sh2_drc_read32(u32 a) + sh2_drc_read32 = (void *)tcache_ptr; + emith_ctx_read_ptr(arg1, offsetof(SH2, read32_map)); + EMITH_HINT_COND(DCOND_CS); + emith_sh2_rcall(arg0, arg1, arg2, arg3); + EMITH_SJMP_START(DCOND_CS); + emith_and_r_r_c(DCOND_CC, arg0, arg3); + emith_read_r_r_r_c(DCOND_CC, RET_REG, arg2, arg0); + emith_ror_c(DCOND_CC, RET_REG, RET_REG, 16); + emith_ret_c(DCOND_CC); + EMITH_SJMP_END(DCOND_CS); + emith_move_r_r_ptr(arg1, CONTEXT_REG); + emith_jump_reg(arg2); + emith_flush(); + + // d = sh2_drc_read8_poll(u32 a) + sh2_drc_read8_poll = (void *)tcache_ptr; + emith_ctx_read_ptr(arg1, offsetof(SH2, read8_map)); + EMITH_HINT_COND(DCOND_CS); + emith_sh2_rcall(arg0, arg1, arg2, arg3); + EMITH_SJMP_START(DCOND_CC); + emith_move_r_r_ptr_c(DCOND_CS, arg1, CONTEXT_REG); + emith_jump_reg_c(DCOND_CS, arg2); + EMITH_SJMP_END(DCOND_CC); + emith_and_r_r_r(arg1, arg0, arg3); + emith_eor_r_imm_ptr(arg1, 1); + emith_read8s_r_r_r(arg1, arg2, arg1); + emith_push_ret(arg1); + emith_move_r_r_ptr(arg2, CONTEXT_REG); + emith_call(p32x_sh2_poll_memory8); + emith_pop_and_ret(arg1); + emith_flush(); + + // d = sh2_drc_read16_poll(u32 a) + sh2_drc_read16_poll = (void *)tcache_ptr; + emith_ctx_read_ptr(arg1, offsetof(SH2, read16_map)); + EMITH_HINT_COND(DCOND_CS); + emith_sh2_rcall(arg0, arg1, arg2, arg3); + 
EMITH_SJMP_START(DCOND_CC); + emith_move_r_r_ptr_c(DCOND_CS, arg1, CONTEXT_REG); + emith_jump_reg_c(DCOND_CS, arg2); + EMITH_SJMP_END(DCOND_CC); + emith_and_r_r_r(arg1, arg0, arg3); + emith_read16s_r_r_r(arg1, arg2, arg1); + emith_push_ret(arg1); + emith_move_r_r_ptr(arg2, CONTEXT_REG); + emith_call(p32x_sh2_poll_memory16); + emith_pop_and_ret(arg1); + emith_flush(); + + // d = sh2_drc_read32_poll(u32 a) + sh2_drc_read32_poll = (void *)tcache_ptr; + emith_ctx_read_ptr(arg1, offsetof(SH2, read32_map)); + EMITH_HINT_COND(DCOND_CS); + emith_sh2_rcall(arg0, arg1, arg2, arg3); + EMITH_SJMP_START(DCOND_CC); + emith_move_r_r_ptr_c(DCOND_CS, arg1, CONTEXT_REG); + emith_jump_reg_c(DCOND_CS, arg2); + EMITH_SJMP_END(DCOND_CC); + emith_and_r_r_r(arg1, arg0, arg3); + emith_read_r_r_r(arg1, arg2, arg1); + emith_ror(arg1, arg1, 16); + emith_push_ret(arg1); + emith_move_r_r_ptr(arg2, CONTEXT_REG); + emith_call(p32x_sh2_poll_memory32); + emith_pop_and_ret(arg1); + emith_flush(); - // sh2_drc_exit(void) + // sh2_drc_exit(u32 pc) sh2_drc_exit = (void *)tcache_ptr; + emith_ctx_write(arg0, SHR_PC * 4); emit_do_static_regs(1, arg2); emith_sh2_drc_exit(); + emith_flush(); - // sh2_drc_dispatcher(void) + // sh2_drc_dispatcher(u32 pc) sh2_drc_dispatcher = (void *)tcache_ptr; - sr = rcache_get_reg(SHR_SR, RC_GR_READ); - emith_cmp_r_imm(sr, 0); - emith_jump_cond(DCOND_LT, sh2_drc_exit); - rcache_invalidate(); - emith_ctx_read(arg0, SHR_PC * 4); - emith_ctx_read(arg1, offsetof(SH2, is_slave)); + emith_ctx_write(arg0, SHR_PC * 4); +#if BRANCH_CACHE + // check if PC is in branch target cache + emith_and_r_r_imm(arg1, arg0, (ARRAY_SIZE(sh2s->branch_cache)-1)*8); + emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg1, sizeof(void *) == 8 ? 
1 : 0); + emith_read_r_r_offs(arg2, arg1, offsetof(SH2, branch_cache)); + emith_cmp_r_r(arg2, arg0); + EMITH_SJMP_START(DCOND_NE); +#if (DRC_DEBUG & 128) + emith_move_r_ptr_imm(arg2, (uptr)&bchit); + emith_read_r_r_offs_c(DCOND_EQ, arg3, arg2, 0); + emith_add_r_imm_c(DCOND_EQ, arg3, 1); + emith_write_r_r_offs_c(DCOND_EQ, arg3, arg2, 0); +#endif + emith_read_r_r_offs_ptr_c(DCOND_EQ, RET_REG, arg1, offsetof(SH2, branch_cache) + sizeof(void *)); + emith_jump_reg_c(DCOND_EQ, RET_REG); + EMITH_SJMP_END(DCOND_NE); +#endif + emith_move_r_r_ptr(arg1, CONTEXT_REG); emith_add_r_r_ptr_imm(arg2, CONTEXT_REG, offsetof(SH2, drc_tmp)); emith_call(dr_lookup_block); - emit_block_entry(); + // store PC and block entry ptr (in arg0) in branch target cache + emith_tst_r_r_ptr(RET_REG, RET_REG); + EMITH_SJMP_START(DCOND_EQ); +#if BRANCH_CACHE +#if (DRC_DEBUG & 128) + emith_move_r_ptr_imm(arg2, (uptr)&bcmiss); + emith_read_r_r_offs_c(DCOND_NE, arg3, arg2, 0); + emith_add_r_imm_c(DCOND_NE, arg3, 1); + emith_write_r_r_offs_c(DCOND_NE, arg3, arg2, 0); +#endif + emith_ctx_read_c(DCOND_NE, arg2, SHR_PC * 4); + emith_and_r_r_imm(arg1, arg2, (ARRAY_SIZE(sh2s->branch_cache)-1)*8); + emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg1, sizeof(void *) == 8 ? 
1 : 0); + emith_write_r_r_offs_c(DCOND_NE, arg2, arg1, offsetof(SH2, branch_cache)); + emith_write_r_r_offs_ptr_c(DCOND_NE, RET_REG, arg1, offsetof(SH2, branch_cache) + sizeof(void *)); +#endif + emith_jump_reg_c(DCOND_NE, RET_REG); + EMITH_SJMP_END(DCOND_EQ); // lookup failed, call sh2_translate() emith_move_r_r_ptr(arg0, CONTEXT_REG); emith_ctx_read(arg1, offsetof(SH2, drc_tmp)); // tcache_id emith_call(sh2_translate); - emit_block_entry(); - // sh2_translate() failed, flush cache and retry - emith_ctx_read(arg0, offsetof(SH2, drc_tmp)); - emith_call(flush_tcache); - emith_move_r_r_ptr(arg0, CONTEXT_REG); - emith_ctx_read(arg1, offsetof(SH2, drc_tmp)); - emith_call(sh2_translate); - emit_block_entry(); + emith_tst_r_r_ptr(RET_REG, RET_REG); + EMITH_SJMP_START(DCOND_EQ); + emith_jump_reg_c(DCOND_NE, RET_REG); + EMITH_SJMP_END(DCOND_EQ); // XXX: can't translate, fail emith_call(dr_failure); + emith_flush(); + +#if CALL_STACK + // pc = sh2_drc_dispatcher_call(u32 pc) + sh2_drc_dispatcher_call = (void *)tcache_ptr; + emith_ctx_read(arg2, offsetof(SH2, rts_cache_idx)); + emith_add_r_imm(arg2, (u32)(2*sizeof(void *))); + emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *)); + emith_ctx_write(arg2, offsetof(SH2, rts_cache_idx)); + emith_add_r_r_r_lsl_ptr(arg3, CONTEXT_REG, arg2, 0); + rcache_get_reg_arg(2, SHR_PR, NULL); + emith_add_r_ret(arg1); + emith_write_r_r_offs_ptr(arg1, arg3, offsetof(SH2, rts_cache)+sizeof(void *)); + emith_write_r_r_offs(arg2, arg3, offsetof(SH2, rts_cache)); + rcache_flush(); + emith_ret(); + emith_flush(); + + // sh2_drc_dispatcher_return(u32 pc) + sh2_drc_dispatcher_return = (void *)tcache_ptr; + emith_ctx_read(arg2, offsetof(SH2, rts_cache_idx)); + emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg2, 0); + emith_read_r_r_offs(arg3, arg1, offsetof(SH2, rts_cache)); + emith_cmp_r_r(arg0, arg3); +#if (DRC_DEBUG & 128) + EMITH_SJMP_START(DCOND_EQ); + emith_move_r_ptr_imm(arg3, (uptr)&rcmiss); + emith_read_r_r_offs_c(DCOND_NE, 
arg1, arg3, 0); + emith_add_r_imm_c(DCOND_NE, arg1, 1); + emith_write_r_r_offs_c(DCOND_NE, arg1, arg3, 0); + emith_jump_cond(DCOND_NE, sh2_drc_dispatcher); + EMITH_SJMP_END(DCOND_EQ); +#else + emith_jump_cond(DCOND_NE, sh2_drc_dispatcher); +#endif + emith_read_r_r_offs_ptr(arg0, arg1, offsetof(SH2, rts_cache) + sizeof(void *)); + emith_sub_r_imm(arg2, (u32)(2*sizeof(void *))); + emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *)); + emith_ctx_write(arg2, offsetof(SH2, rts_cache_idx)); +#if (DRC_DEBUG & 128) + emith_move_r_ptr_imm(arg3, (uptr)&rchit); + emith_read_r_r_offs(arg1, arg3, 0); + emith_add_r_imm(arg1, 1); + emith_write_r_r_offs(arg1, arg3, 0); +#endif + emith_jump_reg(arg0); + emith_flush(); +#endif // sh2_drc_test_irq(void) // assumes it's called from main function (may jump to dispatcher) sh2_drc_test_irq = (void *)tcache_ptr; emith_ctx_read(arg1, offsetof(SH2, pending_level)); - sr = rcache_get_reg(SHR_SR, RC_GR_READ); + sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); emith_lsr(arg0, sr, I_SHIFT); emith_and_r_imm(arg0, 0x0f); emith_cmp_r_r(arg1, arg0); // pending_level > ((sr >> 4) & 0x0f)? @@ -2987,26 +5174,26 @@ static void sh2_generate_utils(void) emith_ret_c(DCOND_LE); // nope, return EMITH_SJMP_END(DCOND_GT); // adjust SP - tmp = rcache_get_reg(SHR_SP, RC_GR_RMW); + tmp = rcache_get_reg(SHR_SP, RC_GR_RMW, NULL); emith_sub_r_imm(tmp, 4*2); rcache_clean(); // push SR - tmp = rcache_get_reg_arg(0, SHR_SP); - emith_add_r_imm(tmp, 4); - tmp = rcache_get_reg_arg(1, SHR_SR); + tmp = rcache_get_reg_arg(0, SHR_SP, &tmp2); + emith_add_r_r_imm(tmp, tmp2, 4); + tmp = rcache_get_reg_arg(1, SHR_SR, NULL); emith_clear_msb(tmp, tmp, 22); emith_move_r_r_ptr(arg2, CONTEXT_REG); + rcache_invalidate_tmp(); emith_call(p32x_sh2_write32); // XXX: use sh2_drc_write32? 
- rcache_invalidate(); // push PC - rcache_get_reg_arg(0, SHR_SP); - emith_ctx_read(arg1, SHR_PC * 4); + rcache_get_reg_arg(0, SHR_SP, NULL); + rcache_get_reg_arg(1, SHR_PC, NULL); emith_move_r_r_ptr(arg2, CONTEXT_REG); + rcache_invalidate_tmp(); emith_call(p32x_sh2_write32); - rcache_invalidate(); // update I, cycles, do callback emith_ctx_read(arg1, offsetof(SH2, pending_level)); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_bic_r_imm(sr, I); emith_or_r_r_lsl(sr, arg1, I_SHIFT); emith_sub_r_imm(sr, 13 << 12); // at least 13 cycles @@ -3014,16 +5201,15 @@ static void sh2_generate_utils(void) emith_move_r_r_ptr(arg0, CONTEXT_REG); emith_call_ctx(offsetof(SH2, irq_callback)); // vector = sh2->irq_callback(sh2, level); // obtain new PC - emith_lsl(arg0, RET_REG, 2); - emith_ctx_read(arg1, SHR_VBR * 4); - emith_add_r_r(arg0, arg1); - tmp = emit_memhandler_read(2); - emith_ctx_write(tmp, SHR_PC * 4); -#if defined(__i386__) || defined(__x86_64__) - emith_add_r_r_ptr_imm(xSP, xSP, sizeof(void *)); // fix stack -#endif - emith_jump(sh2_drc_dispatcher); + tmp = rcache_get_reg_arg(1, SHR_VBR, &tmp2); + emith_add_r_r_r_lsl(arg0, tmp2, RET_REG, 2); + emith_call(sh2_drc_read32); + if (arg0 != RET_REG) + emith_move_r_r(arg0, RET_REG); + emith_call_cleanup(); rcache_invalidate(); + emith_jump(sh2_drc_dispatcher); + emith_flush(); // sh2_drc_entry(SH2 *sh2) sh2_drc_entry = (void *)tcache_ptr; @@ -3031,17 +5217,27 @@ static void sh2_generate_utils(void) emith_move_r_r_ptr(CONTEXT_REG, arg0); // move ctx, arg0 emit_do_static_regs(0, arg2); emith_call(sh2_drc_test_irq); + emith_ctx_read(arg0, SHR_PC * 4); emith_jump(sh2_drc_dispatcher); + emith_flush(); - // sh2_drc_write8(u32 a, u32 d) - sh2_drc_write8 = (void *)tcache_ptr; - emith_ctx_read_ptr(arg2, offsetof(SH2, write8_tab)); - emith_sh2_wcall(arg0, arg2); +#ifdef DRC_SR_REG + // sh2_drc_save_sr(SH2 *sh2) + sh2_drc_save_sr = (void *)tcache_ptr; + tmp = rcache_get_reg(SHR_SR, 
RC_GR_READ, NULL); + emith_write_r_r_offs(tmp, arg0, SHR_SR * 4); + rcache_invalidate(); + emith_ret(); + emith_flush(); - // sh2_drc_write16(u32 a, u32 d) - sh2_drc_write16 = (void *)tcache_ptr; - emith_ctx_read_ptr(arg2, offsetof(SH2, write16_tab)); - emith_sh2_wcall(arg0, arg2); + // sh2_drc_restore_sr(SH2 *sh2) + sh2_drc_restore_sr = (void *)tcache_ptr; + tmp = rcache_get_reg(SHR_SR, RC_GR_WRITE, NULL); + emith_read_r_r_offs(tmp, arg0, SHR_SR * 4); + rcache_flush(); + emith_ret(); + emith_flush(); +#endif #ifdef PDB_NET // debug @@ -3056,6 +5252,7 @@ static void sh2_generate_utils(void) emith_adc_r_imm(arg2, 0x01000000); \ emith_ctx_write(arg2, offsetof(SH2, pdb_io_csum[1])); \ emith_pop_and_ret(); \ + emith_flush(); \ func = tmp; \ } #define MAKE_WRITE_WRAPPER(func) { \ @@ -3068,6 +5265,7 @@ static void sh2_generate_utils(void) emith_ctx_write(arg2, offsetof(SH2, pdb_io_csum[1])); \ emith_move_r_r_ptr(arg2, CONTEXT_REG); \ emith_jump(func); \ + emith_flush(); \ func = tmp; \ } @@ -3077,157 +5275,112 @@ static void sh2_generate_utils(void) MAKE_WRITE_WRAPPER(sh2_drc_write8); MAKE_WRITE_WRAPPER(sh2_drc_write16); MAKE_WRITE_WRAPPER(sh2_drc_write32); -#if (DRC_DEBUG & 4) - host_dasm_new_symbol(sh2_drc_read8); - host_dasm_new_symbol(sh2_drc_read16); - host_dasm_new_symbol(sh2_drc_read32); - host_dasm_new_symbol(sh2_drc_write32); -#endif + MAKE_READ_WRAPPER(sh2_drc_read8_poll); + MAKE_READ_WRAPPER(sh2_drc_read16_poll); + MAKE_READ_WRAPPER(sh2_drc_read32_poll); #endif + emith_pool_commit(0); rcache_invalidate(); #if (DRC_DEBUG & 4) host_dasm_new_symbol(sh2_drc_entry); host_dasm_new_symbol(sh2_drc_dispatcher); +#if CALL_STACK + host_dasm_new_symbol(sh2_drc_dispatcher_call); + host_dasm_new_symbol(sh2_drc_dispatcher_return); +#endif host_dasm_new_symbol(sh2_drc_exit); host_dasm_new_symbol(sh2_drc_test_irq); host_dasm_new_symbol(sh2_drc_write8); host_dasm_new_symbol(sh2_drc_write16); + host_dasm_new_symbol(sh2_drc_write32); + host_dasm_new_symbol(sh2_drc_read8); + 
host_dasm_new_symbol(sh2_drc_read16); + host_dasm_new_symbol(sh2_drc_read32); + host_dasm_new_symbol(sh2_drc_read8_poll); + host_dasm_new_symbol(sh2_drc_read16_poll); + host_dasm_new_symbol(sh2_drc_read32_poll); +#ifdef DRC_SR_REG + host_dasm_new_symbol(sh2_drc_save_sr); + host_dasm_new_symbol(sh2_drc_restore_sr); +#endif #endif -} - -static void sh2_smc_rm_block(struct block_desc *bd, int tcache_id, u32 ram_mask) -{ - u32 i, addr, end_addr; - void *tmp; - - dbg(2, " killing block %08x-%08x-%08x, blkid %d,%d", - bd->addr, bd->addr + bd->size_nolit, bd->addr + bd->size, - tcache_id, bd - block_tables[tcache_id]); - if (bd->addr == 0 || bd->entry_count == 0) { - dbg(1, " killing dead block!? %08x", bd->addr); - return; - } - - // remove from inval_lookup - addr = bd->addr & ~(INVAL_PAGE_SIZE - 1); - end_addr = bd->addr + bd->size; - for (; addr < end_addr; addr += INVAL_PAGE_SIZE) { - i = (addr & ram_mask) / INVAL_PAGE_SIZE; - rm_from_block_list(&inval_lookup[tcache_id][i], bd); - } - - tmp = tcache_ptr; - - // remove from hash table, make incoming links unresolved - // XXX: maybe patch branches w/flush instead? 
- for (i = 0; i < bd->entry_count; i++) { - rm_from_hashlist(&bd->entryp[i], tcache_id); - - // since we never reuse tcache space of dead blocks, - // insert jump to dispatcher for blocks that are linked to this - tcache_ptr = bd->entryp[i].tcache_ptr; - emit_move_r_imm32(SHR_PC, bd->entryp[i].pc); - rcache_flush(); - emith_jump(sh2_drc_dispatcher); - - host_instructions_updated(bd->entryp[i].tcache_ptr, tcache_ptr); - - unregister_links(&bd->entryp[i], tcache_id); - } - - tcache_ptr = tmp; - bd->addr = bd->size = bd->size_nolit = 0; - bd->entry_count = 0; +#if DRC_DEBUG + host_insn_count = hic; +#endif } -/* -04205:243: == msh2 block #0,200 060017a8-060017f0 -> 0x27cb9c - 060017a8 d11c MOV.L @($70,PC),R1 ; @$0600181c - -04230:261: msh2 xsh w32 [260017a8] d225e304 -04230:261: msh2 smc check @260017a8 -04239:226: = ssh2 enter 060017a8 0x27cb9c, c=173 -*/ -static void sh2_smc_rm_blocks(u32 a, u16 *drc_ram_blk, int tcache_id, u32 shift, u32 mask) +static void sh2_smc_rm_blocks(u32 a, int len, int tcache_id, u32 shift) { - struct block_list **blist = NULL, *entry; + struct block_list **blist, *entry, *next; + u32 mask = RAM_SIZE(tcache_id) - 1; + u32 wtmask = ~0x20000000; // writethrough area mask + u32 start_addr, end_addr; + u32 start_lit, end_lit; struct block_desc *block; - u32 start_addr, end_addr, taddr, i; - u32 from = ~0, to = 0; +#if (DRC_DEBUG & 2) + int removed = 0; +#endif // ignore cache-through - a &= ~0x20000000; + a &= wtmask; blist = &inval_lookup[tcache_id][(a & mask) / INVAL_PAGE_SIZE]; entry = *blist; + // go through the block list for this range while (entry != NULL) { + next = entry->next; block = entry->block; - start_addr = block->addr & ~0x20000000; + start_addr = block->addr & wtmask; end_addr = start_addr + block->size; - if (start_addr <= a && a < end_addr) { - // get addr range that includes all removed blocks - if (from > start_addr) - from = start_addr; - if (to < end_addr) - to = end_addr; - - if (a >= start_addr + block->size_nolit) - 
literal_disabled_frames = 3; - sh2_smc_rm_block(block, tcache_id, mask); - - // entry lost, restart search - entry = *blist; - continue; + start_lit = block->addr_lit & wtmask; + end_lit = start_lit + block->size_lit; + // disable/delete block if it covers the modified address + if ((start_addr < a+len && a < end_addr) || + (start_lit < a+len && a < end_lit)) + { + dbg(2, "smc remove @%08x", a); + end_addr = (start_lit < a+len && block->size_lit ? a : 0); + dr_rm_block_entry(block, tcache_id, end_addr, 0); +#if (DRC_DEBUG & 2) + removed = 1; +#endif } - entry = entry->next; + entry = next; } - - if (from >= to) - return; - - // update range around a to match latest state - from &= ~(INVAL_PAGE_SIZE - 1); - to |= (INVAL_PAGE_SIZE - 1); - for (taddr = from; taddr < to; taddr += INVAL_PAGE_SIZE) { - i = (taddr & mask) / INVAL_PAGE_SIZE; - entry = inval_lookup[tcache_id][i]; - - for (; entry != NULL; entry = entry->next) { - block = entry->block; - - start_addr = block->addr & ~0x20000000; - if (start_addr > a) { - if (to > start_addr) - to = start_addr; - } - else { - end_addr = start_addr + block->size; - if (from < end_addr) - from = end_addr; - } - } +#if (DRC_DEBUG & 2) + if (!removed) + dbg(2, "rm_blocks called @%08x, no work?", a); +#endif +#if BRANCH_CACHE + if (tcache_id) + memset32(sh2s[tcache_id-1].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + else { + memset32(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4); } - - // clear code marks - if (from < to) { - u16 *p = drc_ram_blk + ((from & mask) >> shift); - memset(p, 0, (to - from) >> (shift - 1)); +#endif +#if CALL_STACK + if (tcache_id) { + memset32(sh2s[tcache_id-1].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4); + sh2s[tcache_id-1].rts_cache_idx = 0; + } else { + memset32(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4); + memset32(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)/4); + sh2s[0].rts_cache_idx = 
sh2s[1].rts_cache_idx = 0; } +#endif } -void sh2_drc_wcheck_ram(unsigned int a, int val, int cpuid) +void sh2_drc_wcheck_ram(u32 a, unsigned len, SH2 *sh2) { - dbg(2, "%csh2 smc check @%08x", cpuid ? 's' : 'm', a); - sh2_smc_rm_blocks(a, Pico32xMem->drcblk_ram, 0, SH2_DRCBLK_RAM_SHIFT, 0x3ffff); + sh2_smc_rm_blocks(a, len, 0, SH2_DRCBLK_RAM_SHIFT); } -void sh2_drc_wcheck_da(unsigned int a, int val, int cpuid) +void sh2_drc_wcheck_da(u32 a, unsigned len, SH2 *sh2) { - dbg(2, "%csh2 smc check @%08x", cpuid ? 's' : 'm', a); - sh2_smc_rm_blocks(a, Pico32xMem->drcblk_da[cpuid], - 1 + cpuid, SH2_DRCBLK_DA_SHIFT, 0xfff); + sh2_smc_rm_blocks(a, len, 1 + sh2->is_slave, SH2_DRCBLK_DA_SHIFT); } int sh2_execute_drc(SH2 *sh2c, int cycles) @@ -3239,77 +5392,195 @@ int sh2_execute_drc(SH2 *sh2c, int cycles) // others are usual SH2 flags sh2c->sr &= 0x3f3; sh2c->sr |= cycles << 12; + + sh2c->state |= SH2_IN_DRC; sh2_drc_entry(sh2c); + sh2c->state &= ~SH2_IN_DRC; // TODO: irq cycles - ret_cycles = (signed int)sh2c->sr >> 12; + ret_cycles = (int32_t)sh2c->sr >> 12; if (ret_cycles > 0) - dbg(1, "warning: drc returned with cycles: %d", ret_cycles); + dbg(1, "warning: drc returned with cycles: %d, pc %08x", ret_cycles, sh2c->pc); sh2c->sr &= 0x3f3; return ret_cycles; } -#if (DRC_DEBUG & 2) -void block_stats(void) +static void block_stats(void) { - int c, b, i, total = 0; +#if (DRC_DEBUG & 2) + int c, b, i; + long total = 0; printf("block stats:\n"); - for (b = 0; b < ARRAY_SIZE(block_tables); b++) - for (i = 0; i < block_counts[b]; i++) + for (b = 0; b < ARRAY_SIZE(block_tables); b++) { + for (i = block_ring[b].first; i != block_ring[b].next; i = (i+1)%block_ring[b].size) if (block_tables[b][i].addr != 0) total += block_tables[b][i].refcount; + } + printf("total: %ld\n",total); - for (c = 0; c < 10; c++) { + for (c = 0; c < 20; c++) { struct block_desc *blk, *maxb = NULL; int max = 0; for (b = 0; b < ARRAY_SIZE(block_tables); b++) { - for (i = 0; i < block_counts[b]; i++) { - blk = 
&block_tables[b][i]; - if (blk->addr != 0 && blk->refcount > max) { + for (i = block_ring[b].first; i != block_ring[b].next; i = (i+1)%block_ring[b].size) + if ((blk = &block_tables[b][i])->addr != 0 && blk->refcount > max) { max = blk->refcount; maxb = blk; } - } } if (maxb == NULL) break; - printf("%08x %9d %2.3f%%\n", maxb->addr, maxb->refcount, + printf("%08x %p %9d %2.3f%%\n", maxb->addr, maxb->tcache_ptr, maxb->refcount, (double)maxb->refcount / total * 100.0); maxb->refcount = 0; } - for (b = 0; b < ARRAY_SIZE(block_tables); b++) - for (i = 0; i < block_counts[b]; i++) + for (b = 0; b < ARRAY_SIZE(block_tables); b++) + for (i = block_ring[b].first; i != block_ring[b].next; i = (i+1)%block_ring[b].size) block_tables[b][i].refcount = 0; +#endif } -#else -#define block_stats() + +void entry_stats(void) +{ +#if (DRC_DEBUG & 32) + int c, b, i, j; + long total = 0; + + printf("block entry stats:\n"); + for (b = 0; b < ARRAY_SIZE(block_tables); b++) { + for (i = block_ring[b].first; i != block_ring[b].next; i = (i+1)%block_ring[b].size) + for (j = 0; j < block_tables[b][i].entry_count; j++) + total += block_tables[b][i].entryp[j].entry_count; + } + printf("total: %ld\n",total); + + for (c = 0; c < 20; c++) { + struct block_desc *blk; + struct block_entry *maxb = NULL; + int max = 0; + for (b = 0; b < ARRAY_SIZE(block_tables); b++) { + for (i = block_ring[b].first; i != block_ring[b].next; i = (i+1)%block_ring[b].size) { + blk = &block_tables[b][i]; + for (j = 0; j < blk->entry_count; j++) + if (blk->entryp[j].entry_count > max) { + max = blk->entryp[j].entry_count; + maxb = &blk->entryp[j]; + } + } + } + if (maxb == NULL) + break; + printf("%08x %p %9d %2.3f%%\n", maxb->pc, maxb->tcache_ptr, maxb->entry_count, + (double)100 * maxb->entry_count / total); + maxb->entry_count = 0; + } + + for (b = 0; b < ARRAY_SIZE(block_tables); b++) { + for (i = block_ring[b].first; i != block_ring[b].next; i = (i+1)%block_ring[b].size) + for (j = 0; j < 
block_tables[b][i].entry_count; j++) + block_tables[b][i].entryp[j].entry_count = 0; + } +#endif +} + +static void backtrace(void) +{ +#if (DRC_DEBUG & 1024) + int i; + printf("backtrace master:\n"); + for (i = 0; i < ARRAY_SIZE(csh2[0]); i++) + SH2_DUMP(&csh2[0][i], "bt msh2"); + printf("backtrace slave:\n"); + for (i = 0; i < ARRAY_SIZE(csh2[1]); i++) + SH2_DUMP(&csh2[1][i], "bt ssh2"); +#endif +} + +static void state_dump(void) +{ +#if (DRC_DEBUG & 2048) + int i; + + SH2_DUMP(&sh2s[0], "master"); + printf("VBR msh2: %x\n", sh2s[0].vbr); + for (i = 0; i < 0x60; i++) { + printf("%08x ",p32x_sh2_read32(sh2s[0].vbr + i*4, &sh2s[0])); + if ((i+1) % 8 == 0) printf("\n"); + } + printf("stack msh2: %x\n", sh2s[0].r[15]); + for (i = -0x30; i < 0x30; i++) { + printf("%08x ",p32x_sh2_read32(sh2s[0].r[15] + i*4, &sh2s[0])); + if ((i+1) % 8 == 0) printf("\n"); + } + SH2_DUMP(&sh2s[1], "slave"); + printf("VBR ssh2: %x\n", sh2s[1].vbr); + for (i = 0; i < 0x60; i++) { + printf("%08x ",p32x_sh2_read32(sh2s[1].vbr + i*4, &sh2s[1])); + if ((i+1) % 8 == 0) printf("\n"); + } + printf("stack ssh2: %x\n", sh2s[1].r[15]); + for (i = -0x30; i < 0x30; i++) { + printf("%08x ",p32x_sh2_read32(sh2s[1].r[15] + i*4, &sh2s[1])); + if ((i+1) % 8 == 0) printf("\n"); + } #endif +} + +static void bcache_stats(void) +{ +#if (DRC_DEBUG & 128) + int i; +#if CALL_STACK + for (i = 1; i < ARRAY_SIZE(sh2s->rts_cache); i++) + if (sh2s[0].rts_cache[i].pc == -1 && sh2s[1].rts_cache[i].pc == -1) break; + + printf("return cache hits:%d misses:%d depth: %d index: %d/%d\n", rchit, rcmiss, i,sh2s[0].rts_cache_idx,sh2s[1].rts_cache_idx); + for (i = 0; i < ARRAY_SIZE(sh2s[0].rts_cache); i++) { + printf("%08x ",sh2s[0].rts_cache[i].pc); + if ((i+1) % 8 == 0) printf("\n"); + } + for (i = 0; i < ARRAY_SIZE(sh2s[1].rts_cache); i++) { + printf("%08x ",sh2s[1].rts_cache[i].pc); + if ((i+1) % 8 == 0) printf("\n"); + } +#endif +#if BRANCH_CACHE + printf("branch cache hits:%d misses:%d\n", bchit, bcmiss); + printf("branch 
cache master:\n"); + for (i = 0; i < ARRAY_SIZE(sh2s[0].branch_cache); i++) { + printf("%08x ",sh2s[0].branch_cache[i].pc); + if ((i+1) % 8 == 0) printf("\n"); + } + printf("branch cache slave:\n"); + for (i = 0; i < ARRAY_SIZE(sh2s[1].branch_cache); i++) { + printf("%08x ",sh2s[1].branch_cache[i].pc); + if ((i+1) % 8 == 0) printf("\n"); + } +#endif +#endif +} void sh2_drc_flush_all(void) { + backtrace(); + state_dump(); block_stats(); - flush_tcache(0); - flush_tcache(1); - flush_tcache(2); + entry_stats(); + bcache_stats(); + dr_flush_tcache(0); + dr_flush_tcache(1); + dr_flush_tcache(2); Pico32x.emu_flags &= ~P32XF_DRC_ROM_C; } void sh2_drc_mem_setup(SH2 *sh2) { - // fill the convenience pointers - sh2->p_bios = sh2->is_slave ? Pico32xMem->sh2_rom_s.w : Pico32xMem->sh2_rom_m.w; - sh2->p_da = sh2->data_array; - sh2->p_sdram = Pico32xMem->sdram; - sh2->p_rom = Pico.rom; -} - -void sh2_drc_frame(void) -{ - if (literal_disabled_frames > 0) - literal_disabled_frames--; + // fill the DRC-only convenience pointers + sh2->p_drcblk_da = Pico32xMem->drcblk_da[!!sh2->is_slave]; + sh2->p_drcblk_ram = Pico32xMem->drcblk_ram; } int sh2_drc_init(SH2 *sh2) @@ -3319,47 +5590,74 @@ int sh2_drc_init(SH2 *sh2) if (block_tables[0] == NULL) { for (i = 0; i < TCACHE_BUFFERS; i++) { - block_tables[i] = calloc(block_max_counts[i], sizeof(*block_tables[0])); + block_tables[i] = calloc(BLOCK_MAX_COUNT(i), sizeof(*block_tables[0])); if (block_tables[i] == NULL) goto fail; - // max 2 block links (exits) per block - block_link_pool[i] = calloc(block_link_pool_max_counts[i], + entry_tables[i] = calloc(ENTRY_MAX_COUNT(i), sizeof(*entry_tables[0])); + if (entry_tables[i] == NULL) + goto fail; + block_link_pool[i] = calloc(BLOCK_LINK_MAX_COUNT(i), sizeof(*block_link_pool[0])); if (block_link_pool[i] == NULL) goto fail; - inval_lookup[i] = calloc(ram_sizes[i] / INVAL_PAGE_SIZE, + inval_lookup[i] = calloc(RAM_SIZE(i) / INVAL_PAGE_SIZE, sizeof(inval_lookup[0])); if (inval_lookup[i] == NULL) goto 
fail; - hash_tables[i] = calloc(hash_table_sizes[i], sizeof(*hash_tables[0])); + hash_tables[i] = calloc(HASH_TABLE_SIZE(i), sizeof(*hash_tables[0])); if (hash_tables[i] == NULL) goto fail; + + unresolved_links[i] = calloc(HASH_TABLE_SIZE(i), sizeof(*unresolved_links[0])); + if (unresolved_links[i] == NULL) + goto fail; +//atexit(sh2_drc_finish); + + RING_INIT(&block_ring[i], block_tables[i], BLOCK_MAX_COUNT(i)); + RING_INIT(&entry_ring[i], entry_tables[i], ENTRY_MAX_COUNT(i)); } - memset(block_counts, 0, sizeof(block_counts)); + + block_list_pool = calloc(BLOCK_LIST_MAX_COUNT, sizeof(*block_list_pool)); + if (block_list_pool == NULL) + goto fail; + block_list_pool_count = 0; + blist_free = NULL; + memset(block_link_pool_counts, 0, sizeof(block_link_pool_counts)); + memset(blink_free, 0, sizeof(blink_free)); drc_cmn_init(); + rcache_init(); + tcache_ptr = tcache; sh2_generate_utils(); host_instructions_updated(tcache, tcache_ptr); + emith_update_cache(); - tcache_bases[0] = tcache_ptrs[0] = tcache_ptr; - for (i = 1; i < ARRAY_SIZE(tcache_bases); i++) - tcache_bases[i] = tcache_ptrs[i] = tcache_bases[i - 1] + tcache_sizes[i - 1]; + i = tcache_ptr - tcache; + RING_INIT(&tcache_ring[0], tcache_ptr, tcache_sizes[0] - i); + for (i = 1; i < ARRAY_SIZE(tcache_ring); i++) { + RING_INIT(&tcache_ring[i], tcache_ring[i-1].base + tcache_ring[i-1].size, + tcache_sizes[i]); + } #if (DRC_DEBUG & 4) for (i = 0; i < ARRAY_SIZE(block_tables); i++) - tcache_dsm_ptrs[i] = tcache_bases[i]; + tcache_dsm_ptrs[i] = tcache_ring[i].base; // disasm the utils tcache_dsm_ptrs[0] = tcache; do_host_disasm(0); + fflush(stdout); #endif #if (DRC_DEBUG & 1) hash_collisions = 0; #endif } + memset(sh2->branch_cache, -1, sizeof(sh2->branch_cache)); + memset(sh2->rts_cache, -1, sizeof(sh2->rts_cache)); + sh2->rts_cache_idx = 0; return 0; @@ -3375,24 +5673,49 @@ void sh2_drc_finish(SH2 *sh2) if (block_tables[0] == NULL) return; - sh2_drc_flush_all(); +#if (DRC_DEBUG & (256|512)) + if (trace[0]) 
fclose(trace[0]); + if (trace[1]) fclose(trace[1]); + trace[0] = trace[1] = NULL; +#endif - for (i = 0; i < TCACHE_BUFFERS; i++) { #if (DRC_DEBUG & 4) + for (i = 0; i < TCACHE_BUFFERS; i++) { printf("~~~ tcache %d\n", i); - tcache_dsm_ptrs[i] = tcache_bases[i]; - tcache_ptr = tcache_ptrs[i]; - do_host_disasm(i); +#if 0 + if (tcache_ring[i].first < tcache_ring[i].next) { + tcache_dsm_ptrs[i] = tcache_ring[i].first; + tcache_ptr = tcache_ring[i].next; + do_host_disasm(i); + } else if (tcache_ring[i].used) { + tcache_dsm_ptrs[i] = tcache_ring[i].first; + tcache_ptr = tcache_ring[i].base + tcache_ring[i].size; + do_host_disasm(i); + tcache_dsm_ptrs[i] = tcache_ring[i].base; + tcache_ptr = tcache_ring[i].next; + do_host_disasm(i); + } +#endif + printf("max links: %d\n", block_link_pool_counts[i]); + } + printf("max block list: %d\n", block_list_pool_count); #endif + sh2_drc_flush_all(); + + for (i = 0; i < TCACHE_BUFFERS; i++) { if (block_tables[i] != NULL) free(block_tables[i]); block_tables[i] = NULL; - if (block_link_pool[i] == NULL) + if (entry_tables[i] != NULL) + free(entry_tables[i]); + entry_tables[i] = NULL; + if (block_link_pool[i] != NULL) free(block_link_pool[i]); block_link_pool[i] = NULL; + blink_free[i] = NULL; - if (inval_lookup[i] == NULL) + if (inval_lookup[i] != NULL) free(inval_lookup[i]); inval_lookup[i] = NULL; @@ -3402,59 +5725,49 @@ void sh2_drc_finish(SH2 *sh2) } } + if (block_list_pool != NULL) + free(block_list_pool); + block_list_pool = NULL; + blist_free = NULL; + drc_cmn_cleanup(); } #endif /* DRC_SH2 */ -static void *dr_get_pc_base(u32 pc, int is_slave) +static void *dr_get_pc_base(u32 pc, SH2 *sh2) { - void *ret = NULL; + void *ret; u32 mask = 0; - if ((pc & ~0x7ff) == 0) { - // BIOS - ret = is_slave ? 
Pico32xMem->sh2_rom_s.w : Pico32xMem->sh2_rom_m.w; - mask = 0x7ff; - } - else if ((pc & 0xfffff000) == 0xc0000000) { - // data array - ret = sh2s[is_slave].data_array; - mask = 0xfff; - } - else if ((pc & 0xc6000000) == 0x06000000) { - // SDRAM - ret = Pico32xMem->sdram; - mask = 0x03ffff; - } - else if ((pc & 0xc6000000) == 0x02000000) { - // ROM - if ((pc & 0x3fffff) < Pico.romsize) - ret = Pico.rom; - mask = 0x3fffff; - } - - if (ret == NULL) - return (void *)-1; // NULL is valid value + ret = p32x_sh2_get_mem_ptr(pc, &mask, sh2); + if (ret == (void *)-1) + return ret; return (char *)ret - (pc & ~mask); } -void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, - u32 *end_literals_out) +u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, + u32 *base_literals_out, u32 *end_literals_out) { u16 *dr_pc_base; u32 pc, op, tmp; u32 end_pc, end_literals = 0; + u32 lowest_literal = 0; u32 lowest_mova = 0; struct op_data *opd; int next_is_delay = 0; int end_block = 0; int i, i_end; + u32 crc = 0; + // 2nd pass stuff + int last_btarget; // loop detector + enum { T_UNKNOWN, T_CLEAR, T_SET } t; // T propagation state - memset(op_flags, 0, BLOCK_INSN_LIMIT); + memset(op_flags, 0, sizeof(*op_flags) * BLOCK_INSN_LIMIT); + op_flags[0] |= OF_BTARGET; // block start is always a target - dr_pc_base = dr_get_pc_base(base_pc, is_slave); + dr_pc_base = dr_get_pc_base(base_pc, &sh2s[!!is_slave]); // 1st pass: disassemble for (i = 0, pc = base_pc; ; i++, pc += 2) { @@ -3473,6 +5786,9 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, } else if (end_block || i >= BLOCK_INSN_LIMIT - 2) break; + else if ((lowest_mova && lowest_mova <= pc) || + (lowest_literal && lowest_literal <= pc)) + break; // text area collides with data area op = FETCH_OP(pc); switch ((op & 0xf000) >> 12) @@ -3485,19 +5801,19 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, switch (GET_Fx()) { case 0: // STC SR,Rn 
0000nnnn00000010 - tmp = SHR_SR; + tmp = BITMASK2(SHR_SR, SHR_T); break; case 1: // STC GBR,Rn 0000nnnn00010010 - tmp = SHR_GBR; + tmp = BITMASK1(SHR_GBR); break; case 2: // STC VBR,Rn 0000nnnn00100010 - tmp = SHR_VBR; + tmp = BITMASK1(SHR_VBR); break; default: goto undefined; } opd->op = OP_MOVE; - opd->source = BITMASK1(tmp); + opd->source = tmp; opd->dest = BITMASK1(GET_Rn()); break; case 0x03: @@ -3506,18 +5822,22 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, // BSRF Rm 0000mmmm00000011 opd->op = OP_BRANCH_RF; opd->rm = GET_Rn(); - opd->source = BITMASK1(opd->rm); + opd->source = BITMASK2(SHR_PC, opd->rm); opd->dest = BITMASK1(SHR_PC); if (!(op & 0x20)) opd->dest |= BITMASK1(SHR_PR); opd->cycles = 2; next_is_delay = 1; - end_block = 1; + if (!(opd->dest & BITMASK1(SHR_PR))) + end_block = !(op_flags[i+1+next_is_delay] & OF_BTARGET); + else + op_flags[i+1+next_is_delay] |= OF_BTARGET; break; case 0x04: // MOV.B Rm,@(R0,Rn) 0000nnnnmmmm0100 case 0x05: // MOV.W Rm,@(R0,Rn) 0000nnnnmmmm0101 case 0x06: // MOV.L Rm,@(R0,Rn) 0000nnnnmmmm0110 opd->source = BITMASK3(GET_Rm(), SHR_R0, GET_Rn()); + opd->dest = BITMASK1(SHR_MEM); break; case 0x07: // MUL.L Rm,Rn 0000nnnnmmmm0111 @@ -3540,7 +5860,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, opd->imm = 1; break; case 2: // CLRMAC 0000000000101000 - opd->dest = BITMASK3(SHR_T, SHR_MACL, SHR_MACH); + opd->dest = BITMASK2(SHR_MACL, SHR_MACH); break; default: goto undefined; @@ -3554,6 +5874,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, break; case 1: // DIV0U 0000000000011001 CHECK_UNHANDLED_BITS(0xf00, undefined); + opd->source = BITMASK1(SHR_SR); opd->dest = BITMASK2(SHR_SR, SHR_T); break; case 2: // MOVT Rn 0000nnnn00101001 @@ -3594,7 +5915,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, opd->dest = BITMASK1(SHR_PC); opd->cycles = 2; next_is_delay = 1; - end_block = 1; + end_block = 
!(op_flags[i+1+next_is_delay] & OF_BTARGET); break; case 1: // SLEEP 0000000000011011 opd->op = OP_SLEEP; @@ -3603,10 +5924,10 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 2: // RTE 0000000000101011 opd->op = OP_RTE; opd->source = BITMASK1(SHR_SP); - opd->dest = BITMASK2(SHR_SR, SHR_PC); + opd->dest = BITMASK4(SHR_SP, SHR_SR, SHR_T, SHR_PC); opd->cycles = 4; next_is_delay = 1; - end_block = 1; + end_block = !(op_flags[i+1+next_is_delay] & OF_BTARGET); break; default: goto undefined; @@ -3615,11 +5936,12 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x0c: // MOV.B @(R0,Rm),Rn 0000nnnnmmmm1100 case 0x0d: // MOV.W @(R0,Rm),Rn 0000nnnnmmmm1101 case 0x0e: // MOV.L @(R0,Rm),Rn 0000nnnnmmmm1110 - opd->source = BITMASK2(GET_Rm(), SHR_R0); + opd->source = BITMASK3(GET_Rm(), SHR_R0, SHR_MEM); opd->dest = BITMASK1(GET_Rn()); + op_flags[i] |= OF_POLL_INSN; break; case 0x0f: // MAC.L @Rm+,@Rn+ 0000nnnnmmmm1111 - opd->source = BITMASK5(GET_Rm(), GET_Rn(), SHR_SR, SHR_MACL, SHR_MACH); + opd->source = BITMASK6(GET_Rm(), GET_Rn(), SHR_SR, SHR_MACL, SHR_MACH, SHR_MEM); opd->dest = BITMASK4(GET_Rm(), GET_Rn(), SHR_MACL, SHR_MACH); opd->cycles = 3; break; @@ -3631,8 +5953,8 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, ///////////////////////////////////////////// case 0x01: // MOV.L Rm,@(disp,Rn) 0001nnnnmmmmdddd - opd->source = BITMASK1(GET_Rm()); - opd->source = BITMASK1(GET_Rn()); + opd->source = BITMASK2(GET_Rm(), GET_Rn()); + opd->dest = BITMASK1(SHR_MEM); opd->imm = (op & 0x0f) * 4; break; @@ -3643,18 +5965,18 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x00: // MOV.B Rm,@Rn 0010nnnnmmmm0000 case 0x01: // MOV.W Rm,@Rn 0010nnnnmmmm0001 case 0x02: // MOV.L Rm,@Rn 0010nnnnmmmm0010 - opd->source = BITMASK1(GET_Rm()); - opd->source = BITMASK1(GET_Rn()); + opd->source = BITMASK2(GET_Rm(), GET_Rn()); + opd->dest = BITMASK1(SHR_MEM); break; case 
0x04: // MOV.B Rm,@-Rn 0010nnnnmmmm0100 case 0x05: // MOV.W Rm,@-Rn 0010nnnnmmmm0101 case 0x06: // MOV.L Rm,@-Rn 0010nnnnmmmm0110 opd->source = BITMASK2(GET_Rm(), GET_Rn()); - opd->dest = BITMASK1(GET_Rn()); + opd->dest = BITMASK2(GET_Rn(), SHR_MEM); break; case 0x07: // DIV0S Rm,Rn 0010nnnnmmmm0111 - opd->source = BITMASK2(GET_Rm(), GET_Rn()); - opd->dest = BITMASK1(SHR_SR); + opd->source = BITMASK3(SHR_SR, GET_Rm(), GET_Rn()); + opd->dest = BITMASK2(SHR_SR, SHR_T); break; case 0x08: // TST Rm,Rn 0010nnnnmmmm1000 opd->source = BITMASK2(GET_Rm(), GET_Rn()); @@ -3697,8 +6019,8 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, opd->dest = BITMASK1(SHR_T); break; case 0x04: // DIV1 Rm,Rn 0011nnnnmmmm0100 - opd->source = BITMASK3(GET_Rm(), GET_Rn(), SHR_SR); - opd->dest = BITMASK2(GET_Rn(), SHR_SR); + opd->source = BITMASK4(GET_Rm(), GET_Rn(), SHR_SR, SHR_T); + opd->dest = BITMASK3(GET_Rn(), SHR_SR, SHR_T); break; case 0x05: // DMULU.L Rm,Rn 0011nnnnmmmm0101 case 0x0d: // DMULS.L Rm,Rn 0011nnnnmmmm1101 @@ -3741,6 +6063,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 1: // DT Rn 0100nnnn00010000 opd->source = BITMASK1(GET_Rn()); opd->dest = BITMASK2(GET_Rn(), SHR_T); + op_flags[i] |= OF_DELAY_INSN; break; default: goto undefined; @@ -3767,31 +6090,31 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, switch (op & 0x3f) { case 0x02: // STS.L MACH,@-Rn 0100nnnn00000010 - tmp = SHR_MACH; + tmp = BITMASK1(SHR_MACH); break; case 0x12: // STS.L MACL,@-Rn 0100nnnn00010010 - tmp = SHR_MACL; + tmp = BITMASK1(SHR_MACL); break; case 0x22: // STS.L PR,@-Rn 0100nnnn00100010 - tmp = SHR_PR; + tmp = BITMASK1(SHR_PR); break; case 0x03: // STC.L SR,@-Rn 0100nnnn00000011 - tmp = SHR_SR; + tmp = BITMASK2(SHR_SR, SHR_T); opd->cycles = 2; break; case 0x13: // STC.L GBR,@-Rn 0100nnnn00010011 - tmp = SHR_GBR; + tmp = BITMASK1(SHR_GBR); opd->cycles = 2; break; case 0x23: // STC.L VBR,@-Rn 0100nnnn00100011 
- tmp = SHR_VBR; + tmp = BITMASK1(SHR_VBR); opd->cycles = 2; break; default: goto undefined; } - opd->source = BITMASK2(GET_Rn(), tmp); - opd->dest = BITMASK1(GET_Rn()); + opd->source = BITMASK1(GET_Rn()) | tmp; + opd->dest = BITMASK2(GET_Rn(), SHR_MEM); break; case 0x04: case 0x05: @@ -3820,31 +6143,34 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, switch (op & 0x3f) { case 0x06: // LDS.L @Rm+,MACH 0100mmmm00000110 - tmp = SHR_MACH; + tmp = BITMASK1(SHR_MACH); break; case 0x16: // LDS.L @Rm+,MACL 0100mmmm00010110 - tmp = SHR_MACL; + tmp = BITMASK1(SHR_MACL); break; case 0x26: // LDS.L @Rm+,PR 0100mmmm00100110 - tmp = SHR_PR; + tmp = BITMASK1(SHR_PR); break; case 0x07: // LDC.L @Rm+,SR 0100mmmm00000111 - tmp = SHR_SR; + tmp = BITMASK2(SHR_SR, SHR_T); + opd->op = OP_LDC; opd->cycles = 3; break; case 0x17: // LDC.L @Rm+,GBR 0100mmmm00010111 - tmp = SHR_GBR; + tmp = BITMASK1(SHR_GBR); + opd->op = OP_LDC; opd->cycles = 3; break; case 0x27: // LDC.L @Rm+,VBR 0100mmmm00100111 - tmp = SHR_VBR; + tmp = BITMASK1(SHR_VBR); + opd->op = OP_LDC; opd->cycles = 3; break; default: goto undefined; } - opd->source = BITMASK1(GET_Rn()); - opd->dest = BITMASK2(GET_Rn(), tmp); + opd->source = BITMASK2(GET_Rn(), SHR_MEM); + opd->dest = BITMASK1(GET_Rn()) | tmp; break; case 0x08: case 0x09: @@ -3899,11 +6225,14 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, opd->dest |= BITMASK1(SHR_PC); opd->cycles = 2; next_is_delay = 1; - end_block = 1; + if (!(opd->dest & BITMASK1(SHR_PR))) + end_block = !(op_flags[i+1+next_is_delay] & OF_BTARGET); + else + op_flags[i+1+next_is_delay] |= OF_BTARGET; break; case 1: // TAS.B @Rn 0100nnnn00011011 - opd->source = BITMASK1(GET_Rn()); - opd->dest = BITMASK1(SHR_T); + opd->source = BITMASK2(GET_Rn(), SHR_MEM); + opd->dest = BITMASK2(SHR_T, SHR_MEM); opd->cycles = 4; break; default: @@ -3914,24 +6243,24 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, switch (GET_Fx()) { 
case 0: // LDC Rm,SR 0100mmmm00001110 - tmp = SHR_SR; + tmp = BITMASK2(SHR_SR, SHR_T); break; case 1: // LDC Rm,GBR 0100mmmm00011110 - tmp = SHR_GBR; + tmp = BITMASK1(SHR_GBR); break; case 2: // LDC Rm,VBR 0100mmmm00101110 - tmp = SHR_VBR; + tmp = BITMASK1(SHR_VBR); break; default: goto undefined; } - opd->op = OP_MOVE; + opd->op = OP_LDC; opd->source = BITMASK1(GET_Rn()); - opd->dest = BITMASK1(tmp); + opd->dest = tmp; break; case 0x0f: // MAC.W @Rm+,@Rn+ 0100nnnnmmmm1111 - opd->source = BITMASK5(GET_Rm(), GET_Rn(), SHR_SR, SHR_MACL, SHR_MACH); + opd->source = BITMASK6(GET_Rm(), GET_Rn(), SHR_SR, SHR_MACL, SHR_MACH, SHR_MEM); opd->dest = BITMASK4(GET_Rm(), GET_Rn(), SHR_MACL, SHR_MACH); opd->cycles = 3; break; @@ -3943,9 +6272,10 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, ///////////////////////////////////////////// case 0x05: // MOV.L @(disp,Rm),Rn 0101nnnnmmmmdddd - opd->source = BITMASK1(GET_Rm()); + opd->source = BITMASK2(GET_Rm(), SHR_MEM); opd->dest = BITMASK1(GET_Rn()); opd->imm = (op & 0x0f) * 4; + op_flags[i] |= OF_POLL_INSN; break; ///////////////////////////////////////////// @@ -3955,12 +6285,15 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x04: // MOV.B @Rm+,Rn 0110nnnnmmmm0100 case 0x05: // MOV.W @Rm+,Rn 0110nnnnmmmm0101 case 0x06: // MOV.L @Rm+,Rn 0110nnnnmmmm0110 - opd->dest = BITMASK1(GET_Rm()); + opd->dest = BITMASK2(GET_Rm(), GET_Rn()); + opd->source = BITMASK2(GET_Rm(), SHR_MEM); + break; case 0x00: // MOV.B @Rm,Rn 0110nnnnmmmm0000 case 0x01: // MOV.W @Rm,Rn 0110nnnnmmmm0001 case 0x02: // MOV.L @Rm,Rn 0110nnnnmmmm0010 - opd->source = BITMASK1(GET_Rm()); - opd->dest |= BITMASK1(GET_Rn()); + opd->dest = BITMASK1(GET_Rn()); + opd->source = BITMASK2(GET_Rm(), SHR_MEM); + op_flags[i] |= OF_POLL_INSN; break; case 0x0a: // NEGC Rm,Rn 0110nnnnmmmm1010 opd->source = BITMASK2(GET_Rm(), SHR_T); @@ -3988,7 +6321,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 
*end_pc_out, case 0x07: // ADD #imm,Rn 0111nnnniiiiiiii opd->source = opd->dest = BITMASK1(GET_Rn()); - opd->imm = (int)(signed char)op; + opd->imm = (s8)op; break; ///////////////////////////////////////////// @@ -3997,26 +6330,30 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, { case 0x0000: // MOV.B R0,@(disp,Rn) 10000000nnnndddd opd->source = BITMASK2(GET_Rm(), SHR_R0); + opd->dest = BITMASK1(SHR_MEM); opd->imm = (op & 0x0f); break; case 0x0100: // MOV.W R0,@(disp,Rn) 10000001nnnndddd opd->source = BITMASK2(GET_Rm(), SHR_R0); + opd->dest = BITMASK1(SHR_MEM); opd->imm = (op & 0x0f) * 2; break; case 0x0400: // MOV.B @(disp,Rm),R0 10000100mmmmdddd - opd->source = BITMASK1(GET_Rm()); + opd->source = BITMASK2(GET_Rm(), SHR_MEM); opd->dest = BITMASK1(SHR_R0); opd->imm = (op & 0x0f); + op_flags[i] |= OF_POLL_INSN; break; case 0x0500: // MOV.W @(disp,Rm),R0 10000101mmmmdddd - opd->source = BITMASK1(GET_Rm()); + opd->source = BITMASK2(GET_Rm(), SHR_MEM); opd->dest = BITMASK1(SHR_R0); opd->imm = (op & 0x0f) * 2; + op_flags[i] |= OF_POLL_INSN; break; case 0x0800: // CMP/EQ #imm,R0 10001000iiiiiiii opd->source = BITMASK1(SHR_R0); opd->dest = BITMASK1(SHR_T); - opd->imm = (int)(signed char)op; + opd->imm = (s8)op; break; case 0x0d00: // BT/S label 10001101dddddddd case 0x0f00: // BF/S label 10001111dddddddd @@ -4025,7 +6362,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x0900: // BT label 10001001dddddddd case 0x0b00: // BF label 10001011dddddddd opd->op = (op & 0x0200) ? 
OP_BRANCH_CF : OP_BRANCH_CT; - opd->source = BITMASK1(SHR_T); + opd->source = BITMASK2(SHR_PC, SHR_T); opd->dest = BITMASK1(SHR_PC); opd->imm = ((signed int)(op << 24) >> 23); opd->imm += pc + 4; @@ -4045,13 +6382,16 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, if (op_flags[i] & OF_DELAY_OP) { if (ops[i-1].op == OP_BRANCH) tmp = ops[i-1].imm; - else + else if (ops[i-1].op != OP_BRANCH_N) tmp = 0; } - opd->source = BITMASK1(SHR_PC); + opd->source = BITMASK2(SHR_PC, SHR_MEM); opd->dest = BITMASK1(GET_Rn()); - if (tmp) + if (tmp) { opd->imm = tmp + 2 + (op & 0xff) * 2; + if (lowest_literal == 0 || opd->imm < lowest_literal) + lowest_literal = opd->imm; + } opd->size = 1; break; @@ -4062,14 +6402,21 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x0a: // BRA label 1010dddddddddddd opd->op = OP_BRANCH; + opd->source = BITMASK1(SHR_PC); opd->dest |= BITMASK1(SHR_PC); opd->imm = ((signed int)(op << 20) >> 19); opd->imm += pc + 4; opd->cycles = 2; next_is_delay = 1; - end_block = 1; - if (base_pc <= opd->imm && opd->imm < base_pc + BLOCK_INSN_LIMIT * 2) - op_flags[(opd->imm - base_pc) / 2] |= OF_BTARGET; + if (!(opd->dest & BITMASK1(SHR_PR))) { + if (base_pc <= opd->imm && opd->imm < base_pc + BLOCK_INSN_LIMIT * 2) { + op_flags[(opd->imm - base_pc) / 2] |= OF_BTARGET; + if (opd->imm <= pc) + end_block = !(op_flags[i+1+next_is_delay] & OF_BTARGET); + } else + end_block = !(op_flags[i+1+next_is_delay] & OF_BTARGET); + } else + op_flags[i+1+next_is_delay] |= OF_BTARGET; break; ///////////////////////////////////////////// @@ -4080,23 +6427,26 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x0100: // MOV.W R0,@(disp,GBR) 11000001dddddddd case 0x0200: // MOV.L R0,@(disp,GBR) 11000010dddddddd opd->source = BITMASK2(SHR_GBR, SHR_R0); + opd->dest = BITMASK1(SHR_MEM); opd->size = (op & 0x300) >> 8; opd->imm = (op & 0xff) << opd->size; break; case 0x0400: // MOV.B @(disp,GBR),R0 
11000100dddddddd case 0x0500: // MOV.W @(disp,GBR),R0 11000101dddddddd case 0x0600: // MOV.L @(disp,GBR),R0 11000110dddddddd - opd->source = BITMASK1(SHR_GBR); + opd->source = BITMASK2(SHR_GBR, SHR_MEM); opd->dest = BITMASK1(SHR_R0); opd->size = (op & 0x300) >> 8; opd->imm = (op & 0xff) << opd->size; + op_flags[i] |= OF_POLL_INSN; break; case 0x0300: // TRAPA #imm 11000011iiiiiiii - opd->source = BITMASK2(SHR_PC, SHR_SR); - opd->dest = BITMASK1(SHR_PC); - opd->imm = (op & 0xff) * 4; + opd->op = OP_TRAPA; + opd->source = BITMASK4(SHR_SP, SHR_PC, SHR_SR, SHR_T); + opd->dest = BITMASK2(SHR_SP, SHR_PC); + opd->imm = (op & 0xff); opd->cycles = 8; - end_block = 1; // FIXME + op_flags[i+1] |= OF_BTARGET; break; case 0x0700: // MOVA @(disp,PC),R0 11000111dddddddd opd->op = OP_MOVA; @@ -4104,7 +6454,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, if (op_flags[i] & OF_DELAY_OP) { if (ops[i-1].op == OP_BRANCH) tmp = ops[i-1].imm; - else + else if (ops[i-1].op != OP_BRANCH_N) tmp = 0; } opd->dest = BITMASK1(SHR_R0); @@ -4134,15 +6484,17 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, opd->imm = op & 0xff; break; case 0x0c00: // TST.B #imm,@(R0,GBR) 11001100iiiiiiii - opd->source = BITMASK2(SHR_GBR, SHR_R0); + opd->source = BITMASK3(SHR_GBR, SHR_R0, SHR_MEM); opd->dest = BITMASK1(SHR_T); opd->imm = op & 0xff; + op_flags[i] |= OF_POLL_INSN; opd->cycles = 3; break; case 0x0d00: // AND.B #imm,@(R0,GBR) 11001101iiiiiiii case 0x0e00: // XOR.B #imm,@(R0,GBR) 11001110iiiiiiii case 0x0f00: // OR.B #imm,@(R0,GBR) 11001111iiiiiiii - opd->source = BITMASK2(SHR_GBR, SHR_R0); + opd->source = BITMASK3(SHR_GBR, SHR_R0, SHR_MEM); + opd->dest = BITMASK1(SHR_MEM); opd->imm = op & 0xff; opd->cycles = 3; break; @@ -4159,104 +6511,180 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, if (op_flags[i] & OF_DELAY_OP) { if (ops[i-1].op == OP_BRANCH) tmp = ops[i-1].imm; - else + else if (ops[i-1].op != OP_BRANCH_N) 
tmp = 0; } - opd->source = BITMASK1(SHR_PC); + opd->source = BITMASK2(SHR_PC, SHR_MEM); opd->dest = BITMASK1(GET_Rn()); - if (tmp) + if (tmp) { opd->imm = (tmp + 2 + (op & 0xff) * 4) & ~3; + if (lowest_literal == 0 || opd->imm < lowest_literal) + lowest_literal = opd->imm; + } opd->size = 2; break; ///////////////////////////////////////////// case 0x0e: // MOV #imm,Rn 1110nnnniiiiiiii + opd->op = OP_LOAD_CONST; opd->dest = BITMASK1(GET_Rn()); - opd->imm = (u32)(signed int)(signed char)op; + opd->imm = (s8)op; break; default: undefined: - elprintf(EL_ANOMALY, "%csh2 drc: unhandled op %04x @ %08x", - is_slave ? 's' : 'm', op, pc); + opd->op = OP_UNDEFINED; + // an unhandled instruction is probably not code if it's not the 1st insn + if (!(op_flags[i] & OF_DELAY_OP) && pc != base_pc) + goto end; break; } if (op_flags[i] & OF_DELAY_OP) { switch (opd->op) { case OP_BRANCH: + case OP_BRANCH_N: case OP_BRANCH_CT: case OP_BRANCH_CF: case OP_BRANCH_R: case OP_BRANCH_RF: elprintf(EL_ANOMALY, "%csh2 drc: branch in DS @ %08x", is_slave ? 
's' : 'm', pc); - opd->op = OP_UNHANDLED; + opd->op = OP_UNDEFINED; op_flags[i] |= OF_B_IN_DS; next_is_delay = 0; break; } } } +end: i_end = i; end_pc = pc; // 2nd pass: some analysis - for (i = 0; i < i_end; i++) { + lowest_literal = end_literals = lowest_mova = 0; + t = T_UNKNOWN; + last_btarget = 0; + op = 0; // delay/poll insns counter + for (i = 0, pc = base_pc; i < i_end; i++, pc += 2) { opd = &ops[i]; + crc += FETCH_OP(pc); // propagate T (TODO: DIV0U) - if ((opd->op == OP_SETCLRT && !opd->imm) || opd->op == OP_BRANCH_CT) - op_flags[i + 1] |= OF_T_CLEAR; - else if ((opd->op == OP_SETCLRT && opd->imm) || opd->op == OP_BRANCH_CF) - op_flags[i + 1] |= OF_T_SET; - if ((op_flags[i] & OF_BTARGET) || (opd->dest & BITMASK1(SHR_T))) - op_flags[i] &= ~(OF_T_SET | OF_T_CLEAR); - else - op_flags[i + 1] |= op_flags[i] & (OF_T_SET | OF_T_CLEAR); + t = T_UNKNOWN; - if ((opd->op == OP_BRANCH_CT && (op_flags[i] & OF_T_SET)) - || (opd->op == OP_BRANCH_CF && (op_flags[i] & OF_T_CLEAR))) - { + if ((opd->op == OP_BRANCH_CT && t == T_SET) || + (opd->op == OP_BRANCH_CF && t == T_CLEAR)) { opd->op = OP_BRANCH; - opd->cycles = 3; - i_end = i + 1; + opd->cycles = (op_flags[i + 1] & OF_DELAY_OP) ? 
2 : 3; + } else if ((opd->op == OP_BRANCH_CT && t == T_CLEAR) || + (opd->op == OP_BRANCH_CF && t == T_SET)) + opd->op = OP_BRANCH_N; + else if ((opd->op == OP_SETCLRT && !opd->imm) || opd->op == OP_BRANCH_CT) + t = T_CLEAR; + else if ((opd->op == OP_SETCLRT && opd->imm) || opd->op == OP_BRANCH_CF) + t = T_SET; + + // "overscan" detection: unreachable code after unconditional branch + // this can happen if the insn after a forward branch isn't a local target + if (OP_ISBRAUC(opd->op)) { if (op_flags[i + 1] & OF_DELAY_OP) { - opd->cycles = 2; - i_end++; + if (i_end > i + 2 && !(op_flags[i + 2] & OF_BTARGET)) + i_end = i + 2; + } else { + if (i_end > i + 1 && !(op_flags[i + 1] & OF_BTARGET)) + i_end = i + 1; } } - else if (opd->op == OP_LOAD_POOL) - { - if (opd->imm < end_pc + MAX_LITERAL_OFFSET) { + + // literal pool size detection + if (opd->op == OP_MOVA && opd->imm >= base_pc) + if (lowest_mova == 0 || opd->imm < lowest_mova) + lowest_mova = opd->imm; + if (opd->op == OP_LOAD_POOL) { + if (opd->imm >= base_pc && opd->imm < end_pc + MAX_LITERAL_OFFSET) { if (end_literals < opd->imm + opd->size * 2) end_literals = opd->imm + opd->size * 2; + if (lowest_literal == 0 || lowest_literal > opd->imm) + lowest_literal = opd->imm; + if (opd->size == 2) { + // tweak for NFL: treat a 32bit literal as an address and check if it + // points to the literal space. In that case handle it like MOVA. + tmp = FETCH32(opd->imm) & ~0x20000000; // MUST ignore wt bit here + if (tmp >= end_pc && tmp < end_pc + MAX_LITERAL_OFFSET) + if (lowest_mova == 0 || tmp < lowest_mova) + lowest_mova = tmp; + } } } +#if LOOP_DETECTION + // inner loop detection + // 1. a loop always starts with a branch target (for the backwards jump) + // 2. it doesn't contain more than one polling and/or delaying insn + // 3. it doesn't contain unconditional jumps + // 4. 
no overlapping of loops + if (op_flags[i] & OF_BTARGET) { + last_btarget = i; // possible loop starting point + op = 0; + } + // XXX let's hope nobody is putting a delay or poll insn in a delay slot :-/ + if (OP_ISBRAIMM(opd->op)) { + // BSR, BRA, BT, BF with immediate target + int i_tmp = (opd->imm - base_pc) / 2; // branch target, index in ops + if (i_tmp == last_btarget) // candidate for basic loop optimizer + op_flags[i_tmp] |= OF_BASIC_LOOP; + if (i_tmp == last_btarget && op <= 1) { + op_flags[i_tmp] |= OF_LOOP; // conditions met -> mark loop + last_btarget = i+1; // condition 4 + } else if (opd->op == OP_BRANCH) + last_btarget = i+1; // condition 3 + } + else if (OP_ISBRAIND(opd->op)) + // BRAF, BSRF, JMP, JSR, register indirect. treat it as off-limits jump + last_btarget = i+1; // condition 3 + else if (op_flags[i] & (OF_POLL_INSN|OF_DELAY_INSN)) + op ++; // condition 2 +#endif } - end_pc = base_pc + i_end * 2; - if (end_literals < end_pc) - end_literals = end_pc; + end_pc = pc; // end_literals is used to decide to inline a literal or not // XXX: need better detection if this actually is used in write + if (lowest_literal >= base_pc) { + if (lowest_literal < end_pc) { + dbg(1, "warning: lowest_literal=%08x < end_pc=%08x", lowest_literal, end_pc); + // TODO: does this always mean end_pc covers data? 
+ } + } if (lowest_mova >= base_pc) { if (lowest_mova < end_literals) { - dbg(1, "mova for %08x, block %08x", lowest_mova, base_pc); - end_literals = end_pc; + dbg(1, "warning: mova=%08x < end_literals=%08x", lowest_mova, end_literals); + end_literals = lowest_mova; } if (lowest_mova < end_pc) { - dbg(1, "warning: mova inside of blk for %08x, block %08x", - lowest_mova, base_pc); + dbg(1, "warning: mova=%08x < end_pc=%08x", lowest_mova, end_pc); end_literals = end_pc; } } + if (lowest_literal >= end_literals) + lowest_literal = end_literals; + + if (lowest_literal && end_literals) + for (pc = lowest_literal; pc < end_literals; pc += 2) + crc += FETCH_OP(pc); *end_pc_out = end_pc; + if (base_literals_out != NULL) + *base_literals_out = (lowest_literal ?: end_pc); if (end_literals_out != NULL) - *end_literals_out = end_literals; + *end_literals_out = (end_literals ?: end_pc); + + // crc overflow handling, twice to collect all overflows + crc = (crc & 0xffff) + (crc >> 16); + crc = (crc & 0xffff) + (crc >> 16); + return crc; } // vim:shiftwidth=2:ts=2:expandtab diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 61d8d2daf..9642492db 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -1,27 +1,86 @@ int sh2_drc_init(SH2 *sh2); void sh2_drc_finish(SH2 *sh2); -void sh2_drc_wcheck_ram(unsigned int a, int val, int cpuid); -void sh2_drc_wcheck_da(unsigned int a, int val, int cpuid); +void sh2_drc_wcheck_ram(uint32_t a, unsigned len, SH2 *sh2); +void sh2_drc_wcheck_da(uint32_t a, unsigned len, SH2 *sh2); #ifdef DRC_SH2 void sh2_drc_mem_setup(SH2 *sh2); void sh2_drc_flush_all(void); -void sh2_drc_frame(void); #else #define sh2_drc_mem_setup(x) #define sh2_drc_flush_all() #define sh2_drc_frame() #endif -#define BLOCK_INSN_LIMIT 128 +#define BLOCK_INSN_LIMIT 1024 /* op_flags */ #define OF_DELAY_OP (1 << 0) #define OF_BTARGET (1 << 1) -#define OF_T_SET (1 << 2) // T is known to be set -#define OF_T_CLEAR (1 << 3) // ... 
clear +#define OF_LOOP (3 << 2) // NONE, IDLE, DELAY, POLL loop #define OF_B_IN_DS (1 << 4) +#define OF_DELAY_INSN (1 << 5) // DT, (TODO ADD+CMP?) +#define OF_POLL_INSN (1 << 6) // MOV @(...),Rn (no post increment), TST @(...) +#define OF_BASIC_LOOP (1 << 7) // pinnable loop without any branches in it -void scan_block(unsigned int base_pc, int is_slave, - unsigned char *op_flags, unsigned int *end_pc, - unsigned int *end_literals); +#define OF_IDLE_LOOP (1 << 2) +#define OF_DELAY_LOOP (2 << 2) +#define OF_POLL_LOOP (3 << 2) + +unsigned short scan_block(uint32_t base_pc, int is_slave, + unsigned char *op_flags, uint32_t *end_pc, + uint32_t *base_literals, uint32_t *end_literals); + +#if defined(DRC_SH2) && defined(__GNUC__) && !defined(__clang__) +// direct access to some host CPU registers used by the DRC if gcc is used. +// XXX MUST match SHR_SR definitions in cpu/drc/emit_*.c; should be moved there +// XXX yuck, there's no portable way to determine register size. Use long long +// if target is 64 bit and data model is ILP32 or LLP64(windows), else long +#if defined(__arm__) +#define DRC_SR_REG "r10" +#define DRC_REG_LL 0 // 32 bit +#elif defined(__aarch64__) +#define DRC_SR_REG "r28" +#define DRC_REG_LL (__ILP32__ || _WIN32) +#elif defined(__mips__) +#define DRC_SR_REG "s6" +#define DRC_REG_LL (_MIPS_SIM == _ABIN32) +#elif defined(__riscv__) || defined(__riscv) +#define DRC_SR_REG "s11" +#define DRC_REG_LL 0 // no ABI for (__ILP32__ && __riscv_xlen != 32) +#elif defined(__powerpc__) +#define DRC_SR_REG "r30" +#define DRC_REG_LL 0 // no ABI for __ILP32__ +#elif defined(__i386__) +#define DRC_SR_REG "edi" +#define DRC_REG_LL 0 // 32 bit +#elif defined(__x86_64__) +#define DRC_SR_REG "rbx" +#define DRC_REG_LL (__ILP32__ || _WIN32) +#endif +#endif + +#ifdef DRC_SR_REG +// XXX this is more clear but produces too much overhead for slow platforms +extern void REGPARM(1) (*sh2_drc_save_sr)(SH2 *sh2); +extern void REGPARM(1) (*sh2_drc_restore_sr)(SH2 *sh2); + +// NB: 
sh2_sr MUST have register size if optimizing with -O3 (-fif-conversion) +#if DRC_REG_LL +#define DRC_DECLARE_SR register long long _sh2_sr asm(DRC_SR_REG) +#else +#define DRC_DECLARE_SR register long _sh2_sr asm(DRC_SR_REG) +#endif +#define DRC_SAVE_SR(sh2) \ + if (likely(sh2->state & SH2_IN_DRC)) \ + sh2->sr = (s32)_sh2_sr +// sh2_drc_save_sr(sh2) +#define DRC_RESTORE_SR(sh2) \ + if (likely(sh2->state & SH2_IN_DRC)) \ + _sh2_sr = (s32)sh2->sr +// sh2_drc_restore_sr(sh2) +#else +#define DRC_DECLARE_SR +#define DRC_SAVE_SR(sh2) +#define DRC_RESTORE_SR(sh2) +#endif diff --git a/cpu/sh2/mame/sh2.c b/cpu/sh2/mame/sh2.c index 2fb964b6c..fa49153aa 100644 --- a/cpu/sh2/mame/sh2.c +++ b/cpu/sh2/mame/sh2.c @@ -372,7 +372,7 @@ INLINE void BRA(sh2_state *sh2, UINT32 d) #if BUSY_LOOP_HACKS if (disp == -2) { - UINT32 next_opcode = RW( sh2, sh2->ppc & AM ); + UINT32 next_opcode = (UINT32)(UINT16)RW( sh2, sh2->ppc & AM ); /* BRA $ * NOP */ @@ -802,7 +802,7 @@ INLINE void DT(sh2_state *sh2, UINT32 n) sh2->sr &= ~T; #if BUSY_LOOP_HACKS { - UINT32 next_opcode = RW( sh2, sh2->ppc & AM ); + UINT32 next_opcode = (UINT32)(UINT16)RW( sh2, sh2->ppc & AM ); /* DT Rn * BF $-2 */ @@ -1049,12 +1049,12 @@ INLINE void MAC_W(sh2_state *sh2, UINT32 m, UINT32 n) INT32 tempm, tempn, dest, src, ans; UINT32 templ; - tempn = (INT32) RW( sh2, sh2->r[n] ); + tempn = (INT32)(INT16) RW( sh2, sh2->r[n] ); sh2->r[n] += 2; - tempm = (INT32) RW( sh2, sh2->r[m] ); + tempm = (INT32)(INT16) RW( sh2, sh2->r[m] ); sh2->r[m] += 2; templ = sh2->macl; - tempm = ((INT32) (short) tempn * (INT32) (short) tempm); + tempm = (tempn * tempm); if ((INT32) sh2->macl >= 0) dest = 0; else diff --git a/cpu/sh2/mame/sh2dasm.c b/cpu/sh2/mame/sh2dasm.c index 3fa25e923..0ecb7f455 100644 --- a/cpu/sh2/mame/sh2dasm.c +++ b/cpu/sh2/mame/sh2dasm.c @@ -465,7 +465,7 @@ static UINT32 op1000(char *buffer, UINT32 pc, UINT16 opcode) sprintf(buffer, "MOV.B @($%02X,%s),R0", (opcode & 15), regname[Rm]); break; case 5: - sprintf(buffer, "MOV.W 
@($%02X,%s),R0", (opcode & 15), regname[Rm]); + sprintf(buffer, "MOV.W @($%02X,%s),R0", (opcode & 15) * 2, regname[Rm]); break; case 8: sprintf(buffer, "CMP/EQ #$%02X,R0", (opcode & 0xff)); diff --git a/cpu/sh2/mame/sh2pico.c b/cpu/sh2/mame/sh2pico.c index 636ebc6f4..467b2adc9 100644 --- a/cpu/sh2/mame/sh2pico.c +++ b/cpu/sh2/mame/sh2pico.c @@ -121,7 +121,7 @@ int sh2_execute_interpreter(SH2 *sh2, int cycles) if (sh2->delay) { sh2->ppc = sh2->delay; - opcode = RW(sh2, sh2->delay); + opcode = (UINT32)(UINT16)RW(sh2, sh2->delay); // TODO: more branch types if ((opcode >> 13) == 5) { // BRA/BSR @@ -139,7 +139,7 @@ int sh2_execute_interpreter(SH2 *sh2, int cycles) else { sh2->ppc = sh2->pc; - opcode = RW(sh2, sh2->pc); + opcode = (UINT32)(UINT16)RW(sh2, sh2->pc); } sh2->delay = 0; @@ -214,7 +214,7 @@ int sh2_execute_interpreter(SH2 *sh2, int cycles) if (sh2->pc < *base_pc || sh2->pc >= *end_pc) { *base_pc = sh2->pc; scan_block(*base_pc, sh2->is_slave, - op_flags, end_pc, NULL); + op_flags, end_pc, NULL, NULL); } if ((op_flags[(sh2->pc - *base_pc) / 2] & OF_BTARGET) || sh2->pc == *base_pc @@ -232,13 +232,13 @@ int sh2_execute_interpreter(SH2 *sh2, int cycles) if (sh2->delay) { sh2->ppc = sh2->delay; - opcode = RW(sh2, sh2->delay); + opcode = (UINT32)(UINT16)RW(sh2, sh2->delay); sh2->pc -= 2; } else { sh2->ppc = sh2->pc; - opcode = RW(sh2, sh2->pc); + opcode = (UINT32)(UINT16)RW(sh2, sh2->pc); } sh2->delay = 0; diff --git a/cpu/sh2/sh2.c b/cpu/sh2/sh2.c index 403c4c70c..ba2607185 100644 --- a/cpu/sh2/sh2.c +++ b/cpu/sh2/sh2.c @@ -84,7 +84,7 @@ int sh2_irl_irq(SH2 *sh2, int level, int nested_call) // do this to avoid missing irqs that other SH2 might clear int vector = sh2->irq_callback(sh2, level); sh2_do_irq(sh2, level, vector); - sh2->m68krcycles_done += C_SH2_TO_M68K(*sh2, 13); + sh2->m68krcycles_done += C_SH2_TO_M68K(sh2, 13); } else sh2->test_irq = 1; diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index 407270f11..b9267d740 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h 
@@ -8,42 +8,58 @@ typedef enum { SHR_R0 = 0, SHR_SP = 15, SHR_PC, SHR_PPC, SHR_PR, SHR_SR, SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, + SH2_REGS // register set size } sh2_reg_e; +#define SHR_R(n) (SHR_R0+(n)) typedef struct SH2_ { - unsigned int r[16]; // 00 - unsigned int pc; // 40 - unsigned int ppc; - unsigned int pr; - unsigned int sr; - unsigned int gbr, vbr; // 50 - unsigned int mach, macl; // 58 + // registers. this MUST correlate with enum sh2_reg_e. + uint32_t r[16] ALIGNED(32); + uint32_t pc; // 40 + uint32_t ppc; + uint32_t pr; + uint32_t sr; + uint32_t gbr, vbr; // 50 + uint32_t mach, macl; // 58 // common - const void *read8_map; // 60 + const void *read8_map; const void *read16_map; + const void *read32_map; const void **write8_tab; const void **write16_tab; + const void **write32_tab; // drc stuff - int drc_tmp; // 70 + int drc_tmp; int irq_cycles; void *p_bios; // convenience pointers void *p_da; - void *p_sdram; // 80 + void *p_sdram; void *p_rom; + void *p_dram; + void *p_drcblk_da; + void *p_drcblk_ram; unsigned int pdb_io_csum[2]; #define SH2_STATE_RUN (1 << 0) // to prevent recursion -#define SH2_STATE_SLEEP (1 << 1) +#define SH2_STATE_SLEEP (1 << 1) // temporarily stopped (DMA, IO, ...) #define SH2_STATE_CPOLL (1 << 2) // polling comm regs #define SH2_STATE_VPOLL (1 << 3) // polling VDP +#define SH2_STATE_RPOLL (1 << 4) // polling address in SDRAM +#define SH2_TIMER_RUN (1 << 7) // SOC WDT timer is running +#define SH2_IN_DRC (1 << 8) // DRC in use unsigned int state; - unsigned int poll_addr; + uint32_t poll_addr; int poll_cycles; int poll_cnt; + // DRC branch cache. 
size must be 2^n and <=128 + int rts_cache_idx; + struct { uint32_t pc; void *code; } rts_cache[16]; + struct { uint32_t pc; void *code; } branch_cache[128]; + // interpreter stuff int icount; // cycles left in current timeslice unsigned int ea; @@ -60,21 +76,22 @@ typedef struct SH2_ unsigned int cycles_timeslice; struct SH2_ *other_sh2; + int (*run)(struct SH2_ *, int); // we use 68k reference cycles for easier sync unsigned int m68krcycles_done; unsigned int mult_m68k_to_sh2; unsigned int mult_sh2_to_m68k; - unsigned char data_array[0x1000]; // cache (can be used as RAM) - unsigned int peri_regs[0x200/4]; // periphereal regs + uint8_t data_array[0x1000]; // cache (can be used as RAM) + uint32_t peri_regs[0x200/4]; // peripheral regs } SH2; #define CYCLE_MULT_SHIFT 10 #define C_M68K_TO_SH2(xsh2, c) \ - ((int)((c) * (xsh2).mult_m68k_to_sh2) >> CYCLE_MULT_SHIFT) + (int)(((uint64_t)(c) * (xsh2)->mult_m68k_to_sh2) >> CYCLE_MULT_SHIFT) #define C_SH2_TO_M68K(xsh2, c) \ - ((int)((c + 3) * (xsh2).mult_sh2_to_m68k) >> CYCLE_MULT_SHIFT) + (int)(((uint64_t)(c+3U) * (xsh2)->mult_sh2_to_m68k) >> CYCLE_MULT_SHIFT) int sh2_init(SH2 *sh2, int is_slave, SH2 *other_sh2); void sh2_finish(SH2 *sh2); @@ -88,17 +105,21 @@ void sh2_unpack(SH2 *sh2, const unsigned char *buff); int sh2_execute_drc(SH2 *sh2c, int cycles); int sh2_execute_interpreter(SH2 *sh2c, int cycles); -static __inline int sh2_execute(SH2 *sh2, int cycles, int use_drc) +static __inline void sh2_execute_prepare(SH2 *sh2, int use_drc) +{ +#ifdef DRC_SH2 + sh2->run = use_drc ? 
sh2_execute_drc : sh2_execute_interpreter; +#else + sh2->run = sh2_execute_interpreter; +#endif +} + +static __inline int sh2_execute(SH2 *sh2, int cycles) { int ret; sh2->cycles_timeslice = cycles; -#ifdef DRC_SH2 - if (use_drc) - ret = sh2_execute_drc(sh2, cycles); - else -#endif - ret = sh2_execute_interpreter(sh2, cycles); + ret = sh2->run(sh2, cycles); return sh2->cycles_timeslice - ret; } diff --git a/jni/Android.mk b/jni/Android.mk index b72cbba9a..1ff6c9e72 100644 --- a/jni/Android.mk +++ b/jni/Android.mk @@ -52,8 +52,8 @@ SOURCES_C := $(LIBRETRO_DIR)/libretro.c \ COREFLAGS := $(addprefix -D,$(DEFINES)) -fno-strict-aliasing -GIT_VERSION := " $(shell git rev-parse --short HEAD || echo unknown)" -ifneq ($(GIT_VERSION)," unknown") +GIT_VERSION := $(shell git rev-parse --short HEAD || echo unknown) +ifneq ($(GIT_VERSION),"unknown") COREFLAGS += -DGIT_VERSION=\"$(GIT_VERSION)\" endif diff --git a/pico/32x/32x.c b/pico/32x/32x.c index 9bfbeface..3b8896483 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -12,7 +12,7 @@ struct Pico32x Pico32x; SH2 sh2s[2]; -#define SH2_IDLE_STATES (SH2_STATE_CPOLL|SH2_STATE_VPOLL|SH2_STATE_SLEEP) +#define SH2_IDLE_STATES (SH2_STATE_CPOLL|SH2_STATE_VPOLL|SH2_STATE_RPOLL|SH2_STATE_SLEEP) static int REGPARM(2) sh2_irq_cb(SH2 *sh2, int level) { @@ -30,7 +30,7 @@ static int REGPARM(2) sh2_irq_cb(SH2 *sh2, int level) } // MUST specify active_sh2 when called from sh2 memhandlers -void p32x_update_irls(SH2 *active_sh2, int m68k_cycles) +void p32x_update_irls(SH2 *active_sh2, unsigned int m68k_cycles) { int irqs, mlvl = 0, slvl = 0; int mrun, srun; @@ -38,30 +38,32 @@ void p32x_update_irls(SH2 *active_sh2, int m68k_cycles) if (active_sh2 != NULL) m68k_cycles = sh2_cycles_done_m68k(active_sh2); + // find top bit = highest irq number (0 <= irl <= 14/2) by binary search + // msh2 irqs = Pico32x.sh2irqs | Pico32x.sh2irqi[0]; - while ((irqs >>= 1)) - mlvl++; - mlvl *= 2; + if (irqs >= 0x10) mlvl += 8, irqs >>= 4; + if (irqs >= 0x04) mlvl += 
4, irqs >>= 2; + if (irqs >= 0x02) mlvl += 2, irqs >>= 1; // ssh2 irqs = Pico32x.sh2irqs | Pico32x.sh2irqi[1]; - while ((irqs >>= 1)) - slvl++; - slvl *= 2; + if (irqs >= 0x10) slvl += 8, irqs >>= 4; + if (irqs >= 0x04) slvl += 4, irqs >>= 2; + if (irqs >= 0x02) slvl += 2, irqs >>= 1; - mrun = sh2_irl_irq(&msh2, mlvl, active_sh2 == &msh2); + mrun = sh2_irl_irq(&msh2, mlvl, msh2.state & SH2_STATE_RUN); if (mrun) { p32x_sh2_poll_event(&msh2, SH2_IDLE_STATES, m68k_cycles); - if (active_sh2 == &msh2) - sh2_end_run(active_sh2, 1); + if (msh2.state & SH2_STATE_RUN) + sh2_end_run(&msh2, 1); } - srun = sh2_irl_irq(&ssh2, slvl, active_sh2 == &ssh2); + srun = sh2_irl_irq(&ssh2, slvl, ssh2.state & SH2_STATE_RUN); if (srun) { p32x_sh2_poll_event(&ssh2, SH2_IDLE_STATES, m68k_cycles); - if (active_sh2 == &ssh2) - sh2_end_run(active_sh2, 1); + if (ssh2.state & SH2_STATE_RUN) + sh2_end_run(&ssh2, 1); } elprintf(EL_32X, "update_irls: m %d/%d, s %d/%d", mlvl, mrun, slvl, srun); @@ -70,7 +72,7 @@ void p32x_update_irls(SH2 *active_sh2, int m68k_cycles) // the mask register is inconsistent, CMD is supposed to be a mask, // while others are actually irq trigger enables? // TODO: test on hw.. 
-void p32x_trigger_irq(SH2 *sh2, int m68k_cycles, unsigned int mask) +void p32x_trigger_irq(SH2 *sh2, unsigned int m68k_cycles, unsigned int mask) { Pico32x.sh2irqs |= mask & P32XI_VRES; Pico32x.sh2irqi[0] |= mask & (Pico32x.sh2irq_mask[0] << 3); @@ -79,7 +81,7 @@ void p32x_trigger_irq(SH2 *sh2, int m68k_cycles, unsigned int mask) p32x_update_irls(sh2, m68k_cycles); } -void p32x_update_cmd_irq(SH2 *sh2, int m68k_cycles) +void p32x_update_cmd_irq(SH2 *sh2, unsigned int m68k_cycles) { if ((Pico32x.sh2irq_mask[0] & 2) && (Pico32x.regs[2 / 2] & 1)) Pico32x.sh2irqi[0] |= P32XI_CMD; @@ -194,11 +196,11 @@ void PicoPower32x(void) void PicoUnload32x(void) { + sh2_finish(&msh2); + sh2_finish(&ssh2); if (Pico32xMem != NULL) plat_munmap(Pico32xMem, sizeof(*Pico32xMem)); Pico32xMem = NULL; - sh2_finish(&msh2); - sh2_finish(&ssh2); PicoIn.AHW &= ~PAHW_32X; } @@ -207,8 +209,8 @@ void PicoReset32x(void) { if (PicoIn.AHW & PAHW_32X) { p32x_trigger_irq(NULL, SekCyclesDone(), P32XI_VRES); - p32x_sh2_poll_event(&msh2, SH2_IDLE_STATES, 0); - p32x_sh2_poll_event(&ssh2, SH2_IDLE_STATES, 0); + p32x_sh2_poll_event(&msh2, SH2_IDLE_STATES, SekCyclesDone()); + p32x_sh2_poll_event(&ssh2, SH2_IDLE_STATES, SekCyclesDone()); p32x_pwm_ctl_changed(); p32x_timers_recalc(); } @@ -254,11 +256,11 @@ static void p32x_start_blank(void) } p32x_trigger_irq(NULL, SekCyclesDone(), P32XI_VINT); - p32x_sh2_poll_event(&msh2, SH2_STATE_VPOLL, 0); - p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, 0); + p32x_sh2_poll_event(&msh2, SH2_STATE_VPOLL, SekCyclesDone()); + p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, SekCyclesDone()); } -void p32x_schedule_hint(SH2 *sh2, int m68k_cycles) +void p32x_schedule_hint(SH2 *sh2, unsigned int m68k_cycles) { // rather rough, 32x hint is useless in practice int after; @@ -267,7 +269,8 @@ void p32x_schedule_hint(SH2 *sh2, int m68k_cycles) return; // nobody cares // note: when Pico.m.scanline is 224, SH2s might // still be at scanline 93 (or so) - if (!(Pico32x.sh2_regs[0] & 0x80) && 
Pico.m.scanline > 224) + if (!(Pico32x.sh2_regs[0] & 0x80) && + Pico.m.scanline > (Pico.video.reg[1] & 0x08 ? 240 : 224)) return; after = (Pico32x.sh2_regs[4 / 2] + 1) * 488; @@ -323,8 +326,12 @@ void p32x_event_schedule_sh2(SH2 *sh2, enum p32x_event event, int after) p32x_event_schedule(now, event, after); - left_to_next = (event_time_next - now) * 3; - sh2_end_run(sh2, left_to_next); + left_to_next = C_M68K_TO_SH2(sh2, (int)(event_time_next - now)); + if (sh2_cycles_left(sh2) > left_to_next) { + if (left_to_next < 1) + left_to_next = 1; + sh2_end_run(sh2, left_to_next); + } } static void p32x_run_events(unsigned int until) @@ -366,19 +373,19 @@ static void p32x_run_events(unsigned int until) oldest, event_time_next); } -static void run_sh2(SH2 *sh2, int m68k_cycles) +static void run_sh2(SH2 *sh2, unsigned int m68k_cycles) { - int cycles, done; + unsigned int cycles, done; pevt_log_sh2_o(sh2, EVT_RUN_START); sh2->state |= SH2_STATE_RUN; - cycles = C_M68K_TO_SH2(*sh2, m68k_cycles); + cycles = C_M68K_TO_SH2(sh2, m68k_cycles); elprintf_sh2(sh2, EL_32X, "+run %u %d @%08x", sh2->m68krcycles_done, cycles, sh2->pc); - done = sh2_execute(sh2, cycles, PicoIn.opt & POPT_EN_DRC); + done = sh2_execute(sh2, cycles); - sh2->m68krcycles_done += C_SH2_TO_M68K(*sh2, done); + sh2->m68krcycles_done += C_SH2_TO_M68K(sh2, done); sh2->state &= ~SH2_STATE_RUN; pevt_log_sh2_o(sh2, EVT_RUN_END); elprintf_sh2(sh2, EL_32X, "-run %u %d", @@ -412,8 +419,7 @@ void p32x_sync_other_sh2(SH2 *sh2, unsigned int m68k_target) // there might be new event to schedule current sh2 to if (event_time_next) { - left_to_event = event_time_next - m68k_target; - left_to_event *= 3; + left_to_event = C_M68K_TO_SH2(sh2, (int)(event_time_next - m68k_target)); if (sh2_cycles_left(sh2) > left_to_event) { if (left_to_event < 1) left_to_event = 1; @@ -423,7 +429,7 @@ void p32x_sync_other_sh2(SH2 *sh2, unsigned int m68k_target) } #define STEP_LS 24 -#define STEP_N 440 +#define STEP_N 528 // at least one line (488) 
#define sync_sh2s_normal p32x_sync_sh2s //#define sync_sh2s_lockstep p32x_sync_sh2s @@ -431,7 +437,7 @@ void p32x_sync_other_sh2(SH2 *sh2, unsigned int m68k_target) /* most timing is in 68k clock */ void sync_sh2s_normal(unsigned int m68k_target) { - unsigned int now, target, timer_cycles; + unsigned int now, target, next, timer_cycles; int cycles; elprintf(EL_32X, "sh2 sync to %u", m68k_target); @@ -446,6 +452,7 @@ void sync_sh2s_normal(unsigned int m68k_target) now = ssh2.m68krcycles_done; timer_cycles = now; + pprof_start(m68k); while (CYCLES_GT(m68k_target, now)) { if (event_time_next && CYCLES_GE(now, event_time_next)) @@ -454,49 +461,68 @@ void sync_sh2s_normal(unsigned int m68k_target) target = m68k_target; if (event_time_next && CYCLES_GT(target, event_time_next)) target = event_time_next; - if (CYCLES_GT(target, now + STEP_N)) - target = now + STEP_N; - while (CYCLES_GT(target, now)) { - elprintf(EL_32X, "sh2 exec to %u %d,%d/%d, flags %x", target, - target - msh2.m68krcycles_done, target - ssh2.m68krcycles_done, + next = target; + if (CYCLES_GT(target, now + STEP_N)) + next = now + STEP_N; + elprintf(EL_32X, "sh2 exec to %u %d,%d/%d, flags %x", next, + next - msh2.m68krcycles_done, next - ssh2.m68krcycles_done, m68k_target - now, Pico32x.emu_flags); + pprof_start(ssh2); if (!(ssh2.state & SH2_IDLE_STATES)) { - cycles = target - ssh2.m68krcycles_done; + cycles = next - ssh2.m68krcycles_done; if (cycles > 0) { - run_sh2(&ssh2, cycles); + run_sh2(&ssh2, cycles > 20U ? cycles : 20U); if (event_time_next && CYCLES_GT(target, event_time_next)) target = event_time_next; + if (CYCLES_GT(next, target)) + next = target; } } + pprof_end(ssh2); + pprof_start(msh2); if (!(msh2.state & SH2_IDLE_STATES)) { - cycles = target - msh2.m68krcycles_done; + cycles = next - msh2.m68krcycles_done; if (cycles > 0) { - run_sh2(&msh2, cycles); + run_sh2(&msh2, cycles > 20U ? 
cycles : 20U); if (event_time_next && CYCLES_GT(target, event_time_next)) target = event_time_next; + if (CYCLES_GT(next, target)) + next = target; } } + pprof_end(msh2); - now = target; - if (!(msh2.state & SH2_IDLE_STATES)) { - if (CYCLES_GT(now, msh2.m68krcycles_done)) + now = next; + if (CYCLES_GT(now, msh2.m68krcycles_done)) { + if (!(msh2.state & SH2_IDLE_STATES)) now = msh2.m68krcycles_done; } - if (!(ssh2.state & SH2_IDLE_STATES)) { - if (CYCLES_GT(now, ssh2.m68krcycles_done)) + if (CYCLES_GT(now, ssh2.m68krcycles_done)) { + if (!(ssh2.state & SH2_IDLE_STATES)) now = ssh2.m68krcycles_done; } + if (CYCLES_GT(now, timer_cycles+STEP_N)) { + if (msh2.state & SH2_TIMER_RUN) + p32x_timer_do(&msh2, now - timer_cycles); + if (ssh2.state & SH2_TIMER_RUN) + p32x_timer_do(&ssh2, now - timer_cycles); + timer_cycles = now; + } } - p32x_timers_do(now - timer_cycles); + if (msh2.state & SH2_TIMER_RUN) + p32x_timer_do(&msh2, now - timer_cycles); + if (ssh2.state & SH2_TIMER_RUN) + p32x_timer_do(&ssh2, now - timer_cycles); timer_cycles = now; } + pprof_end_sub(m68k); // advance idle CPUs if (msh2.state & SH2_IDLE_STATES) { @@ -545,6 +571,9 @@ void sync_sh2s_lockstep(unsigned int m68k_target) void PicoFrame32x(void) { + sh2_execute_prepare(&msh2, PicoIn.opt & POPT_EN_DRC); + sh2_execute_prepare(&ssh2, PicoIn.opt & POPT_EN_DRC); + Pico.m.scanline = 0; Pico32x.vdp_regs[0x0a/2] &= ~P32XV_VBLK; // get out of vblank @@ -553,15 +582,14 @@ void PicoFrame32x(void) if (!(Pico32x.sh2_regs[0] & 0x80)) p32x_schedule_hint(NULL, SekCyclesDone()); - p32x_sh2_poll_event(&msh2, SH2_STATE_VPOLL, 0); - p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, 0); + p32x_sh2_poll_event(&msh2, SH2_STATE_VPOLL, SekCyclesDone()); + p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, SekCyclesDone()); if (PicoIn.AHW & PAHW_MCD) pcd_prepare_frame(); PicoFrameStart(); PicoFrameHints(); - sh2_drc_frame(); elprintf(EL_32X, "poll: %02x %02x %02x", Pico32x.emu_flags & 3, msh2.state, ssh2.state); diff --git a/pico/32x/draw.c 
b/pico/32x/draw.c index 828e0adb2..ffcb5c924 100644 --- a/pico/32x/draw.c +++ b/pico/32x/draw.c @@ -1,6 +1,7 @@ /* * PicoDrive * (C) notaz, 2009,2010 + * (C) kub, 2019 * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. @@ -11,6 +12,9 @@ int (*PicoScan32xBegin)(unsigned int num); int (*PicoScan32xEnd)(unsigned int num); int Pico32xDrawMode; +void *DrawLineDestBase32x; +int DrawLineDestIncrement32x; + static void convert_pal555(int invert_prio) { unsigned int *ps = (void *)Pico32xMem->pal; @@ -44,16 +48,21 @@ static void convert_pal555(int invert_prio) const unsigned int m1 = 0x001f; \ const unsigned int m2 = 0x03e0; \ const unsigned int m3 = 0x7c00; \ - int i; \ + unsigned short t; \ + int i = 320; \ \ - for (i = 320; i > 0; i--, pd++, p32x++, pmd++) { \ - unsigned short t = *p32x; \ - if ((*pmd & 0x3f) != mdbg && !((t ^ inv) & 0x8000)) { \ - pmd_draw_code; \ - continue; \ + while (i > 0) { \ + for (; i > 0 && (*pmd & 0x3f) == mdbg; pd++, pmd++, i--) { \ + t = *p32x++; \ + *pd = ((t&m1) << 11) | ((t&m2) << 1) | ((t&m3) >> 10); \ + } \ + for (; i > 0 && (*pmd & 0x3f) != mdbg; pd++, pmd++, i--) { \ + t = *p32x++; \ + if ((t ^ inv) & 0x8000) \ + *pd = ((t&m1) << 11) | ((t&m2) << 1) | ((t&m3) >> 10); \ + else \ + pmd_draw_code; \ } \ - \ - *pd = ((t & m1) << 11) | ((t & m2) << 1) | ((t & m3) >> 10); \ } \ } @@ -61,15 +70,21 @@ static void convert_pal555(int invert_prio) #define do_line_pp(pd, p32x, pmd, pmd_draw_code) \ { \ unsigned short t; \ - int i; \ - for (i = 320; i > 0; i--, pd++, p32x++, pmd++) { \ - t = pal[*(unsigned char *)((uintptr_t)p32x ^ 1)]; \ - if ((t & 0x20) || (*pmd & 0x3f) == mdbg) \ + int i = 320; \ + while (i > 0) { \ + for (; i > 0 && (*pmd & 0x3f) == mdbg; pd++, pmd++, i--) { \ + t = pal[*(unsigned char *)((uintptr_t)(p32x++) ^ 1)]; \ *pd = t; \ - else \ - pmd_draw_code; \ + } \ + for (; i > 0 && (*pmd & 0x3f) != mdbg; pd++, pmd++, i--) { \ + t = pal[*(unsigned char *)((uintptr_t)(p32x++) ^ 
1)]; \ + if (t & 0x20) \ + *pd = t; \ + else \ + pmd_draw_code; \ + } \ } \ -} +} // run length mode #define do_line_rl(pd, p32x, pmd, pmd_draw_code) \ @@ -233,13 +248,11 @@ void PicoDraw32xLayer(int offs, int lines, int md_bg) int lines_sft_offs; int which_func; - Pico.est.DrawLineDest = (char *)DrawLineDestBase + offs * DrawLineDestIncrement; + Pico.est.DrawLineDest = (char *)DrawLineDestBase32x + offs * DrawLineDestIncrement32x; dram = Pico32xMem->dram[Pico32x.vdp_regs[0x0a/2] & P32XV_FS]; - if (Pico32xDrawMode == PDM32X_BOTH) { - if (Pico.m.dirtyPal) - PicoDrawUpdateHighPal(); - } + if (Pico32xDrawMode == PDM32X_BOTH) + PicoDrawUpdateHighPal(); if ((Pico32x.vdp_regs[0] & P32XV_Mx) == 2) { @@ -278,20 +291,21 @@ void PicoDraw32xLayer(int offs, int lines, int md_bg) void PicoDraw32xLayerMdOnly(int offs, int lines) { int have_scan = PicoScan32xBegin != NULL && PicoScan32xEnd != NULL; - unsigned short *dst = (void *)((char *)DrawLineDestBase + offs * DrawLineDestIncrement); + unsigned short *dst = (void *)((char *)DrawLineDestBase32x + offs * DrawLineDestIncrement32x); unsigned char *pmd = Pico.est.Draw2FB + 328 * offs + 8; unsigned short *pal = Pico.est.HighPal; int poffs = 0, plen = 320; int l, p; if (!(Pico.video.reg[12] & 1)) { - // 32col mode + // 32col mode. 
for some render modes MD pixel data carries an offset + if (!(PicoIn.opt & (POPT_ALT_RENDERER|POPT_DIS_32C_BORDER))) + pmd += 32; poffs = 32; plen = 256; } - if (Pico.m.dirtyPal) - PicoDrawUpdateHighPal(); + PicoDrawUpdateHighPal(); dst += poffs; for (l = 0; l < lines; l++) { @@ -305,7 +319,7 @@ void PicoDraw32xLayerMdOnly(int offs, int lines) dst[p + 2] = pal[*pmd++]; dst[p + 3] = pal[*pmd++]; } - dst = (void *)((char *)dst + DrawLineDestIncrement); + dst = (void *)((char *)dst + DrawLineDestIncrement32x); pmd += 328 - plen; if (have_scan) PicoScan32xEnd(l + offs); @@ -314,21 +328,32 @@ void PicoDraw32xLayerMdOnly(int offs, int lines) void PicoDrawSetOutFormat32x(pdso_t which, int use_32x_line_mode) { -#ifdef _ASM_32X_DRAW - extern void *Pico32xNativePal; - Pico32xNativePal = Pico32xMem->pal_native; -#endif + if (which == PDF_RGB555) { + // need CLUT pixels in PicoDraw2FB for layer transparency + PicoDrawSetInternalBuf(Pico.est.Draw2FB, 328); + PicoDrawSetOutBufMD(DrawLineDestBase32x, DrawLineDestIncrement32x); + } else { + // use the same layout as alt renderer + PicoDrawSetInternalBuf(NULL, 0); + PicoDrawSetOutBufMD(Pico.est.Draw2FB + 8, 328); + } - if (which == PDF_RGB555 && use_32x_line_mode) { + if (use_32x_line_mode) // we'll draw via FinalizeLine32xRGB555 (rare) - PicoDrawSetInternalBuf(NULL, 0); Pico32xDrawMode = PDM32X_OFF; - return; - } + else + // in RGB555 mode the 32x layer is drawn over the MD layer, in the other + // modes 32x and MD layer are merged together by the 32x renderer + Pico32xDrawMode = (which == PDF_RGB555) ? PDM32X_32X_ONLY : PDM32X_BOTH; +} - // use the same layout as alt renderer - PicoDrawSetInternalBuf(Pico.est.Draw2FB, 328); - Pico32xDrawMode = (which == PDF_RGB555) ? 
PDM32X_32X_ONLY : PDM32X_BOTH; +void PicoDrawSetOutBuf32X(void *dest, int increment) +{ + DrawLineDestBase32x = dest; + DrawLineDestIncrement32x = increment; + // in RGB555 mode this buffer is also used by the MD renderer + if (Pico32xDrawMode != PDM32X_BOTH) + PicoDrawSetOutBufMD(DrawLineDestBase32x, DrawLineDestIncrement32x); } // vim:shiftwidth=2:ts=2:expandtab diff --git a/pico/32x/draw_arm.s b/pico/32x/draw_arm.S similarity index 63% rename from pico/32x/draw_arm.s rename to pico/32x/draw_arm.S index ba66fbf1f..ad5d428b1 100644 --- a/pico/32x/draw_arm.s +++ b/pico/32x/draw_arm.S @@ -1,32 +1,30 @@ @* @* PicoDrive @* (C) notaz, 2010 +@* (C) kub, 2019 @* @* This work is licensed under the terms of MAME license. @* See COPYING file in the top-level directory. @* +#include "pico/arm_features.h" +#include "pico/pico_int_offs.h" + .extern Pico32x -.extern PicoDraw2FB -.extern HighPal +.extern Pico .equiv P32XV_PRI, (1<< 7) -.bss -.align 2 -.global Pico32xNativePal -Pico32xNativePal: - .word 0 - .text .align 2 + PIC_LDR_INIT() -.macro call_scan_prep cond +.macro call_scan_prep cond est @ &Pico.est .if \cond - ldr r4, =PicoScan32xBegin - ldr r5, =PicoScan32xEnd - ldr r6, =DrawLineDest + PIC_LDR(r4, r6, PicoScan32xBegin) + PIC_LDR(r5, r6, PicoScan32xEnd) + ldr r6, [\est, #OFS_EST_DrawLineDest] ldr r4, [r4] ldr r5, [r5] stmfd sp!, {r4,r5,r6} @@ -70,19 +68,20 @@ Pico32xNativePal: \name: stmfd sp!, {r4-r11,lr} - ldr r10,=Pico32x - ldr r11,=PicoDraw2FB - ldr r10,[r10, #0x40] @ Pico32x.vdp_regs[0] - ldr r11,[r11] - ldr r9, =HighPal @ palmd + PIC_LDR(lr, r9, Pico) + PIC_LDR(r10,r9, Pico32x) + ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB] + ldrh r10,[r10, #0x40] @ Pico32x.vdp_regs[0] + add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd + and r4, r2, #0xff mov r5, #328 - lsl r3, #26 @ mdbg << 26 + mov r3, r3, lsl #26 @ mdbg << 26 mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data tst r10,#P32XV_PRI - moveq r10,#0 - movne r10,#0x8000 @ r10 = inv_bit - call_scan_prep 
\call_scan + movne r10,#0 + moveq r10,#0x8000 @ r10 = inv_bit + call_scan_prep \call_scan lr mov r4, #0 @ line b 1f @ loop_outer_entry @@ -90,7 +89,6 @@ Pico32xNativePal: 0: @ loop_outer: call_scan_end \call_scan add r4, r4, #1 - sub r11,r11,#1 @ adjust for prev read cmp r4, r2, lsr #16 call_scan_fin_ge \call_scan ldmgefd sp!, {r4-r11,pc} @@ -104,29 +102,86 @@ Pico32xNativePal: add r5, r1, r12, lsl #1 @ p32x = dram + dram[l] 2: @ loop_inner: - ldrb r7, [r11], #1 @ MD pixel - subs r6, r6, #1 + ldrh r8, [r5], #2 + subs lr, r6, #1 blt 0b @ loop_outer - ldrh r8, [r5], #2 @ 32x pixel - cmp r3, r7, lsl #26 @ MD has bg pixel? - beq 3f @ draw32x + +3: @ loop_innermost: + ldrh r7, [r5], #2 @ 32x pixel + subs lr, lr, #1 + cmpge r7, r8 + beq 3b @ loop_innermost + + sub r5, r5, #2 + add lr, lr, #1 + sub lr, r6, lr + sub r6, r6, lr + eor r12,r8, r10 - ands r12,r12,#0x8000 @ !((t ^ inv) & 0x8000) + tst r12, #0x8000 @ !((t ^ inv) & 0x8000) + bne 5f @ draw_md + + and r7 ,r8, #0x03e0 + mov r8, r8, lsl #11 + orr r8, r8, r8, lsr #(10+11) + orr r8, r8, r7 ,lsl #1 + bic r8, r8, #0x0020 @ kill prio bit + + add r11,r11,lr + tst r0, #2 @ dst unaligned? + strneh r8, [r0], #2 + subne lr, lr, #1 + cmp lr, #0 + beq 2b @ loop_inner + mov r8, r8, lsl #16 + orr r12,r8, r8, lsr #16 + mov r8 ,r12 +4: @ draw_32x: + subs lr, lr, #4 @ store 4 pixels + stmgeia r0!, {r8, r12} + bgt 4b @ draw_32x + beq 2b @ loop_inner + adds lr, lr, #2 @ store 1-3 leftover pixels + strge r8, [r0], #4 + strneh r8, [r0], #2 + b 2b @ loop_inner + +5: @ draw_md: + subs lr, lr, #1 + ldrgeb r7, [r11], #1 @ MD pixel + blt 2b @ loop_inner + cmp r3, r7, lsl #26 @ MD has bg pixel? 
.if \do_md mov r7, r7, lsl #1 - ldreqh r12,[r9, r7] - streqh r12,[r0], #2 @ *dst++ = palmd[*pmd] + ldrneh r7 ,[r9, r7] + strneh r7 ,[r0], #2 @ *dst++ = palmd[*pmd] +.else + addne r0, r0, #2 .endif - beq 2b @ loop_inner + bne 5b @ draw_md -3: @ draw32x: - and r12,r8, #0x03e0 + and r7 ,r8, #0x03e0 mov r8, r8, lsl #11 orr r8, r8, r8, lsr #(10+11) - orr r8, r8, r12,lsl #1 + orr r8, r8, r7 ,lsl #1 bic r8, r8, #0x0020 @ kill prio bit strh r8, [r0], #2 @ *dst++ = bgr2rgb(*p32x++) - b 2b @ loop_inner + +6: @ draw_md_32x: + subs lr, lr, #1 + ldrgeb r7, [r11], #1 @ MD pixel + blt 2b @ loop_inner + cmp r3, r7, lsl #26 @ MD has bg pixel? +.if \do_md + mov r7, r7, lsl #1 + ldrneh r7 ,[r9, r7] @ *dst++ = palmd[*pmd] + moveq r7 ,r8 @ *dst++ = bgr2rgb(*p32x++) + strh r7 ,[r0], #2 +.else + streqh r8, [r0] @ *dst++ = bgr2rgb(*p32x++) + add r0, r0, #2 +.endif + b 6b @ draw_md_32x .endm @@ -139,16 +194,19 @@ Pico32xNativePal: \name: stmfd sp!, {r4-r11,lr} - ldr r11,=PicoDraw2FB - ldr r10,=Pico32xNativePal - ldr r11,[r11] - ldr r10,[r10] - ldr r9, =HighPal @ palmd + PIC_LDR(lr, r9, Pico) + PIC_LDR(r10,r9, Pico32xMem) + ldr r9,=OFS_PMEM32x_pal_native + ldr r10, [r10] + ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB] + add r10,r10,r9 + add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd + and r4, r2, #0xff mov r5, #328 - lsl r3, #26 @ mdbg << 26 + mov r3, r3, lsl #26 @ mdbg << 26 mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data - call_scan_prep \call_scan + call_scan_prep \call_scan lr mov r4, #0 @ line b 1f @ loop_outer_entry @@ -199,11 +257,11 @@ Pico32xNativePal: ldrneh r7, [r9, r12] @ t = palmd[pmd[0]] tst lr, #0x20 ldrneb lr, [r11,#-1] @ MD pixel 1 - strh r7, [r0], #2 cmpne r3, lr, lsl #26 @ MD has bg pixel? 
mov lr, lr, lsl #1 ldrneh r8, [r9, lr] @ t = palmd[pmd[1]] - strh r8, [r0], #2 + orr r7, r7, r8, lsl #16 @ combine 2 pixels to optimize memory bandwidth + str r7, [r0], #4 @ (no write combining on ARM9) .else streqh r7, [r0] tst lr, #0x20 @@ -214,18 +272,21 @@ Pico32xNativePal: .endif b 2b @ loop_inner -5: @ check_fill +5: @ check_fill: @ count pixels, align if needed bic r12,r5, #1 + ldrh lr ,[r12, #2] @ only do this for at least 4 pixels ldrh r12,[r12] + orr r12,lr,r12, lsl #16 orr lr, r7, r7, lsl #8 + orr lr, lr, lr, lsl #16 cmp r12,lr bne 3b @ no_fill tst r5, #1 sub lr, r5, #2 @ starting r5 (32x render data start) - addeq r5, r5, #2 - addne r5, r5, #1 @ add for the check above + addeq r5, r5, #4 + addne r5, r5, #3 @ add for the check above add r6, r6, #1 @ restore from dec orr r7, r7, r7, lsl #8 6: @@ -235,11 +296,12 @@ Pico32xNativePal: ldrh r12,[r5], #2 bge 7f @ count_done cmp r8, r7 + subne r5, r5, #2 @ undo readahead cmpeq r12,r7 beq 6b -7: @ count_done - sub r5, r5, #4 @ undo readahead +7: @ count_done: + sub r5, r5, #2 @ undo readahead @ fix alignment and check type sub r8, r5, lr @@ -257,30 +319,34 @@ Pico32xNativePal: beq 9f @ bg_mode add r11,r11,r8 -8: - subs r8, r8, #2 - strgeh r7, [r0], #2 - strgeh r7, [r0], #2 - bgt 8b + orr r12,r7, r7, lsl #16 + mov r7 ,r12 +8: @ 32x_loop: + subs r8, r8, #4 @ store 4 pixels + stmgeia r0!, {r7, r12} + bgt 8b @ 32x_loop + beq 2b @ loop_inner + adds r8, r8, #2 + strge r7, [r0], #4 @ store 2 leftover pixels b 2b @ loop_inner 9: @ bg_mode: - ldrb r12,[r11],#1 @ MD pixel + ldrb r12,[r11],#1 @ MD pixel 0,1 ldrb lr, [r11],#1 - cmp r3, lr, lsl #26 @ MD has bg pixel? + cmp r3, r12,lsl #26 @ MD pixel 0 has bg? .if \do_md mov r12,r12,lsl #1 ldrneh r12,[r9, r12] @ t = palmd[*pmd] moveq r12,r7 - cmp r3, lr, lsl #26 + cmp r3, lr, lsl #26 @ MD pixel 1 has bg? 
mov lr, lr, lsl #1 ldrneh lr, [r9, lr] moveq lr, r7 - strh r12,[r0], #2 - strh lr, [r0], #2 + orr r12,r12,lr, lsl #16 @ combine 2 pixels to optimize memory bandwidth + str r12,[r0], #4 @ (no write combining on ARM9) .else streqh r7, [r0] - cmp r3, lr, lsl #26 + cmp r3, lr, lsl #26 @ MD pixel 1 has bg? streqh r7, [r0, #2] add r0, r0, #4 .endif @@ -297,16 +363,19 @@ Pico32xNativePal: \name: stmfd sp!, {r4-r11,lr} - ldr r11,=PicoDraw2FB - ldr r10,=Pico32xNativePal - ldr r11,[r11] - ldr r10,[r10] - ldr r9, =HighPal @ palmd + PIC_LDR(lr, r9, Pico) + PIC_LDR(r10,r9, Pico32xMem) + ldr r9,=OFS_PMEM32x_pal_native + ldr r10, [r10] + ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB] + add r10,r10,r9 + add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd + and r4, r2, #0xff mov r5, #328 - lsl r3, #26 @ mdbg << 26 + mov r3, r3, lsl #26 @ mdbg << 26 mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data - call_scan_prep \call_scan + call_scan_prep \call_scan lr mov r4, #0 @ line b 1f @ loop_outer_entry @@ -314,7 +383,6 @@ Pico32xNativePal: 0: @ loop_outer: call_scan_end \call_scan add r4, r4, #1 - sub r11,r11,#1 @ adjust for prev read cmp r4, r2, lsr #16 call_scan_fin_ge \call_scan ldmgefd sp!, {r4-r11,pc} @@ -335,13 +403,13 @@ Pico32xNativePal: eor lr, lr, #0x20 3: @ loop_innermost: - ldrb r7, [r11], #1 @ MD pixel subs r6, r6, #1 + ldrgeb r7, [r11], #1 @ MD pixel blt 0b @ loop_outer - cmp r3, r7, lsl #26 @ MD has bg pixel? - mov r7, r7, lsl #1 - tstne lr, #0x20 + tst lr, #0x20 + cmpne r3, r7, lsl #26 @ MD has bg pixel? 
.if \do_md + mov r7, r7, lsl #1 ldrneh r12,[r9, r7] @ t = palmd[*pmd] streqh lr, [r0], #2 strneh r12,[r0], #2 @ *dst++ = t @@ -359,15 +427,18 @@ make_do_loop_dc do_loop_dc, 0, 0 make_do_loop_dc do_loop_dc_md, 0, 1 make_do_loop_dc do_loop_dc_scan, 1, 0 make_do_loop_dc do_loop_dc_scan_md, 1, 1 +.pool make_do_loop_pp do_loop_pp, 0, 0 make_do_loop_pp do_loop_pp_md, 0, 1 make_do_loop_pp do_loop_pp_scan, 1, 0 make_do_loop_pp do_loop_pp_scan_md, 1, 1 +.pool make_do_loop_rl do_loop_rl, 0, 0 make_do_loop_rl do_loop_rl_md, 0, 1 make_do_loop_rl do_loop_rl_scan, 1, 0 make_do_loop_rl do_loop_rl_scan_md, 1, 1 +.pool @ vim:filetype=armasm diff --git a/pico/32x/memory.c b/pico/32x/memory.c index eff0ab07c..69f703183 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -1,6 +1,7 @@ /* * PicoDrive * (C) notaz, 2009,2010,2013 + * (C) kub, 2019 * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. @@ -40,7 +41,9 @@ */ #include "../pico_int.h" #include "../memory.h" + #include "../../cpu/sh2/compiler.h" +DRC_DECLARE_SR; static const char str_mars[] = "MARS"; @@ -56,32 +59,40 @@ static void (*m68k_write16_io)(u32 a, u32 d); #define REG8IN16(ptr, offs) ((u8 *)ptr)[(offs) ^ 1] // poll detection -#define POLL_THRESHOLD 3 +#define POLL_THRESHOLD 5 static struct { - u32 addr, cycles; + u32 addr1, addr2, cycles; int cnt; } m68k_poll; static int m68k_poll_detect(u32 a, u32 cycles, u32 flags) { int ret = 0; + // support polling on 2 addresses - seen in Wolfenstein + int match = (a - m68k_poll.addr1 <= 2 || a - m68k_poll.addr2 <= 2); - if (a - 2 <= m68k_poll.addr && m68k_poll.addr <= a + 2 - && cycles - m68k_poll.cycles <= 64 && !SekNotPolling) + if (match && cycles - m68k_poll.cycles <= 64 && !SekNotPolling) { - if (m68k_poll.cnt++ > POLL_THRESHOLD) { + // detect split 32bit access by same cycle count, and ignore those + if (cycles != m68k_poll.cycles && ++m68k_poll.cnt >= POLL_THRESHOLD) { if (!(Pico32x.emu_flags & flags)) { 
elprintf(EL_32X, "m68k poll addr %08x, cyc %u", a, cycles - m68k_poll.cycles); - ret = 1; } Pico32x.emu_flags |= flags; + ret = 1; } } else { + // reset poll state in case of restart by interrupt + Pico32x.emu_flags &= ~(P32XF_68KCPOLL|P32XF_68KVPOLL); + SekSetStop(0); m68k_poll.cnt = 0; - m68k_poll.addr = a; + if (!match) { + m68k_poll.addr2 = m68k_poll.addr1; + m68k_poll.addr1 = a; + } SekNotPolling = 0; } m68k_poll.cycles = cycles; @@ -97,15 +108,19 @@ void p32x_m68k_poll_event(u32 flags) Pico32x.emu_flags &= ~flags; SekSetStop(0); } - m68k_poll.addr = m68k_poll.cnt = 0; + m68k_poll.addr1 = m68k_poll.addr2 = m68k_poll.cnt = 0; } -static void sh2_poll_detect(SH2 *sh2, u32 a, u32 flags, int maxcnt) +void NOINLINE p32x_sh2_poll_detect(u32 a, SH2 *sh2, u32 flags, int maxcnt) { - int cycles_left = sh2_cycles_left(sh2); + u32 cycles_done = sh2_cycles_done_t(sh2); + u32 cycles_diff = cycles_done - sh2->poll_cycles; - if (a == sh2->poll_addr && sh2->poll_cycles - cycles_left <= 10) { - if (sh2->poll_cnt++ > maxcnt) { + // reading 2 consecutive 16bit values is probably a 32bit access. detect this + // by checking address (max 2 bytes away) and cycles (max 2 cycles later). + // no polling if more than 20 cycles have passed since last detect call. 
+ if (a - sh2->poll_addr <= 2 && CYCLES_GE(20, cycles_diff)) { + if (CYCLES_GT(cycles_diff, 2) && ++sh2->poll_cnt >= maxcnt) { if (!(sh2->state & flags)) elprintf_sh2(sh2, EL_32X, "state: %02x->%02x", sh2->state, sh2->state | flags); @@ -113,40 +128,179 @@ static void sh2_poll_detect(SH2 *sh2, u32 a, u32 flags, int maxcnt) sh2->state |= flags; sh2_end_run(sh2, 1); pevt_log_sh2(sh2, EVT_POLL_START); - return; +#ifdef DRC_SH2 + // mark this as an address used for polling if SDRAM + if ((a & 0xc6000000) == 0x06000000) { + unsigned char *p = sh2->p_drcblk_ram; + p[(a & 0x3ffff) >> SH2_DRCBLK_RAM_SHIFT] |= 0x80; + // mark next word too to enable poll fifo for 32bit access + p[((a+2) & 0x3ffff) >> SH2_DRCBLK_RAM_SHIFT] |= 0x80; + } +#endif } } - else + else if (!(sh2->state & (SH2_STATE_CPOLL|SH2_STATE_VPOLL|SH2_STATE_RPOLL))) { sh2->poll_cnt = 0; - sh2->poll_addr = a; - sh2->poll_cycles = cycles_left; + sh2->poll_addr = a; + } + sh2->poll_cycles = cycles_done; } -void p32x_sh2_poll_event(SH2 *sh2, u32 flags, u32 m68k_cycles) +void NOINLINE p32x_sh2_poll_event(SH2 *sh2, u32 flags, u32 m68k_cycles) { if (sh2->state & flags) { elprintf_sh2(sh2, EL_32X, "state: %02x->%02x", sh2->state, sh2->state & ~flags); - if (sh2->m68krcycles_done < m68k_cycles) + if (sh2->m68krcycles_done < m68k_cycles && !(sh2->state & SH2_STATE_RUN)) sh2->m68krcycles_done = m68k_cycles; pevt_log_sh2_o(sh2, EVT_POLL_END); + sh2->state &= ~flags; } - sh2->state &= ~flags; - sh2->poll_addr = sh2->poll_cycles = sh2->poll_cnt = 0; + if (!(sh2->state & (SH2_STATE_CPOLL|SH2_STATE_VPOLL|SH2_STATE_RPOLL))) + sh2->poll_addr = sh2->poll_cycles = sh2->poll_cnt = 0; } -static void sh2s_sync_on_read(SH2 *sh2) +static void sh2s_sync_on_read(SH2 *sh2, unsigned cycles) { - int cycles; if (sh2->poll_cnt != 0) return; - cycles = sh2_cycles_done(sh2); - if (cycles > 600) - p32x_sync_other_sh2(sh2, sh2->m68krcycles_done + cycles / 3); + if (p32x_sh2_ready(sh2->other_sh2, cycles-250)) + p32x_sync_other_sh2(sh2, cycles); 
+} + +// poll fifo, stores writes to potential addresses used for polling. +// This is used to correctly deliver syncronisation data to the 3 cpus. The +// fifo stores 16 bit values, 8/32 bit accesses must be adapted accordingly. +#define PFIFO_SZ 4 +#define PFIFO_CNT 8 +struct sh2_poll_fifo { + u32 cycles; + u32 a; + u16 d; + int cpu; +} sh2_poll_fifo[PFIFO_CNT][PFIFO_SZ]; +unsigned sh2_poll_rd[PFIFO_CNT], sh2_poll_wr[PFIFO_CNT]; // ringbuffer pointers + +static NOINLINE u32 sh2_poll_read(u32 a, u32 d, unsigned int cycles, SH2* sh2) +{ + int hix = (a >> 1) % PFIFO_CNT; + struct sh2_poll_fifo *fifo = sh2_poll_fifo[hix]; + struct sh2_poll_fifo *p; + int cpu = sh2 ? sh2->is_slave : -1; + unsigned idx; + + a &= ~0x20000000; // ignore writethrough bit + // fetch oldest write to address from fifo, but stop when reaching the present + idx = sh2_poll_rd[hix]; + while (idx != sh2_poll_wr[hix] && CYCLES_GE(cycles, fifo[idx].cycles)) { + p = &fifo[idx]; + idx = (idx+1) % PFIFO_SZ; + + if (cpu != p->cpu) { + if (CYCLES_GT(cycles, p->cycles+80)) { + // drop older fifo stores that may cause synchronisation problems. + p->a = -1; + } else if (p->a == a) { + // replace current data with fifo value and discard fifo entry + d = p->d; + p->a = -1; + break; + } + } + } + return d; +} + +static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2) +{ + int hix = (a >> 1) % PFIFO_CNT; + struct sh2_poll_fifo *fifo = sh2_poll_fifo[hix]; + struct sh2_poll_fifo *q; + int cpu = sh2 ? 
sh2->is_slave : -1; + unsigned rd = sh2_poll_rd[hix], wr = sh2_poll_wr[hix]; + unsigned idx, nrd; + + a &= ~0x20000000; // ignore writethrough bit + + // throw out any values written by other cpus, plus heading cancelled stuff + for (idx = nrd = wr; idx != rd; ) { + idx = (idx-1) % PFIFO_SZ; + q = &fifo[idx]; + if (q->a == a && q->cpu != cpu) { q->a = -1; } + if (q->a != -1) { nrd = idx; } + } + rd = nrd; + + // fold 2 consecutive writes to the same address to avoid reading of + // intermediate values that may cause synchronisation problems. + // NB this can take an eternity on m68k: mov.b , needs + // 28 m68k-cycles (~80 sh2-cycles) to complete (observed in Metal Head) + q = &fifo[(sh2_poll_wr[hix]-1) % PFIFO_SZ]; + if (rd != wr && q->a == a && !CYCLES_GT(cycles,q->cycles + (cpu<0 ? 30:4))) { + q->d = d; + } else { + // store write to poll address in fifo + fifo[wr] = + (struct sh2_poll_fifo){ .cycles = cycles, .a = a, .d = d, .cpu = cpu }; + wr = (wr+1) % PFIFO_SZ; + if (wr == rd) + // fifo overflow, discard oldest value + rd = (rd+1) % PFIFO_SZ; + } + + sh2_poll_rd[hix] = rd; sh2_poll_wr[hix] = wr; +} + +u32 REGPARM(3) p32x_sh2_poll_memory8(u32 a, u32 d, SH2 *sh2) +{ + int shift = (a & 1 ? 0 : 8); + d = (s8)(p32x_sh2_poll_memory16(a & ~1, d << shift, sh2) >> shift); + return d; +} + +u32 REGPARM(3) p32x_sh2_poll_memory16(u32 a, u32 d, SH2 *sh2) +{ + unsigned char *p = sh2->p_drcblk_ram; + unsigned int cycles; + + DRC_SAVE_SR(sh2); + // is this a synchronisation address? 
+ if(p[(a & 0x3ffff) >> SH2_DRCBLK_RAM_SHIFT] & 0x80) { + cycles = sh2_cycles_done_m68k(sh2); + sh2s_sync_on_read(sh2, cycles); + // check poll fifo and sign-extend the result correctly + d = (s16)sh2_poll_read(a, d, cycles, sh2); + } + + p32x_sh2_poll_detect(a, sh2, SH2_STATE_RPOLL, 5); + + DRC_RESTORE_SR(sh2); + return d; +} + +u32 REGPARM(3) p32x_sh2_poll_memory32(u32 a, u32 d, SH2 *sh2) +{ + unsigned char *p = sh2->p_drcblk_ram; + unsigned int cycles; + + DRC_SAVE_SR(sh2); + // is this a synchronisation address? + if(p[(a & 0x3ffff) >> SH2_DRCBLK_RAM_SHIFT] & 0x80) { + cycles = sh2_cycles_done_m68k(sh2); + sh2s_sync_on_read(sh2, cycles); + // check poll fifo and sign-extend the result correctly + d = (sh2_poll_read(a, d >> 16, cycles, sh2) << 16) | + ((u16)sh2_poll_read(a+2, d, cycles, sh2)); + } + + p32x_sh2_poll_detect(a, sh2, SH2_STATE_RPOLL, 5); + + DRC_RESTORE_SR(sh2); + return d; } // SH2 faking @@ -191,23 +345,21 @@ static u32 p32x_reg_read16(u32 a) #else if ((a & 0x30) == 0x20) { unsigned int cycles = SekCyclesDone(); - int comreg = 1 << (a & 0x0f) / 2; - if (cycles - msh2.m68krcycles_done > 244 - || (Pico32x.comm_dirty & comreg)) + if (CYCLES_GT(cycles - msh2.m68krcycles_done, 244)) p32x_sync_sh2s(cycles); if (m68k_poll_detect(a, cycles, P32XF_68KCPOLL)) { SekSetStop(1); SekEndRun(16); } - goto out; + return sh2_poll_read(a, Pico32x.regs[a / 2], cycles, NULL); } #endif if (a == 2) { // INTM, INTS unsigned int cycles = SekCyclesDone(); - if (cycles - msh2.m68krcycles_done > 64) + if (CYCLES_GT(cycles - msh2.m68krcycles_done, 64)) p32x_sync_sh2s(cycles); goto out; } @@ -267,7 +419,7 @@ static void p32x_reg_write8(u32 a, u32 d) return; case 0x03: // irq ctl if ((d ^ r[0x02 / 2]) & 3) { - int cycles = SekCyclesDone(); + unsigned int cycles = SekCyclesDone(); p32x_sync_sh2s(cycles); r[0x02 / 2] = d & 3; p32x_update_cmd_irq(NULL, cycles); @@ -342,6 +494,35 @@ static void p32x_reg_write8(u32 a, u32 d) case 0x1d: case 0x1e: case 0x1f: + return; + case 0x20: // 
comm port + case 0x21: + case 0x22: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2a: + case 0x2b: + case 0x2c: + case 0x2d: + case 0x2e: + case 0x2f: + if (REG8IN16(r, a) != d) { + unsigned int cycles = SekCyclesDone(); + + if (CYCLES_GT(cycles - msh2.m68krcycles_done, 64)) + p32x_sync_sh2s(cycles); + + REG8IN16(r, a) = d; + p32x_sh2_poll_event(&sh2s[0], SH2_STATE_CPOLL, cycles); + p32x_sh2_poll_event(&sh2s[1], SH2_STATE_CPOLL, cycles); + sh2_poll_write(a & ~1, r[a / 2], cycles, NULL); + } + return; case 0x30: return; case 0x31: // PWM control @@ -381,26 +562,6 @@ static void p32x_reg_write8(u32 a, u32 d) p32x_pwm_write16(a & ~1, d, NULL, SekCyclesDone()); return; } - - if ((a & 0x30) == 0x20) { - int cycles = SekCyclesDone(); - int comreg; - - if (REG8IN16(r, a) == d) - return; - - p32x_sync_sh2s(cycles); - - REG8IN16(r, a) = d; - p32x_sh2_poll_event(&sh2s[0], SH2_STATE_CPOLL, cycles); - p32x_sh2_poll_event(&sh2s[1], SH2_STATE_CPOLL, cycles); - comreg = 1 << (a & 0x0f) / 2; - Pico32x.comm_dirty |= comreg; - - if (cycles - (int)msh2.m68krcycles_done > 120) - p32x_sync_sh2s(cycles); - return; - } } static void p32x_reg_write16(u32 a, u32 d) @@ -411,59 +572,68 @@ static void p32x_reg_write16(u32 a, u32 d) // for things like bset on comm port m68k_poll.cnt = 0; - switch (a) { - case 0x00: // adapter ctl + switch (a/2) { + case 0x00/2: // adapter ctl if ((d ^ r[0]) & d & P32XS_nRES) p32x_reset_sh2s(); r[0] &= ~(P32XS_FM|P32XS_nRES|P32XS_ADEN); r[0] |= d & (P32XS_FM|P32XS_nRES|P32XS_ADEN); return; - case 0x08: // DREQ src + case 0x08/2: // DREQ src r[a / 2] = d & 0xff; return; - case 0x0a: + case 0x0a/2: r[a / 2] = d & ~1; return; - case 0x0c: // DREQ dest + case 0x0c/2: // DREQ dest r[a / 2] = d & 0xff; return; - case 0x0e: + case 0x0e/2: r[a / 2] = d; return; - case 0x10: // DREQ len + case 0x10/2: // DREQ len r[a / 2] = d & ~3; return; - case 0x12: // FIFO reg + case 0x12/2: // FIFO reg dreq0_write(r, d); return; 
- case 0x1a: // TV + mystery bit + case 0x1a/2: // TV + mystery bit r[a / 2] = d & 0x0101; return; - case 0x30: // PWM control + case 0x20/2: // comm port + case 0x22/2: + case 0x24/2: + case 0x26/2: + case 0x28/2: + case 0x2a/2: + case 0x2c/2: + case 0x2e/2: + if (r[a / 2] != d) { + unsigned int cycles = SekCyclesDone(); + + if (CYCLES_GT(cycles - msh2.m68krcycles_done, 64)) + p32x_sync_sh2s(cycles); + + r[a / 2] = d; + p32x_sh2_poll_event(&sh2s[0], SH2_STATE_CPOLL, cycles); + p32x_sh2_poll_event(&sh2s[1], SH2_STATE_CPOLL, cycles); + sh2_poll_write(a, (u16)d, cycles, NULL); + } + return; + case 0x30/2: // PWM control d = (r[a / 2] & ~0x0f) | (d & 0x0f); r[a / 2] = d; p32x_pwm_write16(a, d, NULL, SekCyclesDone()); return; - } - - // comm port - if ((a & 0x30) == 0x20) { - int cycles = SekCyclesDone(); - int comreg; - - p32x_sync_sh2s(cycles); - - r[a / 2] = d; - p32x_sh2_poll_event(&sh2s[0], SH2_STATE_CPOLL, cycles); - p32x_sh2_poll_event(&sh2s[1], SH2_STATE_CPOLL, cycles); - comreg = 1 << (a & 0x0f) / 2; - Pico32x.comm_dirty |= comreg; - return; - } - // PWM - else if ((a & 0x30) == 0x30) { - p32x_pwm_write16(a, d, NULL, SekCyclesDone()); - return; + case 0x32/2: + case 0x34/2: + case 0x36/2: + case 0x38/2: + case 0x3a/2: + case 0x3c/2: + case 0x3e/2: + p32x_pwm_write16(a, d, NULL, SekCyclesDone()); + return; } p32x_reg_write8(a + 1, d); @@ -541,7 +711,7 @@ static void p32x_vdp_write16(u32 a, u32 d, SH2 *sh2) } Pico32x.vdp_regs[0x06 / 2] = a; Pico32x.vdp_regs[0x08 / 2] = d; - if (sh2 != NULL && len > 4) { + if (sh2 != NULL && len > 8) { Pico32x.vdp_regs[0x0a / 2] |= P32XV_nFEN; // supposedly takes 3 bus/6 sh2 cycles? or 3 sh2 cycles? 
p32x_event_schedule_sh2(sh2, P32X_EVENT_FILLEND, 3 + len); @@ -558,25 +728,27 @@ static void p32x_vdp_write16(u32 a, u32 d, SH2 *sh2) static u32 p32x_sh2reg_read16(u32 a, SH2 *sh2) { u16 *r = Pico32x.regs; + unsigned cycles; a &= 0x3e; - switch (a) { - case 0x00: // adapter/irq ctl + switch (a/2) { + case 0x00/2: // adapter/irq ctl return (r[0] & P32XS_FM) | Pico32x.sh2_regs[0] | Pico32x.sh2irq_mask[sh2->is_slave]; - case 0x04: // H count (often as comm too) - sh2_poll_detect(sh2, a, SH2_STATE_CPOLL, 3); - sh2s_sync_on_read(sh2); - return Pico32x.sh2_regs[4 / 2]; - case 0x06: + case 0x04/2: // H count (often as comm too) + p32x_sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 9); + cycles = sh2_cycles_done_m68k(sh2); + sh2s_sync_on_read(sh2, cycles); + return sh2_poll_read(a, Pico32x.sh2_regs[4 / 2], cycles, sh2); + case 0x06/2: return (r[a / 2] & ~P32XS_FULL) | 0x4000; - case 0x08: // DREQ src - case 0x0a: - case 0x0c: // DREQ dst - case 0x0e: - case 0x10: // DREQ len + case 0x08/2: // DREQ src + case 0x0a/2: + case 0x0c/2: // DREQ dst + case 0x0e/2: + case 0x10/2: // DREQ len return r[a / 2]; - case 0x12: // DREQ FIFO - does this work on hw? + case 0x12/2: // DREQ FIFO - does this work on hw? if (Pico32x.dmac0_fifo_ptr > 0) { Pico32x.dmac0_fifo_ptr--; r[a / 2] = Pico32x.dmac_fifo[0]; @@ -584,23 +756,35 @@ static u32 p32x_sh2reg_read16(u32 a, SH2 *sh2) Pico32x.dmac0_fifo_ptr * 2); } return r[a / 2]; - case 0x14: - case 0x16: - case 0x18: - case 0x1a: - case 0x1c: + case 0x14/2: + case 0x16/2: + case 0x18/2: + case 0x1a/2: + case 0x1c/2: return 0; // ? 
+ case 0x20/2: // comm port + case 0x22/2: + case 0x24/2: + case 0x26/2: + case 0x28/2: + case 0x2a/2: + case 0x2c/2: + case 0x2e/2: + p32x_sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 9); + cycles = sh2_cycles_done_m68k(sh2); + sh2s_sync_on_read(sh2, cycles); + return sh2_poll_read(a, r[a / 2], cycles, sh2); + case 0x30/2: // PWM + case 0x32/2: + case 0x34/2: + case 0x36/2: + case 0x38/2: + case 0x3a/2: + case 0x3c/2: + case 0x3e/2: + return p32x_pwm_read16(a, sh2, sh2_cycles_done_m68k(sh2)); } - // comm port - if ((a & 0x30) == 0x20) { - sh2_poll_detect(sh2, a, SH2_STATE_CPOLL, 3); - sh2s_sync_on_read(sh2); - return r[a / 2]; - } - if ((a & 0x30) == 0x30) - return p32x_pwm_read16(a, sh2, sh2_cycles_done_m68k(sh2)); - elprintf_sh2(sh2, EL_32X|EL_ANOMALY, "unhandled sysreg r16 [%02x] @%08x", a, sh2_pc(sh2)); return 0; @@ -612,7 +796,7 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) u32 old; a &= 0x3f; - sh2->poll_addr = 0; + sh2->poll_cnt = 0; switch (a) { case 0x00: // FM @@ -640,10 +824,39 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) case 0x05: // H count d &= 0xff; if (Pico32x.sh2_regs[4 / 2] != d) { + unsigned int cycles = sh2_cycles_done_m68k(sh2); Pico32x.sh2_regs[4 / 2] = d; - p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, - sh2_cycles_done_m68k(sh2)); - sh2_end_run(sh2, 4); + p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); + if (p32x_sh2_ready(sh2->other_sh2, cycles+8)) + sh2_end_run(sh2, 4); + sh2_poll_write(a & ~1, d, cycles, sh2); + } + return; + case 0x20: // comm port + case 0x21: + case 0x22: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2a: + case 0x2b: + case 0x2c: + case 0x2d: + case 0x2e: + case 0x2f: + if (REG8IN16(r, a) != d) { + unsigned int cycles = sh2_cycles_done_m68k(sh2); + + REG8IN16(r, a) = d; + p32x_m68k_poll_event(P32XF_68KCPOLL); + p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); + if (p32x_sh2_ready(sh2->other_sh2, cycles+8)) + 
sh2_end_run(sh2, 1); + sh2_poll_write(a & ~1, r[a / 2], cycles, sh2); } return; case 0x30: @@ -683,24 +896,10 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) case 0x3f: return; pwm_write: - p32x_pwm_write16(a & ~1, d, sh2, 0); + p32x_pwm_write16(a & ~1, d, sh2, sh2_cycles_done_m68k(sh2)); return; } - if ((a & 0x30) == 0x20) { - int comreg; - if (REG8IN16(r, a) == d) - return; - - REG8IN16(r, a) = d; - p32x_m68k_poll_event(P32XF_68KCPOLL); - p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, - sh2_cycles_done_m68k(sh2)); - comreg = 1 << (a & 0x0f) / 2; - Pico32x.comm_dirty |= comreg; - return; - } - elprintf(EL_32X|EL_ANOMALY, "unhandled sysreg w8 [%02x] %02x @%08x", a, d, sh2_pc(sh2)); } @@ -709,51 +908,60 @@ static void p32x_sh2reg_write16(u32 a, u32 d, SH2 *sh2) { a &= 0x3e; - sh2->poll_addr = 0; - - // comm - if ((a & 0x30) == 0x20) { - int comreg; - if (Pico32x.regs[a / 2] == d) - return; - - Pico32x.regs[a / 2] = d; - p32x_m68k_poll_event(P32XF_68KCPOLL); - p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, - sh2_cycles_done_m68k(sh2)); - comreg = 1 << (a & 0x0f) / 2; - Pico32x.comm_dirty |= comreg; - return; - } - // PWM - else if ((a & 0x30) == 0x30) { - p32x_pwm_write16(a, d, sh2, sh2_cycles_done_m68k(sh2)); - return; - } + sh2->poll_cnt = 0; - switch (a) { - case 0: // FM + switch (a/2) { + case 0x00/2: // FM Pico32x.regs[0] &= ~P32XS_FM; Pico32x.regs[0] |= d & P32XS_FM; break; - case 0x14: + case 0x14/2: Pico32x.sh2irqs &= ~P32XI_VRES; goto irls; - case 0x16: + case 0x16/2: Pico32x.sh2irqi[sh2->is_slave] &= ~P32XI_VINT; goto irls; - case 0x18: + case 0x18/2: Pico32x.sh2irqi[sh2->is_slave] &= ~P32XI_HINT; goto irls; - case 0x1a: + case 0x1a/2: Pico32x.regs[2 / 2] &= ~(1 << sh2->is_slave); p32x_update_cmd_irq(sh2, 0); return; - case 0x1c: + case 0x1c/2: p32x_pwm_sync_to_sh2(sh2); Pico32x.sh2irqi[sh2->is_slave] &= ~P32XI_PWM; p32x_pwm_schedule_sh2(sh2); goto irls; + case 0x20/2: // comm port + case 0x22/2: + case 0x24/2: + case 0x26/2: + case 
0x28/2: + case 0x2a/2: + case 0x2c/2: + case 0x2e/2: + if (Pico32x.regs[a / 2] != d) { + unsigned int cycles = sh2_cycles_done_m68k(sh2); + + Pico32x.regs[a / 2] = d; + p32x_m68k_poll_event(P32XF_68KCPOLL); + p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); + if (p32x_sh2_ready(sh2->other_sh2, cycles+8)) + sh2_end_run(sh2, 1); + sh2_poll_write(a, d, cycles, sh2); + } + return; + case 0x30/2: // PWM + case 0x32/2: + case 0x34/2: + case 0x36/2: + case 0x38/2: + case 0x3a/2: + case 0x3c/2: + case 0x3e/2: + p32x_pwm_write16(a, d, sh2, sh2_cycles_done_m68k(sh2)); + return; } p32x_sh2reg_write8(a | 1, d, sh2); @@ -1064,41 +1272,41 @@ void PicoWrite16_32x(u32 a, u32 d) } /* quirk: in both normal and overwrite areas only nonzero values go through */ -#define sh2_write8_dramN(n) \ +#define sh2_write8_dramN(p, a, d) \ if ((d & 0xff) != 0) { \ - u8 *dram = (u8 *)Pico32xMem->dram[n]; \ + u8 *dram = (u8 *)p; \ dram[(a & 0x1ffff) ^ 1] = d; \ } static void m68k_write8_dram0_ow(u32 a, u32 d) { - sh2_write8_dramN(0); + sh2_write8_dramN(Pico32xMem->dram[0], a, d); } static void m68k_write8_dram1_ow(u32 a, u32 d) { - sh2_write8_dramN(1); + sh2_write8_dramN(Pico32xMem->dram[1], a, d); } -#define sh2_write16_dramN(n) \ - u16 *pd = &Pico32xMem->dram[n][(a & 0x1ffff) / 2]; \ +#define sh2_write16_dramN(p, a, d) \ + u16 *pd = &((u16 *)p)[(a & 0x1ffff) / 2]; \ if (!(a & 0x20000)) { \ *pd = d; \ - return; \ - } \ - /* overwrite */ \ - if (!(d & 0xff00)) d |= *pd & 0xff00; \ - if (!(d & 0x00ff)) d |= *pd & 0x00ff; \ - *pd = d; + } else { \ + u16 v = *pd; /* overwrite */ \ + if (!(d & 0x00ff)) d |= v & 0x00ff; \ + if (!(d & 0xff00)) d |= v & 0xff00; \ + *pd = d; \ + } static void m68k_write16_dram0_ow(u32 a, u32 d) { - sh2_write16_dramN(0); + sh2_write16_dramN(Pico32xMem->dram[0], a, d); } static void m68k_write16_dram1_ow(u32 a, u32 d) { - sh2_write16_dramN(1); + sh2_write16_dramN(Pico32xMem->dram[1], a, d); } // ----------------------------------------------------------------- 
@@ -1227,20 +1435,21 @@ static void bank_switch_rom_68k(int b) // ----------------------------------------------------------------- // read8 -static u32 sh2_read8_unmapped(u32 a, SH2 *sh2) +static REGPARM(2) u32 sh2_read8_unmapped(u32 a, SH2 *sh2) { elprintf_sh2(sh2, EL_32X, "unmapped r8 [%08x] %02x @%06x", a, 0, sh2_pc(sh2)); return 0; } -static u32 sh2_read8_cs0(u32 a, SH2 *sh2) +static u32 REGPARM(2) sh2_read8_cs0(u32 a, SH2 *sh2) { u32 d = 0; + DRC_SAVE_SR(sh2); sh2_burn_cycles(sh2, 1*2); - // 0x3ffc0 is veridied + // 0x3ffc0 is verified if ((a & 0x3ffc0) == 0x4000) { d = p32x_sh2reg_read16(a, sh2); goto out_16to8; @@ -1248,22 +1457,23 @@ static u32 sh2_read8_cs0(u32 a, SH2 *sh2) if ((a & 0x3fff0) == 0x4100) { d = p32x_vdp_read16(a); - sh2_poll_detect(sh2, a, SH2_STATE_VPOLL, 7); + p32x_sh2_poll_detect(a, sh2, SH2_STATE_VPOLL, 9); goto out_16to8; } - // TODO: mirroring? - if (!sh2->is_slave && a < sizeof(Pico32xMem->sh2_rom_m)) - return Pico32xMem->sh2_rom_m.b[a ^ 1]; - if (sh2->is_slave && a < sizeof(Pico32xMem->sh2_rom_s)) - return Pico32xMem->sh2_rom_s.b[a ^ 1]; - if ((a & 0x3fe00) == 0x4200) { d = Pico32xMem->pal[(a & 0x1ff) / 2]; goto out_16to8; } - return sh2_read8_unmapped(a, sh2); + // TODO: mirroring? 
+ if (!sh2->is_slave && a < sizeof(Pico32xMem->sh2_rom_m)) + d = Pico32xMem->sh2_rom_m.b[a ^ 1]; + else if (sh2->is_slave && a < sizeof(Pico32xMem->sh2_rom_s)) + d = Pico32xMem->sh2_rom_s.b[a ^ 1]; + else + d = sh2_read8_unmapped(a, sh2); + goto out; out_16to8: if (a & 1) @@ -1271,80 +1481,138 @@ static u32 sh2_read8_cs0(u32 a, SH2 *sh2) else d >>= 8; +out: elprintf_sh2(sh2, EL_32X, "r8 [%08x] %02x @%06x", a, d, sh2_pc(sh2)); - return d; -} - -static u32 sh2_read8_da(u32 a, SH2 *sh2) -{ - return sh2->data_array[(a & 0xfff) ^ 1]; + DRC_RESTORE_SR(sh2); + return (s8)d; } // for ssf2 -static u32 sh2_read8_rom(u32 a, SH2 *sh2) +static u32 REGPARM(2) sh2_read8_rom(u32 a, SH2 *sh2) { u32 bank = carthw_ssf2_banks[(a >> 19) & 7] << 19; - return Pico.rom[(bank + (a & 0x7ffff)) ^ 1]; + s8 *p = sh2->p_rom; + return p[(bank + (a & 0x7ffff)) ^ 1]; } // read16 -static u32 sh2_read16_unmapped(u32 a, SH2 *sh2) +static u32 REGPARM(2) sh2_read16_unmapped(u32 a, SH2 *sh2) { elprintf_sh2(sh2, EL_32X, "unmapped r16 [%08x] %04x @%06x", a, 0, sh2_pc(sh2)); return 0; } -static u32 sh2_read16_cs0(u32 a, SH2 *sh2) +static u32 REGPARM(2) sh2_read16_cs0(u32 a, SH2 *sh2) { u32 d = 0; + DRC_SAVE_SR(sh2); sh2_burn_cycles(sh2, 1*2); if ((a & 0x3ffc0) == 0x4000) { d = p32x_sh2reg_read16(a, sh2); if (!(EL_LOGMASK & EL_PWM) && (a & 0x30) == 0x30) // hide PWM - return d; + goto out_noprint; goto out; } if ((a & 0x3fff0) == 0x4100) { d = p32x_vdp_read16(a); - sh2_poll_detect(sh2, a, SH2_STATE_VPOLL, 7); + p32x_sh2_poll_detect(a, sh2, SH2_STATE_VPOLL, 9); goto out; } - if (!sh2->is_slave && a < sizeof(Pico32xMem->sh2_rom_m)) - return Pico32xMem->sh2_rom_m.w[a / 2]; - if (sh2->is_slave && a < sizeof(Pico32xMem->sh2_rom_s)) - return Pico32xMem->sh2_rom_s.w[a / 2]; - if ((a & 0x3fe00) == 0x4200) { d = Pico32xMem->pal[(a & 0x1ff) / 2]; goto out; } - return sh2_read16_unmapped(a, sh2); + if (!sh2->is_slave && a < sizeof(Pico32xMem->sh2_rom_m)) + d = Pico32xMem->sh2_rom_m.w[a / 2]; + else if (sh2->is_slave 
&& a < sizeof(Pico32xMem->sh2_rom_s)) + d = Pico32xMem->sh2_rom_s.w[a / 2]; + else + d = sh2_read16_unmapped(a, sh2); out: elprintf_sh2(sh2, EL_32X, "r16 [%08x] %04x @%06x", a, d, sh2_pc(sh2)); - return d; +out_noprint: + DRC_RESTORE_SR(sh2); + return (s16)d; +} + +static u32 REGPARM(2) sh2_read16_rom(u32 a, SH2 *sh2) +{ + u32 bank = carthw_ssf2_banks[(a >> 19) & 7] << 19; + s16 *p = sh2->p_rom; + return p[(bank + (a & 0x7fffe)) / 2]; +} + +static u32 REGPARM(2) sh2_read32_unmapped(u32 a, SH2 *sh2) +{ + elprintf_sh2(sh2, EL_32X, "unmapped r32 [%08x] %08x @%06x", + a, 0, sh2_pc(sh2)); + return 0; } -static u32 sh2_read16_da(u32 a, SH2 *sh2) +static u32 REGPARM(2) sh2_read32_cs0(u32 a, SH2 *sh2) { - return ((u16 *)sh2->data_array)[(a & 0xfff) / 2]; + u32 d1 = sh2_read16_cs0(a, sh2) << 16, d2 = sh2_read16_cs0(a + 2, sh2) << 16; + return d1 | (d2 >> 16); } -static u32 sh2_read16_rom(u32 a, SH2 *sh2) +static u32 REGPARM(2) sh2_read32_rom(u32 a, SH2 *sh2) { u32 bank = carthw_ssf2_banks[(a >> 19) & 7] << 19; - return *(u16 *)(Pico.rom + bank + (a & 0x7fffe)); + u32 *p = sh2->p_rom; + u32 d = p[(bank + (a & 0x7fffc)) / 4]; + return (d << 16) | (d >> 16); } // writes +#ifdef DRC_SH2 +static void sh2_sdram_poll(u32 a, u32 d, SH2 *sh2) +{ + unsigned cycles; + + DRC_SAVE_SR(sh2); + cycles = sh2_cycles_done_m68k(sh2); + sh2_poll_write(a, d, cycles, sh2); + p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_RPOLL, cycles); + if (p32x_sh2_ready(sh2->other_sh2, cycles+8)) + sh2_end_run(sh2, 1); + DRC_RESTORE_SR(sh2); +} + +void sh2_sdram_checks(u32 a, u32 d, SH2 *sh2, u32 t) +{ + if (t & 0x80) sh2_sdram_poll(a, d, sh2); + if (t & 0x7f) sh2_drc_wcheck_ram(a, 2, sh2); +} + +void sh2_sdram_checks_l(u32 a, u32 d, SH2 *sh2, u32 t) +{ + if (t & 0x000080) sh2_sdram_poll(a, d>>16, sh2); + if (t & 0x800000) sh2_sdram_poll(a+2, d, sh2); + if (t & ~0x800080) sh2_drc_wcheck_ram(a, 4, sh2); +} + +#ifndef _ASM_32X_MEMORY_C +static void sh2_da_checks(u32 a, u32 t, SH2 *sh2) +{ + sh2_drc_wcheck_da(a, 
2, sh2); +} + +static void sh2_da_checks_l(u32 a, u32 t, SH2 *sh2) +{ + sh2_drc_wcheck_da(a, 4, sh2); +} +#endif +#endif + static void REGPARM(3) sh2_write_ignore(u32 a, u32 d, SH2 *sh2) { } @@ -1358,66 +1626,69 @@ static void REGPARM(3) sh2_write8_unmapped(u32 a, u32 d, SH2 *sh2) static void REGPARM(3) sh2_write8_cs0(u32 a, u32 d, SH2 *sh2) { + DRC_SAVE_SR(sh2); elprintf_sh2(sh2, EL_32X, "w8 [%08x] %02x @%06x", a, d & 0xff, sh2_pc(sh2)); + if ((a & 0x3ffc0) == 0x4000) { + p32x_sh2reg_write8(a, d, sh2); + goto out; + } + if (Pico32x.regs[0] & P32XS_FM) { if ((a & 0x3fff0) == 0x4100) { - sh2->poll_addr = 0; + sh2->poll_cnt = 0; p32x_vdp_write8(a, d); - return; + goto out; } - } - if ((a & 0x3ffc0) == 0x4000) { - p32x_sh2reg_write8(a, d, sh2); - return; + if ((a & 0x3fe00) == 0x4200) { + sh2->poll_cnt = 0; + ((u8 *)Pico32xMem->pal)[(a & 0x1ff) ^ 1] = d; + Pico32x.dirty_pal = 1; + goto out; + } } sh2_write8_unmapped(a, d, sh2); +out: + DRC_RESTORE_SR(sh2); } -static void REGPARM(3) sh2_write8_dram0(u32 a, u32 d, SH2 *sh2) -{ - sh2_write8_dramN(0); -} - -static void REGPARM(3) sh2_write8_dram1(u32 a, u32 d, SH2 *sh2) +#ifdef _ASM_32X_MEMORY_C +extern void REGPARM(3) sh2_write8_dram(u32 a, u32 d, SH2 *sh2); +extern void REGPARM(3) sh2_write8_sdram(u32 a, u32 d, SH2 *sh2); +extern void REGPARM(3) sh2_write8_da(u32 a, u32 d, SH2 *sh2); +#else +static void REGPARM(3) sh2_write8_dram(u32 a, u32 d, SH2 *sh2) { - sh2_write8_dramN(1); + sh2_write8_dramN(sh2->p_dram, a, d); } static void REGPARM(3) sh2_write8_sdram(u32 a, u32 d, SH2 *sh2) { - u32 a1 = a & 0x3ffff; + u32 a1 = (a & 0x3ffff) ^ 1; + ((u8 *)sh2->p_sdram)[a1] = d; #ifdef DRC_SH2 - int t = Pico32xMem->drcblk_ram[a1 >> SH2_DRCBLK_RAM_SHIFT]; + u8 *p = sh2->p_drcblk_ram; + u32 t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; if (t) - sh2_drc_wcheck_ram(a, t, sh2->is_slave); + sh2_sdram_checks(a & ~1, ((u16 *)sh2->p_sdram)[a1 / 2], sh2, t); #endif - Pico32xMem->sdram[a1 ^ 1] = d; -} - -static void REGPARM(3) sh2_write8_sdram_wt(u32 a, 
u32 d, SH2 *sh2) -{ - // xmen sync hack.. - if (a < 0x26000200) - sh2_end_run(sh2, 32); - - sh2_write8_sdram(a, d, sh2); } static void REGPARM(3) sh2_write8_da(u32 a, u32 d, SH2 *sh2) { - u32 a1 = a & 0xfff; + u32 a1 = (a & 0xfff) ^ 1; + sh2->data_array[a1] = d; #ifdef DRC_SH2 - int id = sh2->is_slave; - int t = Pico32xMem->drcblk_da[id][a1 >> SH2_DRCBLK_DA_SHIFT]; + u8 *p = sh2->p_drcblk_da; + u32 t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; if (t) - sh2_drc_wcheck_da(a, t, id); + sh2_da_checks(a, t, sh2); #endif - sh2->data_array[a1 ^ 1] = d; } +#endif // write16 static void REGPARM(3) sh2_write16_unmapped(u32 a, u32 d, SH2 *sh2) @@ -1428,65 +1699,153 @@ static void REGPARM(3) sh2_write16_unmapped(u32 a, u32 d, SH2 *sh2) static void REGPARM(3) sh2_write16_cs0(u32 a, u32 d, SH2 *sh2) { + DRC_SAVE_SR(sh2); if (((EL_LOGMASK & EL_PWM) || (a & 0x30) != 0x30)) // hide PWM elprintf_sh2(sh2, EL_32X, "w16 [%08x] %04x @%06x", a, d & 0xffff, sh2_pc(sh2)); + if ((a & 0x3ffc0) == 0x4000) { + p32x_sh2reg_write16(a, d, sh2); + goto out; + } + if (Pico32x.regs[0] & P32XS_FM) { if ((a & 0x3fff0) == 0x4100) { - sh2->poll_addr = 0; + sh2->poll_cnt = 0; p32x_vdp_write16(a, d, sh2); - return; + goto out; } if ((a & 0x3fe00) == 0x4200) { + sh2->poll_cnt = 0; Pico32xMem->pal[(a & 0x1ff) / 2] = d; Pico32x.dirty_pal = 1; - return; + goto out; } } - if ((a & 0x3ffc0) == 0x4000) { - p32x_sh2reg_write16(a, d, sh2); - return; - } - sh2_write16_unmapped(a, d, sh2); +out: + DRC_RESTORE_SR(sh2); } -static void REGPARM(3) sh2_write16_dram0(u32 a, u32 d, SH2 *sh2) -{ - sh2_write16_dramN(0); -} - -static void REGPARM(3) sh2_write16_dram1(u32 a, u32 d, SH2 *sh2) +#ifdef _ASM_32X_MEMORY_C +extern void REGPARM(3) sh2_write16_dram(u32 a, u32 d, SH2 *sh2); +extern void REGPARM(3) sh2_write16_sdram(u32 a, u32 d, SH2 *sh2); +extern void REGPARM(3) sh2_write16_da(u32 a, u32 d, SH2 *sh2); +#else +static void REGPARM(3) sh2_write16_dram(u32 a, u32 d, SH2 *sh2) { - sh2_write16_dramN(1); + sh2_write16_dramN(sh2->p_dram, 
a, d); } static void REGPARM(3) sh2_write16_sdram(u32 a, u32 d, SH2 *sh2) { - u32 a1 = a & 0x3ffff; + u32 a1 = a & 0x3fffe; + ((u16 *)sh2->p_sdram)[a1 / 2] = d; #ifdef DRC_SH2 - int t = Pico32xMem->drcblk_ram[a1 >> SH2_DRCBLK_RAM_SHIFT]; + u8 *p = sh2->p_drcblk_ram; + u32 t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; if (t) - sh2_drc_wcheck_ram(a, t, sh2->is_slave); + sh2_sdram_checks(a, d, sh2, t); #endif - ((u16 *)Pico32xMem->sdram)[a1 / 2] = d; } static void REGPARM(3) sh2_write16_da(u32 a, u32 d, SH2 *sh2) { - u32 a1 = a & 0xfff; + u32 a1 = a & 0xffe; + ((u16 *)sh2->data_array)[a1 / 2] = d; #ifdef DRC_SH2 - int id = sh2->is_slave; - int t = Pico32xMem->drcblk_da[id][a1 >> SH2_DRCBLK_DA_SHIFT]; + u8 *p = sh2->p_drcblk_da; + u32 t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; if (t) - sh2_drc_wcheck_da(a, t, id); + sh2_da_checks(a, t, sh2); #endif - ((u16 *)sh2->data_array)[a1 / 2] = d; +} +#endif + +static void REGPARM(3) sh2_write16_rom(u32 a, u32 d, SH2 *sh2) +{ + u32 a1 = a & 0x3ffffe; + // tweak for WWF Raw: does writes to ROM area, and it doesn't work without + // allowing this. + // Presumably the write goes to the CPU cache and is read back from there, + // but it would be extremely costly to emulate cache behaviour. Just allow + // writes to that region, hoping that the original ROM values are never used. 
+ if ((a1 & 0x3e0000) == 0x3e0000) + ((u16 *)sh2->p_rom)[a1 / 2] = d; + else + sh2_write16_unmapped(a, d, sh2); +} + +static void REGPARM(3) sh2_write32_unmapped(u32 a, u32 d, SH2 *sh2) +{ + elprintf_sh2(sh2, EL_32X, "unmapped w32 [%08x] %08x @%06x", + a, d, sh2_pc(sh2)); +} + +static void REGPARM(3) sh2_write32_cs0(u32 a, u32 d, SH2 *sh2) +{ + sh2_write16_cs0(a, d >> 16, sh2); + sh2_write16_cs0(a + 2, d, sh2); +} + +#define sh2_write32_dramN(p, a, d) \ + u32 *pd = &((u32 *)p)[(a & 0x1ffff) / 4]; \ + if (!(a & 0x20000)) { \ + *pd = (d << 16) | (d >> 16); \ + } else { \ + /* overwrite */ \ + u32 v = *pd, m = 0; d = (d << 16) | (d >> 16) ; \ + if (!(d & 0x000000ff)) m |= 0x000000ff; \ + if (!(d & 0x0000ff00)) m |= 0x0000ff00; \ + if (!(d & 0x00ff0000)) m |= 0x00ff0000; \ + if (!(d & 0xff000000)) m |= 0xff000000; \ + *pd = d | (v&m); \ + } + +#ifdef _ASM_32X_MEMORY_C +extern void REGPARM(3) sh2_write32_dram(u32 a, u32 d, SH2 *sh2); +extern void REGPARM(3) sh2_write32_sdram(u32 a, u32 d, SH2 *sh2); +extern void REGPARM(3) sh2_write32_da(u32 a, u32 d, SH2 *sh2); +#else +static void REGPARM(3) sh2_write32_dram(u32 a, u32 d, SH2 *sh2) +{ + sh2_write32_dramN(sh2->p_dram, a, d); } +static void REGPARM(3) sh2_write32_sdram(u32 a, u32 d, SH2 *sh2) +{ + u32 a1 = a & 0x3fffc; + *(u32 *)(sh2->p_sdram + a1) = (d << 16) | (d >> 16); +#ifdef DRC_SH2 + u8 *p = sh2->p_drcblk_ram; + u32 t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; + u32 u = p[(a1+2) >> SH2_DRCBLK_RAM_SHIFT]; + if (t|(u<<16)) + sh2_sdram_checks_l(a, d, sh2, t|(u<<16)); +#endif +} + +static void REGPARM(3) sh2_write32_da(u32 a, u32 d, SH2 *sh2) +{ + u32 a1 = a & 0xffc; + *((u32 *)sh2->data_array + a1/4) = (d << 16) | (d >> 16); +#ifdef DRC_SH2 + u8 *p = sh2->p_drcblk_da; + u32 t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; + u32 u = p[(a1+2) >> SH2_DRCBLK_DA_SHIFT]; + if (t|(u<<16)) + sh2_da_checks_l(a, t|(u<<16), sh2); +#endif +} +#endif + +static void REGPARM(3) sh2_write32_rom(u32 a, u32 d, SH2 *sh2) +{ + sh2_write16_rom(a, d >> 16, sh2); 
+ sh2_write16_rom(a + 2, d, sh2); +} typedef u32 (sh2_read_handler)(u32 a, SH2 *sh2); typedef void REGPARM(3) (sh2_write_handler)(u32 a, u32 d, SH2 *sh2); @@ -1504,10 +1863,10 @@ u32 REGPARM(2) p32x_sh2_read8(u32 a, SH2 *sh2) sh2_map += SH2MAP_ADDR2OFFS_R(a); p = sh2_map->addr; - if (map_flag_set(p)) - return ((sh2_read_handler *)(p << 1))(a, sh2); + if (!map_flag_set(p)) + return *(s8 *)((p << 1) + ((a & sh2_map->mask) ^ 1)); else - return *(u8 *)((p << 1) + ((a & sh2_map->mask) ^ 1)); + return ((sh2_read_handler *)(p << 1))(a, sh2); } u32 REGPARM(2) p32x_sh2_read16(u32 a, SH2 *sh2) @@ -1517,33 +1876,24 @@ u32 REGPARM(2) p32x_sh2_read16(u32 a, SH2 *sh2) sh2_map += SH2MAP_ADDR2OFFS_R(a); p = sh2_map->addr; - if (map_flag_set(p)) - return ((sh2_read_handler *)(p << 1))(a, sh2); + if (!map_flag_set(p)) + return *(s16 *)((p << 1) + (a & sh2_map->mask)); else - return *(u16 *)((p << 1) + ((a & sh2_map->mask) & ~1)); + return ((sh2_read_handler *)(p << 1))(a, sh2); } u32 REGPARM(2) p32x_sh2_read32(u32 a, SH2 *sh2) { - const sh2_memmap *sh2_map = sh2->read16_map; - sh2_read_handler *handler; - u32 offs; + const sh2_memmap *sh2_map = sh2->read32_map; uptr p; - offs = SH2MAP_ADDR2OFFS_R(a); - sh2_map += offs; + sh2_map += SH2MAP_ADDR2OFFS_R(a); p = sh2_map->addr; if (!map_flag_set(p)) { - // XXX: maybe 32bit access instead with ror? 
- u16 *pd = (u16 *)((p << 1) + ((a & sh2_map->mask) & ~1)); - return (pd[0] << 16) | pd[1]; - } - - if (offs == SH2MAP_ADDR2OFFS_R(0xffffc000)) - return sh2_peripheral_read32(a, sh2); - - handler = (sh2_read_handler *)(p << 1); - return (handler(a, sh2) << 16) | handler(a + 2, sh2); + u32 *pd = (u32 *)((p << 1) + (a & sh2_map->mask)); + return (*pd << 16) | (*pd >> 16); + } else + return ((sh2_read_handler *)(p << 1))(a, sh2); } void REGPARM(3) p32x_sh2_write8(u32 a, u32 d, SH2 *sh2) @@ -1566,27 +1916,111 @@ void REGPARM(3) p32x_sh2_write16(u32 a, u32 d, SH2 *sh2) void REGPARM(3) p32x_sh2_write32(u32 a, u32 d, SH2 *sh2) { - const void **sh2_wmap = sh2->write16_tab; + const void **sh2_wmap = sh2->write32_tab; sh2_write_handler *wh; - u32 offs; - offs = SH2MAP_ADDR2OFFS_W(a); + wh = sh2_wmap[SH2MAP_ADDR2OFFS_W(a)]; + wh(a, d, sh2); +} - if (offs == SH2MAP_ADDR2OFFS_W(0xffffc000)) { - sh2_peripheral_write32(a, d, sh2); - return; +void *p32x_sh2_get_mem_ptr(u32 a, u32 *mask, SH2 *sh2) +{ + const sh2_memmap *mm = sh2->read8_map; + void *ret = (void *)-1; + + mm += SH2MAP_ADDR2OFFS_R(a); + if (!map_flag_set(mm->addr)) { + // directly mapped memory (SDRAM, ROM, data array) + ret = (void *)(mm->addr << 1); + *mask = mm->mask; + } else if ((a & ~0x7ff) == 0) { + // BIOS, has handler function since it shares its segment with I/O + ret = sh2->p_bios; + *mask = 0x7ff; + } else if ((a & 0xc6000000) == 0x02000000) { + // banked ROM. 
Return bank address + u32 bank = carthw_ssf2_banks[(a >> 19) & 7] << 19; + ret = sh2->p_rom + bank; + *mask = 0x07ffff; + } + + return ret; +} + +int p32x_sh2_memcpy(u32 dst, u32 src, int count, int size, SH2 *sh2) +{ + u32 mask; + void *ps, *pd; + int len, i; + + // check if src and dst points to memory (rom/sdram/dram/da) + if ((pd = p32x_sh2_get_mem_ptr(dst, &mask, sh2)) == (void *)-1) + return 0; + if ((ps = p32x_sh2_get_mem_ptr(src, &mask, sh2)) == (void *)-1) + return 0; + ps += src & mask; + len = count * size; + + // DRAM in byte access is always in overwrite mode + if (pd == sh2->p_dram && size == 1) + dst |= 0x20000; + + // align dst to halfword + if (dst & 1) { + p32x_sh2_write8(dst, *(u8 *)((uptr)ps ^ 1), sh2); + ps++, dst++, len --; + } + + // copy data + if ((uptr)ps & 1) { + // unaligned, use halfword copy mode to reduce memory bandwidth + u16 *sp = (u16 *)(ps - 1); + u16 dl, dh = *sp++; + for (i = 0; i < (len & ~1); i += 2, dst += 2, sp++) { + dl = dh, dh = *sp; + p32x_sh2_write16(dst, (dh >> 8) | (dl << 8), sh2); + } + if (len & 1) + p32x_sh2_write8(dst, dh, sh2); + } else { + // dst and src at least halfword aligned + u16 *sp = (u16 *)ps; + // align dst to word + if ((dst & 2) && len >= 2) { + p32x_sh2_write16(dst, *sp++, sh2); + dst += 2, len -= 2; + } + if ((uptr)sp & 2) { + // halfword copy, using word writes to reduce memory bandwidth + u16 dl, dh; + for (i = 0; i < (len & ~3); i += 4, dst += 4, sp += 2) { + dl = sp[0], dh = sp[1]; + p32x_sh2_write32(dst, (dl << 16) | dh, sh2); + } + } else { + // word copy + u32 d; + for (i = 0; i < (len & ~3); i += 4, dst += 4, sp += 2) { + d = *(u32 *)sp; + p32x_sh2_write32(dst, (d << 16) | (d >> 16), sh2); + } + } + if (len & 2) { + p32x_sh2_write16(dst, *sp++, sh2); + dst += 2; + } + if (len & 1) + p32x_sh2_write8(dst, *sp >> 8, sh2); } - wh = sh2_wmap[offs]; - wh(a, d >> 16, sh2); - wh(a + 2, d, sh2); + return count; } // ----------------------------------------------------------------- -static void 
z80_md_bank_write_32x(unsigned int a, unsigned char d) +static void z80_md_bank_write_32x(u32 a, unsigned char d) { - unsigned int addr68k; + u32 addr68k; addr68k = Pico.m.z80_bank68k << 15; addr68k += a & 0x7fff; @@ -1787,9 +2221,11 @@ static void get_bios(void) #define MAP_MEMORY(m) ((uptr)(m) >> 1) #define MAP_HANDLER(h) ( ((uptr)(h) >> 1) | ((uptr)1 << (sizeof(uptr) * 8 - 1)) ) -static sh2_memmap sh2_read8_map[0x80], sh2_read16_map[0x80]; +static sh2_memmap msh2_read8_map[0x80], msh2_read16_map[0x80], msh2_read32_map[0x80]; +static sh2_memmap ssh2_read8_map[0x80], ssh2_read16_map[0x80], ssh2_read32_map[0x80]; // for writes we are using handlers only -static sh2_write_handler *sh2_write8_map[0x80], *sh2_write16_map[0x80]; +static sh2_write_handler *msh2_write8_map[0x80], *msh2_write16_map[0x80], *msh2_write32_map[0x80]; +static sh2_write_handler *ssh2_write8_map[0x80], *ssh2_write16_map[0x80], *ssh2_write32_map[0x80]; void Pico32xSwapDRAM(int b) { @@ -1803,23 +2239,35 @@ void Pico32xSwapDRAM(int b) b ? m68k_write16_dram1_ow : m68k_write16_dram0_ow, 1); // SH2 - sh2_read8_map[0x04/2].addr = sh2_read8_map[0x24/2].addr = - sh2_read16_map[0x04/2].addr = sh2_read16_map[0x24/2].addr = MAP_MEMORY(Pico32xMem->dram[b]); + msh2_read8_map[0x04/2].addr = msh2_read8_map[0x24/2].addr = + msh2_read16_map[0x04/2].addr = msh2_read16_map[0x24/2].addr = + msh2_read32_map[0x04/2].addr = msh2_read32_map[0x24/2].addr = MAP_MEMORY(Pico32xMem->dram[b]); + ssh2_read8_map[0x04/2].addr = ssh2_read8_map[0x24/2].addr = + ssh2_read16_map[0x04/2].addr = ssh2_read16_map[0x24/2].addr = + ssh2_read32_map[0x04/2].addr = ssh2_read32_map[0x24/2].addr = MAP_MEMORY(Pico32xMem->dram[b]); - sh2_write8_map[0x04/2] = sh2_write8_map[0x24/2] = b ? sh2_write8_dram1 : sh2_write8_dram0; - sh2_write16_map[0x04/2] = sh2_write16_map[0x24/2] = b ? 
sh2_write16_dram1 : sh2_write16_dram0; + // convenience ptrs + msh2.p_dram = ssh2.p_dram = Pico32xMem->dram[b]; } static void bank_switch_rom_sh2(void) { if (!carthw_ssf2_active) { // easy - sh2_read8_map[0x02/2].addr = sh2_read8_map[0x22/2].addr = - sh2_read16_map[0x02/2].addr = sh2_read16_map[0x22/2].addr = MAP_MEMORY(Pico.rom); + msh2_read8_map[0x02/2].addr = msh2_read8_map[0x22/2].addr = + msh2_read16_map[0x02/2].addr = msh2_read16_map[0x22/2].addr = + msh2_read32_map[0x02/2].addr = msh2_read32_map[0x22/2].addr = MAP_MEMORY(Pico.rom); + ssh2_read8_map[0x02/2].addr = ssh2_read8_map[0x22/2].addr = + ssh2_read16_map[0x02/2].addr = ssh2_read16_map[0x22/2].addr = + ssh2_read32_map[0x02/2].addr = ssh2_read32_map[0x22/2].addr = MAP_MEMORY(Pico.rom); } else { - sh2_read8_map[0x02/2].addr = sh2_read8_map[0x22/2].addr = MAP_HANDLER(sh2_read8_rom); - sh2_read16_map[0x02/2].addr = sh2_read16_map[0x22/2].addr = MAP_HANDLER(sh2_read16_rom); + msh2_read8_map[0x02/2].addr = msh2_read8_map[0x22/2].addr = MAP_HANDLER(sh2_read8_rom); + msh2_read16_map[0x02/2].addr = msh2_read16_map[0x22/2].addr = MAP_HANDLER(sh2_read16_rom); + msh2_read32_map[0x02/2].addr = msh2_read32_map[0x22/2].addr = MAP_HANDLER(sh2_read32_rom); + ssh2_read8_map[0x02/2].addr = ssh2_read8_map[0x22/2].addr = MAP_HANDLER(sh2_read8_rom); + ssh2_read16_map[0x02/2].addr = ssh2_read16_map[0x22/2].addr = MAP_HANDLER(sh2_read16_rom); + ssh2_read32_map[0x02/2].addr = ssh2_read32_map[0x22/2].addr = MAP_HANDLER(sh2_read32_rom); } } @@ -1886,63 +2334,110 @@ void PicoMemSetup32x(void) // SH2 maps: A31,A30,A29,CS1,CS0 // all unmapped by default - for (i = 0; i < ARRAY_SIZE(sh2_read8_map); i++) { - sh2_read8_map[i].addr = MAP_HANDLER(sh2_read8_unmapped); - sh2_read16_map[i].addr = MAP_HANDLER(sh2_read16_unmapped); + for (i = 0; i < ARRAY_SIZE(msh2_read8_map); i++) { + msh2_read8_map[i].addr = MAP_HANDLER(sh2_read8_unmapped); + msh2_read16_map[i].addr = MAP_HANDLER(sh2_read16_unmapped); + msh2_read32_map[i].addr = 
MAP_HANDLER(sh2_read32_unmapped); } - for (i = 0; i < ARRAY_SIZE(sh2_write8_map); i++) { - sh2_write8_map[i] = sh2_write8_unmapped; - sh2_write16_map[i] = sh2_write16_unmapped; + for (i = 0; i < ARRAY_SIZE(msh2_write8_map); i++) { + msh2_write8_map[i] = sh2_write8_unmapped; + msh2_write16_map[i] = sh2_write16_unmapped; + msh2_write32_map[i] = sh2_write32_unmapped; } // "purge area" for (i = 0x40; i <= 0x5f; i++) { - sh2_write8_map[i >> 1] = - sh2_write16_map[i >> 1] = sh2_write_ignore; + msh2_write8_map[i >> 1] = + msh2_write16_map[i >> 1] = + msh2_write32_map[i >> 1] = sh2_write_ignore; } // CS0 - sh2_read8_map[0x00/2].addr = sh2_read8_map[0x20/2].addr = MAP_HANDLER(sh2_read8_cs0); - sh2_read16_map[0x00/2].addr = sh2_read16_map[0x20/2].addr = MAP_HANDLER(sh2_read16_cs0); - sh2_write8_map[0x00/2] = sh2_write8_map[0x20/2] = sh2_write8_cs0; - sh2_write16_map[0x00/2] = sh2_write16_map[0x20/2] = sh2_write16_cs0; + msh2_read8_map[0x00/2].addr = msh2_read8_map[0x20/2].addr = MAP_HANDLER(sh2_read8_cs0); + msh2_read16_map[0x00/2].addr = msh2_read16_map[0x20/2].addr = MAP_HANDLER(sh2_read16_cs0); + msh2_read32_map[0x00/2].addr = msh2_read32_map[0x20/2].addr = MAP_HANDLER(sh2_read32_cs0); + msh2_write8_map[0x00/2] = msh2_write8_map[0x20/2] = sh2_write8_cs0; + msh2_write16_map[0x00/2] = msh2_write16_map[0x20/2] = sh2_write16_cs0; + msh2_write32_map[0x00/2] = msh2_write32_map[0x20/2] = sh2_write32_cs0; // CS1 - ROM bank_switch_rom_sh2(); - sh2_read8_map[0x02/2].mask = sh2_read8_map[0x22/2].mask = - sh2_read16_map[0x02/2].mask = sh2_read16_map[0x22/2].mask = 0x3fffff; // FIXME - // CS2 - DRAM - done by Pico32xSwapDRAM() - sh2_read8_map[0x04/2].mask = sh2_read8_map[0x24/2].mask = - sh2_read16_map[0x04/2].mask = sh2_read16_map[0x24/2].mask = 0x01ffff; + msh2_read8_map[0x02/2].mask = msh2_read8_map[0x22/2].mask = 0x3fffff; // FIXME + msh2_read16_map[0x02/2].mask = msh2_read16_map[0x22/2].mask = 0x3ffffe; // FIXME + msh2_read32_map[0x02/2].mask = msh2_read32_map[0x22/2].mask = 
0x3ffffc; // FIXME + msh2_write16_map[0x02/2] = msh2_write16_map[0x22/2] = sh2_write16_rom; + msh2_write32_map[0x02/2] = msh2_write32_map[0x22/2] = sh2_write32_rom; + // CS2 - DRAM + msh2_read8_map[0x04/2].mask = msh2_read8_map[0x24/2].mask = 0x01ffff; + msh2_read16_map[0x04/2].mask = msh2_read16_map[0x24/2].mask = 0x01fffe; + msh2_read32_map[0x04/2].mask = msh2_read32_map[0x24/2].mask = 0x01fffc; + msh2_write8_map[0x04/2] = msh2_write8_map[0x24/2] = sh2_write8_dram; + msh2_write16_map[0x04/2] = msh2_write16_map[0x24/2] = sh2_write16_dram; + msh2_write32_map[0x04/2] = msh2_write32_map[0x24/2] = sh2_write32_dram; + // CS3 - SDRAM - sh2_read8_map[0x06/2].addr = sh2_read8_map[0x26/2].addr = - sh2_read16_map[0x06/2].addr = sh2_read16_map[0x26/2].addr = MAP_MEMORY(Pico32xMem->sdram); - sh2_write8_map[0x06/2] = sh2_write8_sdram; - sh2_write8_map[0x26/2] = sh2_write8_sdram_wt; - sh2_write16_map[0x06/2] = sh2_write16_map[0x26/2] = sh2_write16_sdram; - sh2_read8_map[0x06/2].mask = sh2_read8_map[0x26/2].mask = - sh2_read16_map[0x06/2].mask = sh2_read16_map[0x26/2].mask = 0x03ffff; + msh2_read8_map[0x06/2].addr = msh2_read8_map[0x26/2].addr = + msh2_read16_map[0x06/2].addr = msh2_read16_map[0x26/2].addr = + msh2_read32_map[0x06/2].addr = msh2_read32_map[0x26/2].addr = MAP_MEMORY(Pico32xMem->sdram); + msh2_write8_map[0x06/2] = msh2_write8_map[0x26/2] = sh2_write8_sdram; + msh2_write16_map[0x06/2] = msh2_write16_map[0x26/2] = sh2_write16_sdram; + msh2_write32_map[0x06/2] = msh2_write32_map[0x26/2] = sh2_write32_sdram; + msh2_read8_map[0x06/2].mask = msh2_read8_map[0x26/2].mask = 0x03ffff; + msh2_read16_map[0x06/2].mask = msh2_read16_map[0x26/2].mask = 0x03fffe; + msh2_read32_map[0x06/2].mask = msh2_read32_map[0x26/2].mask = 0x03fffc; // SH2 data array - sh2_read8_map[0xc0/2].addr = MAP_HANDLER(sh2_read8_da); - sh2_read16_map[0xc0/2].addr = MAP_HANDLER(sh2_read16_da); - sh2_write8_map[0xc0/2] = sh2_write8_da; - sh2_write16_map[0xc0/2] = sh2_write16_da; + 
msh2_read8_map[0xc0/2].mask = 0x0fff; + msh2_read16_map[0xc0/2].mask = 0x0ffe; + msh2_read32_map[0xc0/2].mask = 0x0ffc; + msh2_write8_map[0xc0/2] = sh2_write8_da; + msh2_write16_map[0xc0/2] = sh2_write16_da; + msh2_write32_map[0xc0/2] = sh2_write32_da; // SH2 IO - sh2_read8_map[0xff/2].addr = MAP_HANDLER(sh2_peripheral_read8); - sh2_read16_map[0xff/2].addr = MAP_HANDLER(sh2_peripheral_read16); - sh2_write8_map[0xff/2] = sh2_peripheral_write8; - sh2_write16_map[0xff/2] = sh2_peripheral_write16; + msh2_read8_map[0xff/2].addr = MAP_HANDLER(sh2_peripheral_read8); + msh2_read16_map[0xff/2].addr = MAP_HANDLER(sh2_peripheral_read16); + msh2_read32_map[0xff/2].addr = MAP_HANDLER(sh2_peripheral_read32); + msh2_write8_map[0xff/2] = sh2_peripheral_write8; + msh2_write16_map[0xff/2] = sh2_peripheral_write16; + msh2_write32_map[0xff/2] = sh2_peripheral_write32; + + memcpy(ssh2_read8_map, msh2_read8_map, sizeof(msh2_read8_map)); + memcpy(ssh2_read16_map, msh2_read16_map, sizeof(msh2_read16_map)); + memcpy(ssh2_read32_map, msh2_read32_map, sizeof(msh2_read32_map)); + memcpy(ssh2_write8_map, msh2_write8_map, sizeof(msh2_write8_map)); + memcpy(ssh2_write16_map, msh2_write16_map, sizeof(msh2_write16_map)); + memcpy(ssh2_write32_map, msh2_write32_map, sizeof(msh2_write32_map)); + + msh2_read8_map[0xc0/2].addr = + msh2_read16_map[0xc0/2].addr = + msh2_read32_map[0xc0/2].addr = MAP_MEMORY(msh2.data_array); + ssh2_read8_map[0xc0/2].addr = + ssh2_read16_map[0xc0/2].addr = + ssh2_read32_map[0xc0/2].addr = MAP_MEMORY(ssh2.data_array); // map DRAM area, both 68k and SH2 Pico32xSwapDRAM(1); - msh2.read8_map = ssh2.read8_map = sh2_read8_map; - msh2.read16_map = ssh2.read16_map = sh2_read16_map; - msh2.write8_tab = ssh2.write8_tab = (const void **)(void *)sh2_write8_map; - msh2.write16_tab = ssh2.write16_tab = (const void **)(void *)sh2_write16_map; + msh2.read8_map = msh2_read8_map; ssh2.read8_map = ssh2_read8_map; + msh2.read16_map = msh2_read16_map; ssh2.read16_map = ssh2_read16_map; + 
msh2.read32_map = msh2_read32_map; ssh2.read32_map = ssh2_read32_map; + msh2.write8_tab = (const void **)(void *)msh2_write8_map; + msh2.write16_tab = (const void **)(void *)msh2_write16_map; + msh2.write32_tab = (const void **)(void *)msh2_write32_map; + ssh2.write8_tab = (const void **)(void *)ssh2_write8_map; + ssh2.write16_tab = (const void **)(void *)ssh2_write16_map; + ssh2.write32_tab = (const void **)(void *)ssh2_write32_map; + + // convenience ptrs + msh2.p_sdram = ssh2.p_sdram = Pico32xMem->sdram; + msh2.p_rom = ssh2.p_rom = Pico.rom; + msh2.p_bios = Pico32xMem->sh2_rom_m.w; msh2.p_da = msh2.data_array; + ssh2.p_bios = Pico32xMem->sh2_rom_s.w; ssh2.p_da = ssh2.data_array; sh2_drc_mem_setup(&msh2); sh2_drc_mem_setup(&ssh2); + memset(sh2_poll_rd, 0, sizeof(sh2_poll_rd)); + memset(sh2_poll_wr, 0, sizeof(sh2_poll_wr)); + memset(sh2_poll_fifo, -1, sizeof(sh2_poll_fifo)); // z80 hack z80_map_set(z80_write_map, 0x8000, 0xffff, z80_md_bank_write_32x, 1); diff --git a/pico/32x/memory_arm.S b/pico/32x/memory_arm.S new file mode 100644 index 000000000..40707fe7b --- /dev/null +++ b/pico/32x/memory_arm.S @@ -0,0 +1,288 @@ +/* + * PicoDrive 32X memory access functions, assembler version + * (C) KUB, 2018 + * + * This work is licensed under the terms of MAME license. + * See COPYING file in the top-level directory. + */ + +#include "../pico_int_offs.h" + +@ 32X bank sizes... 
TODO this should somehow come from an include file +.equ SH2_ROM_SHIFT, 10 @ 0x003fffff +.equ SH2_RAM_SHIFT, 14 @ 0x0003ffff +.equ SH2_DRAM_SHIFT,15 @ 0x0001ffff +.equ SH2_DA_SHIFT, 20 @ 0x00000fff + +.equ SH2_DRAM_OW, 1<<(32-SH2_DRAM_SHIFT) @ DRAM overwrite mode bit + +.text +.align 5 + +#if 0 +@ u32 a, SH2 *sh2 +.global sh2_read8_rom +.global sh2_read8_sdram +.global sh2_read8_da +.global sh2_read8_dram +.global sh2_read16_rom +.global sh2_read16_sdram +.global sh2_read16_da +.global sh2_read16_dram +.global sh2_read32_rom +.global sh2_read32_sdram +.global sh2_read32_da +.global sh2_read32_dram +#endif + +@ u32 a, u32 d, SH2 *sh2 +.global sh2_write8_sdram +.global sh2_write8_da +.global sh2_write8_dram +.global sh2_write16_sdram +.global sh2_write16_da +.global sh2_write16_dram +.global sh2_write32_sdram +.global sh2_write32_da +.global sh2_write32_dram + +#if 0 +sh2_read8_rom: + ldr ip, [r1, #OFS_SH2_p_rom] + eor r0, r0, #1 + mov r0, r0, lsl #SH2_ROM_SHIFT + ldrb r0, [ip, r0, lsr #SH2_ROM_SHIFT] + bx lr + +sh2_read8_sdram: + ldr ip, [r1, #OFS_SH2_p_sdram] + eor r0, r0, #1 + mov r0, r0, lsl #SH2_RAM_SHIFT + ldrb r0, [ip, r0, lsr #SH2_RAM_SHIFT] + bx lr + +sh2_read8_da: + ldr ip, [r1, #OFS_SH2_p_da] + eor r0, r0, #1 + mov r0, r0, lsl #SH2_DA_SHIFT + ldrb r0, [ip, r0, lsr #SH2_DA_SHIFT] + bx lr + +sh2_read8_dram: + ldr ip, [r1, #OFS_SH2_p_dram] + eor r0, r0, #1 + mov r0, r0, lsl #SH2_DRAM_SHIFT + ldrb r0, [ip, r0, lsr #SH2_DRAM_SHIFT] + bx lr + +sh2_read16_rom: + ldr ip, [r1, #OFS_SH2_p_rom] + mov r0, r0, lsl #SH2_ROM_SHIFT + mov r0, r0, lsr #SH2_ROM_SHIFT + ldrh r0, [ip, r0] + bx lr + +sh2_read16_sdram: + ldr ip, [r1, #OFS_SH2_p_sdram] + mov r0, r0, lsl #SH2_RAM_SHIFT + mov r0, r0, lsr #SH2_RAM_SHIFT + ldrh r0, [ip, r0] + bx lr + +sh2_read16_da: + ldr ip, [r1, #OFS_SH2_p_da] + mov r0, r0, lsl #SH2_DA_SHIFT + mov r0, r0, lsr #SH2_DA_SHIFT + ldrh r0, [ip, r0] + bx lr + +sh2_read16_dram: + ldr ip, [r1, #OFS_SH2_p_dram] + mov r0, r0, lsl #SH2_DRAM_SHIFT + mov r0, r0, 
lsr #SH2_DRAM_SHIFT + ldrh r0, [ip, r0] + bx lr + +sh2_read32_rom: + ldr ip, [r1, #OFS_SH2_p_rom] + mov r0, r0, lsl #SH2_ROM_SHIFT + ldr r0, [ip, r0, lsr #SH2_ROM_SHIFT] + mov r0, r0, ror #16 + bx lr + +sh2_read32_sdram: + ldr ip, [r1, #OFS_SH2_p_sdram] + mov r0, r0, lsl #SH2_RAM_SHIFT + ldr r0, [ip, r0, lsr #SH2_RAM_SHIFT] + mov r0, r0, ror #16 + bx lr + +sh2_read32_da: + ldr ip, [r1, #OFS_SH2_p_da] + mov r0, r0, lsl #SH2_DA_SHIFT + ldr r0, [ip, r0, lsr #SH2_DA_SHIFT] + mov r0, r0, ror #16 + bx lr + +sh2_read32_dram: + ldr ip, [r1, #OFS_SH2_p_dram] + mov r0, r0, lsl #SH2_DRAM_SHIFT + ldr r0, [ip, r0, lsr #SH2_DRAM_SHIFT] + mov r0, r0, ror #16 + bx lr +#endif + +sh2_write8_sdram: + @ preserve r0-r2 for tail call + ldr ip, [r2, #OFS_SH2_p_sdram] + eor r3, r0, #1 + mov r3, r3, lsl #SH2_RAM_SHIFT + strb r1, [ip, r3, lsr #SH2_RAM_SHIFT] +#ifdef DRC_SH2 + ldr r1, [r2, #OFS_SH2_p_drcblk_ram] + ldrb r3, [r1, r3, lsr #SH2_RAM_SHIFT+1] + cmp r3, #0 + bxeq lr + @ need to load aligned 16 bit data for check + bic r0, r0, #1 + mov r1, r0, lsl #SH2_RAM_SHIFT + mov r1, r1, lsr #SH2_RAM_SHIFT + ldrh r1, [ip, r1] + b sh2_sdram_checks +#else + bx lr +#endif + +sh2_write8_da: + @ preserve r0 and r2 for tail call + ldr ip, [r2, #OFS_SH2_p_da] + eor r3, r0, #1 + mov r3, r3, lsl #SH2_DA_SHIFT + strb r1, [ip, r3, lsr #SH2_DA_SHIFT] +#ifdef DRC_SH2 + ldr ip, [r2, #OFS_SH2_p_drcblk_da] + ldrb r1, [ip, r3, lsr #SH2_DA_SHIFT+1] + bic r0, r0, #1 + cmp r1, #0 + bxeq lr + mov r1, #2 + b sh2_drc_wcheck_da +#else + bx lr +#endif + +sh2_write8_dram: + tst r1, #0xff + ldrne ip, [r2, #OFS_SH2_p_dram] + eorne r3, r0, #1 + movne r3, r3, lsl #SH2_DRAM_SHIFT + strneb r1, [ip, r3, lsr #SH2_DRAM_SHIFT] + bx lr + +sh2_write16_sdram: + @ preserve r0-r2 for tail call + ldr ip, [r2, #OFS_SH2_p_sdram] + mov r3, r0, lsl #SH2_RAM_SHIFT + mov r3, r3, lsr #SH2_RAM_SHIFT + strh r1, [ip, r3] +#ifdef DRC_SH2 + ldr ip, [r2, #OFS_SH2_p_drcblk_ram] + ldrb r3, [ip, r3, lsr #1] + cmp r3, #0 + bxeq lr + b sh2_sdram_checks 
+#else + bx lr +#endif + +sh2_write16_da: + @ preserve r0 and r2 for tail call + ldr ip, [r2, #OFS_SH2_p_da] + mov r3, r0, lsl #SH2_DA_SHIFT + mov r3, r3, lsr #SH2_DA_SHIFT + strh r1, [ip, r3] +#ifdef DRC_SH2 + ldr ip, [r2, #OFS_SH2_p_drcblk_da] + ldrb r1, [ip, r3, lsr #1] + cmp r1, #0 + bxeq lr + mov r1, #2 + b sh2_drc_wcheck_da +#else + bx lr +#endif + +sh2_write16_dram: + ldr ip, [r2, #OFS_SH2_p_dram] + tst r0, #SH2_DRAM_OW + mov r3, r0, lsl #SH2_DRAM_SHIFT + mov r3, r3, lsr #SH2_DRAM_SHIFT + streqh r1, [ip, r3] + bxeq lr + add ip, ip, r3 + tst r1, #0xff + strneb r1, [ip, #0] + tst r1, #0xff00 + movne r1, r1, lsr #8 + strneb r1, [ip, #1] + bx lr + +sh2_write32_sdram: + @ preserve r0-r2 for tail call + ldr ip, [r2, #OFS_SH2_p_sdram] + mov r1, r1, ror #16 + mov r3, r0, lsl #SH2_RAM_SHIFT + str r1, [ip, r3, lsr #SH2_RAM_SHIFT] +#ifdef DRC_SH2 + ldr ip, [r2, #OFS_SH2_p_drcblk_ram] + ldrb r3, [ip, r3, lsr #SH2_RAM_SHIFT+1]! + ldrb ip, [ip, #1] + orrs r3, r3, ip, lsl #16 + bxeq lr + mov r1, r1, ror #16 + b sh2_sdram_checks_l +#else + bx lr +#endif + +sh2_write32_da: + @ preserve r0 and r2 for tail call + ldr ip, [r2, #OFS_SH2_p_da] + mov r1, r1, ror #16 + mov r3, r0, lsl #SH2_DA_SHIFT + str r1, [ip, r3, lsr #SH2_DA_SHIFT] +#ifdef DRC_SH2 + ldr ip, [r2, #OFS_SH2_p_drcblk_da] + ldrb r1, [ip, r3, lsr #SH2_DA_SHIFT+1]! 
+ ldrb ip, [ip, #1] + orrs r1, r1, ip, lsl #16 + bxeq lr + mov r1, #4 + b sh2_drc_wcheck_da +#else + bx lr +#endif + +sh2_write32_dram: + ldr ip, [r2, #OFS_SH2_p_dram] + tst r0, #SH2_DRAM_OW + mov r3, r0, lsl #SH2_DRAM_SHIFT + mov r1, r1, ror #16 + streq r1, [ip, r3, lsr #SH2_DRAM_SHIFT] + bxeq lr + ldr r0, [ip, r3, lsr #SH2_DRAM_SHIFT] + tst r1, #0x00ff0000 + bicne r0, r0, #0x00ff0000 + tst r1, #0xff000000 + bicne r0, r0, #0xff000000 + tst r1, #0x000000ff + bicne r0, r0, #0x000000ff + tst r1, #0x0000ff00 + bicne r0, r0, #0x0000ff00 + orr r0, r0, r1 + str r0, [ip, r3, lsr #SH2_DRAM_SHIFT] + bx lr + +.pool + +@ vim:filetype=armasm diff --git a/pico/32x/pwm.c b/pico/32x/pwm.c index 507356420..ec4bdb3e7 100644 --- a/pico/32x/pwm.c +++ b/pico/32x/pwm.c @@ -7,32 +7,42 @@ */ #include "../pico_int.h" -static int pwm_cycles; -static int pwm_mult; -static int pwm_ptr; -static int pwm_irq_reload; -static int pwm_doing_fifo; -static int pwm_silent; +static struct { + int cycles; + unsigned mult; + int ptr; + int irq_reload; + int doing_fifo; + int silent; + int irq_timer; + int irq_state; + short current[2]; +} pwm; + +enum { PWM_IRQ_LOCKED, PWM_IRQ_STOPPED, PWM_IRQ_LOW, PWM_IRQ_HIGH }; void p32x_pwm_ctl_changed(void) { int control = Pico32x.regs[0x30 / 2]; int cycles = Pico32x.regs[0x32 / 2]; + int pwm_irq_opt = PicoIn.opt & POPT_PWM_IRQ_OPT; cycles = (cycles - 1) & 0x0fff; - pwm_cycles = cycles; + pwm.cycles = cycles; // supposedly we should stop FIFO when xMd is 0, // but mars test disagrees - pwm_mult = 0; + pwm.mult = 0; if ((control & 0x0f) != 0) - pwm_mult = 0x10000 / cycles; + pwm.mult = 0x10000 / cycles; - pwm_irq_reload = (control & 0x0f00) >> 8; - pwm_irq_reload = ((pwm_irq_reload - 1) & 0x0f) + 1; + pwm.irq_timer = (control & 0x0f00) >> 8; + pwm.irq_timer = ((pwm.irq_timer - 1) & 0x0f) + 1; + pwm.irq_reload = pwm.irq_timer; + pwm.irq_state = pwm_irq_opt ? 
PWM_IRQ_STOPPED: PWM_IRQ_LOCKED; if (Pico32x.pwm_irq_cnt == 0) - Pico32x.pwm_irq_cnt = pwm_irq_reload; + Pico32x.pwm_irq_cnt = pwm.irq_reload; } static void do_pwm_irq(SH2 *sh2, unsigned int m68k_cycles) @@ -40,7 +50,7 @@ static void do_pwm_irq(SH2 *sh2, unsigned int m68k_cycles) p32x_trigger_irq(sh2, m68k_cycles, P32XI_PWM); if (Pico32x.regs[0x30 / 2] & P32XP_RTP) { - p32x_event_schedule(m68k_cycles, P32X_EVENT_PWM, pwm_cycles / 3 + 1); + p32x_event_schedule(m68k_cycles, P32X_EVENT_PWM, pwm.cycles / 3 + 1); // note: might recurse p32x_dreq1_trigger(); } @@ -48,16 +58,16 @@ static void do_pwm_irq(SH2 *sh2, unsigned int m68k_cycles) static int convert_sample(unsigned int v) { + if (v > pwm.cycles) + v = pwm.cycles; if (v == 0) return 0; - if (v > pwm_cycles) - v = pwm_cycles; - return ((int)v - pwm_cycles / 2) * pwm_mult; + return v * pwm.mult - 0x10000/2; } #define consume_fifo(sh2, m68k_cycles) { \ int cycles_diff = ((m68k_cycles) * 3) - Pico32x.pwm_cycle_p; \ - if (cycles_diff >= pwm_cycles) \ + if (cycles_diff >= pwm.cycles) \ consume_fifo_do(sh2, m68k_cycles, cycles_diff); \ } @@ -69,67 +79,70 @@ static void consume_fifo_do(SH2 *sh2, unsigned int m68k_cycles, unsigned short *fifo_r = mem->pwm_fifo[1]; int sum = 0; - if (pwm_cycles == 0 || pwm_doing_fifo) + if (pwm.cycles == 0 || pwm.doing_fifo) return; elprintf(EL_PWM, "pwm: %u: consume %d/%d, %d,%d ptr %d", - m68k_cycles, sh2_cycles_diff, sh2_cycles_diff / pwm_cycles, - Pico32x.pwm_p[0], Pico32x.pwm_p[1], pwm_ptr); + m68k_cycles, sh2_cycles_diff, sh2_cycles_diff / pwm.cycles, + Pico32x.pwm_p[0], Pico32x.pwm_p[1], pwm.ptr); // this is for recursion from dreq1 writes - pwm_doing_fifo = 1; + pwm.doing_fifo = 1; - for (; sh2_cycles_diff >= pwm_cycles; sh2_cycles_diff -= pwm_cycles) + while (sh2_cycles_diff >= pwm.cycles) { + sh2_cycles_diff -= pwm.cycles; + if (Pico32x.pwm_p[0] > 0) { - fifo_l[0] = fifo_l[1]; - fifo_l[1] = fifo_l[2]; - fifo_l[2] = fifo_l[3]; + mem->pwm_index[0] = (mem->pwm_index[0]+1) % 4; 
Pico32x.pwm_p[0]--; - mem->pwm_current[0] = convert_sample(fifo_l[0]); - sum += mem->pwm_current[0]; + pwm.current[0] = convert_sample(fifo_l[mem->pwm_index[0]]); + sum |= (u16)pwm.current[0]; } if (Pico32x.pwm_p[1] > 0) { - fifo_r[0] = fifo_r[1]; - fifo_r[1] = fifo_r[2]; - fifo_r[2] = fifo_r[3]; + mem->pwm_index[1] = (mem->pwm_index[1]+1) % 4; Pico32x.pwm_p[1]--; - mem->pwm_current[1] = convert_sample(fifo_r[0]); - sum += mem->pwm_current[1]; + pwm.current[1] = convert_sample(fifo_r[mem->pwm_index[1]]); + sum |= (u16)pwm.current[1]; } - mem->pwm[pwm_ptr * 2 ] = mem->pwm_current[0]; - mem->pwm[pwm_ptr * 2 + 1] = mem->pwm_current[1]; - pwm_ptr = (pwm_ptr + 1) & (PWM_BUFF_LEN - 1); + mem->pwm[pwm.ptr * 2 ] = pwm.current[0]; + mem->pwm[pwm.ptr * 2 + 1] = pwm.current[1]; + pwm.ptr = (pwm.ptr + 1) & (PWM_BUFF_LEN - 1); if (--Pico32x.pwm_irq_cnt == 0) { - Pico32x.pwm_irq_cnt = pwm_irq_reload; + Pico32x.pwm_irq_cnt = pwm.irq_reload; do_pwm_irq(sh2, m68k_cycles); + } else if (Pico32x.pwm_p[1] == 0 && pwm.irq_state >= PWM_IRQ_LOW) { + // buffer underrun. Reduce reload rate if above programmed setting. 
+ if (pwm.irq_reload > pwm.irq_timer) + pwm.irq_reload--; + pwm.irq_state = PWM_IRQ_LOW; } } Pico32x.pwm_cycle_p = m68k_cycles * 3 - sh2_cycles_diff; - pwm_doing_fifo = 0; + pwm.doing_fifo = 0; if (sum != 0) - pwm_silent = 0; + pwm.silent = 0; } static int p32x_pwm_schedule_(SH2 *sh2, unsigned int m68k_now) { - unsigned int sh2_now = m68k_now * 3; + unsigned int pwm_now = m68k_now * 3; int cycles_diff_sh2; - if (pwm_cycles == 0) + if (pwm.cycles == 0) return 0; - cycles_diff_sh2 = sh2_now - Pico32x.pwm_cycle_p; - if (cycles_diff_sh2 >= pwm_cycles) + cycles_diff_sh2 = pwm_now - Pico32x.pwm_cycle_p; + if (cycles_diff_sh2 >= pwm.cycles) consume_fifo_do(sh2, m68k_now, cycles_diff_sh2); if (!((Pico32x.sh2irq_mask[0] | Pico32x.sh2irq_mask[1]) & 1)) return 0; // masked by everyone - cycles_diff_sh2 = sh2_now - Pico32x.pwm_cycle_p; - return (Pico32x.pwm_irq_cnt * pwm_cycles + cycles_diff_sh2 = pwm_now - Pico32x.pwm_cycle_p; + return (Pico32x.pwm_irq_cnt * pwm.cycles - cycles_diff_sh2) / 3 + 1; } @@ -166,21 +179,21 @@ unsigned int p32x_pwm_read16(unsigned int a, SH2 *sh2, consume_fifo(sh2, m68k_cycles); a &= 0x0e; - switch (a) { - case 0: // control - case 2: // cycle + switch (a/2) { + case 0/2: // control + case 2/2: // cycle d = Pico32x.regs[(0x30 + a) / 2]; break; - case 4: // L ch + case 4/2: // L ch if (Pico32x.pwm_p[0] == 3) d |= P32XP_FULL; else if (Pico32x.pwm_p[0] == 0) d |= P32XP_EMPTY; break; - case 6: // R ch - case 8: // MONO + case 6/2: // R ch + case 8/2: // MONO if (Pico32x.pwm_p[1] == 3) d |= P32XP_FULL; else if (Pico32x.pwm_p[1] == 0) @@ -196,47 +209,62 @@ unsigned int p32x_pwm_read16(unsigned int a, SH2 *sh2, void p32x_pwm_write16(unsigned int a, unsigned int d, SH2 *sh2, unsigned int m68k_cycles) { + unsigned short *fifo; + int idx; + elprintf(EL_PWM, "pwm: %u: w16 %02x %04x (p %d %d)", m68k_cycles, a & 0x0e, d, Pico32x.pwm_p[0], Pico32x.pwm_p[1]); consume_fifo(sh2, m68k_cycles); a &= 0x0e; - if (a == 0) { // control - // avoiding pops.. 
- if ((Pico32x.regs[0x30 / 2] & 0x0f) == 0) - Pico32xMem->pwm_fifo[0][0] = Pico32xMem->pwm_fifo[1][0] = 0; - Pico32x.regs[0x30 / 2] = d; - p32x_pwm_ctl_changed(); - Pico32x.pwm_irq_cnt = pwm_irq_reload; // ? - } - else if (a == 2) { // cycle - Pico32x.regs[0x32 / 2] = d & 0x0fff; - p32x_pwm_ctl_changed(); - } - else if (a <= 8) { - d = (d - 1) & 0x0fff; - - if (a == 4 || a == 8) { // L ch or MONO - unsigned short *fifo = Pico32xMem->pwm_fifo[0]; + switch (a/2) { + case 0/2: // control + // avoiding pops.. + if ((Pico32x.regs[0x30 / 2] & 0x0f) == 0) + Pico32xMem->pwm_fifo[0][0] = Pico32xMem->pwm_fifo[1][0] = 0; + Pico32x.regs[0x30 / 2] = d; + p32x_pwm_ctl_changed(); + Pico32x.pwm_irq_cnt = pwm.irq_reload; // ? + break; + case 2/2: // cycle + Pico32x.regs[0x32 / 2] = d & 0x0fff; + p32x_pwm_ctl_changed(); + break; + case 8/2: // MONO + case 6/2: // R ch + fifo = Pico32xMem->pwm_fifo[1]; + idx = Pico32xMem->pwm_index[1]; + if (Pico32x.pwm_p[1] < 3) { + if (Pico32x.pwm_p[1] == 2 && pwm.irq_state >= PWM_IRQ_STOPPED) { + // buffer full. If there was no buffer underrun after last fill, + // try increasing reload rate to reduce IRQs + if (pwm.irq_reload < 3 && pwm.irq_state == PWM_IRQ_HIGH) + pwm.irq_reload ++; + pwm.irq_state = PWM_IRQ_HIGH; + } + Pico32x.pwm_p[1]++; + } else { + // buffer overflow. Some roms always fill the complete buffer even if + // reload rate is set below max. Lock reload rate to programmed setting. 
+ pwm.irq_reload = pwm.irq_timer; + pwm.irq_state = PWM_IRQ_LOCKED; + idx = (idx+1) % 4; + Pico32xMem->pwm_index[1] = idx; + } + fifo[(idx+Pico32x.pwm_p[1]) % 4] = (d - 1) & 0x0fff; + if (a != 8) break; // fallthrough if MONO + case 4/2: // L ch + fifo = Pico32xMem->pwm_fifo[0]; + idx = Pico32xMem->pwm_index[0]; if (Pico32x.pwm_p[0] < 3) Pico32x.pwm_p[0]++; else { - fifo[1] = fifo[2]; - fifo[2] = fifo[3]; + idx = (idx+1) % 4; + Pico32xMem->pwm_index[0] = idx; } - fifo[Pico32x.pwm_p[0]] = d; - } - if (a == 6 || a == 8) { // R ch or MONO - unsigned short *fifo = Pico32xMem->pwm_fifo[1]; - if (Pico32x.pwm_p[1] < 3) - Pico32x.pwm_p[1]++; - else { - fifo[1] = fifo[2]; - fifo[2] = fifo[3]; - } - fifo[Pico32x.pwm_p[1]] = d; - } + fifo[(idx+Pico32x.pwm_p[0]) % 4] = (d - 1) & 0x0fff; + break; } } @@ -252,10 +280,10 @@ void p32x_pwm_update(int *buf32, int length, int stereo) xmd = Pico32x.regs[0x30 / 2] & 0x0f; if (xmd == 0 || xmd == 0x06 || xmd == 0x09 || xmd == 0x0f) goto out; // invalid? - if (pwm_silent) + if (pwm.silent) return; - step = (pwm_ptr << 16) / length; + step = (pwm.ptr << 16) / length; pwmb = Pico32xMem->pwm; if (stereo) @@ -310,13 +338,12 @@ void p32x_pwm_update(int *buf32, int length, int stereo) } } - elprintf(EL_PWM, "pwm_update: pwm_ptr %d, len %d, step %04x, done %d", - pwm_ptr, length, step, (pwmb - Pico32xMem->pwm) / 2); + elprintf(EL_PWM, "pwm_update: pwm.ptr %d, len %d, step %04x, done %d", + pwm.ptr, length, step, (pwmb - Pico32xMem->pwm) / 2); out: - pwm_ptr = 0; - pwm_silent = Pico32xMem->pwm_current[0] == 0 - && Pico32xMem->pwm_current[1] == 0; + pwm.ptr = 0; + pwm.silent = pwm.current[0] == 0 && pwm.current[1] == 0; } void p32x_pwm_state_loaded(void) @@ -327,8 +354,8 @@ void p32x_pwm_state_loaded(void) // for old savestates cycles_diff_sh2 = Pico.t.m68c_cnt * 3 - Pico32x.pwm_cycle_p; - if (cycles_diff_sh2 >= pwm_cycles || cycles_diff_sh2 < 0) { - Pico32x.pwm_irq_cnt = pwm_irq_reload; + if (cycles_diff_sh2 >= pwm.cycles || cycles_diff_sh2 < 0) 
{ + Pico32x.pwm_irq_cnt = pwm.irq_reload; Pico32x.pwm_cycle_p = Pico.t.m68c_cnt * 3; p32x_pwm_schedule(Pico.t.m68c_cnt); } diff --git a/pico/32x/sh2soc.c b/pico/32x/sh2soc.c index 62423d136..9da3f296e 100644 --- a/pico/32x/sh2soc.c +++ b/pico/32x/sh2soc.c @@ -25,6 +25,9 @@ #include "../pico_int.h" #include "../memory.h" +#include "../../cpu/sh2/compiler.h" +DRC_DECLARE_SR; + // DMAC handling struct dma_chan { unsigned int sar, dar; // src, dst addr @@ -87,6 +90,7 @@ static void dmac_transfer_one(SH2 *sh2, struct dma_chan *chan) case 0: d = p32x_sh2_read8(chan->sar, sh2); p32x_sh2_write8(chan->dar, d, sh2); + break; case 1: d = p32x_sh2_read16(chan->sar, sh2); p32x_sh2_write16(chan->dar, d, sh2); @@ -125,6 +129,33 @@ static void dmac_transfer_one(SH2 *sh2, struct dma_chan *chan) chan->sar += size; } +// optimization for copying around memory with SH2 DMA +static void dmac_memcpy(struct dma_chan *chan, SH2 *sh2) +{ + u32 size = (chan->chcr >> 10) & 3, up = chan->chcr & (1 << 14); + int count; + + if (!up || chan->tcr < 4) + return; +#if MARS_CHECK_HACK + // XXX Mars Check Program copies 32K longwords (128KB) from a 64KB buffer in + // ROM or DRAM to SDRAM in 4-longword mode, overwriting an SDRAM comm area in + // turn, which crashes the test on emulators without CPU cache emulation. + // This may be a bug in Mars Check. As a kludge limit the transfer to 64KB, + // which is what the check program test uses for checking the result. + // A better way would clearly be to have a mechanism to patch the ROM... + if (size == 3 && chan->tcr == 32768 && chan->dar == 0x06020000) size = 1; +#endif + if (size == 3) size = 2; // 4-word xfer mode still counts in words + // XXX check TCR being a multiple of 4 in 4-word xfer mode? + // XXX check alignment of sar/dar, generating a bus error if unaligned? 
+ count = p32x_sh2_memcpy(chan->dar, chan->sar, chan->tcr, 1 << size, sh2); + + chan->sar += count << size; + chan->dar += count << size; + chan->tcr -= count; +} + // DMA trigger by SH2 register write static void dmac_trigger(SH2 *sh2, struct dma_chan *chan) { @@ -134,6 +165,12 @@ static void dmac_trigger(SH2 *sh2, struct dma_chan *chan) if (chan->chcr & DMA_AR) { // auto-request transfer + sh2->state |= SH2_STATE_SLEEP; + if ((((chan->chcr >> 12) ^ (chan->chcr >> 14)) & 3) == 0 && + (((chan->chcr >> 14) ^ (chan->chcr >> 15)) & 1) == 1) { + // SM == DM and either DM0 or DM1 are set. check for mem to mem copy + dmac_memcpy(chan, sh2); + } while ((int)chan->tcr > 0) dmac_transfer_one(sh2, chan); dmac_transfer_complete(sh2, chan); @@ -160,8 +197,9 @@ static void dmac_trigger(SH2 *sh2, struct dma_chan *chan) } // timer state - FIXME -static int timer_cycles[2]; -static int timer_tick_cycles[2]; +static u32 timer_cycles[2]; +static u32 timer_tick_cycles[2]; +static u32 timer_tick_factor[2]; // timers void p32x_timers_recalc(void) @@ -171,6 +209,9 @@ void p32x_timers_recalc(void) // SH2 timer step for (i = 0; i < 2; i++) { + sh2s[i].state &= ~SH2_TIMER_RUN; + if (PREG8(sh2s[i].peri_regs, 0x80) & 0x20) // TME + sh2s[i].state |= SH2_TIMER_RUN; tmp = PREG8(sh2s[i].peri_regs, 0x80) & 7; // Sclk cycles per timer tick if (tmp) @@ -178,36 +219,35 @@ void p32x_timers_recalc(void) else cycles = 2; timer_tick_cycles[i] = cycles; + timer_tick_factor[i] = (1ULL << 32) / cycles; timer_cycles[i] = 0; elprintf(EL_32XP, "WDT cycles[%d] = %d", i, cycles); } } -void p32x_timers_do(unsigned int m68k_slice) +NOINLINE void p32x_timer_do(SH2 *sh2, unsigned int m68k_slice) { unsigned int cycles = m68k_slice * 3; - int cnt, i; - - // WDT timers - for (i = 0; i < 2; i++) { - void *pregs = sh2s[i].peri_regs; - if (PREG8(pregs, 0x80) & 0x20) { // TME - timer_cycles[i] += cycles; - cnt = PREG8(pregs, 0x81); - while (timer_cycles[i] >= timer_tick_cycles[i]) { - timer_cycles[i] -= 
timer_tick_cycles[i]; - cnt++; - } - if (cnt >= 0x100) { - int level = PREG8(pregs, 0xe3) >> 4; - int vector = PREG8(pregs, 0xe4) & 0x7f; - elprintf(EL_32XP, "%csh2 WDT irq (%d, %d)", - i ? 's' : 'm', level, vector); - sh2_internal_irq(&sh2s[i], level, vector); - cnt &= 0xff; - } - PREG8(pregs, 0x81) = cnt; + void *pregs = sh2->peri_regs; + int cnt; int i = sh2->is_slave; + + // WDT timer + timer_cycles[i] += cycles; + if (timer_cycles[i] > timer_tick_cycles[i]) { + // cnt = timer_cycles[i] / timer_tick_cycles[i]; + cnt = (1ULL * timer_cycles[i] * timer_tick_factor[i]) >> 32; + timer_cycles[i] -= timer_tick_cycles[i] * cnt; + + cnt += PREG8(pregs, 0x81); + if (cnt >= 0x100) { + int level = PREG8(pregs, 0xe3) >> 4; + int vector = PREG8(pregs, 0xe4) & 0x7f; + elprintf(EL_32XP, "%csh2 WDT irq (%d, %d)", + i ? 's' : 'm', level, vector); + sh2_internal_irq(sh2, level, vector); + cnt &= 0xff; } + PREG8(pregs, 0x81) = cnt; } } @@ -225,7 +265,7 @@ void sh2_peripheral_reset(SH2 *sh2) // SH2 internal peripheral memhandlers // we keep them in little endian format -u32 sh2_peripheral_read8(u32 a, SH2 *sh2) +u32 REGPARM(2) sh2_peripheral_read8(u32 a, SH2 *sh2) { u8 *r = (void *)sh2->peri_regs; u32 d; @@ -235,30 +275,52 @@ u32 sh2_peripheral_read8(u32 a, SH2 *sh2) elprintf_sh2(sh2, EL_32XP, "peri r8 [%08x] %02x @%06x", a | ~0x1ff, d, sh2_pc(sh2)); + if ((a & 0x1c0) == 0x140) { + // abused as comm area + DRC_SAVE_SR(sh2); + p32x_sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 3); + DRC_RESTORE_SR(sh2); + } return d; } -u32 sh2_peripheral_read16(u32 a, SH2 *sh2) +u32 REGPARM(2) sh2_peripheral_read16(u32 a, SH2 *sh2) { u16 *r = (void *)sh2->peri_regs; u32 d; - a &= 0x1ff; + a &= 0x1fe; d = r[(a / 2) ^ 1]; elprintf_sh2(sh2, EL_32XP, "peri r16 [%08x] %04x @%06x", a | ~0x1ff, d, sh2_pc(sh2)); + if ((a & 0x1c0) == 0x140) { + // abused as comm area + DRC_SAVE_SR(sh2); + p32x_sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 3); + DRC_RESTORE_SR(sh2); + } return d; } -u32 sh2_peripheral_read32(u32 a, 
SH2 *sh2) +u32 REGPARM(2) sh2_peripheral_read32(u32 a, SH2 *sh2) { u32 d; + a &= 0x1fc; d = sh2->peri_regs[a / 4]; elprintf_sh2(sh2, EL_32XP, "peri r32 [%08x] %08x @%06x", a | ~0x1ff, d, sh2_pc(sh2)); + if (a == 0x18c) + // kludge for polling COMM while polling for end of DMA + sh2->poll_cnt = 0; + else if ((a & 0x1c0) == 0x140) { + // abused as comm area + DRC_SAVE_SR(sh2); + p32x_sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 3); + DRC_RESTORE_SR(sh2); + } return d; } @@ -334,6 +396,9 @@ void REGPARM(3) sh2_peripheral_write8(u32 a, u32 d, SH2 *sh2) break; } PREG8(r, a) = d; + + if ((a & 0x1c0) == 0x140) + p32x_sh2_poll_event(sh2, SH2_STATE_CPOLL, SekCyclesDone()); } void REGPARM(3) sh2_peripheral_write16(u32 a, u32 d, SH2 *sh2) @@ -342,7 +407,7 @@ void REGPARM(3) sh2_peripheral_write16(u32 a, u32 d, SH2 *sh2) elprintf_sh2(sh2, EL_32XP, "peri w16 [%08x] %04x @%06x", a, d, sh2_pc(sh2)); - a &= 0x1ff; + a &= 0x1fe; // evil WDT if (a == 0x80) { @@ -356,12 +421,15 @@ void REGPARM(3) sh2_peripheral_write16(u32 a, u32 d, SH2 *sh2) } r[(a / 2) ^ 1] = d; + if ((a & 0x1c0) == 0x140) + p32x_sh2_poll_event(sh2, SH2_STATE_CPOLL, SekCyclesDone()); } void REGPARM(3) sh2_peripheral_write32(u32 a, u32 d, SH2 *sh2) { u32 *r = sh2->peri_regs; u32 old; + struct dmac *dmac; elprintf_sh2(sh2, EL_32XP, "peri w32 [%08x] %08x @%06x", a, d, sh2_pc(sh2)); @@ -402,21 +470,25 @@ void REGPARM(3) sh2_peripheral_write32(u32 a, u32 d, SH2 *sh2) else r[0x110 / 4] = r[0x114 / 4] = r[0x118 / 4] = r[0x11c / 4] = 0; // ? break; + // perhaps starting a DMA? + case 0x18c: + case 0x19c: + case 0x1b0: + dmac = (void *)&sh2->peri_regs[0x180 / 4]; + if (a == 0x1b0 && !((old ^ d) & d & DMA_DME)) + return; + if (!(dmac->dmaor & DMA_DME)) + return; + + if ((dmac->chan[0].chcr & (DMA_TE|DMA_DE)) == DMA_DE) + dmac_trigger(sh2, &dmac->chan[0]); + if ((dmac->chan[1].chcr & (DMA_TE|DMA_DE)) == DMA_DE) + dmac_trigger(sh2, &dmac->chan[1]); + break; } - // perhaps starting a DMA? 
- if (a == 0x1b0 || a == 0x18c || a == 0x19c) { - struct dmac *dmac = (void *)&sh2->peri_regs[0x180 / 4]; - if (a == 0x1b0 && !((old ^ d) & d & DMA_DME)) - return; - if (!(dmac->dmaor & DMA_DME)) - return; - - if ((dmac->chan[0].chcr & (DMA_TE|DMA_DE)) == DMA_DE) - dmac_trigger(sh2, &dmac->chan[0]); - if ((dmac->chan[1].chcr & (DMA_TE|DMA_DE)) == DMA_DE) - dmac_trigger(sh2, &dmac->chan[1]); - } + if ((a & 0x1c0) == 0x140) + p32x_sh2_poll_event(sh2, SH2_STATE_CPOLL, SekCyclesDone()); } /* 32X specific */ diff --git a/pico/arm_features.h b/pico/arm_features.h index fdec52298..b772b77c5 100644 --- a/pico/arm_features.h +++ b/pico/arm_features.h @@ -49,4 +49,32 @@ #endif +// indexed branch (XB) via branch table (BT) +#ifdef __PIC__ +#define PIC_XB(c,r,s) add##c pc, r, s +#define PIC_BT(a) b a +#else +#define PIC_XB(c,r,s) ldr##c pc, [pc, r, s] +#define PIC_BT(a) .word a +#endif + +// load data address (LDR) either via literal pool or via GOT +#ifdef __PIC__ +// can't use pool loads since ldr= only allows a symbol or a constant expr :-( +#define PIC_LDR_INIT() \ + .macro pic_ldr r t a; \ + ldr \r, [pc, $.LD\@-.-8]; \ + ldr \t, [pc, $.LD\@-.-4]; \ + .LP\@:add \r, pc; \ + ldr \r, [\r, \t]; \ + add pc, $4; \ + .LD\@:.word _GLOBAL_OFFSET_TABLE_-.LP\@-8; \ + .word \a(GOT); \ + .endm; +#define PIC_LDR(r,t,a) pic_ldr r, t, a +#else +#define PIC_LDR_INIT() +#define PIC_LDR(r,t,a) ldr r, =a +#endif + #endif /* __ARM_FEATURES_H__ */ diff --git a/pico/carthw/svp/compiler.c b/pico/carthw/svp/compiler.c index b31197c2a..df051e478 100644 --- a/pico/carthw/svp/compiler.c +++ b/pico/carthw/svp/compiler.c @@ -1438,12 +1438,9 @@ static int translate_op(unsigned int op, int *pc, int imm, int *end_cond, int *j } tr_mov16(0, *pc); tr_r0_to_STACK(*pc); - if (tmpv != A_COND_AL) { - u32 *real_ptr = tcache_ptr; - tcache_ptr = jump_op; - EOP_C_B(tr_neg_cond(tmpv),0,real_ptr - jump_op - 2); - tcache_ptr = real_ptr; - } + if (tmpv != A_COND_AL) + EOP_C_B_PTR(jump_op, tr_neg_cond(tmpv), 0, + 
tcache_ptr - jump_op - 2); tr_mov16_cond(tmpv, 0, imm); if (tmpv != A_COND_AL) tr_mov16_cond(tr_neg_cond(tmpv), 0, *pc); @@ -1712,12 +1709,8 @@ static void *emit_block_epilogue(int cycles, int cond, int pc, int end_pc) ssp_block_table[pc]; if (target != NULL) emith_jump(target); - else { - int ops = emith_jump(ssp_drc_next); - end_ptr = tcache_ptr; - // cause the next block to be emitted over jump instruction - tcache_ptr -= ops; - } + else + emith_jump(ssp_drc_next); } else { u32 *target1 = (pc < 0x400) ? @@ -1795,6 +1788,8 @@ void *ssp_translate_block(int pc) tr_flush_dirty_ST(); tr_flush_dirty_pmcrs(); block_end = emit_block_epilogue(ccount, end_cond, jump_pc, pc); + emith_pool_commit(0); + emith_flush(); if (tcache_ptr - (u32 *)tcache > DRC_TCACHE_SIZE/4) { elprintf(EL_ANOMALY|EL_STATUS|EL_SVP, "tcache overflow!\n"); diff --git a/pico/carthw/svp/stub_arm.S b/pico/carthw/svp/stub_arm.S index 9d5c5fa1a..736d459be 100644 --- a/pico/carthw/svp/stub_arm.S +++ b/pico/carthw/svp/stub_arm.S @@ -8,7 +8,7 @@ #include "../../arm_features.h" -.syntax unified +@.syntax unified .text .align 2 @@ -281,8 +281,8 @@ ssp_hle_902_loop: bgt ssp_hle_902_loop tst r12, #1 - ldrhne r0, [r2], #2 - strhne r0, [r3], #2 + ldrneh r0, [r2], #2 + strneh r0, [r3], #2 ldr r0, [r7, #SSP_OFFS_IRAM_ROM] add r1, r7, #0x200 @@ -501,7 +501,7 @@ FUNCTION(ssp_hle_07_036): mov r12, #0x4000 orr r12,r12,#0x0018 subs r12,r3, r12 - subsne r12,r12,#0x0400 + subnes r12,r12,#0x0400 blne tr_unhandled orr r2, r2, r2, lsl #16 @@ -510,7 +510,7 @@ FUNCTION(ssp_hle_07_036): hle_07_036_no_ovrwr: tst r1, #2 - strhne r2, [r1], #0x3e @ align + strneh r2, [r1], #0x3e @ align subne r0, r0, #1 subs r0, r0, #4 blt hle_07_036_l2 @@ -525,7 +525,7 @@ hle_07_036_l2: tst r0, #2 strne r2, [r1], #0x40 tst r0, #1 - strhne r2, [r1], #2 + strneh r2, [r1], #2 b hle_07_036_end_copy hle_07_036_ovrwr: @@ -562,10 +562,10 @@ hle_07_036_ol1: hle_07_036_ol2: tst r0, #1 - ldrhne r3, [r1] + ldrneh r3, [r1] andne r3, r3, r12 orrne r3, r3, r2 - 
strhne r3, [r1], #2 + strneh r3, [r1], #2 hle_07_036_end_copy: ldr r2, [r7, #SSP_OFFS_DRAM] diff --git a/pico/cd/gfx_dma.c b/pico/cd/gfx_dma.c index 7dfe4bc9c..354fc2136 100644 --- a/pico/cd/gfx_dma.c +++ b/pico/cd/gfx_dma.c @@ -10,10 +10,6 @@ #include "cell_map.c" -#ifndef UTYPES_DEFINED -typedef unsigned short u16; -#endif - // check: Heart of the alien, jaguar xj 220 PICO_INTERNAL void DmaSlowCell(unsigned int source, unsigned int a, int len, unsigned char inc) { @@ -32,7 +28,7 @@ PICO_INTERNAL void DmaSlowCell(unsigned int source, unsigned int a, int len, uns asrc = cell_map(source >> 2) << 2; asrc |= source & 2; // if(a&1) d=(d<<8)|(d>>8); // ?? - r[a>>1] = *(u16 *)(base + asrc); + VideoWriteVRAM(a, *(u16 *)(base + asrc)); source += 2; // AutoIncrement a=(u16)(a+inc); diff --git a/pico/cd/mcd.c b/pico/cd/mcd.c index 5e3629a36..8a2f230d5 100644 --- a/pico/cd/mcd.c +++ b/pico/cd/mcd.c @@ -125,6 +125,7 @@ static void SekRunS68k(unsigned int to) if (SekShouldInterrupt()) Pico_mcd->m.s68k_poll_a = 0; + pprof_start(s68k); SekCycleCntS68k += cyc_do; #if defined(EMU_C68K) PicoCpuCS68k.cycles = cyc_do; @@ -137,6 +138,7 @@ static void SekRunS68k(unsigned int to) #elif defined(EMU_F68K) SekCycleCntS68k += fm68k_emulate(&PicoCpuFS68k, cyc_do, 0) - cyc_do; #endif + pprof_end(s68k); } static void pcd_set_cycle_mult(void) diff --git a/pico/cd/memory.c b/pico/cd/memory.c index 1c5dcf94c..e64868400 100644 --- a/pico/cd/memory.c +++ b/pico/cd/memory.c @@ -14,12 +14,14 @@ uptr s68k_read16_map [0x1000000 >> M68K_MEM_SHIFT]; uptr s68k_write8_map [0x1000000 >> M68K_MEM_SHIFT]; uptr s68k_write16_map[0x1000000 >> M68K_MEM_SHIFT]; +#ifndef _ASM_CD_MEMORY_C MAKE_68K_READ8(s68k_read8, s68k_read8_map) MAKE_68K_READ16(s68k_read16, s68k_read16_map) MAKE_68K_READ32(s68k_read32, s68k_read16_map) MAKE_68K_WRITE8(s68k_write8, s68k_write8_map) MAKE_68K_WRITE16(s68k_write16, s68k_write16_map) MAKE_68K_WRITE32(s68k_write32, s68k_write16_map) +#endif // 
----------------------------------------------------------------- diff --git a/pico/cd/memory_arm.S b/pico/cd/memory_arm.S index fe82ecb9e..0d1369ee8 100644 --- a/pico/cd/memory_arm.S +++ b/pico/cd/memory_arm.S @@ -6,7 +6,8 @@ @* See COPYING file in the top-level directory. @* -#include "../pico_int_o32.h" +#include "../arm_features.h" +#include "../pico_int_offs.h" .equiv PCM_STEP_SHIFT, 11 @@ -65,6 +66,7 @@ .extern PicoWrite16_io .extern m68k_comm_check + PIC_LDR_INIT() @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ @@ -73,16 +75,16 @@ @ r0=addr[in,out], r1,r2=tmp .macro cell_map ands r1, r0, #0x01c000 - ldrne pc, [pc, r1, lsr #12] - beq 0f @ most common? - .long 0f - .long 0f - .long 0f - .long 0f - .long 1f - .long 1f - .long 2f - .long 3f + PIC_XB(ne ,r1, lsr #12) + b 0f @ most common? + PIC_BT(0f) + PIC_BT(0f) + PIC_BT(0f) + PIC_BT(0f) + PIC_BT(1f) + PIC_BT(1f) + PIC_BT(2f) + PIC_BT(3f) 1: @ x16 cells and r1, r0, #0x7e00 @ col and r2, r0, #0x01fc @ row @@ -128,7 +130,7 @@ PicoReadM68k8_cell1: @ 0x220000 - 0x23ffff, cell arranged mov r3, #0x0e0000 0: cell_map - ldr r1, =Pico + PIC_LDR(r1, r2, Pico) add r0, r0, r3 ldr r1, [r1, #OFS_Pico_rom] @ Pico.mcd (used everywhere) eor r0, r0, #1 @@ -141,26 +143,26 @@ PicoRead8_mcd_io: cmp r1, #0x2000 @ a120xx? 
bne PicoRead8_io - ldr r1, =Pico + PIC_LDR(r1, r2, Pico) and r0, r0, #0x3f ldr r1, [r1, #OFS_Pico_rom] @ Pico.mcd cmp r0, #0x0e - ldrlt pc, [pc, r0, lsl #2] + PIC_XB(lt ,r0, lsl #2) b m_m68k_read8_hi - .long m_m68k_read8_r00 - .long m_m68k_read8_r01 - .long m_m68k_read8_r02 - .long m_m68k_read8_r03 - .long m_m68k_read8_r04 - .long m_read_null @ unused bits - .long m_m68k_read8_r06 - .long m_m68k_read8_r07 - .long m_m68k_read8_r08 - .long m_m68k_read8_r09 - .long m_read_null @ reserved - .long m_read_null - .long m_m68k_read8_r0c - .long m_m68k_read8_r0d + PIC_BT(m_m68k_read8_r00) + PIC_BT(m_m68k_read8_r01) + PIC_BT(m_m68k_read8_r02) + PIC_BT(m_m68k_read8_r03) + PIC_BT(m_m68k_read8_r04) + PIC_BT(m_read_null) @ unused bits + PIC_BT(m_m68k_read8_r06) + PIC_BT(m_m68k_read8_r07) + PIC_BT(m_m68k_read8_r08) + PIC_BT(m_m68k_read8_r09) + PIC_BT(m_read_null) @ reserved + PIC_BT(m_read_null) + PIC_BT(m_m68k_read8_r0c) + PIC_BT(m_m68k_read8_r0d) m_m68k_read8_r00: add r1, r1, #0x110000 ldr r0, [r1, #0x30] @@ -178,9 +180,9 @@ m_m68k_read8_r02: bx lr m_m68k_read8_r03: add r1, r1, #0x110000 - push {r1, lr} + stmfd sp!, {r1, lr} bl m68k_comm_check - pop {r1, lr} + ldmfd sp!, {r1, lr} ldrb r0, [r1, #3] and r0, r0, #0xc7 bx lr @@ -219,10 +221,10 @@ m_m68k_read8_hi: add r1, r1, #0x110000 movge r0, #0 bxge lr - add r1, r0 - push {r1, lr} + add r1, r1, r0 + stmfd sp!, {r1, lr} bl m68k_comm_check - pop {r1, lr} + ldmfd sp!, {r1, lr} ldrb r0, [r1] bx lr @@ -238,7 +240,7 @@ PicoReadM68k16_cell1: @ 0x220000 - 0x23ffff, cell arranged mov r3, #0x0e0000 0: cell_map - ldr r1, =Pico + PIC_LDR(r1, r2, Pico) add r0, r0, r3 ldr r1, [r1, #OFS_Pico_rom] @ Pico.mcd bic r0, r0, #1 @@ -252,19 +254,19 @@ PicoRead16_mcd_io: bne PicoRead16_io m_m68k_read16_m68k_regs: - ldr r1, =Pico + PIC_LDR(r1, r2, Pico) and r0, r0, #0x3e ldr r1, [r1, #OFS_Pico_rom] @ Pico.mcd cmp r0, #0x0e - ldrlt pc, [pc, r0, lsl #1] + PIC_XB(lt ,r0, lsl #1) b m_m68k_read16_hi - .long m_m68k_read16_r00 - .long m_m68k_read16_r02 - .long 
m_m68k_read16_r04 - .long m_m68k_read16_r06 - .long m_m68k_read16_r08 - .long m_read_null @ reserved - .long m_m68k_read16_r0c + PIC_BT(m_m68k_read16_r00) + PIC_BT(m_m68k_read16_r02) + PIC_BT(m_m68k_read16_r04) + PIC_BT(m_m68k_read16_r06) + PIC_BT(m_m68k_read16_r08) + PIC_BT(m_read_null) @ reserved + PIC_BT(m_m68k_read16_r0c) m_m68k_read16_r00: add r1, r1, #0x110000 ldr r0, [r1, #0x30] @@ -275,9 +277,9 @@ m_m68k_read16_r00: bx lr m_m68k_read16_r02: add r1, r1, #0x110000 - push {r1, lr} + stmfd sp!, {r1, lr} bl m68k_comm_check - pop {r1, lr} + ldmfd sp!, {r1, lr} ldrb r2, [r1, #3] ldrb r0, [r1, #2] and r2, r2, #0xc7 @@ -307,9 +309,9 @@ m_m68k_read16_hi: bxge lr add r1, r0, r1 - push {r1, lr} + stmfd sp!, {r1, lr} bl m68k_comm_check - pop {r0, lr} + ldmfd sp!, {r0, lr} ldrh r0, [r0] mov r1, r0, lsr #8 and r0, r0, #0xff @@ -329,7 +331,7 @@ PicoWriteM68k8_cell1: @ 0x220000 - 0x23ffff, cell arranged 0: mov r3, r1 cell_map - ldr r2, =Pico + PIC_LDR(r2, r1, Pico) add r0, r0, r12 ldr r2, [r2, #OFS_Pico_rom] @ Pico.mcd ldr r2, [r2] @@ -357,7 +359,7 @@ PicoWriteM68k16_cell1: @ 0x220000 - 0x23ffff, cell arranged 0: mov r3, r1 cell_map - ldr r1, =Pico + PIC_LDR(r1, r2, Pico) add r0, r0, r12 ldr r1, [r1, #OFS_Pico_rom] @ Pico.mcd bic r0, r0, #1 @@ -399,7 +401,7 @@ PicoReadS68k8_dec0: @ 0x080000 - 0x0bffff PicoReadS68k8_dec1: mov r3, #0x0a0000 @ + ^ / 2 0: - ldr r2, =Pico + PIC_LDR(r2, r1, Pico) eor r0, r0, #2 ldr r2, [r2, #OFS_Pico_rom] @ Pico.mcd movs r0, r0, lsr #1 @ +4-6 <<16 @@ -431,7 +433,7 @@ m_s68k_read8_regs: bx lr m_s68k_read8_comm: - ldr r1, =Pico + PIC_LDR(r1, r2, Pico) ldr r1, [r1, #OFS_Pico_rom] @ Pico.mcd add r1, r1, #0x110000 ldrb r1, [r1, r0] @@ -444,7 +446,7 @@ m_s68k_read8_pcm: bne m_read_null @ must not trash r3 and r12 - ldr r1, =Pico + PIC_LDR(r1, r2, Pico) bic r0, r0, #0xff0000 ldr r1, [r1, #OFS_Pico_rom] @ Pico.mcd mov r2, #0x110000 @@ -479,7 +481,7 @@ PicoReadS68k16_dec0: @ 0x080000 - 0x0bffff PicoReadS68k16_dec1: mov r3, #0x0a0000 @ + ^ / 2 0: - ldr r2, 
=Pico + PIC_LDR(r2, r1, Pico) eor r0, r0, #2 ldr r2, [r2, #OFS_Pico_rom] @ Pico.mcd mov r0, r0, lsr #1 @ +4-6 <<16 @@ -505,12 +507,11 @@ m_s68k_read16_regs: mov r0, #1 b cdc_host_r - @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ .macro m_s68k_write8_2M_decode - ldr r2, =Pico + PIC_LDR(r2, ip, Pico) eor r0, r0, #2 ldr r2, [r2, #OFS_Pico_rom] @ Pico.mcd movs r0, r0, lsr #1 @ +4-6 <<16 @@ -594,7 +595,7 @@ m_s68k_write8_pcm: bxlt lr m_s68k_write8_pcm_ram: - ldr r3, =Pico + PIC_LDR(r3, r2, Pico) bic r0, r0, #0x00e000 ldr r3, [r3, #OFS_Pico_rom] @ Pico.mcd mov r0, r0, lsr #1 @@ -608,12 +609,11 @@ m_s68k_write8_pcm_ram: strb r1, [r3, r0] bx lr - @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ .macro m_s68k_write16_2M_decode - ldr r2, =Pico + PIC_LDR(r2, ip, Pico) eor r0, r0, #2 ldr r2, [r2, #OFS_Pico_rom] @ Pico.mcd mov r0, r0, lsr #1 @ +4-6 <<16 @@ -694,13 +694,110 @@ m_s68k_write16_regs: bne s68k_reg_write16 m_s68k_write16_regs_spec: @ special case - ldr r2, =Pico + PIC_LDR(r2, r0, Pico) mov r0, #0x110000 ldr r2, [r2, #OFS_Pico_rom] @ Pico.mcd add r0, r0, #0x00000f strb r1, [r2, r0] @ if (a == 0xe) s68k_regs[0xf] = d; bx lr +.global s68k_read8 +.global s68k_read16 +.global s68k_read32 +.global s68k_write8 +.global s68k_write16 +.global s68k_write32 + +s68k_read8: + PIC_LDR(r3, r2, s68k_read8_map) + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + eor r2, r0, #1 + movs r3, r3, lsl #1 + ldrccb r0, [r3, r2] + bxcc lr + bx r3 + +s68k_read16: + PIC_LDR(r3, r2, s68k_read16_map) + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + bic r0, r0, #1 + movs r3, r3, lsl #1 + ldrcch r0, [r3, r0] + bxcc lr + bx r3 + +s68k_read32: + PIC_LDR(r3, r2, s68k_read16_map) + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + bic r0, r0, #1 + movs r3, r3, lsl #1 + ldrcch r1, [r3, r0]! 
+ ldrcch r0, [r3, #2] + orrcc r0, r0, r1, lsl #16 + bxcc lr + + stmfd sp!, {r0, r3, r4, lr} + mov lr, pc + bx r3 + ldmfd sp!, {r1, r3} + str r0, [sp] + add r0, r1, #2 + mov lr, pc + bx r3 + ldmfd sp!, {r1, lr} + mov r0, r0, lsl #16 + mov r1, r1, lsl #16 + orr r0, r1, r0, lsr #16 + bx lr + +s68k_write8: + PIC_LDR(r3, r2, s68k_write8_map) + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + eor r2, r0, #1 + movs r3, r3, lsl #1 + strccb r1, [r3, r2] + bxcc lr + bx r3 + +s68k_write16: + PIC_LDR(r3, r2, s68k_write16_map) + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + bic r0, r0, #1 + movs r3, r3, lsl #1 + strcch r1, [r3, r0] + bxcc lr + bx r3 + +s68k_write32: + PIC_LDR(r3, r2, s68k_write16_map) + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + bic r0, r0, #1 + movs r3, r3, lsl #1 + movcc r2, r1, lsr #16 + strcch r2, [r3, r0]! + strcch r1, [r3, #2] + bxcc lr + + stmfd sp!, {r0, r1, r3, lr} + mov r1, r1, lsr #16 + mov lr, pc + bx r3 + ldmfd sp!, {r0, r1, r3, lr} + add r0, r0, #2 + bx r3 + .pool @ vim:filetype=armasm diff --git a/pico/debug.c b/pico/debug.c index 50cbaf387..e4b5232ec 100644 --- a/pico/debug.c +++ b/pico/debug.c @@ -43,6 +43,12 @@ char *PDebugMain(void) !!(Pico.sv.flags & SRF_ENABLED), !!(Pico.sv.flags & SRF_EEPROM), Pico.sv.eeprom_type); MVP; sprintf(dstrp, "sram range: %06x-%06x, reg: %02x\n", Pico.sv.start, Pico.sv.end, Pico.m.sram_reg); MVP; sprintf(dstrp, "pend int: v:%i, h:%i, vdp status: %04x\n", bit(pv->pending_ints,5), bit(pv->pending_ints,4), pv->status); MVP; + sprintf(dstrp, "VDP regs 00-07: %02x %02x %02x %02x %02x %02x %02x %02x\n",reg[0],reg[1],reg[2],reg[3],reg[4],reg[5],reg[6],reg[7]); MVP; + sprintf(dstrp, "VDP regs 08-0f: %02x %02x %02x %02x %02x %02x %02x %02x\n",reg[8],reg[9],reg[10],reg[11],reg[12],reg[13],reg[14],reg[15]); MVP; + sprintf(dstrp, "VDP regs 10-17: %02x %02x %02x %02x %02x %02x %02x 
%02x\n",reg[16],reg[17],reg[18],reg[19],reg[20],reg[21],reg[22],reg[23]); MVP; + sprintf(dstrp, "VDP regs 18-1f: %02x %02x %02x %02x %02x %02x %02x %02x\n",reg[24],reg[25],reg[26],reg[27],reg[28],reg[29],reg[30],reg[31]); MVP; + r = (reg[5]<<9)+(reg[6]<<11); + sprintf(dstrp, "sprite #0: %04x %04x %04x %04x\n",PicoMem.vram[r/2],PicoMem.vram[r/2+1],PicoMem.vram[r/2+2],PicoMem.vram[r/2+3]); MVP; sprintf(dstrp, "pal: %i, hw: %02x, frame#: %i, cycles: %u\n", Pico.m.pal, Pico.m.hardware, Pico.m.frame_count, SekCyclesDone()); MVP; sprintf(dstrp, "M68k: PC: %06x, SR: %04x, irql: %i\n", SekPc, SekSr, SekIrqLevel); MVP; for (r = 0; r < 8; r++) { @@ -369,42 +375,32 @@ void PDebugDumpMem(void) void PDebugZ80Frame(void) { - int lines, line_sample; + int lines; if (PicoIn.AHW & PAHW_SMS) return; - if (Pico.m.pal) { + if (Pico.m.pal) lines = 313; - line_sample = 68; - } else { + else lines = 262; - line_sample = 93; - } z80_resetCycles(); PsndStartFrame(); - if (/*Pico.m.z80Run &&*/ !Pico.m.z80_reset && (PicoIn.opt&POPT_EN_Z80)) - PicoSyncZ80(Pico.t.m68c_cnt + line_sample * 488); - if (PicoIn.sndOut) - PsndGetSamples(line_sample); - if (/*Pico.m.z80Run &&*/ !Pico.m.z80_reset && (PicoIn.opt&POPT_EN_Z80)) { PicoSyncZ80(Pico.t.m68c_cnt + 224 * 488); z80_int(); } - if (PicoIn.sndOut) - PsndGetSamples(224); // sync z80 if (/*Pico.m.z80Run &&*/ !Pico.m.z80_reset && (PicoIn.opt&POPT_EN_Z80)) { Pico.t.m68c_cnt += Pico.m.pal ? 151809 : 127671; // cycles adjusted for converter PicoSyncZ80(Pico.t.m68c_cnt); } - if (PicoIn.sndOut && ym2612.dacen && Pico.snd.dac_line < lines) - PsndDoDAC(lines - 1); - PsndDoPSG(lines - 1); + + if (PicoIn.sndOut) + PsndGetSamples(lines); timers_cycle(); Pico.t.m68c_aim = Pico.t.m68c_cnt; diff --git a/pico/draw.c b/pico/draw.c index a6c5903b3..bdd450e0b 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -2,6 +2,7 @@ * line renderer * (c) Copyright Dave, 2004 * (C) notaz, 2006-2010 + * (C) kub, 2019-2020 * * This work is licensed under the terms of MAME license. 
* See COPYING file in the top-level directory. @@ -29,6 +30,7 @@ */ #include "pico_int.h" +#define FORCE // layer forcing via debug register? int (*PicoScanBegin)(unsigned int num) = NULL; int (*PicoScanEnd) (unsigned int num) = NULL; @@ -45,6 +47,8 @@ static int HighCacheA[41+1]; // caches for high layers static int HighCacheB[41+1]; static int HighPreSpr[80*2+1]; // slightly preprocessed sprites +unsigned int VdpSATCache[128]; // VDP sprite cache (1st 32 sprite attr bits) + #define LF_PLANE_1 (1 << 0) #define LF_SH (1 << 1) // must be = 2 #define LF_FORCE (1 << 2) @@ -53,7 +57,11 @@ static int HighPreSpr[80*2+1]; // slightly preprocessed sprites #define SPRL_HAVE_LO 0x40 // *lo* #define SPRL_MAY_HAVE_OP 0x20 // may have operator sprites on the line #define SPRL_LO_ABOVE_HI 0x10 // low priority sprites may be on top of hi -unsigned char HighLnSpr[240][3 + MAX_LINE_SPRITES]; // sprite_count, ^flags, tile_count, [spritep]... +#define SPRL_HAVE_X 0x08 // have sprites with x != 0 +#define SPRL_TILE_OVFL 0x04 // tile limit exceeded on previous line +#define SPRL_HAVE_MASK0 0x02 // have sprite with x == 0 in 1st slot +#define SPRL_MASKED 0x01 // lo prio masking by sprite with x == 0 active +unsigned char HighLnSpr[240][4+MAX_LINE_SPRITES+1]; // sprite_count, ^flags, tile_count, sprites_total, [spritep]..., last_width int rendstatus_old; int rendlines; @@ -94,7 +102,7 @@ void blockcpy_or(void *dst, void *src, size_t n, int pat) #define blockcpy memcpy #endif -#define TileNormMaker_(pix_func) \ +#define TileNormMaker_(pix_func,ret) \ { \ unsigned int t; \ \ @@ -106,9 +114,10 @@ void blockcpy_or(void *dst, void *src, size_t n, int pat) t = (pack&0x0f000000)>>24; pix_func(5); \ t = (pack&0x00f00000)>>20; pix_func(6); \ t = (pack&0x000f0000)>>16; pix_func(7); \ + return ret; \ } -#define TileFlipMaker_(pix_func) \ +#define TileFlipMaker_(pix_func,ret) \ { \ unsigned int t; \ \ @@ -120,23 +129,24 @@ void blockcpy_or(void *dst, void *src, size_t n, int pat) t = 
(pack&0x000000f0)>> 4; pix_func(5); \ t = (pack&0x00000f00)>> 8; pix_func(6); \ t = (pack&0x0000f000)>>12; pix_func(7); \ + return ret; \ } #define TileNormMaker(funcname, pix_func) \ static void funcname(unsigned char *pd, unsigned int pack, int pal) \ -TileNormMaker_(pix_func) +TileNormMaker_(pix_func,) #define TileFlipMaker(funcname, pix_func) \ static void funcname(unsigned char *pd, unsigned int pack, int pal) \ -TileFlipMaker_(pix_func) +TileFlipMaker_(pix_func,) #define TileNormMakerAS(funcname, pix_func) \ -static void funcname(unsigned char *pd, unsigned char *mb, unsigned int pack, int pal) \ -TileNormMaker_(pix_func) +static unsigned funcname(unsigned char *pd, unsigned m, unsigned int pack, int pal) \ +TileNormMaker_(pix_func,m) #define TileFlipMakerAS(funcname, pix_func) \ -static void funcname(unsigned char *pd, unsigned char *mb, unsigned int pack, int pal) \ -TileFlipMaker_(pix_func) +static unsigned funcname(unsigned char *pd, unsigned m, unsigned int pack, int pal) \ +TileFlipMaker_(pix_func,m) #define pix_just_write(x) \ if (t) pd[x]=pal|t @@ -178,17 +188,19 @@ TileFlipMaker(TileFlipSH_onlyop_lp, pix_sh_onlyop) #endif +// AS: sprite mask bits in m shifted to bits 8-15, see DrawSpritesHiAS + // draw a sprite pixel (AS) #define pix_as(x) \ - if (t & mb[x]) mb[x] = 0, pd[x] = pal | t + if (t && (m & (1<<(x+8)))) m &= ~(1<<(x+8)), pd[x] = pal | t TileNormMakerAS(TileNormAS, pix_as) TileFlipMakerAS(TileFlipAS, pix_as) // draw a sprite pixel, process operator colors (AS) #define pix_sh_as(x) \ - if (t & mb[x]) { \ - mb[x] = 0; \ + if (t && (m & (1<<(x+8)))) { \ + m &= ~(1<<(x+8)); \ if (t>=0xe) pd[x]=(pd[x]&0x3f)|(t<<6); /* c0 shadow, 80 hilight */ \ else pd[x] = pal | t; \ } @@ -197,8 +209,8 @@ TileNormMakerAS(TileNormSH_AS, pix_sh_as) TileFlipMakerAS(TileFlipSH_AS, pix_sh_as) #define pix_sh_as_onlyop(x) \ - if (t & mb[x]) { \ - mb[x] = 0; \ + if (t && (m & (1<<(x+8)))) { \ + m &= ~(1<<(x+8)); \ pix_sh_onlyop(x); \ } @@ -207,11 +219,12 @@ 
TileFlipMakerAS(TileFlipSH_AS_onlyop_lp, pix_sh_as_onlyop) // mark pixel as sprite pixel (AS) #define pix_sh_as_onlymark(x) \ - if (t) mb[x] = 0 + if (t) m &= ~(1<<(x+8)) TileNormMakerAS(TileNormAS_onlymark, pix_sh_as_onlymark) TileFlipMakerAS(TileFlipAS_onlymark, pix_sh_as_onlymark) +#ifdef FORCE // forced both layer draw (through debug reg) #define pix_and(x) \ pd[x] = (pd[x] & 0xc0) | (pd[x] & (pal | t)) @@ -219,6 +232,18 @@ TileFlipMakerAS(TileFlipAS_onlymark, pix_sh_as_onlymark) TileNormMaker(TileNorm_and, pix_and) TileFlipMaker(TileFlip_and, pix_and) +// forced sprite draw (through debug reg) +#define pix_sh_as_and(x) /* XXX is there S/H with forced draw? */ \ + if (m & (1<<(x+8))) { \ + m &= ~(1<<(x+8)); \ + if (t>=0xe) pd[x]=(pd[x]&0x3f)|(t<<6); /* c0 shadow, 80 hilight */ \ + else pd[x] = (pd[x] & 0xc0) | (pd[x] & (pal | t)); \ + } + +TileNormMakerAS(TileNormSH_AS_and, pix_sh_as_and) +TileFlipMakerAS(TileFlipSH_AS_and, pix_sh_as_and) +#endif + // -------------------------------------------- #ifndef _ASM_DRAW_C @@ -293,6 +318,7 @@ static void DrawStripVSRam(struct TileStrip *ts, int plane_sh, int cellskip) int adj = ((ts->hscroll ^ dx) >> 3) & 1; cell -= adj + 1; ts->cells -= adj; + PicoMem.vsram[0x3e] = PicoMem.vsram[0x3f] = plane_sh >> 16; } cell+=cellskip; tilex+=cellskip; @@ -306,7 +332,7 @@ static void DrawStripVSRam(struct TileStrip *ts, int plane_sh, int cellskip) //if((cell&1)==0) { int line,vscroll; - vscroll=PicoMem.vsram[(plane_sh&1)+(cell&~1)]; + vscroll=PicoMem.vsram[(plane_sh&1)+(cell&0x3e)]; // Find the line in the name table line=(vscroll+scan)&ts->line&0xffff; // ts->line is really ymask .. 
@@ -315,7 +341,7 @@ static void DrawStripVSRam(struct TileStrip *ts, int plane_sh, int cellskip) } code=PicoMem.vram[ts->nametab+nametabadd+(tilex&ts->xmask)]; - if (code==blank) continue; + if ((code<<16|ty)==blank) continue; if (code>>15) { // high priority tile int cval = code | (dx<<16) | (ty<<25); if(code&0x1000) cval^=7<<26; @@ -327,14 +353,15 @@ static void DrawStripVSRam(struct TileStrip *ts, int plane_sh, int cellskip) oldcode = code; // Get tile address/2: addr=(code&0x7ff)<<4; - if (code&0x1000) addr+=14-ty; else addr+=ty; // Y-flip pal=((code>>9)&0x30)|((plane_sh<<5)&0x40); } - pack = *(unsigned int *)(PicoMem.vram + addr); + if (code & 0x1000) ty ^= 0xe; // Y-flip + pack = *(unsigned int *)(PicoMem.vram + addr+ty); + if (!pack) { - blank = code; + blank = code<<16|ty; continue; } @@ -382,7 +409,7 @@ void DrawStripInterlace(struct TileStrip *ts) if (code!=oldcode) { oldcode = code; // Get tile address/2: - addr=(code&0x7ff)<<5; + addr=(code&0x3ff)<<5; if (code&0x1000) addr+=30-ty; else addr+=ty; // Y-flip // pal=Pico.cram+((code>>9)&0x30); @@ -437,8 +464,11 @@ static void DrawLayer(int plane_sh, int *hcache, int cellskip, int maxcells, else ts.nametab=(pvid->reg[2]&0x38)<< 9; // A htab=pvid->reg[13]<<9; // Horizontal scroll table address - if ( pvid->reg[11]&2) htab+=est->DrawScanline<<1; // Offset by line - if ((pvid->reg[11]&1)==0) htab&=~0xf; // Offset by tile + switch (pvid->reg[11]&3) { + case 1: htab += (est->DrawScanline<<1) & 0x0f; break; + case 2: htab += (est->DrawScanline<<1) & ~0x0f; break; // Offset by tile + case 3: htab += (est->DrawScanline<<1); break; // Offset by line + } htab+=plane_sh&1; // A or B // Get horizontal scroll value, will be masked later @@ -457,6 +487,10 @@ static void DrawLayer(int plane_sh, int *hcache, int cellskip, int maxcells, // shit, we have 2-cell column based vscroll // luckily this doesn't happen too often ts.line=ymask|(shift[width]<<24); // save some stuff instead of line + // vscroll value for leftmost 
cells in case of hscroll not on 16px boundary + // XXX it's unclear what exactly the hw is doing. Continue reading where it + // stopped last seems to work best (H40: 0x50 (wrap->0x00), H32 0x40). + plane_sh |= PicoMem.vsram[(pvid->reg[12]&1?0x00:0x20) + (plane_sh&1)] << 16; DrawStripVSRam(&ts, plane_sh, cellskip); } else { vscroll = PicoMem.vsram[plane_sh & 1]; // Get vertical scroll value @@ -614,19 +648,14 @@ static void DrawTilesFromCache(int *hc, int sh, int rlim, struct PicoEState *est if (!sh) { - short blank=-1; // The tile we know is blank while ((code=*hc++)) { - if (!(code & 0x8000) || (short)code == blank) - continue; // Get tile address/2: addr = (code & 0x7ff) << 4; addr += code >> 25; // y offset into tile pack = *(unsigned int *)(PicoMem.vram + addr); - if (!pack) { - blank = (short)code; + if (!pack) continue; - } dx = (code >> 16) & 0x1ff; pal = ((code >> 9) & 0x30); @@ -706,7 +735,7 @@ static void DrawTilesFromCache(int *hc, int sh, int rlim, struct PicoEState *est // Index + 0 : hhhhvvvv ab--hhvv yyyyyyyy yyyyyyyy // a: offscreen h, b: offs. v, h: horiz. 
size // Index + 4 : xxxxxxxx xxxxxxxx pccvhnnn nnnnnnnn // x: x coord + 8 -static void DrawSprite(int *sprite, int sh) +static void DrawSprite(int *sprite, int sh, int w) { void (*fTileFunc)(unsigned char *pd, unsigned int pack, int pal); unsigned char *pd = Pico.est.HighCol; @@ -746,6 +775,7 @@ static void DrawSprite(int *sprite, int sh) else fTileFunc=TileNorm; } + if (w) width = w; // tile limit for (; width; width--,sx+=8,tile+=delta) { unsigned int pack; @@ -759,28 +789,6 @@ static void DrawSprite(int *sprite, int sh) } #endif -static NOINLINE void DrawTilesFromCacheForced(const int *hc) -{ - unsigned char *pd = Pico.est.HighCol; - int code, addr, dx; - unsigned int pack; - int pal; - - // *ts->hc++ = code | (dx<<16) | (ty<<25); - while ((code = *hc++)) { - // Get tile address/2: - addr = (code & 0x7ff) << 4; - addr += (code >> 25) & 0x0e; // y offset into tile - - dx = (code >> 16) & 0x1ff; - pal = ((code >> 9) & 0x30); - pack = *(unsigned int *)(PicoMem.vram + addr); - - if (code & 0x0800) TileFlip_and(pd + dx, pack, pal); - else TileNorm_and(pd + dx, pack, pal); - } -} - static void DrawSpriteInterlace(unsigned int *sprite) { unsigned char *pd = Pico.est.HighCol; @@ -833,12 +841,13 @@ static NOINLINE void DrawAllSpritesInterlace(int pri, int sh) struct PicoVideo *pvid=&Pico.video; int i,u,table,link=0,sline=Pico.est.DrawScanline<<1; unsigned int *sprites[80]; // Sprite index + int max_sprites = Pico.video.reg[12]&1 ? 
80 : 64; table=pvid->reg[5]&0x7f; if (pvid->reg[12]&1) table&=0x7e; // Lowest bit 0 in 40-cell mode table<<=8; // Get sprite table address/2 - for (i=u=0; u < 80 && i < 21; u++) + for (i = u = 0; u < max_sprites && link < max_sprites; u++) { unsigned int *sprite; int code, sx, sy, height; @@ -885,18 +894,25 @@ static NOINLINE void DrawAllSpritesInterlace(int pri, int sh) */ static void DrawSpritesSHi(unsigned char *sprited, const struct PicoEState *est) { + static void (*tilefuncs[2][2][2])(unsigned char *, unsigned, int) = { + { {NULL, NULL}, {TileNorm, TileFlip} }, + { {TileNormSH_onlyop_lp, TileFlipSH_onlyop_lp}, {TileNormSH, TileFlipSH} } + }; // [sh?][hi?][flip?] void (*fTileFunc)(unsigned char *pd, unsigned int pack, int pal); unsigned char *pd = Pico.est.HighCol; unsigned char *p; - int cnt; + int cnt, w; cnt = sprited[0] & 0x7f; if (cnt == 0) return; - p = &sprited[3]; + p = &sprited[4]; + if ((sprited[1] & (SPRL_TILE_OVFL|SPRL_HAVE_MASK0)) == (SPRL_TILE_OVFL|SPRL_HAVE_MASK0)) + return; // masking effective due to tile overflow // Go through sprites backwards: - for (cnt--; cnt >= 0; cnt--) + w = p[cnt]; // possibly clipped width of last sprite + for (cnt--; cnt >= 0; cnt--, w = 0) { int *sprite, code, pal, tile, sx, sy; int offs, delta, width, height, row; @@ -906,21 +922,8 @@ static void DrawSpritesSHi(unsigned char *sprited, const struct PicoEState *est) code = sprite[1]; pal = (code>>9)&0x30; - if (pal == 0x30) - { - if (code & 0x8000) // hi priority - { - if (code&0x800) fTileFunc=TileFlipSH; - else fTileFunc=TileNormSH; - } else { - if (code&0x800) fTileFunc=TileFlipSH_onlyop_lp; - else fTileFunc=TileNormSH_onlyop_lp; - } - } else { - if (!(code & 0x8000)) continue; // non-operator low sprite, already drawn - if (code&0x800) fTileFunc=TileFlip; - else fTileFunc=TileNorm; - } + fTileFunc = tilefuncs[pal == 0x30][!!(code & 0x8000)][!!(code & 0x800)]; + if (fTileFunc == NULL) continue; // non-operator low sprite, already drawn // parse remaining sprite 
data sy=sprite[0]; @@ -940,6 +943,7 @@ static void DrawSpritesSHi(unsigned char *sprited, const struct PicoEState *est) tile &= 0x7ff; tile<<=4; tile+=(row&7)<<1; // Tile address delta<<=4; // Delta of address + if (w) width = w; // tile limit for (; width; width--,sx+=8,tile+=delta) { unsigned int pack; @@ -956,18 +960,24 @@ static void DrawSpritesSHi(unsigned char *sprited, const struct PicoEState *est) static void DrawSpritesHiAS(unsigned char *sprited, int sh) { - void (*fTileFunc)(unsigned char *pd, unsigned char *mb, - unsigned int pack, int pal); + static unsigned (*tilefuncs[2][2][2])(unsigned char *, unsigned, unsigned, int) = { + { {TileNormAS_onlymark, TileFlipAS_onlymark}, {TileNormAS, TileFlipAS} }, + { {TileNormSH_AS_onlyop_lp, TileFlipSH_AS_onlyop_lp}, {TileNormSH_AS, TileFlipSH_AS} } + }; // [sh?][hi?][flip?] + unsigned (*fTileFunc)(unsigned char *pd, unsigned m, unsigned int pack, int pal); unsigned char *pd = Pico.est.HighCol; - unsigned char mb[8+320+8]; - unsigned char *p; + unsigned char mb[1+320/8+1]; + unsigned char *p, *mp; + unsigned m; int entry, cnt; cnt = sprited[0] & 0x7f; if (cnt == 0) return; memset(mb, 0xff, sizeof(mb)); - p = &sprited[3]; + p = &sprited[4]; + if ((sprited[1] & (SPRL_TILE_OVFL|SPRL_HAVE_MASK0)) == (SPRL_TILE_OVFL|SPRL_HAVE_MASK0)) + return; // masking effective due to tile overflow // Go through sprites: for (entry = 0; entry < cnt; entry++) @@ -980,27 +990,241 @@ static void DrawSpritesHiAS(unsigned char *sprited, int sh) code = sprite[1]; pal = (code>>9)&0x30; - if (sh && pal == 0x30) + fTileFunc = tilefuncs[(sh && pal == 0x30)][!!(code&0x8000)][!!(code&0x800)]; + + // parse remaining sprite data + sy=sprite[0]; + sx=code>>16; // X + width=sy>>28; + height=(sy>>24)&7; // Width and height in tiles + sy=(sy<<16)>>16; // Y + + row=Pico.est.DrawScanline-sy; // Row of the sprite we are on + + if (code&0x1000) row=(height<<3)-1-row; // Flip Y + + tile=code + (row>>3); // Tile number increases going down + delta=height; 
// Delta to increase tile by going right + if (code&0x0800) { tile+=delta*(width-1); delta=-delta; } // Flip X + + tile &= 0x7ff; tile<<=4; tile+=(row&7)<<1; // Tile address + delta<<=4; // Delta of address + + if (entry+1 == cnt) width = p[entry+1]; // last sprite width limited? + while (sx <= 0 && width) width--, sx+=8, tile+=delta; // Offscreen + mp = mb+(sx>>3); + for (m = *mp; width; width--, sx+=8, tile+=delta, *mp++ = m, m >>= 8) { - if (code & 0x8000) // hi priority - { - if (code&0x800) fTileFunc = TileFlipSH_AS; - else fTileFunc = TileNormSH_AS; - } else { - if (code&0x800) fTileFunc = TileFlipSH_AS_onlyop_lp; - else fTileFunc = TileNormSH_AS_onlyop_lp; - } - } else { - if (code & 0x8000) // hi priority - { - if (code&0x800) fTileFunc = TileFlipAS; - else fTileFunc = TileNormAS; - } else { - if (code&0x800) fTileFunc = TileFlipAS_onlymark; - else fTileFunc = TileNormAS_onlymark; - } + unsigned int pack; + + if(sx>=328) break; // Offscreen + + pack = *(unsigned int *)(PicoMem.vram + (tile & 0x7fff)); + + m |= mp[1] << 8; // next mask byte + // shift mask bits to bits 8-15 for easier load/store handling + m = fTileFunc(pd + sx, m << (8-(sx&0x7)), pack, pal) >> (8-(sx&0x7)); + } + *mp = m; // write last mask byte + } +} + +#ifdef FORCE +static void DrawStripForced(struct TileStrip *ts, int lflags, int cellskip) +{ + unsigned char *pd = Pico.est.HighCol; + int tilex,dx,ty,code=0,addr=0,cells; + int oldcode=-1; + int pal=0,sh; + + // Draw tiles across screen: + sh = (lflags & LF_SH) << 5; // 0x40 + tilex=((-ts->hscroll)>>3)+cellskip; + ty=(ts->line&7)<<1; // Y-Offset into tile + dx=((ts->hscroll-1)&7)+1; + cells = ts->cells - cellskip; + if(dx != 8) cells++; // have hscroll, need to draw 1 cell more + dx+=cellskip<<3; + + for (; cells > 0; dx+=8, tilex++, cells--) + { + unsigned int pack; + + code = PicoMem.vram[ts->nametab + (tilex & ts->xmask)]; + + if (code!=oldcode) { + oldcode = code; + // Get tile address/2: + addr=(code&0x7ff)<<4; + addr+=ty; + if 
(code&0x1000) addr^=0xe; // Y-flip + + pal=((code>>9)&0x30)|sh; } + pack = *(unsigned int *)(PicoMem.vram + addr); + + if (code & 0x0800) TileFlip_and(pd + dx, pack, pal); + else TileNorm_and(pd + dx, pack, pal); + } +} + +// this is messy +static void DrawStripVSRamForced(struct TileStrip *ts, int plane_sh, int cellskip) +{ + unsigned char *pd = Pico.est.HighCol; + int tilex,dx,code=0,addr=0,cell=0; + int oldcode=-1; + int pal=0,scan=Pico.est.DrawScanline; + + // Draw tiles across screen: + tilex=(-ts->hscroll)>>3; + dx=((ts->hscroll-1)&7)+1; + if (ts->hscroll & 0x0f) { + int adj = ((ts->hscroll ^ dx) >> 3) & 1; + cell -= adj + 1; + ts->cells -= adj; + PicoMem.vsram[0x3e] = PicoMem.vsram[0x3f] = plane_sh >> 16; + } + cell+=cellskip; + tilex+=cellskip; + dx+=cellskip<<3; + + for (; cell < ts->cells; dx+=8,tilex++,cell++) + { + int nametabadd, ty; + unsigned int pack; + + //if((cell&1)==0) + { + int line,vscroll; + vscroll=PicoMem.vsram[(plane_sh&1)+(cell&0x3e)]; + + // Find the line in the name table + line=(vscroll+scan)&ts->line&0xffff; // ts->line is really ymask .. + nametabadd=(line>>3)<<(ts->line>>24); // .. 
and shift[width] + ty=(line&7)<<1; // Y-Offset into tile + } + + code=PicoMem.vram[ts->nametab+nametabadd+(tilex&ts->xmask)]; + + if (code!=oldcode) { + oldcode = code; + // Get tile address/2: + addr=(code&0x7ff)<<4; + + pal=((code>>9)&0x30)|((plane_sh<<5)&0x40); + } + + if (code & 0x1000) ty ^= 0xe; // Y-flip + pack = *(unsigned int *)(PicoMem.vram + addr+ty); + + if (code & 0x0800) TileFlip_and(pd + dx, pack, pal); + else TileNorm_and(pd + dx, pack, pal); + } +} + +static void DrawLayerForced(int plane_sh, int cellskip, int maxcells, + struct PicoEState *est) +{ + struct PicoVideo *pvid=&Pico.video; + const char shift[4]={5,6,5,7}; // 32,64 or 128 sized tilemaps (2 is invalid) + struct TileStrip ts; + int width, height, ymask; + int vscroll, htab; + + ts.cells=maxcells; + + // Work out the TileStrip to draw + + // Work out the name table size: 32 64 or 128 tiles (0-3) + width=pvid->reg[16]; + height=(width>>4)&3; width&=3; + + ts.xmask=(1<reg[4]&0x07)<<12; // B + else ts.nametab=(pvid->reg[2]&0x38)<< 9; // A + + htab=pvid->reg[13]<<9; // Horizontal scroll table address + switch (pvid->reg[11]&3) { + case 1: htab += (est->DrawScanline<<1) & 0x0f; break; + case 2: htab += (est->DrawScanline<<1) & ~0x0f; break; // Offset by tile + case 3: htab += (est->DrawScanline<<1); break; // Offset by line + } + htab+=plane_sh&1; // A or B + + // Get horizontal scroll value, will be masked later + ts.hscroll = PicoMem.vram[htab & 0x7fff]; + + if((pvid->reg[12]&6) == 6) { + // interlace mode 2 + vscroll = PicoMem.vsram[plane_sh & 1]; // Get vertical scroll value + + // Find the line in the name table + ts.line=(vscroll+(est->DrawScanline<<1))&((ymask<<1)|1); + ts.nametab+=(ts.line>>4)<reg[11]&4) { + // shit, we have 2-cell column based vscroll + // luckily this doesn't happen too often + ts.line=ymask|(shift[width]<<24); // save some stuff instead of line + // vscroll value for leftmost cells in case of hscroll not on 16px boundary + // XXX it's unclear what exactly the hw is 
doing. Continue reading where it + // stopped last seems to work best (H40: 0x50 (wrap->0x00), H32 0x40). + plane_sh |= PicoMem.vsram[(pvid->reg[12]&1?0x00:0x20) + (plane_sh&1)] << 16; + DrawStripVSRamForced(&ts, plane_sh, cellskip); + } else { + vscroll = PicoMem.vsram[plane_sh & 1]; // Get vertical scroll value + + // Find the line in the name table + ts.line=(vscroll+est->DrawScanline)&ymask; + ts.nametab+=(ts.line>>3)<>9)&0x30; + + if (code&0x800) fTileFunc = TileFlipSH_AS_and; + else fTileFunc = TileNormSH_AS_and; + // parse remaining sprite data sy=sprite[0]; sx=code>>16; // X @@ -1019,18 +1243,35 @@ static void DrawSpritesHiAS(unsigned char *sprited, int sh) tile &= 0x7ff; tile<<=4; tile+=(row&7)<<1; // Tile address delta<<=4; // Delta of address - for (; width; width--,sx+=8,tile+=delta) + if (entry+1 == cnt) width = p[entry+1]; // last sprite width limited? + while (sx <= 0 && width) width--, sx+=8, tile+=delta; // Offscreen + mp = mb+(sx>>3); + for (m = *mp; width; width--, sx+=8, tile+=delta, *mp++ = m, m >>= 8) { unsigned int pack; - if(sx<=0) continue; if(sx>=328) break; // Offscreen pack = *(unsigned int *)(PicoMem.vram + (tile & 0x7fff)); - fTileFunc(pd + sx, mb + sx, pack, pal); - } + + m |= mp[1] << 8; // next mask byte + // shift mask bits to bits 8-15 for easier load/store handling + m = fTileFunc(pd + sx, m << (8-(sx&0x7)), pack, pal) >> (8-(sx&0x7)); + } + *mp = m; // write last mask byte } + + // anything not covered by a sprite is off (XXX or bg?) + for (cnt = 1; cnt < sizeof(mb)-1; cnt++) + if (mb[cnt] == 0xff) + for (m = 0; m < 8; m++) + pd[8*cnt+m] = 0; + else if (mb[cnt]) + for (m = 0; m < 8; m++) + if (mb[cnt] & (1<reg[1]&8) max_lines = 240; sh = Pico.video.reg[0xC]&8; // shadow/hilight? 
table=pvid->reg[5]&0x7f; if (pvid->reg[12]&1) table&=0x7e; // Lowest bit 0 in 40-cell mode table<<=8; // Get sprite table address/2 - if (!full) - { - int pack; - // updates: tilecode, sx - for (u=0; u < max_sprites && (pack = *pd); u++, pd+=2) - { - unsigned int *sprite; - int code2, sx, sy, height; + for (u = est->DrawScanline; u < max_lines; u++) + *((int *)&HighLnSpr[u][0]) = 0; - sprite=(unsigned int *)(PicoMem.vram+((table+(link<<2))&0x7ffc)); // Find sprite - - // parse sprite info - code2 = sprite[1]; - sx = (code2>>16)&0x1ff; - sx -= 0x78; // Get X coordinate + 8 - sy = (pack << 16) >> 16; - height = (pack >> 24) & 0xf; + for (u = 0; u < max_sprites && link < max_sprites; u++) + { + unsigned int *sprite; + int code, code2, sx, sy, hv, height, width; - if (sy < max_lines && - sy + (height<<3) > est->DrawScanline && // sprite onscreen (y)? - (sx > -24 || sx < max_width)) // onscreen x - { - int y = (sy >= est->DrawScanline) ? sy : est->DrawScanline; - int entry = ((pd - HighPreSpr) / 2) | ((code2>>8)&0x80); - for (; y < sy + (height<<3) && y < max_lines; y++) - { - int i, cnt; - cnt = HighLnSpr[y][0] & 0x7f; - if (cnt >= max_line_sprites) continue; // sprite limit? - - for (i = 0; i < cnt; i++) - if (((HighLnSpr[y][3+i] ^ entry) & 0x7f) == 0) goto found; - - // this sprite was previously missing - HighLnSpr[y][3+cnt] = entry; - HighLnSpr[y][0] = cnt + 1; -found:; - if (entry & 0x80) - HighLnSpr[y][1] |= SPRL_HAVE_HI; - else HighLnSpr[y][1] |= SPRL_HAVE_LO; - } - } + sprite=(unsigned int *)(PicoMem.vram+((table+(link<<2))&0x7ffc)); // Find sprite - code2 &= ~0xfe000000; - code2 -= 0x00780000; // Get X coordinate + 8 in upper 16 bits - pd[1] = code2; + // parse sprite info. 
the 1st half comes from the VDPs internal cache, + // the 2nd half is read from VRAM + code = VdpSATCache[link]; // normally but not always equal to sprite[0] + sy = (code&0x1ff)-0x80; + hv = (code>>24)&0xf; + height = (hv&3)+1; + width = (hv>>2)+1; - // Find next sprite - link=(sprite[0]>>16)&0x7f; - if (!link) break; // End of sprites - } - } - else - { - for (u = 0; u < max_lines; u++) - *((int *)&HighLnSpr[u][0]) = 0; + code2 = sprite[1]; + sx = (code2>>16)&0x1ff; + sx -= 0x78; // Get X coordinate + 8 - for (u = 0; u < max_sprites; u++) + if (sy < max_lines && sy + (height<<3) >= est->DrawScanline) // sprite onscreen (y)? { - unsigned int *sprite; - int code, code2, sx, sy, hv, height, width; - - sprite=(unsigned int *)(PicoMem.vram+((table+(link<<2))&0x7ffc)); // Find sprite + int entry, y, w, sx_min, onscr_x, maybe_op = 0; - // parse sprite info - code = sprite[0]; - sy = (code&0x1ff)-0x80; - hv = (code>>24)&0xf; - height = (hv&3)+1; + sx_min = 8-(width<<3); + onscr_x = sx_min < sx && sx < max_width; + if (sh && (code2 & 0x6000) == 0x6000) + maybe_op = SPRL_MAY_HAVE_OP; - width = (hv>>2)+1; - code2 = sprite[1]; - sx = (code2>>16)&0x1ff; - sx -= 0x78; // Get X coordinate + 8 - - if (sy < max_lines && sy + (height<<3) > est->DrawScanline) // sprite onscreen (y)? + entry = ((pd - HighPreSpr) / 2) | ((code2>>8)&0x80); + y = (sy >= est->DrawScanline) ? sy : est->DrawScanline; + for (; y < sy + (height<<3) && y < max_lines; y++) { - int entry, y, sx_min, onscr_x, maybe_op = 0; - - sx_min = 8-(width<<3); - onscr_x = sx_min < sx && sx < max_width; - if (sh && (code2 & 0x6000) == 0x6000) - maybe_op = SPRL_MAY_HAVE_OP; - - entry = ((pd - HighPreSpr) / 2) | ((code2>>8)&0x80); - y = (sy >= est->DrawScanline) ? sy : est->DrawScanline; - for (; y < sy + (height<<3) && y < max_lines; y++) - { - unsigned char *p = &HighLnSpr[y][0]; - int cnt = p[0]; - if (cnt >= max_line_sprites) continue; // sprite limit? - - if (p[2] >= max_line_sprites*2) { // tile limit? 
- p[0] |= 0x80; - continue; - } - p[2] += width; - - if (sx == -0x78) { - if (cnt > 0) - p[0] |= 0x80; // masked, no more sprites for this line - continue; - } - // must keep the first sprite even if it's offscreen, for masking - if (cnt > 0 && !onscr_x) continue; // offscreen x - - p[3+cnt] = entry; - p[0] = cnt + 1; - p[1] |= (entry & 0x80) ? SPRL_HAVE_HI : SPRL_HAVE_LO; - p[1] |= maybe_op; // there might be op sprites on this line - if (cnt > 0 && (code2 & 0x8000) && !(p[3+cnt-1]&0x80)) - p[1] |= SPRL_LO_ABOVE_HI; + unsigned char *p = &HighLnSpr[y][0]; + int cnt = p[0]; + if (p[3] >= max_line_sprites) continue; // sprite limit? + if ((p[1] & SPRL_MASKED) && !(entry & 0x80)) continue; // masked? + + w = width; + if (p[2] + width > max_line_sprites*2) { // tile limit? + if (y+1 < 240) HighLnSpr[y+1][1] |= SPRL_TILE_OVFL; + if (p[2] >= max_line_sprites*2) continue; + w = max_line_sprites*2 - p[2]; } + p[2] += w; + p[3] ++; + + if (sx == -0x78) { + if (p[1] & (SPRL_HAVE_X|SPRL_TILE_OVFL)) + p[1] |= SPRL_MASKED; // masked, no more low sprites for this line + if (!(p[1] & SPRL_HAVE_X) && cnt == 0) + p[1] |= SPRL_HAVE_MASK0; // 1st sprite is masking + } else + p[1] |= SPRL_HAVE_X; + + if (!onscr_x) continue; // offscreen x + + p[4+cnt] = entry; + p[5+cnt] = w; // width clipped by tile limit for sprite renderer + p[0] = cnt + 1; + p[1] |= (entry & 0x80) ? 
SPRL_HAVE_HI : SPRL_HAVE_LO; + p[1] |= maybe_op; // there might be op sprites on this line + if (cnt > 0 && (code2 & 0x8000) && !(p[4+cnt-1]&0x80)) + p[1] |= SPRL_LO_ABOVE_HI; } + } - *pd++ = (width<<28)|(height<<24)|(hv<<16)|((unsigned short)sy); - *pd++ = (sx<<16)|((unsigned short)code2); + *pd++ = (width<<28)|(height<<24)|(hv<<16)|((unsigned short)sy); + *pd++ = (sx<<16)|((unsigned short)code2); - // Find next sprite - link=(code>>16)&0x7f; - if (!link) break; // End of sprites - } - *pd = 0; + // Find next sprite + link=(code>>16)&0x7f; + if (!link) break; // End of sprites + } + *pd = 0; #if 0 - for (u = 0; u < max_lines; u++) - { - int y; - printf("c%03i: %2i, %2i: ", u, HighLnSpr[u][0] & 0x7f, HighLnSpr[u][2]); - for (y = 0; y < HighLnSpr[u][0] & 0x7f; y++) - printf(" %i", HighLnSpr[u][y+3]); - printf("\n"); - } -#endif + for (u = 0; u < max_lines; u++) + { + int y; + printf("c%03i: f %x c %2i/%2i w %2i: ", u, HighLnSpr[u][1], + HighLnSpr[u][0], HighLnSpr[u][3], HighLnSpr[u][2]); + for (y = 0; y < HighLnSpr[u][0]; y++) + printf(" %i", HighLnSpr[u][y+4]); + printf("\n"); } +#endif } #ifndef _ASM_DRAW_C @@ -1203,20 +1397,22 @@ static void DrawAllSprites(unsigned char *sprited, int prio, int sh, struct PicoEState *est) { unsigned char *p; - int cnt; + int cnt, w; cnt = sprited[0] & 0x7f; if (cnt == 0) return; - p = &sprited[3]; + p = &sprited[4]; + if ((sprited[1] & (SPRL_TILE_OVFL|SPRL_HAVE_MASK0)) == (SPRL_TILE_OVFL|SPRL_HAVE_MASK0)) + return; // masking effective due to tile overflow // Go through sprites backwards: - for (cnt--; cnt >= 0; cnt--) + w = p[cnt]; // possibly clipped width of last sprite + for (cnt--; cnt >= 0; cnt--, w = 0) { - int offs; + int *sp = HighPreSpr + (p[cnt]&0x7f) * 2; if ((p[cnt] >> 7) != prio) continue; - offs = (p[cnt]&0x7f) * 2; - DrawSprite(HighPreSpr + offs, sh); + DrawSprite(sp, sh, w); } } @@ -1239,6 +1435,49 @@ void BackFill(int reg7, int sh, struct PicoEState *est) // -------------------------------------------- +void 
PicoDoHighPal555_8bit(int sh, int line, struct PicoEState *est) +{ + unsigned int *spal, *dpal; + unsigned int cnt = (sh ? 1 : est->SonicPalCount+1); + unsigned int t, i; + + // reset dirty only if there are no outstanding changes + if (Pico.m.dirtyPal == 2) + Pico.m.dirtyPal = 0; + + // In Sonic render mode palettes were backuped in SonicPal + spal = (void *)est->SonicPal; + dpal = (void *)est->HighPal; + + // additional palettes stored after in-frame changes + for (i = 0; i < cnt * 0x40 / 2; i++) { + t = spal[i]; +#ifdef USE_BGR555 + t = ((t & 0x000e000e)<< 1) | ((t & 0x00e000e0)<<3) | ((t & 0x0e000e00)<<4); +#else + t = ((t & 0x000e000e)<<12) | ((t & 0x00e000e0)<<3) | ((t & 0x0e000e00)>>7); +#endif + // treat it like it was 4-bit per channel, since in s/h mode it somewhat is that. + // otherwise intensity difference between this and s/h will be wrong + t |= (t >> 4) & 0x08610861; // 0x18e318e3 + dpal[i] = t; + } + + // norm: xxx0, sh: 0xxx, hi: 0xxx + 7 + if (sh) + { + // shadowed pixels + for (i = 0; i < 0x40 / 2; i++) + dpal[0x40/2 | i] = dpal[0xc0/2 | i] = (dpal[i] >> 1) & 0x738e738e; + // hilighted pixels + for (i = 0; i < 0x40 / 2; i++) { + t = ((dpal[i] >> 1) & 0x738e738e) + 0x738e738e; // 0x7bef7bef; + t |= (t >> 4) & 0x08610861; + dpal[0x80/2 | i] = t; + } + } +} + #ifndef _ASM_DRAW_C void PicoDoHighPal555(int sh, int line, struct PicoEState *est) { @@ -1285,8 +1524,7 @@ void FinalizeLine555(int sh, int line, struct PicoEState *est) unsigned short *pal=est->HighPal; int len; - if (Pico.m.dirtyPal) - PicoDoHighPal555(sh, line, est); + PicoDrawUpdateHighPal(); if (Pico.video.reg[12]&1) { len = 320; @@ -1299,8 +1537,12 @@ void FinalizeLine555(int sh, int line, struct PicoEState *est) #if 1 int i; - for (i = 0; i < len; i++) - pd[i] = pal[ps[i]]; + for (i = len; i > 0; i-=4) { + *pd++ = pal[*ps++]; + *pd++ = pal[*ps++]; + *pd++ = pal[*ps++]; + *pd++ = pal[*ps++]; + } #else extern void amips_clut(unsigned short *dst, unsigned char *src, unsigned short *pal, 
int count); extern void amips_clut_6bit(unsigned short *dst, unsigned char *src, unsigned short *pal, int count); @@ -1315,22 +1557,21 @@ void FinalizeLine555(int sh, int line, struct PicoEState *est) static void FinalizeLine8bit(int sh, int line, struct PicoEState *est) { unsigned char *pd = est->DrawLineDest; - int len, rs = est->rendstatus; - static int dirty_count; + int len; + static int dirty_line; - if (!sh && Pico.m.dirtyPal == 1) + if (Pico.m.dirtyPal == 1) { // a hack for mid-frame palette changes - if (!(rs & PDRAW_SONIC_MODE)) - dirty_count = 1; - else dirty_count++; - rs |= PDRAW_SONIC_MODE; - est->rendstatus = rs; - if (dirty_count == 3) { - blockcpy(est->HighPal, PicoMem.cram, 0x40*2); - } else if (dirty_count == 11) { - blockcpy(est->HighPal+0x40, PicoMem.cram, 0x40*2); + if (!(est->rendstatus & PDRAW_SONIC_MODE) || line - dirty_line > 4) { + // store a maximum of 2 additional palettes in SonicPal + if (est->SonicPalCount < 2) + est->SonicPalCount ++; + dirty_line = line; + est->rendstatus |= PDRAW_SONIC_MODE; } + blockcpy(est->SonicPal+est->SonicPalCount*0x40, PicoMem.cram, 0x40*2); + Pico.m.dirtyPal = 2; } if (Pico.video.reg[12]&1) { @@ -1341,12 +1582,9 @@ static void FinalizeLine8bit(int sh, int line, struct PicoEState *est) len = 256; } - if (!sh && (rs & PDRAW_SONIC_MODE)) { - if (dirty_count >= 11) { - blockcpy_or(pd, est->HighCol+8, len, 0x80); - } else { - blockcpy_or(pd, est->HighCol+8, len, 0x40); - } + if (!sh && (est->rendstatus & PDRAW_SONIC_MODE)) { + // select active backup palette + blockcpy_or(pd, est->HighCol+8, len, est->SonicPalCount*0x40); } else { blockcpy(pd, est->HighCol+8, len); } @@ -1364,12 +1602,6 @@ static int DrawDisplay(int sh) int win=0, edge=0, hvwind=0, lflags; int maxw, maxcells; - if (est->rendstatus & (PDRAW_SPRITES_MOVED|PDRAW_DIRTY_SPRITES)) { - // elprintf(EL_STATUS, "PrepareSprites(%i)", (est->rendstatus>>4)&1); - PrepareSprites(est->rendstatus & PDRAW_DIRTY_SPRITES); - est->rendstatus &= 
~(PDRAW_SPRITES_MOVED|PDRAW_DIRTY_SPRITES); - } - est->rendstatus &= ~(PDRAW_SHHI_DONE|PDRAW_PLANE_HI_PRIO); if (pvid->reg[12]&1) { @@ -1402,14 +1634,10 @@ static int DrawDisplay(int sh) /* - layer B low - */ if (!(pvid->debug_p & PVD_KILL_B)) { lflags = LF_PLANE_1 | (sh << 1); - if (pvid->debug_p & PVD_FORCE_B) - lflags |= LF_FORCE; DrawLayer(lflags, HighCacheB, 0, maxcells, est); } /* - layer A low - */ lflags = 0 | (sh << 1); - if (pvid->debug_p & PVD_FORCE_A) - lflags |= LF_FORCE; if (pvid->debug_p & PVD_KILL_A) ; else if (hvwind == 1) @@ -1456,10 +1684,16 @@ static int DrawDisplay(int sh) else if (sprited[1] & SPRL_HAVE_HI) DrawAllSprites(sprited, 1, 0, est); - if (pvid->debug_p & PVD_FORCE_B) - DrawTilesFromCacheForced(HighCacheB); - else if (pvid->debug_p & PVD_FORCE_A) - DrawTilesFromCacheForced(HighCacheA); +#ifdef FORCE + if (pvid->debug_p & PVD_FORCE_B) { + lflags = LF_PLANE_1 | (sh << 1); + DrawLayerForced(lflags, 0, maxcells, est); + } else if (pvid->debug_p & PVD_FORCE_A) { + lflags = (sh << 1); + DrawLayerForced(lflags, 0, maxcells, est); + } else if (pvid->debug_p & PVD_FORCE_S) + DrawSpritesForced(sprited); +#endif #if 0 { @@ -1478,6 +1712,8 @@ static int DrawDisplay(int sh) PICO_INTERNAL void PicoFrameStart(void) { int offs = 8, lines = 224; + int dirty = ((Pico.est.rendstatus & PDRAW_SONIC_MODE) || Pico.m.dirtyPal); + int sprep = Pico.est.rendstatus & (PDRAW_SPRITES_MOVED|PDRAW_DIRTY_SPRITES); // prepare to do this frame Pico.est.rendstatus = 0; @@ -1497,18 +1733,23 @@ PICO_INTERNAL void PicoFrameStart(void) lines, (Pico.video.reg[12] & 1) ? 
0 : 1); rendstatus_old = Pico.est.rendstatus; } + if (sprep) + Pico.est.rendstatus |= PDRAW_PARSE_SPRITES; Pico.est.HighCol = HighColBase + offs * HighColIncrement; Pico.est.DrawLineDest = (char *)DrawLineDestBase + offs * DrawLineDestIncrement; Pico.est.DrawScanline = 0; skip_next_line = 0; + if (FinalizeLine == FinalizeLine8bit) { + // make a backup of the current palette in case Sonic mode is detected later + Pico.est.SonicPalCount = 0; + Pico.m.dirtyPal = (dirty ? 2 : 0); // mark as dirty but already copied + blockcpy(Pico.est.SonicPal, PicoMem.cram, 0x40*2); + } + if (PicoIn.opt & POPT_ALT_RENDERER) return; - - if (Pico.m.dirtyPal) - Pico.m.dirtyPal = 2; // reset dirty if needed - PrepareSprites(1); } static void DrawBlankedLine(int line, int offs, int sh, int bgc) @@ -1546,7 +1787,7 @@ static void PicoLine(int line, int offs, int sh, int bgc) return; } - if (Pico.video.debug_p & (PVD_FORCE_A | PVD_FORCE_B)) + if (Pico.video.debug_p & (PVD_FORCE_A | PVD_FORCE_B | PVD_FORCE_S)) bgc = 0x3f; // Draw screen: @@ -1566,6 +1807,7 @@ static void PicoLine(int line, int offs, int sh, int bgc) void PicoDrawSync(int to, int blank_last_line) { + struct PicoEState *est = &Pico.est; int line, offs = 0; int sh = (Pico.video.reg[0xC] & 8) >> 3; // shadow/hilight? 
int bgc = Pico.video.reg[7]; @@ -1577,8 +1819,11 @@ void PicoDrawSync(int to, int blank_last_line) if (to > 223) to = 223; } + if (est->DrawScanline <= to - blank_last_line && (est->rendstatus & + (PDRAW_SPRITES_MOVED|PDRAW_DIRTY_SPRITES|PDRAW_PARSE_SPRITES))) + PrepareSprites(to - blank_last_line + 1); - for (line = Pico.est.DrawScanline; line < to; line++) + for (line = est->DrawScanline; line < to; line++) PicoLine(line, offs, sh, bgc); // last line @@ -1589,7 +1834,7 @@ void PicoDrawSync(int to, int blank_last_line) else PicoLine(line, offs, sh, bgc); line++; } - Pico.est.DrawScanline = line; + est->DrawScanline = line; pprof_end(draw); } @@ -1598,15 +1843,21 @@ void PicoDrawSync(int to, int blank_last_line) void PicoDrawUpdateHighPal(void) { struct PicoEState *est = &Pico.est; - int sh = (Pico.video.reg[0xC] & 8) >> 3; // shadow/hilight? - if (PicoIn.opt & POPT_ALT_RENDERER) - sh = 0; // no s/h support + if (Pico.m.dirtyPal) { + int sh = (Pico.video.reg[0xC] & 8) >> 3; // shadow/hilight? + if ((PicoIn.opt & POPT_ALT_RENDERER) | (est->rendstatus & PDRAW_SONIC_MODE)) + sh = 0; // no s/h support + + if (FinalizeLine == FinalizeLine8bit) + PicoDoHighPal555_8bit(sh, 0, est); + else + PicoDoHighPal555(sh, 0, est); - PicoDoHighPal555(sh, 0, &Pico.est); - if (est->rendstatus & PDRAW_SONIC_MODE) { - // FIXME? 
- memcpy(est->HighPal + 0x40, est->HighPal, 0x40*2); - memcpy(est->HighPal + 0x80, est->HighPal, 0x40*2); + // cover for sprite priority bits if not in s/h or sonic mode + if (!sh && !(est->rendstatus & PDRAW_SONIC_MODE)) { + blockcpy(est->HighPal+0x40, est->HighPal, 0x40*2); + blockcpy(est->HighPal+0x80, est->HighPal, 0x80*2); + } } } @@ -1629,17 +1880,33 @@ void PicoDrawSetOutFormat(pdso_t which, int use_32x_line_mode) FinalizeLine = NULL; break; } - PicoDrawSetOutFormat32x(which, use_32x_line_mode); + if (PicoIn.AHW & PAHW_32X) + PicoDrawSetOutFormat32x(which, use_32x_line_mode); PicoDrawSetOutputMode4(which); rendstatus_old = -1; } +void PicoDrawSetOutBufMD(void *dest, int increment) +{ + if (dest != NULL) { + DrawLineDestBase = dest; + DrawLineDestIncrement = increment; + Pico.est.DrawLineDest = DrawLineDestBase + Pico.est.DrawScanline * increment; + } + else { + DrawLineDestBase = DefOutBuff; + DrawLineDestIncrement = 0; + Pico.est.DrawLineDest = DefOutBuff; + } +} + // note: may be called on the middle of frame void PicoDrawSetOutBuf(void *dest, int increment) { - DrawLineDestBase = dest; - DrawLineDestIncrement = increment; - Pico.est.DrawLineDest = (char *)DrawLineDestBase + Pico.est.DrawScanline * increment; + if (PicoIn.AHW & PAHW_32X) + PicoDrawSetOutBuf32X(dest, increment); + else + PicoDrawSetOutBufMD(dest, increment); } void PicoDrawSetInternalBuf(void *dest, int increment) @@ -1652,6 +1919,7 @@ void PicoDrawSetInternalBuf(void *dest, int increment) else { HighColBase = DefHighCol; HighColIncrement = 0; + Pico.est.HighCol = DefHighCol; } } diff --git a/pico/draw2.c b/pico/draw2.c index f0e0518e7..910697707 100644 --- a/pico/draw2.c +++ b/pico/draw2.c @@ -20,7 +20,7 @@ #define LINE_WIDTH 328 #endif -static unsigned char PicoDraw2FB_[(8+320) * (8+240+8)]; +static unsigned char PicoDraw2FB_[(8+320) * (8+240+8) + 8]; static int HighCache2A[41*(TILE_ROWS+1)+1+1]; // caches for high layers static int HighCache2B[41*(TILE_ROWS+1)+1+1]; @@ -157,6 +157,8 @@ 
static void DrawWindowFull(int start, int end, int prio, struct PicoEState *est) { nametab=(pvid->reg[3]&0x3e)<<9; // 32-cell mode nametab_step = 1<<5; + if (!(PicoIn.opt&POPT_DIS_32C_BORDER)) + scrpos += 32; } nametab += nametab_step*start; @@ -240,6 +242,8 @@ static void DrawLayerFull(int plane, int *hcache, int planestart, int planeend, else nametab=(pvid->reg[4]&0x07)<<12; // B scrpos = est->Draw2FB; + if (!(pvid->reg[12]&1) && !(PicoIn.opt&POPT_DIS_32C_BORDER)) + scrpos += 32; scrpos+=8*LINE_WIDTH*(planestart-START_ROW); // Get vertical scroll value: @@ -315,6 +319,8 @@ static void DrawTilesFromCacheF(int *hc, struct PicoEState *est) short blank=-1; // The tile we know is blank unsigned char *scrpos = est->Draw2FB, *pd = 0; + if (!(Pico.video.reg[12]&1) && !(PicoIn.opt&POPT_DIS_32C_BORDER)) + scrpos += 32; // *hcache++ = code|(dx<<16)|(trow<<27); // cache it scrpos+=(*hc++)*LINE_WIDTH - START_ROW*LINE_WIDTH*8; @@ -377,6 +383,8 @@ static void DrawSpriteFull(unsigned int *sprite, struct PicoEState *est) while(sy <= START_ROW*8) { sy+=8; tile+=tdeltay; height--; } scrpos = est->Draw2FB; + if (!(Pico.video.reg[12]&1) && !(PicoIn.opt&POPT_DIS_32C_BORDER)) + scrpos += 32; scrpos+=(sy-START_ROW*8)*LINE_WIDTH; for (; height > 0; height--, sy+=8, tile+=tdeltay) @@ -412,12 +420,13 @@ static void DrawAllSpritesFull(int prio, int maxwidth) int i,u,link=0; unsigned int *sprites[80]; // Sprites int y_min=START_ROW*8, y_max=END_ROW*8; // for a simple sprite masking + int max_sprites = Pico.video.reg[12]&1 ? 80 : 64; table=pvid->reg[5]&0x7f; if (pvid->reg[12]&1) table&=0x7e; // Lowest bit 0 in 40-cell mode table<<=8; // Get sprite table address/2 - for (i=u=0; u < 80; u++) + for (i = u = 0; u < max_sprites && link < max_sprites; u++) { unsigned int *sprite=NULL; int code, code2, sx, sy, height; @@ -502,6 +511,11 @@ static void DrawDisplayFull(void) maxw = 264; maxcolc = 32; } + // 32C border for centering? 
(for asm) + est->rendstatus &= ~PDRAW_BORDER_32; + if ((est->rendstatus&PDRAW_32_COLS) && !(PicoIn.opt&POPT_DIS_32C_BORDER)) + est->rendstatus |= PDRAW_BORDER_32; + // horizontal window? if ((win=pvid->reg[0x12])) { diff --git a/pico/draw2_arm.S b/pico/draw2_arm.S index 6b110b320..ded0d5a5a 100644 --- a/pico/draw2_arm.S +++ b/pico/draw2_arm.S @@ -8,7 +8,7 @@ * this is highly specialized, be careful if changing related C code! */ -#include "pico_int_o32.h" +#include "pico_int_offs.h" @ define these constants in your include file: @ .equiv START_ROW, 1 @@ -414,7 +414,10 @@ DrawLayerFull: ldr r11,[sp, #9*4] @ est sub r4, r9, #(START_ROW<<24) + ldr r7, [r11, #OFS_EST_rendstatus] ldr r11, [r11, #OFS_EST_Draw2FB] + tst r7, #0x100 @ H32 border mode? + addne r11, r11, #32 mov r4, r4, asr #24 mov r7, #328*8 mla r11, r4, r7, r11 @ scrpos+=8*328*(planestart-START_ROW); @@ -590,8 +593,11 @@ DrawTilesFromCacheF: mov r9, #0xff000000 @ r9=prevcode=-1 mvn r6, #0 @ r6=prevy=-1 + ldr r7, [r1, #OFS_EST_rendstatus] ldr r4, [r1, #OFS_EST_Draw2FB] ldr r2, [r0], #4 @ read y offset + tst r7, #0x100 @ H32 border mode? + addne r4, r4, #32 mov r7, #328 mla r2, r7, r2, r4 sub r12, r2, #(328*8*START_ROW) @ r12=scrpos @@ -688,13 +694,18 @@ DrawWindowFull: ldr r4, [r11, #OFS_Pico_video_reg+12] mov r5, #1 @ nametab_step + ldr r11, [r3, #OFS_EST_Draw2FB] tst r4, #1 @ 40 cell mode? 
andne r12, r12, #0xf000 @ 0x3c<<10 - andeq r12, r12, #0xf800 movne r5, r5, lsl #7 - moveq r5, r5, lsl #6 @ nametab_step - - and r4, r0, #0xff + bne 0f + ldr r7, [r3, #OFS_EST_rendstatus] + and r12, r12, #0xf800 + mov r5, r5, lsl #6 @ nametab_step + tst r7, #0x100 + addne r11, r11, #32 @ center screen in H32 mode + +0: and r4, r0, #0xff mla r12, r5, r4, r12 @ nametab += nametab_step*start; ldr r10, [r3, #OFS_EST_PicoMem_vram] @@ -715,7 +726,6 @@ DrawWindowFull: mov r9, #0xff000000 @ r9=prevcode=-1 - ldr r11, [r3, #OFS_EST_Draw2FB] and r4, r0, #0xff add r11, r11, #328*8 sub r4, r4, #START_ROW @@ -915,8 +925,11 @@ DrawSpriteFull: and r3, lr, #0x6000 mov r3, r3, lsr #9 @ r3=pal=((code>>9)&0x30); + ldr r0, [r1, #OFS_EST_rendstatus] ldr r11, [r1, #OFS_EST_Draw2FB] ldr r10, [r1, #OFS_EST_PicoMem_vram] + tst r0, #0x100 @ H32 border mode? + addne r11, r11, #32 sub r1, r12, #(START_ROW*8) mov r0, #328 mla r11, r1, r0, r11 @ scrpos+=(sy-START_ROW*8)*328; diff --git a/pico/draw_arm.S b/pico/draw_arm.S index 29af1c136..0579006cd 100644 --- a/pico/draw_arm.S +++ b/pico/draw_arm.S @@ -1,6 +1,7 @@ /* * assembly optimized versions of most funtions from draw.c * (C) notaz, 2006-2010,2017 + * (C) kub, 2020 * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. @@ -8,13 +9,13 @@ * this is highly specialized, be careful if changing related C code! 
*/ -#include "pico_int_o32.h" +#include "pico_int_offs.h" .extern DrawStripInterlace .equ PDRAW_SPRITES_MOVED, (1<<0) .equ PDRAW_WND_DIFF_PRIO, (1<<1) -.equ PDRAW_ACC_SPRITES, (1<<2) +.equ PDRAW_PARSE_SPRITES, (1<<2) .equ PDRAW_DIRTY_SPRITES, (1<<4) .equ PDRAW_PLANE_HI_PRIO, (1<<6) .equ PDRAW_SHHI_DONE, (1<<7) @@ -317,9 +318,10 @@ DrawLayer: moveq r1, #0x0007 movgt r1, #0x00ff @ r1=ymask=(height<<8)|0xff; ...; // Y Mask in pixels - add r10, r10, #5 - cmp r10, #7 - subge r10, r10, #1 @ r10=shift[width] (5,6,6,7) + cmp r10, #2 + addlt r10, r10, #5 + moveq r10, #5 + movgt r10, #7 @ r10=shift[width] (5,6,5,7) ldr r2, [r12, #OFS_EST_DrawScanline] ldr lr, [r12, #OFS_EST_PicoMem_vram] @@ -342,11 +344,15 @@ DrawLayer: mov r4, r8, lsr #8 @ pvid->reg[13] mov r4, r4, lsl #10 @ htab=pvid->reg[13]<<9; (halfwords) - tst r7, #2 - addne r4, r4, r2, lsl #2 @ htab+=DrawScanline<<1; // Offset by line - tst r7, #1 - biceq r4, r4, #0x1f @ htab&=~0xf; // Offset by tile - add r4, r4, r0, lsl #1 @ htab+=plane + + ands r3, r7, #0x03 + beq 0f + cmp r3, #2 + mov r3, r2, lsl #2 @ htab+=DrawScanline<<1; // Offset by line + biceq r3, #0x1f @ htab&=~0xf; // Offset by tile + andlt r3, #0x1f + add r4, r4, r3 +0: add r4, r4, r0, lsl #1 @ htab+=plane bic r4, r4, #0x00ff0000 @ just in case ldrh r3, [lr, r4] @ r3=hscroll @@ -362,7 +368,8 @@ DrawLayer: bne .DrawStrip_interlace tst r0, r0 - movne r7, r7, lsr #16 + moveq r7, r7, lsl #16 + mov r7, r7, lsr #16 @ Find the line in the name table add r2, r2, r7 @@ -517,6 +524,9 @@ DrawLayer: @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ .DrawStrip_vsscroll: + tst r8, #1 @ if h40: lflags |= 0x10000 + orrne r0, r0, #0x10000 + rsb r8, r3, #0 mov r8, r8, lsr #3 @ r8=tilex=(-ts->hscroll)>>3 bic r8, r8, #0x3fc00000 @@ -541,6 +551,12 @@ DrawLayer: tst r3, #0x08 subne r10,r10, #1<<16 @ cells-- subne r10,r10, #1<<24 @ cell-- // even more negative + + add_c24 r1, lr, (OFS_PMEM_vsram-OFS_PMEM_vram) + tst r0, #0x10000 @ h40? 
+ ldrne r3, [r1, #0x00] @ r3=vsram[0x00..0x01] + ldreq r3, [r1, #0x40] @ r3=vsram[0x20..0x21] + str r3, [r1, #0x7c] @ vsram[0x3e..0x3f]=r3 0: tst r9, #1<<31 mov r3, #0 @@ -571,8 +587,8 @@ DrawLayer: @ calc offset and read tileline code to r7, also calc ty add_c24 r7, lr, (OFS_PMEM_vsram-OFS_PMEM_vram) - add r7, r7, r10,asr #23 @ vsram + ((cell&~1)<<1) - bic r7, r7, #3 + and r4, r10, #0x3e000000 + add r7, r7, r4, asr #23 @ vsram + ((cell&0x3e)<<1) tst r10,#0x8000 @ plane1? addne r7, r7, #2 ldrh r7, [r7] @ r7=vscroll @@ -599,6 +615,7 @@ DrawLayer: tst r7, #0x8000 bne .DrawStrip_vs_hiprio + orr r7, r7, r10, lsl #24 @ code | (ty << 24) cmp r7, r9 beq .DrawStrip_vs_samecode @ we know stuff about this tile already @@ -694,8 +711,8 @@ DrawLayer: @ interlace mode 2? Sonic 2? .DrawStrip_interlace: tst r0, r0 - moveq r7, r7, lsl #21 - movne r7, r7, lsl #5 + movne r7, r7, lsr #16 + mov r7, r7, lsl #21 @ Find the line in the name table add r2, r7, r2, lsl #22 @ r2=(vscroll+(DrawScanline<<1))<<21 (11 bits); @@ -790,7 +807,6 @@ DrawTilesFromCache: add r1, r11, r4 @ r1=pdest movs r7, r6, lsl #16 - bpl .dtfc_loop @ !(code & 0x8000) cmp r5, r7, lsr #16 beq .dtfc_samecode @ if (code==prevcode) @@ -942,17 +958,23 @@ DrawTilesFromCache: .global DrawSpritesSHi DrawSpritesSHi: - ldr r3, [r0] + ldrb r3, [r0] mov r12,#0xff ands r3, r3, #0x7f bxeq lr - stmfd sp!, {r1,r4-r11,lr} @ +est - strb r12,[r0,#2] @ set end marker - add r10,r0, #3 @ r10=HighLnSpr end + stmfd sp!, {r1,r3-r11,lr} @ +est + strb r12,[r0,#3] @ set end marker + ldrb r12,[r0,#1] + add r10,r0, #4 @ r10=HighLnSpr end + mvn r12,r12 + tst r12,#0x6 @ masking in slot 1 and tile ovfl? 
+ ldmeqfd sp!, {r1,r3-r11,pc} add r10,r10,r3 @ r10=HighLnSpr end + ldrb r12,[r10,#0] @ width of last sprite ldr r11,[r1, #OFS_EST_HighCol] + str r12,[sp, #4] mov r12,#0xf ldr lr, [r1, #OFS_EST_PicoMem_vram] @@ -963,7 +985,7 @@ DrawSpriteSHi: ldr r7, [sp] @ est ldr r1, [r7, #OFS_EST_HighPreSpr] cmp r0, #0xff - ldmeqfd sp!, {r1,r4-r11,pc} @ end of list + ldmeqfd sp!, {r1,r3-r11,pc} @ end of list and r0, r0, #0x7f add r0, r1, r0, lsl #3 @@ -1007,10 +1029,16 @@ DrawSpriteSHi: and r7, r7, #7 add r8, r8, r7, lsl #1 @ tile+=(row&7)<<1; // Tile address + ldr r0, [sp, #4] + add r6, r6, #1 @ inc now + cmp r0, #0 @ check width of last sprite + movne r6, r0 + movne r0, #0 + strne r0, [sp, #4] + mov r5, r5, lsl #4 @ delta<<=4; // Delta of address mov r3, r4, lsr #9 @ r3=pal=((code>>9)&0x30); - add r6, r6, #1 @ inc now adds r0, r2, #0 @ mov sx to r0 and set ZV flags b .dsprShi_loop_enter @@ -1126,11 +1154,18 @@ DrawAllSprites: @ time to do some real work stmfd sp!, {r1,r3-r11,lr} @ +sh|prio<<1 +est mov r12,#0xff - strb r12,[r0,#2] @ set end marker - add r10,r0, #3 + strb r12,[r0,#3] @ set end marker + ldrb r12,[r0,#1] + add r10,r0 ,#4 + mvn r12,r12 + tst r12,#0x6 @ masking in slot 1 and tile ovfl? + ldmeqfd sp!, {r1,r3-r11,pc} add r10,r10,r2 @ r10=HighLnSpr end + ldrb r12,[r10,#0] @ width of last sprite ldr r11,[r3, #OFS_EST_HighCol] + orr r1 ,r1 ,r12,lsl #24 + str r1, [sp] mov r12,#0xf ldr lr, [r3, #OFS_EST_PicoMem_vram] @@ -1140,13 +1175,15 @@ DrawAllSprites: DrawSprite: @ draw next sprite ldrb r0, [r10,#-1]! 
- ldr r8, [sp] @ sh|prio<<1 + ldr r4, [sp] @ sh|prio<<1|lastw<<24 ldr r7, [sp, #4] @ est - mov r2, r0, lsr #7 + mov r2, r0, lsl #24 cmp r0, #0xff ldmeqfd sp!, {r1,r3-r11,pc} @ end of list - cmp r2, r8, lsr #1 - bne DrawSprite @ wrong priority + eors r2, r2, r4, lsl #30 + bic r2, r4, #0xff000000 + str r2, [sp] + bmi DrawSprite @ wrong priority ldr r1, [r7, #OFS_EST_HighPreSpr] and r0, r0, #0x7f add r0, r1, r0, lsl #3 @@ -1158,20 +1195,20 @@ DrawSprite: mov r5, r3, lsr #24 and r5, r5, #7 @ r5=height - mov r4, r3, lsl #16 @ r4=sy<<16 (tmp) + mov r8, r3, lsl #16 @ r8=sy<<16 (tmp) ldr r9, [r0, #4] - sub r7, r7, r4, asr #16 @ r7=row=DrawScanline-sy + sub r7, r7, r8, asr #16 @ r7=row=DrawScanline-sy mov r2, r9, asr #16 @ r2=sx mov r9, r9, lsl #16 mov r9, r9, lsr #16 - orr r9, r9, r8, lsl #31 @ r9=code|sh[31] + orr r9, r9, r4, lsl #31 @ r9=code|sh[31] tst r9, #0x1000 - movne r4, r5, lsl #3 - subne r4, r4, #1 - subne r7, r4, r7 @ if (code&0x1000) row=(height<<3)-1-row; // Flip Y + movne r8, r5, lsl #3 + subne r8, r8, #1 + subne r7, r8, r7 @ if (code&0x1000) row=(height<<3)-1-row; // Flip Y add r8, r9, r7, lsr #3 @ tile+=row>>3; // Tile number increases going down tst r9, #0x0800 @@ -1183,7 +1220,10 @@ DrawSprite: and r7, r7, #7 add r8, r8, r7, lsl #1 @ tile+=(row&7)<<1; // Tile address -.dspr_continue: + add r6, r6, #1 @ inc now + cmp r4, #0x1000000 @ check width of last sprite + movhs r6, r4, lsr #24 + @ cache some stuff to avoid mem access mov r5, r5, lsl #4 @ delta<<=4; // Delta of address and r4, r9, #0x6000 @@ -1193,7 +1233,6 @@ DrawSprite: mov r3, r4, lsr #9 @ r3=pal=((code>>9)&0x30); orrmi r3, r3, #0x40 @ for sh/hi - add r6, r6, #1 @ inc now adds r0, r2, #0 @ mov sx to r0 and set ZV flags b .dspr_loop_enter @@ -1498,11 +1537,9 @@ vidConvCpyRGB565: @ void *to, void *from, int pixels PicoDoHighPal555: stmfd sp!, {r4-r10,lr} mov r10,r2 @ est - mov r1, #0 ldr r8, [r10, #OFS_EST_Pico] -PicoDoHighPal555_nopush: - orr r9, r1, r0, lsl #31 @ 0:called from FinalizeLine555, 31: 
s/h + mov r9, r0 add r0, r10, #OFS_EST_HighPal @@ -1517,7 +1554,7 @@ PicoDoHighPal555_nopush: vidConvCpyRGB565_local - tst r9, #(1<<31) + cmp r9, #0 beq PicoDoHighPal555_end add r3, r10, #OFS_EST_HighPal @@ -1560,11 +1597,7 @@ PicoDoHighPal555_nopush: mov r0, #1 PicoDoHighPal555_end: - tst r9, #1 - ldmeqfd sp!, {r4-r10,pc} - - ldr r8, [r10, #OFS_EST_Pico] - b FinalizeLineRGB555_pal_done + ldmfd sp!, {r4-r10,pc} @ void FinalizeLine555(int sh, int line, struct PicoEState *est) @@ -1576,19 +1609,11 @@ FinalizeLine555: mov r10,r2 @ est ldr r8, [r10, #OFS_EST_Pico] - ldrb r2, [r8, #OFS_Pico_m_dirtyPal] - mov r1, #1 - tst r2, r2 - bne PicoDoHighPal555_nopush + bl PicoDrawUpdateHighPal -FinalizeLineRGB555_pal_done: add r3, r10, #OFS_EST_HighPal - ldr r12, [r10, #OFS_EST_rendstatus] - eors r0, r0, #1 @ sh is 0 mov lr, #0xff - tstne r12,#PDRAW_ACC_SPRITES - movne lr, #0x3f ldr r1, [r10, #OFS_EST_HighCol] ldr r0, [r10, #OFS_EST_DrawLineDest] diff --git a/pico/m68kif_cyclone.s b/pico/m68kif_cyclone.s index a0a508cd4..3a9621dc2 100644 --- a/pico/m68kif_cyclone.s +++ b/pico/m68kif_cyclone.s @@ -87,19 +87,19 @@ cyclone_fetch32: orrcc r0, r1, r0, lsl #16 bxcc lr - stmfd sp!,{r0,r1,lr} + stmfd sp!,{r0,r1,r2,lr} mov lr, pc bx r1 mov r2, r0, lsl #16 - ldmia sp, {r0,r1} + ldmfd sp!, {r0,r1} str r2, [sp] add r0, r0, #2 mov lr, pc bx r1 - ldr r1, [sp] + ldmfd sp!, {r1,lr} mov r0, r0, lsl #16 orr r0, r1, r0, lsr #16 - ldmfd sp!,{r1,r2,pc} + bx lr cyclone_write8: @ u32 a, u8 d diff --git a/pico/memory.c b/pico/memory.c index a31a08e99..c0ba9ffe7 100644 --- a/pico/memory.c +++ b/pico/memory.c @@ -163,12 +163,14 @@ void m68k_map_unmap(int start_addr, int end_addr) m68k_write16_map[i] = (addr >> 1) | MAP_FLAG; } +#ifndef _ASM_MEMORY_C MAKE_68K_READ8(m68k_read8, m68k_read8_map) MAKE_68K_READ16(m68k_read16, m68k_read16_map) MAKE_68K_READ32(m68k_read32, m68k_read16_map) MAKE_68K_WRITE8(m68k_write8, m68k_write8_map) MAKE_68K_WRITE16(m68k_write16, m68k_write16_map) MAKE_68K_WRITE32(m68k_write32, 
m68k_write16_map) +#endif // ----------------------------------------------------------------- @@ -389,7 +391,7 @@ static int get_scanline(int is_from_z80); static void psg_write_68k(u32 d) { // look for volume write and update if needed - if ((d & 0x90) == 0x90 && Pico.snd.psg_line < Pico.m.scanline) + if ((d & 0x90) == 0x90) PsndDoPSG(Pico.m.scanline); SN76496Write(d); @@ -399,8 +401,7 @@ static void psg_write_z80(u32 d) { if ((d & 0x90) == 0x90) { int scanline = get_scanline(1); - if (Pico.snd.psg_line < scanline) - PsndDoPSG(scanline); + PsndDoPSG(scanline); } SN76496Write(d); @@ -420,6 +421,7 @@ static u32 PicoRead8_sram(u32 a) d = EEPROM_read(); if (!(a & 1)) d >>= 8; + d &= 0xff; } else d = *(u8 *)(Pico.sv.data - Pico.sv.start + a); elprintf(EL_SRAMIO, "sram r8 [%06x] %02x @ %06x", a, d, SekPc); @@ -543,7 +545,7 @@ static void PicoWrite8_z80(u32 a, u32 d) } if ((a & 0x6000) == 0x4000) { // FM Sound if (PicoIn.opt & POPT_EN_FM) - Pico.m.status |= ym2612_write_local(a & 3, d & 0xff, 0) & 1; + ym2612_write_local(a & 3, d & 0xff, 0); return; } // TODO: probably other VDP access too? Maybe more mirrors? @@ -730,8 +732,10 @@ static void PicoWrite8_vdp(u32 a, u32 d) static void PicoWrite16_vdp(u32 a, u32 d) { - if ((a & 0x00f9) == 0x0010) // PSG Sound + if ((a & 0x00f9) == 0x0010) { // PSG Sound psg_write_68k(d); + return; + } if ((a & 0x00e0) == 0x0000) { PicoVideoWrite(a, d); return; @@ -879,7 +883,7 @@ static void m68k_mem_setup(void) static int get_scanline(int is_from_z80) { if (is_from_z80) { - int mclk_z80 = z80_cyclesDone() * 15; + int mclk_z80 = (z80_cyclesLeft<0 ? 
Pico.t.z80c_aim : z80_cyclesDone()) * 15; int mclk_line = Pico.t.z80_scanline * 488 * 7; while (mclk_z80 - mclk_line >= 488 * 7) Pico.t.z80_scanline++, mclk_line += 488 * 7; @@ -895,10 +899,10 @@ void ym2612_sync_timers(int z80_cycles, int mode_old, int mode_new) int xcycles = z80_cycles << 8; /* check for overflows */ - if ((mode_old & 4) && xcycles > Pico.t.timer_a_next_oflow) + if ((mode_old & 4) && xcycles >= Pico.t.timer_a_next_oflow) ym2612.OPN.ST.status |= 1; - if ((mode_old & 8) && xcycles > Pico.t.timer_b_next_oflow) + if ((mode_old & 8) && xcycles >= Pico.t.timer_b_next_oflow) ym2612.OPN.ST.status |= 2; /* update timer a */ @@ -940,11 +944,11 @@ static int ym2612_write_local(u32 a, u32 d, int is_from_z80) a &= 3; if (a == 1 && ym2612.OPN.ST.address == 0x2a) /* DAC data */ { - int scanline = get_scanline(is_from_z80); - //elprintf(EL_STATUS, "%03i -> %03i dac w %08x z80 %i", Pico.snd.dac_line, scanline, d, is_from_z80); + int cycles = is_from_z80 ? z80_cyclesDone() : z80_cycles_from_68k(); + //elprintf(EL_STATUS, "%03i dac w %08x z80 %i", cycles, d, is_from_z80); ym2612.dacout = ((int)d - 0x80) << 6; if (ym2612.dacen) - PsndDoDAC(scanline); + PsndDoDAC(cycles); return 0; } @@ -1026,13 +1030,9 @@ static int ym2612_write_local(u32 a, u32 d, int is_from_z80) return 0; } case 0x2b: { /* DAC Sel (YM2612) */ - int scanline = get_scanline(is_from_z80); - if (ym2612.dacen != (d & 0x80)) { - ym2612.dacen = d & 0x80; - Pico.snd.dac_line = scanline; - } + ym2612.dacen = d & 0x80; #ifdef __GP2X__ - if (PicoIn.opt & POPT_EXT_FM) YM2612Write_940(a, d, scanline); + if (PicoIn.opt & POPT_EXT_FM) YM2612Write_940(a, d, get_scanline(is_from_z80)); #endif return 0; } @@ -1060,6 +1060,7 @@ static int ym2612_write_local(u32 a, u32 d, int is_from_z80) if (PicoIn.opt & POPT_EXT_FM) return YM2612Write_940(a, d, get_scanline(is_from_z80)); #endif + PsndDoFM(is_from_z80 ? 
z80_cyclesDone() : z80_cycles_from_68k()); return YM2612Write_(a, d); } @@ -1221,7 +1222,7 @@ static unsigned char z80_md_bank_read(unsigned short a) static void z80_md_ym2612_write(unsigned int a, unsigned char data) { if (PicoIn.opt & POPT_EN_FM) - Pico.m.status |= ym2612_write_local(a, data, 1) & 1; + ym2612_write_local(a, data, 1); } static void z80_md_vdp_br_write(unsigned int a, unsigned char data) diff --git a/pico/memory.h b/pico/memory.h index c878a40f8..eba234712 100644 --- a/pico/memory.h +++ b/pico/memory.h @@ -2,13 +2,6 @@ #include "pico_port.h" -#ifndef UTYPES_DEFINED -typedef unsigned char u8; -typedef unsigned short u16; -typedef unsigned int u32; -#endif -typedef uintptr_t uptr; // unsigned pointer-sized int - #define M68K_MEM_SHIFT 16 // minimum size we can map #define M68K_BANK_SIZE (1 << M68K_MEM_SHIFT) @@ -32,8 +25,17 @@ typedef void (cpu68k_write_f)(u32 a, u32 d); extern u32 m68k_read8(u32 a); extern u32 m68k_read16(u32 a); +extern u32 m68k_read32(u32 a); extern void m68k_write8(u32 a, u8 d); extern void m68k_write16(u32 a, u16 d); +extern void m68k_write32(u32 a, u32 d); + +extern u32 s68k_read8(u32 a); +extern u32 s68k_read16(u32 a); +extern u32 s68k_read32(u32 a); +extern void s68k_write8(u32 a, u8 d); +extern void s68k_write16(u32 a, u16 d); +extern void s68k_write32(u32 a, u32 d); // z80 #define Z80_MEM_SHIFT 13 diff --git a/pico/memory_amips.S b/pico/memory_amips.S index 7ae259220..7932c2c90 100644 --- a/pico/memory_amips.S +++ b/pico/memory_amips.S @@ -8,7 +8,7 @@ # OUT OF DATE -#include "pico_int_o32.h" +#include "pico_int_offs.h" .set noreorder .set noat diff --git a/pico/memory_arm.S b/pico/memory_arm.S index bfe8ca109..607006ced 100644 --- a/pico/memory_arm.S +++ b/pico/memory_arm.S @@ -1,12 +1,14 @@ /* * PicoDrive * (C) notaz, 2006-2009 + * (C) kub, 2019 * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. 
*/ -#include "pico_int_o32.h" +#include "arm_features.h" +#include "pico_int_offs.h" .equ SRR_MAPPED, (1 << 0) .equ SRR_READONLY, (1 << 1) @@ -23,8 +25,10 @@ .global PicoWrite8_io .global PicoWrite16_io + PIC_LDR_INIT() + PicoRead8_sram: @ u32 a - ldr r3, =Pico + PIC_LDR(r3, r1, Pico) ldr r1, [r3, #OFS_Pico_sv_end] cmp r0, r1 bgt m_read8_nosram @@ -59,6 +63,7 @@ m_read8_eeprom: ldmfd sp!,{r1,lr} tst r1, #1 moveq r0, r0, lsr #8 + and r0, r0, #0xff bx lr @@ -72,7 +77,7 @@ m_read8_not_io: cmp r2, #0x1000 bne PicoRead8_32x - ldr r3, =Pico + PIC_LDR(r3, r1, Pico) mov r1, r0 ldr r0, [r3, #OFS_Pico_m_rotate] add r0, r0, #1 @@ -95,7 +100,7 @@ m_read8_not_io: @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ PicoRead16_sram: @ u32 a, u32 d - ldr r3, =Pico + PIC_LDR(r3, r1, Pico) ldr r1, [r3, #OFS_Pico_sv_end] cmp r0, r1 bgt m_read16_nosram @@ -140,7 +145,7 @@ m_read16_not_io: cmp r2, #0x1000 bne PicoRead16_32x - ldr r3, =Pico + PIC_LDR(r3, r2, Pico) and r2, r0, #0xff00 ldr r0, [r3, #OFS_Pico_m_rotate] add r0, r0, #1 @@ -182,7 +187,7 @@ m_write8_not_z80ctl: eor r2, r2, #0x003000 eors r2, r2, #0x0000f1 bne PicoWrite8_32x - ldr r3, =Pico + PIC_LDR(r3, r2, Pico) ldrb r2, [r3, #OFS_Pico_m_sram_reg] and r1, r1, #(SRR_MAPPED|SRR_READONLY) bic r2, r2, #(SRR_MAPPED|SRR_READONLY) @@ -212,7 +217,7 @@ m_write16_not_z80ctl: eor r2, r2, #0x003000 eors r2, r2, #0x0000f0 bne PicoWrite16_32x - ldr r3, =Pico + PIC_LDR(r3, r2, Pico) ldrb r2, [r3, #OFS_Pico_m_sram_reg] and r1, r1, #(SRR_MAPPED|SRR_READONLY) bic r2, r2, #(SRR_MAPPED|SRR_READONLY) @@ -220,6 +225,103 @@ m_write16_not_z80ctl: strb r2, [r3, #OFS_Pico_m_sram_reg] bx lr +.global m68k_read8 +.global m68k_read16 +.global m68k_read32 +.global m68k_write8 +.global m68k_write16 +.global m68k_write32 + +m68k_read8: + PIC_LDR(r3, r2, m68k_read8_map) + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + eor r2, r0, #1 + movs r3, r3, lsl #1 + ldrccb r0, [r3, r2] + bxcc lr + bx r3 + 
+m68k_read16: + PIC_LDR(r3, r2, m68k_read16_map) + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + bic r0, r0, #1 + movs r3, r3, lsl #1 + ldrcch r0, [r3, r0] + bxcc lr + bx r3 + +m68k_read32: + PIC_LDR(r3, r2, m68k_read16_map) + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + bic r0, r0, #1 + movs r3, r3, lsl #1 + ldrcch r1, [r3, r0]! + ldrcch r0, [r3, #2] + orrcc r0, r0, r1, lsl #16 + bxcc lr + + stmfd sp!, {r0, r3, r4, lr} + mov lr, pc + bx r3 + ldmfd sp!, {r1, r3} + str r0, [sp] + add r0, r1, #2 + mov lr, pc + bx r3 + ldmfd sp!, {r1, lr} + mov r0, r0, lsl #16 + mov r1, r1, lsl #16 + orr r0, r1, r0, lsr #16 + bx lr + +m68k_write8: + PIC_LDR(r3, r2, m68k_write8_map) + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + eor r2, r0, #1 + movs r3, r3, lsl #1 + strccb r1, [r3, r2] + bxcc lr + bx r3 + +m68k_write16: + PIC_LDR(r3, r2, m68k_write16_map) + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + bic r0, r0, #1 + movs r3, r3, lsl #1 + strcch r1, [r3, r0] + bxcc lr + bx r3 + +m68k_write32: + PIC_LDR(r3, r2, m68k_write16_map) + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + bic r0, r0, #1 + movs r3, r3, lsl #1 + movcc r2, r1, lsr #16 + strcch r2, [r3, r0]! + strcch r1, [r3, #2] + bxcc lr + + stmfd sp!, {r0, r1, r3, lr} + mov r1, r1, lsr #16 + mov lr, pc + bx r3 + ldmfd sp!, {r0, r1, r3, lr} + add r0, r0, #2 + bx r3 + .pool @ vim:filetype=armasm diff --git a/pico/misc.c b/pico/misc.c index 47842e3fa..cf09688ed 100644 --- a/pico/misc.c +++ b/pico/misc.c @@ -1,6 +1,7 @@ /* * rarely used EEPROM code * (C) notaz, 2006-2008 + * (C) kub, 2020 * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. 
@@ -8,83 +9,174 @@ #include "pico_int.h" -// H-counter table for hvcounter reads in 40col mode -// based on Gens code +// H-counter table for hvcounter reads in 40col mode, starting at HINT const unsigned char hcounts_40[] = { -0x07,0x07,0x08,0x08,0x08,0x09,0x09,0x0a,0x0a,0x0b,0x0b,0x0b,0x0c,0x0c,0x0d,0x0d, -0x0e,0x0e,0x0e,0x0f,0x0f,0x10,0x10,0x10,0x11,0x11,0x12,0x12,0x13,0x13,0x13,0x14, -0x14,0x15,0x15,0x15,0x16,0x16,0x17,0x17,0x18,0x18,0x18,0x19,0x19,0x1a,0x1a,0x1b, -0x1b,0x1b,0x1c,0x1c,0x1d,0x1d,0x1d,0x1e,0x1e,0x1f,0x1f,0x20,0x20,0x20,0x21,0x21, -0x22,0x22,0x23,0x23,0x23,0x24,0x24,0x25,0x25,0x25,0x26,0x26,0x27,0x27,0x28,0x28, -0x28,0x29,0x29,0x2a,0x2a,0x2a,0x2b,0x2b,0x2c,0x2c,0x2d,0x2d,0x2d,0x2e,0x2e,0x2f, -0x2f,0x30,0x30,0x30,0x31,0x31,0x32,0x32,0x32,0x33,0x33,0x34,0x34,0x35,0x35,0x35, -0x36,0x36,0x37,0x37,0x38,0x38,0x38,0x39,0x39,0x3a,0x3a,0x3a,0x3b,0x3b,0x3c,0x3c, -0x3d,0x3d,0x3d,0x3e,0x3e,0x3f,0x3f,0x3f,0x40,0x40,0x41,0x41,0x42,0x42,0x42,0x43, -0x43,0x44,0x44,0x45,0x45,0x45,0x46,0x46,0x47,0x47,0x47,0x48,0x48,0x49,0x49,0x4a, -0x4a,0x4a,0x4b,0x4b,0x4c,0x4c,0x4d,0x4d,0x4d,0x4e,0x4e,0x4f,0x4f,0x4f,0x50,0x50, -0x51,0x51,0x52,0x52,0x52,0x53,0x53,0x54,0x54,0x55,0x55,0x55,0x56,0x56,0x57,0x57, -0x57,0x58,0x58,0x59,0x59,0x5a,0x5a,0x5a,0x5b,0x5b,0x5c,0x5c,0x5c,0x5d,0x5d,0x5e, -0x5e,0x5f,0x5f,0x5f,0x60,0x60,0x61,0x61,0x62,0x62,0x62,0x63,0x63,0x64,0x64,0x64, -0x65,0x65,0x66,0x66,0x67,0x67,0x67,0x68,0x68,0x69,0x69,0x6a,0x6a,0x6a,0x6b,0x6b, -0x6c,0x6c,0x6c,0x6d,0x6d,0x6e,0x6e,0x6f,0x6f,0x6f,0x70,0x70,0x71,0x71,0x71,0x72, -0x72,0x73,0x73,0x74,0x74,0x74,0x75,0x75,0x76,0x76,0x77,0x77,0x77,0x78,0x78,0x79, -0x79,0x79,0x7a,0x7a,0x7b,0x7b,0x7c,0x7c,0x7c,0x7d,0x7d,0x7e,0x7e,0x7f,0x7f,0x7f, -0x80,0x80,0x81,0x81,0x81,0x82,0x82,0x83,0x83,0x84,0x84,0x84,0x85,0x85,0x86,0x86, -0x86,0x87,0x87,0x88,0x88,0x89,0x89,0x89,0x8a,0x8a,0x8b,0x8b,0x8c,0x8c,0x8c,0x8d, -0x8d,0x8e,0x8e,0x8e,0x8f,0x8f,0x90,0x90,0x91,0x91,0x91,0x92,0x92,0x93,0x93,0x94, 
-0x94,0x94,0x95,0x95,0x96,0x96,0x96,0x97,0x97,0x98,0x98,0x99,0x99,0x99,0x9a,0x9a, -0x9b,0x9b,0x9b,0x9c,0x9c,0x9d,0x9d,0x9e,0x9e,0x9e,0x9f,0x9f,0xa0,0xa0,0xa1,0xa1, -0xa1,0xa2,0xa2,0xa3,0xa3,0xa3,0xa4,0xa4,0xa5,0xa5,0xa6,0xa6,0xa6,0xa7,0xa7,0xa8, -0xa8,0xa9,0xa9,0xa9,0xaa,0xaa,0xab,0xab,0xab,0xac,0xac,0xad,0xad,0xae,0xae,0xae, -0xaf,0xaf,0xb0,0xb0, -0xe4,0xe4,0xe4,0xe5,0xe5,0xe6,0xe6,0xe6,0xe7,0xe7,0xe8,0xe8,0xe9,0xe9,0xe9,0xea, -0xea,0xeb,0xeb,0xeb,0xec,0xec,0xed,0xed,0xee,0xee,0xee,0xef,0xef,0xf0,0xf0,0xf1, -0xf1,0xf1,0xf2,0xf2,0xf3,0xf3,0xf3,0xf4,0xf4,0xf5,0xf5,0xf6,0xf6,0xf6,0xf7,0xf7, -0xf8,0xf8,0xf9,0xf9,0xf9,0xfa,0xfa,0xfb,0xfb,0xfb,0xfc,0xfc,0xfd,0xfd,0xfe,0xfe, -0xfe,0xff,0xff,0x00,0x00,0x00,0x01,0x01,0x02,0x02,0x03,0x03,0x03,0x04,0x04,0x05, -0x05,0x06,0x06,0x06, -0x07,0x07,0x08,0x08,0x08,0x09,0x09,0x0a,0x0a,0x0b,0x0b,0x0b,0x0c,0x0c,0x0d,0x0d, -0x0e,0x0e,0x0e,0x0f,0x0f,0x10,0x10,0x10, +0xa5,0xa6,0xa7,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xae,0xaf,0xb0,0xb1,0xb2, +0xb3,0xb4,0xb5,0xb5,0xe4,0xe5,0xe6,0xe7,0xe8,0xe8,0xe9,0xea,0xea,0xeb,0xec,0xed, +0xed,0xee,0xef,0xef,0xf0,0xf1,0xf2,0xf2,0xf3,0xf4,0xf4,0xf5,0xf6,0xf7,0xf7,0xf8, +0xf9,0xfa,0xfb,0xfc,0xfd,0xfd,0xfe,0xff,0x00,0x01,0x02,0x03,0x04,0x04,0x05,0x06, +0x07,0x08,0x09,0x0a,0x0b,0x0b,0x0c,0x0d,0x0e,0x0f,0x10,0x11,0x12,0x12,0x13,0x14, +0x15,0x16,0x17,0x18,0x19,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x20,0x21,0x22, +0x23,0x24,0x25,0x26,0x27,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2e,0x2f,0x30, +0x31,0x32,0x33,0x34,0x35,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3c,0x3d,0x3e, +0x3f,0x40,0x41,0x42,0x43,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4a,0x4b,0x4c, +0x4d,0x4e,0x4f,0x50,0x51,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x58,0x59,0x5a, +0x5b,0x5c,0x5d,0x5e,0x5f,0x5f,0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x66,0x67,0x68, +0x69,0x6a,0x6b,0x6c,0x6d,0x6d,0x6e,0x6f,0x70,0x71,0x72,0x73,0x74,0x74,0x75,0x76, +0x77,0x78,0x79,0x7a,0x7b,0x7b,0x7c,0x7d,0x7e,0x7f,0x80,0x81,0x82,0x82,0x83,0x84, 
+0x85,0x86,0x87,0x88,0x89,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,0x90,0x90,0x91,0x92, +0x93,0x94,0x95,0x96,0x97,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9e,0x9f,0xa0, +0xa1,0xa2,0xa3,0xa4,0xa5,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xac,0xad,0xae, }; -// H-counter table for hvcounter reads in 32col mode +// H-counter table for hvcounter reads in 32col mode, starting at HINT const unsigned char hcounts_32[] = { -0x05,0x05,0x05,0x06,0x06,0x07,0x07,0x07,0x08,0x08,0x08,0x09,0x09,0x09,0x0a,0x0a, -0x0a,0x0b,0x0b,0x0b,0x0c,0x0c,0x0c,0x0d,0x0d,0x0d,0x0e,0x0e,0x0f,0x0f,0x0f,0x10, -0x10,0x10,0x11,0x11,0x11,0x12,0x12,0x12,0x13,0x13,0x13,0x14,0x14,0x14,0x15,0x15, -0x15,0x16,0x16,0x17,0x17,0x17,0x18,0x18,0x18,0x19,0x19,0x19,0x1a,0x1a,0x1a,0x1b, -0x1b,0x1b,0x1c,0x1c,0x1c,0x1d,0x1d,0x1d,0x1e,0x1e,0x1f,0x1f,0x1f,0x20,0x20,0x20, -0x21,0x21,0x21,0x22,0x22,0x22,0x23,0x23,0x23,0x24,0x24,0x24,0x25,0x25,0x26,0x26, -0x26,0x27,0x27,0x27,0x28,0x28,0x28,0x29,0x29,0x29,0x2a,0x2a,0x2a,0x2b,0x2b,0x2b, -0x2c,0x2c,0x2c,0x2d,0x2d,0x2e,0x2e,0x2e,0x2f,0x2f,0x2f,0x30,0x30,0x30,0x31,0x31, -0x31,0x32,0x32,0x32,0x33,0x33,0x33,0x34,0x34,0x34,0x35,0x35,0x36,0x36,0x36,0x37, -0x37,0x37,0x38,0x38,0x38,0x39,0x39,0x39,0x3a,0x3a,0x3a,0x3b,0x3b,0x3b,0x3c,0x3c, -0x3d,0x3d,0x3d,0x3e,0x3e,0x3e,0x3f,0x3f,0x3f,0x40,0x40,0x40,0x41,0x41,0x41,0x42, -0x42,0x42,0x43,0x43,0x43,0x44,0x44,0x45,0x45,0x45,0x46,0x46,0x46,0x47,0x47,0x47, -0x48,0x48,0x48,0x49,0x49,0x49,0x4a,0x4a,0x4a,0x4b,0x4b,0x4b,0x4c,0x4c,0x4d,0x4d, -0x4d,0x4e,0x4e,0x4e,0x4f,0x4f,0x4f,0x50,0x50,0x50,0x51,0x51,0x51,0x52,0x52,0x52, -0x53,0x53,0x53,0x54,0x54,0x55,0x55,0x55,0x56,0x56,0x56,0x57,0x57,0x57,0x58,0x58, -0x58,0x59,0x59,0x59,0x5a,0x5a,0x5a,0x5b,0x5b,0x5c,0x5c,0x5c,0x5d,0x5d,0x5d,0x5e, -0x5e,0x5e,0x5f,0x5f,0x5f,0x60,0x60,0x60,0x61,0x61,0x61,0x62,0x62,0x62,0x63,0x63, -0x64,0x64,0x64,0x65,0x65,0x65,0x66,0x66,0x66,0x67,0x67,0x67,0x68,0x68,0x68,0x69, -0x69,0x69,0x6a,0x6a,0x6a,0x6b,0x6b,0x6c,0x6c,0x6c,0x6d,0x6d,0x6d,0x6e,0x6e,0x6e, 
-0x6f,0x6f,0x6f,0x70,0x70,0x70,0x71,0x71,0x71,0x72,0x72,0x72,0x73,0x73,0x74,0x74, -0x74,0x75,0x75,0x75,0x76,0x76,0x76,0x77,0x77,0x77,0x78,0x78,0x78,0x79,0x79,0x79, -0x7a,0x7a,0x7b,0x7b,0x7b,0x7c,0x7c,0x7c,0x7d,0x7d,0x7d,0x7e,0x7e,0x7e,0x7f,0x7f, -0x7f,0x80,0x80,0x80,0x81,0x81,0x81,0x82,0x82,0x83,0x83,0x83,0x84,0x84,0x84,0x85, -0x85,0x85,0x86,0x86,0x86,0x87,0x87,0x87,0x88,0x88,0x88,0x89,0x89,0x89,0x8a,0x8a, -0x8b,0x8b,0x8b,0x8c,0x8c,0x8c,0x8d,0x8d,0x8d,0x8e,0x8e,0x8e,0x8f,0x8f,0x8f,0x90, -0x90,0x90,0x91,0x91, -0xe8,0xe8,0xe8,0xe9,0xe9,0xe9,0xea,0xea,0xea,0xeb,0xeb,0xeb,0xec,0xec,0xec,0xed, -0xed,0xed,0xee,0xee,0xee,0xef,0xef,0xf0,0xf0,0xf0,0xf1,0xf1,0xf1,0xf2,0xf2,0xf2, -0xf3,0xf3,0xf3,0xf4,0xf4,0xf4,0xf5,0xf5,0xf5,0xf6,0xf6,0xf6,0xf7,0xf7,0xf8,0xf8, -0xf8,0xf9,0xf9,0xf9,0xfa,0xfa,0xfa,0xfb,0xfb,0xfb,0xfc,0xfc,0xfc,0xfd,0xfd,0xfd, -0xfe,0xfe,0xfe,0xff,0xff,0x00,0x00,0x00,0x01,0x01,0x01,0x02,0x02,0x02,0x03,0x03, -0x03,0x04,0x04,0x04, -0x05,0x05,0x05,0x06,0x06,0x07,0x07,0x07,0x08,0x08,0x08,0x09,0x09,0x09,0x0a,0x0a, -0x0a,0x0b,0x0b,0x0b,0x0c,0x0c,0x0c,0x0d, +0x85,0x86,0x86,0x87,0x88,0x88,0x89,0x8a,0x8a,0x8b,0x8c,0x8d,0x8d,0x8e,0x8f,0x8f, +0x90,0x91,0x91,0x92,0x93,0xe9,0xe9,0xea,0xeb,0xeb,0xec,0xed,0xed,0xee,0xef,0xf0, +0xf0,0xf1,0xf2,0xf2,0xf3,0xf4,0xf4,0xf5,0xf6,0xf7,0xf7,0xf8,0xf9,0xf9,0xfa,0xfb, +0xfb,0xfc,0xfd,0xfe,0xfe,0xff,0x00,0x00,0x01,0x02,0x02,0x03,0x04,0x05,0x05,0x06, +0x07,0x07,0x08,0x09,0x09,0x0a,0x0b,0x0c,0x0c,0x0d,0x0e,0x0e,0x0f,0x10,0x10,0x11, +0x12,0x13,0x13,0x14,0x15,0x15,0x16,0x17,0x17,0x18,0x19,0x1a,0x1a,0x1b,0x1c,0x1c, +0x1d,0x1e,0x1e,0x1f,0x20,0x21,0x21,0x22,0x23,0x23,0x24,0x25,0x25,0x26,0x27,0x28, +0x28,0x29,0x2a,0x2a,0x2b,0x2c,0x2c,0x2d,0x2e,0x2f,0x2f,0x30,0x31,0x31,0x32,0x33, +0x33,0x34,0x35,0x36,0x36,0x37,0x38,0x38,0x39,0x3a,0x3a,0x3b,0x3c,0x3d,0x3d,0x3e, +0x3f,0x3f,0x40,0x41,0x41,0x42,0x43,0x44,0x44,0x45,0x46,0x46,0x47,0x48,0x48,0x49, +0x4a,0x4b,0x4b,0x4c,0x4d,0x4d,0x4e,0x4f,0x4f,0x50,0x51,0x52,0x52,0x53,0x54,0x54, 
+0x55,0x56,0x56,0x57,0x58,0x59,0x59,0x5a,0x5b,0x5b,0x5c,0x5d,0x5d,0x5e,0x5f,0x60, +0x60,0x61,0x62,0x62,0x63,0x64,0x64,0x65,0x66,0x67,0x67,0x68,0x69,0x69,0x6a,0x6b, +0x6b,0x6c,0x6d,0x6e,0x6e,0x6f,0x70,0x70,0x71,0x72,0x72,0x73,0x74,0x75,0x75,0x76, +0x77,0x77,0x78,0x79,0x79,0x7a,0x7b,0x7c,0x7c,0x7d,0x7e,0x7e,0x7f,0x80,0x80,0x81, +0x82,0x83,0x83,0x84,0x85,0x85,0x86,0x87,0x87,0x88,0x89,0x8a,0x8a,0x8b,0x8c,0x8c, }; +// VDP transfer slots for blanked and active display in 32col and 40col mode. +// 1 slot is 488/171 = 2.8538 68k cycles in h32, and 488/210 = 2.3238 in h40 +// In blanked display, all slots but 5(h32) / 6(h40) are usable for transfers, +// in active display only 16(h32) / 18(h40) slots can be used. + +// XXX inactive tables by slot#=cycles*maxslot#/488. should be through hv tables +// VDP transfer slots in inactive (blanked) display 32col mode. +// refresh slots: 250, 26, 58, 90, 122 -> 32, 64, 96, 128, 160 +const unsigned char vdpcyc2sl_32_bl[] = { // 68k cycles/2 to slot # +// 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 + 0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, + 10, 11, 12, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 19, 20, 21, + 21, 22, 23, 23, 24, 25, 25, 26, 27, 27, 28, 29, 29, 30, 31, 31, + 32, 33, 34, 34, 35, 36, 36, 37, 38, 38, 39, 40, 40, 41, 42, 42, + 43, 44, 44, 45, 46, 46, 47, 48, 48, 49, 50, 51, 51, 52, 53, 53, + 54, 55, 55, 56, 57, 57, 58, 59, 59, 60, 61, 61, 62, 63, 63, 64, + 65, 65, 66, 67, 68, 68, 69, 70, 70, 71, 72, 72, 73, 74, 74, 75, + 76, 76, 77, 78, 78, 79, 80, 80, 81, 82, 83, 83, 84, 85, 85, 86, + 87, 87, 88, 89, 89, 90, 91, 91, 92, 93, 93, 94, 95, 95, 96, 97, + 97, 98, 99,100,100,101,102,102,103,104,104,105,106,106,107,108, + 108,109,110,110,111,112,112,113,114,114,115,116,117,117,118,119, + 119,120,121,121,122,123,123,124,125,125,126,127,127,128,129,129, + 130,131,131,132,133,134,134,135,136,136,137,138,138,139,140,140, + 141,142,142,143,144,144,145,146,146,147,148,148,149,150,151,151, + 
152,153,153,154,155,155,156,157,157,158,159,159,160,161,161,162, + 163,163,164,165,166,166,167,168,168,169,170,170,171,172,172,173, +}; +// VDP transfer slots in inactive (blanked) display 40col mode. +// refresh slots: 250, 26, 58, 90, 122, 154 -> 40, 72, 104, 136, 168, 200 +const unsigned char vdpcyc2sl_40_bl[] = { // 68k cycles/2 to slot # +// 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 + 0, 0, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10, 10, 11, 12, + 13, 14, 15, 15, 16, 17, 18, 19, 20, 20, 21, 22, 23, 24, 25, 25, + 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 35, 35, 36, 37, 38, 39, + 40, 40, 41, 42, 43, 44, 45, 45, 46, 47, 48, 49, 50, 51, 51, 52, + 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 61, 62, 63, 64, 65, 66, + 66, 67, 68, 69, 70, 71, 71, 72, 73, 74, 75, 76, 76, 77, 78, 79, + 80, 81, 81, 82, 83, 84, 85, 86, 86, 87, 88, 89, 90, 91, 91, 92, + 93, 94, 95, 96, 96, 97, 98, 99,100,101,102,102,103,104,105,106, + 107,107,108,109,110,111,112,112,113,114,115,116,117,117,118,119, + 120,121,122,122,123,124,125,126,127,127,128,129,130,131,132,132, + 133,134,135,136,137,137,138,139,140,141,142,142,143,144,145,146, + 147,147,148,149,150,151,152,153,153,154,155,156,157,158,158,159, + 160,161,162,163,163,164,165,166,167,168,168,169,170,171,172,173, + 173,174,175,176,177,178,178,179,180,181,182,183,183,184,185,186, + 187,188,188,189,190,191,192,193,193,194,195,196,197,198,198,199, + 200,201,202,203,204,204,205,206,207,208,209,209,210,211,212,213, +}; +// VDP transfer slots in active display 32col mode. 
Transfer slots (Hint=0): +// 11,25,40,48,56,72,80,88,104,112,120,136,144,152,167,168 +const unsigned char vdpcyc2sl_32[] = { // 68k cycles/2 to slot # +// 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, +}; +// VDP transfer slots in active display 40col mode. 
Transfer slots (Hint=0): +// 21,47,55,63,79,87,95,111,119,127,143,151,159,175,183,191,206,207 +const unsigned char vdpcyc2sl_40[] = { // 68k cycles/2 to slot # +// 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0 + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, // 32 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 64 + 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 96 + 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, // 128 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, // 160 + 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, // 192 + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 224 + 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, // 256 + 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, // 288 + 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, // 320 + 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, // 352 + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, // 384 + 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, // 416 + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, // 448 + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, // 480 +}; + +// XXX inactive tables by cyc=slot#*488/maxslot#. 
should be through hv tables +const unsigned short vdpsl2cyc_32_bl[] = { // slot # to 68k cycles/2 + 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, + 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43, 45, 46, + 48, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 64, 65, 67, 68, 70, + 71, 73, 74, 75, 77, 78, 80, 81, 83, 84, 86, 87, 89, 90, 92, 93, + 95, 96, 98, 99,100,102,103,105,106,108,109,111,112,114,115,117, + 118,120,121,122,124,125,127,128,130,131,133,134,136,137,139,140, + 142,143,145,146,147,149,150,152,153,155,156,158,159,161,162,164, + 165,167,168,170,171,172,174,175,177,178,180,181,183,184,186,187, + 189,190,192,193,195,196,197,199,200,202,203,205,206,208,209,211, + 212,214,215,217,218,220,221,222,224,225,227,228,230,231,233,234, + 236,237,239,240,242,243,244,246, +}; +const unsigned short vdpsl2cyc_40_bl[] = { // slot # to 68k cycles/2 + 0, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, + 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 32, 33, 34, 35, 36, 38, + 39, 40, 41, 42, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 56, 57, + 58, 59, 60, 61, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 75, 76, + 77, 78, 79, 81, 82, 83, 84, 85, 87, 88, 89, 90, 91, 93, 94, 95, + 96, 97, 99,100,101,102,103,105,106,107,108,109,111,112,113,114, + 115,117,118,119,120,121,122,124,125,126,127,128,130,131,132,133, + 134,136,137,138,139,140,142,143,144,145,146,148,149,150,151,152, + 154,155,156,157,158,160,161,162,163,164,166,167,168,169,170,172, + 173,174,175,176,178,179,180,181,182,183,185,186,187,188,189,191, + 192,193,194,195,197,198,199,200,201,203,204,205,206,207,209,210, + 211,212,213,215,216,217,218,219,221,222,223,224,225,227,228,229, + 230,231,233,234,235,236,237,239,240,241,242,243,244,246, +}; +const unsigned short vdpsl2cyc_32[] = { // slot # to 68k cycles/2 + 0, 16, 36, 56, 67, 79,102,113,125,148,159,171,194,205,217,239, + 240,260 +}; +const unsigned short vdpsl2cyc_40[] = { // slot # to 68k cycles/2 + 0, 24, 55, 64, 73, 92,101,110,129,138,147,166,175,184,203,212, + 
221,239,240,268 +}; #ifndef _ASM_MISC_C PICO_INTERNAL_ASM void memcpy16bswap(unsigned short *dest, void *src, int count) diff --git a/pico/pico.c b/pico/pico.c index f6b43cd69..579cdd0dc 100644 --- a/pico/pico.c +++ b/pico/pico.c @@ -67,6 +67,7 @@ void PicoPower(void) memset(&Pico.video,0,sizeof(Pico.video)); memset(&Pico.m,0,sizeof(Pico.m)); + memset(&Pico.t,0,sizeof(Pico.t)); Pico.video.pending_ints=0; z80_reset(); @@ -78,6 +79,7 @@ void PicoPower(void) Pico.video.reg[0] = Pico.video.reg[1] = 0x04; Pico.video.reg[0xc] = 0x81; Pico.video.reg[0xf] = 0x02; + PicoVideoFIFOMode(0, 1); if (PicoIn.AHW & PAHW_MCD) PicoPowerMCD(); @@ -182,8 +184,7 @@ int PicoReset(void) PsndReset(); // pal must be known here // create an empty "dma" to cause 68k exec start at random frame location - if (Pico.m.dma_xfers == 0 && !(PicoIn.opt & POPT_DIS_VDP_FIFO)) - Pico.m.dma_xfers = rand() & 0x1fff; + PicoVideoFIFOWrite(rand() & 0x1fff, 0, 0, PVS_CPURD); SekFinishIdleDet(); @@ -222,51 +223,6 @@ void PicoLoopPrepare(void) rendstatus_old = -1; } -// this table is wrong and should be removed -// keeping it for now to compensate wrong timing elswhere, mainly for Outrunners -static const int dma_timings[] = { - 83, 166, 83, 83, // vblank: 32cell: dma2vram dma2[vs|c]ram vram_fill vram_copy - 102, 204, 102, 102, // vblank: 40cell: - 8, 16, 8, 8, // active: 32cell: - 17, 18, 9, 9 // ... -}; - -static const int dma_bsycles[] = { - (488<<8)/83, (488<<8)/166, (488<<8)/83, (488<<8)/83, - (488<<8)/102, (488<<8)/204, (488<<8)/102, (488<<8)/102, - (488<<8)/8, (488<<8)/16, (488<<8)/8, (488<<8)/8, - (488<<8)/9, (488<<8)/18, (488<<8)/9, (488<<8)/9 -}; - -// grossly inaccurate.. FIXME FIXXXMEE -PICO_INTERNAL int CheckDMA(void) -{ - int burn = 0, xfers_can, dma_op = Pico.video.reg[0x17]>>6; // see gens for 00 and 01 modes - int xfers = Pico.m.dma_xfers; - int dma_op1; - - if(!(dma_op&2)) dma_op = (Pico.video.type==1) ? 
0 : 1; // setting dma_timings offset here according to Gens - dma_op1 = dma_op; - if(Pico.video.reg[12] & 1) dma_op |= 4; // 40 cell mode? - if(!(Pico.video.status&8)&&(Pico.video.reg[1]&0x40)) dma_op|=8; // active display? - xfers_can = dma_timings[dma_op]; - if(xfers <= xfers_can) - { - Pico.video.status &= ~SR_DMA; - if (!(dma_op & 2)) - burn = xfers * dma_bsycles[dma_op] >> 8; // have to be approximate because can't afford division.. - Pico.m.dma_xfers = 0; - } else { - if(!(dma_op&2)) burn = 488; - Pico.m.dma_xfers -= xfers_can; - } - - elprintf(EL_VDPDMA, "~Dma %i op=%i can=%i burn=%i [%u]", - Pico.m.dma_xfers, dma_op1, xfers_can, burn, SekCyclesDone()); - //dprintf("~aim: %i, cnt: %i", Pico.t.m68c_aim, Pico.t.m68c_cnt); - return burn; -} - #include "pico_cmn.c" /* sync z80 to 68k */ @@ -313,7 +269,7 @@ void PicoFrame(void) goto end; } - //if(Pico.video.reg[12]&0x2) Pico.video.status ^= 0x10; // change odd bit in interlace mode + //if(Pico.video.reg[12]&0x2) Pico.video.status ^= SR_ODD; // change odd bit in interlace mode PicoFrameStart(); PicoFrameHints(); @@ -326,7 +282,7 @@ void PicoFrameDrawOnly(void) { if (!(PicoIn.AHW & PAHW_SMS)) { PicoFrameStart(); - PicoDrawSync(223, 0); + PicoDrawSync(Pico.m.pal?239:223, 0); } else { PicoFrameDrawOnlyMS(); } diff --git a/pico/pico.h b/pico/pico.h index a669215dc..2c73f383e 100644 --- a/pico/pico.h +++ b/pico/pico.h @@ -70,8 +70,10 @@ extern void *p32x_bios_g, *p32x_bios_m, *p32x_bios_s; #define POPT_EN_DRC (1<<17) #define POPT_DIS_SPRITE_LIM (1<<18) #define POPT_DIS_IDLE_DET (1<<19) -#define POPT_EN_32X (1<<20) +#define POPT_EN_32X (1<<20) // x0 0000 #define POPT_EN_PWM (1<<21) +#define POPT_PWM_IRQ_OPT (1<<22) +#define POPT_DIS_FM_SSGEG (1<<23) #define PAHW_MCD (1<<0) #define PAHW_32X (1<<1) @@ -197,14 +199,16 @@ void vidConvCpyRGB565(void *to, void *from, int pixels); #endif void PicoDoHighPal555(int sh, int line, struct PicoEState *est); // internals -#define PDRAW_SPRITES_MOVED (1<<0) // (asm) +#define 
PDRAW_SPRITES_MOVED (1<<0) // SAT address modified #define PDRAW_WND_DIFF_PRIO (1<<1) // not all window tiles use same priority +#define PDRAW_PARSE_SPRITES (1<<2) // SAT needs parsing #define PDRAW_INTERLACE (1<<3) -#define PDRAW_DIRTY_SPRITES (1<<4) // (asm) +#define PDRAW_DIRTY_SPRITES (1<<4) // SAT modified #define PDRAW_SONIC_MODE (1<<5) // mid-frame palette changes for 8bit renderer #define PDRAW_PLANE_HI_PRIO (1<<6) // have layer with all hi prio tiles (mk3) #define PDRAW_SHHI_DONE (1<<7) // layer sh/hi already processed #define PDRAW_32_COLS (1<<8) // 32 column mode +#define PDRAW_BORDER_32 (1<<9) // center H32 in buffer (32 px border) extern int rendstatus_old; extern int rendlines; diff --git a/pico/pico_cmn.c b/pico/pico_cmn.c index 1f89da905..8863bb39c 100644 --- a/pico/pico_cmn.c +++ b/pico/pico_cmn.c @@ -22,27 +22,30 @@ #endif // sync m68k to Pico.t.m68c_aim -static void SekSyncM68k(void) +static void SekExecM68k(int cyc_do) { - int cyc_do; - pprof_start(m68k); - pevt_log_m68k_o(EVT_RUN_START); - - while ((cyc_do = Pico.t.m68c_aim - Pico.t.m68c_cnt) > 0) { - Pico.t.m68c_cnt += cyc_do; + Pico.t.m68c_cnt += cyc_do; #if defined(EMU_C68K) - PicoCpuCM68k.cycles = cyc_do; - CycloneRun(&PicoCpuCM68k); - Pico.t.m68c_cnt -= PicoCpuCM68k.cycles; + PicoCpuCM68k.cycles = cyc_do; + CycloneRun(&PicoCpuCM68k); + Pico.t.m68c_cnt -= PicoCpuCM68k.cycles; #elif defined(EMU_M68K) - Pico.t.m68c_cnt += m68k_execute(cyc_do) - cyc_do; + Pico.t.m68c_cnt += m68k_execute(cyc_do) - cyc_do; #elif defined(EMU_F68K) - Pico.t.m68c_cnt += fm68k_emulate(&PicoCpuFM68k, cyc_do, 0) - cyc_do; + Pico.t.m68c_cnt += fm68k_emulate(&PicoCpuFM68k, cyc_do, 0) - cyc_do; #endif - } - SekCyclesLeft = 0; +} + +static void SekSyncM68k(void) +{ + int cyc_do; + pprof_start(m68k); + pevt_log_m68k_o(EVT_RUN_START); + + while ((cyc_do = Pico.t.m68c_aim - Pico.t.m68c_cnt) > 0) + SekExecM68k(cyc_do); SekTrace(0); pevt_log_m68k_o(EVT_RUN_END); @@ -52,10 +55,10 @@ static void SekSyncM68k(void) static __inline 
void SekRunM68k(int cyc) { Pico.t.m68c_aim += cyc; + Pico.t.m68c_cnt += cyc >> 6; // refresh slowdowns cyc = Pico.t.m68c_aim - Pico.t.m68c_cnt; if (cyc <= 0) return; - Pico.t.m68c_cnt += cyc >> 6; // refresh slowdowns SekSyncM68k(); } @@ -68,28 +71,19 @@ static void do_hint(struct PicoVideo *pv) } } -static void do_timing_hacks_as(struct PicoVideo *pv, int vdp_slots) +static void do_timing_hacks_end(struct PicoVideo *pv) { - pv->lwrite_cnt += vdp_slots - Pico.m.dma_xfers * 2; // wrong *2 - if (pv->lwrite_cnt > vdp_slots) - pv->lwrite_cnt = vdp_slots; - else if (pv->lwrite_cnt < 0) - pv->lwrite_cnt = 0; - if (Pico.m.dma_xfers) - SekCyclesBurn(CheckDMA()); + PicoVideoFIFOSync(488); } -static void do_timing_hacks_vb(void) +static void do_timing_hacks_start(struct PicoVideo *pv) { - if (unlikely(Pico.m.dma_xfers)) - SekCyclesBurn(CheckDMA()); + SekCyclesBurn(PicoVideoFIFOHint()); // prolong cpu HOLD if necessary } static int PicoFrameHints(void) { struct PicoVideo *pv = &Pico.video; - int line_sample = Pico.m.pal ? 68 : 93; - int vdp_slots = (Pico.video.reg[12] & 1) ? 
18 : 16; int lines, y, lines_vis, skip; int vcnt_wrap, vcnt_adj; unsigned int cycles; @@ -150,27 +144,11 @@ static int PicoFrameHints(void) } } - // get samples from sound chips - if ((y == 224 || y == line_sample) && PicoIn.sndOut) - { - cycles = SekCyclesDone(); - - if (Pico.m.z80Run && !Pico.m.z80_reset && (PicoIn.opt&POPT_EN_Z80)) - PicoSyncZ80(cycles); -#ifdef PICO_CD - if (PicoIn.AHW & PAHW_MCD) - pcd_sync_s68k(cycles, 0); -#endif -#ifdef PICO_32X - p32x_sync_sh2s(cycles); -#endif - PsndGetSamples(y); - } - // Run scanline: Pico.t.m68c_line_start = Pico.t.m68c_aim; - do_timing_hacks_as(pv, vdp_slots); + do_timing_hacks_start(pv); CPUS_RUN(CYCLES_M68K_LINE); + do_timing_hacks_end(pv); if (PicoLineHook) PicoLineHook(); pevt_log_m68k_o(EVT_NEXT_LINE); @@ -189,10 +167,6 @@ static int PicoFrameHints(void) #endif } - // VDP FIFO - pv->lwrite_cnt = 0; - Pico.video.status |= SR_EMPT; - memcpy(PicoIn.padInt, PicoIn.pad, sizeof(PicoIn.padInt)); PAD_DELAY(); @@ -204,25 +178,26 @@ static int PicoFrameHints(void) } pv->status |= SR_VB | PVS_VB2; // go into vblank + PicoVideoFIFOMode(pv->reg[1]&0x40, pv->reg[12]&1); // the following SekRun is there for several reasons: // there must be a delay after vblank bit is set and irq is asserted (Mazin Saga) // also delay between F bit (bit 7) is set in SR and IRQ happens (Ex-Mutants) // also delay between last H-int and V-int (Golden Axe 3) Pico.t.m68c_line_start = Pico.t.m68c_aim; - do_timing_hacks_vb(); + do_timing_hacks_start(pv); CPUS_RUN(CYCLES_M68K_VINT_LAG); pv->status |= SR_F; pv->pending_ints |= 0x20; if (pv->reg[1] & 0x20) { - Pico.t.m68c_aim = Pico.t.m68c_cnt + 11; // HACK - SekSyncM68k(); + if (Pico.t.m68c_cnt - Pico.t.m68c_aim < 60) // CPU blocked? 
+ SekExecM68k(11); // HACK elprintf(EL_INTS, "vint: @ %06x [%u]", SekPc, SekCyclesDone()); SekInterrupt(6); } - cycles = SekCyclesDone(); + cycles = Pico.t.m68c_aim; if (Pico.m.z80Run && !Pico.m.z80_reset && (PicoIn.opt&POPT_EN_Z80)) { PicoSyncZ80(cycles); elprintf(EL_INTS, "zint"); @@ -238,12 +213,9 @@ static int PicoFrameHints(void) p32x_start_blank(); #endif - // get samples from sound chips - if (y == 224 && PicoIn.sndOut) - PsndGetSamples(y); - // Run scanline: CPUS_RUN(CYCLES_M68K_LINE - CYCLES_M68K_VINT_LAG); + do_timing_hacks_end(pv); if (PicoLineHook) PicoLineHook(); pevt_log_m68k_o(EVT_NEXT_LINE); @@ -278,8 +250,9 @@ static int PicoFrameHints(void) // Run scanline: Pico.t.m68c_line_start = Pico.t.m68c_aim; - do_timing_hacks_vb(); + do_timing_hacks_start(pv); CPUS_RUN(CYCLES_M68K_LINE); + do_timing_hacks_end(pv); if (PicoLineHook) PicoLineHook(); pevt_log_m68k_o(EVT_NEXT_LINE); @@ -289,18 +262,19 @@ static int PicoFrameHints(void) unsigned int l = PicoIn.overclockM68k * lines / 100; while (l-- > 0) { Pico.t.m68c_cnt -= CYCLES_M68K_LINE; - do_timing_hacks_vb(); + do_timing_hacks_start(pv); SekSyncM68k(); + do_timing_hacks_end(pv); } } pv->status &= ~(SR_VB | PVS_VB2); pv->status |= ((pv->reg[1] >> 3) ^ SR_VB) & SR_VB; // forced blanking + PicoVideoFIFOMode(pv->reg[1]&0x40, pv->reg[12]&1); // last scanline - Pico.m.scanline = y; + Pico.m.scanline = y++; pv->v_counter = 0xff; - pv->lwrite_cnt = 0; PAD_DELAY(); @@ -315,20 +289,17 @@ static int PicoFrameHints(void) // Run scanline: Pico.t.m68c_line_start = Pico.t.m68c_aim; - do_timing_hacks_as(pv, vdp_slots); + do_timing_hacks_start(pv); CPUS_RUN(CYCLES_M68K_LINE); + do_timing_hacks_end(pv); if (PicoLineHook) PicoLineHook(); pevt_log_m68k_o(EVT_NEXT_LINE); // sync cpus - cycles = SekCyclesDone(); + cycles = Pico.t.m68c_aim; if (Pico.m.z80Run && !Pico.m.z80_reset && (PicoIn.opt&POPT_EN_Z80)) PicoSyncZ80(cycles); - if (PicoIn.sndOut && ym2612.dacen && Pico.snd.dac_line < lines) - PsndDoDAC(lines - 1); - if 
(PicoIn.sndOut && Pico.snd.psg_line < lines) - PsndDoPSG(lines - 1); #ifdef PICO_CD if (PicoIn.AHW & PAHW_MCD) @@ -337,6 +308,11 @@ static int PicoFrameHints(void) #ifdef PICO_32X p32x_sync_sh2s(cycles); #endif + + // get samples from sound chips + if (PicoIn.sndOut) + PsndGetSamples(y); + timers_cycle(); pv->hint_cnt = hint; diff --git a/pico/pico_int.h b/pico/pico_int.h index 7225cab85..088c7aa53 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -33,6 +33,14 @@ extern "C" { #endif +typedef unsigned char u8; +typedef signed char s8; +typedef unsigned short u16; +typedef signed short s16; +typedef unsigned int u32; +typedef signed int s32; +typedef uintptr_t uptr; // unsigned pointer-sized int + // ----------------------- 68000 CPU ----------------------- #ifdef EMU_C68K #include "../cpu/cyclone/Cyclone.h" @@ -129,9 +137,7 @@ extern m68ki_cpu_core PicoCpuMM68k, PicoCpuMS68k; // burn cycles while not in SekRun() and while in #define SekCyclesBurn(c) Pico.t.m68c_cnt += c -#define SekCyclesBurnRun(c) { \ - SekCyclesLeft -= c; \ -} +#define SekCyclesBurnRun(c) SekCyclesLeft -= c // note: sometimes may extend timeslice to delay an irq #define SekEndRun(after) { \ @@ -185,7 +191,7 @@ extern struct DrZ80 drZ80; #define z80_int_assert(a) Cz80_Set_IRQ(&CZ80, 0, (a) ? 
ASSERT_LINE : CLEAR_LINE) #define z80_nmi() Cz80_Set_IRQ(&CZ80, IRQ_LINE_NMI, 0) -#define z80_cyclesLeft (CZ80.ICount - CZ80.ExtraCycles) +#define z80_cyclesLeft CZ80.ICount #define z80_subCLeft(c) CZ80.ICount -= c #define z80_pc() Cz80_Get_Reg(&CZ80, CZ80_PC) @@ -207,7 +213,7 @@ extern struct DrZ80 drZ80; #define z80_cyclesDone() \ (Pico.t.z80c_aim - z80_cyclesLeft) -#define cycles_68k_to_z80(x) ((x) * 3823 >> 13) +#define cycles_68k_to_z80(x) ((x) * 3822 >> 13) // ----------------------- SH2 CPU ----------------------- @@ -229,11 +235,10 @@ extern SH2 sh2s[2]; # define sh2_pc(sh2) (sh2)->ppc #else # define sh2_end_run(sh2, after_) do { \ - int left_ = (signed int)(sh2)->sr >> 12; \ - if (left_ > (after_)) { \ - (sh2)->cycles_timeslice -= left_ - (after_); \ - (sh2)->sr &= 0xfff; \ - (sh2)->sr |= (after_) << 12; \ + int left_ = ((signed int)(sh2)->sr >> 12) - (after_); \ + if (left_ > 0) { \ + (sh2)->cycles_timeslice -= left_; \ + (sh2)->sr -= (left_ << 12); \ } \ } while (0) # define sh2_cycles_left(sh2) ((signed int)(sh2)->sr >> 12) @@ -241,11 +246,11 @@ extern SH2 sh2s[2]; # define sh2_pc(sh2) (sh2)->pc #endif -#define sh2_cycles_done(sh2) ((int)(sh2)->cycles_timeslice - sh2_cycles_left(sh2)) +#define sh2_cycles_done(sh2) (unsigned)((int)(sh2)->cycles_timeslice - sh2_cycles_left(sh2)) #define sh2_cycles_done_t(sh2) \ - ((sh2)->m68krcycles_done * 3 + sh2_cycles_done(sh2)) + (unsigned)(C_M68K_TO_SH2(sh2, (sh2)->m68krcycles_done) + sh2_cycles_done(sh2)) #define sh2_cycles_done_m68k(sh2) \ - ((sh2)->m68krcycles_done + (sh2_cycles_done(sh2) / 3)) + (unsigned)((sh2)->m68krcycles_done + C_SH2_TO_M68K(sh2, sh2_cycles_done(sh2))) #define sh2_reg(c, x) (c) ? ssh2.r[x] : msh2.r[x] #define sh2_gbr(c) (c) ? 
ssh2.gbr : msh2.gbr @@ -290,6 +295,11 @@ extern SH2 sh2s[2]; // not part of real SR #define PVS_ACTIVE (1 << 16) #define PVS_VB2 (1 << 17) // ignores forced blanking +#define PVS_CPUWR (1 << 18) // CPU write blocked by FIFO full +#define PVS_CPURD (1 << 19) // CPU read blocked by FIFO not empty +#define PVS_DMAFILL (1 << 20) // DMA fill is waiting for fill data +#define PVS_DMABG (1 << 21) // background DMA operation is running +#define PVS_FIFORUN (1 << 22) // FIFO is processing struct PicoVideo { @@ -300,13 +310,16 @@ struct PicoVideo unsigned short addr; // Read/Write address unsigned int status; // Status bits (SR) and extra flags unsigned char pending_ints; // pending interrupts: ??VH???? - signed char lwrite_cnt; // VDP write count during active display line + signed char pad1; // was VDP write count unsigned short v_counter; // V-counter unsigned short debug; // raw debug register unsigned char debug_p; // ... parsed: PVD_* unsigned char addr_u; // bit16 of .addr unsigned char hint_cnt; - unsigned char pad[0x0b]; + unsigned char pad2; + unsigned short hv_latch; // latched hvcounter value + signed int fifo_cnt; // pending xfers for current FIFO queue entry + unsigned char pad[0x04]; }; struct PicoMisc @@ -328,8 +341,8 @@ struct PicoMisc unsigned char eeprom_cycle; // EEPROM cycle number unsigned char eeprom_slave; // EEPROM slave word for X24C02 and better SRAMs unsigned char eeprom_status; - unsigned char status; // rapid_ym2612, multi_ym_updates - unsigned short dma_xfers; // 18 + unsigned char pad1; // was ym2612 status + unsigned short dma_xfers; // 18 unused (was VDP DMA transfer count) unsigned char eeprom_wb[2]; // EEPROM latch/write buffer unsigned int frame_count; // 1c for movies and idle det }; @@ -356,6 +369,8 @@ struct PicoEState unsigned int *PicoOpt; unsigned char *Draw2FB; unsigned short HighPal[0x100]; + unsigned short SonicPal[0x100]; + int SonicPalCount; }; struct PicoMem @@ -421,11 +436,15 @@ struct PicoSound short len_use; // adjusted int 
len_e_add; // for non-int samples/frame int len_e_cnt; - short dac_line; - short psg_line; + unsigned int clkl_mult; // z80 clocks per line in Q20 + unsigned int smpl_mult; // samples per line in Q16 + short dac_val, dac_val2; // last DAC sample + unsigned int dac_pos; // last DAC position in Q20 + unsigned int fm_pos; // last FM position in Q20 + unsigned int psg_pos; // last PSG position in Q16 }; -// run tools/mkoffsets pico/pico_int_o32.h if you change these +// run tools/mkoffsets pico/pico_int_offs.h if you change these // careful with savestate compat struct Pico { @@ -597,7 +616,8 @@ struct Pico32xMem { unsigned char sdram[0x40000]; #ifdef DRC_SH2 - unsigned short drcblk_ram[1 << (18 - SH2_DRCBLK_RAM_SHIFT)]; + unsigned char drcblk_ram[1 << (18 - SH2_DRCBLK_RAM_SHIFT)]; + unsigned char drclit_ram[1 << (18 - SH2_DRCBLK_RAM_SHIFT)]; #endif unsigned short dram[2][0x20000/2]; // AKA fb union { @@ -605,7 +625,8 @@ struct Pico32xMem unsigned char m68k_rom_bank[0x10000]; // M68K_BANK_SIZE }; #ifdef DRC_SH2 - unsigned short drcblk_da[2][1 << (12 - SH2_DRCBLK_DA_SHIFT)]; + unsigned char drcblk_da[2][1 << (12 - SH2_DRCBLK_DA_SHIFT)]; + unsigned char drclit_da[2][1 << (12 - SH2_DRCBLK_DA_SHIFT)]; #endif union { unsigned char b[0x800]; @@ -618,8 +639,8 @@ struct Pico32xMem unsigned short pal[0x100]; unsigned short pal_native[0x100]; // converted to native (for renderer) signed short pwm[2*PWM_BUFF_LEN]; // PWM buffer for current frame - signed short pwm_current[2]; // current converted samples unsigned short pwm_fifo[2][4]; // [0] - current raw, others - fifo entries + unsigned pwm_index[2]; // ringbuffer index for pwm_fifo }; // area.c @@ -648,12 +669,14 @@ PICO_INTERNAL void PicoFrameStart(void); void PicoDrawSync(int to, int blank_last_line); void BackFill(int reg7, int sh, struct PicoEState *est); void FinalizeLine555(int sh, int line, struct PicoEState *est); +void PicoDrawSetOutBufMD(void *dest, int increment); extern int (*PicoScanBegin)(unsigned int num); 
extern int (*PicoScanEnd)(unsigned int num); -#define MAX_LINE_SPRITES 29 -extern unsigned char HighLnSpr[240][3 + MAX_LINE_SPRITES]; +#define MAX_LINE_SPRITES 27 // +1 last sprite width, +4 hdr; total 32 +extern unsigned char HighLnSpr[240][4+MAX_LINE_SPRITES+1]; extern void *DrawLineDestBase; extern int DrawLineDestIncrement; +extern unsigned int VdpSATCache[128]; // draw2.c void PicoDraw2Init(void); @@ -723,7 +746,7 @@ extern struct Pico Pico; extern struct PicoMem PicoMem; extern void (*PicoResetHook)(void); extern void (*PicoLineHook)(void); -PICO_INTERNAL int CheckDMA(void); +PICO_INTERNAL int CheckDMA(int cycles); PICO_INTERNAL void PicoDetectRegion(void); PICO_INTERNAL void PicoSyncZ80(unsigned int m68k_cycles_done); @@ -805,10 +828,10 @@ void ym2612_pack_state(void); void ym2612_unpack_state(void); #define TIMER_NO_OFLOW 0x70000000 -// tA = 72 * (1024 - NA) / M -#define TIMER_A_TICK_ZCYCLES 17203 -// tB = 1152 * (256 - NA) / M -#define TIMER_B_TICK_ZCYCLES 262800 // 275251 broken, see Dai Makaimura +// tA = 72 * (1024 - NA) / M, with M = mclock/2 -> tick = 72 * 2/mclock +#define TIMER_A_TICK_ZCYCLES 17203 // zcycles = Q8*tick*zclock = Q8*72*2*7/15 +// tB = 1152 * (256 - NA) / M, +#define TIMER_B_TICK_ZCYCLES 275251 // zcycles = Q8*1152*2*7/15 #define timers_cycle() \ if (Pico.t.timer_a_next_oflow > 0 && Pico.t.timer_a_next_oflow < TIMER_NO_OFLOW) \ @@ -820,10 +843,29 @@ void ym2612_unpack_state(void); #define timers_reset() \ Pico.t.timer_a_next_oflow = Pico.t.timer_b_next_oflow = TIMER_NO_OFLOW; \ Pico.t.timer_a_step = TIMER_A_TICK_ZCYCLES * 1024; \ - Pico.t.timer_b_step = TIMER_B_TICK_ZCYCLES * 256; + Pico.t.timer_b_step = TIMER_B_TICK_ZCYCLES * 256; \ + ym2612.OPN.ST.status &= ~3; // videoport.c +extern unsigned SATaddr, SATmask; +static __inline void UpdateSAT(u32 a, u32 d) +{ + unsigned num = (a-SATaddr) >> 3; + + Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; + if (!(a & 4) && num < 128) { + ((u16 *)&VdpSATCache[num])[(a&3) >> 1] = d; + } +} +static
__inline void VideoWriteVRAM(u32 a, u16 d) +{ + PicoMem.vram [(u16)a >> 1] = d; + + if (!((u16)(a^SATaddr) & SATmask)) + UpdateSAT(a, d); +} + PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d); PICO_INTERNAL_ASM unsigned int PicoVideoRead(unsigned int a); unsigned char PicoVideoRead8DataH(void); @@ -833,6 +875,12 @@ unsigned char PicoVideoRead8CtlL(void); unsigned char PicoVideoRead8HV_H(void); unsigned char PicoVideoRead8HV_L(void); extern int (*PicoDmaHook)(unsigned int source, int len, unsigned short **base, unsigned int *mask); +void PicoVideoFIFOSync(int cycles); +int PicoVideoFIFOHint(void); +void PicoVideoFIFOMode(int active, int h40); +int PicoVideoFIFOWrite(int count, int byte_p, unsigned sr_mask, unsigned sr_flags); +void PicoVideoSave(void); +void PicoVideoLoad(void); // misc.c PICO_INTERNAL_ASM void memcpy16bswap(unsigned short *dest, void *src, int count); @@ -857,11 +905,12 @@ PICO_INTERNAL_ASM void wram_1M_to_2M(unsigned char *m); // sound/sound.c PICO_INTERNAL void PsndReset(void); PICO_INTERNAL void PsndStartFrame(void); -PICO_INTERNAL void PsndDoDAC(int line_to); +PICO_INTERNAL void PsndDoDAC(int cycle_to); PICO_INTERNAL void PsndDoPSG(int line_to); +PICO_INTERNAL void PsndDoFM(int line_to); PICO_INTERNAL void PsndClear(void); PICO_INTERNAL void PsndGetSamples(int y); -PICO_INTERNAL void PsndGetSamplesMS(void); +PICO_INTERNAL void PsndGetSamplesMS(int y); // sms.c #ifndef NO_SMS @@ -900,13 +949,17 @@ void PicoFrame32x(void); void Pico32xStateLoaded(int is_early); void p32x_sync_sh2s(unsigned int m68k_target); void p32x_sync_other_sh2(SH2 *sh2, unsigned int m68k_target); -void p32x_update_irls(SH2 *active_sh2, int m68k_cycles); -void p32x_trigger_irq(SH2 *sh2, int m68k_cycles, unsigned int mask); -void p32x_update_cmd_irq(SH2 *sh2, int m68k_cycles); +void p32x_update_irls(SH2 *active_sh2, unsigned int m68k_cycles); +void p32x_trigger_irq(SH2 *sh2, unsigned int m68k_cycles, unsigned int mask); +void p32x_update_cmd_irq(SH2 *sh2, 
unsigned int m68k_cycles); void p32x_reset_sh2s(void); void p32x_event_schedule(unsigned int now, enum p32x_event event, int after); void p32x_event_schedule_sh2(SH2 *sh2, enum p32x_event event, int after); -void p32x_schedule_hint(SH2 *sh2, int m68k_cycles); +void p32x_schedule_hint(SH2 *sh2, unsigned int m68k_cycles); + +#define p32x_sh2_ready(sh2, cycles) \ + (CYCLES_GT(cycles,sh2->m68krcycles_done) && \ + !(sh2->state&(SH2_STATE_CPOLL|SH2_STATE_VPOLL|SH2_STATE_RPOLL))) // 32x/memory.c extern struct Pico32xMem *Pico32xMem; @@ -919,10 +972,17 @@ void Pico32xSwapDRAM(int b); void Pico32xMemStateLoaded(void); void p32x_update_banks(void); void p32x_m68k_poll_event(unsigned int flags); +unsigned int REGPARM(3) p32x_sh2_poll_memory8(unsigned int a, unsigned int d, SH2 *sh2); +unsigned int REGPARM(3) p32x_sh2_poll_memory16(unsigned int a, unsigned int d, SH2 *sh2); +unsigned int REGPARM(3) p32x_sh2_poll_memory32(unsigned int a, unsigned int d, SH2 *sh2); +void *p32x_sh2_get_mem_ptr(unsigned int a, unsigned int *mask, SH2 *sh2); +void p32x_sh2_poll_detect(unsigned int a, SH2 *sh2, unsigned int flags, int maxcnt); void p32x_sh2_poll_event(SH2 *sh2, unsigned int flags, unsigned int m68k_cycles); +int p32x_sh2_memcpy(unsigned int dst, unsigned int src, int count, int size, SH2 *sh2); // 32x/draw.c void PicoDrawSetOutFormat32x(pdso_t which, int use_32x_line_mode); +void PicoDrawSetOutBuf32X(void *dest, int increment); void FinalizeLine32xRGB555(int sh, int line, struct PicoEState *est); void PicoDraw32xLayer(int offs, int lines, int mdbg); void PicoDraw32xLayerMdOnly(int offs, int lines); @@ -952,11 +1012,11 @@ void p32x_pwm_state_loaded(void); void p32x_dreq0_trigger(void); void p32x_dreq1_trigger(void); void p32x_timers_recalc(void); -void p32x_timers_do(unsigned int m68k_slice); +void p32x_timer_do(SH2 *sh2, unsigned int m68k_slice); void sh2_peripheral_reset(SH2 *sh2); -unsigned int sh2_peripheral_read8(unsigned int a, SH2 *sh2); -unsigned int 
sh2_peripheral_read16(unsigned int a, SH2 *sh2); -unsigned int sh2_peripheral_read32(unsigned int a, SH2 *sh2); +unsigned int REGPARM(2) sh2_peripheral_read8(unsigned int a, SH2 *sh2); +unsigned int REGPARM(2) sh2_peripheral_read16(unsigned int a, SH2 *sh2); +unsigned int REGPARM(2) sh2_peripheral_read32(unsigned int a, SH2 *sh2); void REGPARM(3) sh2_peripheral_write8(unsigned int a, unsigned int d, SH2 *sh2); void REGPARM(3) sh2_peripheral_write16(unsigned int a, unsigned int d, SH2 *sh2); void REGPARM(3) sh2_peripheral_write32(unsigned int a, unsigned int d, SH2 *sh2); diff --git a/pico/pico_int_o32.h b/pico/pico_int_o32.h deleted file mode 100644 index 25c64f432..000000000 --- a/pico/pico_int_o32.h +++ /dev/null @@ -1,28 +0,0 @@ -/* autogenerated by tools/mkoffsets, do not edit */ -#define OFS_Pico_video_reg 0x0000 -#define OFS_Pico_m_rotate 0x0040 -#define OFS_Pico_m_z80Run 0x0041 -#define OFS_Pico_m_dirtyPal 0x0046 -#define OFS_Pico_m_hardware 0x0047 -#define OFS_Pico_m_z80_reset 0x004f -#define OFS_Pico_m_sram_reg 0x0049 -#define OFS_Pico_sv 0x008c -#define OFS_Pico_sv_data 0x008c -#define OFS_Pico_sv_start 0x0090 -#define OFS_Pico_sv_end 0x0094 -#define OFS_Pico_sv_flags 0x0098 -#define OFS_Pico_rom 0x033c -#define OFS_Pico_romsize 0x0340 -#define OFS_EST_DrawScanline 0x00 -#define OFS_EST_rendstatus 0x04 -#define OFS_EST_DrawLineDest 0x08 -#define OFS_EST_HighCol 0x0c -#define OFS_EST_HighPreSpr 0x10 -#define OFS_EST_Pico 0x14 -#define OFS_EST_PicoMem_vram 0x18 -#define OFS_EST_PicoMem_cram 0x1c -#define OFS_EST_PicoOpt 0x20 -#define OFS_EST_Draw2FB 0x24 -#define OFS_EST_HighPal 0x28 -#define OFS_PMEM_vram 0x10000 -#define OFS_PMEM_vsram 0x22100 diff --git a/pico/pico_port.h b/pico/pico_port.h index e26e6ca2c..af9ce8534 100644 --- a/pico/pico_port.h +++ b/pico/pico_port.h @@ -17,10 +17,12 @@ #define NOINLINE __attribute__((noinline)) #define ALIGNED(n) __attribute__((aligned(n))) #define unlikely(x) __builtin_expect((x), 0) +#define likely(x) 
__builtin_expect(!!(x), 1) #else #define NOINLINE #define ALIGNED(n) #define unlikely(x) (x) +#define likely(x) (x) #endif #ifdef _MSC_VER diff --git a/pico/sms.c b/pico/sms.c index 286b8bf1d..0f4a48ad4 100644 --- a/pico/sms.c +++ b/pico/sms.c @@ -46,8 +46,8 @@ static void vdp_data_write(unsigned char d) struct PicoVideo *pv = &Pico.video; if (pv->type == 3) { + if (PicoMem.cram[pv->addr & 0x1f] != d) Pico.m.dirtyPal = 1; PicoMem.cram[pv->addr & 0x1f] = d; - Pico.m.dirtyPal = 1; } else { PicoMem.vramb[pv->addr] = d; } @@ -152,7 +152,7 @@ static void z80_sms_out(unsigned short a, unsigned char d) case 0x40: case 0x41: - if ((d & 0x90) == 0x90 && Pico.snd.psg_line < Pico.m.scanline) + if ((d & 0x90) == 0x90) PsndDoPSG(Pico.m.scanline); SN76496Write(d); break; @@ -320,16 +320,12 @@ void PicoFrameMS(void) } } - // 224 because of how it's done for MD... - if (y == 224 && PicoIn.sndOut) - PsndGetSamplesMS(); - cycles_aim += cycles_line; cycles_done += z80_run((cycles_aim - cycles_done) >> 8) << 8; } - if (PicoIn.sndOut && Pico.snd.psg_line < lines) - PsndDoPSG(lines - 1); + if (PicoIn.sndOut) + PsndGetSamplesMS(lines); } void PicoFrameDrawOnlyMS(void) diff --git a/pico/sound/mix.c b/pico/sound/mix.c index 636edb553..4b4bbdd81 100644 --- a/pico/sound/mix.c +++ b/pico/sound/mix.c @@ -6,32 +6,72 @@ * See COPYING file in the top-level directory. */ +#include "string.h" + #define MAXOUT (+32767) #define MINOUT (-32768) /* limitter */ -#define Limit(val, max,min) { \ - if ( val > max ) val = max; \ - else if ( val < min ) val = min; \ -} +#define Limit16(val) \ + if ((short)val != val) val = (val < 0 ? MINOUT : MAXOUT) +int mix_32_to_16l_level; -void mix_32_to_16l_stereo(short *dest, int *src, int count) +static struct iir2 { // 2-pole IIR + int x[2]; // sample buffer + int y[2]; // filter intermediates + int i; +} lfi2, rfi2; + +// NB ">>" rounds to -infinity, "/" to 0. To compensate the effect possibly use +// "-(-y>>n)" (round to +infinity) instead of "y>>n" in places. 
+ +// NB uses Q12 fixpoint; samples mustn't have more than 20 bits for this. +#define QB 12 + + +// exponential moving average filter for DC filtering +// y[n] = (x[n]-y[n-1])*(1/8192) (corner approx. 20Hz, gain 1) +static inline int filter_exp(struct iir2 *fi2, int x) { - int l, r; + int xf = (x<y[0]; + fi2->y[0] += xf >> 13; + xf -= xf >> 2; // level reduction to avoid clipping from overshoot + return xf>>QB; +} - for (; count > 0; count--) - { - l = r = *dest; - l += *src++; - r += *src++; - Limit( l, MAXOUT, MINOUT ); - Limit( r, MAXOUT, MINOUT ); - *dest++ = l; - *dest++ = r; - } +// unfiltered (for testing) +static inline int filter_null(struct iir2 *fi2, int x) +{ + return x; +} + +#define mix_32_to_16l_stereo_core(dest, src, count, lv, fl) { \ + int l, r; \ + \ + for (; count > 0; count--) \ + { \ + l = r = *dest; \ + l += *src++ >> lv; \ + r += *src++ >> lv; \ + l = fl(&lfi2, l); \ + r = fl(&rfi2, r); \ + Limit16(l); \ + Limit16(r); \ + *dest++ = l; \ + *dest++ = r; \ + } \ +} + +void mix_32_to_16l_stereo_lvl(short *dest, int *src, int count) +{ + mix_32_to_16l_stereo_core(dest, src, count, mix_32_to_16l_level, filter_exp); } +void mix_32_to_16l_stereo(short *dest, int *src, int count) +{ + mix_32_to_16l_stereo_core(dest, src, count, 0, filter_exp); +} void mix_32_to_16_mono(short *dest, int *src, int count) { @@ -41,7 +81,8 @@ void mix_32_to_16_mono(short *dest, int *src, int count) { l = *dest; l += *src++; - Limit( l, MAXOUT, MINOUT ); + l = filter_exp(&lfi2, l); + Limit16(l); *dest++ = l; } } @@ -77,3 +118,8 @@ void mix_16h_to_32_s2(int *dest_buf, short *mp3_buf, int count) } } +void mix_reset(void) +{ + memset(&lfi2, 0, sizeof(lfi2)); + memset(&rfi2, 0, sizeof(rfi2)); +} diff --git a/pico/sound/mix.h b/pico/sound/mix.h index b9315114c..e128bad17 100644 --- a/pico/sound/mix.h +++ b/pico/sound/mix.h @@ -8,3 +8,4 @@ void mix_32_to_16_mono(short *dest, int *src, int count); extern int mix_32_to_16l_level; void mix_32_to_16l_stereo_lvl(short *dest, int 
*src, int count); +void mix_reset(void); diff --git a/pico/sound/mix_arm.S b/pico/sound/mix_arm.S index 5088e61bb..a1558d743 100644 --- a/pico/sound/mix_arm.S +++ b/pico/sound/mix_arm.S @@ -166,13 +166,6 @@ m16_32_s2_no_unal2: @ limit and shift up by 16 @ reg=int_sample, lr=1, r3=tmp, kills flags .macro Limitsh reg -@ movs r4, r3, asr #16 -@ cmnne r4, #1 -@ beq c32_16_no_overflow -@ tst r4, r4 -@ mov r3, #0x8000 -@ subpl r3, r3, #1 - add r3, lr, \reg, asr #15 bics r3, r3, #1 @ in non-overflow conditions r3 is 0 or 1 moveq \reg, \reg, lsl #16 @@ -180,20 +173,30 @@ m16_32_s2_no_unal2: subpl \reg, \reg, #0x00010000 .endm +@ filter out DC offset +@ in=int_sample (max 20 bit), y=filter memory, r3=tmp +.macro DCfilt in y + rsb r3, \y, \in, lsl #12 @ fixpoint 20.12 + add \y, \y, r3, asr #13 + sub r3, r3, r3, asr #2 @ reduce audio lvl some + asr \in, r3, #12 +.endm @ mix 32bit audio (with 16bits really used, upper bits indicate overflow) with normal 16 bit audio with left channel only @ warning: this function assumes dest is word aligned .global mix_32_to_16l_stereo @ short *dest, int *src, int count mix_32_to_16l_stereo: - stmfd sp!, {r4-r8,lr} - - mov lr, #1 + stmfd sp!, {r4-r8,r10-r11,lr} mov r2, r2, lsl #1 subs r2, r2, #4 bmi m32_16l_st_end + mov lr, #1 + ldr r12, =filter + ldmia r12, {r10-r11} + m32_16l_st_loop: ldmia r0, {r8,r12} ldmia r1!, {r4-r7} @@ -203,6 +206,10 @@ m32_16l_st_loop: add r5, r5, r8, asr #16 add r6, r6, r12,asr #16 add r7, r7, r12,asr #16 + DCfilt r4, r10 + DCfilt r5, r11 + DCfilt r6, r10 + DCfilt r7, r11 Limitsh r4 Limitsh r5 Limitsh r6 @@ -221,13 +228,17 @@ m32_16l_st_end: ldmia r1!,{r4,r5} add r4, r4, r6 add r5, r5, r6 + DCfilt r4, r10 + DCfilt r5, r11 Limitsh r4 Limitsh r5 orr r4, r5, r4, lsr #16 str r4, [r0], #4 m32_16l_st_no_unal2: - ldmfd sp!, {r4-r8,lr} + ldr r12, =filter + stmia r12, {r10-r11} + ldmfd sp!, {r4-r8,r10-r11,lr} bx lr @@ -235,9 +246,11 @@ m32_16l_st_no_unal2: .global mix_32_to_16_mono @ short *dest, int *src, int count 
mix_32_to_16_mono: - stmfd sp!, {r4-r8,lr} + stmfd sp!, {r4-r8,r10-r11,lr} mov lr, #1 + ldr r12, =filter + ldr r10, [r12] @ check if dest is word aligned tst r0, #2 @@ -262,6 +275,10 @@ m32_16_mo_loop: add r7, r7, r12,asr #16 mov r12,r12,lsl #16 add r6, r6, r12,asr #16 + DCfilt r4, r10 + DCfilt r5, r10 + DCfilt r6, r10 + DCfilt r7, r10 Limitsh r4 Limitsh r5 Limitsh r6 @@ -281,6 +298,8 @@ m32_16_mo_end: add r5, r5, r6, asr #16 mov r6, r6, lsl #16 add r4, r4, r6, asr #16 + DCfilt r4, r10 + DCfilt r5, r10 Limitsh r4 Limitsh r5 orr r4, r5, r4, lsr #16 @@ -288,14 +307,18 @@ m32_16_mo_end: m32_16_mo_no_unal2: tst r2, #1 - ldmeqfd sp!, {r4-r8,pc} + beq m32_16_mo_no_unal ldrsh r5, [r0] ldr r4, [r1], #4 add r4, r4, r5 + DCfilt r4, r10 Limit r4 strh r4, [r0], #2 - ldmfd sp!, {r4-r8,lr} +m32_16_mo_no_unal: + ldr r12, =filter + str r10, [r12] + ldmfd sp!, {r4-r8,r10-r11,lr} bx lr @@ -315,11 +338,13 @@ mix_32_to_16l_level: .global mix_32_to_16l_stereo_lvl @ short *dest, int *src, int count mix_32_to_16l_stereo_lvl: - stmfd sp!, {r4-r9,lr} + stmfd sp!, {r4-r11,lr} ldr r9, =mix_32_to_16l_level mov lr, #1 ldr r9, [r9] + ldr r12, =filter + ldm r12, {r10-r11} mov r2, r2, lsl #1 subs r2, r2, #4 @@ -338,6 +363,10 @@ m32_16l_st_l_loop: mov r5, r5, asr r9 mov r6, r6, asr r9 mov r7, r7, asr r9 + DCfilt r4, r10 + DCfilt r5, r11 + DCfilt r6, r10 + DCfilt r7, r11 Limitsh r4 Limitsh r5 Limitsh r6 @@ -358,15 +387,31 @@ m32_16l_st_l_end: add r5, r5, r6 mov r4, r4, asr r9 mov r5, r5, asr r9 + DCfilt r4, r10 + DCfilt r5, r11 Limitsh r4 Limitsh r5 orr r4, r5, r4, lsr #16 str r4, [r0], #4 m32_16l_st_l_no_unal2: - ldmfd sp!, {r4-r9,lr} + ldr r12, =filter + stmia r12, {r10-r11} + ldmfd sp!, {r4-r11,lr} bx lr #endif /* __GP2X__ */ +.global mix_reset @ void +mix_reset: + ldr r0, =filter + mov r1, #0 + str r1, [r0] + str r1, [r0, #4] + bx lr + +.data +filter: + .ds 8 + @ vim:filetype=armasm diff --git a/pico/sound/sn76496.c b/pico/sound/sn76496.c index b21275941..4507507c4 100644 --- 
a/pico/sound/sn76496.c +++ b/pico/sound/sn76496.c @@ -173,9 +173,12 @@ void SN76496Update(short *buffer, int length, int stereo) /* If we exit the loop in the middle, Output[i] has to be inverted */ /* and vol[i] incremented only if the exit status of the square */ /* wave is 1. */ + left = 0; while (R->Count[i] <= 0) { - R->Count[i] += R->Period[i]; + if (R->Count[i] + R->Period[i]*4 < R->Period[i]) + left+= 4, R->Count[i] += R->Period[i]*4; + else left++, R->Count[i] += R->Period[i]; if (R->Count[i] > 0) { R->Output[i] ^= 1; @@ -186,6 +189,9 @@ void SN76496Update(short *buffer, int length, int stereo) vol[i] += R->Period[i]; } if (R->Output[i]) vol[i] -= R->Count[i]; + /* Cut of anything above the sample freqency. It will only create */ + /* aliasing and hearable distortions anyway. */ + if (left > 1) vol[i] = STEP/2; } left = STEP; diff --git a/pico/sound/sound.c b/pico/sound/sound.c index 0d2ae0f55..2b18446c3 100644 --- a/pico/sound/sound.c +++ b/pico/sound/sound.c @@ -19,9 +19,6 @@ void (*PsndMix_32_to_16l)(short *dest, int *src, int count) = mix_32_to_16l_ster // master int buffer to mix to static int PsndBuffer[2*(44100+100)/50]; -// dac, psg -static unsigned short dac_info[312+4]; // pos in sample buffer - // cdda output buffer short cdda_out_buffer[2*1152]; @@ -95,58 +92,6 @@ static void low_pass_filter_mono(int *buf32, int length) void (*low_pass_filter)(int *buf32, int length) = low_pass_filter_stereo; -static void dac_recalculate(void) -{ - int lines = Pico.m.pal ? 313 : 262; - int mid = Pico.m.pal ? 
68 : 93; - int i, dac_cnt, pos, len; - - if (Pico.snd.len <= lines) - { - // shrinking algo - dac_cnt = -Pico.snd.len; - len=1; pos=0; - dac_info[225] = 1; - - for(i=226; i != 225; i++) - { - if (i >= lines) i = 0; - if(dac_cnt < 0) { - pos++; - dac_cnt += lines; - } - dac_cnt -= Pico.snd.len; - dac_info[i] = pos; - } - } - else - { - // stretching - dac_cnt = Pico.snd.len; - pos=0; - for(i = 225; i != 224; i++) - { - if (i >= lines) i = 0; - len=0; - while(dac_cnt >= 0) { - dac_cnt -= lines; - len++; - } - if (i == mid) // midpoint - while(pos+len < Pico.snd.len/2) { - dac_cnt -= lines; - len++; - } - dac_cnt += Pico.snd.len; - pos += len; - dac_info[i] = pos; - } - } - for (i = lines; i < sizeof(dac_info) / sizeof(dac_info[0]); i++) - dac_info[i] = dac_info[0]; -} - - PICO_INTERNAL void PsndReset(void) { // PsndRerate calls YM2612Init, which also resets @@ -156,6 +101,8 @@ PICO_INTERNAL void PsndReset(void) // Reset low pass filter lpf_lp = 0; lpf_rp = 0; + + mix_reset(); } @@ -164,6 +111,7 @@ void PsndRerate(int preserve_state) { void *state = NULL; int target_fps = Pico.m.pal ? 50 : 60; + int target_lines = Pico.m.pal ? 313 : 262; if (preserve_state) { state = malloc(0x204); @@ -171,7 +119,7 @@ void PsndRerate(int preserve_state) ym2612_pack_state(); memcpy(state, YM2612GetRegs(), 0x204); } - YM2612Init(Pico.m.pal ? OSC_PAL/7 : OSC_NTSC/7, PicoIn.sndRate); + YM2612Init(Pico.m.pal ? 
OSC_PAL/7 : OSC_NTSC/7, PicoIn.sndRate, !(PicoIn.opt&POPT_DIS_FM_SSGEG)); if (preserve_state) { // feed it back it's own registers, just like after loading state memcpy(YM2612GetRegs(), state, 0x204); @@ -188,10 +136,12 @@ void PsndRerate(int preserve_state) // calculate Pico.snd.len Pico.snd.len = PicoIn.sndRate / target_fps; Pico.snd.len_e_add = ((PicoIn.sndRate - Pico.snd.len * target_fps) << 16) / target_fps; - Pico.snd.len_e_cnt = 0; + Pico.snd.len_e_cnt = 0; // Q16 - // recalculate dac info - dac_recalculate(); + // samples per line (Q16) + Pico.snd.smpl_mult = 65536LL * PicoIn.sndRate / (target_fps*target_lines); + // samples per z80 clock (Q20) + Pico.snd.clkl_mult = 16 * Pico.snd.smpl_mult * 15/7 / 488; // clear all buffers memset32(PsndBuffer, 0, sizeof(PsndBuffer)/4); @@ -219,59 +169,61 @@ PICO_INTERNAL void PsndStartFrame(void) Pico.snd.len_e_cnt -= 0x10000; Pico.snd.len_use++; } - - Pico.snd.dac_line = Pico.snd.psg_line = 0; - Pico.m.status &= ~1; - dac_info[224] = Pico.snd.len_use; } -PICO_INTERNAL void PsndDoDAC(int line_to) +PICO_INTERNAL void PsndDoDAC(int cyc_to) { - int pos, pos1, len; + int pos, len; int dout = ym2612.dacout; - int line_from = Pico.snd.dac_line; - if (line_to >= 313) - line_to = 312; + // number of samples to fill in buffer (Q20) + len = (cyc_to * Pico.snd.clkl_mult) - Pico.snd.dac_pos; - pos = dac_info[line_from]; - pos1 = dac_info[line_to + 1]; - len = pos1 - pos; + // update position and calculate buffer offset and length + pos = (Pico.snd.dac_pos+0x80000) >> 20; + Pico.snd.dac_pos += len; + len = ((Pico.snd.dac_pos+0x80000) >> 20) - pos; + + // avoid loss of the 1st sample of a new block (Q rounding issues) + if (pos+len == 0) + len = 1, Pico.snd.dac_pos += 0x80000; if (len <= 0) return; - Pico.snd.dac_line = line_to + 1; - if (!PicoIn.sndOut) return; + // fill buffer, applying a rather weak order 1 bessel IIR on the way + // y[n] = (x[n] + x[n-1])*(1/2) (3dB cutoff at 11025 Hz, no gain) + // 1 sample delay for correct IIR 
filtering over audio frame boundaries if (PicoIn.opt & POPT_EN_STEREO) { short *d = PicoIn.sndOut + pos*2; - for (; len > 0; len--, d+=2) *d += dout; + // left channel only, mixed ro right channel in mixing phase + *d++ += Pico.snd.dac_val2; d++; + while (--len) *d++ += Pico.snd.dac_val, d++; } else { short *d = PicoIn.sndOut + pos; - for (; len > 0; len--, d++) *d += dout; + *d++ += Pico.snd.dac_val2; + while (--len) *d++ += Pico.snd.dac_val; } + Pico.snd.dac_val2 = (Pico.snd.dac_val + dout) >> 1; + Pico.snd.dac_val = dout; } PICO_INTERNAL void PsndDoPSG(int line_to) { - int line_from = Pico.snd.psg_line; - int pos, pos1, len; + int pos, len; int stereo = 0; - if (line_to >= 313) - line_to = 312; - - pos = dac_info[line_from]; - pos1 = dac_info[line_to + 1]; - len = pos1 - pos; - //elprintf(EL_STATUS, "%3d %3d %3d %3d %3d", - // pos, pos1, len, line_from, line_to); + // Q16, number of samples since last call + len = ((line_to+1) * Pico.snd.smpl_mult) - Pico.snd.psg_pos; if (len <= 0) return; - Pico.snd.psg_line = line_to + 1; + // update position and calculate buffer offset and length + pos = (Pico.snd.psg_pos+0x8000) >> 16; + Pico.snd.psg_pos += len; + len = ((Pico.snd.psg_pos+0x8000) >> 16) - pos; if (!PicoIn.sndOut || !(PicoIn.opt & POPT_EN_PSG)) return; @@ -283,6 +235,32 @@ PICO_INTERNAL void PsndDoPSG(int line_to) SN76496Update(PicoIn.sndOut + pos, len, stereo); } +PICO_INTERNAL void PsndDoFM(int cyc_to) +{ + int pos, len; + int stereo = 0; + + // Q16, number of samples since last call + len = (cyc_to * Pico.snd.clkl_mult) - Pico.snd.fm_pos; + + // don't do this too often (about every 4th scanline) + if (len >> 20 <= PicoIn.sndRate >> 12) + return; + + // update position and calculate buffer offset and length + pos = (Pico.snd.fm_pos+0x80000) >> 20; + Pico.snd.fm_pos += len; + len = ((Pico.snd.fm_pos+0x80000) >> 20) - pos; + + // fill buffer + if (PicoIn.opt & POPT_EN_STEREO) { + stereo = 1; + pos <<= 1; + } + if (PicoIn.opt & POPT_EN_FM) + 
YM2612UpdateOne(PsndBuffer + pos, len, stereo, 1); +} + // cdda static void cdda_raw_update(int *buffer, int length) { @@ -342,37 +320,62 @@ PICO_INTERNAL void PsndClear(void) memset32((int *) out, 0, len/2); if (len & 1) out[len-1] = 0; } + if (!(PicoIn.opt & POPT_EN_FM)) + memset32(PsndBuffer, 0, PicoIn.opt & POPT_EN_STEREO ? len*2 : len); + // drop pos remainder to avoid rounding errors (not entirely correct though) + Pico.snd.dac_pos = Pico.snd.fm_pos = Pico.snd.psg_pos = 0; } static int PsndRender(int offset, int length) { - int buf32_updated = 0; - int *buf32 = PsndBuffer+offset; + int *buf32; int stereo = (PicoIn.opt & 8) >> 3; + int fmlen = ((Pico.snd.fm_pos+0x80000) >> 20); + int daclen = ((Pico.snd.dac_pos+0x80000) >> 20); + int psglen = ((Pico.snd.psg_pos+0x8000) >> 16); - offset <<= stereo; + buf32 = PsndBuffer+(offset< 0) { + short *dacbuf = PicoIn.sndOut + (daclen << stereo); + Pico.snd.dac_pos += (length-daclen) << 20; + *dacbuf++ += Pico.snd.dac_val2; + if (stereo) dacbuf++; + for (daclen++; length-daclen > 0; daclen++) { + *dacbuf++ += Pico.snd.dac_val; + if (stereo) dacbuf++; + } + Pico.snd.dac_val2 = Pico.snd.dac_val; + } + + // Add in parts of the PSG output not yet done + if (length-psglen > 0) { + short *psgbuf = PicoIn.sndOut + (psglen << stereo); + Pico.snd.psg_pos += (length-psglen) << 16; + if (PicoIn.opt & POPT_EN_PSG) + SN76496Update(psgbuf, length-psglen, stereo); + } -//printf("active_chs: %02x\n", buf32_updated); - (void)buf32_updated; + // Add in parts of the FM buffer not yet done + if (length-fmlen > 0) { + int *fmbuf = buf32 + ((fmlen-offset) << stereo); + Pico.snd.fm_pos += (length-fmlen) << 20; + if (PicoIn.opt & POPT_EN_FM) + YM2612UpdateOne(fmbuf, length-fmlen, stereo, 1); + } // CD: PCM sound if (PicoIn.AHW & PAHW_MCD) { - pcd_pcm_update(buf32, length, stereo); - //buf32_updated = 1; + pcd_pcm_update(buf32, length-offset, stereo); } // CD: CDDA audio @@ -383,13 +386,13 @@ static int PsndRender(int offset, int length) { // 
note: only 44, 22 and 11 kHz supported, with forced stereo if (Pico_mcd->cdda_type == CT_MP3) - mp3_update(buf32, length, stereo); + mp3_update(buf32, length-offset, stereo); else - cdda_raw_update(buf32, length); + cdda_raw_update(buf32, length-offset); } if ((PicoIn.AHW & PAHW_32X) && (PicoIn.opt & POPT_EN_PWM)) - p32x_pwm_update(buf32, length, stereo); + p32x_pwm_update(buf32, length-offset, stereo); // Apply low pass filter, if required if (PicoIn.sndFilter == 1) { @@ -397,49 +400,39 @@ static int PsndRender(int offset, int length) } // convert + limit to normal 16bit output - PsndMix_32_to_16l(PicoIn.sndOut+offset, buf32, length); + PsndMix_32_to_16l(PicoIn.sndOut+(offset<> 3; + int psglen = ((Pico.snd.psg_pos+0x8000) >> 16); - PsndDoPSG(223); + pprof_start(sound); + + // Add in parts of the PSG output not yet done + if (length-psglen > 0) { + short *psgbuf = PicoIn.sndOut + (psglen << stereo); + Pico.snd.psg_pos += (length-psglen) << 16; + if (PicoIn.opt & POPT_EN_PSG) + SN76496Update(psgbuf, length-psglen, stereo); + } // upmix to "stereo" if needed if (PicoIn.opt & POPT_EN_STEREO) { @@ -448,11 +441,20 @@ PICO_INTERNAL void PsndGetSamplesMS(void) *p |= *p << 16; } + pprof_end(sound); + + return length; +} + +PICO_INTERNAL void PsndGetSamplesMS(int y) +{ + static int curr_pos = 0; + + curr_pos = PsndRenderMS(0, Pico.snd.len_use); + if (PicoIn.writeSound != NULL) - PicoIn.writeSound(length * ((PicoIn.opt & POPT_EN_STEREO) ? 4 : 2)); + PicoIn.writeSound(curr_pos * ((PicoIn.opt & POPT_EN_STEREO) ? 4 : 2)); PsndClear(); - - dac_info[224] = 0; } // vim:shiftwidth=2:ts=2:expandtab diff --git a/pico/sound/ym2612.c b/pico/sound/ym2612.c index 0867f558b..622fff0b3 100644 --- a/pico/sound/ym2612.c +++ b/pico/sound/ym2612.c @@ -5,6 +5,10 @@ ** ** SSG-EG was also removed, because it's rarely used, Sega2.doc even does not ** document it ("proprietary") and tells to write 0 to SSG-EG control register. 
+** +** updated with fixes from mame 0.216 (file version 1.5.1) (kub) +** SSG-EG readded from GenPlus (kub) +** linear sample interpolation for chip to output rate adaption (kub) */ /* @@ -124,7 +128,7 @@ extern YM2612 *ym2612_940; #endif -void memset32(int *dest, int c, int count); +void memset32(void *dest, int c, int count); #ifndef __GNUC__ @@ -148,7 +152,7 @@ void memset32(int *dest, int c, int count); #define FREQ_SH 16 /* 16.16 fixed point (frequency calculations) */ #define EG_SH 16 /* 16.16 fixed point (envelope generator timing) */ -#define LFO_SH 25 /* 7.25 fixed point (LFO calculations) */ +#define LFO_SH 24 /* 8.24 fixed point (LFO calculations) */ #define TIMER_SH 16 /* 16.16 fixed point (timers calculations) */ #define ENV_BITS 10 @@ -172,16 +176,6 @@ void memset32(int *dest, int c, int count); #define EG_TIMER_OVERFLOW (3*(1< max ) val = max; \ - else if ( val < min ) val = min; \ -} - - /* TL_TAB_LEN is calculated as: * 13 - sinus amplitude bits (Y axis) * 2 - sinus sign bit (Y axis) @@ -287,7 +281,7 @@ O(18),O(18),O(18),O(18),O(18),O(18),O(18),O(18), O(18),O(18),O(18),O(18),O(18),O(18),O(18),O(18), /* rates 00-11 */ -O( 0),O( 1),O( 2),O( 3), +O(18),O(18),O( 2),O( 3), O( 0),O( 1),O( 2),O( 3), O( 0),O( 1),O( 2),O( 3), O( 0),O( 1),O( 2),O( 3), @@ -328,10 +322,10 @@ O(16),O(16),O(16),O(16),O(16),O(16),O(16),O(16) #define O(a) (a*1) static const UINT8 eg_rate_shift[32+64+32]={ /* Envelope Generator counter shifts (32 + 64 rates + 32 RKS) */ /* 32 infinite time rates */ -O(0),O(0),O(0),O(0),O(0),O(0),O(0),O(0), -O(0),O(0),O(0),O(0),O(0),O(0),O(0),O(0), -O(0),O(0),O(0),O(0),O(0),O(0),O(0),O(0), -O(0),O(0),O(0),O(0),O(0),O(0),O(0),O(0), +O(11),O(11),O(11),O(11),O(11),O(11),O(11),O(11), +O(11),O(11),O(11),O(11),O(11),O(11),O(11),O(11), +O(11),O(11),O(11),O(11),O(11),O(11),O(11),O(11), +O(11),O(11),O(11),O(11),O(11),O(11),O(11),O(11), /* rates 00-11 */ O(11),O(11),O(11),O(11), @@ -517,7 +511,7 @@ static INT32 lfo_pm_table[128*8*32]; /* 128 combinations of 7 
bits meaningful (o but LFO works with one more bit of a precision so we really need 4096 elements */ static UINT32 fn_table[4096]; /* fnumber->increment counter */ -static int g_lfo_ampm = 0; +static int g_lfo_ampm; /* register number to channel number , slot offset */ #define OPN_CHAN(N) (N&3) @@ -552,6 +546,13 @@ INLINE void set_timers( int v ) ym2612.OPN.ST.status &= ~1; } +INLINE void recalc_volout(FM_SLOT *SLOT) +{ + INT16 vol_out = SLOT->volume; + if ((SLOT->ssg&0x0c) == 0x0c) + vol_out = (0x200 - SLOT->volume) & MAX_ATT_INDEX; + SLOT->vol_out = vol_out + SLOT->tl; +} INLINE void FM_KEYON(int c , int s ) { @@ -560,7 +561,15 @@ INLINE void FM_KEYON(int c , int s ) { SLOT->key = 1; SLOT->phase = 0; /* restart Phase Generator */ - SLOT->state = EG_ATT; /* phase -> Attack */ + SLOT->ssg ^= SLOT->ssgn; + SLOT->ssgn = 0; + SLOT->state = (SLOT->sl == MIN_ATT_INDEX) ? EG_SUS : EG_DEC; + if (SLOT->ar_ksr < 32+62) { + if (SLOT->volume > MIN_ATT_INDEX) SLOT->state = EG_ATT; + } else { + SLOT->volume = MIN_ATT_INDEX; + } +// recalc_volout(SLOT); ym2612.slot_mask |= (1<key ) { SLOT->key = 0; - if (SLOT->state>EG_REL) + if (SLOT->state>EG_REL) { SLOT->state = EG_REL;/* phase -> Release */ + if (SLOT->ssg&0x08) { + if (SLOT->ssg&0x04) + SLOT->volume = (0x200 - SLOT->volume); + if (SLOT->volume >= 0x200) { + SLOT->volume = MAX_ATT_INDEX; + SLOT->state = EG_OFF; + } + } + } + SLOT->vol_out = SLOT->volume + SLOT->tl; } } @@ -589,38 +608,38 @@ INLINE void set_det_mul(FM_CH *CH, FM_SLOT *SLOT, int v) INLINE void set_tl(FM_SLOT *SLOT, int v) { SLOT->tl = (v&0x7f)<<(ENV_BITS-7); /* 7bit TL */ +// if (SLOT->state > EG_REL) +// recalc_volout(SLOT); } /* set attack rate & key scale */ INLINE void set_ar_ksr(FM_CH *CH, FM_SLOT *SLOT, int v) { UINT8 old_KSR = SLOT->KSR; + int eg_sh_ar, eg_sel_ar; SLOT->ar = (v&0x1f) ? 
32 + ((v&0x1f)<<1) : 0; + SLOT->ar_ksr = SLOT->ar + SLOT->ksr; SLOT->KSR = 3-(v>>6); if (SLOT->KSR != old_KSR) { CH->SLOT[SLOT1].Incr=-1; } + + /* refresh Attack rate */ + if ((SLOT->ar_ksr) < 32+62) + { + eg_sh_ar = eg_rate_shift [SLOT->ar_ksr]; + eg_sel_ar = eg_rate_select[SLOT->ar_ksr]; + } else { - int eg_sh_ar, eg_sel_ar; - - /* refresh Attack rate */ - if ((SLOT->ar + SLOT->ksr) < 32+62) - { - eg_sh_ar = eg_rate_shift [SLOT->ar + SLOT->ksr ]; - eg_sel_ar = eg_rate_select[SLOT->ar + SLOT->ksr ]; - } - else - { - eg_sh_ar = 0; - eg_sel_ar = 17; - } - - SLOT->eg_pack_ar = eg_inc_pack[eg_sel_ar] | (eg_sh_ar<<24); + eg_sh_ar = 0; + eg_sel_ar = 18; } + + SLOT->eg_pack_ar = eg_inc_pack[eg_sel_ar] | (eg_sh_ar<<24); } /* set decay rate */ @@ -656,6 +675,9 @@ INLINE void set_sl_rr(FM_SLOT *SLOT, int v) SLOT->sl = sl_table[ v>>4 ]; + if (SLOT->state == EG_DEC && (SLOT->volume >= (INT32)(SLOT->sl))) + SLOT->state = EG_SUS; + SLOT->rr = 34 + ((v&0x0f)<<2); eg_sh_rr = eg_rate_shift [SLOT->rr + SLOT->ksr]; @@ -715,12 +737,12 @@ INLINE int advance_lfo(int lfo_ampm, UINT32 lfo_cnt_old, UINT32 lfo_cnt) if (prev_pos != pos) { lfo_ampm &= 0xff; - /* triangle */ + /* triangle (inverted) */ /* AM: 0 to 126 step +2, 126 to 0 step -2 */ if (pos<64) - lfo_ampm |= ((pos&63) * 2) << 8; /* 0 - 126 */ + lfo_ampm |= ((pos^63) * 2) << 8; /* 0 - 126 */ else - lfo_ampm |= (126 - (pos&63)*2) << 8; + lfo_ampm |= ((pos&63) * 2) << 8; } else { @@ -739,7 +761,7 @@ INLINE int advance_lfo(int lfo_ampm, UINT32 lfo_cnt_old, UINT32 lfo_cnt) return lfo_ampm; } -INLINE void update_eg_phase(UINT16 *vol_out, FM_SLOT *SLOT, UINT32 eg_cnt) +INLINE void update_eg_phase(FM_SLOT *SLOT, UINT32 eg_cnt, UINT32 ssg_en) { INT32 volume = SLOT->volume; UINT32 pack = SLOT->eg_pack[SLOT->state - 1]; @@ -752,44 +774,114 @@ INLINE void update_eg_phase(UINT16 *vol_out, FM_SLOT *SLOT, UINT32 eg_cnt) eg_inc_val = pack >> ((eg_cnt >> shift) & 7) * 3; eg_inc_val = (1 << (eg_inc_val & 7)) >> 1; - switch (SLOT->state) - { - 
case EG_ATT: /* attack phase */ - volume += ( ~volume * eg_inc_val ) >> 4; - if ( volume <= MIN_ATT_INDEX ) + if ((SLOT->ssg&0x08) && ssg_en) { + switch (SLOT->state) { - volume = MIN_ATT_INDEX; - SLOT->state = EG_DEC; - } - break; + case EG_ATT: /* attack phase */ + volume += ( ~volume * eg_inc_val ) >> 4; + if ( volume <= MIN_ATT_INDEX ) + { + volume = MIN_ATT_INDEX; + SLOT->state = (SLOT->sl == MIN_ATT_INDEX) ? EG_SUS: EG_DEC; + } + break; - case EG_DEC: /* decay phase */ - volume += eg_inc_val; - if ( volume >= (INT32) SLOT->sl ) - SLOT->state = EG_SUS; - break; + case EG_DEC: /* decay phase */ + if (volume < 0x200) + volume += 4*eg_inc_val; + if ( volume >= (INT32) SLOT->sl ) + SLOT->state = EG_SUS; + break; - case EG_SUS: /* sustain phase */ - volume += eg_inc_val; - if ( volume >= MAX_ATT_INDEX ) - { - volume = MAX_ATT_INDEX; - /* do not change SLOT->state (verified on real chip) */ + case EG_SUS: /* sustain phase */ + if (volume < 0x200) + volume += 4*eg_inc_val; + break; + + case EG_REL: /* release phase */ + if (volume < 0x200) + volume += 4*eg_inc_val; + if ( volume >= 0x200 ) + { + volume = MAX_ATT_INDEX; + SLOT->state = EG_OFF; + } + break; } - break; - case EG_REL: /* release phase */ - volume += eg_inc_val; - if ( volume >= MAX_ATT_INDEX ) + SLOT->vol_out = volume + SLOT->tl; + if ((SLOT->ssg&0x04) && (SLOT->state > EG_REL)) + SLOT->vol_out = ((0x200 - volume) & MAX_ATT_INDEX) + SLOT->tl; + } else { + switch (SLOT->state) { - volume = MAX_ATT_INDEX; - SLOT->state = EG_OFF; + case EG_ATT: /* attack phase */ + volume += ( ~volume * eg_inc_val ) >> 4; + if ( volume <= MIN_ATT_INDEX ) + { + volume = MIN_ATT_INDEX; + SLOT->state = (SLOT->sl == MIN_ATT_INDEX) ? 
EG_SUS: EG_DEC; + } + break; + + case EG_DEC: /* decay phase */ + volume += eg_inc_val; + if ( volume >= (INT32) SLOT->sl ) + SLOT->state = EG_SUS; + break; + + case EG_SUS: /* sustain phase */ + volume += eg_inc_val; + if ( volume >= MAX_ATT_INDEX ) + { + volume = MAX_ATT_INDEX; + /* do not change SLOT->state (verified on real chip) */ + } + break; + + case EG_REL: /* release phase */ + volume += eg_inc_val; + if ( volume >= MAX_ATT_INDEX ) + { + volume = MAX_ATT_INDEX; + SLOT->state = EG_OFF; + } + break; } - break; - } + SLOT->vol_out = volume + SLOT->tl; + } SLOT->volume = volume; - *vol_out = SLOT->tl + volume; /* tl is 7bit<<3, volume 0-1023 (0-2039 total) */ +} + +INLINE UINT32 update_ssg_eg_phase(FM_SLOT *SLOT, UINT32 phase) +{ + if (SLOT->ssg&0x01) { + if (SLOT->ssg&0x02) { + SLOT->ssg ^= SLOT->ssgn ^ 4; + SLOT->ssgn = 4; + } + + if (SLOT->state != EG_ATT && !(SLOT->ssg&0x04)) + SLOT->volume = MAX_ATT_INDEX; + } else { + if (SLOT->ssg&0x02) { + SLOT->ssg ^= 4; + SLOT->ssgn ^= 4; + } else + phase = 0; + + if (SLOT->state != EG_ATT) { + SLOT->state = (SLOT->sl == MIN_ATT_INDEX) ? 
EG_SUS : EG_DEC; + if (SLOT->ar_ksr < 32+62) { + if (SLOT->volume > MIN_ATT_INDEX) SLOT->state = EG_ATT; + } else { + SLOT->volume = MIN_ATT_INDEX; + } + } + } +// recalc_volout(SLOT); + return phase; } #endif @@ -835,6 +927,24 @@ static void chan_render_loop(chan_rend_context *ct, int *buffer, int length) { int smp = 0; /* produced sample */ unsigned int eg_out, eg_out2, eg_out4; + FM_SLOT *SLOT; + UINT32 cnt = ct->eg_timer_add+(ct->eg_timer & ((1<pack & 2) while (cnt >= 1<CH->SLOT[SLOT1]; + if ((SLOT->ssg&0x08) && SLOT->state > EG_REL && SLOT->volume >= 0x200) + ct->phase1 = update_ssg_eg_phase(SLOT, ct->phase1); + SLOT = &ct->CH->SLOT[SLOT2]; + if ((SLOT->ssg&0x08) && SLOT->state > EG_REL && SLOT->volume >= 0x200) + ct->phase2 = update_ssg_eg_phase(SLOT, ct->phase2); + SLOT = &ct->CH->SLOT[SLOT3]; + if ((SLOT->ssg&0x08) && SLOT->state > EG_REL && SLOT->volume >= 0x200) + ct->phase3 = update_ssg_eg_phase(SLOT, ct->phase3); + SLOT = &ct->CH->SLOT[SLOT4]; + if ((SLOT->ssg&0x08) && SLOT->state > EG_REL && SLOT->volume >= 0x200) + ct->phase4 = update_ssg_eg_phase(SLOT, ct->phase4); + } if (ct->pack & 8) { /* LFO enabled ? 
(test Earthworm Jim in between demo 1 and 2) */ ct->pack = (ct->pack&0xffff) | (advance_lfo(ct->pack >> 16, ct->lfo_cnt, ct->lfo_cnt + ct->lfo_inc) << 16); @@ -842,16 +952,95 @@ static void chan_render_loop(chan_rend_context *ct, int *buffer, int length) } ct->eg_timer += ct->eg_timer_add; - while (ct->eg_timer >= EG_TIMER_OVERFLOW) + if (ct->eg_timer < EG_TIMER_OVERFLOW) { + SLOT = &ct->CH->SLOT[SLOT1]; + SLOT->vol_ipol = SLOT->vol_out; + if (SLOT->state > EG_REL) recalc_volout(SLOT); + SLOT = &ct->CH->SLOT[SLOT2]; + SLOT->vol_ipol = SLOT->vol_out; + if (SLOT->state > EG_REL) recalc_volout(SLOT); + SLOT = &ct->CH->SLOT[SLOT3]; + SLOT->vol_ipol = SLOT->vol_out; + if (SLOT->state > EG_REL) recalc_volout(SLOT); + SLOT = &ct->CH->SLOT[SLOT4]; + SLOT->vol_ipol = SLOT->vol_out; + if (SLOT->state > EG_REL) recalc_volout(SLOT); + } + else while (ct->eg_timer >= EG_TIMER_OVERFLOW) { ct->eg_timer -= EG_TIMER_OVERFLOW; ct->eg_cnt++; + if (ct->eg_cnt >= 4096) ct->eg_cnt = 1; + + SLOT = &ct->CH->SLOT[SLOT1]; + SLOT->vol_ipol = SLOT->vol_out; + if (SLOT->state != EG_OFF) update_eg_phase(SLOT, ct->eg_cnt, ct->pack & 2); + SLOT = &ct->CH->SLOT[SLOT2]; + SLOT->vol_ipol = SLOT->vol_out; + if (SLOT->state != EG_OFF) update_eg_phase(SLOT, ct->eg_cnt, ct->pack & 2); + SLOT = &ct->CH->SLOT[SLOT3]; + SLOT->vol_ipol = SLOT->vol_out; + if (SLOT->state != EG_OFF) update_eg_phase(SLOT, ct->eg_cnt, ct->pack & 2); + SLOT = &ct->CH->SLOT[SLOT4]; + SLOT->vol_ipol = SLOT->vol_out; + if (SLOT->state != EG_OFF) update_eg_phase(SLOT, ct->eg_cnt, ct->pack & 2); + } - if (ct->CH->SLOT[SLOT1].state != EG_OFF) update_eg_phase(&ct->vol_out1, &ct->CH->SLOT[SLOT1], ct->eg_cnt); - if (ct->CH->SLOT[SLOT2].state != EG_OFF) update_eg_phase(&ct->vol_out2, &ct->CH->SLOT[SLOT2], ct->eg_cnt); - if (ct->CH->SLOT[SLOT3].state != EG_OFF) update_eg_phase(&ct->vol_out3, &ct->CH->SLOT[SLOT3], ct->eg_cnt); - if (ct->CH->SLOT[SLOT4].state != EG_OFF) update_eg_phase(&ct->vol_out4, &ct->CH->SLOT[SLOT4], ct->eg_cnt); +#if 0 
+ UINT32 ifrac0 = ct->eg_timer / (EG_TIMER_OVERFLOW>>EG_SH); + UINT32 ifrac1 = (1<CH->SLOT[SLOT1]; + ct->vol_out1 = (SLOT->vol_ipol*ifrac1 + SLOT->vol_out*ifrac0) >> EG_SH; + SLOT = &ct->CH->SLOT[SLOT2]; + ct->vol_out2 = (SLOT->vol_ipol*ifrac1 + SLOT->vol_out*ifrac0) >> EG_SH; + SLOT = &ct->CH->SLOT[SLOT3]; + ct->vol_out3 = (SLOT->vol_ipol*ifrac1 + SLOT->vol_out*ifrac0) >> EG_SH; + SLOT = &ct->CH->SLOT[SLOT4]; + ct->vol_out4 = (SLOT->vol_ipol*ifrac1 + SLOT->vol_out*ifrac0) >> EG_SH; +#elif 1 + switch (ct->eg_timer >> EG_SH) + { + case 0: + ct->vol_out1 = ct->CH->SLOT[SLOT1].vol_ipol; + ct->vol_out2 = ct->CH->SLOT[SLOT2].vol_ipol; + ct->vol_out3 = ct->CH->SLOT[SLOT3].vol_ipol; + ct->vol_out4 = ct->CH->SLOT[SLOT4].vol_ipol; + break; + case (EG_TIMER_OVERFLOW>>EG_SH)-1: + ct->vol_out1 = ct->CH->SLOT[SLOT1].vol_out; + ct->vol_out2 = ct->CH->SLOT[SLOT2].vol_out; + ct->vol_out3 = ct->CH->SLOT[SLOT3].vol_out; + ct->vol_out4 = ct->CH->SLOT[SLOT4].vol_out; + break; + default: + ct->vol_out1 = (ct->CH->SLOT[SLOT1].vol_ipol + + ct->CH->SLOT[SLOT1].vol_out) >> 1; + ct->vol_out2 = (ct->CH->SLOT[SLOT2].vol_ipol + + ct->CH->SLOT[SLOT2].vol_out) >> 1; + ct->vol_out3 = (ct->CH->SLOT[SLOT3].vol_ipol + + ct->CH->SLOT[SLOT3].vol_out) >> 1; + ct->vol_out4 = (ct->CH->SLOT[SLOT4].vol_ipol + + ct->CH->SLOT[SLOT4].vol_out) >> 1; + break; + } +#elif 0 + if (ct->eg_timer >> (EG_SH-1) < EG_TIMER_OVERFLOW >> EG_SH) { + ct->vol_out1 = ct->CH->SLOT[SLOT1].vol_ipol; + ct->vol_out2 = ct->CH->SLOT[SLOT2].vol_ipol; + ct->vol_out3 = ct->CH->SLOT[SLOT3].vol_ipol; + ct->vol_out4 = ct->CH->SLOT[SLOT4].vol_ipol; + } else { + ct->vol_out1 = ct->CH->SLOT[SLOT1].vol_out; + ct->vol_out2 = ct->CH->SLOT[SLOT2].vol_out; + ct->vol_out3 = ct->CH->SLOT[SLOT3].vol_out; + ct->vol_out4 = ct->CH->SLOT[SLOT4].vol_out; } +#else + ct->vol_out1 = ct->CH->SLOT[SLOT1].vol_out; + ct->vol_out2 = ct->CH->SLOT[SLOT2].vol_out; + ct->vol_out3 = ct->CH->SLOT[SLOT3].vol_out; + ct->vol_out4 = ct->CH->SLOT[SLOT4].vol_out; +#endif if 
(ct->pack & 4) continue; /* output disabled */ @@ -881,7 +1070,7 @@ static void chan_render_loop(chan_rend_context *ct, int *buffer, int length) if (ct->pack & (1<<(SLOT4+8))) eg_out4 += add; } - switch( ct->CH->ALGO ) + switch( ct->algo&0x7 ) { case 0: { @@ -1075,13 +1264,40 @@ static void chan_render_finish(void) ym2612.OPN.lfo_cnt = crct.lfo_cnt; } +static UINT32 update_lfo_phase(FM_SLOT *SLOT, UINT32 block_fnum) +{ + UINT32 fnum_lfo; + INT32 lfo_fn_table_index_offset; + UINT8 blk; + UINT32 fn; + int fc,fdt; + + fnum_lfo = ((block_fnum & 0x7f0) >> 4) * 32 * 8; + lfo_fn_table_index_offset = lfo_pm_table[ fnum_lfo + crct.CH->pms + ((crct.pack>>16)&0xff) ]; + if (lfo_fn_table_index_offset) /* LFO phase modulation active */ + { + block_fnum = block_fnum*2 + lfo_fn_table_index_offset; + blk = (block_fnum&0x7000) >> 12; + fn = block_fnum & 0xfff; + + /* phase increment counter */ + fc = (fn_table[fn]>>(7-blk)); + + fdt = fc + SLOT->DT[crct.CH->kcode]; + if (fdt < 0) fdt += fn_table[0x7ff*2] >> 2; + + return (fdt * SLOT->mul) >> 1; + } else + return SLOT->Incr; +} + static int chan_render(int *buffer, int length, int c, UINT32 flags) // flags: stereo, ?, disabled, ?, pan_r, pan_l { crct.CH = &ym2612.CH[c]; crct.mem = crct.CH->mem_value; /* one sample delay memory */ crct.lfo_cnt = ym2612.OPN.lfo_cnt; - flags &= 0x35; + flags &= 0x37; if (crct.lfo_inc) { flags |= 8; @@ -1103,51 +1319,22 @@ static int chan_render(int *buffer, int length, int c, UINT32 flags) // flags: s crct.phase3 = crct.CH->SLOT[SLOT3].phase; crct.phase4 = crct.CH->SLOT[SLOT4].phase; - /* current output from EG circuit (without AM from LFO) */ - crct.vol_out1 = crct.CH->SLOT[SLOT1].tl + ((UINT32)crct.CH->SLOT[SLOT1].volume); - crct.vol_out2 = crct.CH->SLOT[SLOT2].tl + ((UINT32)crct.CH->SLOT[SLOT2].volume); - crct.vol_out3 = crct.CH->SLOT[SLOT3].tl + ((UINT32)crct.CH->SLOT[SLOT3].volume); - crct.vol_out4 = crct.CH->SLOT[SLOT4].tl + ((UINT32)crct.CH->SLOT[SLOT4].volume); - crct.op1_out = 
crct.CH->op1_out; crct.algo = crct.CH->ALGO & 7; - if(crct.CH->pms) + if(crct.CH->pms && (ym2612.OPN.ST.mode & 0xC0) && c == 2) { + /* 3 slot mode */ + crct.incr1 = update_lfo_phase(&crct.CH->SLOT[SLOT1], ym2612.OPN.SL3.block_fnum[1]); + crct.incr2 = update_lfo_phase(&crct.CH->SLOT[SLOT2], ym2612.OPN.SL3.block_fnum[2]); + crct.incr3 = update_lfo_phase(&crct.CH->SLOT[SLOT3], ym2612.OPN.SL3.block_fnum[0]); + crct.incr4 = update_lfo_phase(&crct.CH->SLOT[SLOT4], crct.CH->block_fnum); + } + else if(crct.CH->pms) { - /* add support for 3 slot mode */ - UINT32 block_fnum = crct.CH->block_fnum; - - UINT32 fnum_lfo = ((block_fnum & 0x7f0) >> 4) * 32 * 8; - INT32 lfo_fn_table_index_offset = lfo_pm_table[ fnum_lfo + crct.CH->pms + ((crct.pack>>16)&0xff) ]; - - if (lfo_fn_table_index_offset) /* LFO phase modulation active */ - { - UINT8 blk; - UINT32 fn; - int kc,fc; - - blk = block_fnum >> 11; - block_fnum = block_fnum*2 + lfo_fn_table_index_offset; - - fn = block_fnum & 0xfff; - - /* keyscale code */ - kc = (blk<<2) | opn_fktable[fn >> 8]; - /* phase increment counter */ - fc = fn_table[fn]>>(7-blk); - - crct.incr1 = ((fc+crct.CH->SLOT[SLOT1].DT[kc])*crct.CH->SLOT[SLOT1].mul) >> 1; - crct.incr2 = ((fc+crct.CH->SLOT[SLOT2].DT[kc])*crct.CH->SLOT[SLOT2].mul) >> 1; - crct.incr3 = ((fc+crct.CH->SLOT[SLOT3].DT[kc])*crct.CH->SLOT[SLOT3].mul) >> 1; - crct.incr4 = ((fc+crct.CH->SLOT[SLOT4].DT[kc])*crct.CH->SLOT[SLOT4].mul) >> 1; - } - else /* LFO phase modulation = zero */ - { - crct.incr1 = crct.CH->SLOT[SLOT1].Incr; - crct.incr2 = crct.CH->SLOT[SLOT2].Incr; - crct.incr3 = crct.CH->SLOT[SLOT3].Incr; - crct.incr4 = crct.CH->SLOT[SLOT4].Incr; - } + crct.incr1 = update_lfo_phase(&crct.CH->SLOT[SLOT1], crct.CH->block_fnum); + crct.incr2 = update_lfo_phase(&crct.CH->SLOT[SLOT2], crct.CH->block_fnum); + crct.incr3 = update_lfo_phase(&crct.CH->SLOT[SLOT3], crct.CH->block_fnum); + crct.incr4 = update_lfo_phase(&crct.CH->SLOT[SLOT4], crct.CH->block_fnum); } else /* no LFO phase modulation */ 
{ @@ -1191,17 +1378,18 @@ INLINE void refresh_fc_eg_slot(FM_SLOT *SLOT, int fc, int kc) { int eg_sh, eg_sel; SLOT->ksr = ksr; + SLOT->ar_ksr = SLOT->ar + ksr; /* calculate envelope generator rates */ - if ((SLOT->ar + ksr) < 32+62) + if ((SLOT->ar_ksr) < 32+62) { - eg_sh = eg_rate_shift [SLOT->ar + ksr ]; - eg_sel = eg_rate_select[SLOT->ar + ksr ]; + eg_sh = eg_rate_shift [SLOT->ar_ksr]; + eg_sel = eg_rate_select[SLOT->ar_ksr]; } else { eg_sh = 0; - eg_sel = 17; + eg_sel = 18; } SLOT->eg_pack_ar = eg_inc_pack[eg_sel] | (eg_sh<<24); @@ -1256,7 +1444,7 @@ static void init_timetables(const UINT8 *dttable) /* DeTune table */ for (d = 0;d <= 3;d++){ for (i = 0;i <= 31;i++){ - rate = ((double)dttable[d*32 + i]) * SIN_LEN * ym2612.OPN.ST.freqbase * (1<ssg = v&0x0f; + SLOT->ssg ^= SLOT->ssgn; + if (v&0x08) ym2612.ssg_mask |= 1<<(OPN_SLOT(r) + c*4); + else ym2612.ssg_mask &= ~(1<<(OPN_SLOT(r) + c*4)); +// if (SLOT->state > EG_REL) +// recalc_volout(SLOT); break; case 0xa0: @@ -1581,6 +1779,7 @@ int YM2612UpdateOne_(int *buffer, int length, int stereo, int is_buf_empty) { int pan; int active_chs = 0; + int flags = stereo ? 
1:0; // if !is_buf_empty, it means it has valid samples to mix with, else it may contain trash if (is_buf_empty) memset32(buffer, 0, length<>2)) << 3; - if (ym2612.slot_mask & 0x0f0000) active_chs |= chan_render(buffer, length, 4, stereo|((pan&0x300)>>4)) << 4; - if (ym2612.slot_mask & 0xf00000) active_chs |= chan_render(buffer, length, 5, stereo|((pan&0xc00)>>6)|(ym2612.dacen<<2)) << 5; +#define BIT_IF(v,b,c) { v &= ~(1<<(b)); if (c) v |= 1<<(b); } + BIT_IF(flags, 1, (ym2612.ssg_mask & 0x00000f) && (ym2612.OPN.ST.flags & 1)); + if (ym2612.slot_mask & 0x00000f) active_chs |= chan_render(buffer, length, 0, flags|((pan&0x003)<<4)) << 0; + BIT_IF(flags, 1, (ym2612.ssg_mask & 0x0000f0) && (ym2612.OPN.ST.flags & 1)); + if (ym2612.slot_mask & 0x0000f0) active_chs |= chan_render(buffer, length, 1, flags|((pan&0x00c)<<2)) << 1; + BIT_IF(flags, 1, (ym2612.ssg_mask & 0x000f00) && (ym2612.OPN.ST.flags & 1)); + if (ym2612.slot_mask & 0x000f00) active_chs |= chan_render(buffer, length, 2, flags|((pan&0x030) )) << 2; + BIT_IF(flags, 1, (ym2612.ssg_mask & 0x00f000) && (ym2612.OPN.ST.flags & 1)); + if (ym2612.slot_mask & 0x00f000) active_chs |= chan_render(buffer, length, 3, flags|((pan&0x0c0)>>2)) << 3; + BIT_IF(flags, 1, (ym2612.ssg_mask & 0x0f0000) && (ym2612.OPN.ST.flags & 1)); + if (ym2612.slot_mask & 0x0f0000) active_chs |= chan_render(buffer, length, 4, flags|((pan&0x300)>>4)) << 4; + BIT_IF(flags, 1, (ym2612.ssg_mask & 0xf00000) && (ym2612.OPN.ST.flags & 1)); + if (ym2612.slot_mask & 0xf00000) active_chs |= chan_render(buffer, length, 5, flags|((pan&0xc00)>>6)|(!!ym2612.dacen<<2)) << 5; +#undef BIT_IF chan_render_finish(); return active_chs; // 1 if buffer updated @@ -1634,13 +1840,14 @@ int YM2612UpdateOne_(int *buffer, int length, int stereo, int is_buf_empty) /* initialize YM2612 emulator */ -void YM2612Init_(int clock, int rate) +void YM2612Init_(int clock, int rate, int ssg) { memset(&ym2612, 0, sizeof(ym2612)); init_tables(); ym2612.OPN.ST.clock = clock; 
ym2612.OPN.ST.rate = rate; + ym2612.OPN.ST.flags = (ssg ? 1:0); OPNSetPres( 6*24 ); @@ -1661,6 +1868,9 @@ void YM2612ResetChip_(void) ym2612.OPN.eg_timer = 0; ym2612.OPN.eg_cnt = 0; + ym2612.OPN.lfo_inc = 0; + ym2612.OPN.lfo_cnt = 0; + g_lfo_ampm = 126 << 8; ym2612.OPN.ST.status = 0; reset_channels( &ym2612.CH[0] ); @@ -1720,6 +1930,7 @@ int YM2612Write_(unsigned int a, unsigned int v) { ym2612.OPN.lfo_inc = 0; ym2612.OPN.lfo_cnt = 0; + g_lfo_ampm = 126 << 8; } break; #if 0 // handled elsewhere diff --git a/pico/sound/ym2612.h b/pico/sound/ym2612.h index a2921b222..e73c97321 100644 --- a/pico/sound/ym2612.h +++ b/pico/sound/ym2612.h @@ -53,6 +53,12 @@ typedef struct }; UINT32 eg_pack[4]; }; + + UINT8 ssg; /* 0x30 SSG-EG waveform */ + UINT8 ssgn; + UINT16 ar_ksr; /* 0x32 ar+ksr */ + UINT16 vol_out; /* 0x34 current output from EG (without LFO) */ + UINT16 vol_ipol; /* 0x36 interpolator memory */ } FM_SLOT; @@ -89,7 +95,7 @@ typedef struct UINT8 address; /* 10 address register | need_save */ UINT8 status; /* 11 status flag | need_save */ UINT8 mode; /* mode CSM / 3SLOT */ - UINT8 pad; + UINT8 flags; /* operational flags */ int TA; /* timer a */ int TAC; /* timer a maxval */ int TAT; /* timer a ticker | need_save */ @@ -147,6 +153,7 @@ typedef struct FM_OPN OPN; /* OPN state */ UINT32 slot_mask; /* active slot mask (performance hack) */ + UINT32 ssg_mask; /* active ssg mask (performance hack) */ } YM2612; #endif @@ -154,7 +161,7 @@ typedef struct extern YM2612 ym2612; #endif -void YM2612Init_(int baseclock, int rate); +void YM2612Init_(int baseclock, int rate, int ssg); void YM2612ResetChip_(void); int YM2612UpdateOne_(int *buffer, int length, int stereo, int is_buf_empty); @@ -176,22 +183,22 @@ int YM2612PicoStateLoad2(int *tat, int *tbt); #else /* GP2X specific */ #include "../../platform/gp2x/940ctl.h" -extern int PicoIn.opt; -#define YM2612Init(baseclock,rate) { \ - if (PicoIn.opt&0x200) YM2612Init_940(baseclock, rate); \ - else YM2612Init_(baseclock, rate); \ -} 
-#define YM2612ResetChip() { \ - if (PicoIn.opt&0x200) YM2612ResetChip_940(); \ +#define YM2612Init(baseclock,rate,ssg) do { \ + if (PicoIn.opt&POPT_EXT_FM) YM2612Init_940(baseclock, rate, ssg); \ + else YM2612Init_(baseclock, rate, ssg); \ +} while (0) +#define YM2612ResetChip() do { \ + if (PicoIn.opt&POPT_EXT_FM) YM2612ResetChip_940(); \ else YM2612ResetChip_(); \ -} -#define YM2612UpdateOne(buffer,length,stereo,is_buf_empty) \ - (PicoIn.opt&0x200) ? YM2612UpdateOne_940(buffer, length, stereo, is_buf_empty) : \ - YM2612UpdateOne_(buffer, length, stereo, is_buf_empty); -#define YM2612PicoStateLoad() { \ - if (PicoIn.opt&0x200) YM2612PicoStateLoad_940(); \ +} while (0) +#define YM2612UpdateOne(buffer,length,stereo,is_buf_empty) do { \ + (PicoIn.opt&POPT_EXT_FM) ? YM2612UpdateOne_940(buffer, length, stereo, is_buf_empty) : \ + YM2612UpdateOne_(buffer, length, stereo, is_buf_empty); \ +} while (0) +#define YM2612PicoStateLoad() do { \ + if (PicoIn.opt&POPT_EXT_FM) YM2612PicoStateLoad_940(); \ else YM2612PicoStateLoad_(); \ -} +} while (0) #endif /* __GP2X__ */ diff --git a/pico/sound/ym2612_arm.s b/pico/sound/ym2612_arm.S similarity index 58% rename from pico/sound/ym2612_arm.s rename to pico/sound/ym2612_arm.S index 9c436d41b..0334d1cfe 100644 --- a/pico/sound/ym2612_arm.s +++ b/pico/sound/ym2612_arm.S @@ -1,6 +1,7 @@ /* * PicoDrive * (C) notaz, 2006 + * (C) kub, 2020 added SSG-EG and simple output rate interpolation * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. 
@@ -12,11 +13,17 @@ @ vim:filetype=armasm +#include "../arm_features.h" + +@ very simple YM2612 output rate to sample rate adaption (~500k cycles @44100) +#define INTERPOL +#define SSG_EG + .equiv SLOT1, 0 .equiv SLOT2, 2 .equiv SLOT3, 1 .equiv SLOT4, 3 -.equiv SLOT_STRUCT_SIZE, 0x30 +.equiv SLOT_STRUCT_SIZE, 0x38 .equiv TL_TAB_LEN, 0x1A00 @@ -26,20 +33,27 @@ .equiv EG_REL, 1 .equiv EG_OFF, 0 -.equiv EG_SH, 16 @ 16.16 fixed point (envelope generator timing) +.equiv EG_SH, 16 @ 16.16 fixed point (envelope generator timing) .equiv EG_TIMER_OVERFLOW, (3*(1<= (INT32) SLOT->sl ) + strgeb r3, [r5,#0x17] @ state + b 10f + +4: @ EG_ATT + subs r3, r3, #1 @ eg_inc_val_shift - 1 + mvnpl r2, r0 + movpl r2, r2, lsl r3 + addpl r0, r0, r2, asr #4 + cmp r0, #0 @ if (volume <= MIN_ATT_INDEX) + bgt 10f + ldr r2, [r5,#0x1c] + mov r0, #0 + cmp r2, #0 + movne r3, #EG_DEC + moveq r3, #EG_SUS + strb r3, [r5,#0x17] @ state + b 10f + +1: @ EG_REL + mov r2, #0x200 + cmp r0, r2 @ if ( volume >= 0x200 ) + movge r0, #1024 + subge r0, #1 + movge r3, #EG_OFF + strgeb r3, [r5,#0x17] @ state + +10: @ finish + ldrb r2, [r5,#0x30] @ ssg + ldrb r3, [r5,#0x17] @ state + strh r0, [r5,#0x1a] @ volume + cmp r2, #0x0c @ if ( ssg&0x04 && state > EG_REL ) + cmpge r3, #EG_REL+1 + ldrh r3, [r5,#0x18] @ tl + rsbge r0, r0, #0x200 @ volume = (0x200-volume) & MAX_ATT + lslge r0, r0, #22 + lsrge r0, r0, #22 + +11: +#endif + add r0, r0, r3 @ volume += tl + strh r0, [r5,#0x34] @ vol_out 0: @ EG_OFF .endm +#if defined(SSG_EG) +@ r5=slot, trashes: r0,r2,r3 +.macro update_ssg_eg + ldrh r0, [r5,#0x30] @ ssg+ssgn + ldrb r2, [r5,#0x17] @ state + ldrh r3, [r5,#0x1a] @ volume + tst r0, #0x08 @ ssg enabled && + beq 9f + cmp r2, #EG_REL+1 @ state > EG_REL && + cmpge r3, #0x200 @ volume >= 0x200? 
+ blt 9f + orr r4, r4, #0x10 @ ssg_update + + tst r0, #0x01 + beq 1f + + tst r0, #0x02 + eorne r0, r0, lsr #8 @ ssg ^= ssgn ^ 4 + eorne r0, r0, #0x4 + orrne r0, r0, #0x400 @ ssgn = 4 + strneh r0, [r5,#0x30] + + eor r0, r0, #0x4 @ if ( !(ssg&0x04) ) + tst r0, #0x4 + cmpne r2, #EG_ATT @ if ( state != EG_ATT ) + movne r3, #0x400 + subne r3, r3, #1 + strneh r3, [r5,#0x1a] @ volume = MAX_ATT + b 9f + +1: tst r0, #0x02 + eorne r0, r0, #0x4 @ ssg ^= 4 + eorne r0, r0, #0x400 @ ssgn ^= 4 + strneh r0, [r5,#0x30] + moveq r0, #0 + streq r0, [r5,#0x0c] @ phase = 0 + + cmp r2, #EG_ATT @ if ( state != EG_ATT ) + beq 9f + + ldr r0, [r5,#0x1c] @ sl + mov r2, #EG_SUS @ state = sl==MIN_ATT ? EG_SUS:EG_DEC + cmp r0, #0 + + ldrh r0, [r5,#0x32] @ ar+ksr + movne r2, #EG_DEC + cmp r0, #32+62 @ if ( ar+ksr >= 32+62 ) + movge r3, #0 + strgeh r3, [r5,#0x1a] @ volume = MIN_ATT + bge 9f + + cmp r3, #0 + movgt r2, #EG_ATT + strb r2, [r5,#0x17] @ state +9: +.endm + +@ r5=slot, trashes: r0,r2,r3 +.macro recalc_volout +#if defined(INTERPOL) + ldrh r0, [r5,#0x34] @ vol_out +#endif + ldrb r2, [r5,#0x30] @ ssg + ldrb r3, [r5,#0x17] @ state +#if defined(INTERPOL) + strh r0, [r5,#0x36] @ vol_ipol +#endif + ldrh r0, [r5,#0x1a] @ volume + +@ and r2, r2, #0x0c + cmp r2, #0x0c @ if ( ~ssg&0x0c && state > EG_REL ) + cmpge r3, #EG_REL+1 + ldrh r3, [r5,#0x18] @ tl + rsbge r0, r0, #0x200 @ volume = (0x200-volume) & MAX_ATT + lslge r0, r0, #22 + lsrge r0, r0, #22 + ldrh r0, [r5,#0x1a] @ volume + ldrh r3, [r5,#0x18] @ tl + + add r0, r0, r3 @ volume += tl + strh r0, [r5,#0x34] @ vol_out +.endm +#endif @ r12=lfo_ampm[31:16], r1=lfo_cnt_old, r2=lfo_cnt, r3=scratch .macro advance_lfo_m @@ -138,7 +292,7 @@ beq 0f and r3, r2, #0x3f cmp r2, #0x40 - rsbge r3, r3, #0x3f + eorlt r3, r3, #0x3f bic r12,r12, #0xff000000 @ lfo_ampm &= 0xff orr r12,r12, r3, lsl #1+24 @@ -187,7 +341,7 @@ .endm -@ lr=context, r12=pack (stereo, lastchan, disabled, lfo_enabled | pan_r, pan_l, ams[2] | AMmasks[4] | FB[4] | lfo_ampm[16]) +@ 
lr=context, r12=pack (stereo, ssg_enabled, disabled, lfo_enabled | pan_r, pan_l, ams[2] | AMmasks[4] | FB[4] | lfo_ampm[16]) @ r0-r2=scratch, r3=sin_tab, r5=scratch, r6-r7=vol_out[4], r10=op1_out .macro upd_algo0_m @@ -525,189 +679,8 @@ .endm -/* -.global update_eg_phase @ FM_SLOT *SLOT, UINT32 eg_cnt - -update_eg_phase: - stmfd sp!, {r5,r6} - mov r5, r0 @ slot - ldrh r3, [r5,#0x18] @ tl - ldrh r6, [r5,#0x1a] @ volume - add r6, r6, r3 - update_eg_phase_slot SLOT1 - mov r0, r6 - ldmfd sp!, {r5,r6} - bx lr -.pool - - -.global advance_lfo @ int lfo_ampm, UINT32 lfo_cnt_old, UINT32 lfo_cnt - -advance_lfo: - mov r12, r0, lsl #16 - advance_lfo_m - mov r0, r12, lsr #16 - bx lr -.pool - - -.global upd_algo0 @ chan_rend_context *c -upd_algo0: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_algo0_m - - ldmfd sp!, {r4-r10,pc} -.pool - - -.global upd_algo1 @ chan_rend_context *c -upd_algo1: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_algo1_m - - ldmfd sp!, {r4-r10,pc} -.pool - - -.global upd_algo2 @ chan_rend_context *c -upd_algo2: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_algo2_m - - ldmfd sp!, {r4-r10,pc} -.pool - - -.global upd_algo3 @ chan_rend_context *c -upd_algo3: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_algo3_m - - ldmfd sp!, {r4-r10,pc} -.pool - - -.global upd_algo4 @ chan_rend_context *c -upd_algo4: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_algo4_m - - ldmfd sp!, {r4-r10,pc} -.pool - - -.global upd_algo5 @ 
chan_rend_context *c -upd_algo5: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_algo5_m - - ldmfd sp!, {r4-r10,pc} -.pool - - -.global upd_algo6 @ chan_rend_context *c -upd_algo6: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_algo6_m - - ldmfd sp!, {r4-r10,pc} -.pool - - -.global upd_algo7 @ chan_rend_context *c -upd_algo7: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_algo7_m - - ldmfd sp!, {r4-r10,pc} -.pool - - -.global upd_slot1 @ chan_rend_context *c -upd_slot1: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_slot1_m - str r10, [lr, #0x38] - - ldmfd sp!, {r4-r10,pc} -.pool -*/ - - -@ lr=context, r12=pack (stereo, lastchan, disabled, lfo_enabled | pan_r, pan_l, ams[2] | AMmasks[4] | FB[4] | lfo_ampm[16]) -@ r0-r2=scratch, r3=sin_tab/scratch, r4=(length<<8)|unused[4],was_update,algo[3], r5=tl_tab/slot, +@ lr=context, r12=pack (stereo, ssg_enabled, disabled, lfo_enabled | pan_r, pan_l, ams[2] | AMmasks[4] | FB[4] | lfo_ampm[16]) +@ r0-r2=scratch, r3=sin_tab/scratch, r4=(length<<8)|unused[3],ssg_update,was_update,algo[3], r5=tl_tab/slot, @ r6-r7=vol_out[4], r8=eg_timer, r9=eg_timer_add[31:16], r10=op1_out, r11=buffer .global chan_render_loop @ chan_rend_context *ct, int *buffer, int length @@ -720,17 +693,16 @@ chan_render_loop: mov r11, r1 and r0, r0, #7 orr r4, r4, r0 @ (length<<8)|algo - add r0, lr, #0x44 - ldmia r0, {r8,r9} @ eg_timer, eg_timer_add + ldr r8, [lr, #0x44] @ eg_timer + ldr r9, [lr, #0x48] @ eg_timer_add ldr r10, [lr, #0x54] @ op1_out - ldmia lr, {r6,r7} @ load volumes tst r12, #8 @ lfo? 
beq crl_loop crl_loop_lfo: - add r0, lr, #0x30 - ldmia r0, {r1,r2} + ldr r1, [lr, #0x30] @ lfo_cnt + ldr r2, [lr, #0x34] @ lfo_inc subs r4, r4, #0x100 bmi crl_loop_end @@ -747,32 +719,91 @@ crl_loop: subs r4, r4, #0x100 bmi crl_loop_end + ldr r5, [lr, #0x40] @ CH +#if defined(SSG_EG) + tst r12, #0x02 @ ssg_enabled? + beq ssg_done + @ -- SSG -- + lsl r7, r8, #EG_SH + add r7, r9, r7, lsr #EG_SH + subs r7, r7, #1<>EG_SH)/2 + bne 0f @ mix is vol_out + + ldr r6, [r5, #0x34] @ vol_out, vol_ipol for all slots + ldr r2, [r5, #0x34+SLOT_STRUCT_SIZE*2] + ldr r7, [r5, #0x34+SLOT_STRUCT_SIZE] + ldr r3, [r5, #0x34+SLOT_STRUCT_SIZE*3] + add r6, r6, r6, lsl #16 + lsr r6, r6, #17 + add r2, r2, r2, lsl #16 + lsr r2, r2, #17 + add r7, r7, r7, lsl #16 + lsr r7, r7, #17 + add r3, r3, r3, lsl #16 + lsr r3, r3, #17 + b 1f +#else + @ super-basic... just take value closest to sample point + mov r3, r8, lsr #EG_SH-1 @ eg_timer, [0..3<>EG_SH) +#endif + +0: ldrgeh r6, [r5, #0x34] @ vol_out values for all slots + ldrlth r6, [r5, #0x36] @ vol_ipol values for all slots + ldrgeh r2, [r5, #0x34+SLOT_STRUCT_SIZE*2] + ldrlth r2, [r5, #0x36+SLOT_STRUCT_SIZE*2] + ldrgeh r7, [r5, #0x34+SLOT_STRUCT_SIZE] + ldrlth r7, [r5, #0x36+SLOT_STRUCT_SIZE] + ldrgeh r3, [r5, #0x34+SLOT_STRUCT_SIZE*3] + ldrlth r3, [r5, #0x36+SLOT_STRUCT_SIZE*3] + +#else + ldrh r6, [r5, #0x34] @ vol_out values for all slots + ldrh r2, [r5, #0x34+SLOT_STRUCT_SIZE*2] + ldrh r7, [r5, #0x34+SLOT_STRUCT_SIZE] + ldrh r3, [r5, #0x34+SLOT_STRUCT_SIZE*3] +#endif +1: orr r6, r6, r2, lsl #16 + orr r7, r7, r3, lsl #16 + @ -- SLOT1 -- - ldr r3, =ym_tl_tab + PIC_LDR(r3, r2, ym_tl_tab) - @ lr=context, r12=pack (stereo, lastchan, disabled, lfo_enabled | pan_r, pan_l, ams[2] | AMmasks[4] | FB[4] | lfo_ampm[16]) + @ lr=context, r12=pack (stereo, ssg_enabled, disabled, lfo_enabled | pan_r, pan_l, ams[2] | AMmasks[4] | FB[4] | lfo_ampm[16]) @ r0-r2=scratch, r3=tl_tab, r5=scratch, r6-r7=vol_out[4], r10=op1_out upd_slot1_m @ -- SLOT2+ -- and r0, r4, #7 - 
ldr pc, [pc, r0, lsl #2] + PIC_XB(,r0, lsl #2) nop - .word crl_algo0 - .word crl_algo1 - .word crl_algo2 - .word crl_algo3 - .word crl_algo4 - .word crl_algo5 - .word crl_algo6 - .word crl_algo7 + PIC_BT(crl_algo0) + PIC_BT(crl_algo1) + PIC_BT(crl_algo2) + PIC_BT(crl_algo3) + PIC_BT(crl_algo4) + PIC_BT(crl_algo5) + PIC_BT(crl_algo6) + PIC_BT(crl_algo7) .pool crl_algo0: @@ -860,34 +936,28 @@ crl_algo_done: strne r1, [r11], #4 b crl_do_phase -ctl_sample_skip: - and r1, r12, #1 - add r1, r1, #1 - add r11,r11, r1, lsl #2 - b crl_do_phase - ctl_sample_mono: ldr r1, [r11] add r1, r0, r1 str r1, [r11], #4 + b crl_do_phase + +ctl_sample_skip: + and r1, r12, #1 + add r1, r1, #1 + add r11,r11, r1, lsl #2 crl_do_phase: @ -- PHASE UPDATE -- add r5, lr, #0x10 - ldmia r5, {r0-r1} - add r5, lr, #0x20 - ldmia r5, {r2-r3} - add r5, lr, #0x10 - add r0, r0, r2 - add r1, r1, r3 - stmia r5!,{r0-r1} - ldmia r5, {r0-r1} - add r5, lr, #0x28 - ldmia r5, {r2-r3} - add r5, lr, #0x18 - add r0, r0, r2 - add r1, r1, r3 - stmia r5, {r0-r1} + ldmia r5, {r0-r3,r6-r7} + add r0, r0, r6 + add r1, r1, r7 + ldr r6, [r5, #0x18] + ldr r7, [r5, #0x1c] + add r2, r2, r6 + add r3, r3, r7 + stmia r5, {r0-r3} tst r12, #8 bne crl_loop_lfo @@ -895,7 +965,6 @@ crl_do_phase: crl_loop_end: -@ stmia lr, {r6,r7} @ save volumes (for debug) str r8, [lr, #0x44] @ eg_timer str r12, [lr, #0x4c] @ pack (for lfo_ampm) str r4, [lr, #0x50] @ was_update diff --git a/pico/state.c b/pico/state.c index dc15bc05f..5160ce95a 100644 --- a/pico/state.c +++ b/pico/state.c @@ -254,6 +254,8 @@ static int state_save(void *file) CHECKED_WRITE_BUFF(CHUNK_ZRAM, PicoMem.zram); CHECKED_WRITE_BUFF(CHUNK_CRAM, PicoMem.cram); CHECKED_WRITE_BUFF(CHUNK_MISC, Pico.m); + + PicoVideoSave(); CHECKED_WRITE_BUFF(CHUNK_VIDEO, Pico.video); z80_pack(buff_z80); @@ -437,7 +439,11 @@ static int state_load(void *file) case CHUNK_CRAM: CHECKED_READ_BUFF(PicoMem.cram); break; case CHUNK_VSRAM: CHECKED_READ_BUFF(PicoMem.vsram); break; case CHUNK_MISC: 
CHECKED_READ_BUFF(Pico.m); break; - case CHUNK_VIDEO: CHECKED_READ_BUFF(Pico.video); break; + case CHUNK_VIDEO: + CHECKED_READ_BUFF(Pico.video); + PicoVideoLoad(); + break; + case CHUNK_IOPORTS: CHECKED_READ_BUFF(PicoMem.ioports); break; case CHUNK_PSG: CHECKED_READ2(28*4, sn76496_regs); break; case CHUNK_FM: @@ -563,7 +569,7 @@ static int state_load(void *file) z80_unpack(buff_z80); // due to dep from 68k cycles.. - Pico.t.m68c_aim = Pico.t.m68c_cnt; + Pico.t.m68c_frame_start = Pico.t.m68c_aim = Pico.t.m68c_cnt; if (PicoIn.AHW & PAHW_32X) Pico32xStateLoaded(0); if (PicoIn.AHW & PAHW_MCD) diff --git a/pico/videoport.c b/pico/videoport.c index cd76dc049..f324f704f 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -2,6 +2,7 @@ * PicoDrive * (c) Copyright Dave, 2004 * (C) notaz, 2006-2009 + * (C) kub, 2020 * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. @@ -11,22 +12,330 @@ #define NEED_DMA_SOURCE #include "memory.h" -extern const unsigned char hcounts_32[]; -extern const unsigned char hcounts_40[]; +extern const unsigned char hcounts_32[], hcounts_40[]; +extern const unsigned char vdpcyc2sl_32_bl[], vdpcyc2sl_40_bl[]; +extern const unsigned char vdpcyc2sl_32[], vdpcyc2sl_40[]; +extern const unsigned short vdpsl2cyc_32_bl[], vdpsl2cyc_40_bl[]; +extern const unsigned short vdpsl2cyc_32[], vdpsl2cyc_40[]; +static int blankline; // display disabled for this line + +unsigned SATaddr, SATmask; // VRAM addr of sprite attribute table int (*PicoDmaHook)(unsigned int source, int len, unsigned short **base, unsigned int *mask) = NULL; + +/* VDP FIFO implementation + * + * fifo_slot: last slot executed in this scanline + * fifo_cnt: #slots remaining for active FIFO write (#writes<<#bytep) + * fifo_total: #total FIFO entries pending + * fifo_data: last values transferred through fifo + * fifo_queue: fifo transfer queue (#writes, flags) + * + * FIFO states: empty total=0 + * inuse total>0 && total<4 + * full total==4 
+ * wait total>4 + * Conditions: + * fifo_slot is always behind slot2cyc[cycles]. Advancing it beyond cycles + * implies blocking the 68k up to that slot. + * + * A FIFO write goes to the end of the FIFO queue, but DMA running in background + * is always the last queue entry (transfers by CPU intervene and come 1st). + * There can be more pending writes than FIFO slots, but the CPU will be blocked + * until FIFO level (without background DMA) <= 4. + * This is only about correct timing, data xfer must be handled by the caller. + * Blocking the CPU means burning cycles via SekCyclesBurn*(), which is to be + * executed by the caller. + * + * FIFOSync "executes" FIFO write slots up to the given cycle in the current + * scanline. A queue entry completely executed is removed from the queue. + * FIFOWrite pushes writes to the transfer queue. If it's a blocking write, 68k + * is blocked if more than 4 FIFO writes are pending. + * FIFORead executes a 68k read. 68k is blocked until the next transfer slot. + */ + +// NB code assumes fifo_* arrays have size 2^n +static struct VdpFIFO { // XXX this must go into save file! + // last transferred FIFO data, ...x = index XXX currently only CPU + unsigned short fifo_data[4], fifo_dx; + + // queued FIFO transfers, ...x = index, ...l = queue length + // each entry has 2 values: [n]>>3 = #writes, [n]&7 = flags (FQ_*) + unsigned int fifo_queue[8], fifo_qx, fifo_ql; + unsigned int fifo_total; // total# of pending FIFO entries (w/o BGDMA) + + unsigned short fifo_slot; // last executed slot in current scanline + unsigned short fifo_maxslot;// #slots in scanline + + const unsigned char *fifo_cyc2sl; + const unsigned short *fifo_sl2cyc; +} VdpFIFO; + +enum { FQ_BYTE = 1, FQ_BGDMA = 2, FQ_FGDMA = 4 }; // queue flags, NB: BYTE = 1! 
+ +// do the FIFO math +static __inline int AdvanceFIFOEntry(struct VdpFIFO *vf, struct PicoVideo *pv, int slots) +{ + int l = slots, b = vf->fifo_queue[vf->fifo_qx] & FQ_BYTE; + int cnt = pv->fifo_cnt; + + // advance currently active FIFO entry + if (l > cnt) + l = cnt; + if (!(vf->fifo_queue[vf->fifo_qx] & FQ_BGDMA)) + vf->fifo_total -= ((cnt & b) + l) >> b; + cnt -= l; + + // if entry has been processed... + if (cnt == 0) { + // remove entry from FIFO + if (vf->fifo_ql) { + vf->fifo_queue[vf->fifo_qx] = 0; + vf->fifo_qx = (vf->fifo_qx+1) & 7, vf->fifo_ql --; + } + // start processing for next entry if there is one + if (vf->fifo_ql) { + b = vf->fifo_queue[vf->fifo_qx] & FQ_BYTE; + cnt = (vf->fifo_queue[vf->fifo_qx] >> 3) << b; + } else { // FIFO empty + pv->status &= ~PVS_FIFORUN; + vf->fifo_total = 0; + } + } + + pv->fifo_cnt = cnt; + return l; +} + +static __inline void SetFIFOState(struct VdpFIFO *vf, struct PicoVideo *pv) +{ + unsigned int st = pv->status, cmd = pv->command; + // release CPU and terminate DMA if FIFO isn't blocking the 68k anymore + if (vf->fifo_total <= 4) { + st &= ~PVS_CPUWR; + if (!(st & (PVS_DMABG|PVS_DMAFILL))) { + st &= ~SR_DMA; + cmd &= ~0x80; + } + } + if (pv->fifo_cnt == 0) { + st &= ~PVS_CPURD; + // terminate DMA if applicable + if (!(st & (PVS_FIFORUN|PVS_DMAFILL))) { + st &= ~(SR_DMA|PVS_DMABG); + cmd &= ~0x80; + } + } + pv->status = st; + pv->command = cmd; +} + +// sync FIFO to cycles +void PicoVideoFIFOSync(int cycles) +{ + struct VdpFIFO *vf = &VdpFIFO; + struct PicoVideo *pv = &Pico.video; + int slots, done; + + // calculate #slots since last executed slot + slots = vf->fifo_cyc2sl[cycles>>1] - vf->fifo_slot; + + // advance FIFO queue by #done slots + done = slots; + while (done > 0 && pv->fifo_cnt) { + int l = AdvanceFIFOEntry(vf, pv, done); + vf->fifo_slot += l; + done -= l; + } + + if (done != slots) + SetFIFOState(vf, pv); +} + +// drain FIFO, blocking 68k on the way. FIFO must be synced prior to drain. 
+static int PicoVideoFIFODrain(int level, int cycles, int bgdma) +{ + struct VdpFIFO *vf = &VdpFIFO; + struct PicoVideo *pv = &Pico.video; + unsigned ocyc = cycles; + int burn = 0; +//int osl = fifo_slot; + + // process FIFO entries until low level is reached + while (vf->fifo_slot <= vf->fifo_maxslot && cycles < 488 && + ((vf->fifo_total > level) | (vf->fifo_queue[vf->fifo_qx] & bgdma))) { + int b = vf->fifo_queue[vf->fifo_qx] & FQ_BYTE; + int cnt = bgdma ? pv->fifo_cnt : ((vf->fifo_total-level)<<b) - (pv->fifo_cnt&b); + int slot = (pv->fifo_cnt<cnt?pv->fifo_cnt:cnt) + vf->fifo_slot; + + if (slot > vf->fifo_maxslot) { + // target slot in later scanline, advance to eol + slot = vf->fifo_maxslot; + cycles = 488; + } else { + // advance FIFO to target slot and CPU to cycles at that slot + cycles = vf->fifo_sl2cyc[slot]<<1; + } + if (slot > vf->fifo_slot) { + AdvanceFIFOEntry(vf, pv, slot - vf->fifo_slot); + vf->fifo_slot = slot; + } + } + if (cycles > ocyc) + burn = cycles - ocyc; + + SetFIFOState(vf, pv); + + return burn; +} + +// read VDP data port +static int PicoVideoFIFORead(void) +{ + struct VdpFIFO *vf = &VdpFIFO; + struct PicoVideo *pv = &Pico.video; + int lc = SekCyclesDone()-Pico.t.m68c_line_start; + int burn = 0; + + if (pv->fifo_cnt) { + PicoVideoFIFOSync(lc); + // advance FIFO and CPU until FIFO is empty + burn = PicoVideoFIFODrain(0, lc, FQ_BGDMA); + lc += burn; + } + + if (pv->fifo_cnt) + pv->status |= PVS_CPURD; // target slot is in later scanline + else { + // use next VDP access slot for reading, block 68k until then + vf->fifo_slot = vf->fifo_cyc2sl[lc>>1] + 1; + burn += (vf->fifo_sl2cyc[vf->fifo_slot]<<1) - lc; + } + + return burn; +} + +// write VDP data port +int PicoVideoFIFOWrite(int count, int flags, unsigned sr_mask,unsigned sr_flags) +{ + struct VdpFIFO *vf = &VdpFIFO; + struct PicoVideo *pv = &Pico.video; + int lc = SekCyclesDone()-Pico.t.m68c_line_start; + int burn = 0; + + if (pv->fifo_cnt) + PicoVideoFIFOSync(lc); + pv->status = (pv->status & ~sr_mask) |
sr_flags; + + if (count && vf->fifo_ql < 8) { + // determine queue position for entry + int x = (vf->fifo_qx + vf->fifo_ql - 1) & 7; + if (unlikely(vf->fifo_queue[x] & FQ_BGDMA)) { + // CPU FIFO writes have priority over a background DMA Fill/Copy + // XXX if interrupting a DMA fill, fill data changes + if (x == vf->fifo_qx) { // overtaking to queue head? + int f = vf->fifo_queue[x] & 7; + vf->fifo_queue[(x+1) & 7] = (pv->fifo_cnt >> (f & FQ_BYTE) << 3) | f; + pv->status &= ~PVS_FIFORUN; + } else + // push background DMA back + vf->fifo_queue[(x+1) & 7] = vf->fifo_queue[x]; + x = (x-1) & 7; + } + + if ((pv->status & PVS_FIFORUN) && (vf->fifo_queue[x] & 7) == flags) { + // amalgamate entries if of same type + vf->fifo_queue[x] += (count << 3); + if (x == vf->fifo_qx) + pv->fifo_cnt += count << (flags & FQ_BYTE); + } else { + // create new xfer queue entry + vf->fifo_ql ++; + x = (x+1) & 7; + vf->fifo_queue[x] = (count << 3) | flags; + } + + // update FIFO state if it was empty + if (!(pv->status & PVS_FIFORUN)) { + vf->fifo_slot = vf->fifo_cyc2sl[(lc+8)>>1]; // FIFO latency ~3 vdp slots + pv->status |= PVS_FIFORUN; + pv->fifo_cnt = count << (flags & FQ_BYTE); + } + if (!(flags & FQ_BGDMA)) + vf->fifo_total += count; + } + + // if CPU is waiting for the bus, advance CPU and FIFO until bus is free + if (pv->status & PVS_CPUWR) + burn = PicoVideoFIFODrain(4, lc, 0); + + return burn; +} + +// at HINT, advance FIFO to new scanline +int PicoVideoFIFOHint(void) +{ + struct VdpFIFO *vf = &VdpFIFO; + struct PicoVideo *pv = &Pico.video; + int burn = 0; + + // reset slot to start of scanline + vf->fifo_slot = 0; + + // if CPU is waiting for the bus, advance CPU and FIFO until bus is free + if (pv->status & PVS_CPUWR) + burn = PicoVideoFIFOWrite(0, 0, 0, 0); + else if (pv->status & PVS_CPURD) + burn = PicoVideoFIFORead(); + + return burn; +} + +// switch FIFO mode between active/inactive display +void PicoVideoFIFOMode(int active, int h40) +{ + static const unsigned char 
*vdpcyc2sl[2][2] = + { {vdpcyc2sl_32_bl, vdpcyc2sl_40_bl} , {vdpcyc2sl_32, vdpcyc2sl_40} }; + static const unsigned short *vdpsl2cyc[2][2] = + { {vdpsl2cyc_32_bl, vdpsl2cyc_40_bl} , {vdpsl2cyc_32, vdpsl2cyc_40} }; + + struct VdpFIFO *vf = &VdpFIFO; + struct PicoVideo *pv = &Pico.video; + int lc = SekCyclesDone() - Pico.t.m68c_line_start; + active = active && !(pv->status & PVS_VB2); + + if (vf->fifo_maxslot) + PicoVideoFIFOSync(lc); + + vf->fifo_cyc2sl = vdpcyc2sl[active][h40]; + vf->fifo_sl2cyc = vdpsl2cyc[active][h40]; + // recalculate FIFO slot for new mode + vf->fifo_slot = vf->fifo_cyc2sl[lc>>1]-1; + vf->fifo_maxslot = vf->fifo_cyc2sl[488>>1]; +} + + +// VDP memory rd/wr + static __inline void AutoIncrement(void) { Pico.video.addr=(unsigned short)(Pico.video.addr+Pico.video.reg[0xf]); + if (Pico.video.addr < Pico.video.reg[0xf]) Pico.video.addr_u ^= 1; } -static NOINLINE void VideoWrite128(u32 a, u16 d) +static NOINLINE void VideoWriteVRAM128(u32 a, u16 d) { // nasty - a = ((a & 2) >> 1) | ((a & 0x400) >> 9) | (a & 0x3FC) | ((a & 0x1F800) >> 1); - ((u8 *)PicoMem.vram)[a] = d; + u32 b = ((a & 2) >> 1) | ((a & 0x400) >> 9) | (a & 0x3FC) | ((a & 0x1F800) >> 1); + + ((u8 *)PicoMem.vram)[b] = d; + if (!((u16)(b^SATaddr) & SATmask)) + Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; + + if (!((u16)(a^SATaddr) & SATmask)) + UpdateSAT(a, d); } static void VideoWrite(u16 d) @@ -37,17 +346,16 @@ static void VideoWrite(u16 d) { case 1: if (a & 1) d = (u16)((d << 8) | (d >> 8)); - PicoMem.vram [(a >> 1) & 0x7fff] = d; - if (a - ((unsigned)(Pico.video.reg[5]&0x7f) << 9) < 0x400) - Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; + a |= Pico.video.addr_u << 16; + VideoWriteVRAM(a, d); break; - case 3: Pico.m.dirtyPal = 1; - PicoMem.cram [(a >> 1) & 0x3f] = d; break; - case 5: PicoMem.vsram[(a >> 1) & 0x3f] = d; break; + case 3: if (PicoMem.cram [(a >> 1) & 0x3f] != d) Pico.m.dirtyPal = 1; + PicoMem.cram [(a >> 1) & 0x3f] = d & 0xeee; break; + case 5: PicoMem.vsram[(a >> 1) & 0x3f] = 
d & 0x7ff; break; case 0x81: - a |= Pico.video.addr_u << 16; - VideoWrite128(a, d); - break; + a |= Pico.video.addr_u << 16; + VideoWriteVRAM128(a, d); + break; //default:elprintf(EL_ANOMALY, "VDP write %04x with bad type %i", d, Pico.video.type); break; } @@ -56,15 +364,19 @@ static void VideoWrite(u16 d) static unsigned int VideoRead(void) { - unsigned int a=0,d=0; + unsigned int a, d = VdpFIFO.fifo_data[(VdpFIFO.fifo_dx+1)&3]; a=Pico.video.addr; a>>=1; + SekCyclesBurnRun(PicoVideoFIFORead()); switch (Pico.video.type) { case 0: d=PicoMem.vram [a & 0x7fff]; break; - case 8: d=PicoMem.cram [a & 0x003f]; break; - case 4: d=PicoMem.vsram[a & 0x003f]; break; + case 8: d=PicoMem.cram [a & 0x003f] | (d & ~0x0eee); break; + case 4: if ((a & 0x3f) >= 0x28) a = 0; + d=PicoMem.vsram [a & 0x003f] | (d & ~0x07ff); break; + case 12:a=PicoMem.vram [a & 0x7fff]; if (Pico.video.addr&1) a >>= 8; + d=(a & 0x00ff) | (d & ~0x00ff); break; default:elprintf(EL_ANOMALY, "VDP read with bad type %i", Pico.video.type); break; } @@ -72,6 +384,8 @@ static unsigned int VideoRead(void) return d; } +// VDP DMA + static int GetDmaLength(void) { struct PicoVideo *pvid=&Pico.video; @@ -86,18 +400,16 @@ static int GetDmaLength(void) static void DmaSlow(int len, unsigned int source) { u32 inc = Pico.video.reg[0xf]; - u32 a = Pico.video.addr; + u32 a = Pico.video.addr | (Pico.video.addr_u << 16); u16 *r, *base = NULL; u32 mask = 0x1ffff; elprintf(EL_VDPDMA, "DmaSlow[%i] %06x->%04x len %i inc=%i blank %i [%u] @ %06x", - Pico.video.type, source, a, len, inc, (Pico.video.status&8)||!(Pico.video.reg[1]&0x40), + Pico.video.type, source, a, len, inc, (Pico.video.status&SR_VB)||!(Pico.video.reg[1]&0x40), SekCyclesDone(), SekPc); - Pico.m.dma_xfers = len; - if (Pico.m.dma_xfers < len) // lame 16bit var - Pico.m.dma_xfers = ~0; - SekCyclesBurnRun(CheckDMA()); + SekCyclesBurnRun(PicoVideoFIFOWrite(len, FQ_FGDMA | (Pico.video.type == 1), + PVS_DMABG, SR_DMA | PVS_CPUWR)); if ((source & 0xe00000) == 0xe00000) { 
// Ram base = (u16 *)PicoMem.ram; @@ -149,25 +461,23 @@ static void DmaSlow(int len, unsigned int source) { case 1: // vram r = PicoMem.vram; - if (inc == 2 && !(a & 1) && a + len * 2 < 0x10000 - && !(((source + len - 1) ^ source) & ~mask)) + if (inc == 2 && !(a & 1) && (a & ~0xffff) == ((a + len*2-1) & ~0xffff) && + ((a >= SATaddr+0x280) | ((a + len*2-1) < SATaddr)) && + (source & ~mask) == ((source + len-1) & ~mask)) { // most used DMA mode memcpy((char *)r + a, base + (source & mask), len * 2); a += len * 2; + break; } - else + for(; len; len--) { - for(; len; len--) - { - u16 d = base[source++ & mask]; - if(a & 1) d=(d<<8)|(d>>8); - r[a >> 1] = d; - // AutoIncrement - a = (u16)(a + inc); - } + u16 d = base[source++ & mask]; + if(a & 1) d=(d<<8)|(d>>8); + VideoWriteVRAM(a, d); + // AutoIncrement + a = (a+inc) & ~0x20000; } - Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; break; case 3: // cram @@ -175,9 +485,9 @@ static void DmaSlow(int len, unsigned int source) r = PicoMem.cram; for (; len; len--) { - r[(a / 2) & 0x3f] = base[source++ & mask]; + r[(a / 2) & 0x3f] = base[source++ & mask] & 0xeee; // AutoIncrement - a += inc; + a = (a+inc) & ~0x20000; } break; @@ -185,21 +495,20 @@ static void DmaSlow(int len, unsigned int source) r = PicoMem.vsram; for (; len; len--) { - r[(a / 2) & 0x3f] = base[source++ & mask]; + r[(a / 2) & 0x3f] = base[source++ & mask] & 0x7ff; // AutoIncrement - a += inc; + a = (a+inc) & ~0x20000; } break; case 0x81: // vram 128k - a |= Pico.video.addr_u << 16; for(; len; len--) { - VideoWrite128(a, base[source++ & mask]); + u16 d = base[source++ & mask]; + VideoWriteVRAM128(a, d); // AutoIncrement - a = (a + inc) & 0x1ffff; + a = (a+inc) & ~0x20000; } - Pico.video.addr_u = a >> 16; break; default: @@ -208,39 +517,41 @@ static void DmaSlow(int len, unsigned int source) break; } // remember addr - Pico.video.addr=(u16)a; + Pico.video.addr = a; + Pico.video.addr_u = a >> 16; } static void DmaCopy(int len) { - u16 a = Pico.video.addr; + u32 a = 
Pico.video.addr | (Pico.video.addr_u << 16); u8 *vr = (u8 *)PicoMem.vram; u8 inc = Pico.video.reg[0xf]; int source; elprintf(EL_VDPDMA, "DmaCopy len %i [%u]", len, SekCyclesDone()); - Pico.m.dma_xfers = len; - if (Pico.m.dma_xfers < len) - Pico.m.dma_xfers = ~0; - Pico.video.status |= SR_DMA; + // XXX implement VRAM 128k? Is this even working? xfer/count still FQ_BYTE? + SekCyclesBurnRun(PicoVideoFIFOWrite(len, FQ_BGDMA | FQ_BYTE, + PVS_CPUWR, SR_DMA | PVS_DMABG)); source =Pico.video.reg[0x15]; source|=Pico.video.reg[0x16]<<8; for (; len; len--) { - vr[a] = vr[source++ & 0xffff]; + vr[(u16)a] = vr[(u16)(source++)]; + if (!((u16)(a^SATaddr) & SATmask)) + UpdateSAT(a, ((u16 *)vr)[(u16)a >> 1]); // AutoIncrement - a=(u16)(a+inc); + a = (a+inc) & ~0x20000; } // remember addr - Pico.video.addr=a; - Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; + Pico.video.addr = a; + Pico.video.addr_u = a >> 16; } static NOINLINE void DmaFill(int data) { - u16 a = Pico.video.addr; + u32 a = Pico.video.addr | (Pico.video.addr_u << 16); u8 *vr = (u8 *)PicoMem.vram; u8 high = (u8)(data >> 8); u8 inc = Pico.video.reg[0xf]; @@ -250,30 +561,59 @@ static NOINLINE void DmaFill(int data) len = GetDmaLength(); elprintf(EL_VDPDMA, "DmaFill len %i inc %i [%u]", len, inc, SekCyclesDone()); - Pico.m.dma_xfers = len; - if (Pico.m.dma_xfers < len) // lame 16bit var - Pico.m.dma_xfers = ~0; - Pico.video.status |= SR_DMA; + SekCyclesBurnRun(PicoVideoFIFOWrite(len, FQ_BGDMA | (Pico.video.type == 1), + PVS_CPUWR | PVS_DMAFILL, SR_DMA | PVS_DMABG)); switch (Pico.video.type) { case 1: // vram + if (inc == 1 && (a & ~0xffff) == ((a + len-1) & ~0xffff) && + ((a >= SATaddr+0x280) | ((a + len-1) < SATaddr))) + { + // most used DMA mode + memset(vr + (u16)a, high, len); + a += len; + break; + } for (l = len; l; l--) { // Write upper byte to adjacent address // (here we are byteswapped, so address is already 'adjacent') - vr[a] = high; + vr[(u16)a] = high; + if (!((u16)(a^SATaddr) & SATmask)) + UpdateSAT(a, ((u16 
*)vr)[(u16)a >> 1]); // Increment address register - a = (u16)(a + inc); + a = (a+inc) & ~0x20000; } break; case 3: // cram + Pico.m.dirtyPal = 1; + data &= 0xeee; + for (l = len; l; l--) { + PicoMem.cram[(a/2) & 0x3f] = data; + + // Increment address register + a = (a+inc) & ~0x20000; + } + break; case 5: { // vsram - // TODO: needs fifo; anyone using these? - static int once; - if (!once++) - elprintf(EL_STATUS|EL_ANOMALY|EL_VDPDMA, "TODO: cram/vsram fill"); + data &= 0x7ff; + for (l = len; l; l--) { + PicoMem.vsram[(a/2) & 0x3f] = data; + + // Increment address register + a = (a+inc) & ~0x20000; + } + break; } + case 0x81: // vram 128k + for (l = len; l; l--) { + VideoWriteVRAM128(a, data); + + // Increment address register + a = (a+inc) & ~0x20000; + } + break; default: a += len * inc; break; @@ -281,6 +621,7 @@ static NOINLINE void DmaFill(int data) // remember addr Pico.video.addr = a; + Pico.video.addr_u = a >> 16; // register update Pico.video.reg[0x13] = Pico.video.reg[0x14] = 0; source = Pico.video.reg[0x15]; @@ -289,20 +630,23 @@ static NOINLINE void DmaFill(int data) Pico.video.reg[0x15] = source; Pico.video.reg[0x16] = source >> 8; - Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; } +// VDP command handling + static NOINLINE void CommandDma(void) { struct PicoVideo *pvid=&Pico.video; u32 len, method; u32 source; - if ((pvid->reg[1]&0x10)==0) return; // DMA not enabled - - if (Pico.m.dma_xfers) + PicoVideoFIFOSync(SekCyclesDone()-Pico.t.m68c_line_start); + if (pvid->status & SR_DMA) { elprintf(EL_VDPDMA, "Dma overlap, left=%d @ %06x", - Pico.m.dma_xfers, SekPc); + VdpFIFO.fifo_total, SekPc); + pvid->fifo_cnt = VdpFIFO.fifo_total = VdpFIFO.fifo_ql = 0; + pvid->status &= ~(PVS_FIFORUN|PVS_DMAFILL); + } len = GetDmaLength(); source =Pico.video.reg[0x15]; @@ -314,18 +658,18 @@ static NOINLINE void CommandDma(void) DmaSlow(len, source << 1); // 68000 to VDP else if (method == 3) DmaCopy(len); // VRAM Copy - else + else { + pvid->status |= SR_DMA|PVS_DMAFILL; 
return; - + } source += len; Pico.video.reg[0x13] = Pico.video.reg[0x14] = 0; Pico.video.reg[0x15] = source; Pico.video.reg[0x16] = source >> 8; } -static NOINLINE void CommandChange(void) +static NOINLINE void CommandChange(struct PicoVideo *pvid) { - struct PicoVideo *pvid = &Pico.video; unsigned int cmd, addr; cmd = pvid->command; @@ -342,12 +686,21 @@ static NOINLINE void CommandChange(void) pvid->addr_u = (u8)((cmd >> 2) & 1); } -static void DrawSync(int blank_on) +// VDP interface + +static void DrawSync(int skip) { - if (Pico.m.scanline < 224 && !(PicoIn.opt & POPT_ALT_RENDERER) && - !PicoIn.skipFrame && Pico.est.DrawScanline <= Pico.m.scanline) { + int lines = Pico.video.reg[1]&0x08 ? 240 : 224; + int last = Pico.m.scanline - (skip || blankline == Pico.m.scanline); + + if (last < lines && !(PicoIn.opt & POPT_ALT_RENDERER) && + !PicoIn.skipFrame && Pico.est.DrawScanline <= last) { //elprintf(EL_ANOMALY, "sync"); - PicoDrawSync(Pico.m.scanline, blank_on); + if (blankline >= 0 && blankline < last) { + PicoDrawSync(blankline, 1); + blankline = -1; + } + PicoDrawSync(last, 0); } } @@ -363,44 +716,48 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) { case 0x00: // Data port 0 or 2 // try avoiding the sync.. - if (Pico.m.scanline < 224 && (pvid->reg[1]&0x40) && + if (Pico.m.scanline < (pvid->reg[1]&0x08 ? 240 : 224) && (pvid->reg[1]&0x40) && !(!pvid->pending && - ((pvid->command & 0xc00000f0) == 0x40000010 && PicoMem.vsram[pvid->addr>>1] == d)) + ((pvid->command & 0xc00000f0) == 0x40000010 && PicoMem.vsram[pvid->addr>>1] == (d & 0x7ff))) ) - DrawSync(0); + DrawSync(0); // XXX it's unclear when vscroll data is fetched from vsram? if (pvid->pending) { - CommandChange(); + CommandChange(pvid); pvid->pending=0; } - if (!(pvid->status & SR_VB) && !(PicoIn.opt&POPT_DIS_VDP_FIFO)) + if (!(PicoIn.opt&POPT_DIS_VDP_FIFO)) { - int use = pvid->type == 1 ? 
2 : 1; - pvid->lwrite_cnt -= use; - if (pvid->lwrite_cnt < 0) - SekCyclesLeft = 0; - elprintf(EL_ASVDP, "VDP data write: [%04x] %04x [%u] {%i} #%i @ %06x", - Pico.video.addr, d, SekCyclesDone(), Pico.video.type, pvid->lwrite_cnt, SekPc); + VdpFIFO.fifo_data[++VdpFIFO.fifo_dx&3] = d; + SekCyclesBurnRun(PicoVideoFIFOWrite(1, pvid->type == 1, 0, PVS_CPUWR)); + + elprintf(EL_ASVDP, "VDP data write: [%04x] %04x [%u] {%i} @ %06x", + Pico.video.addr, d, SekCyclesDone(), Pico.video.type, SekPc); } VideoWrite(d); - if ((pvid->command&0x80) && (pvid->reg[1]&0x10) && (pvid->reg[0x17]>>6)==2) - DmaFill(d); + // start DMA fill on write. NB VSRAM and CRAM fills use wrong FIFO data. + if (pvid->status & PVS_DMAFILL) + DmaFill(VdpFIFO.fifo_data[(VdpFIFO.fifo_dx + !!(pvid->type&~0x81))&3]); break; case 0x04: // Control (command) port 4 or 6 + if (pvid->status & SR_DMA) + SekCyclesBurnRun(PicoVideoFIFORead()); // kludge, flush out running DMA if (pvid->pending) { // Low word of command: + if (!(pvid->reg[1]&0x10)) + d = (d&~0x80)|(pvid->command&0x80); pvid->command &= 0xffff0000; pvid->command |= d; pvid->pending = 0; - CommandChange(); + CommandChange(pvid); // Check for dma: if (d & 0x80) { - DrawSync(0); + DrawSync(SekCyclesDone() - Pico.t.m68c_line_start <= 488-390); CommandDma(); } } @@ -411,16 +768,23 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) // Register write: int num=(d>>8)&0x1f; int dold=pvid->reg[num]; - int blank_on = 0; pvid->type=0; // register writes clear command (else no Sega logo in Golden Axe II) if (num > 0x0a && !(pvid->reg[1]&4)) { elprintf(EL_ANOMALY, "%02x written to reg %02x in SMS mode @ %06x", d, num, SekPc); return; } - if (num == 1 && !(d&0x40) && SekCyclesDone() - Pico.t.m68c_line_start <= 488-390) - blank_on = 1; - DrawSync(blank_on); + if (num == 0 && !(pvid->reg[0]&2) && (d&2)) + pvid->hv_latch = PicoVideoRead(0x08); + if (num == 1 && ((pvid->reg[1]^d)&0x40)) { + PicoVideoFIFOMode(d & 0x40, pvid->reg[12]&1); + // handle 
line blanking before line rendering + if (SekCyclesDone() - Pico.t.m68c_line_start <= 488-390) + blankline = d&0x40 ? -1 : Pico.m.scanline; + } + if (num == 12 && ((pvid->reg[12]^d)&0x01)) + PicoVideoFIFOMode(pvid->reg[1]&0x40, d & 1); + DrawSync(SekCyclesDone() - Pico.t.m68c_line_start <= 488-390); pvid->reg[num]=(unsigned char)d; switch (num) { @@ -436,14 +800,21 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) pvid->status |= ((d >> 3) ^ SR_VB) & SR_VB; // forced blanking goto update_irq; case 0x05: - //elprintf(EL_STATUS, "spritep moved to %04x", (unsigned)(Pico.video.reg[5]&0x7f) << 9); + case 0x06: if (d^dold) Pico.est.rendstatus |= PDRAW_SPRITES_MOVED; break; case 0x0c: // renderers should update their palettes if sh/hi mode is changed - if ((d^dold)&8) Pico.m.dirtyPal = 2; + if ((d^dold)&8) Pico.m.dirtyPal = 1; break; + default: + return; } + SATaddr = ((pvid->reg[5]&0x7f) << 9) | ((pvid->reg[6]&0x20) << 11); + SATmask = ~0x1ff; + if (Pico.video.reg[12]&1) + SATaddr &= ~0x200, SATmask &= ~0x200; // H40, zero lowest SAT bit + //elprintf(EL_STATUS, "spritep moved to %04x", SATaddr); return; update_irq: @@ -503,13 +874,21 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) } } -static u32 SrLow(const struct PicoVideo *pv) +static u32 VideoSr(const struct PicoVideo *pv) { unsigned int c, d = pv->status; + unsigned int hp = pv->reg[12]&1 ? 15*488/210+1 : 15*488/171+1; // HBLANK start + unsigned int hl = pv->reg[12]&1 ? 
37*488/210+1 : 28*488/171+1; // HBLANK len - c = SekCyclesDone() - Pico.t.m68c_line_start - 39; - if (c < 92) + c = SekCyclesDone() - Pico.t.m68c_line_start; + if (c - hp < hl) d |= SR_HB; + + PicoVideoFIFOSync(c); + if (VdpFIFO.fifo_total >= 4) + d |= SR_FULL; + else if (!VdpFIFO.fifo_total) + d |= SR_EMPT; return d; } @@ -520,8 +899,11 @@ PICO_INTERNAL_ASM unsigned int PicoVideoRead(unsigned int a) if (a == 0x04) // control port { struct PicoVideo *pv = &Pico.video; - unsigned int d = SrLow(pv); - pv->pending = 0; + unsigned int d = VideoSr(pv); + if (pv->pending) { + CommandChange(pv); + pv->pending = 0; + } elprintf(EL_SR, "SR read: %04x [%u] @ %06x", d, SekCyclesDone(), SekPc); return d; } @@ -546,12 +928,14 @@ PICO_INTERNAL_ASM unsigned int PicoVideoRead(unsigned int a) unsigned int d; d = (SekCyclesDone() - Pico.t.m68c_line_start) & 0x1ff; // FIXME - if (Pico.video.reg[12]&1) - d = hcounts_40[d]; - else d = hcounts_32[d]; + if (Pico.video.reg[0]&2) + d = Pico.video.hv_latch; + else if (Pico.video.reg[12]&1) + d = hcounts_40[d/2] | (Pico.video.v_counter << 8); + else d = hcounts_32[d/2] | (Pico.video.v_counter << 8); elprintf(EL_HVCNT, "hv: %02x %02x [%u] @ %06x", d, Pico.video.v_counter, SekCyclesDone(), SekPc); - return d | (Pico.video.v_counter << 8); + return d; } if (a==0x00) // data port @@ -574,16 +958,24 @@ unsigned char PicoVideoRead8DataL(void) unsigned char PicoVideoRead8CtlH(void) { - u8 d = (u8)(Pico.video.status >> 8); - Pico.video.pending = 0; + struct PicoVideo *pv = &Pico.video; + u8 d = VideoSr(pv) >> 8; + if (pv->pending) { + CommandChange(pv); + pv->pending = 0; + } elprintf(EL_SR, "SR read (h): %02x @ %06x", d, SekPc); return d; } unsigned char PicoVideoRead8CtlL(void) { - u8 d = SrLow(&Pico.video); - Pico.video.pending = 0; + struct PicoVideo *pv = &Pico.video; + u8 d = VideoSr(pv); + if (pv->pending) { + CommandChange(pv); + pv->pending = 0; + } elprintf(EL_SR, "SR read (l): %02x @ %06x", d, SekPc); return d; } @@ -598,11 +990,50 @@ 
unsigned char PicoVideoRead8HV_H(void) unsigned char PicoVideoRead8HV_L(void) { u32 d = (SekCyclesDone() - Pico.t.m68c_line_start) & 0x1ff; // FIXME - if (Pico.video.reg[12]&1) - d = hcounts_40[d]; - else d = hcounts_32[d]; + if (Pico.video.reg[0]&2) + d = Pico.video.hv_latch; + else if (Pico.video.reg[12]&1) + d = hcounts_40[d/2]; + else d = hcounts_32[d/2]; elprintf(EL_HVCNT, "hcounter: %02x [%u] @ %06x", d, SekCyclesDone(), SekPc); return d; } +void PicoVideoSave(void) +{ + struct VdpFIFO *vf = &VdpFIFO; + struct PicoVideo *pv = &Pico.video; + int l, x; + + // account for all outstanding xfers XXX kludge, entry attr's not saved + for (l = vf->fifo_ql, x = vf->fifo_qx + l-1; l > 1; l--, x--) + pv->fifo_cnt += (vf->fifo_queue[x&7] >> 3) << (vf->fifo_queue[x&7] & FQ_BYTE); +} + +void PicoVideoLoad(void) +{ + struct VdpFIFO *vf = &VdpFIFO; + struct PicoVideo *pv = &Pico.video; + int l; + + // convert former dma_xfers (why was this in PicoMisc anyway?) + if (Pico.m.dma_xfers) { + pv->status = SR_DMA|PVS_FIFORUN; + pv->fifo_cnt = Pico.m.dma_xfers * (pv->type == 1 ? 
2 : 1); + vf->fifo_total = Pico.m.dma_xfers; + Pico.m.dma_xfers = 0; + } + + SATaddr = ((pv->reg[5]&0x7f) << 9) | ((pv->reg[6]&0x20) << 11); + SATmask = ~0x1ff; + if (pv->reg[12]&1) + SATaddr &= ~0x200, SATmask &= ~0x200; // H40, zero lowest SAT bit + + // rebuild SAT cache XXX wrong since cache and memory can differ + for (l = 0; l < 80; l++) { + *((u16 *)VdpSATCache + 2*l ) = PicoMem.vram[(SATaddr>>1) + l*4 ]; + *((u16 *)VdpSATCache + 2*l+1) = PicoMem.vram[(SATaddr>>1) + l*4 + 1]; + } +} + // vim:shiftwidth=2:ts=2:expandtab diff --git a/platform/common/arm_utils.s b/platform/common/arm_utils.s index 9e8d9f250..6696e5afe 100644 --- a/platform/common/arm_utils.s +++ b/platform/common/arm_utils.s @@ -141,6 +141,7 @@ vidcpy_m2: movne lr, #64 tstne r3, r3 addne r0, r0, #32 + addne r1, r1, #32 vidCpyM2_loop_out: mov r6, #10 diff --git a/platform/common/common.mak b/platform/common/common.mak index 51eaa30f5..3c9ff81d4 100644 --- a/platform/common/common.mak +++ b/platform/common/common.mak @@ -9,6 +9,8 @@ asm_render = 0 asm_ym2612 = 0 asm_misc = 0 asm_cdmemory = 0 +asm_32xdraw = 0 +asm_32xmemory = 0 asm_mix = 0 endif @@ -40,6 +42,10 @@ ifeq "$(pprof)" "1" DEFINES += PPROF SRCS_COMMON += $(R)platform/linux/pprof.c endif +ifeq "$(gperf)" "1" +DEFINES += GPERF +LDFLAGS += -lprofiler -lstdc++ +endif # ARM asm stuff ifeq "$(ARCH)" "arm" @@ -53,7 +59,7 @@ SRCS_COMMON += $(R)pico/memory_arm.S endif ifeq "$(asm_ym2612)" "1" DEFINES += _ASM_YM2612_C -SRCS_COMMON += $(R)pico/sound/ym2612_arm.s +SRCS_COMMON += $(R)pico/sound/ym2612_arm.S endif ifeq "$(asm_misc)" "1" DEFINES += _ASM_MISC_C @@ -66,7 +72,11 @@ SRCS_COMMON += $(R)pico/cd/memory_arm.S endif ifeq "$(asm_32xdraw)" "1" DEFINES += _ASM_32X_DRAW -SRCS_COMMON += $(R)pico/32x/draw_arm.s +SRCS_COMMON += $(R)pico/32x/draw_arm.S +endif +ifeq "$(asm_32xmemory)" "1" +DEFINES += _ASM_32X_MEMORY_C +SRCS_COMMON += $(R)pico/32x/memory_arm.s endif ifeq "$(asm_mix)" "1" SRCS_COMMON += $(R)pico/sound/mix_arm.S @@ -138,7 +148,7 @@ endif 
# --- Z80 --- ifeq "$(use_drz80)" "1" DEFINES += _USE_DRZ80 -SRCS_COMMON += $(R)cpu/DrZ80/drz80.s +SRCS_COMMON += $(R)cpu/DrZ80/drz80.S endif # ifeq "$(use_cz80)" "1" @@ -157,8 +167,16 @@ SRCS_COMMON += $(R)cpu/sh2/compiler.c ifdef drc_debug DEFINES += DRC_DEBUG=$(drc_debug) SRCS_COMMON += $(R)cpu/sh2/mame/sh2dasm.c -SRCS_COMMON += $(R)platform/libpicofe/linux/host_dasm.c -LDFLAGS += -lbfd -lopcodes -liberty +DASM = $(R)platform/libpicofe/linux/host_dasm.c +DASMLIBS = -lbfd -lopcodes -liberty +ifeq ("$(ARCH)",$(filter "$(ARCH)","arm" "mipsel")) +ifeq ($(findstring /,$(shell $(CC) --print-file-name=libbfd.so)),) # no path printed: toolchain has no libbfd, use the built-in disassembler +DASM = $(R)platform/common/host_dasm.c +DASMLIBS = +endif +endif +SRCS_COMMON += $(DASM) +LDFLAGS += $(DASMLIBS) endif endif # use_sh2drc SRCS_COMMON += $(R)cpu/sh2/mame/sh2pico.c @@ -181,7 +199,7 @@ $(FR)cpu/cyclone/Cyclone.h: $(FR)cpu/cyclone/Cyclone.s: $(FR)cpu/$(CYCLONE_CONFIG) @echo building Cyclone... - @make CC=$(CYCLONE_CC) CXX=$(CYCLONE_CXX) -C $(R)cpu/cyclone/ CONFIG_FILE=../$(CYCLONE_CONFIG) + @$(MAKE) CC=$(CYCLONE_CC) CXX=$(CYCLONE_CXX) -C $(R)cpu/cyclone/ CONFIG_FILE=../$(CYCLONE_CONFIG) HAVE_ARMv6=$(HAVE_ARMv6) $(FR)cpu/cyclone/Cyclone.s: $(FR)cpu/cyclone/*.cpp $(FR)cpu/cyclone/*.h diff --git a/platform/common/config_file.c b/platform/common/config_file.c index 0cd27260a..da19fad1a 100644 --- a/platform/common/config_file.c +++ b/platform/common/config_file.c @@ -39,7 +39,7 @@ static char *mystrip(char *str); static int seek_sect(FILE *f, const char *section) { - char line[128], *tmp; + char line[640], *tmp; int len; len = strlen(section); @@ -100,7 +100,7 @@ int config_write(const char *fname) FILE *fn = NULL; menu_entry *me; int t; - char line[128]; + char line[640]; fn = fopen(fname, "w"); if (fn == NULL) @@ -169,7 +169,7 @@ int config_write(const char *fname) int config_writelrom(const char *fname) { - char line[128], *tmp, *optr = NULL; + char line[640], *tmp, *optr = NULL; char *old_data = NULL; int size; FILE *f; @@ -216,7 +216,7 @@ int
config_writelrom(const char *fname) int config_readlrom(const char *fname) { - char line[128], *tmp; + char line[640], *tmp; int i, len, ret = -1; FILE *f; @@ -326,6 +326,10 @@ static int custom_read(menu_entry *me, const char *var, const char *val) currentConfig.gamma = atoi(val); return 1; + case MA_OPT2_MAX_FRAMESKIP: + currentConfig.max_skip = atoi(val); + return 1; + /* PSP */ case MA_OPT3_SCALE: if (strcasecmp(var, "Scale factor") != 0) return 0; @@ -503,7 +507,7 @@ static void parse(const char *var, const char *val, int *keys_encountered) int config_readsect(const char *fname, const char *section) { - char line[128], *var, *val; + char line[640], *var, *val; int keys_encountered = 0; FILE *f; int ret; diff --git a/platform/common/disarm.c b/platform/common/disarm.c new file mode 100644 index 000000000..249922064 --- /dev/null +++ b/platform/common/disarm.c @@ -0,0 +1,485 @@ +/* + * Copyright (c) 2012 Wojtek Kaniewski + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <stdio.h> +#include <string.h> + +#define IMM_FORMAT "0x%x" +//#define IMM_FORMAT "%d" +#define ADDR_FORMAT "0x%x" + +static inline unsigned int rol(unsigned int value, unsigned int shift) +{ + shift &= 31; + + return (value >> shift) | (value << (32 - shift)); +} + +static inline const char *condition(unsigned int insn) +{ + const char *conditions[16] = { "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc", "hi", "ls", "ge", "lt", "gt", "le", "", "nv" }; + return conditions[(insn >> 28) & 0x0f]; +} + +static inline const char *register_name(unsigned int reg) +{ + const char *register_names[16] = { "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "sp", "lr", "pc" }; + return register_names[reg & 0x0f]; +} + +static const char *register_list(unsigned int list, char *buf, size_t buf_len) +{ + int i; + + buf[0] = 0; + + for (i = 0; i < 16; i++) + { + if ((list >> i) & 1) + { + snprintf(buf + strlen(buf), buf_len - strlen(buf), "%s%s", (buf[0] == 0) ? "" : ",", register_name(i)); + } + } + + return buf; +} + +static const char *shift(unsigned int insn, char *buf, size_t buf_len) +{ + unsigned int imm = (insn >> 7) & 0x1f; + const char *rn = register_name(insn >> 8); + unsigned int type = (insn >> 4) & 0x07; + + switch (type) + { + case 0: + snprintf(buf, buf_len, (imm != 0) ? ",lsl #%d" : "", imm); + break; + case 1: + snprintf(buf, buf_len, ",lsl %s", rn); + break; + case 2: + snprintf(buf, buf_len, ",lsr #%d", imm ? imm : 32); + break; + case 3: + snprintf(buf, buf_len, ",lsr %s", rn); + break; + case 4: + snprintf(buf, buf_len, ",asr #%d", imm ? imm : 32); + break; + case 5: + snprintf(buf, buf_len, ",asr %s", rn); + break; + case 6: + snprintf(buf, buf_len, (imm != 0) ?
",ror #%d" : ",rrx", imm); + break; + case 7: + snprintf(buf, buf_len, ",ror %s", rn); + break; + } + + return buf; +} + +static const char *immediate(unsigned int imm, int negative, int show_if_zero, char *buf, size_t buf_len) +{ + if (imm || show_if_zero) + { + snprintf(buf, buf_len, ",#%s" IMM_FORMAT, (negative) ? "-" : "", imm); + return buf; + } + + return ""; +} + +static int data_processing(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + unsigned int oper = (insn >> 21) & 15; + const char *names[16] = { "and", "eor", "sub", "rsb", "add", "adc", "sbc", "rsc", "tst", "teq", "cmp", "cmn", "orr", "mov", "bic", "mvn" }; + const char *name; + const char *s; + unsigned int rd; + unsigned int rn; + int is_move = ((oper == 13) || (oper == 15)); + int is_test = ((oper >= 8) && (oper <= 11)); + char tmp_buf[64]; + + name = names[oper]; + s = ((insn >> 20) & 1) ? "s" : ""; + rn = (insn >> 16) & 15; + rd = (insn >> 12) & 15; + + /* mov r0,r0,r0 is a nop */ + if (insn == 0xe1a00000) + { + snprintf(buf, buf_len, "nop"); + return 1; + } + + /* mrs */ + if ((insn & 0x0fbf0fff) == 0x010f0000) + { + const char *psr = ((insn >> 22) & 1) ? "spsr" : "cpsr"; + const char *rd = register_name(insn >> 12); + + snprintf(buf, buf_len, "mrs%s %s,%s", condition(insn), rd, psr); + + return 1; + } + + /* msr flag only*/ + if ((insn & 0x0db0f000) == 0x0120f000) + { + const char *psr = ((insn >> 22) & 1) ? 
"spsr" : "cpsr"; + const char *suffix; + + switch ((insn >> 16) & 15) + { + case 9: + suffix = ""; + break; + case 8: + suffix = "_f"; + break; + case 1: + suffix = "_c"; + break; + default: + return 0; + } + + if ((insn >> 25) & 1) + { + unsigned int imm = rol(insn & 0x000000ff, ((insn >> 8) & 15) * 2); + + snprintf(buf, buf_len, "msr%s %s%s,#" IMM_FORMAT, condition(insn), psr, suffix, imm); + } + else + { + const char *rm = register_name(insn >> 0); + + if (((insn >> 4) & 255) != 0) + { + return 0; + } + + snprintf(buf, buf_len, "msr%s %s%s,%s", condition(insn), psr, suffix, rm); + } + + return 1; + } + + if (((insn >> 25) & 1) == 0) + { + unsigned int rm; + + rm = (insn & 15); + + if (is_move) + { + snprintf(buf, buf_len, "%s%s%s %s,%s%s", name, condition(insn), s, register_name(rd), register_name(rm), shift(insn, tmp_buf, sizeof(tmp_buf))); + } + else if (is_test) + { + snprintf(buf, buf_len, "%s%s %s,%s%s", name, condition(insn), register_name(rn), register_name(rm), shift(insn, tmp_buf, sizeof(tmp_buf))); + } + else + { + snprintf(buf, buf_len, "%s%s%s %s,%s,%s%s", name, condition(insn), s, register_name(rd), register_name(rn), register_name(rm), shift(insn, tmp_buf, sizeof(tmp_buf))); + } + } + else + { + unsigned int imm; + + imm = rol(insn & 0x000000ff, ((insn >> 8) & 15) * 2); + + if (is_move) + { + snprintf(buf, buf_len, "%s%s%s %s%s", name, condition(insn), s, register_name(rd), immediate(imm, 0, 1, tmp_buf, sizeof(tmp_buf))); + } + else if (is_test) + { + snprintf(buf, buf_len, "%s%s %s%s", name, condition(insn), register_name(rn), immediate(imm, 0, 1, tmp_buf, sizeof(tmp_buf))); + } + else + { + snprintf(buf, buf_len, "%s%s%s %s,%s%s", name, condition(insn), s, register_name(rd), register_name(rn), immediate(imm, 0, 1, tmp_buf, sizeof(tmp_buf))); + } + } + + return 1; +} + +static int branch(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *link = ((insn >> 24) & 1) ? 
"l" : ""; + unsigned int address; + unsigned int offset; + + offset = insn & 0x00ffffff; + + if ((offset & 0x00800000) != 0) + { + offset |= 0xff000000; + } + + address = pc + 8 + (offset << 2); + + snprintf(buf, buf_len, "b%s%s " ADDR_FORMAT, link, condition(insn), address); + + return 1; +} + +static int multiply(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *rd = register_name(insn >> 16); + const char *rn = register_name(insn >> 12); + const char *rs = register_name(insn >> 8); + const char *rm = register_name(insn >> 0); + const char *s = ((insn >> 20) & 1) ? "s" : ""; + int mla = (insn >> 21) & 1; + + snprintf(buf, buf_len, (mla) ? "mla%s%s %s,%s,%s,%s" : "mul%s%s %s,%s,%s", condition(insn), s, rd, rm, rs, rn); + + return 1; +} + +static int multiply_long(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *rh = register_name(insn >> 16); + const char *rl = register_name(insn >> 12); + const char *rs = register_name(insn >> 8); + const char *rm = register_name(insn >> 0); + const char *u = ((insn >> 22) & 1) ? "s" : "u"; + const char *s = ((insn >> 20) & 1) ? "s" : ""; + const char *name = ((insn >> 21) & 1) ? "mlal" : "mull"; + + snprintf(buf, buf_len, "%s%s%s%s %s,%s,%s,%s", u, name, condition(insn), s, rl, rh, rm, rs); + + return 1; +} + +static int single_data_swap(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *rn = register_name(insn >> 16); + const char *rd = register_name(insn >> 12); + const char *rm = register_name(insn >> 0); + const char *b = ((insn >> 22) & 1) ? "b" : ""; + + snprintf(buf, buf_len, "swp%s%s %s,%s,[%s]", condition(insn), b, rd, rm, rn); + + return 1; +} + +static int branch_and_exchange(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *rn = register_name(insn >> 0); + const char *l = ((insn >> 5) & 1) ? 
"l" : ""; + + snprintf(buf, buf_len, "b%sx%s %s", l, condition(insn), rn); + + return 1; +} + +static int halfword_data_transfer(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *rn = register_name(insn >> 16); + const char *rd = register_name(insn >> 12); + const char *name = ((insn >> 20) & 1) ? "ldr" : "str"; + const char *w = ((insn >> 21) & 1) ? "!" : ""; + int sign = (insn >> 23) & 1; + int pre = (insn >> 24) & 1; + const char *suffix = ""; + char tmp_buf[64]; + + switch ((insn >> 5) & 3) + { + case 0: + name = "swp"; + break; + case 1: + suffix = "h"; + break; + case 2: + suffix = "sb"; + break; + case 3: + suffix = "sh"; + break; + } + + if ((insn >> 22) & 1) + { + unsigned int imm = ((insn >> 4) & 0xf0) | (insn & 0x0f); + + snprintf(buf, buf_len, (pre) ? "%s%s%s %s,[%s%s]%s" : "%s%s%s %s,[%s],%s%s", name, condition(insn), suffix, rd, rn, immediate(imm, !sign, 0, tmp_buf, sizeof(tmp_buf)), w); + } + else + { + const char *rm = register_name(insn >> 0); + + snprintf(buf, buf_len, (pre) ? "%s%s%s %s,[%s,%s%s]%s" : "%s%s%s %s,[%s],%s%s%s", name, condition(insn), suffix, rd, rn, sign ? "" : "-", rm, w); + } + + return 1; +} + +static int single_data_transfer(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *rn = register_name(insn >> 16); + const char *rd = register_name(insn >> 12); + const char *name = ((insn >> 20) & 1) ? "ldr" : "str"; + const char *w = ((insn >> 21) & 1) ? "!" : ""; + const char *b = ((insn >> 22) & 1) ? "b" : ""; + int sign = (insn >> 23) & 1; + int pre = (insn >> 24) & 1; + char tmp_buf[64]; + + if ((insn >> 25) & 1) + { + const char *rm = register_name(insn >> 0); + + snprintf(buf, buf_len, (pre) ? "%s%s%s %s,[%s,%s%s%s]%s" : "%s%s%s %s,[%s],%s%s%s%s", name, condition(insn), b, rd, rn, sign ? "" : "-", rm, shift(insn, tmp_buf, sizeof(tmp_buf)), w); + } + else + { + unsigned int imm = insn & 0x00000fff; + + snprintf(buf, buf_len, (pre) ? 
"%s%s%s %s,[%s%s]%s" : "%s%s%s %s,[%s]%s%s", name, condition(insn), b, rd, rn, immediate(imm, !sign, 0, tmp_buf, sizeof(tmp_buf)), w); + } + + return 1; +} + +static int block_data_transfer(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *s = ((insn >> 22) & 1) ? "^" : ""; + const char *w = ((insn >> 21) & 1) ? "!" : ""; + int load = (insn >> 20) & 1; + const char *name = (load) ? "ldm" : "stm"; + const char *ldm_stubs[4] = { "fa", "fd", "ea", "ed" }; + const char *stm_stubs[4] = { "ed", "ea", "fd", "fa" }; + int stub_idx = (insn >> 23) & 3; + const char *stub = (load) ? ldm_stubs[stub_idx] : stm_stubs[stub_idx]; + char tmp_buf[64]; + + snprintf(buf, buf_len, "%s%s%s %s%s, {%s}%s", name, condition(insn), stub, register_name(insn >> 16), w, register_list(insn & 0xffff, tmp_buf, sizeof(tmp_buf)), s); + + return 1; +} + +static int coprocessor_data_transfer(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *name = ((insn >> 20) & 1) ? "ldc" : "stc"; + const char *rn = register_name(insn >> 16); + int sign = (insn >> 23) & 1; + const char *l = ((insn >> 22) & 1) ? "l" : ""; + const char *w = ((insn >> 21) & 1) ? "!" : ""; + int pre = (insn >> 24) & 1; + unsigned int cp = (insn >> 8) & 15; + unsigned int cd = (insn >> 12) & 15; + unsigned int imm = (insn >> 0) & 255; + char tmp_buf[64]; + + snprintf(buf, buf_len, (pre) ? 
"%s%s%s p%d,cr%d,[%s%s]%s" : "%s%s%s p%d,cr%d,[%s]%s%s", name, condition(insn), l, cp, cd, rn, immediate(imm, !sign, 0, tmp_buf, sizeof(tmp_buf)), w); + + return 1; +} + +static int coprocessor_data_operation(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + snprintf(buf, buf_len, "cdp%s p%d,%d,cr%d,cr%d,cr%d,{%d}", condition(insn), (insn >> 8) & 15, (insn >> 20) & 15, (insn >> 12) & 15, (insn >> 16) & 15, (insn >> 0) & 15, (insn >> 5) & 7); + + return 1; +} + +static int coprocessor_register_transfer(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *name = ((insn >> 20) & 1) ? "mrc" : "mcr"; + unsigned int cn = (insn >> 16) & 15; + const char *rd = register_name(insn >> 12); + unsigned int expr1 = (insn >> 21) & 7; + unsigned int expr2 = (insn >> 5) & 7; + unsigned int cp = (insn >> 8) & 15; + unsigned int cm = (insn >> 0) & 15; + + snprintf(buf, buf_len, "%s%s p%d,%d,%s,cr%d,cr%d,{%d}", name, condition(insn), cp, expr1, rd, cn, cm, expr2); + + return 1; +} + +static int software_interrupt(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + snprintf(buf, buf_len, "swi%s %u", condition(insn), insn & 0x00ffffff); + + return 1; +} + +int disarm(uintptr_t pc, uint32_t insn, char *buf, size_t buf_len, unsigned long *addr) +{ + *addr = 0; + + if ((insn & 0x0fffffd0) == 0x012fff10) + return branch_and_exchange(pc, insn, buf, buf_len); + + if ((insn & 0x0fb00ff0) == 0x01000090) + return single_data_swap(pc, insn, buf, buf_len); + + if ((insn & 0x0fc000f0) == 0x00000090) + return multiply(pc, insn, buf, buf_len); + + if ((insn & 0x0f8000f0) == 0x00800090) + return multiply_long(pc, insn, buf, buf_len); + + if ((insn & 0x0f000010) == 0x0e000000) + return coprocessor_data_operation(pc, insn, buf, buf_len); + + if ((insn & 0x0f000010) == 0x0e000010) + return coprocessor_register_transfer(pc, insn, buf, buf_len); + + if ((insn & 0x0f000000) == 0x0f000000) + return software_interrupt(pc, insn, buf, buf_len); + + if 
((insn & 0x0e000090) == 0x00000090) + return halfword_data_transfer(pc, insn, buf, buf_len); + + if ((insn & 0x0e000000) == 0x08000000) + return block_data_transfer(pc, insn, buf, buf_len); + + if ((insn & 0x0e000000) == 0x0a000000) { + *addr = (unsigned long)pc+8 + (unsigned long)((int32_t)(insn << 8) >> 6); /* sign-extend the 24-bit word offset so backward branches resolve to the same target branch() prints */ + return branch(pc, insn, buf, buf_len); + } + + if ((insn & 0x0e000000) == 0x0c000000) + return coprocessor_data_transfer(pc, insn, buf, buf_len); + + if ((insn & 0x0c000000) == 0x00000000) + return data_processing(pc, insn, buf, buf_len); + + if ((insn & 0x0c000000) == 0x04000000) + return single_data_transfer(pc, insn, buf, buf_len); + + return 0; +} + diff --git a/platform/common/disarm.h b/platform/common/disarm.h new file mode 100644 index 000000000..a07675fd0 --- /dev/null +++ b/platform/common/disarm.h @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2012 Wojtek Kaniewski + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef DISARM_H +#define DISARM_H + +int disarm(uintptr_t pc, uint32_t insn, char *buf, size_t buf_len, unsigned long *sym); + +#endif /* DISARM_H */ diff --git a/platform/common/dismips.c b/platform/common/dismips.c new file mode 100644 index 000000000..d855ad6b8 --- /dev/null +++ b/platform/common/dismips.c @@ -0,0 +1,412 @@ +/* + * very basic mips disassembler for MIPS32/MIPS64 Release 2, only for picodrive + * Copyright (C) 2019 kub + * + * This work is licensed under the terms of MAME license. + * See COPYING file in the top-level directory. + */ + +// unimplemented insns: MOV[FT], SYSCALL, BREAK, SYNC, SYNCI, T*, SDBBP, RDHWR, +// CACHE, PREF, LWC*/LDC*, SWC*/SDC*, and all of COP* (fpu, mmu, irq, exc, ...) +// unimplemented variants of insns: EHB, SSNOP (both SLL zero), JALR.HB, JR.HB +// however, it's certainly good enough for anything picodrive DRC throws at it. + +#include +#include +#include +#include + +#include "dismips.h" + + +static char *const register_names[32] = { + "$zero", + "$at", + "$v0", + "$v1", + "$a0", + "$a1", + "$a2", + "$a3", + "$t0", + "$t1", + "$t2", + "$t3", + "$t4", + "$t5", + "$t6", + "$t7", + "$s0", + "$s1", + "$s2", + "$s3", + "$s4", + "$s5", + "$s6", + "$s7", + "$t8", + "$t9", + "$k0", + "$k1", + "$gp", + "$sp", + "$fp", + "$ra" +}; + + +enum insn_type { + REG_DTS, REG_TS, // 3, 2, or 1 regs + REG_DS, REG_DT, REG_D, REG_S, + S_IMM_DT, // 2 regs with shift amount + F_IMM_TS, // 2 regs with bitfield spec + B_IMM_S, B_IMM_TS, // pc-relative branches with 1 or 2 regs + J_IMM, // region-relative jump + A_IMM_TS, // arithmetic immediate with 2 regs + L_IMM_T, L_IMM_TS, // logical immediate with 1 or 2 regs + M_IMM_TS, // memory indexed with 2 regs + SR_BIT = 0x80 // shift right with R-bit +}; + +struct insn { + unsigned char op; + enum insn_type type; + char *name; +}; + +// ATTN: these array MUST be sorted by op (decode relies on it) + +// instructions with opcode SPECIAL (R-type) +#define OP_SPECIAL 0x00 +static const struct 
insn special_insns[] = { + {0x00, S_IMM_DT, "sll"}, +// {0x01, , "movf\0movt"}, + {0x02, S_IMM_DT|SR_BIT, "srl\0rotr"}, + {0x03, S_IMM_DT, "sra"}, + {0x04, REG_DTS, "sllv"}, + {0x06, REG_DTS|SR_BIT, "srlv\0rotrv"}, + {0x07, REG_DTS, "srav"}, + {0x08, REG_S, "jr"}, + {0x09, REG_DS, "jalr"}, + {0x0a, REG_DTS, "movz"}, + {0x0b, REG_DTS, "movn"}, +// {0x0c, , "syscall"}, +// {0x0d, , "break"}, +// {0x0f, , "sync"}, + {0x10, REG_D, "mfhi"}, + {0x11, REG_S, "mthi"}, + {0x12, REG_D, "mflo"}, + {0x13, REG_S, "mtlo"}, + {0x14, REG_DTS, "dsllv"}, + {0x16, REG_DTS|SR_BIT, "dsrlv\0drotrv"}, + {0x17, REG_DTS, "dsrav"}, + {0x18, REG_TS, "mult"}, + {0x19, REG_TS, "multu"}, + {0x1A, REG_TS, "div"}, + {0x1B, REG_TS, "divu"}, + {0x1C, REG_TS, "dmult"}, + {0x1D, REG_TS, "dmultu"}, + {0x1E, REG_TS, "ddiv"}, + {0x1F, REG_TS, "ddivu"}, + {0x20, REG_DTS, "add"}, + {0x21, REG_DTS, "addu"}, + {0x22, REG_DTS, "sub"}, + {0x23, REG_DTS, "subu"}, + {0x24, REG_DTS, "and"}, + {0x25, REG_DTS, "or"}, + {0x26, REG_DTS, "xor"}, + {0x27, REG_DTS, "nor"}, + {0x2A, REG_DTS, "slt"}, + {0x2B, REG_DTS, "sltu"}, + {0x2C, REG_DTS, "dadd"}, + {0x2D, REG_DTS, "daddu"}, + {0x2E, REG_DTS, "dsub"}, + {0x2F, REG_DTS, "dsubu"}, +// {0x30, REG_TS, "tge" }, +// {0x31, REG_TS, "tgeu" }, +// {0x32, REG_TS, "tlt" }, +// {0x33, REG_TS, "tltu" }, +// {0x34, REG_TS, "teq" }, +// {0x36, REG_TS, "tne" }, + {0x38, S_IMM_DT, "dsll"}, + {0x3A, S_IMM_DT|SR_BIT, "dsrl\0drotr"}, + {0x3B, S_IMM_DT, "dsra"}, + {0x3C, S_IMM_DT, "dsll32"}, + {0x3E, S_IMM_DT|SR_BIT, "dsrl32\0drotr32"}, + {0x3F, S_IMM_DT, "dsra32"}, +}; + +// instructions with opcode SPECIAL2 (R-type) +#define OP_SPECIAL2 0x1C +static const struct insn special2_insns[] = { + {0x00, REG_TS, "madd" }, + {0x01, REG_TS, "maddu" }, + {0x02, REG_DTS, "mul" }, + {0x04, REG_TS, "msub" }, + {0x05, REG_TS, "msubu" }, + {0x20, REG_DS, "clz" }, + {0x21, REG_DS, "clo" }, + {0x24, REG_DS, "dclz" }, + {0x25, REG_DS, "dclo" }, +// {0x37, , "sdbbp" }, +}; + +// instructions with opcode 
SPECIAL3 (R-type) +#define OP_SPECIAL3 0x1F +static const struct insn special3_insns[] = { + {0x00, F_IMM_TS, "ext" }, + {0x01, F_IMM_TS, "dextm" }, + {0x02, F_IMM_TS, "dextu" }, + {0x03, F_IMM_TS, "dext" }, + {0x04, F_IMM_TS, "ins" }, + {0x05, F_IMM_TS, "dinsm" }, + {0x06, F_IMM_TS, "dinsu" }, + {0x07, F_IMM_TS, "dins" }, +// {0x3b, , "rdhwr" }, +}; + +// instruction with opcode SPECIAL3 and function *BSHFL +#define FN_BSHFL 0x20 +static const struct insn bshfl_insns[] = { + {0x02, REG_DT, "wsbh" }, + {0x10, REG_DT, "seb" }, + {0x18, REG_DT, "seh" }, +}; +#define FN_DBSHFL 0x24 +static const struct insn dbshfl_insns[] = { + {0x02, REG_DT, "dsbh" }, + {0x05, REG_DT, "dshd" }, +}; + +// instructions with opcode REGIMM (I-type) +#define OP_REGIMM 0x01 +static const struct insn regimm_insns[] = { + {0x00, B_IMM_S, "bltz"}, + {0x01, B_IMM_S, "bgez"}, + {0x02, B_IMM_S, "bltzl"}, + {0x03, B_IMM_S, "bgezl"}, +// {0x08, , "tgei"}, +// {0x09, , "tgeiu"}, +// {0x0a, , "tlti"}, +// {0x0b, , "tltiu"}, +// {0x0c, , "teqi"}, +// {0x0e, , "tnei"}, + {0x10, B_IMM_S, "bltzal"}, + {0x11, B_IMM_S, "bgezal"}, + {0x12, B_IMM_S, "bltzall"}, + {0x13, B_IMM_S, "bgezall"}, + {0x13, B_IMM_S, "bgezall"}, +// {0x1f, , "synci" }, +}; + +// instructions with other opcodes (I-type) +static const struct insn immediate_insns[] = { + {0x02, J_IMM, "j"}, + {0x03, J_IMM, "jal"}, + {0x04, B_IMM_TS, "beq"}, + {0x05, B_IMM_TS, "bne"}, + {0x06, B_IMM_S, "blez"}, + {0x07, B_IMM_S, "bgtz"}, + {0x08, A_IMM_TS, "addi"}, + {0x09, A_IMM_TS, "addiu"}, + {0x0A, A_IMM_TS, "slti"}, + {0x0B, A_IMM_TS, "sltiu"}, + {0x0C, L_IMM_TS, "andi"}, + {0x0D, L_IMM_TS, "ori"}, + {0x0E, L_IMM_TS, "xori"}, + {0x0F, L_IMM_T, "lui"}, + {0x14, B_IMM_TS, "beql"}, + {0x15, B_IMM_TS, "bnel"}, + {0x16, B_IMM_S, "blezl"}, + {0x17, B_IMM_S, "bgtzl"}, + {0x18, A_IMM_TS, "daddi"}, + {0x19, A_IMM_TS, "daddiu"}, + {0x1A, M_IMM_TS, "ldl"}, + {0x1B, M_IMM_TS, "ldr"}, + {0x20, M_IMM_TS, "lb"}, + {0x21, M_IMM_TS, "lh"}, + {0x22, M_IMM_TS, 
"lwl"}, + {0x23, M_IMM_TS, "lw"}, + {0x24, M_IMM_TS, "lbu"}, + {0x25, M_IMM_TS, "lhu"}, + {0x26, M_IMM_TS, "lwr"}, + {0x27, M_IMM_TS, "lwu"}, + {0x28, M_IMM_TS, "sb"}, + {0x29, M_IMM_TS, "sh"}, + {0x2A, M_IMM_TS, "swl"}, + {0x2B, M_IMM_TS, "sw"}, + {0x2C, M_IMM_TS, "sdl"}, + {0x2D, M_IMM_TS, "sdr"}, + {0x2E, M_IMM_TS, "swr"}, +// {0x2F, , "cache"}, + {0x30, M_IMM_TS, "ll"}, +// {0x31, , "lwc1"}, +// {0x32, , "lwc2"}, +// {0x33, , "pref"}, + {0x34, M_IMM_TS, "lld"}, +// {0x35, , "ldc1"}, +// {0x36, , "ldc2"}, + {0x37, M_IMM_TS, "ld"}, + {0x38, M_IMM_TS, "sc"}, +// {0x39, , "swc1"}, +// {0x3A, , "swc2"}, + {0x3C, M_IMM_TS, "scd"}, +// {0x3D, , "sdc1"}, +// {0x3E, , "sdc2"}, + {0x3F, M_IMM_TS, "sd"}, +}; + +#define ARRAY_SIZE(a) (sizeof(a)/sizeof(*a)) + +// find instruction description for insn +static const struct insn *decode_insn(uint32_t insn) +{ + uint32_t op = insn >> 26; + const struct insn *pi; + int l = 0, r = 0; + + if (op == OP_SPECIAL) { + op = insn & 0x3f; + pi = special_insns; + r = ARRAY_SIZE(special_insns)-1; + } else if (op == OP_SPECIAL2) { + op = insn & 0x3f; + pi = special2_insns; + r = ARRAY_SIZE(special2_insns)-1; + } else if (op == OP_SPECIAL3) { + op = insn & 0x3f; + if (op == FN_BSHFL) { + op = (insn >> 6) & 0x1f; + pi = bshfl_insns; + r = ARRAY_SIZE(bshfl_insns)-1; + } else if (op == FN_DBSHFL) { + op = (insn >> 6) & 0x1f; + pi = dbshfl_insns; + r = ARRAY_SIZE(dbshfl_insns)-1; + } else { + pi = special3_insns; + r = ARRAY_SIZE(special3_insns)-1; + } + } else if (op == OP_REGIMM) { + op = (insn>>16) & 0x1f; + pi = regimm_insns; + r = ARRAY_SIZE(regimm_insns)-1; + } else { + pi = immediate_insns; + r = ARRAY_SIZE(immediate_insns)-1; + } + + while (l <= r) { + int m = (l+r) / 2; + if (pi[m].op == op) + return pi+m; + else if (pi[m].op < op) + l = m+1; + else + r = m-1; + } + return NULL; +} + +// calculate target for pc-relative branches +static unsigned long b_target(unsigned long pc, uint32_t insn) +{ + return pc + 4 + (int16_t)insn * 4; +} + 
+// calculate target for region-relative branches +static unsigned long j_target(unsigned long pc, uint32_t insn) +{ + return (pc & ~0x0fffffffL) | ((insn & 0x03ffffff) << 2); +} + +// main disassembler function +int dismips(uintptr_t pc, uint32_t insn, char *buf, size_t buflen, unsigned long *sym) +{ + const struct insn *pi = decode_insn(insn); + char *rs = register_names[(insn >> 21) & 0x1f]; + char *rt = register_names[(insn >> 16) & 0x1f]; + char *rd = register_names[(insn >> 11) & 0x1f]; + int sa = (insn >> 6) & 0x1f, sb = (insn >> 11) & 0x1f; + int imm = (int16_t) insn; + + *sym = 0; + if (pi == NULL) { + snprintf(buf, buflen, "0x%x", insn); + return 0; + } + + switch (pi->type & ~SR_BIT) { + case REG_DTS: + if ((insn & 0x3f) == 0x25 /*OR*/ && (insn & 0x1f0000) == 0 /*zero*/) + snprintf(buf, buflen, "move %s, %s", rd, rs); + else if ((pi->type & SR_BIT) && (insn & (1<<6))) + snprintf(buf, buflen, "%s %s, %s, %s", pi->name+strlen(pi->name)+1, rd, rs, rt); + else + snprintf(buf, buflen, "%s %s, %s, %s", pi->name, rd, rs, rt); + break; + case REG_TS: + snprintf(buf, buflen, "%s %s, %s", pi->name, rs, rt); + break; + case REG_DS: + snprintf(buf, buflen, "%s %s, %s", pi->name, rd, rs); + break; + case REG_DT: + snprintf(buf, buflen, "%s %s, %s", pi->name, rd, rt); + break; + case REG_D: + snprintf(buf, buflen, "%s %s", pi->name, rd); + break; + case REG_S: + snprintf(buf, buflen, "%s %s", pi->name, rs); + break; + case S_IMM_DT: + if (insn == 0x00000000) + snprintf(buf, buflen, "nop"); + else if ((pi->type & SR_BIT) && (insn & (1<<21))) + snprintf(buf, buflen, "%s %s, %s, %d", pi->name+strlen(pi->name)+1, rd, rt, sa); + else + snprintf(buf, buflen, "%s %s, %s, %d", pi->name, rd, rt, sa); + break; + //dext: pos,size-1 dextm: pos,size-33 dextu: pos-32,size-1 + //dins: pos,pos+size-1 dinsm: pos,pos+size-33 dinsu: pos-32,pos+size-33 + case F_IMM_TS: + if (insn & 0x01) sb+=32; // ...m + if (insn & 0x02) sa+=32; // ...u + if (insn & 0x04) sb-=sa; // ins + snprintf(buf, 
buflen, "%s %s, %s, %d, %d", pi->name, rt, rs, sa, sb+1); + break; + case B_IMM_S: + *sym = b_target(pc, insn); + snprintf(buf, buflen, "%s %s, 0x%lx", pi->name, rs, *sym); + break; + case B_IMM_TS: + *sym = b_target(pc, insn); + snprintf(buf, buflen, "%s %s, %s, 0x%lx", pi->name, rs, rt, *sym); + break; + case J_IMM: + *sym = j_target(pc, insn); + snprintf(buf, buflen, "%s 0x%lx", pi->name, *sym); + break; + case A_IMM_TS: + if (abs(imm) < 1000) + snprintf(buf, buflen, "%s %s, %s, %d", pi->name, rt, rs, imm); + else + snprintf(buf, buflen, "%s %s, %s, 0x%x", pi->name, rt, rs, imm); + break; + case L_IMM_T: + snprintf(buf, buflen, "%s %s, 0x%x", pi->name, rt, (uint16_t)imm); + break; + case L_IMM_TS: + if ((insn >> 26) == 0x0D /*ORI*/ && (insn & 0x03e00000) == 0 /*zero*/) + snprintf(buf, buflen, "li %s, 0x%x", rt, (uint16_t)imm); + else + snprintf(buf, buflen, "%s %s, %s, 0x%x", pi->name, rt, rs, (uint16_t)imm); + break; + case M_IMM_TS: + snprintf(buf, buflen, "%s %s, %d(%s)", pi->name, rt, imm, rs); + break; + } + return 1; +} + diff --git a/platform/common/dismips.h b/platform/common/dismips.h new file mode 100644 index 000000000..8d1059254 --- /dev/null +++ b/platform/common/dismips.h @@ -0,0 +1,6 @@ +#ifndef DISMIPS_H +#define DISMIPS_H + +int dismips(uintptr_t pc, uint32_t insn, char *buf, size_t buf_len, unsigned long *sym); + +#endif /* DISMIPS_H */ diff --git a/platform/common/emu.c b/platform/common/emu.c index 7b68abe90..1c2bfa2cc 100644 --- a/platform/common/emu.c +++ b/platform/common/emu.c @@ -600,6 +600,7 @@ void emu_prep_defconfig(void) defaultConfig.turbo_rate = 15; defaultConfig.msh2_khz = PICO_MSH2_HZ / 1000; defaultConfig.ssh2_khz = PICO_SSH2_HZ / 1000; + defaultConfig.max_skip = 4; // platform specific overrides pemu_prep_defconfig(); @@ -1411,8 +1412,10 @@ void emu_loop(void) { notice_msg_time = 0; plat_status_msg_clear(); +#ifndef __GP2X__ plat_video_flip(); plat_status_msg_clear(); /* Do it again in case of double buffering */ +#endif 
notice_msg = NULL; } else { @@ -1465,10 +1468,16 @@ void emu_loop(void) else if (diff < -target_frametime_x3) { /* no time left for this frame - skip */ - /* limit auto frameskip to 8 */ - if (frames_done / 8 <= frames_shown) + /* limit auto frameskip to max_skip */ + if (fskip_cnt < currentConfig.max_skip) { + fskip_cnt++; skip = 1; - } + } + else { + fskip_cnt = 0; + } + } else + fskip_cnt = 0; // don't go in debt too much while (diff < -target_frametime_x3 * 3) { diff --git a/platform/common/emu.h b/platform/common/emu.h index 1e751f891..26e2159b4 100644 --- a/platform/common/emu.h +++ b/platform/common/emu.h @@ -76,6 +76,7 @@ typedef struct _currentConfig_t { int msh2_khz; int ssh2_khz; int overclock_68k; + int max_skip; } currentConfig_t; extern currentConfig_t currentConfig, defaultConfig; diff --git a/platform/common/helix/Makefile b/platform/common/helix/Makefile new file mode 100644 index 000000000..9fa4c1cc6 --- /dev/null +++ b/platform/common/helix/Makefile @@ -0,0 +1,43 @@ +CROSS ?= arm-linux-gnueabi- + +CC = $(CROSS)gcc +AS = $(CROSS)as +AR = $(CROSS)ar +TOOLCHAIN = $(notdir $(CROSS)) +LIBGCC ?= ${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1/libgcc.a + +CFLAGS += -Ipub -O2 -Wall -fstrict-aliasing -ffast-math +ifneq ($(findstring arm-,$(TOOLCHAIN)),) +CFLAGS += -mcpu=arm940t -mtune=arm940t -mfloat-abi=soft -mfpu=fpa -mabi=apcs-gnu -mno-thumb-interwork +ASFLAGS = -mcpu=arm940t -mfloat-abi=soft -mfpu=fpa -mabi=apcs-gnu +OBJS += real/arm/asmpoly_gcc.o +else +CFLAGS += -m32 +ASFLAGS += -m32 +OBJS += real/polyphase.o +endif + +LIB = $(TOOLCHAIN)helix_mp3.a +SHLIB = $(TOOLCHAIN)helix_mp3.so + +all: $(LIB) $(SHLIB) + + +OBJS += mp3dec.o mp3tabs.o +#OBJS += ipp/bitstream.o ipp/buffers.o ipp/dequant.o ipp/huffman.o ipp/imdct.o ipp/subband.o +OBJS += real/bitstream.o real/buffers.o real/dct32.o real/dequant.o real/dqchan.o real/huffman.o +OBJS += real/hufftabs.o real/imdct.o real/scalfact.o real/stproc.o real/subband.o real/trigtabs.o + 
+OBJS += lib.o + +real/arm/asmpoly_gcc.o: real/arm/asmpoly_gcc.s + $(CC) -o $@ $(ASFLAGS) -c $< + +$(LIB) : $(OBJS) + $(AR) r $@ $^ +$(SHLIB) : $(OBJS) $(LIBGCC) + $(CC) -o $@ -nostdlib -shared $(CFLAGS) $^ + +clean: + $(RM) -f $(OBJS) + diff --git a/platform/common/helix/lib.c b/platform/common/helix/lib.c new file mode 100644 index 000000000..d2b058987 --- /dev/null +++ b/platform/common/helix/lib.c @@ -0,0 +1,57 @@ +#include +#include + +// libgcc has this with gcc 4.x +void raise(int sig) +{ +} + +// very limited heap functions for helix decoder + +static char heap[65000] __attribute__((aligned(16))); +static long heap_offs; + +void __malloc_init(void) +{ + heap_offs = 0; +} + +void *malloc(size_t size) +{ + void *chunk = heap + heap_offs; + size = (size+15) & ~15; + if (heap_offs + size > sizeof(heap)) + return NULL; + else { + heap_offs += size; + return chunk; + } +} + +void free(void *chunk) +{ + if (chunk == heap) + heap_offs = 0; +} + +#if 0 +void *memcpy (void *dest, const void *src, size_t n) +{ + char *_dest = dest; + const char *_src = src; + while (n--) *_dest++ = *_src++; + return dest; +} + +void *memmove (void *dest, const void *src, size_t n) +{ + char *_dest = dest+n; + const char *_src = src+n; + if (dest <= src || dest >= _src) + return memcpy(dest, src, n); + while (n--) *--_dest = *--_src; + return dest; +} +#else +#include "../memcpy.c" +#endif diff --git a/platform/common/host_dasm.c b/platform/common/host_dasm.c new file mode 100644 index 000000000..b3b504e8e --- /dev/null +++ b/platform/common/host_dasm.c @@ -0,0 +1,93 @@ +/* + * DRC host disassembler interface for MIPS/ARM32 for use without binutils + * (C) kub, 2018,2019 + */ +#include +#include +#include +#include + +#ifdef __mips__ +#include "dismips.c" +#define disasm dismips +#else +#include "disarm.c" +#define disasm disarm +#endif + +/* symbols */ +typedef struct { const char *name; void *value; } asymbol; + +static asymbol **symbols; +static long symcount, symstorage = 8; + 
+static const char *lookup_name(void *addr) +{ + asymbol **sptr = symbols; + int i; + + for (i = 0; i < symcount; i++) { + asymbol *sym = *sptr++; + + if (addr == sym->value) + return sym->name; + } + + return NULL; +} + +void host_dasm(void *addr, int len) +{ + void *end = (char *)addr + len; + const char *name; + char buf[64]; + unsigned long insn, symaddr; + + while (addr < end) { + name = lookup_name(addr); + if (name != NULL) + printf("%s:\n", name); + + insn = *(unsigned long *)addr; + printf(" %08lx %08lx ", (long)addr, insn); + if(disasm((unsigned)addr, insn, buf, sizeof(buf), &symaddr)) + { + if (symaddr) + name = lookup_name((void *)symaddr); + if (symaddr && name) + printf("%s <%s>\n", buf, name); + else if (symaddr && !name) + printf("%s \n", buf); + else + printf("%s\n", buf); + } else + printf("unknown (0x%08lx)\n", insn); + addr = (char *)addr + sizeof(long); + } +} + +void host_dasm_new_symbol_(void *addr, const char *name) +{ + asymbol *sym, **tmp; + + if (symbols == NULL) + symbols = malloc(symstorage); + if (symstorage <= symcount * sizeof(symbols[0])) { + tmp = realloc(symbols, symstorage * 2); + if (tmp == NULL) + return; + symstorage *= 2; + symbols = tmp; + } + + symbols[symcount] = calloc(sizeof(*symbols[0]), 1); + if (symbols[symcount] == NULL) + return; + + // a HACK (should use correct section), but ohwell + sym = symbols[symcount]; + sym->value = addr; + sym->name = name; + symcount++; +} + diff --git a/platform/common/main.c b/platform/common/main.c index 424c1b5a9..24ca1865a 100644 --- a/platform/common/main.c +++ b/platform/common/main.c @@ -93,6 +93,10 @@ int main(int argc, char *argv[]) emu_init(); menu_init(); +#ifdef GPERF + ProfilerStart("gperf.out"); +#endif + engineState = PGS_Menu; if (argc > 1) @@ -148,6 +152,9 @@ int main(int argc, char *argv[]) } endloop: +#ifdef GPERF + ProfilerStop(); +#endif emu_finish(); plat_finish(); diff --git a/platform/common/memcpy.c b/platform/common/memcpy.c new file mode 100644 index 
000000000..1cd741759 --- /dev/null +++ b/platform/common/memcpy.c @@ -0,0 +1,134 @@ +/* + * (C) 2018 Kai-Uwe Bloem + * + * 32bit ARM/MIPS optimized C implementation of memcpy and memmove, designed for + * good performance with gcc. + * - if src and dest have the same alignment, 4-word copy is used. + * - if src and dest are unaligned to each other, still loads word data and + * stores correctly shifted word data (for all but the first and last bytes + * to avoid under/overstepping the src region). + * + * ATTN does dirty aliasing tricks with undefined behaviour by standard. + * (however, this improved the generated code). + * ATTN uses struct assignment, which only works if the compiler is inlining + * this (else it would probably call memcpy :-)). + */ +#include +#include + +#include +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define _L_ >> +#define _U_ << +#else +#define _L_ << +#define _U_ >> +#endif + +void *memcpy(void *dest, const void *src, size_t n) +{ + struct _16 { uint32_t a[4]; }; + union { const void *v; uint8_t *c; uint32_t *i; uint64_t *l; struct _16 *s; } + ss = { src }, ds = { dest }; + const int lm = sizeof(uint32_t)-1; + + /* align src to word */ + while (((uintptr_t)ss.c & lm) && n > 0) + *ds.c++ = *ss.c++, n--; + if (((uintptr_t)ds.c & lm) == 0) { + /* fast copy if pointers have the same alignment */ + while (n >= sizeof(struct _16)) /* copy 16 byte blocks */ + *ds.s++ = *ss.s++, n -= sizeof(struct _16); + if (n >= sizeof(uint64_t)) /* copy leftover 8 byte block */ + *ds.l++ = *ss.l++, n -= sizeof(uint64_t); +// if (n >= sizeof(uint32_t)) /* copy leftover 4 byte block */ +// *ds.i++ = *ss.i++, n -= sizeof(uint32_t); + } else if (n >= 2*sizeof(uint32_t)) { + /* unaligned data big enough to avoid overstepping src */ + uint32_t v1, v2, b, s; + /* align dest to word */ + while (((uintptr_t)ds.c & lm) && n > 0) + *ds.c++ = *ss.c++, n--; + /* copy loop: load aligned words and store shifted words */ + b = (uintptr_t)ss.c & lm, s = b*8; ss.c -= b; + v1 = 
*ss.i++, v2 = *ss.i++; + while (n >= 3*sizeof(uint32_t)) { + *ds.i++ = (v1 _L_ s) | (v2 _U_ (32-s)); v1 = *ss.i++; + *ds.i++ = (v2 _L_ s) | (v1 _U_ (32-s)); v2 = *ss.i++; + n -= 2*sizeof(uint32_t); + } + /* data for one more store is already loaded */ + if (n >= sizeof(uint32_t)) { + *ds.i++ = (v1 _L_ s) | (v2 _U_ (32-s)); + n -= sizeof(uint32_t); + ss.c += sizeof(uint32_t); + } + ss.c += b - 2*sizeof(uint32_t); + } + /* copy 0-7 leftover bytes */ + while (n >= 4) { + *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--; + *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--; + } + while (n > 0) + *ds.c++ = *ss.c++, n--; + return dest; +} + +void *memmove (void *dest, const void *src, size_t n) +{ + struct _16 { uint32_t a[4]; }; + union { const void *v; uint8_t *c; uint32_t *i; uint64_t *l; struct _16 *s; } + ss = { src+n }, ds = { dest+n }; + size_t pd = dest > src ? dest - src : src - dest; + const int lm = sizeof(uint32_t)-1; + + if (dest <= src || dest >= src+n) + return memcpy(dest, src, n); + + /* align src to word */ + while (((uintptr_t)ss.c & lm) && n > 0) + *--ds.c = *--ss.c, n--; + /* take care not to copy multi-byte data if it overlaps */ + if (((uintptr_t)ds.c & lm) == 0) { + /* fast copy if pointers have the same aligment */ + while (n >= sizeof(struct _16) && pd >= sizeof(struct _16)) + /* copy 16 bytes blocks if no overlap */ + *--ds.s = *--ss.s, n -= sizeof(struct _16); + while (n >= sizeof(uint64_t) && pd >= sizeof(uint64_t)) + /* copy leftover 8 byte blocks if no overlap */ + *--ds.l = *--ss.l, n -= sizeof(uint64_t); + while (n >= sizeof(uint32_t) && pd >= sizeof(uint32_t)) + /* copy leftover 4 byte blocks if no overlap */ + *--ds.i = *--ss.i, n -= sizeof(uint32_t); + } else if (n >= 2*sizeof(uint32_t) && pd >= 2*sizeof(uint32_t)) { + /* unaligned data big enough to avoid understepping src */ + uint32_t v1, v2, b, s; + /* align dest to word */ + while (((uintptr_t)ds.c & lm) && n > 0) + *--ds.c = *--ss.c, n--; + /* copy loop: load aligned words and store 
shifted words */ + b = (uintptr_t)ss.c & lm, s = b*8; ss.c += b; + v1 = *--ss.i, v2 = *--ss.i; + while (n >= 3*sizeof(uint32_t)) { + *--ds.i = (v1 _U_ s) | (v2 _L_ (32-s)); v1 = *--ss.i; + *--ds.i = (v2 _U_ s) | (v1 _L_ (32-s)); v2 = *--ss.i; + n -= 2*sizeof(uint32_t); + } + /* data for one more store is already loaded */ + if (n >= sizeof(uint32_t)) { + *--ds.i = (v1 _U_ s) | (v2 _L_ (32-s)); + n -= sizeof(uint32_t); + ss.c -= sizeof(uint32_t); + } + ss.c -= b - 2*sizeof(uint32_t); + } + /* copy 0-7 leftover bytes (or upto everything if ptrs are too close) */ + while (n >= 4) { + *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; + *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; + } + while (n > 0) + *--ds.c = *--ss.c, n--; + return dest; +} diff --git a/platform/common/menu_pico.c b/platform/common/menu_pico.c index 7b0cd78c8..1d46e634b 100644 --- a/platform/common/menu_pico.c +++ b/platform/common/menu_pico.c @@ -499,6 +499,7 @@ static menu_entry e_menu_adv_options[] = mee_range_h ("Overclock M68k (%)", MA_OPT2_OVERCLOCK_M68K,currentConfig.overclock_68k, 0, 1000, h_ovrclk), mee_onoff ("Emulate Z80", MA_OPT2_ENABLE_Z80, PicoIn.opt, POPT_EN_Z80), mee_onoff ("Emulate YM2612 (FM)", MA_OPT2_ENABLE_YM2612, PicoIn.opt, POPT_EN_FM), + mee_onoff ("Disable YM2612 SSG-EG", MA_OPT2_DISABLE_YM_SSG,PicoIn.opt, POPT_DIS_FM_SSGEG), mee_onoff ("Emulate SN76496 (PSG)", MA_OPT2_ENABLE_SN76496,PicoIn.opt, POPT_EN_PSG), mee_onoff ("gzip savestates", MA_OPT2_GZIP_STATES, currentConfig.EmuOpt, EOPT_GZIP_SAVES), mee_onoff ("Don't save last used ROM", MA_OPT2_NO_LAST_ROM, currentConfig.EmuOpt, EOPT_NO_AUTOSVCFG), @@ -506,6 +507,8 @@ static menu_entry e_menu_adv_options[] = mee_onoff ("Disable frame limiter", MA_OPT2_NO_FRAME_LIMIT,currentConfig.EmuOpt, EOPT_NO_FRMLIMIT), mee_onoff ("Enable dynarecs", MA_OPT2_DYNARECS, PicoIn.opt, POPT_EN_DRC), mee_onoff ("Status line in main menu", MA_OPT2_STATUS_LINE, currentConfig.EmuOpt, EOPT_SHOW_RTC), + mee_range ("Max auto frameskip", 
MA_OPT2_MAX_FRAMESKIP, currentConfig.max_skip, 1, 10), + mee_onoff ("PWM IRQ optimization", MA_OPT2_PWM_IRQ_OPT, PicoIn.opt, POPT_PWM_IRQ_OPT), MENU_OPTIONS_ADV mee_end, }; @@ -920,7 +923,8 @@ static void draw_frame_credits(void) } static const char credits[] = - "PicoDrive v" VERSION " (c) notaz, 2006-2013\n\n\n" + "PicoDrive v" VERSION "\n" + "(c) notaz, 2006-2013; irixxxx, 2018-2020\n\n" "Credits:\n" "fDave: initial code\n" #ifdef EMU_C68K @@ -936,6 +940,7 @@ static const char credits[] = "MAME devs: SH2, YM2612 and SN76496 cores\n" "Eke, Stef: some Sega CD code\n" "Inder, ketchupgun: graphics\n" + "Irixxxx: SH2 drc improvements\n" #ifdef __GP2X__ "Squidge: mmuhack\n" "Dzz: ARM940 sample\n" diff --git a/platform/common/menu_pico.h b/platform/common/menu_pico.h index 595989e84..d15113fc1 100644 --- a/platform/common/menu_pico.h +++ b/platform/common/menu_pico.h @@ -48,6 +48,7 @@ typedef enum MA_OPT2_VSYNC, MA_OPT2_ENABLE_Z80, MA_OPT2_ENABLE_YM2612, + MA_OPT2_DISABLE_YM_SSG, MA_OPT2_ENABLE_SN76496, MA_OPT2_GZIP_STATES, MA_OPT2_NO_LAST_ROM, @@ -58,6 +59,8 @@ typedef enum MA_OPT2_NO_SPRITE_LIM, MA_OPT2_NO_IDLE_LOOPS, MA_OPT2_OVERCLOCK_M68K, + MA_OPT2_MAX_FRAMESKIP, + MA_OPT2_PWM_IRQ_OPT, MA_OPT2_DONE, MA_OPT3_SCALE, /* psp (all OPT3) */ MA_OPT3_HSCALE32, diff --git a/platform/common/mp3.c b/platform/common/mp3.c index c84962cc2..346e01958 100644 --- a/platform/common/mp3.c +++ b/platform/common/mp3.c @@ -21,33 +21,6 @@ unsigned short mpeg1_l3_bitrates[16] = { 0, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320 }; -int mp3_find_sync_word(const unsigned char *buf, int size) -{ - const unsigned char *p, *pe; - - /* find byte-aligned syncword - need 12 (MPEG 1,2) or 11 (MPEG 2.5) matching bits */ - for (p = buf, pe = buf + size - 3; p <= pe; p++) - { - int pn; - if (p[0] != 0xff) - continue; - pn = p[1]; - if ((pn & 0xf8) != 0xf8 || // currently must be MPEG1 - (pn & 6) == 0) { // invalid layer - p++; continue; - } - pn = p[2]; - if ((pn & 0xf0) < 0x20 || 
(pn & 0xf0) == 0xf0 || // bitrates - (pn & 0x0c) != 0) { // not 44kHz - continue; - } - - return p - buf; - } - - return -1; -} - static int try_get_bitrate(unsigned char *buf, int buf_size) { int offs1, offs = 0; diff --git a/platform/common/mp3.h b/platform/common/mp3.h index eb66db88b..4a2b230bd 100644 --- a/platform/common/mp3.h +++ b/platform/common/mp3.h @@ -12,8 +12,8 @@ int mp3dec_decode(FILE *f, int *file_pos, int file_len); extern unsigned short mpeg1_l3_bitrates[16]; #ifdef __GP2X__ -void mp3_update_local(int *buffer, int length, int stereo); -void mp3_start_play_local(void *f, int pos); +int _mp3dec_start(FILE *f, int fpos_start); +int _mp3dec_decode(FILE *f, int *file_pos, int file_len); #endif #endif // __COMMON_MP3_H__ diff --git a/platform/common/mp3_helix.c b/platform/common/mp3_helix.c index b27852981..75be8df3b 100644 --- a/platform/common/mp3_helix.c +++ b/platform/common/mp3_helix.c @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -20,10 +21,15 @@ static HMP3Decoder mp3dec; static unsigned char mp3_input_buffer[2 * 1024]; #ifdef __GP2X__ -#define mp3_update mp3_update_local -#define mp3_start_play mp3_start_play_local +#define mp3dec_decode _mp3dec_decode +#define mp3dec_start _mp3dec_start #endif +static void *libhelix; +HMP3Decoder (*p_MP3InitDecoder)(void); +void (*p_MP3FreeDecoder)(HMP3Decoder); +int (*p_MP3Decode)(HMP3Decoder, unsigned char **, int *, short *, int); + int mp3dec_decode(FILE *f, int *file_pos, int file_len) { unsigned char *readPtr; @@ -51,7 +57,7 @@ int mp3dec_decode(FILE *f, int *file_pos, int file_len) bytesLeft -= offset; had_err = err; - err = MP3Decode(mp3dec, &readPtr, &bytesLeft, cdda_out_buffer, 0); + err = p_MP3Decode(mp3dec, &readPtr, &bytesLeft, cdda_out_buffer, 0); if (err) { if (err == ERR_MP3_MAINDATA_UNDERFLOW && !had_err) { // just need another frame @@ -86,10 +92,31 @@ int mp3dec_decode(FILE *f, int *file_pos, int file_len) int mp3dec_start(FILE *f, int fpos_start) { + if (libhelix == NULL) 
{ + libhelix = dlopen("./libhelix.so", RTLD_NOW); + if (libhelix == NULL) { + lprintf("mp3dec: load libhelix.so: %s\n", dlerror()); + return -1; + } + + p_MP3InitDecoder = dlsym(libhelix, "MP3InitDecoder"); + p_MP3FreeDecoder = dlsym(libhelix, "MP3FreeDecoder"); + p_MP3Decode = dlsym(libhelix, "MP3Decode"); + + if (p_MP3InitDecoder == NULL || p_MP3FreeDecoder == NULL + || p_MP3Decode == NULL) + { + lprintf("mp3dec: missing symbol(s) in libhelix.so\n"); + dlclose(libhelix); + libhelix = NULL; + return -1; + } + } + // must re-init decoder for new track if (mp3dec) - MP3FreeDecoder(mp3dec); - mp3dec = MP3InitDecoder(); + p_MP3FreeDecoder(mp3dec); + mp3dec = p_MP3InitDecoder(); return (mp3dec == 0) ? -1 : 0; } diff --git a/platform/common/mp3_sync.c b/platform/common/mp3_sync.c new file mode 100644 index 000000000..509c259dd --- /dev/null +++ b/platform/common/mp3_sync.c @@ -0,0 +1,27 @@ + +/* + * Scan buf[0..size) for a byte-aligned MP3 frame header and return its + * offset from buf, or -1 if none is found. A match is a 0xff byte, then a + * byte with its top five bits set and non-zero bits under mask 6 (layer), + * then a byte whose upper nibble is neither 0, 1 nor 15 and whose 0x0c bits + * are clear. Only offsets with at least three bytes remaining are examined. + */ +int mp3_find_sync_word(const unsigned char *buf, int size) +{ + const unsigned char *p, *pe; + + /* find byte-aligned syncword - need 12 (MPEG 1,2) or 11 (MPEG 2.5) matching bits */ + for (p = buf, pe = buf + size - 3; p <= pe; p++) + { + int pn; + if (p[0] != 0xff) + continue; + pn = p[1]; + if ((pn & 0xf8) != 0xf8 || // currently must be MPEG1 + (pn & 6) == 0) { // invalid layer + p++; continue; + } + pn = p[2]; + if ((pn & 0xf0) < 0x20 || (pn & 0xf0) == 0xf0 || // bitrates + (pn & 0x0c) != 0) { // not 44kHz + continue; + } + + return p - buf; + } + + return -1; +} diff --git a/platform/common/plat_sdl.c b/platform/common/plat_sdl.c index 4446f72e2..276a0c619 100644 --- a/platform/common/plat_sdl.c +++ b/platform/common/plat_sdl.c @@ -89,6 +89,8 @@ static const struct in_pdata in_sdl_platform_data = { /* YUV stuff */ static int yuv_ry[32], yuv_gy[32], yuv_by[32]; static unsigned char yuv_u[32 * 2], yuv_v[32 * 2]; +static unsigned char yuv_y[256]; +static struct uyvy { unsigned int y:8; unsigned int vyu:24; } yuv_uyvy[65536]; void bgr_to_uyvy_init(void) { @@ -119,34 +121,40 @@ 
void bgr_to_uyvy_init(void) v = 255; yuv_v[i + 32] = v; } + // valid Y range seems to be 16..235 + for (i = 0; i < 256; i++) { + yuv_y[i] = 16 + 219 * i / 32; + } + // everything combined into one large array for speed + for (i = 0; i < 65536; i++) { + int r = (i >> 11) & 0x1f, g = (i >> 6) & 0x1f, b = (i >> 0) & 0x1f; + int y = (yuv_ry[r] + yuv_gy[g] + yuv_by[b]) >> 16; + yuv_uyvy[i].y = yuv_y[y]; + yuv_uyvy[i].vyu = (yuv_v[r-y + 32] << 16) | (yuv_y[y] << 8) | yuv_u[b-y + 32]; + } } void rgb565_to_uyvy(void *d, const void *s, int pixels) { - unsigned int *dst = d; - const unsigned short *src = s; - const unsigned char *yu = yuv_u + 32; - const unsigned char *yv = yuv_v + 32; - int r0, g0, b0, r1, g1, b1; - int y0, y1, u, v; - - for (; pixels > 0; src += 2, dst++, pixels -= 2) + uint32_t *dst = d; + const uint16_t *src = s; + + if (plat_sdl_overlay->w > 2*plat_sdl_overlay->h) + for (; pixels > 0; src += 4, dst += 4, pixels -= 4) + { + struct uyvy *uyvy0 = yuv_uyvy + src[0], *uyvy1 = yuv_uyvy + src[1]; + struct uyvy *uyvy2 = yuv_uyvy + src[2], *uyvy3 = yuv_uyvy + src[3]; + dst[0] = (uyvy0->y << 24) | uyvy0->vyu; + dst[1] = (uyvy1->y << 24) | uyvy1->vyu; + dst[2] = (uyvy2->y << 24) | uyvy2->vyu; + dst[3] = (uyvy3->y << 24) | uyvy3->vyu; + } else + for (; pixels > 0; src += 4, dst += 2, pixels -= 4) { - r0 = (src[0] >> 11) & 0x1f; - g0 = (src[0] >> 6) & 0x1f; - b0 = src[0] & 0x1f; - r1 = (src[1] >> 11) & 0x1f; - g1 = (src[1] >> 6) & 0x1f; - b1 = src[1] & 0x1f; - y0 = (yuv_ry[r0] + yuv_gy[g0] + yuv_by[b0]) >> 16; - y1 = (yuv_ry[r1] + yuv_gy[g1] + yuv_by[b1]) >> 16; - u = yu[b0 - y0]; - v = yv[r0 - y0]; - // valid Y range seems to be 16..235 - y0 = 16 + 219 * y0 / 31; - y1 = 16 + 219 * y1 / 31; - - *dst = (y1 << 24) | (v << 16) | (y0 << 8) | u; + struct uyvy *uyvy0 = yuv_uyvy + src[0], *uyvy1 = yuv_uyvy + src[1]; + struct uyvy *uyvy2 = yuv_uyvy + src[2], *uyvy3 = yuv_uyvy + src[3]; + dst[0] = (uyvy1->y << 24) | uyvy0->vyu; + dst[1] = (uyvy3->y << 24) | uyvy2->vyu; } } 
@@ -272,7 +280,7 @@ void plat_init(void) if (shadow_size < 320 * 480 * 2) shadow_size = 320 * 480 * 2; - shadow_fb = malloc(shadow_size); + shadow_fb = calloc(1, shadow_size); g_menubg_ptr = calloc(1, shadow_size); if (shadow_fb == NULL || g_menubg_ptr == NULL) { fprintf(stderr, "OOM\n"); diff --git a/platform/common/version.h b/platform/common/version.h index ce4223b5b..cd811a665 100644 --- a/platform/common/version.h +++ b/platform/common/version.h @@ -1 +1 @@ -#define VERSION "1.92" +#define VERSION "1.96" diff --git a/platform/gizmondo/emu.c b/platform/gizmondo/emu.c index 86c473c2a..fcf271250 100644 --- a/platform/gizmondo/emu.c +++ b/platform/gizmondo/emu.c @@ -155,7 +155,7 @@ static void blit(const char *fps, const char *notice) } // a hack for VR if (PicoIn.AHW & PAHW_SVP) - memset32((int *)(Pico.est.Draw2FB+328*8+328*223), 0xe0e0e0e0, 328); + memset((int *)(Pico.est.Draw2FB+328*8+328*223), 0xe0e0e0e0, 328*4); if (!(Pico.video.reg[12]&1)) lines_flags|=0x10000; if (currentConfig.EmuOpt&0x4000) lines_flags|=0x40000; // (Pico.m.frame_count&1)?0x20000:0x40000; @@ -166,22 +166,25 @@ static void blit(const char *fps, const char *notice) int lines_flags; // 8bit accurate renderer if (Pico.m.dirtyPal) { - Pico.m.dirtyPal = 0; - vidConvCpyRGB565(localPal, Pico.cram, 0x40); + if (Pico.m.dirtyPal == 2) + Pico.m.dirtyPal = 0; + /* no support + switch (Pico.est.SonicPalCount) { + case 3: vidConvCpyRGB565(localPal+0xc0, Pico.est.SonicPal+0xc0, 0x40); + case 2: vidConvCpyRGB565(localPal+0x80, Pico.est.SonicPal+0x80, 0x40); + case 1: vidConvCpyRGB565(localPal+0x40, Pico.est.SonicPal+0x40, 0x40); + default://vidConvCpyRGB565(localPal, Pico.est.SonicPal, 0x40); + } */ + vidConvCpyRGB565(localPal, Pico.est.SonicPal, 0x40); if (Pico.video.reg[0xC]&8) { // shadow/hilight mode - //vidConvCpyRGB32sh(localPal+0x40, Pico.cram, 0x40); - //vidConvCpyRGB32hi(localPal+0x80, Pico.cram, 0x40); // TODO? 
- memcpy32((void *)(localPal+0xc0), (void *)(localPal+0x40), 0x40*2/4); + //vidConvCpyRGB32sh(localPal+0x40, Pico.est.SonicPal, 0x40); + //vidConvCpyRGB32hi(localPal+0x80, Pico.est.SonicPal, 0x40); // TODO? + memcpy((void *)(localPal+0xc0), (void *)(localPal+0x40), 0x40*2); localPal[0xc0] = 0x0600; localPal[0xd0] = 0xc000; localPal[0xe0] = 0x0000; // reserved pixels for OSD localPal[0xf0] = 0xffff; } - /* no support - else if (rendstatus & 0x20) { // mid-frame palette changes - vidConvCpyRGB565(localPal+0x40, HighPal, 0x40); - vidConvCpyRGB565(localPal+0x80, HighPal+0x40, 0x40); - } */ } lines_flags = (Pico.video.reg[1]&8) ? 240 : 224; if (!(Pico.video.reg[12]&1)) lines_flags|=0x10000; diff --git a/platform/gizmondo/menu.c b/platform/gizmondo/menu.c index 51f032f0d..1045f47b2 100644 --- a/platform/gizmondo/menu.c +++ b/platform/gizmondo/menu.c @@ -54,7 +54,7 @@ static unsigned int inp_prev = 0; void menu_draw_begin(int use_bgbuff) { if (use_bgbuff) - memcpy32((int *)menu_screen, (int *)bg_buffer, 321*240*2/4); + memcpy((int *)menu_screen, (int *)bg_buffer, 321*240*2); } @@ -66,7 +66,7 @@ void menu_draw_end(void) lprintf("%s: Framework2D_LockBuffer() returned NULL\n", __FUNCTION__); return; } - memcpy32(giz_screen, (int *)menu_screen, 321*240*2/4); + memcpy(giz_screen, (int *)menu_screen, 321*240*2); fb_unlock(); giz_screen = NULL; fb_flip(); diff --git a/platform/gp2x/940ctl.c b/platform/gp2x/940ctl.c index c270bfeea..cd3fcdc33 100644 --- a/platform/gp2x/940ctl.c +++ b/platform/gp2x/940ctl.c @@ -100,10 +100,10 @@ int YM2612Write_940(unsigned int a, unsigned int v, int scanline) UINT16 *writebuff = shared_ctl->writebuffsel ? shared_ctl->writebuff0 : shared_ctl->writebuff1; /* detect rapid ym updates */ - if (upd && !(writebuff_ptr & 0x80000000) && scanline < 224) + if (upd && !(writebuff_ptr & 0x80000000)) { - int mid = Pico.m.pal ? 68 : 93; - if (scanline > mid) { + int mid = (Pico.m.pal ? 
313 : 262) / 2; + if (scanline >= mid) { //printf("%05i:%03i: rapid ym\n", Pico.m.frame_count, scanline); writebuff[writebuff_ptr++ & 0xffff] = 0xfffe; writebuff_ptr |= 0x80000000; @@ -282,7 +282,7 @@ void sharedmem940_finish(void) } -void YM2612Init_940(int baseclock, int rate) +void YM2612Init_940(int baseclock, int rate, int ssg) { static int oldrate; @@ -339,7 +339,7 @@ void YM2612Init_940(int baseclock, int rate) memset(shared_ctl, 0, sizeof(*shared_ctl)); /* cause local ym2612 to init REGS */ - YM2612Init_(baseclock, rate); + YM2612Init_(baseclock, rate, ssg); internal_reset(); @@ -425,8 +425,7 @@ int YM2612UpdateOne_940(int *buffer, int length, int stereo, int is_buf_empty) int mp3dec_decode(FILE *f, int *file_pos, int file_len) { if (!(PicoIn.opt & POPT_EXT_FM)) { - //mp3_update_local(buffer, length, stereo); - return 0; + return _mp3dec_decode(f, file_pos, file_len); } // check if playback was started, track not ended @@ -457,8 +456,7 @@ int mp3dec_decode(FILE *f, int *file_pos, int file_len) int mp3dec_start(FILE *f, int fpos_start) { if (!(PicoIn.opt & POPT_EXT_FM)) { - //mp3_start_play_local(f, pos); - return -1; + return _mp3dec_start(f, fpos_start); } if (loaded_mp3 != f) diff --git a/platform/gp2x/940ctl.h b/platform/gp2x/940ctl.h index 5b789dad3..dba6cc70d 100644 --- a/platform/gp2x/940ctl.h +++ b/platform/gp2x/940ctl.h @@ -1,7 +1,7 @@ void sharedmem940_init(void); void sharedmem940_finish(void); -void YM2612Init_940(int baseclock, int rate); +void YM2612Init_940(int baseclock, int rate, int ssg); void YM2612ResetChip_940(void); int YM2612UpdateOne_940(int *buffer, int length, int stereo, int is_buf_empty); diff --git a/platform/gp2x/PicoDrive.gpe b/platform/gp2x/PicoDrive.gpe index 1c0651856..59416d938 100644 --- a/platform/gp2x/PicoDrive.gpe +++ b/platform/gp2x/PicoDrive.gpe @@ -7,6 +7,8 @@ if ! 
[ -e /dev/accel ]; then export POLLUX_RAM_TIMINGS='ram_timings=2,9,4,1,1,1,1' export POLLUX_LCD_TIMINGS_NTSC='lcd_timings=397,1,37,277,341,0,17,337;clkdiv0=9' export POLLUX_LCD_TIMINGS_PAL='lcd_timings=428,1,37,277,341,0,17,337;clkdiv0=10' +else + export POLLUX_RAM_TIMINGS='ram_timings=3,9,4,1,1,1,1' fi ./PicoDrive "$@" diff --git a/platform/gp2x/code940/940.c b/platform/gp2x/code940/940.c index 760816eb4..db51fdc9c 100644 --- a/platform/gp2x/code940/940.c +++ b/platform/gp2x/code940/940.c @@ -2,7 +2,7 @@ // (c) Copyright 2006-2007, Grazvydas "notaz" Ignotas #include "940shared.h" -#include "../../common/mp3.h" +#include "../../common/helix/pub/mp3dec.h" static _940_data_t *shared_data = (_940_data_t *) 0x00100000; static _940_ctl_t *shared_ctl = (_940_ctl_t *) 0x00200000; @@ -19,7 +19,7 @@ void drain_wb(void); // is changed by other core just before we update it void set_if_not_changed(int *val, int oldval, int newval); -void _memcpy(void *dst, const void *src, int count); +extern void *memcpy(void *dest, const void *src, unsigned long n); // asm volatile ("mov r0, #0" ::: "r0"); // asm volatile ("mcr p15, 0, r0, c7, c6, 0" ::: "r0"); /* flush dcache */ @@ -153,6 +153,8 @@ void Main940(void) int job = 0; ym2612_940 = &shared_data->ym2612; +// extern unsigned __bss_start__, __bss_end__; +// memset(&__bss_start__, 0, &__bss_end__ - &__bss_start__); for (;;) { @@ -165,8 +167,9 @@ void Main940(void) case JOB940_INITALL: /* ym2612 */ shared_ctl->writebuff0[0] = shared_ctl->writebuff1[0] = 0xffff; - YM2612Init_(shared_ctl->baseclock, shared_ctl->rate); + YM2612Init_(shared_ctl->baseclock, shared_ctl->rate, 0); /* Helix mp3 decoder */ + __malloc_init(); shared_data->mp3dec = MP3InitDecoder(); break; @@ -185,7 +188,7 @@ void Main940(void) case JOB940_PICOSTATESAVE2: YM2612PicoStateSave2(0, 0); - _memcpy(shared_ctl->writebuff0, ym2612_940->REGS, 0x200); + memcpy(shared_ctl->writebuff0, ym2612_940->REGS, 0x200); break; case JOB940_PICOSTATELOAD2_PREP: @@ -193,7 +196,7 @@ 
void Main940(void) break; case JOB940_PICOSTATELOAD2: - _memcpy(ym2612_940->REGS, shared_ctl->writebuff0, 0x200); + memcpy(ym2612_940->REGS, shared_ctl->writebuff0, 0x200); YM2612PicoStateLoad2(0, 0); break; @@ -207,6 +210,7 @@ void Main940(void) case JOB940_MP3RESET: if (shared_data->mp3dec) MP3FreeDecoder(shared_data->mp3dec); + __malloc_init(); shared_data->mp3dec = MP3InitDecoder(); break; } @@ -215,4 +219,3 @@ void Main940(void) dcache_clean(); } } - diff --git a/platform/gp2x/code940/Makefile b/platform/gp2x/code940/Makefile index e327d1361..8561551b5 100644 --- a/platform/gp2x/code940/Makefile +++ b/platform/gp2x/code940/Makefile @@ -1,17 +1,23 @@ # you may or may not need to change this -#devkit_path = x:/stuff/dev/devkitgp2x/ -devkit_path ?= $(HOME)/opt/devkitGP2X/ -lgcc_path = $(devkit_path)lib/gcc/arm-linux/4.0.3/ -CROSS = arm-linux- +#devkit_path ?= $(HOME)/opt/devkitGP2X/ +#lgcc_path = $(devkit_path)lib/gcc/arm-linux/4.0.3/ #CROSS = $(devkit_path)bin/arm-linux- +#devkit_path ?= $(HOME)/opt/open2x +#lgcc_path = $(devkit_path)/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1/ +#CROSS ?= $(devkit_path)/gcc-4.1.1-glibc-2.3.6/bin/arm-open2x-linux- +#devkit_path ?= $(HOME)/opt/arm-unknown-linux-gnu +#lgcc_path = $(HOME)/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1/ +#CROSS ?= $(devkit_path)/bin/arm-unknown-linux-gnu- +lgcc_path = $(HOME)/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1/ +CROSS ?= arm-linux-gnueabi- # settings #up = 1 -CFLAGS += -O2 -Wall -fomit-frame-pointer -fstrict-aliasing -ffast-math -CFLAGS += -I../.. -I. -D__GP2X__ -DARM -CFLAGS += -mcpu=arm940t -mtune=arm940t -LDFLAGS = -static -s -e code940 -Ttext 0x0 -L$(lgcc_path) -lgcc +CFLAGS += -O2 -Wall -mno-thumb-interwork -fstrict-aliasing -ffast-math +CFLAGS += -I../../common/helix/pub -I../../.. -I. 
-D__GP2X__ -DARM +CFLAGS += -mcpu=arm940t -mtune=arm940t -mabi=apcs-gnu -mfloat-abi=soft -mfpu=fpa +LDFLAGS = -static -e code940 -Ttext 0x0 -L$(lgcc_path) -lgcc GCC = $(CROSS)gcc STRIP = $(CROSS)strip @@ -36,7 +42,9 @@ all: $(BIN) # stuff for 940 core # init, emu_control, emu -OBJS940 += 940init.o 940.o 940ym2612.o memcpy.o misc_arm.o mp3.o +OBJS940 += 940init.o 940.o 940ym2612.o misc_arm.o mp3_sync.o +# the asm memcpy code crashes job LOAD2 on 940. Possibly a clobbered reg? +# OBJS940 += memcpy.o # the asm code seems to be faster when run on 920, but not on 940 for some reason # OBJS940 += ../../Pico/sound/ym2612_asm.o @@ -44,12 +52,13 @@ OBJS940 += 940init.o 940.o 940ym2612.o memcpy.o misc_arm.o mp3.o OBJS940 += uClibc/memset.o uClibc/s_floor.o uClibc/e_pow.o uClibc/e_sqrt.o uClibc/s_fabs.o OBJS940 += uClibc/s_scalbn.o uClibc/s_copysign.o uClibc/k_sin.o uClibc/k_cos.o uClibc/s_sin.o OBJS940 += uClibc/e_rem_pio2.o uClibc/k_rem_pio2.o uClibc/e_log.o uClibc/wrappers.o +LIBHELIX ?= ../../common/helix/$(notdir $(CROSS))helix_mp3.a $(BIN) : code940.elf @echo ">>>" $@ $(OBJCOPY) -O binary $< $@ -code940.elf : $(OBJS940) ../../common/helix/$(CROSS)helix-mp3.a +code940.elf : $(OBJS940) $(LIBHELIX) @echo ">>>" $@ $(LD) $^ $(LDFLAGS) -o $@ -Map code940.map @@ -64,8 +73,12 @@ misc_arm.o : ../../../pico/misc_arm.s @echo ">>>" $@ $(GCC) $(CFLAGS) -DEXTERNAL_YM2612 -c $< -o $@ -../../common/helix/helix_mp3.a: - @make -C ../../common/helix/ +mp3_sync.o: ../../common/mp3_sync.c + @echo ">>>" $@ + $(GCC) $(CFLAGS) -Os -DCODE940 -c $< -o $@ + +$(LIBHELIX): + @$(MAKE) -C ../../common/helix/ CROSS=$(CROSS) up: $(BIN) @@ -82,7 +95,7 @@ tidy: ## OBJSMP3T = mp3test.o ../gp2x.o ../asmutils.o ../usbjoy.o -mp3test.gpe : $(OBJSMP3T) ../helix/helix_mp3.a +mp3test.gpe : $(OBJSMP3T) $(LIBHELIX) $(GCC) -static -o $@ $^ $(STRIP) $@ @cp -v $@ /mnt/gp2x/mnt/sd diff --git a/platform/gp2x/code940/memcpy.s b/platform/gp2x/code940/memcpy.s index 282762fd0..1350639a7 100644 --- 
a/platform/gp2x/code940/memcpy.s +++ b/platform/gp2x/code940/memcpy.s @@ -114,14 +114,12 @@ subs r2, r2, #0x14 blt Lmemcpy_fl32 /* less than 32 bytes (12 from above) */ stmdb sp!, {r4, r7, r8, r9, r10} /* borrow r4 */ -/* blat 64 bytes at a time */ +/* blat 32 bytes at a time */ /* XXX for really big copies perhaps we should use more registers */ Lmemcpy_floop32: ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr} stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr} -ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr} -stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr} -subs r2, r2, #0x40 +subs r2, r2, #0x20 bge Lmemcpy_floop32 cmn r2, #0x10 @@ -314,14 +312,12 @@ stmdb sp!, {r4, r7, r8, r9, r10, lr} subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */ blt Lmemcpy_bl32 -/* blat 64 bytes at a time */ +/* blat 32 bytes at a time */ /* XXX for really big copies perhaps we should use more registers */ Lmemcpy_bloop32: ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr} stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr} -ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr} -stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr} -subs r2, r2, #0x40 +subs r2, r2, #0x20 bge Lmemcpy_bloop32 Lmemcpy_bl32: diff --git a/platform/gp2x/code940/mp3test.c b/platform/gp2x/code940/mp3test.c index 9072d858b..cd2a66514 100644 --- a/platform/gp2x/code940/mp3test.c +++ b/platform/gp2x/code940/mp3test.c @@ -13,7 +13,7 @@ //#include "emu.h" //#include "menu.h" #include "../asmutils.h" -#include "../helix/pub/mp3dec.h" +#include "../../helix/pub/mp3dec.h" /* we will need some gp2x internals here */ extern volatile unsigned short *gp2x_memregs; /* from minimal library rlyeh */ diff --git a/platform/gp2x/code940/uClibc/memset.s b/platform/gp2x/code940/uClibc/memset.s index 0923014cd..80cdcb58d 100644 --- a/platform/gp2x/code940/uClibc/memset.s +++ b/platform/gp2x/code940/uClibc/memset.s @@ -22,7 +22,7 @@ .text .global memset .type memset,%function - .align 4 + .align 2 memset: mov a4, a1 diff --git 
a/platform/gp2x/code940/uClibc/wrappers.c b/platform/gp2x/code940/uClibc/wrappers.c index cc4e269ed..ce95a48c0 100644 --- a/platform/gp2x/code940/uClibc/wrappers.c +++ b/platform/gp2x/code940/uClibc/wrappers.c @@ -4,9 +4,17 @@ double pow(double x, double y) { return __ieee754_pow(x, y); } +double __pow_finite(double x, double y) +{ + return __ieee754_pow(x, y); +} double log(double x) { return __ieee754_log(x); } +double __log_finite(double x) +{ + return __ieee754_log(x); +} diff --git a/platform/gp2x/emu.c b/platform/gp2x/emu.c index 7e9a132f3..1deb84da5 100644 --- a/platform/gp2x/emu.c +++ b/platform/gp2x/emu.c @@ -55,7 +55,7 @@ void pemu_prep_defconfig(void) gp2x_soc_t soc; defaultConfig.CPUclock = default_cpu_clock; - defaultConfig.renderer32x = RT_8BIT_FAST; + defaultConfig.renderer32x = RT_8BIT_ACC; defaultConfig.analog_deadzone = 50; soc = soc_detect(); @@ -291,38 +291,51 @@ static int EmuScanEnd16_ld(unsigned int num) } static int localPal[0x100]; +static int localPalSize; + static void (*vidcpyM2)(void *dest, void *src, int m32col, int with_32c_border); static int (*make_local_pal)(int fast_mode); static int make_local_pal_md(int fast_mode) { - int pallen = 0xc0; - - bgr444_to_rgb32(localPal, Pico.cram); - if (fast_mode) - return 0x40; + int pallen = 0x100; - if (Pico.video.reg[0xC] & 8) { // shadow/hilight mode - bgr444_to_rgb32_sh(localPal, Pico.cram); - localPal[0xc0] = 0x0000c000; - localPal[0xd0] = 0x00c00000; - localPal[0xe0] = 0x00000000; // reserved pixels for OSD - localPal[0xf0] = 0x00ffffff; - pallen = 0x100; + if (fast_mode) { + bgr444_to_rgb32(localPal, PicoMem.cram); + pallen = 0x40; + Pico.m.dirtyPal = 0; } else if (Pico.est.rendstatus & PDRAW_SONIC_MODE) { // mid-frame palette changes - bgr444_to_rgb32(localPal+0x40, Pico.est.HighPal); - bgr444_to_rgb32(localPal+0x80, Pico.est.HighPal+0x40); + switch (Pico.est.SonicPalCount) { + case 3: bgr444_to_rgb32(localPal+0xc0, Pico.est.SonicPal+0xc0); + case 2: bgr444_to_rgb32(localPal+0x80, 
Pico.est.SonicPal+0x80); + case 1: bgr444_to_rgb32(localPal+0x40, Pico.est.SonicPal+0x40); + default:bgr444_to_rgb32(localPal, Pico.est.SonicPal); + } + pallen = (Pico.est.SonicPalCount+1)*0x40; } - else - memcpy(localPal + 0x80, localPal, 0x40 * 4); // for spr prio mess + else if (Pico.video.reg[0xC] & 8) { // shadow/hilight mode + bgr444_to_rgb32(localPal, Pico.est.SonicPal); + bgr444_to_rgb32_sh(localPal, Pico.est.SonicPal); + } + else { + bgr444_to_rgb32(localPal, Pico.est.SonicPal); + memcpy(localPal+0x40, localPal, 0x40*4); // for spr prio mess + memcpy(localPal+0x80, localPal, 0x80*4); // for spr prio mess + } + localPal[0xc0] = 0x0000c000; + localPal[0xd0] = 0x00c00000; + localPal[0xe0] = 0x00000000; // reserved pixels for OSD + localPal[0xf0] = 0x00ffffff; + if (Pico.m.dirtyPal == 2) + Pico.m.dirtyPal = 0; return pallen; } static int make_local_pal_sms(int fast_mode) { - unsigned short *spal = Pico.cram; + unsigned short *spal = PicoMem.cram; unsigned int *dpal = (void *)localPal; unsigned int i, t; @@ -334,25 +347,21 @@ static int make_local_pal_sms(int fast_mode) *dpal++ = t; } + Pico.m.dirtyPal = 0; return 0x40; } void pemu_finalize_frame(const char *fps, const char *notice) { int emu_opt = currentConfig.EmuOpt; - int ret; if (PicoIn.AHW & PAHW_32X) - ; // nothing to do + localPalSize = 0; // nothing to do else if (get_renderer() == RT_8BIT_FAST) { // 8bit fast renderer - if (Pico.m.dirtyPal) { - Pico.m.dirtyPal = 0; - ret = make_local_pal(1); - // feed new palette to our device - gp2x_video_setpalette(localPal, ret); - } + if (Pico.m.dirtyPal) + localPalSize = make_local_pal(1); // a hack for VR if (PicoIn.AHW & PAHW_SVP) memset32((int *)(Pico.est.Draw2FB+328*8+328*223), 0xe0e0e0e0, 328); @@ -364,12 +373,9 @@ void pemu_finalize_frame(const char *fps, const char *notice) { // 8bit accurate renderer if (Pico.m.dirtyPal) - { - Pico.m.dirtyPal = 0; - ret = make_local_pal(0); - gp2x_video_setpalette(localPal, ret); - } + localPalSize = make_local_pal(0); } 
+ else localPalSize = 0; // no palette in 16bit mode if (notice) osd_text(4, osd_y, notice); @@ -385,6 +391,10 @@ void plat_video_flip(void) { int stride = g_screen_width; gp2x_video_flip(); + // switching the palette takes immediate effect, whilst flipping only + // takes effect with the next vsync; unavoidable flicker may occur! + if (localPalSize) + gp2x_video_setpalette(localPal, localPalSize); if (is_16bit_mode()) stride *= 2; @@ -502,9 +512,6 @@ static void vid_reset_mode(void) if (renderer == RT_16BIT && (currentConfig.EmuOpt & EOPT_WIZ_TEAR_FIX)) { PicoDrawSetOutFormat(PDF_RGB555, 1); } - else { - PicoDrawSetOutFormat(PDF_NONE, 0); - } PicoDrawSetOutBuf(g_screen_ptr, g_screen_width * 2); gp2x_mode = 16; } @@ -537,10 +544,7 @@ static void vid_reset_mode(void) localPal[0xe0] = 0x00000000; // reserved pixels for OSD localPal[0xf0] = 0x00ffffff; gp2x_video_setpalette(localPal, 0x100); - gp2x_memset_all_buffers(0, 0xe0, 320*240); } - else - gp2x_memset_all_buffers(0, 0, 320*240*2); if (currentConfig.EmuOpt & EOPT_WIZ_TEAR_FIX) gp2x_mode = -gp2x_mode; @@ -723,6 +727,8 @@ void pemu_forced_frame(int no_scale, int do_emu) PicoDrawSetCallbacks(NULL, NULL); Pico.m.dirtyPal = 1; + if (!no_scale) + no_scale = currentConfig.scaling == EOPT_SCALE_NONE; emu_cmn_forced_frame(no_scale, do_emu); g_menubg_src_ptr = g_screen_ptr; diff --git a/platform/libpicofe b/platform/libpicofe index 795b71c57..811cef4d9 160000 --- a/platform/libpicofe +++ b/platform/libpicofe @@ -1 +1 @@ -Subproject commit 795b71c571518b310a22138141bb6d1cd08d85f6 +Subproject commit 811cef4d9f3772d0bbf6c1f0434e5860c9550abc diff --git a/platform/libretro/libretro.c b/platform/libretro/libretro.c index 77e2c50ce..de727606c 100644 --- a/platform/libretro/libretro.c +++ b/platform/libretro/libretro.c @@ -116,6 +116,8 @@ static short ALIGNED(4) sndBuffer[2*INITIAL_SND_RATE/50]; static void snd_write(int len); +char **g_argv; + #ifdef _WIN32 #define SLASH '\\' #else @@ -565,6 +567,8 @@ void 
emu_video_mode_change(int start_line, int line_count, int is_32cols) void emu_32x_startup(void) { + PicoDrawSetOutFormat(PDF_RGB555, 0); + PicoDrawSetOutBuf(vout_buf, vout_width * 2); } void lprintf(const char *fmt, ...) diff --git a/platform/linux/blit.c b/platform/linux/blit.c index 96326fe13..82bc4ba53 100644 --- a/platform/linux/blit.c +++ b/platform/linux/blit.c @@ -61,10 +61,11 @@ void vidcpy_m2(void *dest, void *src, int m32col, int with_32c_border) for (i = 0; i < 224; i++) { ps += 8; + ps += 32; pd += 32; for (u = 0; u < 256; u++) *pd++ = *ps++; - ps += 64; + ps += 32; pd += 32; } } else { diff --git a/platform/linux/emu.c b/platform/linux/emu.c index 5d4432fa0..597c13086 100644 --- a/platform/linux/emu.c +++ b/platform/linux/emu.c @@ -29,7 +29,7 @@ void pemu_prep_defconfig(void) void pemu_validate_config(void) { -#if !defined(__arm__) && !defined(__i386__) && !defined(__x86_64__) +#if !defined(__arm__) && !defined(__aarch64__) && !defined(__mips__) && !defined(__riscv__) && !defined(__riscv) && !defined(__powerpc__) && !defined(__i386__) && !defined(__x86_64__) PicoIn.opt &= ~POPT_EN_DRC; #endif } @@ -39,10 +39,11 @@ static void draw_cd_leds(void) int led_reg, pitch, scr_offs, led_offs; led_reg = Pico_mcd->s68k_regs[0]; - pitch = 320; + pitch = g_screen_ppitch; led_offs = 4; scr_offs = pitch * 2 + 4; +#if 0 if (currentConfig.renderer != RT_16BIT) { #define p(x) px[(x) >> 2] // 8-bit modes @@ -52,7 +53,9 @@ static void draw_cd_leds(void) p(pitch*0) = p(pitch*1) = p(pitch*2) = col_g; p(pitch*0 + led_offs) = p(pitch*1 + led_offs) = p(pitch*2 + led_offs) = col_r; #undef p - } else { + } else +#endif + { #define p(x) px[(x)*2 >> 2] = px[((x)*2 >> 2) + 1] // 16-bit modes unsigned int *px = (unsigned int *)((short *)g_screen_ptr + scr_offs); @@ -71,8 +74,8 @@ void pemu_finalize_frame(const char *fps, const char *notice) unsigned char *ps = Pico.est.Draw2FB + 328*8 + 8; unsigned short *pal = Pico.est.HighPal; int i, x; - if (Pico.m.dirtyPal) - 
PicoDrawUpdateHighPal(); + + PicoDrawUpdateHighPal(); for (i = 0; i < 224; i++, ps += 8) for (x = 0; x < 320; x++) *pd++ = pal[*ps++]; @@ -109,6 +112,8 @@ static void apply_renderer(void) if (PicoIn.AHW & PAHW_32X) PicoDrawSetOutBuf(g_screen_ptr, g_screen_ppitch * 2); + + Pico.m.dirtyPal = 1; } void plat_video_toggle_renderer(int change, int is_menu) @@ -174,7 +179,10 @@ void plat_debug_cat(char *str) void emu_video_mode_change(int start_line, int line_count, int is_32cols) { // clear whole screen in all buffers - memset32(g_screen_ptr, 0, g_screen_ppitch * g_screen_height * 2 / 4); + if (currentConfig.renderer != RT_16BIT && !(PicoIn.AHW & PAHW_32X)) + memset32(Pico.est.Draw2FB, 0, (320+8) * (8+240+8) / 4); + else + memset32(g_screen_ptr, 0, g_screen_ppitch * g_screen_height * 2 / 4); } void pemu_loop_prep(void) diff --git a/platform/linux/pprof.c b/platform/linux/pprof.c index e1ecd1fd4..6c7c0ff9f 100644 --- a/platform/linux/pprof.c +++ b/platform/linux/pprof.c @@ -1,21 +1,46 @@ #include #include #include +#include #include #include #include +#include #include +int rc_mem[pp_total_points]; + struct pp_counters *pp_counters; +int *refcounts = rc_mem; static int shmemid; +static unsigned long devMem; +volatile unsigned long *gp2x_memregl; +volatile unsigned short *gp2x_memregs; + void pprof_init(void) { int this_is_new_shmem = 1; key_t shmemkey; void *shmem; +#if 0 + devMem = open("/dev/mem", O_RDWR); + if (devMem == -1) + { + perror("pprof: open failed"); + return; + } + gp2x_memregl = (unsigned long *)mmap(0, 0x10000, PROT_READ|PROT_WRITE, MAP_SHARED, devMem, 0xc0000000); + if (gp2x_memregl == (unsigned long *)-1) + { + perror("pprof: mmap failed"); + return; + } + gp2x_memregs = (unsigned short *)gp2x_memregl; +#endif + #ifndef PPROF_TOOL unsigned int tmp = pprof_get_one(); printf("pprof: measured diff is %u\n", pprof_get_one() - tmp); @@ -28,11 +53,11 @@ void pprof_init(void) return; } -#ifndef PPROF_TOOL +//#ifndef PPROF_TOOL shmemid = shmget(shmemkey, 
sizeof(*pp_counters), IPC_CREAT | IPC_EXCL | 0644); if (shmemid == -1) -#endif +//#endif { shmemid = shmget(shmemkey, sizeof(*pp_counters), 0644); @@ -76,15 +101,18 @@ static const struct { IT(draw), IT(sound), IT(m68k), + IT(s68k), + IT(mem68), IT(z80), IT(msh2), IT(ssh2), + IT(memsh), IT(dummy), }; int main(int argc, char *argv[]) { - unsigned long long old[pp_total_points], new[pp_total_points]; + pp_type old[pp_total_points], new[pp_total_points]; int base = 0; int l, i; @@ -107,11 +135,12 @@ int main(int argc, char *argv[]) memcpy(new, pp_counters->counter, sizeof(new)); for (i = 0; i < ARRAY_SIZE(pp_tab); i++) { - unsigned long long idiff = new[i] - old[i]; - unsigned long long bdiff = (new[base] - old[base]) | 1; + pp_type idiff = new[i] - old[i]; + pp_type bdiff = (new[base] - old[base]) | 1; printf("%6.2f ", (double)idiff * 100.0 / bdiff); } printf("\n"); + fflush(stdout); memcpy(old, new, sizeof(old)); if (argc < 3) diff --git a/platform/linux/pprof.h b/platform/linux/pprof.h index cccbcbd5b..91fd5b09f 100644 --- a/platform/linux/pprof.h +++ b/platform/linux/pprof.h @@ -7,21 +7,22 @@ enum pprof_points { pp_draw, pp_sound, pp_m68k, + pp_s68k, + pp_mem68, pp_z80, pp_msh2, pp_ssh2, + pp_memsh, pp_dummy, pp_total_points }; -struct pp_counters -{ - unsigned long long counter[pp_total_points]; -}; - extern struct pp_counters *pp_counters; +extern int *refcounts; #ifdef __i386__ +typedef unsigned long long pp_type; + static __attribute__((always_inline)) inline unsigned int pprof_get_one(void) { unsigned long long ret; @@ -31,24 +32,38 @@ static __attribute__((always_inline)) inline unsigned int pprof_get_one(void) #define unglitch_timer(x) #elif defined(__GP2X__) +typedef unsigned long pp_type; + +#if 0 // XXX: MMSP2 only, timer sometimes seems to return lower vals? 
extern volatile unsigned long *gp2x_memregl; #define pprof_get_one() (unsigned int)gp2x_memregl[0x0a00 >> 2] #define unglitch_timer(di) \ if ((signed int)(di) < 0) di = 0 +#else +extern unsigned int (*gp2x_get_ticks_us)(void); +#define pprof_get_one() gp2x_get_ticks_us() +#define unglitch_timer(di) \ + if ((signed int)(di) < 0) di = 0 +#endif #else #error no timer #endif +struct pp_counters +{ + pp_type counter[pp_total_points]; +}; + #define pprof_start(point) { \ - unsigned int pp_start_##point = pprof_get_one() + unsigned int pp_start_##point = pprof_get_one(); refcounts[pp_##point]++ #define pprof_end(point) \ { \ unsigned int di = pprof_get_one() - pp_start_##point; \ unglitch_timer(di); \ - pp_counters->counter[pp_##point] += di; \ + if (!--refcounts[pp_##point]) pp_counters->counter[pp_##point] += di; \ } \ } @@ -57,7 +72,7 @@ extern volatile unsigned long *gp2x_memregl; { \ unsigned int di = pprof_get_one() - pp_start_##point; \ unglitch_timer(di); \ - pp_counters->counter[pp_##point] -= di; \ + if (--refcounts[pp_##point]) pp_counters->counter[pp_##point] -= di; \ } \ } diff --git a/platform/psp/emu.c b/platform/psp/emu.c index 5c0cb57f7..5c7ff2162 100644 --- a/platform/psp/emu.c +++ b/platform/psp/emu.c @@ -201,13 +201,22 @@ static void do_pal_update(int allow_sh, int allow_as) //for (i = 0x3f/2; i >= 0; i--) // dpal[i] = ((spal[i]&0x000f000f)<< 1)|((spal[i]&0x00f000f0)<<3)|((spal[i]&0x0f000f00)<<4); - do_pal_convert(localPal, Pico.cram, currentConfig.gamma, currentConfig.gamma2); - - Pico.m.dirtyPal = 0; - need_pal_upload = 1; - - if (allow_sh && (Pico.video.reg[0xC]&8)) // shadow/hilight? 
+ if ((currentConfig.EmuOpt&0x80) || (PicoOpt&0x10)) { + do_pal_convert(localPal, Pico.cram, currentConfig.gamma, currentConfig.gamma2); + Pico.m.dirtyPal = 0; + } + else if (Pico.est.rendstatus&0x20) + { + switch (Pico.est.SonicPalCount) { + case 3: do_pal_convert(localPal+0xc0, Pico.est.SonicPal+0xc0, currentConfig.gamma, currentConfig.gamma2); + case 2: do_pal_convert(localPal+0x80, Pico.est.SonicPal+0x80, currentConfig.gamma, currentConfig.gamma2); + case 1: do_pal_convert(localPal+0x40, Pico.est.SonicPal+0x40, currentConfig.gamma, currentConfig.gamma2); + default:do_pal_convert(localPal, Pico.est.SonicPal, currentConfig.gamma, currentConfig.gamma2); + } + } + else if (allow_sh && (Pico.video.reg[0xC]&8)) // shadow/hilight? { + do_pal_convert(localPal, Pico.est.SonicPal, currentConfig.gamma, currentConfig.gamma2); // shadowed pixels for (i = 0x3f/2; i >= 0; i--) dpal[0x20|i] = dpal[0x60|i] = (dpal[i]>>1)&0x7bcf7bcf; @@ -223,6 +232,16 @@ static void do_pal_update(int allow_sh, int allow_as) localPal[0xe0] = 0; localPal[0xf0] = 0x001f; } + else if (allow_as) + { + do_pal_convert(localPal, Pico.est.SonicPal, currentConfig.gamma, currentConfig.gamma2); + memcpy((int *)dpal+0x40/2, (void *)localPal, 0x40*2); + memcpy((int *)dpal+0x80/2, (void *)localPal, 0x80*2); + } + + if (Pico.m.dirtyPal == 2) + Pico.m.dirtyPal = 0; + need_pal_upload = 1; } static void do_slowmode_lines(int line_to) @@ -639,7 +658,7 @@ static void writeSound(int len) PicoIn.sndOut += len / 2; /*if (PicoIn.sndOut > sndBuffer_endptr) { - memcpy32((int *)(void *)sndBuffer, (int *)endptr, (PicoIn.sndOut - endptr + 1) / 2); + memcpy((int *)(void *)sndBuffer, (int *)endptr, (PicoIn.sndOut - endptr + 1) * 2); PicoIn.sndOut = &sndBuffer[PicoIn.sndOut - endptr]; lprintf("mov\n"); } diff --git a/platform/psp/menu.c b/platform/psp/menu.c index ab022f979..fc31b8e79 100644 --- a/platform/psp/menu.c +++ b/platform/psp/menu.c @@ -59,7 +59,7 @@ void menu_draw_begin(void) // int i; // for (i = 272; i >= 0; i--, 
dst += 512, src += 480) - // memcpy32((int *)dst, (int *)src, 480*2/4); + // memcpy((int *)dst, (int *)src, 480*2); sceGuSync(0,0); // sync with prev sceGuStart(GU_DIRECT, guCmdList); diff --git a/tools/Makefile b/tools/Makefile index 28b748d44..752cd6b26 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -1,13 +1,10 @@ -CFLAGS = -Wall -ggdb - -TARGETS = amalgamate textfilter mkoffsets +TARGETS = amalgamate textfilter OBJS = $(addsuffix .o,$(TARGETS)) all: $(TARGETS) + CC="$(XCC)" CFLAGS="$(XCFLAGS)" ./mkoffsets.sh ../pico clean: $(RM) $(TARGETS) $(OBJS) -mkoffsets: CFLAGS += -m32 -I.. - .PHONY: clean all diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh new file mode 100755 index 000000000..6b086a939 --- /dev/null +++ b/tools/mkoffsets.sh @@ -0,0 +1,115 @@ +# automatically compute structure offsets for gcc targets in ELF format +# (C) 2018 Kai-Uwe Bloem. This work is placed in the public domain. +# +# usage: mkoffsets <output dir> + +CC=${CC:-gcc} + +# endianness of target (automagically determined below) +ENDIAN= + +# don't do this if ELF format isn't used. it doesn't matter since offsets are +# only needed for the asm parts (currently mips/arm32) and those have ELF +check_elf () +{ + echo '#include <stdint.h>' >/tmp/getoffs.c + echo "const int32_t val = 1;" >>/tmp/getoffs.c + $CC $CFLAGS -I .. -c /tmp/getoffs.c -o /tmp/getoffs.o || exit 1 + if ! command -v readelf >/dev/null || ! file /tmp/getoffs.o | grep -q ELF; then + echo "/* mkoffset.sh: no readelf or not ELF, offset table not created */" >$fn + echo "WARNING: no readelf or not ELF, offset table not created" + exit + fi +} + +# compile with target C compiler and extract value from .rodata section +compile_rodata () +{ + $CC $CFLAGS -I .. 
-c /tmp/getoffs.c -o /tmp/getoffs.o || exit 1 + # find the name of the .rodata section (in case -fdata-sections is used) + rosect=$(readelf -S /tmp/getoffs.o | grep '\.rodata\|\.sdata' | + sed 's/^[^.]*././;s/ .*//') + # read out .rodata section as hex string (should be only 4 bytes) + ro=$(readelf -x $rosect /tmp/getoffs.o | grep '0x' | cut -c14-48 | + tr -d ' \n' | cut -c1-8) + if [ "$ENDIAN" = "le" ]; then + # swap needed for le target + hex="" + for b in $(echo $ro | sed 's/\([0-9a-f]\{2\}\)/\1 /g'); do + hex=$b$hex; + done + else + hex=$ro + fi + # extract decimal value from hex string + rodata=$(printf "%d" 0x$hex) +} + +# determine member offset and create #define +get_define () # prefix struct member member... +{ + prefix=$1; shift + struct=$1; shift + field=$(echo $* | sed 's/ /./g') + name=$(echo $* | sed 's/ /_/g') + echo '#include <stdint.h>' > /tmp/getoffs.c + echo '#include "pico/pico_int.h"' >> /tmp/getoffs.c + echo "static const struct $struct p;" >> /tmp/getoffs.c + echo "const int32_t val = (char *)&p.$field - (char*)&p;" >>/tmp/getoffs.c + compile_rodata + line=$(printf "#define %-20s 0x%04x" $prefix$name $rodata) +} + +fn="${1:-.}/pico_int_offs.h" +if echo $CFLAGS | grep -qe -flto; then CFLAGS="$CFLAGS -fno-lto"; fi + +check_elf +# determine endianness +echo '#include <stdint.h>' >/tmp/getoffs.c +echo "const int32_t val = 1;" >>/tmp/getoffs.c +compile_rodata +ENDIAN=$(if [ "$rodata" -eq 1 ]; then echo be; else echo le; fi) +# output header +echo "/* autogenerated by mkoffset.sh, do not edit */" >$fn +echo "/* target endianess: $ENDIAN, compiled with: $CC $CFLAGS */" >>$fn +# output offsets +get_define OFS_Pico_ Pico video reg ; echo "$line" >>$fn +get_define OFS_Pico_ Pico m rotate ; echo "$line" >>$fn +get_define OFS_Pico_ Pico m z80Run ; echo "$line" >>$fn +get_define OFS_Pico_ Pico m dirtyPal ; echo "$line" >>$fn +get_define OFS_Pico_ Pico m hardware ; echo "$line" >>$fn +get_define OFS_Pico_ Pico m z80_reset ; echo "$line" >>$fn +get_define OFS_Pico_ Pico m 
sram_reg ; echo "$line" >>$fn +get_define OFS_Pico_ Pico sv ; echo "$line" >>$fn +get_define OFS_Pico_ Pico sv data ; echo "$line" >>$fn +get_define OFS_Pico_ Pico sv start ; echo "$line" >>$fn +get_define OFS_Pico_ Pico sv end ; echo "$line" >>$fn +get_define OFS_Pico_ Pico sv flags ; echo "$line" >>$fn +get_define OFS_Pico_ Pico rom ; echo "$line" >>$fn +get_define OFS_Pico_ Pico romsize ; echo "$line" >>$fn +get_define OFS_Pico_ Pico est ; echo "$line" >>$fn + +get_define OFS_EST_ PicoEState DrawScanline ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState rendstatus ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState DrawLineDest ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState HighCol ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState HighPreSpr ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState Pico ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState PicoMem_vram ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState PicoMem_cram ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState PicoOpt ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState Draw2FB ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState HighPal ; echo "$line" >>$fn + +get_define OFS_PMEM_ PicoMem vram ; echo "$line" >>$fn +get_define OFS_PMEM_ PicoMem vsram ; echo "$line" >>$fn +get_define OFS_PMEM32x_ Pico32xMem pal_native ; echo "$line" >>$fn + +get_define OFS_SH2_ SH2_ is_slave ; echo "$line" >>$fn +get_define OFS_SH2_ SH2_ p_bios ; echo "$line" >>$fn +get_define OFS_SH2_ SH2_ p_da ; echo "$line" >>$fn +get_define OFS_SH2_ SH2_ p_sdram ; echo "$line" >>$fn +get_define OFS_SH2_ SH2_ p_rom ; echo "$line" >>$fn +get_define OFS_SH2_ SH2_ p_dram ; echo "$line" >>$fn +get_define OFS_SH2_ SH2_ p_drcblk_da ; echo "$line" >>$fn +get_define OFS_SH2_ SH2_ p_drcblk_ram ; echo "$line" >>$fn