From 99dd1cbcc3d253a96ae32d912c326de541473044 Mon Sep 17 00:00:00 2001 From: orbea Date: Sun, 1 Apr 2018 19:43:22 -0700 Subject: [PATCH 001/174] Makefile: Build with optimizations if DEBUG=0 --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index d5a6ea725..3d7cbd629 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ $(LD) ?= $(CC) TARGET ?= PicoDrive +CFLAGS += -Wall -g CFLAGS += -I. CYCLONE_CC ?= gcc CYCLONE_CXX ?= g++ From 2d52bde825a3fe0c72c4b18f7dc75157ef678631 Mon Sep 17 00:00:00 2001 From: orbea Date: Tue, 3 Apr 2018 10:41:26 -0700 Subject: [PATCH 002/174] libretro: Allow setting GIT_VERSION. --- Makefile.libretro | 2 +- jni/Android.mk | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.libretro b/Makefile.libretro index daeb02855..89cfea1c4 100644 --- a/Makefile.libretro +++ b/Makefile.libretro @@ -41,7 +41,7 @@ CFLAGS += -I platform/libretro/libretro-common/include/vfs STATIC_LINKING:= 0 TARGET_NAME := picodrive LIBM := -lm -GIT_VERSION := " $(shell git rev-parse --short HEAD || echo unknown)" +GIT_VERSION ?= " $(shell git rev-parse --short HEAD || echo unknown)" ifneq ($(GIT_VERSION)," unknown") CFLAGS += -DGIT_VERSION=\"$(GIT_VERSION)\" endif diff --git a/jni/Android.mk b/jni/Android.mk index b72cbba9a..1ff6c9e72 100644 --- a/jni/Android.mk +++ b/jni/Android.mk @@ -52,8 +52,8 @@ SOURCES_C := $(LIBRETRO_DIR)/libretro.c \ COREFLAGS := $(addprefix -D,$(DEFINES)) -fno-strict-aliasing -GIT_VERSION := " $(shell git rev-parse --short HEAD || echo unknown)" -ifneq ($(GIT_VERSION)," unknown") +GIT_VERSION := $(shell git rev-parse --short HEAD || echo unknown) +ifneq ($(GIT_VERSION),"unknown") COREFLAGS += -DGIT_VERSION=\"$(GIT_VERSION)\" endif From e7e09a298ed7050d6764c3641dcf03df71805051 Mon Sep 17 00:00:00 2001 From: notaz Date: Fri, 25 Jan 2019 01:31:56 +0200 Subject: [PATCH 003/174] release 1.93 just because orbea wants a release tarball --- platform/common/version.h | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/platform/common/version.h b/platform/common/version.h index ce4223b5b..f65ba1eda 100644 --- a/platform/common/version.h +++ b/platform/common/version.h @@ -1 +1 @@ -#define VERSION "1.92" +#define VERSION "1.93" From 15e04456e454af720740f61bf2481a76eba4d796 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 15 Mar 2019 20:51:51 +0100 Subject: [PATCH 004/174] fix gp2x compilation (using linaro arm gcc 4.7 on ubuntu) --- Makefile | 4 +++- pico/sound/ym2612.h | 9 ++++----- platform/common/common.mak | 2 +- platform/gp2x/emu.c | 6 +++--- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 3d7cbd629..55e76f91b 100644 --- a/Makefile +++ b/Makefile @@ -125,6 +125,8 @@ OBJS += platform/gp2x/vid_pollux.o OBJS += platform/gp2x/warm.o USE_FRONTEND = 1 PLATFORM_MP3 = 1 +PLATFORM_ZLIB = 1 +HAVE_ARMv6 = 0 endif ifeq "$(PLATFORM)" "libretro" OBJS += platform/libretro/libretro.o @@ -178,7 +180,7 @@ else OBJS += platform/common/mp3_dummy.o endif -ifeq "$(PLATFORM)" "libretro" +ifeq "$(PLATFORM_ZLIB)" "1" # zlib OBJS += zlib/gzio.o zlib/inffast.o zlib/inflate.o zlib/inftrees.o zlib/trees.o \ zlib/deflate.o zlib/crc32.o zlib/adler32.o zlib/zutil.o zlib/compress.o zlib/uncompr.o diff --git a/pico/sound/ym2612.h b/pico/sound/ym2612.h index a2921b222..bbe6b1a48 100644 --- a/pico/sound/ym2612.h +++ b/pico/sound/ym2612.h @@ -176,20 +176,19 @@ int YM2612PicoStateLoad2(int *tat, int *tbt); #else /* GP2X specific */ #include "../../platform/gp2x/940ctl.h" -extern int PicoIn.opt; #define YM2612Init(baseclock,rate) { \ - if (PicoIn.opt&0x200) YM2612Init_940(baseclock, rate); \ + if (PicoIn.opt&POPT_EXT_FM) YM2612Init_940(baseclock, rate); \ else YM2612Init_(baseclock, rate); \ } #define YM2612ResetChip() { \ - if (PicoIn.opt&0x200) YM2612ResetChip_940(); \ + if (PicoIn.opt&POPT_EXT_FM) YM2612ResetChip_940(); \ else YM2612ResetChip_(); \ } #define YM2612UpdateOne(buffer,length,stereo,is_buf_empty) \ - (PicoIn.opt&0x200) ? 
YM2612UpdateOne_940(buffer, length, stereo, is_buf_empty) : \ + (PicoIn.opt&POPT_EXT_FM) ? YM2612UpdateOne_940(buffer, length, stereo, is_buf_empty) : \ YM2612UpdateOne_(buffer, length, stereo, is_buf_empty); #define YM2612PicoStateLoad() { \ - if (PicoIn.opt&0x200) YM2612PicoStateLoad_940(); \ + if (PicoIn.opt&POPT_EXT_FM) YM2612PicoStateLoad_940(); \ else YM2612PicoStateLoad_(); \ } #endif /* __GP2X__ */ diff --git a/platform/common/common.mak b/platform/common/common.mak index 51eaa30f5..1389e7c9b 100644 --- a/platform/common/common.mak +++ b/platform/common/common.mak @@ -181,7 +181,7 @@ $(FR)cpu/cyclone/Cyclone.h: $(FR)cpu/cyclone/Cyclone.s: $(FR)cpu/$(CYCLONE_CONFIG) @echo building Cyclone... - @make CC=$(CYCLONE_CC) CXX=$(CYCLONE_CXX) -C $(R)cpu/cyclone/ CONFIG_FILE=../$(CYCLONE_CONFIG) + @make CC=$(CYCLONE_CC) CXX=$(CYCLONE_CXX) -C $(R)cpu/cyclone/ CONFIG_FILE=../$(CYCLONE_CONFIG) HAVE_ARMv6=$(HAVE_ARMv6) $(FR)cpu/cyclone/Cyclone.s: $(FR)cpu/cyclone/*.cpp $(FR)cpu/cyclone/*.h diff --git a/platform/gp2x/emu.c b/platform/gp2x/emu.c index 7e9a132f3..18d8a57eb 100644 --- a/platform/gp2x/emu.c +++ b/platform/gp2x/emu.c @@ -298,12 +298,12 @@ static int make_local_pal_md(int fast_mode) { int pallen = 0xc0; - bgr444_to_rgb32(localPal, Pico.cram); + bgr444_to_rgb32(localPal, PicoMem.cram); if (fast_mode) return 0x40; if (Pico.video.reg[0xC] & 8) { // shadow/hilight mode - bgr444_to_rgb32_sh(localPal, Pico.cram); + bgr444_to_rgb32_sh(localPal, PicoMem.cram); localPal[0xc0] = 0x0000c000; localPal[0xd0] = 0x00c00000; localPal[0xe0] = 0x00000000; // reserved pixels for OSD @@ -322,7 +322,7 @@ static int make_local_pal_md(int fast_mode) static int make_local_pal_sms(int fast_mode) { - unsigned short *spal = Pico.cram; + unsigned short *spal = PicoMem.cram; unsigned int *dpal = (void *)localPal; unsigned int i, t; From 5e30e9b8a36e71c9eb291ea37711ec6ca42d567e Mon Sep 17 00:00:00 2001 From: kub Date: Mon, 18 Mar 2019 23:14:07 +0100 Subject: [PATCH 005/174] make gp2x mp3 
playback functional (need to unpack and compile helix decoder separately in platform/common/helix) --- Makefile | 4 +- platform/common/helix/Makefile | 42 ++++++++ platform/common/helix/lib.c | 122 ++++++++++++++++++++++++ platform/common/mp3.c | 27 ------ platform/common/mp3.h | 4 +- platform/common/mp3_helix.c | 37 ++++++- platform/common/mp3_sync.c | 27 ++++++ platform/gp2x/940ctl.c | 6 +- platform/gp2x/code940/940.c | 13 ++- platform/gp2x/code940/Makefile | 39 +++++--- platform/gp2x/code940/mp3test.c | 2 +- platform/gp2x/code940/uClibc/memset.s | 2 +- platform/gp2x/code940/uClibc/wrappers.c | 8 ++ 13 files changed, 274 insertions(+), 59 deletions(-) create mode 100644 platform/common/helix/Makefile create mode 100644 platform/common/helix/lib.c create mode 100644 platform/common/mp3_sync.c diff --git a/Makefile b/Makefile index 55e76f91b..77c661f65 100644 --- a/Makefile +++ b/Makefile @@ -172,8 +172,10 @@ endif endif # USE_FRONTEND -OBJS += platform/common/mp3.o +OBJS += platform/common/mp3.o platform/common/mp3_sync.o ifeq "$(PLATFORM_MP3)" "1" +platform/common/mp3_helix.o: CFLAGS += -Iplatform/libpicofe +OBJS += platform/common/mp3_helix.o else ifeq "$(HAVE_LIBAVCODEC)" "1" OBJS += platform/common/mp3_libavcodec.o else diff --git a/platform/common/helix/Makefile b/platform/common/helix/Makefile new file mode 100644 index 000000000..0021ea8e8 --- /dev/null +++ b/platform/common/helix/Makefile @@ -0,0 +1,42 @@ +CROSS ?= arm-linux-gnueabi- + +CC = $(CROSS)gcc +AS = $(CROSS)as +AR = $(CROSS)ar +TOOLCHAIN = $(notdir $(CROSS)) + +CFLAGS += -Ipub -O2 -Wall -fstrict-aliasing -ffast-math +ifneq ($(findstring arm-,$(TOOLCHAIN)),) +CFLAGS += -mcpu=arm940t -mtune=arm940t -mfloat-abi=soft -mfpu=fpa -mabi=apcs-gnu -mno-thumb-interwork +ASFLAGS = -mcpu=arm940t -mfloat-abi=soft -mfpu=fpa -mabi=apcs-gnu +OBJS += real/arm/asmpoly_gcc.o +else +CFLAGS += -m32 +ASFLAGS += -m32 +OBJS += real/polyphase.o +endif + +LIB = $(TOOLCHAIN)helix_mp3.a +SHLIB = $(TOOLCHAIN)helix_mp3.so + 
+all: $(LIB) $(SHLIB) + + +OBJS += mp3dec.o mp3tabs.o +#OBJS += ipp/bitstream.o ipp/buffers.o ipp/dequant.o ipp/huffman.o ipp/imdct.o ipp/subband.o +OBJS += real/bitstream.o real/buffers.o real/dct32.o real/dequant.o real/dqchan.o real/huffman.o +OBJS += real/hufftabs.o real/imdct.o real/scalfact.o real/stproc.o real/subband.o real/trigtabs.o + +OBJS += lib.o + +real/arm/asmpoly_gcc.o: real/arm/asmpoly_gcc.s + $(CC) -o $@ $(ASFLAGS) -c $< + +$(LIB) : $(OBJS) + $(AR) r $@ $^ +$(SHLIB) : $(OBJS) /home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1/libgcc.a + $(CC) -o $@ -nostdlib -shared $(CFLAGS) $^ + +clean: + $(RM) -f $(OBJS) + diff --git a/platform/common/helix/lib.c b/platform/common/helix/lib.c new file mode 100644 index 000000000..d7c511bed --- /dev/null +++ b/platform/common/helix/lib.c @@ -0,0 +1,122 @@ +#include +#include + +// libgcc has this with gcc 4.x +void raise(int sig) +{ +} + +// very limited heap functions for helix decoder + +static char heap[65000] __attribute__((aligned(16))); +static long heap_offs; + +void __malloc_init(void) +{ + heap_offs = 0; +} + +void *malloc(size_t size) +{ + void *chunk = heap + heap_offs; + size = (size+15) & ~15; + if (heap_offs + size > sizeof(heap)) + return NULL; + else { + heap_offs += size; + return chunk; + } +} + +void free(void *chunk) +{ + if (chunk == heap) + heap_offs = 0; +} + +#if 0 +void *memcpy (void *dest, const void *src, size_t n) +{ + char *_dest = dest; + const char *_src = src; + while (n--) *_dest++ = *_src++; + return dest; +} + +void *memmove (void *dest, const void *src, size_t n) +{ + char *_dest = dest+n; + const char *_src = src+n; + if (dest <= src || dest >= _src) + return memcpy(dest, src, n); + while (n--) *--_dest = *--_src; + return dest; +} +#else +/* memcpy/memmove in C with some simple optimizations. + * ATTN does dirty aliasing tricks with undefined behaviour by standard. + * (this works fine with gcc, though...) 
+ */ +void *memcpy(void *dest, const void *src, size_t n) +{ + struct _16 { uint32_t a[4]; }; + union { const void *v; char *c; uint64_t *l; struct _16 *s; } + ss = { src }, ds = { dest }; + const int lm = sizeof(uint32_t)-1; + + if ((((unsigned)ss.c ^ (unsigned)ds.c) & lm) == 0) { + /* fast copy if pointers have the same aligment */ + while (((unsigned)ss.c & lm) && n > 0) /* align to word */ + *ds.c++ = *ss.c++, n--; + while (n >= sizeof(struct _16)) /* copy 16 bytes blocks */ + *ds.s++ = *ss.s++, n -= sizeof(struct _16); + if (n >= sizeof(uint64_t)) /* copy leftover 8 byte block */ + *ds.l++ = *ss.l++, n -= sizeof(uint64_t); + } else { + /* byte copy if pointers are unaligned */ + while (n >= 8) { /* copy 8 byte blocks */ + *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--; + *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--; + *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--; + *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--; + } + } + /* copy max. 8 leftover bytes */ + while (n > 0) + *ds.c++ = *ss.c++, n--; + return dest; +} + +void *memmove (void *dest, const void *src, size_t n) +{ + struct _16 { uint32_t a[4]; }; + union { const void *v; char *c; uint64_t *l; struct _16 *s; } + ss = { src+n }, ds = { dest+n }; + const int lm = sizeof(uint32_t)-1; + + if (dest <= src || dest >= src+n) + return memcpy(dest, src, n); + + if ((((unsigned)ss.c ^ (unsigned)ds.c) & lm) == 0) { + /* fast copy if pointers have the same aligment */ + while (((unsigned)ss.c & lm) && n > 0) + *--ds.c = *--ss.c, n--; + while (n >= sizeof(struct _16)) + *--ds.s = *--ss.s, n -= sizeof(struct _16); + if (n >= sizeof(uint64_t)) + *--ds.l = *--ss.l, n -= sizeof(uint64_t); + } else { + /* byte copy if pointers are unaligned */ + while (n >= 8) { + *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; + *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; + *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; + *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; + } + } + /* copy max. 
8 leftover bytes */ + while (n > 0) + *--ds.c = *--ss.c, n--; + return dest; +} +#endif diff --git a/platform/common/mp3.c b/platform/common/mp3.c index c84962cc2..346e01958 100644 --- a/platform/common/mp3.c +++ b/platform/common/mp3.c @@ -21,33 +21,6 @@ unsigned short mpeg1_l3_bitrates[16] = { 0, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320 }; -int mp3_find_sync_word(const unsigned char *buf, int size) -{ - const unsigned char *p, *pe; - - /* find byte-aligned syncword - need 12 (MPEG 1,2) or 11 (MPEG 2.5) matching bits */ - for (p = buf, pe = buf + size - 3; p <= pe; p++) - { - int pn; - if (p[0] != 0xff) - continue; - pn = p[1]; - if ((pn & 0xf8) != 0xf8 || // currently must be MPEG1 - (pn & 6) == 0) { // invalid layer - p++; continue; - } - pn = p[2]; - if ((pn & 0xf0) < 0x20 || (pn & 0xf0) == 0xf0 || // bitrates - (pn & 0x0c) != 0) { // not 44kHz - continue; - } - - return p - buf; - } - - return -1; -} - static int try_get_bitrate(unsigned char *buf, int buf_size) { int offs1, offs = 0; diff --git a/platform/common/mp3.h b/platform/common/mp3.h index eb66db88b..4a2b230bd 100644 --- a/platform/common/mp3.h +++ b/platform/common/mp3.h @@ -12,8 +12,8 @@ int mp3dec_decode(FILE *f, int *file_pos, int file_len); extern unsigned short mpeg1_l3_bitrates[16]; #ifdef __GP2X__ -void mp3_update_local(int *buffer, int length, int stereo); -void mp3_start_play_local(void *f, int pos); +int _mp3dec_start(FILE *f, int fpos_start); +int _mp3dec_decode(FILE *f, int *file_pos, int file_len); #endif #endif // __COMMON_MP3_H__ diff --git a/platform/common/mp3_helix.c b/platform/common/mp3_helix.c index b27852981..75be8df3b 100644 --- a/platform/common/mp3_helix.c +++ b/platform/common/mp3_helix.c @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -20,10 +21,15 @@ static HMP3Decoder mp3dec; static unsigned char mp3_input_buffer[2 * 1024]; #ifdef __GP2X__ -#define mp3_update mp3_update_local -#define mp3_start_play mp3_start_play_local +#define 
mp3dec_decode _mp3dec_decode +#define mp3dec_start _mp3dec_start #endif +static void *libhelix; +HMP3Decoder (*p_MP3InitDecoder)(void); +void (*p_MP3FreeDecoder)(HMP3Decoder); +int (*p_MP3Decode)(HMP3Decoder, unsigned char **, int *, short *, int); + int mp3dec_decode(FILE *f, int *file_pos, int file_len) { unsigned char *readPtr; @@ -51,7 +57,7 @@ int mp3dec_decode(FILE *f, int *file_pos, int file_len) bytesLeft -= offset; had_err = err; - err = MP3Decode(mp3dec, &readPtr, &bytesLeft, cdda_out_buffer, 0); + err = p_MP3Decode(mp3dec, &readPtr, &bytesLeft, cdda_out_buffer, 0); if (err) { if (err == ERR_MP3_MAINDATA_UNDERFLOW && !had_err) { // just need another frame @@ -86,10 +92,31 @@ int mp3dec_decode(FILE *f, int *file_pos, int file_len) int mp3dec_start(FILE *f, int fpos_start) { + if (libhelix == NULL) { + libhelix = dlopen("./libhelix.so", RTLD_NOW); + if (libhelix == NULL) { + lprintf("mp3dec: load libhelix.so: %s\n", dlerror()); + return -1; + } + + p_MP3InitDecoder = dlsym(libhelix, "MP3InitDecoder"); + p_MP3FreeDecoder = dlsym(libhelix, "MP3FreeDecoder"); + p_MP3Decode = dlsym(libhelix, "MP3Decode"); + + if (p_MP3InitDecoder == NULL || p_MP3FreeDecoder == NULL + || p_MP3Decode == NULL) + { + lprintf("mp3dec: missing symbol(s) in libhelix.so\n"); + dlclose(libhelix); + libhelix = NULL; + return -1; + } + } + // must re-init decoder for new track if (mp3dec) - MP3FreeDecoder(mp3dec); - mp3dec = MP3InitDecoder(); + p_MP3FreeDecoder(mp3dec); + mp3dec = p_MP3InitDecoder(); return (mp3dec == 0) ? 
-1 : 0; } diff --git a/platform/common/mp3_sync.c b/platform/common/mp3_sync.c new file mode 100644 index 000000000..509c259dd --- /dev/null +++ b/platform/common/mp3_sync.c @@ -0,0 +1,27 @@ + +int mp3_find_sync_word(const unsigned char *buf, int size) +{ + const unsigned char *p, *pe; + + /* find byte-aligned syncword - need 12 (MPEG 1,2) or 11 (MPEG 2.5) matching bits */ + for (p = buf, pe = buf + size - 3; p <= pe; p++) + { + int pn; + if (p[0] != 0xff) + continue; + pn = p[1]; + if ((pn & 0xf8) != 0xf8 || // currently must be MPEG1 + (pn & 6) == 0) { // invalid layer + p++; continue; + } + pn = p[2]; + if ((pn & 0xf0) < 0x20 || (pn & 0xf0) == 0xf0 || // bitrates + (pn & 0x0c) != 0) { // not 44kHz + continue; + } + + return p - buf; + } + + return -1; +} diff --git a/platform/gp2x/940ctl.c b/platform/gp2x/940ctl.c index c270bfeea..31408d054 100644 --- a/platform/gp2x/940ctl.c +++ b/platform/gp2x/940ctl.c @@ -425,8 +425,7 @@ int YM2612UpdateOne_940(int *buffer, int length, int stereo, int is_buf_empty) int mp3dec_decode(FILE *f, int *file_pos, int file_len) { if (!(PicoIn.opt & POPT_EXT_FM)) { - //mp3_update_local(buffer, length, stereo); - return 0; + return _mp3dec_decode(f, file_pos, file_len); } // check if playback was started, track not ended @@ -457,8 +456,7 @@ int mp3dec_decode(FILE *f, int *file_pos, int file_len) int mp3dec_start(FILE *f, int fpos_start) { if (!(PicoIn.opt & POPT_EXT_FM)) { - //mp3_start_play_local(f, pos); - return -1; + return _mp3dec_start(f, fpos_start); } if (loaded_mp3 != f) diff --git a/platform/gp2x/code940/940.c b/platform/gp2x/code940/940.c index 760816eb4..f79db1e50 100644 --- a/platform/gp2x/code940/940.c +++ b/platform/gp2x/code940/940.c @@ -2,7 +2,7 @@ // (c) Copyright 2006-2007, Grazvydas "notaz" Ignotas #include "940shared.h" -#include "../../common/mp3.h" +#include "../../common/helix/pub/mp3dec.h" static _940_data_t *shared_data = (_940_data_t *) 0x00100000; static _940_ctl_t *shared_ctl = (_940_ctl_t *) 0x00200000; @@ 
-19,7 +19,7 @@ void drain_wb(void); // is changed by other core just before we update it void set_if_not_changed(int *val, int oldval, int newval); -void _memcpy(void *dst, const void *src, int count); +extern void *memcpy(void *dest, const void *src, unsigned long n); // asm volatile ("mov r0, #0" ::: "r0"); // asm volatile ("mcr p15, 0, r0, c7, c6, 0" ::: "r0"); /* flush dcache */ @@ -153,6 +153,8 @@ void Main940(void) int job = 0; ym2612_940 = &shared_data->ym2612; +// extern unsigned __bss_start__, __bss_end__; +// memset(&__bss_start__, 0, &__bss_end__ - &__bss_start__); for (;;) { @@ -167,6 +169,7 @@ void Main940(void) shared_ctl->writebuff0[0] = shared_ctl->writebuff1[0] = 0xffff; YM2612Init_(shared_ctl->baseclock, shared_ctl->rate); /* Helix mp3 decoder */ + __malloc_init(); shared_data->mp3dec = MP3InitDecoder(); break; @@ -185,7 +188,7 @@ void Main940(void) case JOB940_PICOSTATESAVE2: YM2612PicoStateSave2(0, 0); - _memcpy(shared_ctl->writebuff0, ym2612_940->REGS, 0x200); + memcpy(shared_ctl->writebuff0, ym2612_940->REGS, 0x200); break; case JOB940_PICOSTATELOAD2_PREP: @@ -193,7 +196,7 @@ void Main940(void) break; case JOB940_PICOSTATELOAD2: - _memcpy(ym2612_940->REGS, shared_ctl->writebuff0, 0x200); + memcpy(ym2612_940->REGS, shared_ctl->writebuff0, 0x200); YM2612PicoStateLoad2(0, 0); break; @@ -207,6 +210,7 @@ void Main940(void) case JOB940_MP3RESET: if (shared_data->mp3dec) MP3FreeDecoder(shared_data->mp3dec); + __malloc_init(); shared_data->mp3dec = MP3InitDecoder(); break; } @@ -215,4 +219,3 @@ void Main940(void) dcache_clean(); } } - diff --git a/platform/gp2x/code940/Makefile b/platform/gp2x/code940/Makefile index e327d1361..8561551b5 100644 --- a/platform/gp2x/code940/Makefile +++ b/platform/gp2x/code940/Makefile @@ -1,17 +1,23 @@ # you may or may not need to change this -#devkit_path = x:/stuff/dev/devkitgp2x/ -devkit_path ?= $(HOME)/opt/devkitGP2X/ -lgcc_path = $(devkit_path)lib/gcc/arm-linux/4.0.3/ -CROSS = arm-linux- +#devkit_path ?= 
$(HOME)/opt/devkitGP2X/ +#lgcc_path = $(devkit_path)lib/gcc/arm-linux/4.0.3/ #CROSS = $(devkit_path)bin/arm-linux- +#devkit_path ?= $(HOME)/opt/open2x +#lgcc_path = $(devkit_path)/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1/ +#CROSS ?= $(devkit_path)/gcc-4.1.1-glibc-2.3.6/bin/arm-open2x-linux- +#devkit_path ?= $(HOME)/opt/arm-unknown-linux-gnu +#lgcc_path = $(HOME)/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1/ +#CROSS ?= $(devkit_path)/bin/arm-unknown-linux-gnu- +lgcc_path = $(HOME)/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1/ +CROSS ?= arm-linux-gnueabi- # settings #up = 1 -CFLAGS += -O2 -Wall -fomit-frame-pointer -fstrict-aliasing -ffast-math -CFLAGS += -I../.. -I. -D__GP2X__ -DARM -CFLAGS += -mcpu=arm940t -mtune=arm940t -LDFLAGS = -static -s -e code940 -Ttext 0x0 -L$(lgcc_path) -lgcc +CFLAGS += -O2 -Wall -mno-thumb-interwork -fstrict-aliasing -ffast-math +CFLAGS += -I../../common/helix/pub -I../../.. -I. -D__GP2X__ -DARM +CFLAGS += -mcpu=arm940t -mtune=arm940t -mabi=apcs-gnu -mfloat-abi=soft -mfpu=fpa +LDFLAGS = -static -e code940 -Ttext 0x0 -L$(lgcc_path) -lgcc GCC = $(CROSS)gcc STRIP = $(CROSS)strip @@ -36,7 +42,9 @@ all: $(BIN) # stuff for 940 core # init, emu_control, emu -OBJS940 += 940init.o 940.o 940ym2612.o memcpy.o misc_arm.o mp3.o +OBJS940 += 940init.o 940.o 940ym2612.o misc_arm.o mp3_sync.o +# the asm memcpy code crashes job LOAD2 on 940. Possibly a globbered reg? 
+# OBJS940 += memcpy.o # the asm code seems to be faster when run on 920, but not on 940 for some reason # OBJS940 += ../../Pico/sound/ym2612_asm.o @@ -44,12 +52,13 @@ OBJS940 += 940init.o 940.o 940ym2612.o memcpy.o misc_arm.o mp3.o OBJS940 += uClibc/memset.o uClibc/s_floor.o uClibc/e_pow.o uClibc/e_sqrt.o uClibc/s_fabs.o OBJS940 += uClibc/s_scalbn.o uClibc/s_copysign.o uClibc/k_sin.o uClibc/k_cos.o uClibc/s_sin.o OBJS940 += uClibc/e_rem_pio2.o uClibc/k_rem_pio2.o uClibc/e_log.o uClibc/wrappers.o +LIBHELIX ?= ../../common/helix/$(notdir $(CROSS))helix_mp3.a $(BIN) : code940.elf @echo ">>>" $@ $(OBJCOPY) -O binary $< $@ -code940.elf : $(OBJS940) ../../common/helix/$(CROSS)helix-mp3.a +code940.elf : $(OBJS940) $(LIBHELIX) @echo ">>>" $@ $(LD) $^ $(LDFLAGS) -o $@ -Map code940.map @@ -64,8 +73,12 @@ misc_arm.o : ../../../pico/misc_arm.s @echo ">>>" $@ $(GCC) $(CFLAGS) -DEXTERNAL_YM2612 -c $< -o $@ -../../common/helix/helix_mp3.a: - @make -C ../../common/helix/ +mp3_sync.o: ../../common/mp3_sync.c + @echo ">>>" $@ + $(GCC) $(CFLAGS) -Os -DCODE940 -c $< -o $@ + +$(LIBHELIX): + @$(MAKE) -C ../../common/helix/ CROSS=$(CROSS) up: $(BIN) @@ -82,7 +95,7 @@ tidy: ## OBJSMP3T = mp3test.o ../gp2x.o ../asmutils.o ../usbjoy.o -mp3test.gpe : $(OBJSMP3T) ../helix/helix_mp3.a +mp3test.gpe : $(OBJSMP3T) $(LIBHELIX) $(GCC) -static -o $@ $^ $(STRIP) $@ @cp -v $@ /mnt/gp2x/mnt/sd diff --git a/platform/gp2x/code940/mp3test.c b/platform/gp2x/code940/mp3test.c index 9072d858b..cd2a66514 100644 --- a/platform/gp2x/code940/mp3test.c +++ b/platform/gp2x/code940/mp3test.c @@ -13,7 +13,7 @@ //#include "emu.h" //#include "menu.h" #include "../asmutils.h" -#include "../helix/pub/mp3dec.h" +#include "../../helix/pub/mp3dec.h" /* we will need some gp2x internals here */ extern volatile unsigned short *gp2x_memregs; /* from minimal library rlyeh */ diff --git a/platform/gp2x/code940/uClibc/memset.s b/platform/gp2x/code940/uClibc/memset.s index 0923014cd..80cdcb58d 100644 --- 
a/platform/gp2x/code940/uClibc/memset.s +++ b/platform/gp2x/code940/uClibc/memset.s @@ -22,7 +22,7 @@ .text .global memset .type memset,%function - .align 4 + .align 2 memset: mov a4, a1 diff --git a/platform/gp2x/code940/uClibc/wrappers.c b/platform/gp2x/code940/uClibc/wrappers.c index cc4e269ed..ce95a48c0 100644 --- a/platform/gp2x/code940/uClibc/wrappers.c +++ b/platform/gp2x/code940/uClibc/wrappers.c @@ -4,9 +4,17 @@ double pow(double x, double y) { return __ieee754_pow(x, y); } +double __pow_finite(double x, double y) +{ + return __ieee754_pow(x, y); +} double log(double x) { return __ieee754_log(x); } +double __log_finite(double x) +{ + return __ieee754_log(x); +} From 832a98d8187abcf5761025a052bc2dd62d1fd6fb Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 20 Mar 2019 19:07:16 +0100 Subject: [PATCH 006/174] arm asm syntax fixes for open2x --- Makefile | 3 ++- pico/carthw/svp/stub_arm.S | 16 ++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 77c661f65..77999e2e9 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ $(LD) ?= $(CC) TARGET ?= PicoDrive -CFLAGS += -Wall -g +CFLAGS += -Wall -ggdb -ffunction-sections -fdata-sections CFLAGS += -I. 
CYCLONE_CC ?= gcc CYCLONE_CXX ?= g++ @@ -17,6 +17,7 @@ endif ifndef NO_ALIGN_FUNCTIONS CFLAGS += -falign-functions=2 endif +LDFLAGS += -Wl,--gc-sections all: config.mak target_ diff --git a/pico/carthw/svp/stub_arm.S b/pico/carthw/svp/stub_arm.S index 9d5c5fa1a..736d459be 100644 --- a/pico/carthw/svp/stub_arm.S +++ b/pico/carthw/svp/stub_arm.S @@ -8,7 +8,7 @@ #include "../../arm_features.h" -.syntax unified +@.syntax unified .text .align 2 @@ -281,8 +281,8 @@ ssp_hle_902_loop: bgt ssp_hle_902_loop tst r12, #1 - ldrhne r0, [r2], #2 - strhne r0, [r3], #2 + ldrneh r0, [r2], #2 + strneh r0, [r3], #2 ldr r0, [r7, #SSP_OFFS_IRAM_ROM] add r1, r7, #0x200 @@ -501,7 +501,7 @@ FUNCTION(ssp_hle_07_036): mov r12, #0x4000 orr r12,r12,#0x0018 subs r12,r3, r12 - subsne r12,r12,#0x0400 + subnes r12,r12,#0x0400 blne tr_unhandled orr r2, r2, r2, lsl #16 @@ -510,7 +510,7 @@ FUNCTION(ssp_hle_07_036): hle_07_036_no_ovrwr: tst r1, #2 - strhne r2, [r1], #0x3e @ align + strneh r2, [r1], #0x3e @ align subne r0, r0, #1 subs r0, r0, #4 blt hle_07_036_l2 @@ -525,7 +525,7 @@ hle_07_036_l2: tst r0, #2 strne r2, [r1], #0x40 tst r0, #1 - strhne r2, [r1], #2 + strneh r2, [r1], #2 b hle_07_036_end_copy hle_07_036_ovrwr: @@ -562,10 +562,10 @@ hle_07_036_ol1: hle_07_036_ol2: tst r0, #1 - ldrhne r3, [r1] + ldrneh r3, [r1] andne r3, r3, r12 orrne r3, r3, r2 - strhne r3, [r1], #2 + strneh r3, [r1], #2 hle_07_036_end_copy: ldr r2, [r7, #SSP_OFFS_DRAM] From 3e5992c16a5ae52609a2269233b8f925fce3de05 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 20 Mar 2019 19:08:29 +0100 Subject: [PATCH 007/174] config templates for gp2x, caanoo, dingux either with system toolchain (open2x,gph,opendingux) or ubuntu arm(gcc 4.7 is highest possible),mips --- config.caanoo | 16 ++++++++++++++++ config.caanoo47 | 16 ++++++++++++++++ config.dingux | 16 ++++++++++++++++ config.dingux54 | 16 ++++++++++++++++ config.gp2x | 16 ++++++++++++++++ config.gp2x47 | 16 ++++++++++++++++ 6 files changed, 96 insertions(+) create mode 100644 
config.caanoo create mode 100644 config.caanoo47 create mode 100644 config.dingux create mode 100644 config.dingux54 create mode 100644 config.gp2x create mode 100644 config.gp2x47 diff --git a/config.caanoo b/config.caanoo new file mode 100644 index 000000000..8e62573c8 --- /dev/null +++ b/config.caanoo @@ -0,0 +1,16 @@ +# Automatically generated by configure +# Configured with: './configure' '--platform=gp2x' +CC = arm-gph-linux-gnueabi-gcc +CXX = arm-gph-linux-gnueabi-g++ +AS = arm-gph-linux-gnueabi-as +STRIP = arm-gph-linux-gnueabi-strip +CFLAGS += -mfloat-abi=soft -mcpu=arm920t -mtune=arm920t -fno-stack-protector -D__GP2X__ -DGPERF +CFLAGS += -fno-gcse -funswitch-loops -fweb -ftree-loop-im #-fpredictive-commoning -ftree-loop-distribution -frename-registers +CFLAGS += -I/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/include -I/home/build/src/gp2x/armroot-eabi/include +ASFLAGS += -mfloat-abi=soft -mcpu=arm920t +LDFLAGS += -B/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/lib/gcc/arm-gph-linux-gnueabi/4.2.4 -B/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L/home/build/src/gp2x/armroot-eabi/lib -static +LDLIBS += -lpng -lm -ldl -lprofiler -lstdc++ + +ARCH = arm +PLATFORM = gp2x +SOUND_DRIVERS = oss diff --git a/config.caanoo47 b/config.caanoo47 new file mode 100644 index 000000000..f3efde0f7 --- /dev/null +++ b/config.caanoo47 @@ -0,0 +1,16 @@ +# Automatically generated by configure +# Configured with: './configure' '--platform=gp2x' +CC = arm-linux-gnueabi-gcc +CXX = arm-linux-gnueabi-g++ +AS = arm-linux-gnueabi-as +STRIP = arm-linux-gnueabi-strip +CFLAGS += -mfloat-abi=soft -mcpu=arm920t -mtune=arm920t -Wno-unused-result -fno-stack-protector -D__GP2X__ +CFLAGS += -fno-gcse -funswitch-loops -fweb -ftree-loop-im #-fpredictive-commoning -ftree-loop-distribution 
-frename-registers +CFLAGS += -I/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/include -I/home/build/src/gp2x/armroot-eabi/include +ASFLAGS += -mfloat-abi=soft -mcpu=arm920t +LDFLAGS += -B/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/lib/gcc/arm-gph-linux-gnueabi/4.2.4 -B/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -static +LDLIBS += -lpng -lm -ldl + +ARCH = arm +PLATFORM = gp2x +SOUND_DRIVERS = oss diff --git a/config.dingux b/config.dingux new file mode 100644 index 000000000..6611991cc --- /dev/null +++ b/config.dingux @@ -0,0 +1,16 @@ +# Automatically generated by configure +# Configured with: './configure' '--platform=generic' +CC = mipsel-linux-gcc +CXX = mipsel-linux-g++ +AS = mipsel-linux-as +STRIP = mipsel-linux-strip +CFLAGS += -I/home/build/opt/opendingux-toolchain/usr/include/ +CFLAGS += -I/home/build/opt/opendingux-toolchain/usr/include/SDL +CFLAGS += -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -fno-stack-protector +ASFLAGS += +LDFLAGS += +LDLIBS += -B/home/build/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=/home/build/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=/home/build/opt/opendingux-toolchain/lib -lSDL -lasound -lpng -lm -lstdc++ -ldl + +ARCH = mipsel +PLATFORM = opendingux +SOUND_DRIVERS = sdl diff --git a/config.dingux54 b/config.dingux54 new file mode 100644 index 000000000..96e550148 --- /dev/null +++ b/config.dingux54 @@ -0,0 +1,16 @@ +# Automatically generated by configure +# Configured with: './configure' '--platform=generic' +CC = mipsel-linux-gnu-gcc +CXX = mipsel-linux-gnu-g++ +AS = mipsel-linux-gnu-as +STRIP = mipsel-linux-gnu-strip +CFLAGS += -I/home/build/opt/opendingux-toolchain/usr/include/ +CFLAGS += -I/home/build/opt/opendingux-toolchain/usr/include/SDL +CFLAGS += -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result 
-fno-stack-protector +ASFLAGS += +LDFLAGS += +LDLIBS += -B/home/build/opt/opendingux-toolchain/usr/lib -B/home/build/opt/opendingux-toolchain/lib -Wl,-rpath-link=/home/build/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=/home/build/opt/opendingux-toolchain/lib -lSDL -lasound -lpng -lz -lm -lstdc++ -ldl + +ARCH = mipsel +PLATFORM = opendingux +SOUND_DRIVERS = sdl diff --git a/config.gp2x b/config.gp2x new file mode 100644 index 000000000..de3e47c40 --- /dev/null +++ b/config.gp2x @@ -0,0 +1,16 @@ +# Automatically generated by configure +# Configured with: './configure' '--platform=gp2x' +CC = arm-open2x-linux-gcc +CXX = arm-open2x-linux-g++ +AS = arm-open2x-linux-as +STRIP = arm-open2x-linux-strip +CFLAGS += -msoft-float -mcpu=arm920t -mtune=arm920t -D__GP2X__ +CFLAGS += -I/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I/home/build/src/gp2x/armroot/include +CFLAGS += -fno-gcse -funswitch-loops -fweb -ftree-loop-im #-fpredictive-commoning -ftree-loop-distribution -frename-registers +ASFLAGS += -mcpu=arm920t -mfloat-abi=soft +LDFLAGS += --sysroot /home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L/home/build/src/gp2x/armroot/lib -static +LDLIBS += -lpng -lm -ldl + +ARCH = arm +PLATFORM = gp2x +SOUND_DRIVERS = oss diff --git a/config.gp2x47 b/config.gp2x47 new file mode 100644 index 000000000..1022166dd --- /dev/null +++ b/config.gp2x47 @@ -0,0 +1,16 @@ +# Automatically generated by configure +# Configured with: './configure' '--platform=gp2x' +CC = arm-linux-gnueabi-gcc +CXX = arm-linux-gnueabi-g++ +AS = arm-linux-gnueabi-as +STRIP = arm-linux-gnueabi-strip +CFLAGS += -mabi=apcs-gnu -mno-thumb-interwork -mfloat-abi=soft -mfpu=fpa -mcpu=arm920t -mtune=arm920t -Wno-unused-result -fno-stack-protector -D__GP2X__ +CFLAGS += -I/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I/home/build/src/gp2x/armroot/include +CFLAGS += -fno-gcse 
-funswitch-loops -fweb -ftree-loop-im #-fpredictive-commoning -ftree-loop-distribution -frename-registers +ASFLAGS += -mabi=apcs-gnu -mfloat-abi=soft -mfpu=fpa -mcpu=arm920t +LDFLAGS += -mabi=apcs-gnu -mfpu=fpa -B/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L/home/build/src/gp2x/armroot/lib -static +LDLIBS += -lpng -lm -ldl + +ARCH = arm +PLATFORM = gp2x +SOUND_DRIVERS = oss From 3c42e232e425dfd443dd1ea4cebc76733a35b97f Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 20 Mar 2019 19:08:46 +0100 Subject: [PATCH 008/174] arm asm memory access functions for m/s68k --- pico/cd/memory.c | 2 + pico/cd/memory_arm.S | 113 +++++++++++++++++++++++++++++++++++++++---- pico/memory.c | 3 ++ pico/memory_arm.S | 96 ++++++++++++++++++++++++++++++++++++ 4 files changed, 205 insertions(+), 9 deletions(-) diff --git a/pico/cd/memory.c b/pico/cd/memory.c index 1c5dcf94c..e64868400 100644 --- a/pico/cd/memory.c +++ b/pico/cd/memory.c @@ -14,12 +14,14 @@ uptr s68k_read16_map [0x1000000 >> M68K_MEM_SHIFT]; uptr s68k_write8_map [0x1000000 >> M68K_MEM_SHIFT]; uptr s68k_write16_map[0x1000000 >> M68K_MEM_SHIFT]; +#ifndef _ASM_CD_MEMORY_C MAKE_68K_READ8(s68k_read8, s68k_read8_map) MAKE_68K_READ16(s68k_read16, s68k_read16_map) MAKE_68K_READ32(s68k_read32, s68k_read16_map) MAKE_68K_WRITE8(s68k_write8, s68k_write8_map) MAKE_68K_WRITE16(s68k_write16, s68k_write16_map) MAKE_68K_WRITE32(s68k_write32, s68k_write16_map) +#endif // ----------------------------------------------------------------- diff --git a/pico/cd/memory_arm.S b/pico/cd/memory_arm.S index fe82ecb9e..335f36247 100644 --- a/pico/cd/memory_arm.S +++ b/pico/cd/memory_arm.S @@ -178,9 +178,9 @@ m_m68k_read8_r02: bx lr m_m68k_read8_r03: add r1, r1, #0x110000 - push {r1, lr} + stmfd sp!, {r1, lr} bl m68k_comm_check - pop {r1, lr} + ldmfd sp!, {r1, lr} ldrb r0, [r1, #3] and r0, 
r0, #0xc7 bx lr @@ -219,10 +219,10 @@ m_m68k_read8_hi: add r1, r1, #0x110000 movge r0, #0 bxge lr - add r1, r0 - push {r1, lr} + add r1, r1, r0 + stmfd sp!, {r1, lr} bl m68k_comm_check - pop {r1, lr} + ldmfd sp!, {r1, lr} ldrb r0, [r1] bx lr @@ -275,9 +275,9 @@ m_m68k_read16_r00: bx lr m_m68k_read16_r02: add r1, r1, #0x110000 - push {r1, lr} + stmfd sp!, {r1, lr} bl m68k_comm_check - pop {r1, lr} + ldmfd sp!, {r1, lr} ldrb r2, [r1, #3] ldrb r0, [r1, #2] and r2, r2, #0xc7 @@ -307,9 +307,9 @@ m_m68k_read16_hi: bxge lr add r1, r0, r1 - push {r1, lr} + stmfd sp!, {r1, lr} bl m68k_comm_check - pop {r0, lr} + ldmfd sp!, {r0, lr} ldrh r0, [r0] mov r1, r0, lsr #8 and r0, r0, #0xff @@ -701,6 +701,101 @@ m_s68k_write16_regs_spec: @ special case strb r1, [r2, r0] @ if (a == 0xe) s68k_regs[0xf] = d; bx lr +.global s68k_read8 +.global s68k_read16 +.global s68k_write8 +.global s68k_write16 + +s68k_read8: + ldr r3, =s68k_read8_map + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + eor r2, r0, #1 + movs r3, r3, lsl #1 + ldrccb r0, [r3, r2] + bxcc lr + bx r3 + +s68k_read16: + ldr r3, =s68k_read16_map + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + bic r0, r0, #1 + movs r3, r3, lsl #1 + ldrcch r0, [r3, r0] + bxcc lr + bx r3 + +s68k_read32: + ldr r3, =s68k_read16_map + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + bic r0, r0, #1 + movs r3, r3, lsl #1 + ldrcch r1, [r3, r0]! 
+ ldrcch r0, [r3, #2] + orrcc r0, r0, r1, lsl #16 + bxcc lr + + stmfd sp!, {r0, r3, r4, lr} + mov lr, pc + bx r3 + ldmfd sp!, {r1, r3} + str r0, [sp] + add r0, r1, #2 + mov lr, pc + bx r3 + ldmfd sp!, {r1, lr} + mov r0, r0, lsl #16 + mov r1, r1, lsl #16 + orr r0, r1, r0, lsr #16 + bx lr + +s68k_write8: + ldr r3, =s68k_write8_map + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + eor r2, r0, #1 + movs r3, r3, lsl #1 + strccb r1, [r3, r2] + bxcc lr + bx r3 + +s68k_write16: + ldr r3, =s68k_write16_map + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + bic r0, r0, #1 + movs r3, r3, lsl #1 + strcch r1, [r3, r0] + bxcc lr + bx r3 + +s68k_write32: + ldr r3, =s68k_write16_map + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + bic r0, r0, #1 + movs r3, r3, lsl #1 + movcc r2, r1, lsr #16 + strcch r2, [r3, r0]! + strcch r1, [r3, #2] + bxcc lr + + stmfd sp!, {r0, r1, r3, lr} + mov r1, r1, lsr #16 + mov lr, pc + bx r3 + ldmfd sp!, {r0, r1, r3, lr} + add r0, r0, #2 + bx r3 + .pool @ vim:filetype=armasm diff --git a/pico/memory.c b/pico/memory.c index a31a08e99..cc82f7898 100644 --- a/pico/memory.c +++ b/pico/memory.c @@ -163,12 +163,14 @@ void m68k_map_unmap(int start_addr, int end_addr) m68k_write16_map[i] = (addr >> 1) | MAP_FLAG; } +#ifndef _ASM_MEMORY_C MAKE_68K_READ8(m68k_read8, m68k_read8_map) MAKE_68K_READ16(m68k_read16, m68k_read16_map) MAKE_68K_READ32(m68k_read32, m68k_read16_map) MAKE_68K_WRITE8(m68k_write8, m68k_write8_map) MAKE_68K_WRITE16(m68k_write16, m68k_write16_map) MAKE_68K_WRITE32(m68k_write32, m68k_write16_map) +#endif // ----------------------------------------------------------------- @@ -420,6 +422,7 @@ static u32 PicoRead8_sram(u32 a) d = EEPROM_read(); if (!(a & 1)) d >>= 8; + d &= 0xff; } else d = *(u8 *)(Pico.sv.data - Pico.sv.start + a); elprintf(EL_SRAMIO, "sram r8 [%06x] %02x @ %06x", a, d, SekPc); diff --git a/pico/memory_arm.S b/pico/memory_arm.S index 
bfe8ca109..117cea0b4 100644 --- a/pico/memory_arm.S +++ b/pico/memory_arm.S @@ -59,6 +59,7 @@ m_read8_eeprom: ldmfd sp!,{r1,lr} tst r1, #1 moveq r0, r0, lsr #8 + and r0, r0, #0xff bx lr @@ -220,6 +221,101 @@ m_write16_not_z80ctl: strb r2, [r3, #OFS_Pico_m_sram_reg] bx lr +.global m68k_read8 +.global m68k_read16 +.global m68k_write8 +.global m68k_write16 + +m68k_read8: + ldr r3, =m68k_read8_map + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + eor r2, r0, #1 + movs r3, r3, lsl #1 + ldrccb r0, [r3, r2] + bxcc lr + bx r3 + +m68k_read16: + ldr r3, =m68k_read16_map + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + bic r0, r0, #1 + movs r3, r3, lsl #1 + ldrcch r0, [r3, r0] + bxcc lr + bx r3 + +m68k_read32: + ldr r3, =m68k_read16_map + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + bic r0, r0, #1 + movs r3, r3, lsl #1 + ldrcch r1, [r3, r0]! + ldrcch r0, [r3, #2] + orrcc r0, r0, r1, lsl #16 + bxcc lr + + stmfd sp!, {r0, r3, r4, lr} + mov lr, pc + bx r3 + ldmfd sp!, {r1, r3} + str r0, [sp] + add r0, r1, #2 + mov lr, pc + bx r3 + ldmfd sp!, {r1, lr} + mov r0, r0, lsl #16 + mov r1, r1, lsl #16 + orr r0, r1, r0, lsr #16 + bx lr + +m68k_write8: + ldr r3, =m68k_write8_map + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + eor r2, r0, #1 + movs r3, r3, lsl #1 + strccb r1, [r3, r2] + bxcc lr + bx r3 + +m68k_write16: + ldr r3, =m68k_write16_map + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + bic r0, r0, #1 + movs r3, r3, lsl #1 + strcch r1, [r3, r0] + bxcc lr + bx r3 + +m68k_write32: + ldr r3, =m68k_write16_map + bic r0, r0, #0xff000000 + mov r2, r0, lsr #16 + ldr r3, [r3, r2, lsl #2] + bic r0, r0, #1 + movs r3, r3, lsl #1 + movcc r2, r1, lsr #16 + strcch r2, [r3, r0]! 
+ strcch r1, [r3, #2] + bxcc lr + + stmfd sp!, {r0, r1, r3, lr} + mov r1, r1, lsr #16 + mov lr, pc + bx r3 + ldmfd sp!, {r0, r1, r3, lr} + add r0, r0, #2 + bx r3 + .pool @ vim:filetype=armasm From 4f1283aed211d649a21a1e897da4d63a404c6ab8 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 20 Mar 2019 20:33:43 +0100 Subject: [PATCH 009/174] config for x86 (32 bit only, for SH2 drc), add/revive profiling --- Makefile | 8 +++++++- config.caanoo | 2 +- config.x86 | 15 +++++++++++++++ platform/common/common.mak | 4 ++++ platform/common/main.c | 7 +++++++ platform/linux/pprof.c | 39 +++++++++++++++++++++++++++++++++----- platform/linux/pprof.h | 31 ++++++++++++++++++++++-------- 7 files changed, 91 insertions(+), 15 deletions(-) create mode 100644 config.x86 diff --git a/Makefile b/Makefile index 77999e2e9..8eba86dfd 100644 --- a/Makefile +++ b/Makefile @@ -19,6 +19,10 @@ CFLAGS += -falign-functions=2 endif LDFLAGS += -Wl,--gc-sections +# profiling +pprof ?= 0 +gperf ?= 0 + all: config.mak target_ ifndef NO_CONFIG_MAK @@ -124,6 +128,8 @@ OBJS += platform/gp2x/emu.o OBJS += platform/gp2x/vid_mmsp2.o OBJS += platform/gp2x/vid_pollux.o OBJS += platform/gp2x/warm.o +OBJS += platform/gp2x/host_dasm.o +OBJS += cpu/sh2/mame/sh2dasm.o USE_FRONTEND = 1 PLATFORM_MP3 = 1 PLATFORM_ZLIB = 1 @@ -219,7 +225,7 @@ else endif pprof: platform/linux/pprof.c - $(CC) -O2 -ggdb -DPPROF -DPPROF_TOOL -I../../ -I. $^ -o $@ + $(CC) $(CFLAGS) -O2 -ggdb -DPPROF -DPPROF_TOOL -I../../ -I. 
$^ -o $@ $(LDFLAGS) $(LDLIBS) tools/textfilter: tools/textfilter.c make -C tools/ textfilter diff --git a/config.caanoo b/config.caanoo index 8e62573c8..39edb5db0 100644 --- a/config.caanoo +++ b/config.caanoo @@ -9,7 +9,7 @@ CFLAGS += -fno-gcse -funswitch-loops -fweb -ftree-loop-im #-fpredictive-commonin CFLAGS += -I/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/include -I/home/build/src/gp2x/armroot-eabi/include ASFLAGS += -mfloat-abi=soft -mcpu=arm920t LDFLAGS += -B/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/lib/gcc/arm-gph-linux-gnueabi/4.2.4 -B/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L/home/build/src/gp2x/armroot-eabi/lib -static -LDLIBS += -lpng -lm -ldl -lprofiler -lstdc++ +LDLIBS += -lpng -lm -ldl ARCH = arm PLATFORM = gp2x diff --git a/config.x86 b/config.x86 new file mode 100644 index 000000000..24f9d2093 --- /dev/null +++ b/config.x86 @@ -0,0 +1,15 @@ +# Automatically generated by configure +# Configured with: './configure' '--platform=generic' +CC = gcc +CXX = g++ +AS = as +STRIP = strip +CFLAGS += -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -m32 #-DGPERF -pg +ASFLAGS += +LDFLAGS += -m32 #-pg +LDLIBS += -L$(HOME)/opt/binutils-i386/usr/lib/ -lbfd-2.24-multiarch -lopcodes-2.24-multiarch +LDLIBS += -L/usr/lib/i386-linux-gnu/debug -L/home/build/opt/lib32 -lSDL-1.2 -lasound -lpng -lz -lm -ldl + +ARCH = x86 +PLATFORM = generic +SOUND_DRIVERS = oss alsa sdl diff --git a/platform/common/common.mak b/platform/common/common.mak index 1389e7c9b..a5f6078cd 100644 --- a/platform/common/common.mak +++ b/platform/common/common.mak @@ -40,6 +40,10 @@ ifeq "$(pprof)" "1" DEFINES += PPROF SRCS_COMMON += $(R)platform/linux/pprof.c endif +ifeq "$(gperf)" "1" +DEFINES += GPERF +LDFLAGS += -lprofiler -lstdc++ +endif # ARM asm stuff ifeq "$(ARCH)" "arm" 
diff --git a/platform/common/main.c b/platform/common/main.c index 424c1b5a9..24ca1865a 100644 --- a/platform/common/main.c +++ b/platform/common/main.c @@ -93,6 +93,10 @@ int main(int argc, char *argv[]) emu_init(); menu_init(); +#ifdef GPERF + ProfilerStart("gperf.out"); +#endif + engineState = PGS_Menu; if (argc > 1) @@ -148,6 +152,9 @@ int main(int argc, char *argv[]) } endloop: +#ifdef GPERF + ProfilerStop(); +#endif emu_finish(); plat_finish(); diff --git a/platform/linux/pprof.c b/platform/linux/pprof.c index e1ecd1fd4..6c7c0ff9f 100644 --- a/platform/linux/pprof.c +++ b/platform/linux/pprof.c @@ -1,21 +1,46 @@ #include #include #include +#include #include #include #include +#include #include +int rc_mem[pp_total_points]; + struct pp_counters *pp_counters; +int *refcounts = rc_mem; static int shmemid; +static unsigned long devMem; +volatile unsigned long *gp2x_memregl; +volatile unsigned short *gp2x_memregs; + void pprof_init(void) { int this_is_new_shmem = 1; key_t shmemkey; void *shmem; +#if 0 + devMem = open("/dev/mem", O_RDWR); + if (devMem == -1) + { + perror("pprof: open failed"); + return; + } + gp2x_memregl = (unsigned long *)mmap(0, 0x10000, PROT_READ|PROT_WRITE, MAP_SHARED, devMem, 0xc0000000); + if (gp2x_memregl == (unsigned long *)-1) + { + perror("pprof: mmap failed"); + return; + } + gp2x_memregs = (unsigned short *)gp2x_memregl; +#endif + #ifndef PPROF_TOOL unsigned int tmp = pprof_get_one(); printf("pprof: measured diff is %u\n", pprof_get_one() - tmp); @@ -28,11 +53,11 @@ void pprof_init(void) return; } -#ifndef PPROF_TOOL +//#ifndef PPROF_TOOL shmemid = shmget(shmemkey, sizeof(*pp_counters), IPC_CREAT | IPC_EXCL | 0644); if (shmemid == -1) -#endif +//#endif { shmemid = shmget(shmemkey, sizeof(*pp_counters), 0644); @@ -76,15 +101,18 @@ static const struct { IT(draw), IT(sound), IT(m68k), + IT(s68k), + IT(mem68), IT(z80), IT(msh2), IT(ssh2), + IT(memsh), IT(dummy), }; int main(int argc, char *argv[]) { - unsigned long long 
old[pp_total_points], new[pp_total_points]; + pp_type old[pp_total_points], new[pp_total_points]; int base = 0; int l, i; @@ -107,11 +135,12 @@ int main(int argc, char *argv[]) memcpy(new, pp_counters->counter, sizeof(new)); for (i = 0; i < ARRAY_SIZE(pp_tab); i++) { - unsigned long long idiff = new[i] - old[i]; - unsigned long long bdiff = (new[base] - old[base]) | 1; + pp_type idiff = new[i] - old[i]; + pp_type bdiff = (new[base] - old[base]) | 1; printf("%6.2f ", (double)idiff * 100.0 / bdiff); } printf("\n"); + fflush(stdout); memcpy(old, new, sizeof(old)); if (argc < 3) diff --git a/platform/linux/pprof.h b/platform/linux/pprof.h index cccbcbd5b..91fd5b09f 100644 --- a/platform/linux/pprof.h +++ b/platform/linux/pprof.h @@ -7,21 +7,22 @@ enum pprof_points { pp_draw, pp_sound, pp_m68k, + pp_s68k, + pp_mem68, pp_z80, pp_msh2, pp_ssh2, + pp_memsh, pp_dummy, pp_total_points }; -struct pp_counters -{ - unsigned long long counter[pp_total_points]; -}; - extern struct pp_counters *pp_counters; +extern int *refcounts; #ifdef __i386__ +typedef unsigned long long pp_type; + static __attribute__((always_inline)) inline unsigned int pprof_get_one(void) { unsigned long long ret; @@ -31,24 +32,38 @@ static __attribute__((always_inline)) inline unsigned int pprof_get_one(void) #define unglitch_timer(x) #elif defined(__GP2X__) +typedef unsigned long pp_type; + +#if 0 // XXX: MMSP2 only, timer sometimes seems to return lower vals? 
extern volatile unsigned long *gp2x_memregl; #define pprof_get_one() (unsigned int)gp2x_memregl[0x0a00 >> 2] #define unglitch_timer(di) \ if ((signed int)(di) < 0) di = 0 +#else +extern unsigned int (*gp2x_get_ticks_us)(void); +#define pprof_get_one() gp2x_get_ticks_us() +#define unglitch_timer(di) \ + if ((signed int)(di) < 0) di = 0 +#endif #else #error no timer #endif +struct pp_counters +{ + pp_type counter[pp_total_points]; +}; + #define pprof_start(point) { \ - unsigned int pp_start_##point = pprof_get_one() + unsigned int pp_start_##point = pprof_get_one(); refcounts[pp_##point]++ #define pprof_end(point) \ { \ unsigned int di = pprof_get_one() - pp_start_##point; \ unglitch_timer(di); \ - pp_counters->counter[pp_##point] += di; \ + if (!--refcounts[pp_##point]) pp_counters->counter[pp_##point] += di; \ } \ } @@ -57,7 +72,7 @@ extern volatile unsigned long *gp2x_memregl; { \ unsigned int di = pprof_get_one() - pp_start_##point; \ unglitch_timer(di); \ - pp_counters->counter[pp_##point] -= di; \ + if (--refcounts[pp_##point]) pp_counters->counter[pp_##point] -= di; \ } \ } From fb13cb3c2eeb08ae9d0483a6eec0f170fe5506b6 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 20 Mar 2019 20:34:01 +0100 Subject: [PATCH 010/174] bfd-less arm disassembler for gph --- cpu/sh2/mame/sh2dasm.c | 2 +- platform/common/disarm.c | 481 ++++++++++++++++++++++++++++++++++++++ platform/common/disarm.h | 28 +++ platform/gp2x/host_dasm.c | 85 +++++++ 4 files changed, 595 insertions(+), 1 deletion(-) create mode 100644 platform/common/disarm.c create mode 100644 platform/common/disarm.h create mode 100644 platform/gp2x/host_dasm.c diff --git a/cpu/sh2/mame/sh2dasm.c b/cpu/sh2/mame/sh2dasm.c index 3fa25e923..0ecb7f455 100644 --- a/cpu/sh2/mame/sh2dasm.c +++ b/cpu/sh2/mame/sh2dasm.c @@ -465,7 +465,7 @@ static UINT32 op1000(char *buffer, UINT32 pc, UINT16 opcode) sprintf(buffer, "MOV.B @($%02X,%s),R0", (opcode & 15), regname[Rm]); break; case 5: - sprintf(buffer, "MOV.W @($%02X,%s),R0", 
(opcode & 15), regname[Rm]); + sprintf(buffer, "MOV.W @($%02X,%s),R0", (opcode & 15) * 2, regname[Rm]); break; case 8: sprintf(buffer, "CMP/EQ #$%02X,R0", (opcode & 0xff)); diff --git a/platform/common/disarm.c b/platform/common/disarm.c new file mode 100644 index 000000000..2e7c04e70 --- /dev/null +++ b/platform/common/disarm.c @@ -0,0 +1,481 @@ +/* + * Copyright (c) 2012 Wojtek Kaniewski + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include <stdio.h> +#include <string.h> + +#define IMM_FORMAT "0x%x" +//#define IMM_FORMAT "%d" +#define ADDR_FORMAT "0x%x" + +static inline unsigned int rol(unsigned int value, unsigned int shift) +{ + shift &= 31; /* note: the expression below rotates right, despite the name */ + + return (value >> shift) | (value << ((32 - shift) & 31)); /* "& 31" keeps the left shift below 32 bits when shift is 0 */ +} + +static inline const char *condition(unsigned int insn) +{ + const char *conditions[16] = { "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc", "hi", "ls", "ge", "lt", "gt", "le", "", "nv" }; + return conditions[(insn >> 28) & 0x0f]; +} + +static inline const char *register_name(unsigned int reg) +{ + const char *register_names[16] = { "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "sp", "lr", "pc" }; + return register_names[reg & 0x0f]; +} + +static const char *register_list(unsigned int list, char *buf, size_t buf_len) +{ + int i; + + buf[0] = 0; + + for (i = 0; i < 16; i++) + { + if ((list >> i) & 1) + { + snprintf(buf + strlen(buf), buf_len - strlen(buf), "%s%s", (buf[0] == 0) ? "" : ",", register_name(i)); + } + } + + return buf; +} + +static const char *shift(unsigned int insn, char *buf, size_t buf_len) +{ + unsigned int imm = (insn >> 7) & 0x1f; + const char *rn = register_name(insn >> 8); + unsigned int type = (insn >> 4) & 0x07; + + switch (type) + { + case 0: + snprintf(buf, buf_len, (imm != 0) ? ",lsl #%d" : "", imm); + break; + case 1: + snprintf(buf, buf_len, ",lsl %s", rn); + break; + case 2: + snprintf(buf, buf_len, ",lsr #%d", imm ? imm : 32); + break; + case 3: + snprintf(buf, buf_len, ",lsr %s", rn); + break; + case 4: + snprintf(buf, buf_len, ",asr #%d", imm ? imm : 32); + break; + case 5: + snprintf(buf, buf_len, ",asr %s", rn); + break; + case 6: + snprintf(buf, buf_len, (imm != 0) ?
",ror #%d" : ",rrx", imm); + break; + case 7: + snprintf(buf, buf_len, ",ror %s", rn); + break; + } + + return buf; +} + +static const char *immediate(unsigned int imm, int negative, int show_if_zero, char *buf, size_t buf_len) +{ + if (imm || show_if_zero) + { + snprintf(buf, buf_len, ",#%s" IMM_FORMAT, (negative) ? "-" : "", imm); + return buf; + } + + return ""; +} + +static int data_processing(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + unsigned int oper = (insn >> 21) & 15; + const char *names[16] = { "and", "eor", "sub", "rsb", "add", "adc", "sbc", "rsc", "tst", "teq", "cmp", "cmn", "orr", "mov", "bic", "mvn" }; + const char *name; + const char *s; + unsigned int rd; + unsigned int rn; + int is_move = ((oper == 13) || (oper == 15)); + int is_test = ((oper >= 8) && (oper <= 11)); + char tmp_buf[64]; + + name = names[oper]; + s = ((insn >> 20) & 1) ? "s" : ""; + rn = (insn >> 16) & 15; + rd = (insn >> 12) & 15; + + /* mov r0,r0,r0 is a nop */ + if (insn == 0xe1a00000) + { + snprintf(buf, buf_len, "nop"); + return 1; + } + + /* mrs */ + if ((insn & 0x0fbf0fff) == 0x010f0000) + { + const char *psr = ((insn >> 22) & 1) ? "spsr" : "cpsr"; + const char *rd = register_name(insn >> 12); + + snprintf(buf, buf_len, "mrs%s %s,%s", condition(insn), rd, psr); + + return 1; + } + + /* msr flag only*/ + if ((insn & 0x0db0f000) == 0x0120f000) + { + const char *psr = ((insn >> 22) & 1) ? 
"spsr" : "cpsr"; + const char *suffix; + + switch ((insn >> 16) & 15) + { + case 9: + suffix = ""; + break; + case 8: + suffix = "_f"; + break; + case 1: + suffix = "_c"; + break; + default: + return 0; + } + + if ((insn >> 25) & 1) + { + unsigned int imm = rol(insn & 0x000000ff, ((insn >> 8) & 15) * 2); + + snprintf(buf, buf_len, "msr%s %s%s,#" IMM_FORMAT, condition(insn), psr, suffix, imm); + } + else + { + const char *rm = register_name(insn >> 0); + + if (((insn >> 4) & 255) != 0) + { + return 0; + } + + snprintf(buf, buf_len, "msr%s %s%s,%s", condition(insn), psr, suffix, rm); + } + + return 1; + } + + if (((insn >> 25) & 1) == 0) + { + unsigned int rm; + + rm = (insn & 15); + + if (is_move) + { + snprintf(buf, buf_len, "%s%s%s %s,%s%s", name, condition(insn), s, register_name(rd), register_name(rm), shift(insn, tmp_buf, sizeof(tmp_buf))); + } + else if (is_test) + { + snprintf(buf, buf_len, "%s%s %s,%s%s", name, condition(insn), register_name(rn), register_name(rm), shift(insn, tmp_buf, sizeof(tmp_buf))); + } + else + { + snprintf(buf, buf_len, "%s%s%s %s,%s,%s%s", name, condition(insn), s, register_name(rd), register_name(rn), register_name(rm), shift(insn, tmp_buf, sizeof(tmp_buf))); + } + } + else + { + unsigned int imm; + + imm = rol(insn & 0x000000ff, ((insn >> 8) & 15) * 2); + + if (is_move) + { + snprintf(buf, buf_len, "%s%s%s %s%s", name, condition(insn), s, register_name(rd), immediate(imm, 0, 1, tmp_buf, sizeof(tmp_buf))); + } + else if (is_test) + { + snprintf(buf, buf_len, "%s%s %s%s", name, condition(insn), register_name(rn), immediate(imm, 0, 1, tmp_buf, sizeof(tmp_buf))); + } + else + { + snprintf(buf, buf_len, "%s%s%s %s,%s%s", name, condition(insn), s, register_name(rd), register_name(rn), immediate(imm, 0, 1, tmp_buf, sizeof(tmp_buf))); + } + } + + return 1; +} + +static int branch(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *link = ((insn >> 24) & 1) ? 
"l" : ""; + unsigned int address; + unsigned int offset; + + offset = insn & 0x00ffffff; + + if ((offset & 0x00800000) != 0) + { + offset |= 0xff000000; + } + + address = pc + 8 + (offset << 2); + + snprintf(buf, buf_len, "b%s%s " ADDR_FORMAT, link, condition(insn), address); + + return 1; +} + +static int multiply(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *rd = register_name(insn >> 16); + const char *rn = register_name(insn >> 12); + const char *rs = register_name(insn >> 8); + const char *rm = register_name(insn >> 0); + const char *s = ((insn >> 20) & 1) ? "s" : ""; + int mla = (insn >> 21) & 1; + + snprintf(buf, buf_len, (mla) ? "mla%s%s %s,%s,%s,%s" : "mul%s%s %s,%s,%s", condition(insn), s, rd, rm, rs, rn); + + return 1; +} + +static int multiply_long(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *rh = register_name(insn >> 16); + const char *rl = register_name(insn >> 12); + const char *rs = register_name(insn >> 8); + const char *rm = register_name(insn >> 0); + const char *u = ((insn >> 22) & 1) ? "s" : "u"; + const char *s = ((insn >> 20) & 1) ? "s" : ""; + const char *name = ((insn >> 21) & 1) ? "mlal" : "mull"; + + snprintf(buf, buf_len, "%s%s%s%s %s,%s,%s,%s", u, name, condition(insn), s, rl, rh, rm, rs); + + return 1; +} + +static int single_data_swap(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *rn = register_name(insn >> 16); + const char *rd = register_name(insn >> 12); + const char *rm = register_name(insn >> 0); + const char *b = ((insn >> 22) & 1) ? "b" : ""; + + snprintf(buf, buf_len, "swp%s%s %s,%s,[%s]", condition(insn), b, rd, rm, rn); + + return 1; +} + +static int branch_and_exchange(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *rn = register_name(insn >> 0); + const char *l = ((insn >> 5) & 1) ? 
"l" : ""; + + snprintf(buf, buf_len, "b%sx%s %s", l, condition(insn), rn); + + return 1; +} + +static int halfword_data_transfer(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *rn = register_name(insn >> 16); + const char *rd = register_name(insn >> 12); + const char *name = ((insn >> 20) & 1) ? "ldr" : "str"; + const char *w = ((insn >> 21) & 1) ? "!" : ""; + int sign = (insn >> 23) & 1; + int pre = (insn >> 24) & 1; + const char *suffix = ""; + char tmp_buf[64]; + + switch ((insn >> 5) & 3) + { + case 0: + name = "swp"; + break; + case 1: + suffix = "h"; + break; + case 2: + suffix = "sb"; + break; + case 3: + suffix = "sh"; + break; + } + + if ((insn >> 22) & 1) + { + unsigned int imm = ((insn >> 4) & 0xf0) | (insn & 0x0f); + + snprintf(buf, buf_len, (pre) ? "%s%s%s %s,[%s%s]%s" : "%s%s%s %s,[%s],%s%s", name, condition(insn), suffix, rd, rn, immediate(imm, !sign, 0, tmp_buf, sizeof(tmp_buf)), w); + } + else + { + const char *rm = register_name(insn >> 0); + + snprintf(buf, buf_len, (pre) ? "%s%s%s %s,[%s,%s%s]%s" : "%s%s%s %s,[%s],%s%s%s", name, condition(insn), suffix, rd, rn, sign ? "" : "-", rm, w); + } + + return 1; +} + +static int single_data_transfer(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *rn = register_name(insn >> 16); + const char *rd = register_name(insn >> 12); + const char *name = ((insn >> 20) & 1) ? "ldr" : "str"; + const char *w = ((insn >> 21) & 1) ? "!" : ""; + const char *b = ((insn >> 22) & 1) ? "b" : ""; + int sign = (insn >> 23) & 1; + int pre = (insn >> 24) & 1; + char tmp_buf[64]; + + if ((insn >> 25) & 1) + { + const char *rm = register_name(insn >> 0); + + snprintf(buf, buf_len, (pre) ? "%s%s%s %s,[%s,%s%s%s]%s" : "%s%s%s %s,[%s],%s%s%s%s", name, condition(insn), b, rd, rn, sign ? "" : "-", rm, shift(insn, tmp_buf, sizeof(tmp_buf)), w); + } + else + { + unsigned int imm = insn & 0x00000fff; + + snprintf(buf, buf_len, (pre) ? 
"%s%s%s %s,[%s%s]%s" : "%s%s%s %s,[%s]%s%s", name, condition(insn), b, rd, rn, immediate(imm, !sign, 0, tmp_buf, sizeof(tmp_buf)), w); + } + + return 1; +} + +static int block_data_transfer(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *s = ((insn >> 22) & 1) ? "^" : ""; + const char *w = ((insn >> 21) & 1) ? "!" : ""; + int load = (insn >> 20) & 1; + const char *name = (load) ? "ldm" : "stm"; + const char *ldm_stubs[4] = { "fa", "fd", "ea", "ed" }; + const char *stm_stubs[4] = { "ed", "ea", "fd", "fa" }; + int stub_idx = (insn >> 23) & 3; + const char *stub = (load) ? ldm_stubs[stub_idx] : stm_stubs[stub_idx]; + char tmp_buf[64]; + + snprintf(buf, buf_len, "%s%s%s %s%s, {%s}%s", name, condition(insn), stub, register_name(insn >> 16), w, register_list(insn & 0xffff, tmp_buf, sizeof(tmp_buf)), s); + + return 1; +} + +static int coprocessor_data_transfer(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *name = ((insn >> 20) & 1) ? "ldc" : "stc"; + const char *rn = register_name(insn >> 16); + int sign = (insn >> 23) & 1; + const char *l = ((insn >> 22) & 1) ? "l" : ""; + const char *w = ((insn >> 21) & 1) ? "!" : ""; + int pre = (insn >> 24) & 1; + unsigned int cp = (insn >> 8) & 15; + unsigned int cd = (insn >> 12) & 15; + unsigned int imm = (insn >> 0) & 255; + char tmp_buf[64]; + + snprintf(buf, buf_len, (pre) ? 
"%s%s%s p%d,cr%d,[%s%s]%s" : "%s%s%s p%d,cr%d,[%s]%s%s", name, condition(insn), l, cp, cd, rn, immediate(imm, !sign, 0, tmp_buf, sizeof(tmp_buf)), w); + + return 1; +} + +static int coprocessor_data_operation(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + snprintf(buf, buf_len, "cdp%s p%d,%d,cr%d,cr%d,cr%d,{%d}", condition(insn), (insn >> 8) & 15, (insn >> 20) & 15, (insn >> 12) & 15, (insn >> 16) & 15, (insn >> 0) & 15, (insn >> 5) & 7); + + return 1; +} + +static int coprocessor_register_transfer(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + const char *name = ((insn >> 20) & 1) ? "mrc" : "mcr"; + unsigned int cn = (insn >> 16) & 15; + const char *rd = register_name(insn >> 12); + unsigned int expr1 = (insn >> 21) & 7; + unsigned int expr2 = (insn >> 5) & 7; + unsigned int cp = (insn >> 8) & 15; + unsigned int cm = (insn >> 0) & 15; + + snprintf(buf, buf_len, "%s%s p%d,%d,%s,cr%d,cr%d,{%d}", name, condition(insn), cp, expr1, rd, cn, cm, expr2); + + return 1; +} + +static int software_interrupt(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + snprintf(buf, buf_len, "swi%s %u", condition(insn), insn & 0x00ffffff); + + return 1; +} + +int disarm(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +{ + if ((insn & 0x0fffffd0) == 0x012fff10) + return branch_and_exchange(pc, insn, buf, buf_len); + + if ((insn & 0x0fb00ff0) == 0x01000090) + return single_data_swap(pc, insn, buf, buf_len); + + if ((insn & 0x0fc000f0) == 0x00000090) + return multiply(pc, insn, buf, buf_len); + + if ((insn & 0x0f8000f0) == 0x00800090) + return multiply_long(pc, insn, buf, buf_len); + + if ((insn & 0x0f000010) == 0x0e000000) + return coprocessor_data_operation(pc, insn, buf, buf_len); + + if ((insn & 0x0f000010) == 0x0e000010) + return coprocessor_register_transfer(pc, insn, buf, buf_len); + + if ((insn & 0x0f000000) == 0x0f000000) + return software_interrupt(pc, insn, buf, buf_len); + + if ((insn & 0x0e000090) == 
0x00000090) + return halfword_data_transfer(pc, insn, buf, buf_len); + + if ((insn & 0x0e000000) == 0x08000000) + return block_data_transfer(pc, insn, buf, buf_len); + + if ((insn & 0x0e000000) == 0x0a000000) + return branch(pc, insn, buf, buf_len); + + if ((insn & 0x0e000000) == 0x0c000000) + return coprocessor_data_transfer(pc, insn, buf, buf_len); + + if ((insn & 0x0c000000) == 0x00000000) + return data_processing(pc, insn, buf, buf_len); + + if ((insn & 0x0c000000) == 0x04000000) + return single_data_transfer(pc, insn, buf, buf_len); + + return 0; +} + diff --git a/platform/common/disarm.h b/platform/common/disarm.h new file mode 100644 index 000000000..2ea4ccc3b --- /dev/null +++ b/platform/common/disarm.h @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2012 Wojtek Kaniewski + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef DISARM_H +#define DISARM_H + +int disarm(unsigned int pc, unsigned int insn, char *buf, unsigned int buf_len); + +#endif /* DISARM_H */ diff --git a/platform/gp2x/host_dasm.c b/platform/gp2x/host_dasm.c new file mode 100644 index 000000000..5e1fc2182 --- /dev/null +++ b/platform/gp2x/host_dasm.c @@ -0,0 +1,85 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "../common/disarm.c" + + +/* symbols */ +typedef struct { const char *name; void *value; } asymbol; + +static asymbol **symbols; +static long symcount, symstorage = 8; + +static const char *lookup_name(void *addr) +{ + asymbol **sptr = symbols; + int i; + + for (i = 0; i < symcount; i++) { + asymbol *sym = *sptr++; + + if (addr == sym->value) + return sym->name; + } + + return NULL; +} + +void host_dasm(void *addr, int len) +{ + void *end = (char *)addr + len; + const char *name; + char buf[64]; + long insn, symaddr; + + while (addr < end) { + name = lookup_name(addr); + if (name != NULL) + printf("%s:\n", name); + + insn = *(long *)addr; + printf(" %08lx %08lx ", (long)addr, insn); + if(disarm((unsigned)addr, insn, buf, sizeof(buf))) { + symaddr = 0; + if ((insn & 0xe000000) == 0xa000000) { + symaddr = (long)addr + 8 + ((long)(insn << 8) >> 6); + name = lookup_name((void *)symaddr); + } + if (symaddr && name) + printf("%s <%s>\n", buf, name); + else if (symaddr && !name) + printf("%s <unknown>\n", buf); + else + printf("%s\n", buf); + } else + printf("unknown\n"); + addr = (char *)addr + sizeof(long); + } +} + +void host_dasm_new_symbol_(void *addr, const char *name) +{ + asymbol *sym, **tmp; + + if (symbols == NULL && (symbols = malloc(symstorage)) == NULL) + return; /* out of memory: drop the symbol instead of writing through NULL */ + if (symstorage <= symcount * sizeof(symbols[0])) { + tmp = realloc(symbols, symstorage * 2); + if (tmp == NULL) + return; + symstorage *= 2; + symbols = tmp; + } + + symbols[symcount] = calloc(1, sizeof(*symbols[0])); + if (symbols[symcount] == NULL) + return; + + // a HACK (should use correct section), but ohwell + sym = symbols[symcount]; + sym->value = addr;
+ sym->name = name; + symcount++; +} + From 8540388a105b8f392ccfe5a25dde8c9254a50313 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 20 Mar 2019 20:34:19 +0100 Subject: [PATCH 011/174] bugfix for 32x --- pico/32x/sh2soc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/pico/32x/sh2soc.c b/pico/32x/sh2soc.c index 62423d136..b5300119f 100644 --- a/pico/32x/sh2soc.c +++ b/pico/32x/sh2soc.c @@ -87,6 +87,7 @@ static void dmac_transfer_one(SH2 *sh2, struct dma_chan *chan) case 0: d = p32x_sh2_read8(chan->sar, sh2); p32x_sh2_write8(chan->dar, d, sh2); + break; case 1: d = p32x_sh2_read16(chan->sar, sh2); p32x_sh2_write16(chan->dar, d, sh2); From 4766b9309aed4e4be549c6b9da93a3013c6abc53 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 20 Mar 2019 21:12:10 +0100 Subject: [PATCH 012/174] sh2 drc host disassembler integration for gp2x --- Makefile | 2 -- config.x86 | 3 +-- cpu/sh2/compiler.c | 2 +- pico/sound/mix.c | 16 +++++++++++++--- platform/common/common.mak | 12 ++++++++++-- .../{gp2x/host_dasm.c => common/host_dasm_arm.c} | 2 +- 6 files changed, 26 insertions(+), 11 deletions(-) rename platform/{gp2x/host_dasm.c => common/host_dasm_arm.c} (98%) diff --git a/Makefile b/Makefile index 8eba86dfd..69b15867f 100644 --- a/Makefile +++ b/Makefile @@ -128,8 +128,6 @@ OBJS += platform/gp2x/emu.o OBJS += platform/gp2x/vid_mmsp2.o OBJS += platform/gp2x/vid_pollux.o OBJS += platform/gp2x/warm.o -OBJS += platform/gp2x/host_dasm.o -OBJS += cpu/sh2/mame/sh2dasm.o USE_FRONTEND = 1 PLATFORM_MP3 = 1 PLATFORM_ZLIB = 1 diff --git a/config.x86 b/config.x86 index 24f9d2093..d463157ea 100644 --- a/config.x86 +++ b/config.x86 @@ -4,10 +4,9 @@ CC = gcc CXX = g++ AS = as STRIP = strip -CFLAGS += -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -m32 #-DGPERF -pg +CFLAGS += -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -m32 # -pg ASFLAGS += LDFLAGS += -m32 #-pg -LDLIBS += -L$(HOME)/opt/binutils-i386/usr/lib/ -lbfd-2.24-multiarch -lopcodes-2.24-multiarch LDLIBS 
+= -L/usr/lib/i386-linux-gnu/debug -L/home/build/opt/lib32 -lSDL-1.2 -lasound -lpng -lz -lm -ldl ARCH = x86 diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index aa41a84df..c6522f37f 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -2744,7 +2744,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (drcf.pending_branch_direct) { struct op_data *opd_b = - (op_flags[i] & OF_DELAY_OP) ? &ops[i-1] : opd; + (op_flags[i] & OF_DELAY_OP) ? opd-1 : opd; u32 target_pc = opd_b->imm; int cond = -1, ncond = -1; void *target = NULL; diff --git a/pico/sound/mix.c b/pico/sound/mix.c index 636edb553..202ba3551 100644 --- a/pico/sound/mix.c +++ b/pico/sound/mix.c @@ -15,16 +15,17 @@ else if ( val < min ) val = min; \ } +int mix_32_to_16l_level; -void mix_32_to_16l_stereo(short *dest, int *src, int count) +void mix_32_to_16l_stereo_core(short *dest, int *src, int count, int level) { int l, r; for (; count > 0; count--) { l = r = *dest; - l += *src++; - r += *src++; + l += *src++ >> level; + r += *src++ >> level; Limit( l, MAXOUT, MINOUT ); Limit( r, MAXOUT, MINOUT ); *dest++ = l; @@ -32,6 +33,15 @@ void mix_32_to_16l_stereo(short *dest, int *src, int count) } } +void mix_32_to_16l_stereo_lvl(short *dest, int *src, int count) +{ + mix_32_to_16l_stereo_core(dest, src, count, mix_32_to_16l_level); +} + +void mix_32_to_16l_stereo(short *dest, int *src, int count) +{ + mix_32_to_16l_stereo_core(dest, src, count, 0); +} void mix_32_to_16_mono(short *dest, int *src, int count) { diff --git a/platform/common/common.mak b/platform/common/common.mak index a5f6078cd..4ba250219 100644 --- a/platform/common/common.mak +++ b/platform/common/common.mak @@ -161,8 +161,16 @@ SRCS_COMMON += $(R)cpu/sh2/compiler.c ifdef drc_debug DEFINES += DRC_DEBUG=$(drc_debug) SRCS_COMMON += $(R)cpu/sh2/mame/sh2dasm.c -SRCS_COMMON += $(R)platform/libpicofe/linux/host_dasm.c -LDFLAGS += -lbfd -lopcodes -liberty +DASM = $(R)platform/libpicofe/linux/host_dasm.c +DASMLIBS = -lbfd 
-lopcodes -liberty +ifeq "$(ARCH)" "arm" +ifeq ($(filter_out $(shell $(CC) --print-file-name=libbfd.so),"/"),) +DASM = $(R)platform/common/host_dasm_arm.c +DASMLIBS = +endif +endif +SRCS_COMMON += $(DASM) +LDFLAGS += $(DASMLIBS) endif endif # use_sh2drc SRCS_COMMON += $(R)cpu/sh2/mame/sh2pico.c diff --git a/platform/gp2x/host_dasm.c b/platform/common/host_dasm_arm.c similarity index 98% rename from platform/gp2x/host_dasm.c rename to platform/common/host_dasm_arm.c index 5e1fc2182..7951b7d92 100644 --- a/platform/gp2x/host_dasm.c +++ b/platform/common/host_dasm_arm.c @@ -2,7 +2,7 @@ #include #include -#include "../common/disarm.c" +#include "disarm.c" /* symbols */ From ac29016a8cc821702eaada53a39a4d7ee88c93f1 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 20 Mar 2019 23:39:45 +0100 Subject: [PATCH 013/174] improved sh2 clock handling, bug fixing + small improvement to drc emitters --- cpu/drc/emit_arm.c | 73 +++++++++++++++++++++++++--------------------- cpu/drc/emit_x86.c | 21 +++++++------ cpu/sh2/sh2.c | 2 +- cpu/sh2/sh2.h | 4 +-- pico/32x/32x.c | 29 +++++++++++------- pico/32x/memory.c | 2 +- pico/cd/mcd.c | 2 ++ pico/pico_int.h | 6 ++-- 8 files changed, 77 insertions(+), 62 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 91b474024..89582e8da 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -86,7 +86,7 @@ #define A_OP_TST 0x8 #define A_OP_TEQ 0x9 #define A_OP_CMP 0xa -#define A_OP_CMN 0xa +#define A_OP_CMN 0xb #define A_OP_ORR 0xc #define A_OP_MOV 0xd #define A_OP_BIC 0xe @@ -250,7 +250,16 @@ #define EOP_MOVT(rd,imm) \ EMIT(0xe3400000 | ((rd)<<12) | (((imm)>>16)&0xfff) | (((imm)>>12)&0xf0000)) -// XXX: AND, RSB, *C, will break if 1 insn is not enough +static int count_bits(unsigned val) +{ + val = (val & 0x55555555) + ((val >> 1) & 0x55555555); + val = (val & 0x33333333) + ((val >> 2) & 0x33333333); + val = (val & 0x0f0f0f0f) + ((val >> 4) & 0x0f0f0f0f); + val = (val & 0x00ff00ff) + ((val >> 8) & 0x00ff00ff); + return (val & 
0xffff) + (val >> 16); +} + +// XXX: RSB, *S will break if 1 insn is not enough static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int imm) { int ror2; @@ -259,23 +268,11 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int switch (op) { case A_OP_MOV: rn = 0; - if (~imm < 0x10000) { + // count bits in imm and use MVN if more bits 1 than 0 + if (count_bits(imm) > 16) { imm = ~imm; op = A_OP_MVN; } -#ifdef HAVE_ARMV7 - for (v = imm, ror2 = 0; v && !(v & 3); v >>= 2) - ror2--; - if (v >> 8) { - /* 2+ insns needed - prefer movw/movt */ - if (op == A_OP_MVN) - imm = ~imm; - EOP_MOVW(rd, imm); - if (imm & 0xffff0000) - EOP_MOVT(rd, imm); - return; - } -#endif break; case A_OP_EOR: @@ -283,27 +280,37 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int case A_OP_ADD: case A_OP_ORR: case A_OP_BIC: - if (s == 0 && imm == 0) + if (s == 0 && imm == 0 && rd == rn) return; break; } - for (v = imm, ror2 = 0; ; ror2 -= 8/2) { - /* shift down to get 'best' rot2 */ - for (; v && !(v & 3); v >>= 2) - ror2--; - - EOP_C_DOP_IMM(cond, op, s, rn, rd, ror2 & 0x0f, v & 0xff); - - v >>= 8; - if (v == 0) - break; - if (op == A_OP_MOV) - op = A_OP_ORR; - if (op == A_OP_MVN) + again: + v = imm, ror2 = 32/2; // arm imm shift is ROR, so rotate for best fit + while ((v >> 24) && !(v & 0xc0)) + v = (v << 2) | (v >> 30), ror2++; + do { + // shift down to get 'best' rot2 + while (v > 0xff && !(v & 3)) + v >>= 2, ror2--; + // AND must fit into 1 insn. 
if not, use BIC + if (op == A_OP_AND && v != (v & 0xff)) { + imm = ~imm; op = A_OP_BIC; + goto again; + } + EOP_C_DOP_IMM(cond, op, s, rn, rd, ror2 & 0xf, v & 0xff); + + switch (op) { + case A_OP_MOV: op = A_OP_ORR; break; + case A_OP_MVN: op = A_OP_BIC; break; + case A_OP_ADC: op = A_OP_ADD; break; + case A_OP_SBC: op = A_OP_SUB; break; + } rn = rd; - } + + v >>= 8, ror2 -= 8/2; + } while (v); } #define emith_op_imm(cond, s, op, r, imm) \ @@ -491,7 +498,7 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_cmp_r_imm(r, imm) { \ u32 op = A_OP_CMP, imm_ = imm; \ if (~imm_ < 0x100) { \ - imm_ = ~imm_; \ + imm_ = -imm_; \ op = A_OP_CMN; \ } \ emith_top_imm(A_COND_AL, op, r, imm); \ @@ -652,12 +659,10 @@ static int emith_xbranch(int cond, void *target, int is_call) if ((count) <= 8) { \ t = (count) - 8; \ t = (0xff << t) & 0xff; \ - EOP_BIC_IMM(d,s,8/2,t); \ EOP_C_DOP_IMM(cond,A_OP_BIC,0,s,d,8/2,t); \ } else if ((count) >= 24) { \ t = (count) - 24; \ t = 0xff >> t; \ - EOP_AND_IMM(d,s,0,t); \ EOP_C_DOP_IMM(cond,A_OP_AND,0,s,d,0,t); \ } else { \ EOP_MOV_REG(cond,0,d,s,A_AM1_LSL,count); \ diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 865aab4b4..e5f2adefb 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -421,13 +421,10 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; rmr = s2; \ } \ EMIT_OP_MODRM(0xf7, 3, op, rmr); /* xMUL rmr */ \ - /* XXX: using push/pop for the case of edx->eax; eax->edx */ \ - if (dhi != xDX && dhi != -1) \ - emith_push(xDX); \ if (dlo != xAX) \ - emith_move_r_r(dlo, xAX); \ - if (dhi != xDX && dhi != -1) \ - emith_pop(dhi); \ + EMIT_OP(0x90 + (dlo)); /* XCHG eax, dlo */ \ + if (dhi != xDX && dhi != -1 && !(dhi == xAX && dlo == xDX)) \ + emith_move_r_r(dhi, (dlo == xDX ? 
xAX : xDX)); \ if (dlo != xDX && dhi != xDX) \ emith_pop(xDX); \ if (dlo != xAX && dhi != xAX) \ @@ -474,12 +471,12 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_deref_op(op, r, rs, offs) do { \ /* mov r <-> [ebp+#offs] */ \ - if ((offs) >= 0x80) { \ + if (abs(offs) >= 0x80) { \ EMIT_OP_MODRM64(op, 2, r, rs); \ EMIT(offs, u32); \ } else { \ EMIT_OP_MODRM64(op, 1, r, rs); \ - EMIT(offs, u8); \ + EMIT((u8)offs, u8); \ } \ } while (0) @@ -496,7 +493,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; int r_ = r; \ if (!is_abcdx(r)) \ r_ = rcache_get_tmp(); \ - emith_deref_op(0x8a, r_, rs, offs); \ + EMIT(0x0f, u8); \ + emith_deref_op(0xb6, r_, rs, offs); \ if ((r) != r_) { \ emith_move_r_r(r, r_); \ rcache_free_tmp(r_); \ @@ -515,8 +513,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; } while (0) #define emith_read16_r_r_offs(r, rs, offs) do { \ - EMIT(0x66, u8); /* operand override */ \ - emith_read_r_r_offs(r, rs, offs); \ + EMIT(0x0f, u8); \ + emith_deref_op(0xb7, r, rs, offs); \ } while (0) #define emith_write16_r_r_offs(r, rs, offs) do { \ @@ -688,6 +686,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; case 0: rd = xDI; break; \ case 1: rd = xSI; break; \ case 2: rd = xDX; break; \ + case 2: rd = xBX; break; \ } #define emith_sh2_drc_entry() { \ diff --git a/cpu/sh2/sh2.c b/cpu/sh2/sh2.c index 403c4c70c..ba2607185 100644 --- a/cpu/sh2/sh2.c +++ b/cpu/sh2/sh2.c @@ -84,7 +84,7 @@ int sh2_irl_irq(SH2 *sh2, int level, int nested_call) // do this to avoid missing irqs that other SH2 might clear int vector = sh2->irq_callback(sh2, level); sh2_do_irq(sh2, level, vector); - sh2->m68krcycles_done += C_SH2_TO_M68K(*sh2, 13); + sh2->m68krcycles_done += C_SH2_TO_M68K(sh2, 13); } else sh2->test_irq = 1; diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index 407270f11..69abf8cd8 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -72,9 +72,9 @@ typedef struct SH2_ #define CYCLE_MULT_SHIFT 10 #define C_M68K_TO_SH2(xsh2, c) \ - ((int)((c) 
* (xsh2).mult_m68k_to_sh2) >> CYCLE_MULT_SHIFT) + ((int)((long long)(c) * (xsh2)->mult_m68k_to_sh2) >> CYCLE_MULT_SHIFT) #define C_SH2_TO_M68K(xsh2, c) \ - ((int)((c + 3) * (xsh2).mult_sh2_to_m68k) >> CYCLE_MULT_SHIFT) + ((int)((long long)(c+3) * (xsh2)->mult_sh2_to_m68k) >> CYCLE_MULT_SHIFT) int sh2_init(SH2 *sh2, int is_slave, SH2 *other_sh2); void sh2_finish(SH2 *sh2); diff --git a/pico/32x/32x.c b/pico/32x/32x.c index 9bfbeface..3ee8c2ea1 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -254,8 +254,8 @@ static void p32x_start_blank(void) } p32x_trigger_irq(NULL, SekCyclesDone(), P32XI_VINT); - p32x_sh2_poll_event(&msh2, SH2_STATE_VPOLL, 0); - p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, 0); + p32x_sh2_poll_event(&msh2, SH2_STATE_VPOLL, SekCyclesDone()); + p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, SekCyclesDone()); } void p32x_schedule_hint(SH2 *sh2, int m68k_cycles) @@ -323,8 +323,12 @@ void p32x_event_schedule_sh2(SH2 *sh2, enum p32x_event event, int after) p32x_event_schedule(now, event, after); - left_to_next = (event_time_next - now) * 3; - sh2_end_run(sh2, left_to_next); + left_to_next = C_M68K_TO_SH2(sh2, (int)(event_time_next - now)); + if (sh2_cycles_left(sh2) > left_to_next) { + if (left_to_next < 1) + left_to_next = 1; + sh2_end_run(sh2, left_to_next); + } } static void p32x_run_events(unsigned int until) @@ -372,13 +376,13 @@ static void run_sh2(SH2 *sh2, int m68k_cycles) pevt_log_sh2_o(sh2, EVT_RUN_START); sh2->state |= SH2_STATE_RUN; - cycles = C_M68K_TO_SH2(*sh2, m68k_cycles); + cycles = C_M68K_TO_SH2(sh2, m68k_cycles); elprintf_sh2(sh2, EL_32X, "+run %u %d @%08x", sh2->m68krcycles_done, cycles, sh2->pc); done = sh2_execute(sh2, cycles, PicoIn.opt & POPT_EN_DRC); - sh2->m68krcycles_done += C_SH2_TO_M68K(*sh2, done); + sh2->m68krcycles_done += C_SH2_TO_M68K(sh2, done); sh2->state &= ~SH2_STATE_RUN; pevt_log_sh2_o(sh2, EVT_RUN_END); elprintf_sh2(sh2, EL_32X, "-run %u %d", @@ -412,8 +416,7 @@ void p32x_sync_other_sh2(SH2 *sh2, unsigned int 
m68k_target) // there might be new event to schedule current sh2 to if (event_time_next) { - left_to_event = event_time_next - m68k_target; - left_to_event *= 3; + left_to_event = C_M68K_TO_SH2(sh2, (int)(event_time_next - m68k_target)); if (sh2_cycles_left(sh2) > left_to_event) { if (left_to_event < 1) left_to_event = 1; @@ -446,6 +449,7 @@ void sync_sh2s_normal(unsigned int m68k_target) now = ssh2.m68krcycles_done; timer_cycles = now; + pprof_start(m68k); while (CYCLES_GT(m68k_target, now)) { if (event_time_next && CYCLES_GE(now, event_time_next)) @@ -463,6 +467,7 @@ void sync_sh2s_normal(unsigned int m68k_target) target - msh2.m68krcycles_done, target - ssh2.m68krcycles_done, m68k_target - now, Pico32x.emu_flags); + pprof_start(ssh2); if (!(ssh2.state & SH2_IDLE_STATES)) { cycles = target - ssh2.m68krcycles_done; if (cycles > 0) { @@ -472,7 +477,9 @@ void sync_sh2s_normal(unsigned int m68k_target) target = event_time_next; } } + pprof_end(ssh2); + pprof_start(msh2); if (!(msh2.state & SH2_IDLE_STATES)) { cycles = target - msh2.m68krcycles_done; if (cycles > 0) { @@ -482,6 +489,7 @@ void sync_sh2s_normal(unsigned int m68k_target) target = event_time_next; } } + pprof_end(msh2); now = target; if (!(msh2.state & SH2_IDLE_STATES)) { @@ -497,6 +505,7 @@ void sync_sh2s_normal(unsigned int m68k_target) p32x_timers_do(now - timer_cycles); timer_cycles = now; } + pprof_end_sub(m68k); // advance idle CPUs if (msh2.state & SH2_IDLE_STATES) { @@ -553,8 +562,8 @@ void PicoFrame32x(void) if (!(Pico32x.sh2_regs[0] & 0x80)) p32x_schedule_hint(NULL, SekCyclesDone()); - p32x_sh2_poll_event(&msh2, SH2_STATE_VPOLL, 0); - p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, 0); + p32x_sh2_poll_event(&msh2, SH2_STATE_VPOLL, SekCyclesDone()); + p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, SekCyclesDone()); if (PicoIn.AHW & PAHW_MCD) pcd_prepare_frame(); diff --git a/pico/32x/memory.c b/pico/32x/memory.c index eff0ab07c..d815853d0 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -146,7 
+146,7 @@ static void sh2s_sync_on_read(SH2 *sh2) cycles = sh2_cycles_done(sh2); if (cycles > 600) - p32x_sync_other_sh2(sh2, sh2->m68krcycles_done + cycles / 3); + p32x_sync_other_sh2(sh2, sh2->m68krcycles_done + C_SH2_TO_M68K(sh2, cycles)); } // SH2 faking diff --git a/pico/cd/mcd.c b/pico/cd/mcd.c index 5e3629a36..8a2f230d5 100644 --- a/pico/cd/mcd.c +++ b/pico/cd/mcd.c @@ -125,6 +125,7 @@ static void SekRunS68k(unsigned int to) if (SekShouldInterrupt()) Pico_mcd->m.s68k_poll_a = 0; + pprof_start(s68k); SekCycleCntS68k += cyc_do; #if defined(EMU_C68K) PicoCpuCS68k.cycles = cyc_do; @@ -137,6 +138,7 @@ static void SekRunS68k(unsigned int to) #elif defined(EMU_F68K) SekCycleCntS68k += fm68k_emulate(&PicoCpuFS68k, cyc_do, 0) - cyc_do; #endif + pprof_end(s68k); } static void pcd_set_cycle_mult(void) diff --git a/pico/pico_int.h b/pico/pico_int.h index 7225cab85..cca7f9541 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -241,11 +241,11 @@ extern SH2 sh2s[2]; # define sh2_pc(sh2) (sh2)->pc #endif -#define sh2_cycles_done(sh2) ((int)(sh2)->cycles_timeslice - sh2_cycles_left(sh2)) +#define sh2_cycles_done(sh2) ((unsigned)(sh2)->cycles_timeslice - sh2_cycles_left(sh2)) #define sh2_cycles_done_t(sh2) \ - ((sh2)->m68krcycles_done * 3 + sh2_cycles_done(sh2)) + (unsigned)(C_M68K_TO_SH2(sh2, (sh2)->m68krcycles_done) + sh2_cycles_done(sh2)) #define sh2_cycles_done_m68k(sh2) \ - ((sh2)->m68krcycles_done + (sh2_cycles_done(sh2) / 3)) + (unsigned)((sh2)->m68krcycles_done + C_SH2_TO_M68K(sh2, sh2_cycles_done(sh2))) #define sh2_reg(c, x) (c) ? ssh2.r[x] : msh2.r[x] #define sh2_gbr(c) (c) ? 
ssh2.gbr : msh2.gbr From 070035b5f373e4ee49b17fdb098befc54101d52b Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 22 Mar 2019 20:17:08 +0100 Subject: [PATCH 014/174] substituted tool to obtain target structure offsets (for asm) --- Makefile | 9 ++--- pico/pico_int_o32.h | 28 ---------------- tools/Makefile | 7 ++-- tools/mkoffsets.sh | 82 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 89 insertions(+), 37 deletions(-) delete mode 100644 pico/pico_int_o32.h create mode 100755 tools/mkoffsets.sh diff --git a/Makefile b/Makefile index 69b15867f..0e9217573 100644 --- a/Makefile +++ b/Makefile @@ -53,6 +53,7 @@ asm_ym2612 ?= 1 asm_misc ?= 1 asm_cdmemory ?= 1 asm_mix ?= 1 +asm_32xdraw ?= 0 # currently defunct else # if not arm use_fame ?= 1 use_cz80 ?= 1 @@ -208,10 +209,10 @@ LDFLAGS += -Wl,-Map=$(TARGET).map endif endif -target_: $(TARGET) +target_: pico/pico_int_o32.h $(TARGET) clean: - $(RM) $(TARGET) $(OBJS) + $(RM) $(TARGET) $(OBJS) pico/pico_int_o32.h $(RM) -r .opk_data $(TARGET): $(OBJS) @@ -225,8 +226,8 @@ endif pprof: platform/linux/pprof.c $(CC) $(CFLAGS) -O2 -ggdb -DPPROF -DPPROF_TOOL -I../../ -I. 
$^ -o $@ $(LDFLAGS) $(LDLIBS) -tools/textfilter: tools/textfilter.c - make -C tools/ textfilter +pico/pico_int_o32.h:: tools/mkoffsets.sh + make -C tools/ XCC="$(CC)" XCFLAGS="$(CFLAGS)" %.o: %.c $(CC) -c $(OBJOUT)$@ $< $(CFLAGS) diff --git a/pico/pico_int_o32.h b/pico/pico_int_o32.h deleted file mode 100644 index 25c64f432..000000000 --- a/pico/pico_int_o32.h +++ /dev/null @@ -1,28 +0,0 @@ -/* autogenerated by tools/mkoffsets, do not edit */ -#define OFS_Pico_video_reg 0x0000 -#define OFS_Pico_m_rotate 0x0040 -#define OFS_Pico_m_z80Run 0x0041 -#define OFS_Pico_m_dirtyPal 0x0046 -#define OFS_Pico_m_hardware 0x0047 -#define OFS_Pico_m_z80_reset 0x004f -#define OFS_Pico_m_sram_reg 0x0049 -#define OFS_Pico_sv 0x008c -#define OFS_Pico_sv_data 0x008c -#define OFS_Pico_sv_start 0x0090 -#define OFS_Pico_sv_end 0x0094 -#define OFS_Pico_sv_flags 0x0098 -#define OFS_Pico_rom 0x033c -#define OFS_Pico_romsize 0x0340 -#define OFS_EST_DrawScanline 0x00 -#define OFS_EST_rendstatus 0x04 -#define OFS_EST_DrawLineDest 0x08 -#define OFS_EST_HighCol 0x0c -#define OFS_EST_HighPreSpr 0x10 -#define OFS_EST_Pico 0x14 -#define OFS_EST_PicoMem_vram 0x18 -#define OFS_EST_PicoMem_cram 0x1c -#define OFS_EST_PicoOpt 0x20 -#define OFS_EST_Draw2FB 0x24 -#define OFS_EST_HighPal 0x28 -#define OFS_PMEM_vram 0x10000 -#define OFS_PMEM_vsram 0x22100 diff --git a/tools/Makefile b/tools/Makefile index 28b748d44..752cd6b26 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -1,13 +1,10 @@ -CFLAGS = -Wall -ggdb - -TARGETS = amalgamate textfilter mkoffsets +TARGETS = amalgamate textfilter OBJS = $(addsuffix .o,$(TARGETS)) all: $(TARGETS) + CC="$(XCC)" CFLAGS="$(XCFLAGS)" ./mkoffsets.sh ../pico clean: $(RM) $(TARGETS) $(OBJS) -mkoffsets: CFLAGS += -m32 -I.. 
- .PHONY: clean all diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh new file mode 100755 index 000000000..60088f213 --- /dev/null +++ b/tools/mkoffsets.sh @@ -0,0 +1,82 @@ +# usage: mkoffsets +# automatically compute structure offsets for gcc targets in ELF format + +CC=${CC:-gcc} + +# endianess of target (automagically determined below) +ENDIAN= + +compile_rodata () +{ + $CC $CFLAGS -I .. -c /tmp/getoffs.c -o /tmp/getoffs.o || exit 1 + rosect=$(readelf -S /tmp/getoffs.o | grep '\.rodata' | + sed 's/^[^.]*././;s/ .*//') + objcopy --dump-section $rosect=/tmp/getoffs.ro /tmp/getoffs.o || exit 1 + ro=$(xxd -ps /tmp/getoffs.ro) + if [ "$ENDIAN" = "le" ]; then + # swap needed for le target + hex="" + for b in $(echo $ro | sed 's/\([0-9a-f]\{2\}\)/\1 /g'); do + hex=$b$hex; + done + else + hex=$ro + fi + rodata=$(printf "%d" 0x$hex) +} + +get_define () # prefix struct member member... +{ + prefix=$1; shift + struct=$1; shift + field=$(echo $* | sed 's/ /./g') + name=$(echo $* | sed 's/ /_/g') + echo '#include "pico/pico_int.h"' > /tmp/getoffs.c + echo "static const struct $struct p;" >> /tmp/getoffs.c + echo "const int offs = (char *)&p.$field - (char*)&p;" >>/tmp/getoffs.c + compile_rodata + line=$(printf "#define %-20s 0x%04x" $prefix$name $rodata) +} + +# determine endianess +echo "const int one = 1;" >/tmp/getoffs.c +compile_rodata +ENDIAN=$(if [ "$rodata" -eq 1 ]; then echo be; else echo le; fi) +# determine output file +echo "const int vsz = sizeof(void *);" >/tmp/getoffs.c +compile_rodata +fn="${1:-.}/pico_int_o$((8*$rodata)).h" +# output header +echo "/* autogenerated by mkoffset.sh, do not edit */" >$fn +echo "/* target endianess: $ENDIAN, compiled with: $CC $CFLAGS */" >>$fn +# output offsets +get_define OFS_Pico_ Pico video reg ; echo "$line" >>$fn +get_define OFS_Pico_ Pico m rotate ; echo "$line" >>$fn +get_define OFS_Pico_ Pico m z80Run ; echo "$line" >>$fn +get_define OFS_Pico_ Pico m dirtyPal ; echo "$line" >>$fn +get_define OFS_Pico_ Pico m hardware 
; echo "$line" >>$fn +get_define OFS_Pico_ Pico m z80_reset ; echo "$line" >>$fn +get_define OFS_Pico_ Pico m sram_reg ; echo "$line" >>$fn +get_define OFS_Pico_ Pico sv ; echo "$line" >>$fn +get_define OFS_Pico_ Pico sv data ; echo "$line" >>$fn +get_define OFS_Pico_ Pico sv start ; echo "$line" >>$fn +get_define OFS_Pico_ Pico sv end ; echo "$line" >>$fn +get_define OFS_Pico_ Pico sv flags ; echo "$line" >>$fn +get_define OFS_Pico_ Pico rom ; echo "$line" >>$fn +get_define OFS_Pico_ Pico romsize ; echo "$line" >>$fn +get_define OFS_Pico_ Pico est ; echo "$line" >>$fn + +get_define OFS_EST_ PicoEState DrawScanline ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState rendstatus ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState DrawLineDest ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState HighCol ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState HighPreSpr ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState Pico ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState PicoMem_vram ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState PicoMem_cram ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState PicoOpt ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState Draw2FB ; echo "$line" >>$fn +get_define OFS_EST_ PicoEState HighPal ; echo "$line" >>$fn + +get_define OFS_PMEM_ PicoMem vram ; echo "$line" >>$fn +get_define OFS_PMEM_ PicoMem vsram ; echo "$line" >>$fn From 59ea3b20f85859be012d731ff4de2e2fe0455b63 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 22 Mar 2019 20:18:33 +0100 Subject: [PATCH 015/174] kludges for wwf raw, nfl --- cpu/sh2/compiler.c | 8 ++++++++ pico/32x/memory.c | 15 +++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index c6522f37f..aa3e772c6 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -4233,6 +4233,14 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, if (opd->imm < end_pc + MAX_LITERAL_OFFSET) { if (end_literals < opd->imm + opd->size * 2) 
end_literals = opd->imm + opd->size * 2; + if (opd->size == 2) { + // tweak for NFL: treat a 32bit literal as an address and check if it + // points to the literal space. In that case handle it like MOVA. + tmp = FETCH32(opd->imm) & ~0x20000000; // MUST ignore wt bit here + if (tmp >= end_pc && tmp < end_pc + MAX_LITERAL_OFFSET) + if (lowest_mova == 0 || tmp < lowest_mova) + lowest_mova = tmp; + } } } } diff --git a/pico/32x/memory.c b/pico/32x/memory.c index d815853d0..c6b89a223 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -1487,6 +1487,20 @@ static void REGPARM(3) sh2_write16_da(u32 a, u32 d, SH2 *sh2) ((u16 *)sh2->data_array)[a1 / 2] = d; } +static void REGPARM(3) sh2_write16_rom(u32 a, u32 d, SH2 *sh2) +{ + u32 a1 = a & 0x3fffff; + // tweak for WWF Raw: does writes to ROM area, and it doesn't work without + // allowing this. + // Presumably the write goes to the CPU cache and is read back from there, + // but it would be extremely costly to emulate cache behaviour. Just allow + // writes to that region, hoping that the original ROM values are never used. 
+ if ((a1 & 0x3e0000) == 0x3e0000) + ((u16 *)sh2->p_rom)[a1 / 2] = d; + else + sh2_write16_unmapped(a, d, sh2); +} + typedef u32 (sh2_read_handler)(u32 a, SH2 *sh2); typedef void REGPARM(3) (sh2_write_handler)(u32 a, u32 d, SH2 *sh2); @@ -1911,6 +1925,7 @@ void PicoMemSetup32x(void) bank_switch_rom_sh2(); sh2_read8_map[0x02/2].mask = sh2_read8_map[0x22/2].mask = sh2_read16_map[0x02/2].mask = sh2_read16_map[0x22/2].mask = 0x3fffff; // FIXME + sh2_write16_map[0x02/2] = sh2_write16_map[0x22/2] = sh2_write16_rom; // CS2 - DRAM - done by Pico32xSwapDRAM() sh2_read8_map[0x04/2].mask = sh2_read8_map[0x24/2].mask = sh2_read16_map[0x04/2].mask = sh2_read16_map[0x24/2].mask = 0x01ffff; From 5cebb689c12aca5ee5c4b40220b136261728823c Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 22 Mar 2019 23:02:11 +0100 Subject: [PATCH 016/174] revamped 32X draw arm asm code --- Makefile | 2 +- pico/32x/{draw_arm.s => draw_arm.S} | 46 ++++++++++++++++------------- 2 files changed, 26 insertions(+), 22 deletions(-) rename pico/32x/{draw_arm.s => draw_arm.S} (90%) diff --git a/Makefile b/Makefile index 0e9217573..76649ffd1 100644 --- a/Makefile +++ b/Makefile @@ -53,7 +53,7 @@ asm_ym2612 ?= 1 asm_misc ?= 1 asm_cdmemory ?= 1 asm_mix ?= 1 -asm_32xdraw ?= 0 # currently defunct +asm_32xdraw ?= 1 else # if not arm use_fame ?= 1 use_cz80 ?= 1 diff --git a/pico/32x/draw_arm.s b/pico/32x/draw_arm.S similarity index 90% rename from pico/32x/draw_arm.s rename to pico/32x/draw_arm.S index ba66fbf1f..e91f9893d 100644 --- a/pico/32x/draw_arm.s +++ b/pico/32x/draw_arm.S @@ -6,9 +6,10 @@ @* See COPYING file in the top-level directory. 
@* +#include "pico/pico_int_o32.h" + .extern Pico32x -.extern PicoDraw2FB -.extern HighPal +.extern Pico .equiv P32XV_PRI, (1<< 7) @@ -22,11 +23,11 @@ Pico32xNativePal: .align 2 -.macro call_scan_prep cond +.macro call_scan_prep cond est @ &Pico.est .if \cond ldr r4, =PicoScan32xBegin ldr r5, =PicoScan32xEnd - ldr r6, =DrawLineDest + ldr r6, [\est, #OFS_EST_DrawLineDest] ldr r4, [r4] ldr r5, [r5] stmfd sp!, {r4,r5,r6} @@ -70,19 +71,20 @@ Pico32xNativePal: \name: stmfd sp!, {r4-r11,lr} + ldr lr,=Pico ldr r10,=Pico32x - ldr r11,=PicoDraw2FB + ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB] ldr r10,[r10, #0x40] @ Pico32x.vdp_regs[0] - ldr r11,[r11] - ldr r9, =HighPal @ palmd + add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd + and r4, r2, #0xff mov r5, #328 - lsl r3, #26 @ mdbg << 26 + mov r3, r3, lsl #26 @ mdbg << 26 mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data tst r10,#P32XV_PRI moveq r10,#0 movne r10,#0x8000 @ r10 = inv_bit - call_scan_prep \call_scan + call_scan_prep \call_scan lr mov r4, #0 @ line b 1f @ loop_outer_entry @@ -139,16 +141,17 @@ Pico32xNativePal: \name: stmfd sp!, {r4-r11,lr} - ldr r11,=PicoDraw2FB + ldr lr,=Pico ldr r10,=Pico32xNativePal - ldr r11,[r11] + ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB] ldr r10,[r10] - ldr r9, =HighPal @ palmd + add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd + and r4, r2, #0xff mov r5, #328 - lsl r3, #26 @ mdbg << 26 + mov r3, r3, lsl #26 @ mdbg << 26 mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data - call_scan_prep \call_scan + call_scan_prep \call_scan lr mov r4, #0 @ line b 1f @ loop_outer_entry @@ -179,8 +182,8 @@ Pico32xNativePal: ldrneb r8, [r5, #2]! @ r7,r8 - pixel 0,1 index subs r6, r6, #1 blt 0b @ loop_outer - cmp r7, r8 - beq 5f @ check_fill @ +8 +@ cmp r7, r8 @ is this really improving things? 
+@ beq 5f @ check_fill @ +8 3: @ no_fill: mov r12,r7, lsl #1 @@ -297,16 +300,17 @@ Pico32xNativePal: \name: stmfd sp!, {r4-r11,lr} - ldr r11,=PicoDraw2FB + ldr lr,=Pico ldr r10,=Pico32xNativePal - ldr r11,[r11] + ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB] ldr r10,[r10] - ldr r9, =HighPal @ palmd + add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd + and r4, r2, #0xff mov r5, #328 - lsl r3, #26 @ mdbg << 26 + mov r3, r3, lsl #26 @ mdbg << 26 mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data - call_scan_prep \call_scan + call_scan_prep \call_scan lr mov r4, #0 @ line b 1f @ loop_outer_entry From 8cfd88aac5349bf341e06960f81de2f7bd8cdd15 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 22 Mar 2019 23:03:26 +0100 Subject: [PATCH 017/174] reworked palette and buffer handling due to some 32X bugs --- pico/32x/draw.c | 52 +++++++++----- pico/draw.c | 135 +++++++++++++++++++++++++++---------- pico/draw_arm.S | 22 ++---- pico/pico_int.h | 3 + pico/videoport.c | 4 +- platform/common/common.mak | 2 +- platform/common/emu.c | 2 + platform/gizmondo/emu.c | 25 ++++--- platform/gizmondo/menu.c | 4 +- platform/gp2x/emu.c | 76 +++++++++++---------- platform/linux/emu.c | 6 +- platform/psp/emu.c | 33 +++++++-- platform/psp/menu.c | 2 +- 13 files changed, 236 insertions(+), 130 deletions(-) diff --git a/pico/32x/draw.c b/pico/32x/draw.c index 828e0adb2..3dd3d62f7 100644 --- a/pico/32x/draw.c +++ b/pico/32x/draw.c @@ -11,6 +11,9 @@ int (*PicoScan32xBegin)(unsigned int num); int (*PicoScan32xEnd)(unsigned int num); int Pico32xDrawMode; +void *DrawLineDestBase32x; +int DrawLineDestIncrement32x; + static void convert_pal555(int invert_prio) { unsigned int *ps = (void *)Pico32xMem->pal; @@ -233,13 +236,11 @@ void PicoDraw32xLayer(int offs, int lines, int md_bg) int lines_sft_offs; int which_func; - Pico.est.DrawLineDest = (char *)DrawLineDestBase + offs * DrawLineDestIncrement; + Pico.est.DrawLineDest = (char *)DrawLineDestBase32x + offs * DrawLineDestIncrement32x; dram = 
Pico32xMem->dram[Pico32x.vdp_regs[0x0a/2] & P32XV_FS]; - if (Pico32xDrawMode == PDM32X_BOTH) { - if (Pico.m.dirtyPal) - PicoDrawUpdateHighPal(); - } + if (Pico32xDrawMode == PDM32X_BOTH) + PicoDrawUpdateHighPal(); if ((Pico32x.vdp_regs[0] & P32XV_Mx) == 2) { @@ -278,20 +279,21 @@ void PicoDraw32xLayer(int offs, int lines, int md_bg) void PicoDraw32xLayerMdOnly(int offs, int lines) { int have_scan = PicoScan32xBegin != NULL && PicoScan32xEnd != NULL; - unsigned short *dst = (void *)((char *)DrawLineDestBase + offs * DrawLineDestIncrement); + unsigned short *dst = (void *)((char *)DrawLineDestBase32x + offs * DrawLineDestIncrement32x); unsigned char *pmd = Pico.est.Draw2FB + 328 * offs + 8; unsigned short *pal = Pico.est.HighPal; int poffs = 0, plen = 320; int l, p; if (!(Pico.video.reg[12] & 1)) { - // 32col mode + // 32col mode. for some render modes MD pixel data carries an offset + if (!(PicoIn.opt & (POPT_ALT_RENDERER|POPT_DIS_32C_BORDER))) + pmd += 32; poffs = 32; plen = 256; } - if (Pico.m.dirtyPal) - PicoDrawUpdateHighPal(); + PicoDrawUpdateHighPal(); dst += poffs; for (l = 0; l < lines; l++) { @@ -305,7 +307,7 @@ void PicoDraw32xLayerMdOnly(int offs, int lines) dst[p + 2] = pal[*pmd++]; dst[p + 3] = pal[*pmd++]; } - dst = (void *)((char *)dst + DrawLineDestIncrement); + dst = (void *)((char *)dst + DrawLineDestIncrement32x); pmd += 328 - plen; if (have_scan) PicoScan32xEnd(l + offs); @@ -319,16 +321,32 @@ void PicoDrawSetOutFormat32x(pdso_t which, int use_32x_line_mode) Pico32xNativePal = Pico32xMem->pal_native; #endif - if (which == PDF_RGB555 && use_32x_line_mode) { - // we'll draw via FinalizeLine32xRGB555 (rare) + if (which == PDF_RGB555) { + // need CLUT pixels in PicoDraw2FB for layer transparency + PicoDrawSetInternalBuf(Pico.est.Draw2FB, 328); + PicoDrawSetOutBufMD(DrawLineDestBase32x, DrawLineDestIncrement32x); + } else { + // use the same layout as alt renderer PicoDrawSetInternalBuf(NULL, 0); - Pico32xDrawMode = PDM32X_OFF; - return; + 
PicoDrawSetOutBufMD(Pico.est.Draw2FB + 8, 328); } - // use the same layout as alt renderer - PicoDrawSetInternalBuf(Pico.est.Draw2FB, 328); - Pico32xDrawMode = (which == PDF_RGB555) ? PDM32X_32X_ONLY : PDM32X_BOTH; + if (use_32x_line_mode) + // we'll draw via FinalizeLine32xRGB555 (rare) + Pico32xDrawMode = PDM32X_OFF; + else + // in RGB555 mode the 32x layer is drawn over the MD layer, in the other + // modes 32x and MD layer are merged together by the 32x renderer + Pico32xDrawMode = (which == PDF_RGB555) ? PDM32X_32X_ONLY : PDM32X_BOTH; +} + +void PicoDrawSetOutBuf32X(void *dest, int increment) +{ + DrawLineDestBase32x = dest; + DrawLineDestIncrement32x = increment; + // in RGB555 mode this buffer is also used by the MD renderer + if (Pico32xDrawMode != PDM32X_BOTH) + PicoDrawSetOutBufMD(DrawLineDestBase32x, DrawLineDestIncrement32x); } // vim:shiftwidth=2:ts=2:expandtab diff --git a/pico/draw.c b/pico/draw.c index a6c5903b3..6fa17988d 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -1239,6 +1239,49 @@ void BackFill(int reg7, int sh, struct PicoEState *est) // -------------------------------------------- +void PicoDoHighPal555_8bit(int sh, int line, struct PicoEState *est) +{ + unsigned int *spal, *dpal; + unsigned int cnt = (sh ? 1 : est->SonicPalCount+1); + unsigned int t, i; + + // reset dirty only if there are no outstanding changes + if (Pico.m.dirtyPal == 2) + Pico.m.dirtyPal = 0; + + // In Sonic render mode palettes were backuped in SonicPal + spal = (void *)est->SonicPal; + dpal = (void *)est->HighPal; + + // additional palettes stored after in-frame changes + for (i = 0; i < cnt * 0x40 / 2; i++) { + t = spal[i]; +#ifdef USE_BGR555 + t = ((t & 0x000e000e)<< 1) | ((t & 0x00e000e0)<<3) | ((t & 0x0e000e00)<<4); +#else + t = ((t & 0x000e000e)<<12) | ((t & 0x00e000e0)<<3) | ((t & 0x0e000e00)>>7); +#endif + // treat it like it was 4-bit per channel, since in s/h mode it somewhat is that. 
+ // otherwise intensity difference between this and s/h will be wrong + t |= (t >> 4) & 0x08610861; // 0x18e318e3 + dpal[i] = t; + } + + // norm: xxx0, sh: 0xxx, hi: 0xxx + 7 + if (sh) + { + // shadowed pixels + for (i = 0; i < 0x40 / 2; i++) + dpal[0x40/2 | i] = dpal[0xc0/2 | i] = (dpal[i] >> 1) & 0x738e738e; + // hilighted pixels + for (i = 0; i < 0x40 / 2; i++) { + t = ((dpal[i] >> 1) & 0x738e738e) + 0x738e738e; // 0x7bef7bef; + t |= (t >> 4) & 0x08610861; + dpal[0x80/2 | i] = t; + } + } +} + #ifndef _ASM_DRAW_C void PicoDoHighPal555(int sh, int line, struct PicoEState *est) { @@ -1285,8 +1328,7 @@ void FinalizeLine555(int sh, int line, struct PicoEState *est) unsigned short *pal=est->HighPal; int len; - if (Pico.m.dirtyPal) - PicoDoHighPal555(sh, line, est); + PicoDrawUpdateHighPal(); if (Pico.video.reg[12]&1) { len = 320; @@ -1315,22 +1357,21 @@ void FinalizeLine555(int sh, int line, struct PicoEState *est) static void FinalizeLine8bit(int sh, int line, struct PicoEState *est) { unsigned char *pd = est->DrawLineDest; - int len, rs = est->rendstatus; - static int dirty_count; + int len; + static int dirty_line; - if (!sh && Pico.m.dirtyPal == 1) + if (Pico.m.dirtyPal == 1) { // a hack for mid-frame palette changes - if (!(rs & PDRAW_SONIC_MODE)) - dirty_count = 1; - else dirty_count++; - rs |= PDRAW_SONIC_MODE; - est->rendstatus = rs; - if (dirty_count == 3) { - blockcpy(est->HighPal, PicoMem.cram, 0x40*2); - } else if (dirty_count == 11) { - blockcpy(est->HighPal+0x40, PicoMem.cram, 0x40*2); + if (!(est->rendstatus & PDRAW_SONIC_MODE) || line - dirty_line > 4) { + // store a maximum of 3 additional palettes in SonicPal + if (est->SonicPalCount < 3) + est->SonicPalCount ++; + dirty_line = line; + est->rendstatus |= PDRAW_SONIC_MODE; } + blockcpy(est->SonicPal+est->SonicPalCount*0x40, PicoMem.cram, 0x40*2); + Pico.m.dirtyPal = 2; } if (Pico.video.reg[12]&1) { @@ -1341,12 +1382,9 @@ static void FinalizeLine8bit(int sh, int line, struct PicoEState *est) len = 
256; } - if (!sh && (rs & PDRAW_SONIC_MODE)) { - if (dirty_count >= 11) { - blockcpy_or(pd, est->HighCol+8, len, 0x80); - } else { - blockcpy_or(pd, est->HighCol+8, len, 0x40); - } + if (!sh && (est->rendstatus & PDRAW_SONIC_MODE)) { + // select active backup palette + blockcpy_or(pd, est->HighCol+8, len, est->SonicPalCount*0x40); } else { blockcpy(pd, est->HighCol+8, len); } @@ -1478,6 +1516,7 @@ static int DrawDisplay(int sh) PICO_INTERNAL void PicoFrameStart(void) { int offs = 8, lines = 224; + int dirty = ((Pico.est.rendstatus & PDRAW_SONIC_MODE) || Pico.m.dirtyPal); // prepare to do this frame Pico.est.rendstatus = 0; @@ -1503,11 +1542,16 @@ PICO_INTERNAL void PicoFrameStart(void) Pico.est.DrawScanline = 0; skip_next_line = 0; + if (FinalizeLine == FinalizeLine8bit) { + // make a backup of the current palette in case Sonic mode is detected later + Pico.est.SonicPalCount = 0; + Pico.m.dirtyPal = (dirty ? 2 : 0); // mark as dirty but already copied + blockcpy(Pico.est.SonicPal, PicoMem.cram, 0x40*2); + } + if (PicoIn.opt & POPT_ALT_RENDERER) return; - if (Pico.m.dirtyPal) - Pico.m.dirtyPal = 2; // reset dirty if needed PrepareSprites(1); } @@ -1598,15 +1642,21 @@ void PicoDrawSync(int to, int blank_last_line) void PicoDrawUpdateHighPal(void) { struct PicoEState *est = &Pico.est; - int sh = (Pico.video.reg[0xC] & 8) >> 3; // shadow/hilight? - if (PicoIn.opt & POPT_ALT_RENDERER) - sh = 0; // no s/h support + if (Pico.m.dirtyPal) { + int sh = (Pico.video.reg[0xC] & 8) >> 3; // shadow/hilight? + if ((PicoIn.opt & POPT_ALT_RENDERER) | (est->rendstatus & PDRAW_SONIC_MODE)) + sh = 0; // no s/h support + + if (FinalizeLine == FinalizeLine8bit) + PicoDoHighPal555_8bit(sh, 0, est); + else + PicoDoHighPal555(sh, 0, est); - PicoDoHighPal555(sh, 0, &Pico.est); - if (est->rendstatus & PDRAW_SONIC_MODE) { - // FIXME? 
- memcpy(est->HighPal + 0x40, est->HighPal, 0x40*2); - memcpy(est->HighPal + 0x80, est->HighPal, 0x40*2); + // cover for sprite priority bits if not in s/h or sonic mode + if (!sh && !(est->rendstatus & PDRAW_SONIC_MODE)) { + blockcpy(est->HighPal+0x40, est->HighPal, 0x40*2); + blockcpy(est->HighPal+0x80, est->HighPal, 0x80*2); + } } } @@ -1629,17 +1679,33 @@ void PicoDrawSetOutFormat(pdso_t which, int use_32x_line_mode) FinalizeLine = NULL; break; } - PicoDrawSetOutFormat32x(which, use_32x_line_mode); + if (PicoIn.AHW & PAHW_32X) + PicoDrawSetOutFormat32x(which, use_32x_line_mode); PicoDrawSetOutputMode4(which); rendstatus_old = -1; } +void PicoDrawSetOutBufMD(void *dest, int increment) +{ + if (dest != NULL) { + DrawLineDestBase = dest; + DrawLineDestIncrement = increment; + Pico.est.DrawLineDest = DrawLineDestBase + Pico.est.DrawScanline * increment; + } + else { + DrawLineDestBase = DefOutBuff; + DrawLineDestIncrement = 0; + Pico.est.DrawLineDest = DefOutBuff; + } +} + // note: may be called on the middle of frame void PicoDrawSetOutBuf(void *dest, int increment) { - DrawLineDestBase = dest; - DrawLineDestIncrement = increment; - Pico.est.DrawLineDest = (char *)DrawLineDestBase + Pico.est.DrawScanline * increment; + if (PicoIn.AHW & PAHW_32X) + PicoDrawSetOutBuf32X(dest, increment); + else + PicoDrawSetOutBufMD(dest, increment); } void PicoDrawSetInternalBuf(void *dest, int increment) @@ -1652,6 +1718,7 @@ void PicoDrawSetInternalBuf(void *dest, int increment) else { HighColBase = DefHighCol; HighColIncrement = 0; + Pico.est.HighCol = DefHighCol; } } diff --git a/pico/draw_arm.S b/pico/draw_arm.S index 29af1c136..3bc270331 100644 --- a/pico/draw_arm.S +++ b/pico/draw_arm.S @@ -1498,11 +1498,9 @@ vidConvCpyRGB565: @ void *to, void *from, int pixels PicoDoHighPal555: stmfd sp!, {r4-r10,lr} mov r10,r2 @ est - mov r1, #0 ldr r8, [r10, #OFS_EST_Pico] -PicoDoHighPal555_nopush: - orr r9, r1, r0, lsl #31 @ 0:called from FinalizeLine555, 31: s/h + mov r9, r0 add r0, 
r10, #OFS_EST_HighPal @@ -1517,7 +1515,7 @@ PicoDoHighPal555_nopush: vidConvCpyRGB565_local - tst r9, #(1<<31) + cmp r9, #0 beq PicoDoHighPal555_end add r3, r10, #OFS_EST_HighPal @@ -1560,11 +1558,7 @@ PicoDoHighPal555_nopush: mov r0, #1 PicoDoHighPal555_end: - tst r9, #1 - ldmeqfd sp!, {r4-r10,pc} - - ldr r8, [r10, #OFS_EST_Pico] - b FinalizeLineRGB555_pal_done + ldmfd sp!, {r4-r10,pc} @ void FinalizeLine555(int sh, int line, struct PicoEState *est) @@ -1576,19 +1570,11 @@ FinalizeLine555: mov r10,r2 @ est ldr r8, [r10, #OFS_EST_Pico] - ldrb r2, [r8, #OFS_Pico_m_dirtyPal] - mov r1, #1 - tst r2, r2 - bne PicoDoHighPal555_nopush + bl PicoDrawUpdateHighPal -FinalizeLineRGB555_pal_done: add r3, r10, #OFS_EST_HighPal - ldr r12, [r10, #OFS_EST_rendstatus] - eors r0, r0, #1 @ sh is 0 mov lr, #0xff - tstne r12,#PDRAW_ACC_SPRITES - movne lr, #0x3f ldr r1, [r10, #OFS_EST_HighCol] ldr r0, [r10, #OFS_EST_DrawLineDest] diff --git a/pico/pico_int.h b/pico/pico_int.h index cca7f9541..f6d8b37f2 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -356,6 +356,8 @@ struct PicoEState unsigned int *PicoOpt; unsigned char *Draw2FB; unsigned short HighPal[0x100]; + unsigned short SonicPal[0x100]; + int SonicPalCount; }; struct PicoMem @@ -923,6 +925,7 @@ void p32x_sh2_poll_event(SH2 *sh2, unsigned int flags, unsigned int m68k_cycles) // 32x/draw.c void PicoDrawSetOutFormat32x(pdso_t which, int use_32x_line_mode); +void PicoDrawSetOutBuf32X(void *dest, int increment); void FinalizeLine32xRGB555(int sh, int line, struct PicoEState *est); void PicoDraw32xLayer(int offs, int lines, int mdbg); void PicoDraw32xLayerMdOnly(int offs, int lines); diff --git a/pico/videoport.c b/pico/videoport.c index cd76dc049..d18c2cf9d 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -41,7 +41,7 @@ static void VideoWrite(u16 d) if (a - ((unsigned)(Pico.video.reg[5]&0x7f) << 9) < 0x400) Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; break; - case 3: Pico.m.dirtyPal = 1; + case 3: if (PicoMem.cram [(a >> 
1) & 0x3f] != d) Pico.m.dirtyPal = 1; PicoMem.cram [(a >> 1) & 0x3f] = d; break; case 5: PicoMem.vsram[(a >> 1) & 0x3f] = d; break; case 0x81: @@ -441,7 +441,7 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) break; case 0x0c: // renderers should update their palettes if sh/hi mode is changed - if ((d^dold)&8) Pico.m.dirtyPal = 2; + if ((d^dold)&8) Pico.m.dirtyPal = 1; break; } return; diff --git a/platform/common/common.mak b/platform/common/common.mak index 4ba250219..b1ccbb475 100644 --- a/platform/common/common.mak +++ b/platform/common/common.mak @@ -70,7 +70,7 @@ SRCS_COMMON += $(R)pico/cd/memory_arm.S endif ifeq "$(asm_32xdraw)" "1" DEFINES += _ASM_32X_DRAW -SRCS_COMMON += $(R)pico/32x/draw_arm.s +SRCS_COMMON += $(R)pico/32x/draw_arm.S endif ifeq "$(asm_mix)" "1" SRCS_COMMON += $(R)pico/sound/mix_arm.S diff --git a/platform/common/emu.c b/platform/common/emu.c index 7b68abe90..fdde3dd70 100644 --- a/platform/common/emu.c +++ b/platform/common/emu.c @@ -1411,8 +1411,10 @@ void emu_loop(void) { notice_msg_time = 0; plat_status_msg_clear(); +#ifndef __GP2X__ plat_video_flip(); plat_status_msg_clear(); /* Do it again in case of double buffering */ +#endif notice_msg = NULL; } else { diff --git a/platform/gizmondo/emu.c b/platform/gizmondo/emu.c index 86c473c2a..fcf271250 100644 --- a/platform/gizmondo/emu.c +++ b/platform/gizmondo/emu.c @@ -155,7 +155,7 @@ static void blit(const char *fps, const char *notice) } // a hack for VR if (PicoIn.AHW & PAHW_SVP) - memset32((int *)(Pico.est.Draw2FB+328*8+328*223), 0xe0e0e0e0, 328); + memset((int *)(Pico.est.Draw2FB+328*8+328*223), 0xe0e0e0e0, 328*4); if (!(Pico.video.reg[12]&1)) lines_flags|=0x10000; if (currentConfig.EmuOpt&0x4000) lines_flags|=0x40000; // (Pico.m.frame_count&1)?0x20000:0x40000; @@ -166,22 +166,25 @@ static void blit(const char *fps, const char *notice) int lines_flags; // 8bit accurate renderer if (Pico.m.dirtyPal) { - Pico.m.dirtyPal = 0; - vidConvCpyRGB565(localPal, 
Pico.cram, 0x40); + if (Pico.m.dirtyPal == 2) + Pico.m.dirtyPal = 0; + /* no support + switch (Pico.est.SonicPalCount) { + case 3: vidConvCpyRGB565(localPal+0xc0, Pico.est.SonicPal+0xc0, 0x40); + case 2: vidConvCpyRGB565(localPal+0x80, Pico.est.SonicPal+0x80, 0x40); + case 1: vidConvCpyRGB565(localPal+0x40, Pico.est.SonicPal+0x40, 0x40); + default://vidConvCpyRGB565(localPal, Pico.est.SonicPal, 0x40); + } */ + vidConvCpyRGB565(localPal, Pico.est.SonicPal, 0x40); if (Pico.video.reg[0xC]&8) { // shadow/hilight mode - //vidConvCpyRGB32sh(localPal+0x40, Pico.cram, 0x40); - //vidConvCpyRGB32hi(localPal+0x80, Pico.cram, 0x40); // TODO? - memcpy32((void *)(localPal+0xc0), (void *)(localPal+0x40), 0x40*2/4); + //vidConvCpyRGB32sh(localPal+0x40, Pico.est.SonicPal, 0x40); + //vidConvCpyRGB32hi(localPal+0x80, Pico.est.SonicPal, 0x40); // TODO? + memcpy((void *)(localPal+0xc0), (void *)(localPal+0x40), 0x40*2); localPal[0xc0] = 0x0600; localPal[0xd0] = 0xc000; localPal[0xe0] = 0x0000; // reserved pixels for OSD localPal[0xf0] = 0xffff; } - /* no support - else if (rendstatus & 0x20) { // mid-frame palette changes - vidConvCpyRGB565(localPal+0x40, HighPal, 0x40); - vidConvCpyRGB565(localPal+0x80, HighPal+0x40, 0x40); - } */ } lines_flags = (Pico.video.reg[1]&8) ? 
240 : 224; if (!(Pico.video.reg[12]&1)) lines_flags|=0x10000; diff --git a/platform/gizmondo/menu.c b/platform/gizmondo/menu.c index 51f032f0d..1045f47b2 100644 --- a/platform/gizmondo/menu.c +++ b/platform/gizmondo/menu.c @@ -54,7 +54,7 @@ static unsigned int inp_prev = 0; void menu_draw_begin(int use_bgbuff) { if (use_bgbuff) - memcpy32((int *)menu_screen, (int *)bg_buffer, 321*240*2/4); + memcpy((int *)menu_screen, (int *)bg_buffer, 321*240*2); } @@ -66,7 +66,7 @@ void menu_draw_end(void) lprintf("%s: Framework2D_LockBuffer() returned NULL\n", __FUNCTION__); return; } - memcpy32(giz_screen, (int *)menu_screen, 321*240*2/4); + memcpy(giz_screen, (int *)menu_screen, 321*240*2); fb_unlock(); giz_screen = NULL; fb_flip(); diff --git a/platform/gp2x/emu.c b/platform/gp2x/emu.c index 18d8a57eb..450ac0803 100644 --- a/platform/gp2x/emu.c +++ b/platform/gp2x/emu.c @@ -291,32 +291,45 @@ static int EmuScanEnd16_ld(unsigned int num) } static int localPal[0x100]; +static int localPalSize; + static void (*vidcpyM2)(void *dest, void *src, int m32col, int with_32c_border); static int (*make_local_pal)(int fast_mode); static int make_local_pal_md(int fast_mode) { - int pallen = 0xc0; - - bgr444_to_rgb32(localPal, PicoMem.cram); - if (fast_mode) - return 0x40; + int pallen = 0x100; - if (Pico.video.reg[0xC] & 8) { // shadow/hilight mode - bgr444_to_rgb32_sh(localPal, PicoMem.cram); - localPal[0xc0] = 0x0000c000; - localPal[0xd0] = 0x00c00000; - localPal[0xe0] = 0x00000000; // reserved pixels for OSD - localPal[0xf0] = 0x00ffffff; - pallen = 0x100; + if (fast_mode) { + bgr444_to_rgb32(localPal, PicoMem.cram); + pallen = 0x40; + Pico.m.dirtyPal = 0; } else if (Pico.est.rendstatus & PDRAW_SONIC_MODE) { // mid-frame palette changes - bgr444_to_rgb32(localPal+0x40, Pico.est.HighPal); - bgr444_to_rgb32(localPal+0x80, Pico.est.HighPal+0x40); + switch (Pico.est.SonicPalCount) { + case 3: bgr444_to_rgb32(localPal+0xc0, Pico.est.SonicPal+0xc0); + case 2: bgr444_to_rgb32(localPal+0x80, 
Pico.est.SonicPal+0x80); + case 1: bgr444_to_rgb32(localPal+0x40, Pico.est.SonicPal+0x40); + default:bgr444_to_rgb32(localPal, Pico.est.SonicPal); + } + pallen = (Pico.est.SonicPalCount+1)*0x40; } - else - memcpy(localPal + 0x80, localPal, 0x40 * 4); // for spr prio mess + else if (Pico.video.reg[0xC] & 8) { // shadow/hilight mode + bgr444_to_rgb32(localPal, Pico.est.SonicPal); + bgr444_to_rgb32_sh(localPal, Pico.est.SonicPal); + } + else { + bgr444_to_rgb32(localPal, Pico.est.SonicPal); + memcpy(localPal+0x40, localPal, 0x40*4); // for spr prio mess + memcpy(localPal+0x80, localPal, 0x80*4); // for spr prio mess + } + localPal[0xc0] = 0x0000c000; + localPal[0xd0] = 0x00c00000; + localPal[0xe0] = 0x00000000; // reserved pixels for OSD + localPal[0xf0] = 0x00ffffff; + if (Pico.m.dirtyPal == 2) + Pico.m.dirtyPal = 0; return pallen; } @@ -334,25 +347,21 @@ static int make_local_pal_sms(int fast_mode) *dpal++ = t; } + Pico.m.dirtyPal = 0; return 0x40; } void pemu_finalize_frame(const char *fps, const char *notice) { int emu_opt = currentConfig.EmuOpt; - int ret; if (PicoIn.AHW & PAHW_32X) - ; // nothing to do + localPalSize = 0; // nothing to do else if (get_renderer() == RT_8BIT_FAST) { // 8bit fast renderer - if (Pico.m.dirtyPal) { - Pico.m.dirtyPal = 0; - ret = make_local_pal(1); - // feed new palette to our device - gp2x_video_setpalette(localPal, ret); - } + if (Pico.m.dirtyPal) + localPalSize = make_local_pal(1); // a hack for VR if (PicoIn.AHW & PAHW_SVP) memset32((int *)(Pico.est.Draw2FB+328*8+328*223), 0xe0e0e0e0, 328); @@ -364,12 +373,9 @@ void pemu_finalize_frame(const char *fps, const char *notice) { // 8bit accurate renderer if (Pico.m.dirtyPal) - { - Pico.m.dirtyPal = 0; - ret = make_local_pal(0); - gp2x_video_setpalette(localPal, ret); - } + localPalSize = make_local_pal(0); } + else localPalSize = 0; // no palette in 16bit mode if (notice) osd_text(4, osd_y, notice); @@ -385,6 +391,10 @@ void plat_video_flip(void) { int stride = g_screen_width; 
gp2x_video_flip(); + // switching the palette takes immediate effect, whilst flipping only + // takes effect with the next vsync; unavoidable flicker may occur! + if (localPalSize) + gp2x_video_setpalette(localPal, localPalSize); if (is_16bit_mode()) stride *= 2; @@ -502,9 +512,6 @@ static void vid_reset_mode(void) if (renderer == RT_16BIT && (currentConfig.EmuOpt & EOPT_WIZ_TEAR_FIX)) { PicoDrawSetOutFormat(PDF_RGB555, 1); } - else { - PicoDrawSetOutFormat(PDF_NONE, 0); - } PicoDrawSetOutBuf(g_screen_ptr, g_screen_width * 2); gp2x_mode = 16; } @@ -537,10 +544,7 @@ static void vid_reset_mode(void) localPal[0xe0] = 0x00000000; // reserved pixels for OSD localPal[0xf0] = 0x00ffffff; gp2x_video_setpalette(localPal, 0x100); - gp2x_memset_all_buffers(0, 0xe0, 320*240); } - else - gp2x_memset_all_buffers(0, 0, 320*240*2); if (currentConfig.EmuOpt & EOPT_WIZ_TEAR_FIX) gp2x_mode = -gp2x_mode; @@ -723,6 +727,8 @@ void pemu_forced_frame(int no_scale, int do_emu) PicoDrawSetCallbacks(NULL, NULL); Pico.m.dirtyPal = 1; + if (!no_scale) + no_scale = currentConfig.scaling == EOPT_SCALE_NONE; emu_cmn_forced_frame(no_scale, do_emu); g_menubg_src_ptr = g_screen_ptr; diff --git a/platform/linux/emu.c b/platform/linux/emu.c index 5d4432fa0..8af5afa80 100644 --- a/platform/linux/emu.c +++ b/platform/linux/emu.c @@ -71,8 +71,8 @@ void pemu_finalize_frame(const char *fps, const char *notice) unsigned char *ps = Pico.est.Draw2FB + 328*8 + 8; unsigned short *pal = Pico.est.HighPal; int i, x; - if (Pico.m.dirtyPal) - PicoDrawUpdateHighPal(); + + PicoDrawUpdateHighPal(); for (i = 0; i < 224; i++, ps += 8) for (x = 0; x < 320; x++) *pd++ = pal[*ps++]; @@ -109,6 +109,8 @@ static void apply_renderer(void) if (PicoIn.AHW & PAHW_32X) PicoDrawSetOutBuf(g_screen_ptr, g_screen_ppitch * 2); + + Pico.m.dirtyPal = 1; } void plat_video_toggle_renderer(int change, int is_menu) diff --git a/platform/psp/emu.c b/platform/psp/emu.c index 5c0cb57f7..5c7ff2162 100644 --- a/platform/psp/emu.c +++ 
b/platform/psp/emu.c @@ -201,13 +201,22 @@ static void do_pal_update(int allow_sh, int allow_as) //for (i = 0x3f/2; i >= 0; i--) // dpal[i] = ((spal[i]&0x000f000f)<< 1)|((spal[i]&0x00f000f0)<<3)|((spal[i]&0x0f000f00)<<4); - do_pal_convert(localPal, Pico.cram, currentConfig.gamma, currentConfig.gamma2); - - Pico.m.dirtyPal = 0; - need_pal_upload = 1; - - if (allow_sh && (Pico.video.reg[0xC]&8)) // shadow/hilight? + if ((currentConfig.EmuOpt&0x80) || (PicoOpt&0x10)) { + do_pal_convert(localPal, Pico.cram, currentConfig.gamma, currentConfig.gamma2); + Pico.m.dirtyPal = 0; + } + else if (Pico.est.rendstatus&0x20) + { + switch (Pico.est.SonicPalCount) { + case 3: do_pal_convert(localPal+0xc0, Pico.est.SonicPal+0xc0, currentConfig.gamma, currentConfig.gamma2); + case 2: do_pal_convert(localPal+0x80, Pico.est.SonicPal+0x80, currentConfig.gamma, currentConfig.gamma2); + case 1: do_pal_convert(localPal+0x40, Pico.est.SonicPal+0x40, currentConfig.gamma, currentConfig.gamma2); + default:do_pal_convert(localPal, Pico.est.SonicPal, currentConfig.gamma, currentConfig.gamma2); + } + } + else if (allow_sh && (Pico.video.reg[0xC]&8)) // shadow/hilight? 
{ + do_pal_convert(localPal, Pico.est.SonicPal, currentConfig.gamma, currentConfig.gamma2); // shadowed pixels for (i = 0x3f/2; i >= 0; i--) dpal[0x20|i] = dpal[0x60|i] = (dpal[i]>>1)&0x7bcf7bcf; @@ -223,6 +232,16 @@ static void do_pal_update(int allow_sh, int allow_as) localPal[0xe0] = 0; localPal[0xf0] = 0x001f; } + else if (allow_as) + { + do_pal_convert(localPal, Pico.est.SonicPal, currentConfig.gamma, currentConfig.gamma2); + memcpy((int *)dpal+0x40/2, (void *)localPal, 0x40*2); + memcpy((int *)dpal+0x80/2, (void *)localPal, 0x80*2); + } + + if (Pico.m.dirtyPal == 2) + Pico.m.dirtyPal = 0; + need_pal_upload = 1; } static void do_slowmode_lines(int line_to) @@ -639,7 +658,7 @@ static void writeSound(int len) PicoIn.sndOut += len / 2; /*if (PicoIn.sndOut > sndBuffer_endptr) { - memcpy32((int *)(void *)sndBuffer, (int *)endptr, (PicoIn.sndOut - endptr + 1) / 2); + memcpy((int *)(void *)sndBuffer, (int *)endptr, (PicoIn.sndOut - endptr + 1) * 2); PicoIn.sndOut = &sndBuffer[PicoIn.sndOut - endptr]; lprintf("mov\n"); } diff --git a/platform/psp/menu.c b/platform/psp/menu.c index ab022f979..fc31b8e79 100644 --- a/platform/psp/menu.c +++ b/platform/psp/menu.c @@ -59,7 +59,7 @@ void menu_draw_begin(void) // int i; // for (i = 272; i >= 0; i--, dst += 512, src += 480) - // memcpy32((int *)dst, (int *)src, 480*2/4); + // memcpy((int *)dst, (int *)src, 480*2); sceGuSync(0,0); // sync with prev sceGuStart(GU_DIRECT, guCmdList); From 30e28fd63cb4de6960258966a33544e1fd797ec1 Mon Sep 17 00:00:00 2001 From: kub Date: Mon, 25 Mar 2019 19:31:32 +0100 Subject: [PATCH 018/174] minor changes --- pico/32x/32x.c | 4 +- pico/m68kif_cyclone.s | 8 +-- pico/pico_int.h | 3 +- pico/sms.c | 2 +- platform/common/helix/lib.c | 67 +------------------ platform/common/memcpy.c | 125 ++++++++++++++++++++++++++++++++++++ platform/common/plat_sdl.c | 9 ++- platform/common/version.h | 2 +- tools/mkoffsets.sh | 9 ++- 9 files changed, 151 insertions(+), 78 deletions(-) create mode 100644 
platform/common/memcpy.c diff --git a/pico/32x/32x.c b/pico/32x/32x.c index 3ee8c2ea1..a15cb112c 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -194,11 +194,11 @@ void PicoPower32x(void) void PicoUnload32x(void) { + sh2_finish(&msh2); + sh2_finish(&ssh2); if (Pico32xMem != NULL) plat_munmap(Pico32xMem, sizeof(*Pico32xMem)); Pico32xMem = NULL; - sh2_finish(&msh2); - sh2_finish(&ssh2); PicoIn.AHW &= ~PAHW_32X; } diff --git a/pico/m68kif_cyclone.s b/pico/m68kif_cyclone.s index a0a508cd4..3a9621dc2 100644 --- a/pico/m68kif_cyclone.s +++ b/pico/m68kif_cyclone.s @@ -87,19 +87,19 @@ cyclone_fetch32: orrcc r0, r1, r0, lsl #16 bxcc lr - stmfd sp!,{r0,r1,lr} + stmfd sp!,{r0,r1,r2,lr} mov lr, pc bx r1 mov r2, r0, lsl #16 - ldmia sp, {r0,r1} + ldmfd sp!, {r0,r1} str r2, [sp] add r0, r0, #2 mov lr, pc bx r1 - ldr r1, [sp] + ldmfd sp!, {r1,lr} mov r0, r0, lsl #16 orr r0, r1, r0, lsr #16 - ldmfd sp!,{r1,r2,pc} + bx lr cyclone_write8: @ u32 a, u8 d diff --git a/pico/pico_int.h b/pico/pico_int.h index f6d8b37f2..4d599ce8e 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -241,7 +241,7 @@ extern SH2 sh2s[2]; # define sh2_pc(sh2) (sh2)->pc #endif -#define sh2_cycles_done(sh2) ((unsigned)(sh2)->cycles_timeslice - sh2_cycles_left(sh2)) +#define sh2_cycles_done(sh2) (unsigned)((int)(sh2)->cycles_timeslice - sh2_cycles_left(sh2)) #define sh2_cycles_done_t(sh2) \ (unsigned)(C_M68K_TO_SH2(sh2, (sh2)->m68krcycles_done) + sh2_cycles_done(sh2)) #define sh2_cycles_done_m68k(sh2) \ @@ -650,6 +650,7 @@ PICO_INTERNAL void PicoFrameStart(void); void PicoDrawSync(int to, int blank_last_line); void BackFill(int reg7, int sh, struct PicoEState *est); void FinalizeLine555(int sh, int line, struct PicoEState *est); +void PicoDrawSetOutBufMD(void *dest, int increment); extern int (*PicoScanBegin)(unsigned int num); extern int (*PicoScanEnd)(unsigned int num); #define MAX_LINE_SPRITES 29 diff --git a/pico/sms.c b/pico/sms.c index 286b8bf1d..2800e2094 100644 --- a/pico/sms.c +++ b/pico/sms.c @@ 
-46,8 +46,8 @@ static void vdp_data_write(unsigned char d) struct PicoVideo *pv = &Pico.video; if (pv->type == 3) { + if (PicoMem.cram[pv->addr & 0x1f] != d) Pico.m.dirtyPal = 1; PicoMem.cram[pv->addr & 0x1f] = d; - Pico.m.dirtyPal = 1; } else { PicoMem.vramb[pv->addr] = d; } diff --git a/platform/common/helix/lib.c b/platform/common/helix/lib.c index d7c511bed..d2b058987 100644 --- a/platform/common/helix/lib.c +++ b/platform/common/helix/lib.c @@ -53,70 +53,5 @@ void *memmove (void *dest, const void *src, size_t n) return dest; } #else -/* memcpy/memmove in C with some simple optimizations. - * ATTN does dirty aliasing tricks with undefined behaviour by standard. - * (this works fine with gcc, though...) - */ -void *memcpy(void *dest, const void *src, size_t n) -{ - struct _16 { uint32_t a[4]; }; - union { const void *v; char *c; uint64_t *l; struct _16 *s; } - ss = { src }, ds = { dest }; - const int lm = sizeof(uint32_t)-1; - - if ((((unsigned)ss.c ^ (unsigned)ds.c) & lm) == 0) { - /* fast copy if pointers have the same aligment */ - while (((unsigned)ss.c & lm) && n > 0) /* align to word */ - *ds.c++ = *ss.c++, n--; - while (n >= sizeof(struct _16)) /* copy 16 bytes blocks */ - *ds.s++ = *ss.s++, n -= sizeof(struct _16); - if (n >= sizeof(uint64_t)) /* copy leftover 8 byte block */ - *ds.l++ = *ss.l++, n -= sizeof(uint64_t); - } else { - /* byte copy if pointers are unaligned */ - while (n >= 8) { /* copy 8 byte blocks */ - *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--; - *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--; - *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--; - *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--; - } - } - /* copy max. 
8 leftover bytes */ - while (n > 0) - *ds.c++ = *ss.c++, n--; - return dest; -} - -void *memmove (void *dest, const void *src, size_t n) -{ - struct _16 { uint32_t a[4]; }; - union { const void *v; char *c; uint64_t *l; struct _16 *s; } - ss = { src+n }, ds = { dest+n }; - const int lm = sizeof(uint32_t)-1; - - if (dest <= src || dest >= src+n) - return memcpy(dest, src, n); - - if ((((unsigned)ss.c ^ (unsigned)ds.c) & lm) == 0) { - /* fast copy if pointers have the same aligment */ - while (((unsigned)ss.c & lm) && n > 0) - *--ds.c = *--ss.c, n--; - while (n >= sizeof(struct _16)) - *--ds.s = *--ss.s, n -= sizeof(struct _16); - if (n >= sizeof(uint64_t)) - *--ds.l = *--ss.l, n -= sizeof(uint64_t); - } else { - /* byte copy if pointers are unaligned */ - while (n >= 8) { - *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; - *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; - *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; - *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; - } - } - /* copy max. 8 leftover bytes */ - while (n > 0) - *--ds.c = *--ss.c, n--; - return dest; -} +#include "../memcpy.c" #endif diff --git a/platform/common/memcpy.c b/platform/common/memcpy.c new file mode 100644 index 000000000..b99de4aec --- /dev/null +++ b/platform/common/memcpy.c @@ -0,0 +1,125 @@ +/* + * (C) 2018 Kai-Uwe Bloem + * + * 32bit ARM/MIPS optimized C implementation of memcpy and memove, designed for + * good performance with gcc. + * - if src and dest have the same alignment, 4-word copy is used. + * - if src and dest are unaligned to each other, still loads word data and + * stores correctly shifted word data (for all but the first and last bytes + * to avoid under/overstepping the src region). + * + * ATTN does dirty aliasing tricks with undefined behaviour by standard. + * (however, this was needed to improve the generated code). + * ATTN uses struct assignment, which only works if the compiler is inlining + * this (else it would probably call memcpy :-)). 
+ */ +#include <stdlib.h> +#include <stdint.h> + +#include <endian.h> +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define _L_ >> +#define _U_ << +#else +#define _L_ << +#define _U_ >> +#endif + +void *memcpy(void *dest, const void *src, size_t n) +{ + struct _16 { uint32_t a[4]; }; + union { const void *v; uint8_t *c; uint32_t *i; uint64_t *l; struct _16 *s; } + ss = { src }, ds = { dest }; + const int lm = sizeof(uint32_t)-1; + + /* align src to word */ + while (((unsigned)ss.c & lm) && n > 0) + *ds.c++ = *ss.c++, n--; + if (((unsigned)ds.c & lm) == 0) { + /* fast copy if pointers have the same aligment */ + while (n >= sizeof(struct _16)) /* copy 16 bytes blocks */ + *ds.s++ = *ss.s++, n -= sizeof(struct _16); + if (n >= sizeof(uint64_t)) /* copy leftover 8 byte block */ + *ds.l++ = *ss.l++, n -= sizeof(uint64_t); + } else if (n >= 2*sizeof(uint32_t)) { + /* unaligned data big enough to avoid overstepping src */ + uint32_t v1, v2, b, s; + /* align dest to word */ + while (((unsigned)ds.c & lm) && n > 0) + *ds.c++ = *ss.c++, n--; + /* copy loop: load aligned words and store shifted words */ + b = (unsigned)ss.c & lm, s = b*8; ss.c -= b; + v1 = *ss.i++, v2 = *ss.i++; + while (n >= 3*sizeof(uint32_t)) { + *ds.i++ = (v1 _L_ s) | (v2 _U_ (32-s)); v1 = *ss.i++; + *ds.i++ = (v2 _L_ s) | (v1 _U_ (32-s)); v2 = *ss.i++; + n -= 2*sizeof(uint32_t); + } + /* data for one more store is already loaded */ + if (n >= sizeof(uint32_t)) { + *ds.i++ = (v1 _L_ s) | (v2 _U_ (32-s)); + n -= sizeof(uint32_t); + ss.c += sizeof(uint32_t); + } + ss.c += b - 2*sizeof(uint32_t); + } + /* copy 0-7 leftover bytes */ + while (n >= 4) { + *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--; + *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--; + } + while (n > 0) + *ds.c++ = *ss.c++, n--; + return dest; +} + +void *memmove (void *dest, const void *src, size_t n) +{ + struct _16 { uint32_t a[4]; }; + union { const void *v; uint8_t *c; uint32_t *i; uint64_t *l; struct _16 *s; } + ss = { src+n }, ds = { dest+n }; + const int lm =
sizeof(uint32_t)-1; + + if (dest <= src || dest >= src+n) + return memcpy(dest, src, n); + + /* align src to word */ + while (((unsigned)ss.c & lm) && n > 0) + *--ds.c = *--ss.c, n--; + if (((unsigned)ds.c & lm) == 0) { + /* fast copy if pointers have the same aligment */ + while (n >= sizeof(struct _16)) /* copy 16 byte blocks */ + *--ds.s = *--ss.s, n -= sizeof(struct _16); + if (n >= sizeof(uint64_t)) /* copy leftover 8 byte block */ + *--ds.l = *--ss.l, n -= sizeof(uint64_t); + } else if (n >= 2*sizeof(uint32_t)) { + /* unaligned data big enough to avoid understepping src */ + uint32_t v1, v2, b, s; + /* align dest to word */ + while (((unsigned)ds.c & lm) && n > 0) + *--ds.c = *--ss.c, n--; + /* copy loop: load aligned words and store shifted words */ + b = (unsigned)ss.c & lm, s = b*8; ss.c += b; + v1 = *--ss.i, v2 = *--ss.i; + while (n >= 3*sizeof(uint32_t)) { + *--ds.i = (v1 _U_ s) | (v2 _L_ (32-s)); v1 = *--ss.i; + *--ds.i = (v2 _U_ s) | (v1 _L_ (32-s)); v2 = *--ss.i; + n -= 2*sizeof(uint32_t); + } + /* data for one more store is already loaded */ + if (n >= sizeof(uint32_t)) { + *--ds.i = (v1 _U_ s) | (v2 _L_ (32-s)); + n -= sizeof(uint32_t); + ss.c -= sizeof(uint32_t); + } + ss.c -= b - 2*sizeof(uint32_t); + } + /* copy 0-7 leftover bytes */ + while (n >= 4) { + *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; + *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; + } + while (n > 0) + *--ds.c = *--ss.c, n--; + return dest; +} diff --git a/platform/common/plat_sdl.c b/platform/common/plat_sdl.c index 4446f72e2..ef99af2a0 100644 --- a/platform/common/plat_sdl.c +++ b/platform/common/plat_sdl.c @@ -89,6 +89,7 @@ static const struct in_pdata in_sdl_platform_data = { /* YUV stuff */ static int yuv_ry[32], yuv_gy[32], yuv_by[32]; static unsigned char yuv_u[32 * 2], yuv_v[32 * 2]; +static int yuv_y[256]; void bgr_to_uyvy_init(void) { @@ -119,6 +120,10 @@ void bgr_to_uyvy_init(void) v = 255; yuv_v[i + 32] = v; } + // valid Y range seems to be 16..235 + for (i = 0; i 
< 256; i++) { + yuv_y[i] = 16 + 219 * i / 32; + } } void rgb565_to_uyvy(void *d, const void *s, int pixels) @@ -143,8 +148,8 @@ void rgb565_to_uyvy(void *d, const void *s, int pixels) u = yu[b0 - y0]; v = yv[r0 - y0]; // valid Y range seems to be 16..235 - y0 = 16 + 219 * y0 / 31; - y1 = 16 + 219 * y1 / 31; + y0 = yuv_y[y0]; + y1 = yuv_y[y1]; *dst = (y1 << 24) | (v << 16) | (y0 << 8) | u; } diff --git a/platform/common/version.h b/platform/common/version.h index f65ba1eda..8b3adbf85 100644 --- a/platform/common/version.h +++ b/platform/common/version.h @@ -1 +1 @@ -#define VERSION "1.93" +#define VERSION "1.93+" diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index 60088f213..90e658677 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ -1,16 +1,21 @@ -# usage: mkoffsets # automatically compute structure offsets for gcc targets in ELF format +# (C) 2018 Kai-Uwe Bloem. This work is placed in the public domain. +# +# usage: mkoffsets CC=${CC:-gcc} # endianess of target (automagically determined below) ENDIAN= +# compile with target C compiler and extract value from .rodata section compile_rodata () { $CC $CFLAGS -I .. -c /tmp/getoffs.c -o /tmp/getoffs.o || exit 1 + # find the name of the .rodata section (in case -fdata-sections is used) rosect=$(readelf -S /tmp/getoffs.o | grep '\.rodata' | sed 's/^[^.]*././;s/ .*//') + # read out .rodata section as hex string (should be only 4 or 8 bytes) objcopy --dump-section $rosect=/tmp/getoffs.ro /tmp/getoffs.o || exit 1 ro=$(xxd -ps /tmp/getoffs.ro) if [ "$ENDIAN" = "le" ]; then @@ -22,9 +27,11 @@ compile_rodata () else hex=$ro fi + # extract decimal value from hex string rodata=$(printf "%d" 0x$hex) } +# determine member offset and create #define get_define () # prefix struct member member... 
{ prefix=$1; shift From 771d8aca0f8cadc9a237a18474473ce2ce6061a9 Mon Sep 17 00:00:00 2001 From: kub Date: Mon, 25 Mar 2019 20:23:04 +0100 Subject: [PATCH 019/174] DRC: reworked scan_block (fix register usage masks, better block and literals detection) --- cpu/drc/cmn.h | 2 +- cpu/drc/emit_x86.c | 2 +- cpu/sh2/compiler.c | 243 +++++++++++++++++++++++++++++---------------- 3 files changed, 162 insertions(+), 85 deletions(-) diff --git a/cpu/drc/cmn.h b/cpu/drc/cmn.h index f5c595f29..5a44bbb7e 100644 --- a/cpu/drc/cmn.h +++ b/cpu/drc/cmn.h @@ -7,7 +7,7 @@ typedef unsigned int u32; typedef signed int s32; #endif -#define DRC_TCACHE_SIZE (2*1024*1024) +#define DRC_TCACHE_SIZE (4*1024*1024) extern u8 *tcache; diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index e5f2adefb..c5f4e865b 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -686,7 +686,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; case 0: rd = xDI; break; \ case 1: rd = xSI; break; \ case 2: rd = xDX; break; \ - case 2: rd = xBX; break; \ + case 3: rd = xBX; break; \ } #define emith_sh2_drc_entry() { \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index aa3e772c6..ece3b13e9 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -43,9 +43,9 @@ #define MAX_BLOCK_SIZE (BLOCK_INSN_LIMIT * 6 * 6) // max literal offset from the block end -#define MAX_LITERAL_OFFSET 32*2 +#define MAX_LITERAL_OFFSET 0x200 // max. 
MOVA, MOV @(PC) offset #define MAX_LITERALS (BLOCK_INSN_LIMIT / 4) -#define MAX_LOCAL_BRANCHES 32 +#define MAX_LOCAL_BRANCHES (BLOCK_INSN_LIMIT / 4) // debug stuff // 01 - warnings/errors @@ -98,8 +98,10 @@ static int insns_compiled, hash_collisions, host_insn_count; #define BITMASK3(v0,v1,v2) (BITMASK2(v0,v1) | (1 << (v2))) #define BITMASK4(v0,v1,v2,v3) (BITMASK3(v0,v1,v2) | (1 << (v3))) #define BITMASK5(v0,v1,v2,v3,v4) (BITMASK4(v0,v1,v2,v3) | (1 << (v4))) +#define BITMASK6(v0,v1,v2,v3,v4,v5) (BITMASK5(v0,v1,v2,v3,v4) | (1 << (v5))) -#define SHR_T SHR_SR // might make them separate someday +#define SHR_T SHR_SR // might make them separate someday +#define SHR_MEM 31 static struct op_data { u8 op; @@ -115,6 +117,7 @@ static struct op_data { enum op_types { OP_UNHANDLED = 0, OP_BRANCH, + OP_BRANCH_N, // conditional known not to be taken OP_BRANCH_CT, // conditional, branch if T set OP_BRANCH_CF, // conditional, branch if T clear OP_BRANCH_R, // indirect @@ -125,6 +128,8 @@ enum op_types { OP_MOVA, OP_SLEEP, OP_RTE, + OP_TRAPA, + OP_UNDEFINED, }; #ifdef DRC_SH2 @@ -1672,6 +1677,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp2 = ops[i-1].op == OP_BRANCH_CT ? 
DCOND_NE : DCOND_EQ; emith_move_r_imm_c(tmp2, tmp, ops[i-1].imm); break; + case OP_BRANCH_N: + emit_move_r_imm32(SHR_PC, pc); + break; // case OP_BRANCH_R OP_BRANCH_RF - PC already loaded } } @@ -1684,6 +1692,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) switch (opd->op) { + case OP_BRANCH_N: + goto end_op; + case OP_BRANCH: case OP_BRANCH_CT: case OP_BRANCH_CF: @@ -1734,6 +1745,32 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) drcf.pending_branch_indirect = 1; goto end_op; + case OP_UNDEFINED: + elprintf_sh2(sh2, EL_ANOMALY, + "drc: illegal op %04x @ %08x", op, pc - 2); + opd->imm = 4; + // fallthrough + case OP_TRAPA: + tmp = rcache_get_reg(SHR_SP, RC_GR_RMW); + emith_sub_r_imm(tmp, 4*2); + // push SR + tmp = rcache_get_reg_arg(0, SHR_SP); + emith_add_r_imm(tmp, 4); + tmp = rcache_get_reg_arg(1, SHR_SR); + emith_clear_msb(tmp, tmp, 22); + emit_memhandler_write(2); + // push PC + rcache_get_reg_arg(0, SHR_SP); + tmp = rcache_get_tmp_arg(1); + emith_move_r_imm(tmp, pc); + emit_memhandler_write(2); + // obtain new PC + emit_memhandler_read_rr(SHR_PC, SHR_VBR, opd->imm * 4, 2); + // indirect jump -> back to dispatcher + rcache_flush(); + emith_jump(sh2_drc_dispatcher); + goto end_op; + case OP_LOAD_POOL: #if PROPAGATE_CONSTANTS if (opd->imm != 0 && opd->imm < end_literals @@ -2610,26 +2647,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp = (op & 0x300) >> 8; emit_memhandler_read_rr(SHR_R0, SHR_GBR, (op & 0xff) << tmp, tmp); goto end_op; - case 0x0300: // TRAPA #imm 11000011iiiiiiii - tmp = rcache_get_reg(SHR_SP, RC_GR_RMW); - emith_sub_r_imm(tmp, 4*2); - // push SR - tmp = rcache_get_reg_arg(0, SHR_SP); - emith_add_r_imm(tmp, 4); - tmp = rcache_get_reg_arg(1, SHR_SR); - emith_clear_msb(tmp, tmp, 22); - emit_memhandler_write(2); - // push PC - rcache_get_reg_arg(0, SHR_SP); - tmp = rcache_get_tmp_arg(1); - emith_move_r_imm(tmp, pc); - emit_memhandler_write(2); - // obtain new PC - 
emit_memhandler_read_rr(SHR_PC, SHR_VBR, (op & 0xff) * 4, 2); - // indirect jump -> back to dispatcher - rcache_flush(); - emith_jump(sh2_drc_dispatcher); - goto end_op; case 0x0800: // TST #imm,R0 11001000iiiiiiii tmp = rcache_get_reg(SHR_R0, RC_GR_READ); sr = rcache_get_reg(SHR_SR, RC_GR_RMW); @@ -3446,13 +3463,15 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, u16 *dr_pc_base; u32 pc, op, tmp; u32 end_pc, end_literals = 0; + u32 lowest_literal = 0; u32 lowest_mova = 0; struct op_data *opd; int next_is_delay = 0; int end_block = 0; int i, i_end; - memset(op_flags, 0, BLOCK_INSN_LIMIT); + memset(op_flags, 0, sizeof(*op_flags) * BLOCK_INSN_LIMIT); + op_flags[0] |= OF_BTARGET; // block start is always a target dr_pc_base = dr_get_pc_base(base_pc, is_slave); @@ -3473,6 +3492,9 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, } else if (end_block || i >= BLOCK_INSN_LIMIT - 2) break; + else if ((lowest_mova && lowest_mova <= pc) || + (lowest_literal && lowest_literal <= pc)) + break; // text area collides with data area op = FETCH_OP(pc); switch ((op & 0xf000) >> 12) @@ -3506,18 +3528,22 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, // BSRF Rm 0000mmmm00000011 opd->op = OP_BRANCH_RF; opd->rm = GET_Rn(); - opd->source = BITMASK1(opd->rm); + opd->source = BITMASK2(SHR_PC, opd->rm); opd->dest = BITMASK1(SHR_PC); if (!(op & 0x20)) opd->dest |= BITMASK1(SHR_PR); opd->cycles = 2; next_is_delay = 1; - end_block = 1; + if (!(opd->dest & BITMASK1(SHR_PR))) + end_block = !(op_flags[i+1+next_is_delay] & OF_BTARGET); + else + op_flags[i+1+next_is_delay] |= OF_BTARGET; break; case 0x04: // MOV.B Rm,@(R0,Rn) 0000nnnnmmmm0100 case 0x05: // MOV.W Rm,@(R0,Rn) 0000nnnnmmmm0101 case 0x06: // MOV.L Rm,@(R0,Rn) 0000nnnnmmmm0110 opd->source = BITMASK3(GET_Rm(), SHR_R0, GET_Rn()); + opd->dest = BITMASK1(SHR_MEM); break; case 0x07: // MUL.L Rm,Rn 0000nnnnmmmm0111 @@ -3594,7 +3620,7 @@ void scan_block(u32 
base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, opd->dest = BITMASK1(SHR_PC); opd->cycles = 2; next_is_delay = 1; - end_block = 1; + end_block = !(op_flags[i+1+next_is_delay] & OF_BTARGET); break; case 1: // SLEEP 0000000000011011 opd->op = OP_SLEEP; @@ -3603,10 +3629,10 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 2: // RTE 0000000000101011 opd->op = OP_RTE; opd->source = BITMASK1(SHR_SP); - opd->dest = BITMASK2(SHR_SR, SHR_PC); + opd->dest = BITMASK3(SHR_SP, SHR_SR, SHR_PC); opd->cycles = 4; next_is_delay = 1; - end_block = 1; + end_block = !(op_flags[i+1+next_is_delay] & OF_BTARGET); break; default: goto undefined; @@ -3615,11 +3641,11 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x0c: // MOV.B @(R0,Rm),Rn 0000nnnnmmmm1100 case 0x0d: // MOV.W @(R0,Rm),Rn 0000nnnnmmmm1101 case 0x0e: // MOV.L @(R0,Rm),Rn 0000nnnnmmmm1110 - opd->source = BITMASK2(GET_Rm(), SHR_R0); + opd->source = BITMASK3(GET_Rm(), SHR_R0, SHR_MEM); opd->dest = BITMASK1(GET_Rn()); break; case 0x0f: // MAC.L @Rm+,@Rn+ 0000nnnnmmmm1111 - opd->source = BITMASK5(GET_Rm(), GET_Rn(), SHR_SR, SHR_MACL, SHR_MACH); + opd->source = BITMASK6(GET_Rm(), GET_Rn(), SHR_SR, SHR_MACL, SHR_MACH, SHR_MEM); opd->dest = BITMASK4(GET_Rm(), GET_Rn(), SHR_MACL, SHR_MACH); opd->cycles = 3; break; @@ -3631,8 +3657,8 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, ///////////////////////////////////////////// case 0x01: // MOV.L Rm,@(disp,Rn) 0001nnnnmmmmdddd - opd->source = BITMASK1(GET_Rm()); - opd->source = BITMASK1(GET_Rn()); + opd->source = BITMASK2(GET_Rm(), GET_Rn()); + opd->dest = BITMASK1(SHR_MEM); opd->imm = (op & 0x0f) * 4; break; @@ -3643,14 +3669,14 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x00: // MOV.B Rm,@Rn 0010nnnnmmmm0000 case 0x01: // MOV.W Rm,@Rn 0010nnnnmmmm0001 case 0x02: // MOV.L Rm,@Rn 0010nnnnmmmm0010 - opd->source = BITMASK1(GET_Rm()); - opd->source 
= BITMASK1(GET_Rn()); + opd->source = BITMASK2(GET_Rm(), GET_Rn()); + opd->dest = BITMASK1(SHR_MEM); break; case 0x04: // MOV.B Rm,@-Rn 0010nnnnmmmm0100 case 0x05: // MOV.W Rm,@-Rn 0010nnnnmmmm0101 case 0x06: // MOV.L Rm,@-Rn 0010nnnnmmmm0110 opd->source = BITMASK2(GET_Rm(), GET_Rn()); - opd->dest = BITMASK1(GET_Rn()); + opd->dest = BITMASK2(GET_Rn(), SHR_MEM); break; case 0x07: // DIV0S Rm,Rn 0010nnnnmmmm0111 opd->source = BITMASK2(GET_Rm(), GET_Rn()); @@ -3791,7 +3817,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, goto undefined; } opd->source = BITMASK2(GET_Rn(), tmp); - opd->dest = BITMASK1(GET_Rn()); + opd->dest = BITMASK2(GET_Rn(), SHR_MEM); break; case 0x04: case 0x05: @@ -3843,7 +3869,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, default: goto undefined; } - opd->source = BITMASK1(GET_Rn()); + opd->source = BITMASK2(GET_Rn(), SHR_MEM); opd->dest = BITMASK2(GET_Rn(), tmp); break; case 0x08: @@ -3899,11 +3925,14 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, opd->dest |= BITMASK1(SHR_PC); opd->cycles = 2; next_is_delay = 1; - end_block = 1; + if (!(opd->dest & BITMASK1(SHR_PR))) + end_block = !(op_flags[i+1+next_is_delay] & OF_BTARGET); + else + op_flags[i+1+next_is_delay] |= OF_BTARGET; break; case 1: // TAS.B @Rn 0100nnnn00011011 - opd->source = BITMASK1(GET_Rn()); - opd->dest = BITMASK1(SHR_T); + opd->source = BITMASK2(GET_Rn(), SHR_MEM); + opd->dest = BITMASK2(SHR_T, SHR_MEM); opd->cycles = 4; break; default: @@ -3931,7 +3960,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, break; case 0x0f: // MAC.W @Rm+,@Rn+ 0100nnnnmmmm1111 - opd->source = BITMASK5(GET_Rm(), GET_Rn(), SHR_SR, SHR_MACL, SHR_MACH); + opd->source = BITMASK6(GET_Rm(), GET_Rn(), SHR_SR, SHR_MACL, SHR_MACH, SHR_MEM); opd->dest = BITMASK4(GET_Rm(), GET_Rn(), SHR_MACL, SHR_MACH); opd->cycles = 3; break; @@ -3943,7 +3972,7 @@ void scan_block(u32 base_pc, int is_slave, u8 
*op_flags, u32 *end_pc_out, ///////////////////////////////////////////// case 0x05: // MOV.L @(disp,Rm),Rn 0101nnnnmmmmdddd - opd->source = BITMASK1(GET_Rm()); + opd->source = BITMASK2(GET_Rm(), SHR_MEM); opd->dest = BITMASK1(GET_Rn()); opd->imm = (op & 0x0f) * 4; break; @@ -3955,12 +3984,14 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x04: // MOV.B @Rm+,Rn 0110nnnnmmmm0100 case 0x05: // MOV.W @Rm+,Rn 0110nnnnmmmm0101 case 0x06: // MOV.L @Rm+,Rn 0110nnnnmmmm0110 - opd->dest = BITMASK1(GET_Rm()); + opd->dest = BITMASK2(GET_Rm(), GET_Rn()); + opd->source = BITMASK2(GET_Rm(), SHR_MEM); + break; case 0x00: // MOV.B @Rm,Rn 0110nnnnmmmm0000 case 0x01: // MOV.W @Rm,Rn 0110nnnnmmmm0001 case 0x02: // MOV.L @Rm,Rn 0110nnnnmmmm0010 - opd->source = BITMASK1(GET_Rm()); - opd->dest |= BITMASK1(GET_Rn()); + opd->dest = BITMASK1(GET_Rn()); + opd->source = BITMASK2(GET_Rm(), SHR_MEM); break; case 0x0a: // NEGC Rm,Rn 0110nnnnmmmm1010 opd->source = BITMASK2(GET_Rm(), SHR_T); @@ -3997,19 +4028,21 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, { case 0x0000: // MOV.B R0,@(disp,Rn) 10000000nnnndddd opd->source = BITMASK2(GET_Rm(), SHR_R0); + opd->dest = BITMASK1(SHR_MEM); opd->imm = (op & 0x0f); break; case 0x0100: // MOV.W R0,@(disp,Rn) 10000001nnnndddd opd->source = BITMASK2(GET_Rm(), SHR_R0); + opd->dest = BITMASK1(SHR_MEM); opd->imm = (op & 0x0f) * 2; break; case 0x0400: // MOV.B @(disp,Rm),R0 10000100mmmmdddd - opd->source = BITMASK1(GET_Rm()); + opd->source = BITMASK2(GET_Rm(), SHR_MEM); opd->dest = BITMASK1(SHR_R0); opd->imm = (op & 0x0f); break; case 0x0500: // MOV.W @(disp,Rm),R0 10000101mmmmdddd - opd->source = BITMASK1(GET_Rm()); + opd->source = BITMASK2(GET_Rm(), SHR_MEM); opd->dest = BITMASK1(SHR_R0); opd->imm = (op & 0x0f) * 2; break; @@ -4025,7 +4058,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x0900: // BT label 10001001dddddddd case 0x0b00: // BF label 
10001011dddddddd opd->op = (op & 0x0200) ? OP_BRANCH_CF : OP_BRANCH_CT; - opd->source = BITMASK1(SHR_T); + opd->source = BITMASK2(SHR_PC, SHR_T); opd->dest = BITMASK1(SHR_PC); opd->imm = ((signed int)(op << 24) >> 23); opd->imm += pc + 4; @@ -4045,13 +4078,16 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, if (op_flags[i] & OF_DELAY_OP) { if (ops[i-1].op == OP_BRANCH) tmp = ops[i-1].imm; - else + else if (ops[i-1].op != OP_BRANCH_N) tmp = 0; } - opd->source = BITMASK1(SHR_PC); + opd->source = BITMASK2(SHR_PC, SHR_MEM); opd->dest = BITMASK1(GET_Rn()); - if (tmp) + if (tmp) { opd->imm = tmp + 2 + (op & 0xff) * 2; + if (lowest_literal == 0 || opd->imm < lowest_literal) + lowest_literal = opd->imm; + } opd->size = 1; break; @@ -4062,14 +4098,21 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x0a: // BRA label 1010dddddddddddd opd->op = OP_BRANCH; + opd->source = BITMASK1(SHR_PC); opd->dest |= BITMASK1(SHR_PC); opd->imm = ((signed int)(op << 20) >> 19); opd->imm += pc + 4; opd->cycles = 2; next_is_delay = 1; - end_block = 1; - if (base_pc <= opd->imm && opd->imm < base_pc + BLOCK_INSN_LIMIT * 2) - op_flags[(opd->imm - base_pc) / 2] |= OF_BTARGET; + if (!(opd->dest & BITMASK1(SHR_PR))) { + if (base_pc <= opd->imm && opd->imm < base_pc + BLOCK_INSN_LIMIT * 2) { + op_flags[(opd->imm - base_pc) / 2] |= OF_BTARGET; + if (opd->imm <= pc) + end_block = !(op_flags[i+1+next_is_delay] & OF_BTARGET); + } else + end_block = !(op_flags[i+1+next_is_delay] & OF_BTARGET); + } else + op_flags[i+1+next_is_delay] |= OF_BTARGET; break; ///////////////////////////////////////////// @@ -4080,23 +4123,25 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x0100: // MOV.W R0,@(disp,GBR) 11000001dddddddd case 0x0200: // MOV.L R0,@(disp,GBR) 11000010dddddddd opd->source = BITMASK2(SHR_GBR, SHR_R0); + opd->dest = BITMASK1(SHR_MEM); opd->size = (op & 0x300) >> 8; opd->imm = (op & 0xff) << opd->size; 
break; case 0x0400: // MOV.B @(disp,GBR),R0 11000100dddddddd case 0x0500: // MOV.W @(disp,GBR),R0 11000101dddddddd case 0x0600: // MOV.L @(disp,GBR),R0 11000110dddddddd - opd->source = BITMASK1(SHR_GBR); + opd->source = BITMASK2(SHR_GBR, SHR_MEM); opd->dest = BITMASK1(SHR_R0); opd->size = (op & 0x300) >> 8; opd->imm = (op & 0xff) << opd->size; break; case 0x0300: // TRAPA #imm 11000011iiiiiiii - opd->source = BITMASK2(SHR_PC, SHR_SR); - opd->dest = BITMASK1(SHR_PC); - opd->imm = (op & 0xff) * 4; + opd->op = OP_TRAPA; + opd->source = BITMASK3(SHR_SP, SHR_PC, SHR_SR); + opd->dest = BITMASK2(SHR_SP, SHR_PC); + opd->imm = (op & 0xff); opd->cycles = 8; - end_block = 1; // FIXME + op_flags[i+1] |= OF_BTARGET; break; case 0x0700: // MOVA @(disp,PC),R0 11000111dddddddd opd->op = OP_MOVA; @@ -4104,7 +4149,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, if (op_flags[i] & OF_DELAY_OP) { if (ops[i-1].op == OP_BRANCH) tmp = ops[i-1].imm; - else + else if (ops[i-1].op != OP_BRANCH_N) tmp = 0; } opd->dest = BITMASK1(SHR_R0); @@ -4134,7 +4179,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, opd->imm = op & 0xff; break; case 0x0c00: // TST.B #imm,@(R0,GBR) 11001100iiiiiiii - opd->source = BITMASK2(SHR_GBR, SHR_R0); + opd->source = BITMASK3(SHR_GBR, SHR_R0, SHR_MEM); opd->dest = BITMASK1(SHR_T); opd->imm = op & 0xff; opd->cycles = 3; @@ -4142,7 +4187,8 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x0d00: // AND.B #imm,@(R0,GBR) 11001101iiiiiiii case 0x0e00: // XOR.B #imm,@(R0,GBR) 11001110iiiiiiii case 0x0f00: // OR.B #imm,@(R0,GBR) 11001111iiiiiiii - opd->source = BITMASK2(SHR_GBR, SHR_R0); + opd->source = BITMASK3(SHR_GBR, SHR_R0, SHR_MEM); + opd->dest = BITMASK1(SHR_MEM); opd->imm = op & 0xff; opd->cycles = 3; break; @@ -4159,13 +4205,16 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, if (op_flags[i] & OF_DELAY_OP) { if (ops[i-1].op == OP_BRANCH) tmp = 
ops[i-1].imm; - else + else if (ops[i-1].op != OP_BRANCH_N) tmp = 0; } - opd->source = BITMASK1(SHR_PC); + opd->source = BITMASK2(SHR_PC, SHR_MEM); opd->dest = BITMASK1(GET_Rn()); - if (tmp) + if (tmp) { opd->imm = (tmp + 2 + (op & 0xff) * 4) & ~3; + if (lowest_literal == 0 || opd->imm < lowest_literal) + lowest_literal = opd->imm; + } opd->size = 2; break; @@ -4180,6 +4229,10 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, undefined: elprintf(EL_ANOMALY, "%csh2 drc: unhandled op %04x @ %08x", is_slave ? 's' : 'm', op, pc); + opd->op = OP_UNDEFINED; + // an unhandled instruction is probably not code if it's not the 1st insn + if (!(op_flags[i] & OF_DELAY_OP) && pc != base_pc) + goto end; break; } @@ -4199,10 +4252,12 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, } } } +end: i_end = i; end_pc = pc; // 2nd pass: some analysis + lowest_literal = end_literals = lowest_mova = 0; for (i = 0; i < i_end; i++) { opd = &ops[i]; @@ -4217,22 +4272,39 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, else op_flags[i + 1] |= op_flags[i] & (OF_T_SET | OF_T_CLEAR); - if ((opd->op == OP_BRANCH_CT && (op_flags[i] & OF_T_SET)) - || (opd->op == OP_BRANCH_CF && (op_flags[i] & OF_T_CLEAR))) - { + if ((opd->op == OP_BRANCH_CT && (op_flags[i] & OF_T_CLEAR)) || + (opd->op == OP_BRANCH_CF && (op_flags[i] & OF_T_SET))) + opd->op = OP_BRANCH_N; + else if ((opd->op == OP_BRANCH_CT && (op_flags[i] & OF_T_SET)) || + (opd->op == OP_BRANCH_CF && (op_flags[i] & OF_T_CLEAR))) { opd->op = OP_BRANCH; - opd->cycles = 3; - i_end = i + 1; - if (op_flags[i + 1] & OF_DELAY_OP) { + if (op_flags[i + 1] & OF_DELAY_OP) opd->cycles = 2; - i_end++; + else + opd->cycles = 3; + } + // "overscan" detection: unreachable code after unconditional branch + // this can happen if the insn after a forward branch isn't a local target + if (opd->op == OP_BRANCH || opd->op == OP_BRANCH_R || opd->op == OP_BRANCH_RF) { + if (op_flags[i + 1] 
& OF_DELAY_OP) { + if (i_end > i + 2 && !(op_flags[i + 2] & OF_BTARGET)) + i_end = i + 2; + } else { + if (i_end > i + 1 && !(op_flags[i + 1] & OF_BTARGET)) + i_end = i + 1; } } - else if (opd->op == OP_LOAD_POOL) - { - if (opd->imm < end_pc + MAX_LITERAL_OFFSET) { + + // literal pool size detection + if (opd->op == OP_MOVA && opd->imm >= base_pc) + if (lowest_mova == 0 || opd->imm < lowest_mova) + lowest_mova = opd->imm; + if (opd->op == OP_LOAD_POOL) { + if (opd->imm >= base_pc && opd->imm < end_pc + MAX_LITERAL_OFFSET) { if (end_literals < opd->imm + opd->size * 2) end_literals = opd->imm + opd->size * 2; + if (lowest_literal == 0 || lowest_literal > opd->imm) + lowest_literal = opd->imm; if (opd->size == 2) { // tweak for NFL: treat a 32bit literal as an address and check if it // points to the literal space. In that case handle it like MOVA. @@ -4245,26 +4317,31 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, } } end_pc = base_pc + i_end * 2; - if (end_literals < end_pc) - end_literals = end_pc; // end_literals is used to decide to inline a literal or not // XXX: need better detection if this actually is used in write + if (lowest_literal >= base_pc) { + if (lowest_literal < end_pc) { + dbg(1, "warning: lowest_literal=%08x < end_pc=%08x", lowest_literal, end_pc); + // TODO: does this always mean end_pc covers data? 
+ } + } if (lowest_mova >= base_pc) { if (lowest_mova < end_literals) { - dbg(1, "mova for %08x, block %08x", lowest_mova, base_pc); - end_literals = end_pc; + dbg(1, "warning: mova=%08x < end_literals=%08x", lowest_mova, end_literals); + end_literals = lowest_mova; } if (lowest_mova < end_pc) { - dbg(1, "warning: mova inside of blk for %08x, block %08x", - lowest_mova, base_pc); + dbg(1, "warning: mova=%08x < end_pc=%08x", lowest_mova, end_pc); end_literals = end_pc; } } + if (lowest_literal >= end_literals) + lowest_literal = end_literals; *end_pc_out = end_pc; if (end_literals_out != NULL) - *end_literals_out = end_literals; + *end_literals_out = (end_literals ?: end_pc); } // vim:shiftwidth=2:ts=2:expandtab From 4eb73cb54bd6e3349194859d30992c9f23aef9e5 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 26 Mar 2019 22:01:27 +0100 Subject: [PATCH 020/174] sh2 drc: sh2 addr modes generalization, more const propagation, code gen optimizations --- cpu/drc/emit_arm.c | 69 ++++++ cpu/drc/emit_x86.c | 86 +++++++- cpu/sh2/compiler.c | 507 +++++++++++++++++++++------------------------ 3 files changed, 378 insertions(+), 284 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 89582e8da..86d8a41d0 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -193,6 +193,11 @@ #define EOP_STRH_SIMPLE(rd,rn) EOP_C_AM3_IMM(A_COND_AL,1,0,rn,rd,0,1,0) #define EOP_STRH_REG( rd,rn,rm) EOP_C_AM3_REG(A_COND_AL,1,0,rn,rd,0,1,rm) +#define EOP_LDRSB_IMM2(cond,rd,rn,offset_8) EOP_C_AM3_IMM(cond,(offset_8) >= 0,1,rn,rd,1,0,abs(offset_8)) +#define EOP_LDRSB_REG2(cond,rd,rn,rm) EOP_C_AM3_REG(cond,1,1,rn,rd,1,0,rm) +#define EOP_LDRSH_IMM2(cond,rd,rn,offset_8) EOP_C_AM3_IMM(cond,(offset_8) >= 0,1,rn,rd,1,1,abs(offset_8)) +#define EOP_LDRSH_REG2(cond,rd,rn,rm) EOP_C_AM3_REG(cond,1,1,rn,rd,1,1,rm) + /* ldm and stm */ #define EOP_XXM(cond,p,u,s,w,l,rn,list) \ EMIT(((cond)<<28) | (1<<27) | ((p)<<24) | ((u)<<23) | ((s)<<22) | ((w)<<21) | ((l)<<20) | ((rn)<<16) | (list)) @@ -382,6 
+387,9 @@ static int emith_xbranch(int cond, void *target, int is_call) #define EMITH_SJMP_END_(cond) EMITH_NOTHING1(cond) #define EMITH_SJMP_START(cond) EMITH_NOTHING1(cond) #define EMITH_SJMP_END(cond) EMITH_NOTHING1(cond) +#define EMITH_SJMP2_START(cond) EMITH_NOTHING1(cond) +#define EMITH_SJMP2_MID(cond) EMITH_JMP_START((cond)^1) // inverse cond +#define EMITH_SJMP2_END(cond) EMITH_JMP_END((cond)^1) #define EMITH_SJMP3_START(cond) EMITH_NOTHING1(cond) #define EMITH_SJMP3_MID(cond) EMITH_NOTHING1(cond) #define EMITH_SJMP3_END() @@ -398,6 +406,9 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_add_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_ADD_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) +#define emith_addf_r_r_r_lsr(d, s1, s2, lslimm) \ + EOP_ADD_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSR,lslimm) + #define emith_or_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_ORR_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) @@ -476,6 +487,9 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_adc_r_imm(r, imm) \ emith_op_imm(A_COND_AL, 0, A_OP_ADC, r, imm) +#define emith_adcf_r_imm(r, imm) \ + emith_op_imm(A_COND_AL, 1, A_OP_ADC, r, (imm)) + #define emith_sub_r_imm(r, imm) \ emith_op_imm(A_COND_AL, 0, A_OP_SUB, r, imm) @@ -606,6 +620,8 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_mul_s64(dlo, dhi, s1, s2) \ EOP_C_SMULL(A_COND_AL,0,dhi,dlo,s1,s2) +#define emith_mula_s64_c(cond, dlo, dhi, s1, s2) \ + EOP_C_SMLAL(cond,0,dhi,dlo,s1,s2) #define emith_mula_s64(dlo, dhi, s1, s2) \ EOP_C_SMLAL(A_COND_AL,0,dhi,dlo,s1,s2) @@ -622,9 +638,13 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_read_r_r_offs(r, rs, offs) \ emith_read_r_r_offs_c(A_COND_AL, r, rs, offs) +#define emith_read8s_r_r_offs(r, rs, offs) \ + EOP_LDRSB_IMM2(A_COND_AL, r, rs, offs) #define emith_read8_r_r_offs(r, rs, offs) \ emith_read8_r_r_offs_c(A_COND_AL, r, rs, offs) +#define emith_read16s_r_r_offs(r, rs, offs) \ + 
EOP_LDRSH_IMM2(A_COND_AL, r, rs, offs) #define emith_read16_r_r_offs(r, rs, offs) \ emith_read16_r_r_offs_c(A_COND_AL, r, rs, offs) @@ -851,3 +871,52 @@ static int emith_xbranch(int cond, void *target, int is_call) JMP_EMIT(A_COND_AL, jmp1); /* done: */ \ } +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macl(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP2_START(DCOND_NE); \ + emith_mula_s64_c(DCOND_EQ, ml, mh, rn, rm); \ + EMITH_SJMP2_MID(DCOND_NE); \ + /* MACH top 16 bits unused if saturated. sign ext for overfl detect */ \ + emith_sext(mh, mh, 16); \ + emith_mula_s64(ml, mh, rn, rm); \ + /* overflow if top 17 bits of MACH aren't all 1 or 0 */ \ + /* to check: add MACH[15] to MACH[31:16]. this is 0 if no overflow */ \ + emith_asrf(rn, mh, 16); /* sum = (MACH>>16) + ((MACH>>15)&1) */ \ + emith_adcf_r_imm(rn, 0); /* (MACH>>15) is in carry after shift */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> ov */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x0000); /* -overflow */ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x8000); \ + EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> +ovl */ \ + emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0xffffffff */ \ + emith_sub_r_imm_c(DCOND_GT, mh, 1); /* 0x00007fff */ \ + EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP2_END(DCOND_NE); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macw(ml, mh, rn, rm, sr) do { \ + emith_sext(rn, rn, 16); \ + emith_sext(rm, rm, 16); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP2_START(DCOND_NE); \ + emith_mula_s64_c(DCOND_EQ, ml, mh, rn, rm); \ + EMITH_SJMP2_MID(DCOND_NE); \ + /* XXX: MACH should be untouched when S is set? */ \ + emith_asr(mh, ml, 31); /* sign ext MACL to MACH for ovrfl check */ \ + emith_mula_s64(ml, mh, rn, rm); \ + /* overflow if top 33 bits of MACH:MACL aren't all 1 or 0 */ \ + /* to check: add MACL[31] to MACH. 
this is 0 if no overflow */ \ + emith_addf_r_r_r_lsr(mh, mh, ml, 31); /* sum = MACH + ((MACL>>31)&1) */\ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> overflow */ \ + /* XXX: LSB signalling only in SH1, or in SH2 too? */ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x00000001); /* LSB of MACH */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x80000000); /* negative ovrfl */ \ + EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> positive ovrfl */ \ + emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0x7fffffff */ \ + EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP2_END(DCOND_NE); \ +} while (0) + diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index c5f4e865b..01702e0c2 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -241,14 +241,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_and_r_imm(r, imm) \ emith_arith_r_imm(4, r, imm) -/* used for sub cycles after test, so retain flags with lea */ -#define emith_sub_r_imm(r, imm) do { \ - assert(r != xSP); \ - EMIT_OP_MODRM(0x8d, 2, r, r); \ - EMIT(-(s32)(imm), s32); \ -} while (0) - -#define emith_subf_r_imm(r, imm) \ +#define emith_sub_r_imm(r, imm) \ emith_arith_r_imm(5, r, imm) #define emith_eor_r_imm(r, imm) \ @@ -454,6 +447,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; } while (0) // "flag" instructions are the same +#define emith_adcf_r_imm emith_adc_r_imm +#define emith_subf_r_imm emith_sub_r_imm #define emith_addf_r_r emith_add_r_r #define emith_subf_r_r emith_sub_r_r #define emith_adcf_r_r emith_adc_r_r @@ -501,6 +496,18 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; } \ } while (0) +#define emith_read8s_r_r_offs(r, rs, offs) do { \ + int r_ = r; \ + if (!is_abcdx(r)) \ + r_ = rcache_get_tmp(); \ + EMIT(0x0f, u8); \ + emith_deref_op(0xbe, r_, rs, offs); \ + if ((r) != r_) { \ + emith_move_r_r(r, r_); \ + rcache_free_tmp(r_); \ + } \ +} while (0) + #define emith_write8_r_r_offs(r, rs, offs) do {\ int r_ = r; \ if (!is_abcdx(r)) { \ @@ -517,6 +524,11 @@ enum { xAX = 0, 
xCX, xDX, xBX, xSP, xBP, xSI, xDI }; emith_deref_op(0xb7, r, rs, offs); \ } while (0) +#define emith_read16s_r_r_offs(r, rs, offs) do { \ + EMIT(0x0f, u8); \ + emith_deref_op(0xbf, r, rs, offs); \ +} while (0) + #define emith_write16_r_r_offs(r, rs, offs) do { \ EMIT(0x66, u8); \ emith_write_r_r_offs(r, rs, offs); \ @@ -653,6 +665,13 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define EMITH_SJMP3_MID EMITH_JMP3_MID #define EMITH_SJMP3_END EMITH_JMP3_END +#define EMITH_SJMP2_START(cond) \ + EMITH_SJMP3_START(cond) +#define EMITH_SJMP2_MID(cond) \ + EMITH_SJMP3_MID(cond) +#define EMITH_SJMP2_END(cond) \ + EMITH_SJMP3_END() + #define emith_pass_arg_r(arg, reg) do { \ int rd = 7; \ host_arg2reg(rd, arg); \ @@ -854,3 +873,54 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; rcache_free_tmp(tmp_); \ } +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macl(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* MACH top 16 bits unused if saturated. sign ext for overfl detect */ \ + emith_sext(mh, mh, 16); \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 17 bits of MACH aren't all 1 or 0 */ \ + /* to check: add MACH[15] to MACH[31:16]. 
this is 0 if no overflow */ \ + emith_asrf(rn, mh, 16); /* sum = (MACH>>16) + ((MACH>>15)&1) */ \ + emith_adcf_r_imm(rn, 0); /* (MACH>>15) is in carry after shift */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> ov */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x0000); /* -overflow */ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x8000); \ + EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> +ovl */ \ + emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0xffffffff */ \ + emith_sub_r_imm_c(DCOND_GT, mh, 1); /* 0x00007fff */ \ + EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macw(ml, mh, rn, rm, sr) do { \ + emith_sext(rn, rn, 16); \ + emith_sext(rm, rm, 16); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* XXX: MACH should be untouched when S is set? */ \ + emith_asr(mh, ml, 31); /* sign ext MACL to MACH for ovrfl check */ \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 33 bits of MACH:MACL aren't all 1 or 0 */ \ + /* to check: add MACL[31] to MACH. this is 0 if no overflow */ \ + emith_lsr(rn, ml, 31); \ + emith_addf_r_r(rn, mh); /* sum = MACH + ((MACL>>31)&1) */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> overflow */ \ + /* XXX: LSB signalling only in SH1, or in SH2 too? 
*/ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x00000001); /* LSB of MACH */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x80000000); /* negative ovrfl */ \ + EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> positive ovrfl */ \ + emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0x7fffffff */ \ + EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index ece3b13e9..af6ca9cd6 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -102,6 +102,7 @@ static int insns_compiled, hash_collisions, host_insn_count; #define SHR_T SHR_SR // might make them separate someday #define SHR_MEM 31 +#define SHR_TMP -1 static struct op_data { u8 op; @@ -391,6 +392,12 @@ static void REGPARM(2) (*sh2_drc_write16)(u32 a, u32 d); static void REGPARM(3) (*sh2_drc_write32)(u32 a, u32 d, SH2 *sh2); // address space stuff +static int dr_is_rom(u32 a) +{ + // tweak for WWF Raw which writes data to some high ROM addresses + return (a & 0xc6000000) == 0x02000000 && (a & 0x3f0000) < 0x3e0000; +} + static int dr_ctx_get_mem_ptr(u32 a, u32 *mask) { int poffs = -1; @@ -1162,6 +1169,26 @@ static int emit_get_rbase_and_offs(u32 a, u32 *offs) return hr; } +// read const data from const ROM address +static int emit_get_rom_data(sh2_reg_e r, u32 offs, int size, u32 *val) +{ + u32 tmp; + + *val = 0; + if (gconst_get(r, &tmp)) { + tmp += offs; + if (dr_is_rom(tmp)) { + switch (size) { + case 0: *val = (s8)p32x_sh2_read8(tmp, sh2s); break; // 8 + case 1: *val = (s16)p32x_sh2_read16(tmp, sh2s); break; // 16 + case 2: *val = p32x_sh2_read32(tmp, sh2s); break; // 32 + } + return 1; + } + } + return 0; +} + static void emit_move_r_imm32(sh2_reg_e dst, u32 imm) { #if PROPAGATE_CONSTANTS @@ -1174,10 +1201,19 @@ static void emit_move_r_imm32(sh2_reg_e dst, u32 imm) static void emit_move_r_r(sh2_reg_e dst, sh2_reg_e src) { - int hr_d = rcache_get_reg(dst, RC_GR_WRITE); - int hr_s = rcache_get_reg(src, RC_GR_READ); + int hr_d, hr_s; + u32 
val; - emith_move_r_r(hr_d, hr_s); +#if PROPAGATE_CONSTANTS + if (gconst_get(src, &val)) + gconst_new(dst, val); + else +#endif + { + hr_s = rcache_get_reg(src, RC_GR_READ); + hr_d = rcache_get_reg(dst, RC_GR_WRITE); + emith_move_r_r(hr_d, hr_s); + } } // T must be clear, and comparison done just before this @@ -1188,15 +1224,10 @@ static void emit_or_t_if_eq(int srr) EMITH_SJMP_END(DCOND_NE); } -// arguments must be ready -// reg cache must be clean before call -static int emit_memhandler_read_(int size, int ram_check) +// rd = @(arg0) +static int emit_memhandler_read(int size) { int arg1; -#if 0 - int arg0; - host_arg2reg(arg0, 0); -#endif rcache_clean(); @@ -1207,53 +1238,10 @@ static int emit_memhandler_read_(int size, int ram_check) arg1 = rcache_get_tmp_arg(1); emith_move_r_r_ptr(arg1, CONTEXT_REG); - -#if 0 // can't do this because of unmapped reads - // ndef PDB_NET - if (ram_check && Pico.rom == (void *)0x02000000 && Pico32xMem->sdram == (void *)0x06000000) { - int tmp = rcache_get_tmp(); - emith_and_r_r_imm(tmp, arg0, 0xfb000000); - emith_cmp_r_imm(tmp, 0x02000000); - switch (size) { - case 0: // 8 - EMITH_SJMP3_START(DCOND_NE); - emith_eor_r_imm_c(DCOND_EQ, arg0, 1); - emith_read8_r_r_offs_c(DCOND_EQ, arg0, arg0, 0); - EMITH_SJMP3_MID(DCOND_NE); - emith_call_cond(DCOND_NE, sh2_drc_read8); - EMITH_SJMP3_END(); - break; - case 1: // 16 - EMITH_SJMP3_START(DCOND_NE); - emith_read16_r_r_offs_c(DCOND_EQ, arg0, arg0, 0); - EMITH_SJMP3_MID(DCOND_NE); - emith_call_cond(DCOND_NE, sh2_drc_read16); - EMITH_SJMP3_END(); - break; - case 2: // 32 - EMITH_SJMP3_START(DCOND_NE); - emith_read_r_r_offs_c(DCOND_EQ, arg0, arg0, 0); - emith_ror_c(DCOND_EQ, arg0, arg0, 16); - EMITH_SJMP3_MID(DCOND_NE); - emith_call_cond(DCOND_NE, sh2_drc_read32); - EMITH_SJMP3_END(); - break; - } - } - else -#endif - { - switch (size) { - case 0: // 8 - emith_call(sh2_drc_read8); - break; - case 1: // 16 - emith_call(sh2_drc_read16); - break; - case 2: // 32 - emith_call(sh2_drc_read32); - 
break; - } + switch (size) { + case 0: emith_call(sh2_drc_read8); break; // 8 + case 1: emith_call(sh2_drc_read16); break; // 16 + case 2: emith_call(sh2_drc_read32); break; // 32 } rcache_invalidate(); @@ -1263,28 +1251,56 @@ static int emit_memhandler_read_(int size, int ram_check) return rcache_get_tmp_ret(); } -static int emit_memhandler_read(int size) +// @(arg0) = arg1 +static void emit_memhandler_write(int size) { - return emit_memhandler_read_(size, 1); + int arg2; + + if (reg_map_g2h[SHR_SR] != -1) + emith_ctx_write(reg_map_g2h[SHR_SR], SHR_SR * 4); + + rcache_clean(); + + arg2 = rcache_get_tmp_arg(2); + emith_move_r_r_ptr(arg2, CONTEXT_REG); + switch (size) { + case 0: emith_call(sh2_drc_write8); break; // 8 + case 1: emith_call(sh2_drc_write16); break; // 16 + case 2: emith_call(sh2_drc_write32); break; // 32 + } + + rcache_invalidate(); + if (reg_map_g2h[SHR_SR] != -1) + emith_ctx_read(reg_map_g2h[SHR_SR], SHR_SR * 4); } +// rd = @(Rs,#offs) static int emit_memhandler_read_rr(sh2_reg_e rd, sh2_reg_e rs, u32 offs, int size) { - int hr, hr2, ram_check = 1; + int hr, hr2; u32 val, offs2; + if (emit_get_rom_data(rs, offs, size, &val)) { + if (rd == SHR_TMP) { + hr2 = rcache_get_tmp(); + emith_move_r_imm(hr2, val); + } else { + gconst_new(rd, val); + hr2 = rcache_get_reg(rd, RC_GR_RMW); + } + return hr2; + } + if (gconst_get(rs, &val)) { hr = emit_get_rbase_and_offs(val + offs, &offs2); if (hr != -1) { hr2 = rcache_get_reg(rd, RC_GR_WRITE); switch (size) { case 0: // 8 - emith_read8_r_r_offs(hr2, hr, offs2 ^ 1); - emith_sext(hr2, hr2, 8); + emith_read8s_r_r_offs(hr2, hr, offs2 ^ 1); break; case 1: // 16 - emith_read16_r_r_offs(hr2, hr, offs2); - emith_sext(hr2, hr2, 16); + emith_read16s_r_r_offs(hr2, hr, offs2); break; case 2: // 32 emith_read_r_r_offs(hr2, hr, offs2); @@ -1294,14 +1310,17 @@ static int emit_memhandler_read_rr(sh2_reg_e rd, sh2_reg_e rs, u32 offs, int siz rcache_free_tmp(hr); return hr2; } - - ram_check = 0; } - hr = rcache_get_reg_arg(0, 
rs); - if (offs != 0) - emith_add_r_imm(hr, offs); - hr = emit_memhandler_read_(size, ram_check); + if (gconst_get(rs, &val)) { + hr = rcache_get_tmp_arg(0); + emith_move_r_imm(hr, val + offs); + } else { + hr = rcache_get_reg_arg(0, rs); + if (offs) + emith_add_r_imm(hr, offs); + } + hr = emit_memhandler_read(size); hr2 = rcache_get_reg(rd, RC_GR_WRITE); if (size != 2) { emith_sext(hr2, hr, (size == 1) ? 16 : 8); @@ -1312,45 +1331,78 @@ static int emit_memhandler_read_rr(sh2_reg_e rd, sh2_reg_e rs, u32 offs, int siz return hr2; } -static void emit_memhandler_write(int size) +// @(Rs,#offs) = rd +static void emit_memhandler_write_rr(sh2_reg_e rd, sh2_reg_e rs, u32 offs, int size) { - int ctxr; - host_arg2reg(ctxr, 2); - if (reg_map_g2h[SHR_SR] != -1) - emith_ctx_write(reg_map_g2h[SHR_SR], SHR_SR * 4); + int hr; + u32 val; - rcache_clean(); + rcache_get_reg_arg(1, rd); - switch (size) { - case 0: // 8 - // XXX: consider inlining sh2_drc_write8 - emith_call(sh2_drc_write8); - break; - case 1: // 16 - emith_call(sh2_drc_write16); - break; - case 2: // 32 - emith_move_r_r_ptr(ctxr, CONTEXT_REG); - emith_call(sh2_drc_write32); - break; - } + if (gconst_get(rs, &val)) { + hr = rcache_get_tmp_arg(0); + emith_move_r_imm(hr, val + offs); + } else if (offs) { + hr = rcache_get_reg_arg(0, rs); + emith_add_r_imm(hr, offs); + } else + rcache_get_reg_arg(0, rs); - rcache_invalidate(); - if (reg_map_g2h[SHR_SR] != -1) - emith_ctx_read(reg_map_g2h[SHR_SR], SHR_SR * 4); + emit_memhandler_write(size); +} + +// rd = @(Rx,Ry) +static int emit_indirect_indexed_read(sh2_reg_e rd, sh2_reg_e rx, sh2_reg_e ry, int size) +{ + int hr, hr2; + int a0, t; +#if PROPAGATE_CONSTANTS + u32 offs; + + if (gconst_get(ry, &offs)) + return emit_memhandler_read_rr(rd, rx, offs, size); + if (gconst_get(rx, &offs)) + return emit_memhandler_read_rr(rd, ry, offs, size); +#endif + a0 = rcache_get_reg_arg(0, rx); + t = rcache_get_reg(ry, RC_GR_READ); + emith_add_r_r(a0, t); + hr = emit_memhandler_read(size); + 
if (rd != SHR_TMP) + hr2 = rcache_get_reg(rd, RC_GR_WRITE); + else + hr2 = hr; + + if (size != 2) { // 16, 8 + emith_sext(hr2, hr, size ? 16 : 8); + } else if (hr != hr2) // 32 + emith_move_r_r(hr2, hr); + + if (hr != hr2) + rcache_free_tmp(hr); + return hr2; } -// @(Rx,Ry) -static int emit_indirect_indexed_read(int rx, int ry, int size) +// @(Rx,Ry) = rd +static void emit_indirect_indexed_write(sh2_reg_e rd, sh2_reg_e rx, sh2_reg_e ry, int size) { int a0, t; +#if PROPAGATE_CONSTANTS + u32 offs; + + if (gconst_get(ry, &offs)) + return emit_memhandler_write_rr(rd, rx, offs, size); + if (gconst_get(rx, &offs)) + return emit_memhandler_write_rr(rd, ry, offs, size); +#endif + rcache_get_reg_arg(1, rd); a0 = rcache_get_reg_arg(0, rx); t = rcache_get_reg(ry, RC_GR_READ); emith_add_r_r(a0, t); - return emit_memhandler_read(size); + emit_memhandler_write(size); } -// read @Rn, @rm +// @Rn+,@Rm+ static void emit_indirect_read_double(u32 *rnr, u32 *rmr, int rn, int rm, int size) { int tmp; @@ -1670,8 +1722,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) break; case OP_BRANCH_CT: case OP_BRANCH_CF: - tmp = rcache_get_reg(SHR_PC, RC_GR_WRITE); sr = rcache_get_reg(SHR_SR, RC_GR_READ); + tmp = rcache_get_reg(SHR_PC, RC_GR_WRITE); emith_move_r_imm(tmp, pc); emith_tst_r_imm(sr, T); tmp2 = ops[i-1].op == OP_BRANCH_CT ? 
DCOND_NE : DCOND_EQ; @@ -1706,23 +1758,34 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case OP_BRANCH_R: if (opd->dest & BITMASK1(SHR_PR)) emit_move_r_imm32(SHR_PR, pc + 2); - emit_move_r_r(SHR_PC, opd->rm); - drcf.pending_branch_indirect = 1; + if (gconst_get(opd->rm, &tmp)) { + opd->imm = tmp; + drcf.pending_branch_direct = 1; + } else { + emit_move_r_r(SHR_PC, opd->rm); + ops[i+1].source |= SHR_PC; // need PC for jump after delay slot + drcf.pending_branch_indirect = 1; + } goto end_op; case OP_BRANCH_RF: - tmp = rcache_get_reg(SHR_PC, RC_GR_WRITE); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ); - if (opd->dest & BITMASK1(SHR_PR)) { - tmp3 = rcache_get_reg(SHR_PR, RC_GR_WRITE); - emith_move_r_imm(tmp3, pc + 2); - emith_add_r_r_r(tmp, tmp2, tmp3); - } - else { - emith_move_r_r(tmp, tmp2); - emith_add_r_imm(tmp, pc + 2); + if (gconst_get(GET_Rn(), &tmp)) { + if (opd->dest & BITMASK1(SHR_PR)) + emit_move_r_imm32(SHR_PR, pc + 2); + opd->imm = pc + 2 + tmp; + drcf.pending_branch_direct = 1; + } else { + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ); + tmp = rcache_get_reg(SHR_PC, RC_GR_WRITE); + emith_move_r_imm(tmp, pc + 2); + if (opd->dest & BITMASK1(SHR_PR)) { + tmp3 = rcache_get_reg(SHR_PR, RC_GR_WRITE); + emith_move_r_r(tmp3, tmp); + } + emith_add_r_r(tmp, tmp2); + ops[i+1].source |= SHR_PC; // need PC for jump after delay slot + drcf.pending_branch_indirect = 1; } - drcf.pending_branch_indirect = 1; goto end_op; case OP_SLEEP: @@ -1767,6 +1830,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // obtain new PC emit_memhandler_read_rr(SHR_PC, SHR_VBR, opd->imm * 4, 2); // indirect jump -> back to dispatcher + sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + FLUSH_CYCLES(sr); rcache_flush(); emith_jump(sh2_drc_dispatcher); goto end_op; @@ -1780,7 +1845,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (opd->size == 2) tmp = FETCH32(opd->imm); else - tmp = (u32)(int)(signed short)FETCH_OP(opd->imm); + tmp = 
(s16)FETCH_OP(opd->imm); gconst_new(GET_Rn(), tmp); } else @@ -1812,9 +1877,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case OP_MOVA: if (opd->imm != 0) emit_move_r_imm32(SHR_R0, opd->imm); - else { - tmp = rcache_get_reg(SHR_R0, RC_GR_WRITE); + else { // delay slot case, pc can have either value tmp2 = rcache_get_reg(SHR_PC, RC_GR_READ); + tmp = rcache_get_reg(SHR_R0, RC_GR_WRITE); emith_add_r_r_imm(tmp, tmp2, 2 + (op & 0xff) * 4); emith_bic_r_imm(tmp, 3); } @@ -1828,7 +1893,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) switch (op & 0x0f) { case 0x02: - tmp = rcache_get_reg(GET_Rn(), RC_GR_WRITE); switch (GET_Fx()) { case 0: // STC SR,Rn 0000nnnn00000010 @@ -1844,6 +1908,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto default_; } tmp3 = rcache_get_reg(tmp2, RC_GR_READ); + tmp = rcache_get_reg(GET_Rn(), RC_GR_WRITE); emith_move_r_r(tmp, tmp3); if (tmp2 == SHR_SR) emith_clear_msb(tmp, tmp, 22); // reserved bits defined by ISA as 0 @@ -1851,12 +1916,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x04: // MOV.B Rm,@(R0,Rn) 0000nnnnmmmm0100 case 0x05: // MOV.W Rm,@(R0,Rn) 0000nnnnmmmm0101 case 0x06: // MOV.L Rm,@(R0,Rn) 0000nnnnmmmm0110 - rcache_clean(); - tmp = rcache_get_reg_arg(1, GET_Rm()); - tmp2 = rcache_get_reg_arg(0, SHR_R0); - tmp3 = rcache_get_reg(GET_Rn(), RC_GR_READ); - emith_add_r_r(tmp2, tmp3); - emit_memhandler_write(op & 3); + emit_indirect_indexed_write(GET_Rm(), SHR_R0, GET_Rn(), op & 3); goto end_op; case 0x07: // MUL.L Rm,Rn 0000nnnnmmmm0111 @@ -1903,7 +1963,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } goto end_op; case 0x0a: - tmp = rcache_get_reg(GET_Rn(), RC_GR_WRITE); switch (GET_Fx()) { case 0: // STS MACH,Rn 0000nnnn00001010 @@ -1918,50 +1977,21 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) default: goto default_; } - tmp2 = rcache_get_reg(tmp2, RC_GR_READ); - emith_move_r_r(tmp, tmp2); + 
emit_move_r_r(GET_Rn(), tmp2); goto end_op; case 0x0c: // MOV.B @(R0,Rm),Rn 0000nnnnmmmm1100 case 0x0d: // MOV.W @(R0,Rm),Rn 0000nnnnmmmm1101 case 0x0e: // MOV.L @(R0,Rm),Rn 0000nnnnmmmm1110 - tmp = emit_indirect_indexed_read(SHR_R0, GET_Rm(), op & 3); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_WRITE); - if ((op & 3) != 2) { - emith_sext(tmp2, tmp, (op & 1) ? 16 : 8); - } else - emith_move_r_r(tmp2, tmp); - rcache_free_tmp(tmp); + emit_indirect_indexed_read(GET_Rn(), SHR_R0, GET_Rm(), op & 3); goto end_op; case 0x0f: // MAC.L @Rm+,@Rn+ 0000nnnnmmmm1111 emit_indirect_read_double(&tmp, &tmp2, GET_Rn(), GET_Rm(), 2); - tmp4 = rcache_get_reg(SHR_MACH, RC_GR_RMW); - /* MS 16 MAC bits unused if saturated */ + sr = rcache_get_reg(SHR_SR, RC_GR_READ); - emith_tst_r_imm(sr, S); - EMITH_SJMP_START(DCOND_EQ); - emith_clear_msb_c(DCOND_NE, tmp4, tmp4, 16); - EMITH_SJMP_END(DCOND_EQ); - rcache_unlock(sr); - tmp3 = rcache_get_reg(SHR_MACL, RC_GR_RMW); // might evict SR - emith_mula_s64(tmp3, tmp4, tmp, tmp2); + tmp3 = rcache_get_reg(SHR_MACL, RC_GR_RMW); + tmp4 = rcache_get_reg(SHR_MACH, RC_GR_RMW); + emith_sh2_macl(tmp3, tmp4, tmp, tmp2, sr); rcache_free_tmp(tmp2); - sr = rcache_get_reg(SHR_SR, RC_GR_READ); // reget just in case - emith_tst_r_imm(sr, S); - - EMITH_JMP_START(DCOND_EQ); - emith_asr(tmp, tmp4, 15); - emith_cmp_r_imm(tmp, -1); // negative overflow (0x80000000..0xffff7fff) - EMITH_SJMP_START(DCOND_GE); - emith_move_r_imm_c(DCOND_LT, tmp4, 0x8000); - emith_move_r_imm_c(DCOND_LT, tmp3, 0x0000); - EMITH_SJMP_END(DCOND_GE); - emith_cmp_r_imm(tmp, 0); // positive overflow (0x00008000..0x7fffffff) - EMITH_SJMP_START(DCOND_LE); - emith_move_r_imm_c(DCOND_GT, tmp4, 0x00007fff); - emith_move_r_imm_c(DCOND_GT, tmp3, 0xffffffff); - EMITH_SJMP_END(DCOND_LE); - EMITH_JMP_END(DCOND_EQ); - rcache_free_tmp(tmp); goto end_op; } @@ -1970,12 +2000,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) ///////////////////////////////////////////// case 0x01: // MOV.L 
Rm,@(disp,Rn) 0001nnnnmmmmdddd - rcache_clean(); - tmp = rcache_get_reg_arg(0, GET_Rn()); - tmp2 = rcache_get_reg_arg(1, GET_Rm()); - if (op & 0x0f) - emith_add_r_imm(tmp, (op & 0x0f) * 4); - emit_memhandler_write(2); + emit_memhandler_write_rr(GET_Rm(), GET_Rn(), (op & 0x0f) * 4, 2); goto end_op; case 0x02: @@ -1984,20 +2009,14 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x00: // MOV.B Rm,@Rn 0010nnnnmmmm0000 case 0x01: // MOV.W Rm,@Rn 0010nnnnmmmm0001 case 0x02: // MOV.L Rm,@Rn 0010nnnnmmmm0010 - rcache_clean(); - rcache_get_reg_arg(0, GET_Rn()); - rcache_get_reg_arg(1, GET_Rm()); - emit_memhandler_write(op & 3); + emit_memhandler_write_rr(GET_Rm(), GET_Rn(), 0, op & 3); goto end_op; case 0x04: // MOV.B Rm,@-Rn 0010nnnnmmmm0100 case 0x05: // MOV.W Rm,@-Rn 0010nnnnmmmm0101 case 0x06: // MOV.L Rm,@-Rn 0010nnnnmmmm0110 - rcache_get_reg_arg(1, GET_Rm()); // for Rm == Rn tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); emith_sub_r_imm(tmp, (1 << (op & 3))); - rcache_clean(); - rcache_get_reg_arg(0, GET_Rn()); - emit_memhandler_write(op & 3); + emit_memhandler_write_rr(GET_Rm(), GET_Rn(), 0, op & 3); goto end_op; case 0x07: // DIV0S Rm,Rn 0010nnnnmmmm0111 sr = rcache_get_reg(SHR_SR, RC_GR_RMW); @@ -2132,8 +2151,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // Q2 = carry(Rn -= Rm) // Q = M ^ Q1 ^ Q2 // T = (Q == M) = !(Q ^ M) = !(Q1 ^ Q2) - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW); tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW); sr = rcache_get_reg(SHR_SR, RC_GR_RMW); emith_tpop_carry(sr, 0); emith_adcf_r_r(tmp2, tmp2); @@ -2228,20 +2247,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto end_op; case 1: // DT Rn 0100nnnn00010000 sr = rcache_get_reg(SHR_SR, RC_GR_RMW); -#if 0 // scheduling needs tuning - if (FETCH_OP(pc) == 0x8bfd) { // BF #-2 - if (gconst_get(GET_Rn(), &tmp)) { - // XXX: limit burned cycles - emit_move_r_imm32(GET_Rn(), 0); - emith_or_r_imm(sr, 
T); - cycles += tmp * 4 + 1; // +1 syncs with noconst version, not sure why - skip_op = 1; - } - else - emith_sh2_dtbf_loop(); - goto end_op; - } -#endif tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); emith_bic_r_imm(sr, T); emith_subf_r_imm(tmp, 1); @@ -2370,17 +2375,14 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) default: goto default_; } - rcache_get_reg_arg(0, GET_Rn()); - tmp2 = emit_memhandler_read(2); if (tmp == SHR_SR) { + tmp2 = emit_memhandler_read_rr(SHR_TMP, GET_Rn(), 0, 2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW); emith_write_sr(sr, tmp2); + rcache_free_tmp(tmp2); drcf.test_irq = 1; - } else { - tmp = rcache_get_reg(tmp, RC_GR_WRITE); - emith_move_r_r(tmp, tmp2); - } - rcache_free_tmp(tmp2); + } else + emit_memhandler_read_rr(tmp, GET_Rn(), 0, 2); tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); emith_add_r_imm(tmp, 4); goto end_op; @@ -2440,7 +2442,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_bic_r_imm(sr, T); emith_cmp_r_imm(tmp, 0); emit_or_t_if_eq(sr); - rcache_clean(); emith_or_r_imm(tmp, 0x80); tmp2 = rcache_get_tmp_arg(1); // assuming it differs to tmp emith_move_r_r(tmp2, tmp); @@ -2480,28 +2481,11 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x0f: // MAC.W @Rm+,@Rn+ 0100nnnnmmmm1111 emit_indirect_read_double(&tmp, &tmp2, GET_Rn(), GET_Rm(), 1); - emith_sext(tmp, tmp, 16); - emith_sext(tmp2, tmp2, 16); + sr = rcache_get_reg(SHR_SR, RC_GR_READ); tmp3 = rcache_get_reg(SHR_MACL, RC_GR_RMW); tmp4 = rcache_get_reg(SHR_MACH, RC_GR_RMW); - emith_mula_s64(tmp3, tmp4, tmp, tmp2); + emith_sh2_macw(tmp3, tmp4, tmp, tmp2, sr); rcache_free_tmp(tmp2); - // XXX: MACH should be untouched when S is set? 
- sr = rcache_get_reg(SHR_SR, RC_GR_READ); - emith_tst_r_imm(sr, S); - EMITH_JMP_START(DCOND_EQ); - - emith_asr(tmp, tmp3, 31); - emith_eorf_r_r(tmp, tmp4); // tmp = ((signed)macl >> 31) ^ mach - EMITH_JMP_START(DCOND_EQ); - emith_move_r_imm(tmp3, 0x80000000); - emith_tst_r_r(tmp4, tmp4); - EMITH_SJMP_START(DCOND_MI); - emith_sub_r_imm_c(DCOND_PL, tmp3, 1); // positive - EMITH_SJMP_END(DCOND_MI); - EMITH_JMP_END(DCOND_EQ); - - EMITH_JMP_END(DCOND_EQ); rcache_free_tmp(tmp); goto end_op; } @@ -2600,13 +2584,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { case 0x0000: // MOV.B R0,@(disp,Rn) 10000000nnnndddd case 0x0100: // MOV.W R0,@(disp,Rn) 10000001nnnndddd - rcache_clean(); - tmp = rcache_get_reg_arg(0, GET_Rm()); - tmp2 = rcache_get_reg_arg(1, SHR_R0); - tmp3 = (op & 0x100) >> 8; - if (op & 0x0f) - emith_add_r_imm(tmp, (op & 0x0f) << tmp3); - emit_memhandler_write(tmp3); + tmp = (op & 0x100) >> 8; + emit_memhandler_write_rr(SHR_R0, GET_Rm(), (op & 0x0f) << tmp, tmp); goto end_op; case 0x0400: // MOV.B @(disp,Rm),R0 10000100mmmmdddd case 0x0500: // MOV.W @(disp,Rm),R0 10000101mmmmdddd @@ -2615,14 +2594,11 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto end_op; case 0x0800: // CMP/EQ #imm,R0 10001000iiiiiiii // XXX: could use cmn - tmp = rcache_get_tmp(); tmp2 = rcache_get_reg(0, RC_GR_READ); sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - emith_move_r_imm_s8(tmp, op & 0xff); emith_bic_r_imm(sr, T); - emith_cmp_r_r(tmp2, tmp); + emith_cmp_r_imm(tmp2, (s8)(op & 0xff)); emit_or_t_if_eq(sr); - rcache_free_tmp(tmp); goto end_op; } goto default_; @@ -2634,12 +2610,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x0000: // MOV.B R0,@(disp,GBR) 11000000dddddddd case 0x0100: // MOV.W R0,@(disp,GBR) 11000001dddddddd case 0x0200: // MOV.L R0,@(disp,GBR) 11000010dddddddd - rcache_clean(); - tmp = rcache_get_reg_arg(0, SHR_GBR); - tmp2 = rcache_get_reg_arg(1, SHR_R0); - tmp3 = (op & 0x300) >> 8; - 
emith_add_r_imm(tmp, (op & 0xff) << tmp3); - emit_memhandler_write(tmp3); + tmp = (op & 0x300) >> 8; + emit_memhandler_write_rr(SHR_R0, SHR_GBR, (op & 0xff) << tmp, tmp); goto end_op; case 0x0400: // MOV.B @(disp,GBR),R0 11000100dddddddd case 0x0500: // MOV.W @(disp,GBR),R0 11000101dddddddd @@ -2667,7 +2639,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_or_r_imm(tmp, op & 0xff); goto end_op; case 0x0c00: // TST.B #imm,@(R0,GBR) 11001100iiiiiiii - tmp = emit_indirect_indexed_read(SHR_R0, SHR_GBR, 0); + tmp = emit_indirect_indexed_read(SHR_TMP, SHR_R0, SHR_GBR, 0); sr = rcache_get_reg(SHR_SR, RC_GR_RMW); emith_bic_r_imm(sr, T); emith_tst_r_imm(tmp, op & 0xff); @@ -2675,15 +2647,15 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_free_tmp(tmp); goto end_op; case 0x0d00: // AND.B #imm,@(R0,GBR) 11001101iiiiiiii - tmp = emit_indirect_indexed_read(SHR_R0, SHR_GBR, 0); + tmp = emit_indirect_indexed_read(SHR_TMP, SHR_R0, SHR_GBR, 0); emith_and_r_imm(tmp, op & 0xff); goto end_rmw_op; case 0x0e00: // XOR.B #imm,@(R0,GBR) 11001110iiiiiiii - tmp = emit_indirect_indexed_read(SHR_R0, SHR_GBR, 0); + tmp = emit_indirect_indexed_read(SHR_TMP, SHR_R0, SHR_GBR, 0); emith_eor_r_imm(tmp, op & 0xff); goto end_rmw_op; case 0x0f00: // OR.B #imm,@(R0,GBR) 11001111iiiiiiii - tmp = emit_indirect_indexed_read(SHR_R0, SHR_GBR, 0); + tmp = emit_indirect_indexed_read(SHR_TMP, SHR_R0, SHR_GBR, 0); emith_or_r_imm(tmp, op & 0xff); end_rmw_op: tmp2 = rcache_get_tmp_arg(1); @@ -2708,32 +2680,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (!(op_flags[i] & OF_B_IN_DS)) elprintf_sh2(sh2, EL_ANOMALY, "drc: illegal op %04x @ %08x", op, pc - 2); - - tmp = rcache_get_reg(SHR_SP, RC_GR_RMW); - emith_sub_r_imm(tmp, 4*2); - // push SR - tmp = rcache_get_reg_arg(0, SHR_SP); - emith_add_r_imm(tmp, 4); - tmp = rcache_get_reg_arg(1, SHR_SR); - emith_clear_msb(tmp, tmp, 22); - emit_memhandler_write(2); - // push PC - rcache_get_reg_arg(0, 
SHR_SP); - tmp = rcache_get_tmp_arg(1); - if (drcf.pending_branch_indirect) { - tmp2 = rcache_get_reg(SHR_PC, RC_GR_READ); - emith_move_r_r(tmp, tmp2); - } - else - emith_move_r_imm(tmp, pc - 2); - emit_memhandler_write(2); - // obtain new PC - v = (op_flags[i] & OF_B_IN_DS) ? 6 : 4; - emit_memhandler_read_rr(SHR_PC, SHR_VBR, v * 4, 2); - // indirect jump -> back to dispatcher - rcache_flush(); - emith_jump(sh2_drc_dispatcher); - break; + exit(1); } end_op: @@ -2754,6 +2701,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emit_move_r_imm32(SHR_PC, pc); rcache_flush(); emith_call(sh2_drc_test_irq); + if (pc < end_pc) // mark next insns as entry point for RTE + op_flags[i+1] |= OF_BTARGET; drcf.test_irq = 0; } @@ -2763,36 +2712,37 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) struct op_data *opd_b = (op_flags[i] & OF_DELAY_OP) ? opd-1 : opd; u32 target_pc = opd_b->imm; - int cond = -1, ncond = -1; + int cond = -1; void *target = NULL; - EMITH_SJMP_DECL_(); + int ctaken = 0; + if (opd_b->op == OP_BRANCH_CT || opd_b->op == OP_BRANCH_CF) { + ctaken = (op_flags[i] & OF_DELAY_OP) ? 1 : 2; + } + cycles += ctaken; // assume branch taken sr = rcache_get_reg(SHR_SR, RC_GR_RMW); FLUSH_CYCLES(sr); rcache_clean(); - if (opd_b->op != OP_BRANCH) { + // emit condition test for conditional branch + if (opd_b->op == OP_BRANCH_CT || opd_b->op == OP_BRANCH_CF) { cond = (opd_b->op == OP_BRANCH_CF) ? DCOND_EQ : DCOND_NE; - ncond = (opd_b->op == OP_BRANCH_CF) ? DCOND_NE : DCOND_EQ; - } - if (cond != -1) { - int ctaken = (op_flags[i] & OF_DELAY_OP) ? 1 : 2; - if (delay_dep_fw & BITMASK1(SHR_T)) emith_tst_r_imm(sr, T_save); else emith_tst_r_imm(sr, T); - - EMITH_SJMP_START_(ncond); - emith_sub_r_imm_c(cond, sr, ctaken<<12); } + // no modification of host status/flags between here and branching! 
#if LINK_BRANCHES - if (find_in_array(branch_target_pc, branch_target_count, target_pc) >= 0) + v = find_in_array(branch_target_pc, branch_target_count, target_pc); + if (v >= 0) { // local branch - // XXX: jumps back can be linked already - if (branch_patch_count < MAX_LOCAL_BRANCHES) { + if (branch_target_ptr[v]) { + // jumps back can be linked here since host PC is already known + target = branch_target_ptr[v]; + } else if (branch_patch_count < MAX_LOCAL_BRANCHES) { target = tcache_ptr; branch_patch_pc[branch_patch_count] = target_pc; branch_patch_ptr[branch_patch_count] = target; @@ -2801,9 +2751,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) else dbg(1, "warning: too many local branches"); } - - if (target == NULL) #endif + if (target == NULL) { // can't resolve branch locally, make a block exit emit_move_r_imm32(SHR_PC, target_pc); @@ -2816,13 +2765,16 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (cond != -1) { emith_jump_cond_patchable(cond, target); - EMITH_SJMP_END_(ncond); } else { emith_jump_patchable(target); rcache_invalidate(); } + // branch not taken, correct cycle count + if (ctaken) + emith_add_r_imm(sr, ctaken << 12); + drcf.pending_branch_direct = 0; } else if (drcf.pending_branch_indirect) { @@ -2851,6 +2803,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { void *target; + s32 tmp = rcache_get_reg(SHR_SR, RC_GR_RMW); + FLUSH_CYCLES(tmp); + emit_move_r_imm32(SHR_PC, pc); rcache_flush(); From 38e9622eb687a0f7bc67171085220c3e6d5995d8 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 27 Mar 2019 20:24:48 +0100 Subject: [PATCH 021/174] add 32bit memory access functions for SH2 --- cpu/sh2/compiler.c | 8 +- cpu/sh2/sh2.h | 9 ++- pico/32x/memory.c | 177 ++++++++++++++++++++++++++++++++++----------- pico/32x/sh2soc.c | 4 +- 4 files changed, 150 insertions(+), 48 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index af6ca9cd6..3c82420e6 100644 --- a/cpu/sh2/compiler.c +++ 
b/cpu/sh2/compiler.c @@ -2906,7 +2906,6 @@ static void sh2_generate_utils(void) { int arg0, arg1, arg2, sr, tmp; - sh2_drc_write32 = p32x_sh2_write32; sh2_drc_read8 = p32x_sh2_read8; sh2_drc_read16 = p32x_sh2_read16; sh2_drc_read32 = p32x_sh2_read32; @@ -3015,6 +3014,11 @@ static void sh2_generate_utils(void) emith_ctx_read_ptr(arg2, offsetof(SH2, write16_tab)); emith_sh2_wcall(arg0, arg2); + // sh2_drc_write32(u32 a, u32 d) + sh2_drc_write32 = (void *)tcache_ptr; + emith_ctx_read_ptr(arg2, offsetof(SH2, write32_tab)); + emith_sh2_wcall(arg0, arg2); + #ifdef PDB_NET // debug #define MAKE_READ_WRAPPER(func) { \ @@ -3053,7 +3057,6 @@ static void sh2_generate_utils(void) host_dasm_new_symbol(sh2_drc_read8); host_dasm_new_symbol(sh2_drc_read16); host_dasm_new_symbol(sh2_drc_read32); - host_dasm_new_symbol(sh2_drc_write32); #endif #endif @@ -3065,6 +3068,7 @@ static void sh2_generate_utils(void) host_dasm_new_symbol(sh2_drc_test_irq); host_dasm_new_symbol(sh2_drc_write8); host_dasm_new_symbol(sh2_drc_write16); + host_dasm_new_symbol(sh2_drc_write32); #endif } diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index 69abf8cd8..7faa844b3 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -12,6 +12,7 @@ typedef enum { typedef struct SH2_ { + // registers. this MUST correlate with enum sh2_reg_e. 
unsigned int r[16]; // 00 unsigned int pc; // 40 unsigned int ppc; @@ -21,17 +22,19 @@ typedef struct SH2_ unsigned int mach, macl; // 58 // common - const void *read8_map; // 60 + const void *read8_map; const void *read16_map; + const void *read32_map; const void **write8_tab; const void **write16_tab; + const void **write32_tab; // drc stuff - int drc_tmp; // 70 + int drc_tmp; int irq_cycles; void *p_bios; // convenience pointers void *p_da; - void *p_sdram; // 80 + void *p_sdram; void *p_rom; unsigned int pdb_io_csum[2]; diff --git a/pico/32x/memory.c b/pico/32x/memory.c index c6b89a223..d399d758d 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -1087,8 +1087,8 @@ static void m68k_write8_dram1_ow(u32 a, u32 d) return; \ } \ /* overwrite */ \ - if (!(d & 0xff00)) d |= *pd & 0xff00; \ if (!(d & 0x00ff)) d |= *pd & 0x00ff; \ + if (!(d & 0xff00)) d |= *pd & 0xff00; \ *pd = d; static void m68k_write16_dram0_ow(u32 a, u32 d) @@ -1344,6 +1344,31 @@ static u32 sh2_read16_rom(u32 a, SH2 *sh2) return *(u16 *)(Pico.rom + bank + (a & 0x7fffe)); } +static u32 sh2_read32_unmapped(u32 a, SH2 *sh2) +{ + elprintf_sh2(sh2, EL_32X, "unmapped r32 [%08x] %08x @%06x", + a, 0, sh2_pc(sh2)); + return 0; +} + +static u32 sh2_read32_cs0(u32 a, SH2 *sh2) +{ + return (sh2_read16_cs0(a, sh2) << 16) | sh2_read16_cs0(a + 2, sh2); +} + +static u32 sh2_read32_da(u32 a, SH2 *sh2) +{ + u32 d = *(u32 *)(sh2->data_array + (a & 0xfff)); + return (d << 16) | (d >> 16); +} + +static u32 sh2_read32_rom(u32 a, SH2 *sh2) +{ + u32 bank = carthw_ssf2_banks[(a >> 19) & 7] << 19; + u32 d = *(u32 *)(Pico.rom + bank + (a & 0x7fffc)); + return (d << 16) | (d >> 16); +} + // writes static void REGPARM(3) sh2_write_ignore(u32 a, u32 d, SH2 *sh2) { @@ -1501,6 +1526,73 @@ static void REGPARM(3) sh2_write16_rom(u32 a, u32 d, SH2 *sh2) sh2_write16_unmapped(a, d, sh2); } +static void REGPARM(3) sh2_write32_unmapped(u32 a, u32 d, SH2 *sh2) +{ + elprintf_sh2(sh2, EL_32X, "unmapped w32 [%08x] %08x @%06x", + a, 
d, sh2_pc(sh2)); +} + +static void REGPARM(3) sh2_write32_cs0(u32 a, u32 d, SH2 *sh2) +{ + sh2_write16_cs0(a, d >> 16, sh2); + sh2_write16_cs0(a + 2, d, sh2); +} + +#define sh2_write32_dramN(n) \ + u32 *pd = (u32 *)&Pico32xMem->dram[n][(a & 0x1ffff) / 2]; \ + if (!(a & 0x20000)) { \ + *pd = (d << 16) | (d >> 16); \ + return; \ + } \ + /* overwrite */ \ + u8 *pb = (u8 *)pd; \ + if (d & 0x000000ff) pb[2] = d; \ + if (d & 0x0000ff00) pb[3] = d >> 8; \ + if (d & 0x00ff0000) pb[0] = d >> 16; \ + if (d & 0xff000000) pb[1] = d >> 24; \ + +static void REGPARM(3) sh2_write32_dram0(u32 a, u32 d, SH2 *sh2) +{ + sh2_write32_dramN(0); +} + +static void REGPARM(3) sh2_write32_dram1(u32 a, u32 d, SH2 *sh2) +{ + sh2_write32_dramN(1); +} + +static void REGPARM(3) sh2_write32_sdram(u32 a, u32 d, SH2 *sh2) +{ + u32 a1 = a & 0x3ffff; + *(u32 *)(sh2->p_sdram + a1) = (d << 16) | (d >> 16); +#ifdef DRC_SH2 + unsigned short *p = &Pico32xMem->drcblk_ram[a1 >> SH2_DRCBLK_RAM_SHIFT]; + if (p[0]) + sh2_drc_wcheck_ram(a, p[0], sh2->is_slave); + if (p[1]) + sh2_drc_wcheck_ram(a, p[1], sh2->is_slave); +#endif +} + +static void REGPARM(3) sh2_write32_da(u32 a, u32 d, SH2 *sh2) +{ + u32 a1 = a & 0xfff; + *(u32 *)(sh2->data_array + a1) = (d << 16) | (d >> 16); +#ifdef DRC_SH2 + int id = sh2->is_slave; + unsigned short *p = &Pico32xMem->drcblk_da[id][a1 >> SH2_DRCBLK_DA_SHIFT]; + if (p[0]) + sh2_drc_wcheck_da(a, p[0], id); + if (p[1]) + sh2_drc_wcheck_da(a, p[1], id); +#endif +} + +static void REGPARM(3) sh2_write32_rom(u32 a, u32 d, SH2 *sh2) +{ + sh2_write16_rom(a, d >> 16, sh2); + sh2_write16_rom(a + 2, d, sh2); +} typedef u32 (sh2_read_handler)(u32 a, SH2 *sh2); typedef void REGPARM(3) (sh2_write_handler)(u32 a, u32 d, SH2 *sh2); @@ -1534,30 +1626,21 @@ u32 REGPARM(2) p32x_sh2_read16(u32 a, SH2 *sh2) if (map_flag_set(p)) return ((sh2_read_handler *)(p << 1))(a, sh2); else - return *(u16 *)((p << 1) + ((a & sh2_map->mask) & ~1)); + return *(u16 *)((p << 1) + (a & sh2_map->mask)); } u32 REGPARM(2) 
p32x_sh2_read32(u32 a, SH2 *sh2) { - const sh2_memmap *sh2_map = sh2->read16_map; - sh2_read_handler *handler; - u32 offs; + const sh2_memmap *sh2_map = sh2->read32_map; uptr p; - offs = SH2MAP_ADDR2OFFS_R(a); - sh2_map += offs; + sh2_map += SH2MAP_ADDR2OFFS_R(a); p = sh2_map->addr; if (!map_flag_set(p)) { - // XXX: maybe 32bit access instead with ror? - u16 *pd = (u16 *)((p << 1) + ((a & sh2_map->mask) & ~1)); - return (pd[0] << 16) | pd[1]; - } - - if (offs == SH2MAP_ADDR2OFFS_R(0xffffc000)) - return sh2_peripheral_read32(a, sh2); - - handler = (sh2_read_handler *)(p << 1); - return (handler(a, sh2) << 16) | handler(a + 2, sh2); + u32 *pd = (u32 *)((p << 1) + (a & sh2_map->mask)); + return (*pd << 16) | (*pd >> 16); + } else + return ((sh2_read_handler *)(p << 1))(a, sh2); } void REGPARM(3) p32x_sh2_write8(u32 a, u32 d, SH2 *sh2) @@ -1580,20 +1663,11 @@ void REGPARM(3) p32x_sh2_write16(u32 a, u32 d, SH2 *sh2) void REGPARM(3) p32x_sh2_write32(u32 a, u32 d, SH2 *sh2) { - const void **sh2_wmap = sh2->write16_tab; + const void **sh2_wmap = sh2->write32_tab; sh2_write_handler *wh; - u32 offs; - - offs = SH2MAP_ADDR2OFFS_W(a); - if (offs == SH2MAP_ADDR2OFFS_W(0xffffc000)) { - sh2_peripheral_write32(a, d, sh2); - return; - } - - wh = sh2_wmap[offs]; - wh(a, d >> 16, sh2); - wh(a + 2, d, sh2); + wh = sh2_wmap[SH2MAP_ADDR2OFFS_W(a)]; + wh(a, d, sh2); } // ----------------------------------------------------------------- @@ -1801,9 +1875,9 @@ static void get_bios(void) #define MAP_MEMORY(m) ((uptr)(m) >> 1) #define MAP_HANDLER(h) ( ((uptr)(h) >> 1) | ((uptr)1 << (sizeof(uptr) * 8 - 1)) ) -static sh2_memmap sh2_read8_map[0x80], sh2_read16_map[0x80]; +static sh2_memmap sh2_read8_map[0x80], sh2_read16_map[0x80], sh2_read32_map[0x80]; // for writes we are using handlers only -static sh2_write_handler *sh2_write8_map[0x80], *sh2_write16_map[0x80]; +static sh2_write_handler *sh2_write8_map[0x80], *sh2_write16_map[0x80], *sh2_write32_map[0x80]; void Pico32xSwapDRAM(int b) { @@ 
-1818,10 +1892,12 @@ void Pico32xSwapDRAM(int b) // SH2 sh2_read8_map[0x04/2].addr = sh2_read8_map[0x24/2].addr = - sh2_read16_map[0x04/2].addr = sh2_read16_map[0x24/2].addr = MAP_MEMORY(Pico32xMem->dram[b]); + sh2_read16_map[0x04/2].addr = sh2_read16_map[0x24/2].addr = + sh2_read32_map[0x04/2].addr = sh2_read32_map[0x24/2].addr = MAP_MEMORY(Pico32xMem->dram[b]); sh2_write8_map[0x04/2] = sh2_write8_map[0x24/2] = b ? sh2_write8_dram1 : sh2_write8_dram0; sh2_write16_map[0x04/2] = sh2_write16_map[0x24/2] = b ? sh2_write16_dram1 : sh2_write16_dram0; + sh2_write32_map[0x04/2] = sh2_write32_map[0x24/2] = b ? sh2_write32_dram1 : sh2_write32_dram0; } static void bank_switch_rom_sh2(void) @@ -1829,11 +1905,13 @@ static void bank_switch_rom_sh2(void) if (!carthw_ssf2_active) { // easy sh2_read8_map[0x02/2].addr = sh2_read8_map[0x22/2].addr = - sh2_read16_map[0x02/2].addr = sh2_read16_map[0x22/2].addr = MAP_MEMORY(Pico.rom); + sh2_read16_map[0x02/2].addr = sh2_read16_map[0x22/2].addr = + sh2_read32_map[0x02/2].addr = sh2_read32_map[0x22/2].addr = MAP_MEMORY(Pico.rom); } else { sh2_read8_map[0x02/2].addr = sh2_read8_map[0x22/2].addr = MAP_HANDLER(sh2_read8_rom); sh2_read16_map[0x02/2].addr = sh2_read16_map[0x22/2].addr = MAP_HANDLER(sh2_read16_rom); + sh2_read32_map[0x02/2].addr = sh2_read32_map[0x22/2].addr = MAP_HANDLER(sh2_read32_rom); } } @@ -1903,58 +1981,75 @@ void PicoMemSetup32x(void) for (i = 0; i < ARRAY_SIZE(sh2_read8_map); i++) { sh2_read8_map[i].addr = MAP_HANDLER(sh2_read8_unmapped); sh2_read16_map[i].addr = MAP_HANDLER(sh2_read16_unmapped); + sh2_read32_map[i].addr = MAP_HANDLER(sh2_read32_unmapped); } for (i = 0; i < ARRAY_SIZE(sh2_write8_map); i++) { sh2_write8_map[i] = sh2_write8_unmapped; sh2_write16_map[i] = sh2_write16_unmapped; + sh2_write32_map[i] = sh2_write32_unmapped; } // "purge area" for (i = 0x40; i <= 0x5f; i++) { sh2_write8_map[i >> 1] = - sh2_write16_map[i >> 1] = sh2_write_ignore; + sh2_write16_map[i >> 1] = + sh2_write32_map[i >> 1] = 
sh2_write_ignore; } // CS0 sh2_read8_map[0x00/2].addr = sh2_read8_map[0x20/2].addr = MAP_HANDLER(sh2_read8_cs0); sh2_read16_map[0x00/2].addr = sh2_read16_map[0x20/2].addr = MAP_HANDLER(sh2_read16_cs0); + sh2_read32_map[0x00/2].addr = sh2_read32_map[0x20/2].addr = MAP_HANDLER(sh2_read32_cs0); sh2_write8_map[0x00/2] = sh2_write8_map[0x20/2] = sh2_write8_cs0; sh2_write16_map[0x00/2] = sh2_write16_map[0x20/2] = sh2_write16_cs0; + sh2_write32_map[0x00/2] = sh2_write32_map[0x20/2] = sh2_write32_cs0; // CS1 - ROM bank_switch_rom_sh2(); - sh2_read8_map[0x02/2].mask = sh2_read8_map[0x22/2].mask = - sh2_read16_map[0x02/2].mask = sh2_read16_map[0x22/2].mask = 0x3fffff; // FIXME + sh2_read8_map[0x02/2].mask = sh2_read8_map[0x22/2].mask = 0x3fffff; // FIXME + sh2_read16_map[0x02/2].mask = sh2_read16_map[0x22/2].mask = 0x3ffffe; // FIXME + sh2_read32_map[0x02/2].mask = sh2_read32_map[0x22/2].mask = 0x3ffffc; // FIXME sh2_write16_map[0x02/2] = sh2_write16_map[0x22/2] = sh2_write16_rom; + sh2_write32_map[0x02/2] = sh2_write32_map[0x22/2] = sh2_write32_rom; // CS2 - DRAM - done by Pico32xSwapDRAM() - sh2_read8_map[0x04/2].mask = sh2_read8_map[0x24/2].mask = - sh2_read16_map[0x04/2].mask = sh2_read16_map[0x24/2].mask = 0x01ffff; + sh2_read8_map[0x04/2].mask = sh2_read8_map[0x24/2].mask = 0x01ffff; + sh2_read16_map[0x04/2].mask = sh2_read16_map[0x24/2].mask = 0x01fffe; + sh2_read32_map[0x04/2].mask = sh2_read32_map[0x24/2].mask = 0x01fffc; // CS3 - SDRAM sh2_read8_map[0x06/2].addr = sh2_read8_map[0x26/2].addr = - sh2_read16_map[0x06/2].addr = sh2_read16_map[0x26/2].addr = MAP_MEMORY(Pico32xMem->sdram); + sh2_read16_map[0x06/2].addr = sh2_read16_map[0x26/2].addr = + sh2_read32_map[0x06/2].addr = sh2_read32_map[0x26/2].addr = MAP_MEMORY(Pico32xMem->sdram); sh2_write8_map[0x06/2] = sh2_write8_sdram; sh2_write8_map[0x26/2] = sh2_write8_sdram_wt; sh2_write16_map[0x06/2] = sh2_write16_map[0x26/2] = sh2_write16_sdram; - sh2_read8_map[0x06/2].mask = sh2_read8_map[0x26/2].mask = - 
sh2_read16_map[0x06/2].mask = sh2_read16_map[0x26/2].mask = 0x03ffff; + sh2_write32_map[0x06/2] = sh2_write32_map[0x26/2] = sh2_write32_sdram; + sh2_read8_map[0x06/2].mask = sh2_read8_map[0x26/2].mask = 0x03ffff; + sh2_read16_map[0x06/2].mask = sh2_read16_map[0x26/2].mask = 0x03fffe; + sh2_read32_map[0x06/2].mask = sh2_read32_map[0x26/2].mask = 0x03fffc; // SH2 data array sh2_read8_map[0xc0/2].addr = MAP_HANDLER(sh2_read8_da); sh2_read16_map[0xc0/2].addr = MAP_HANDLER(sh2_read16_da); + sh2_read32_map[0xc0/2].addr = MAP_HANDLER(sh2_read32_da); sh2_write8_map[0xc0/2] = sh2_write8_da; sh2_write16_map[0xc0/2] = sh2_write16_da; + sh2_write32_map[0xc0/2] = sh2_write32_da; // SH2 IO sh2_read8_map[0xff/2].addr = MAP_HANDLER(sh2_peripheral_read8); sh2_read16_map[0xff/2].addr = MAP_HANDLER(sh2_peripheral_read16); + sh2_read32_map[0xff/2].addr = MAP_HANDLER(sh2_peripheral_read32); sh2_write8_map[0xff/2] = sh2_peripheral_write8; sh2_write16_map[0xff/2] = sh2_peripheral_write16; + sh2_write32_map[0xff/2] = sh2_peripheral_write32; // map DRAM area, both 68k and SH2 Pico32xSwapDRAM(1); msh2.read8_map = ssh2.read8_map = sh2_read8_map; msh2.read16_map = ssh2.read16_map = sh2_read16_map; + msh2.read32_map = ssh2.read32_map = sh2_read32_map; msh2.write8_tab = ssh2.write8_tab = (const void **)(void *)sh2_write8_map; msh2.write16_tab = ssh2.write16_tab = (const void **)(void *)sh2_write16_map; + msh2.write32_tab = ssh2.write32_tab = (const void **)(void *)sh2_write32_map; sh2_drc_mem_setup(&msh2); sh2_drc_mem_setup(&ssh2); diff --git a/pico/32x/sh2soc.c b/pico/32x/sh2soc.c index b5300119f..0f75d9b49 100644 --- a/pico/32x/sh2soc.c +++ b/pico/32x/sh2soc.c @@ -244,7 +244,7 @@ u32 sh2_peripheral_read16(u32 a, SH2 *sh2) u16 *r = (void *)sh2->peri_regs; u32 d; - a &= 0x1ff; + a &= 0x1fe; d = r[(a / 2) ^ 1]; elprintf_sh2(sh2, EL_32XP, "peri r16 [%08x] %04x @%06x", @@ -343,7 +343,7 @@ void REGPARM(3) sh2_peripheral_write16(u32 a, u32 d, SH2 *sh2) elprintf_sh2(sh2, EL_32XP, "peri w16 [%08x] 
%04x @%06x", a, d, sh2_pc(sh2)); - a &= 0x1ff; + a &= 0x1fe; // evil WDT if (a == 0x80) { From 94eb72693c84f133eb2ec5329bc662241921b7ed Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 27 Mar 2019 21:58:32 +0100 Subject: [PATCH 022/174] move saving SH2 SR into memory access and do so only if needed --- cpu/sh2/compiler.c | 14 +++++----- cpu/sh2/compiler.h | 28 +++++++++++++++++++- pico/32x/memory.c | 64 +++++++++++++++++++++++++++++++--------------- pico/32x/sh2soc.c | 5 ++++ 4 files changed, 84 insertions(+), 27 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 3c82420e6..3c5ce5b9e 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -1231,11 +1231,11 @@ static int emit_memhandler_read(int size) rcache_clean(); +#ifndef DCR_SR_REG // must writeback cycles for poll detection stuff - // FIXME: rm if (reg_map_g2h[SHR_SR] != -1) emith_ctx_write(reg_map_g2h[SHR_SR], SHR_SR * 4); - +#endif arg1 = rcache_get_tmp_arg(1); emith_move_r_r_ptr(arg1, CONTEXT_REG); switch (size) { @@ -1244,10 +1244,10 @@ static int emit_memhandler_read(int size) case 2: emith_call(sh2_drc_read32); break; // 32 } rcache_invalidate(); - +#ifndef DCR_SR_REG if (reg_map_g2h[SHR_SR] != -1) emith_ctx_read(reg_map_g2h[SHR_SR], SHR_SR * 4); - +#endif return rcache_get_tmp_ret(); } @@ -1255,10 +1255,10 @@ static int emit_memhandler_read(int size) static void emit_memhandler_write(int size) { int arg2; - +#ifndef DCR_SR_REG if (reg_map_g2h[SHR_SR] != -1) emith_ctx_write(reg_map_g2h[SHR_SR], SHR_SR * 4); - +#endif rcache_clean(); arg2 = rcache_get_tmp_arg(2); @@ -1270,8 +1270,10 @@ static void emit_memhandler_write(int size) } rcache_invalidate(); +#ifndef DCR_SR_REG if (reg_map_g2h[SHR_SR] != -1) emith_ctx_read(reg_map_g2h[SHR_SR], SHR_SR * 4); +#endif } // rd = @(Rs,#offs) diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 61d8d2daf..70fdbf4ea 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -13,7 +13,7 @@ void sh2_drc_frame(void); #define sh2_drc_frame() 
#endif -#define BLOCK_INSN_LIMIT 128 +#define BLOCK_INSN_LIMIT 1024 /* op_flags */ #define OF_DELAY_OP (1 << 0) @@ -25,3 +25,29 @@ void sh2_drc_frame(void); void scan_block(unsigned int base_pc, int is_slave, unsigned char *op_flags, unsigned int *end_pc, unsigned int *end_literals); + +#if defined(DRC_SH2) +// direct access to some host CPU registers used by the DRC +// XXX MUST match definitions in cpu/sh2/compiler.c +#if defined(_arm__) +#define DRC_SR_REG r10 +#elif defined(__i386__) +#define DRC_SR_REG edi +#else +#warning "direct DRC register access not available for this host" +#endif + +#ifdef DCR_SR_REG +#define DRC_DECLARE_SR register int sh2_sr asm(#DCR_SR_REG) +#define DRC_SAVE_SR(sh2) \ + if ((sh2->state & (SH2_STATE_RUN|SH2_STATE_BUSY)) == SH2_STATE_RUN) \ + sh2->sr = sh2_sr; +#define DRC_RESTORE_SR(sh2) \ + if ((sh2->state & (SH2_STATE_RUN|SH2_STATE_BUSY)) == SH2_STATE_RUN) \ + sh2_sr = sh2->sr; +#else +#define DRC_DECLARE_SR +#define DRC_SAVE_SR(sh2) +#define DRC_RESTORE_SR(sh2) +#endif +#endif diff --git a/pico/32x/memory.c b/pico/32x/memory.c index d399d758d..f82b9f99e 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -40,7 +40,9 @@ */ #include "../pico_int.h" #include "../memory.h" + #include "../../cpu/sh2/compiler.h" +DRC_DECLARE_SR; static const char str_mars[] = "MARS"; @@ -1237,6 +1239,7 @@ static u32 sh2_read8_unmapped(u32 a, SH2 *sh2) static u32 sh2_read8_cs0(u32 a, SH2 *sh2) { u32 d = 0; + DRC_SAVE_SR(sh2); sh2_burn_cycles(sh2, 1*2); @@ -1252,18 +1255,19 @@ static u32 sh2_read8_cs0(u32 a, SH2 *sh2) goto out_16to8; } - // TODO: mirroring? - if (!sh2->is_slave && a < sizeof(Pico32xMem->sh2_rom_m)) - return Pico32xMem->sh2_rom_m.b[a ^ 1]; - if (sh2->is_slave && a < sizeof(Pico32xMem->sh2_rom_s)) - return Pico32xMem->sh2_rom_s.b[a ^ 1]; - if ((a & 0x3fe00) == 0x4200) { d = Pico32xMem->pal[(a & 0x1ff) / 2]; goto out_16to8; } - return sh2_read8_unmapped(a, sh2); + // TODO: mirroring? 
+ if (!sh2->is_slave && a < sizeof(Pico32xMem->sh2_rom_m)) + d = Pico32xMem->sh2_rom_m.b[a ^ 1]; + else if (sh2->is_slave && a < sizeof(Pico32xMem->sh2_rom_s)) + d = Pico32xMem->sh2_rom_s.b[a ^ 1]; + else + d = sh2_read8_unmapped(a, sh2); + goto out; out_16to8: if (a & 1) @@ -1271,8 +1275,10 @@ static u32 sh2_read8_cs0(u32 a, SH2 *sh2) else d >>= 8; +out: elprintf_sh2(sh2, EL_32X, "r8 [%08x] %02x @%06x", a, d, sh2_pc(sh2)); + DRC_RESTORE_SR(sh2); return d; } @@ -1299,13 +1305,14 @@ static u32 sh2_read16_unmapped(u32 a, SH2 *sh2) static u32 sh2_read16_cs0(u32 a, SH2 *sh2) { u32 d = 0; + DRC_SAVE_SR(sh2); sh2_burn_cycles(sh2, 1*2); if ((a & 0x3ffc0) == 0x4000) { d = p32x_sh2reg_read16(a, sh2); if (!(EL_LOGMASK & EL_PWM) && (a & 0x30) == 0x30) // hide PWM - return d; + goto out_noprint; goto out; } @@ -1315,21 +1322,23 @@ static u32 sh2_read16_cs0(u32 a, SH2 *sh2) goto out; } - if (!sh2->is_slave && a < sizeof(Pico32xMem->sh2_rom_m)) - return Pico32xMem->sh2_rom_m.w[a / 2]; - if (sh2->is_slave && a < sizeof(Pico32xMem->sh2_rom_s)) - return Pico32xMem->sh2_rom_s.w[a / 2]; - if ((a & 0x3fe00) == 0x4200) { d = Pico32xMem->pal[(a & 0x1ff) / 2]; goto out; } - return sh2_read16_unmapped(a, sh2); + if (!sh2->is_slave && a < sizeof(Pico32xMem->sh2_rom_m)) + d = Pico32xMem->sh2_rom_m.w[a / 2]; + else if (sh2->is_slave && a < sizeof(Pico32xMem->sh2_rom_s)) + d = Pico32xMem->sh2_rom_s.w[a / 2]; + else + d = sh2_read16_unmapped(a, sh2); out: elprintf_sh2(sh2, EL_32X, "r16 [%08x] %04x @%06x", a, d, sh2_pc(sh2)); +out_noprint: + DRC_RESTORE_SR(sh2); return d; } @@ -1383,6 +1392,7 @@ static void REGPARM(3) sh2_write8_unmapped(u32 a, u32 d, SH2 *sh2) static void REGPARM(3) sh2_write8_cs0(u32 a, u32 d, SH2 *sh2) { + DRC_SAVE_SR(sh2); elprintf_sh2(sh2, EL_32X, "w8 [%08x] %02x @%06x", a, d & 0xff, sh2_pc(sh2)); @@ -1390,16 +1400,24 @@ static void REGPARM(3) sh2_write8_cs0(u32 a, u32 d, SH2 *sh2) if ((a & 0x3fff0) == 0x4100) { sh2->poll_addr = 0; p32x_vdp_write8(a, d); - return; + goto 
out; + } + + if ((a & 0x3fe00) == 0x4200) { + ((u8 *)Pico32xMem->pal)[(a & 0x1ff) ^ 1] = d; + Pico32x.dirty_pal = 1; + goto out; } } if ((a & 0x3ffc0) == 0x4000) { p32x_sh2reg_write8(a, d, sh2); - return; + goto out; } sh2_write8_unmapped(a, d, sh2); +out: + DRC_RESTORE_SR(sh2); } static void REGPARM(3) sh2_write8_dram0(u32 a, u32 d, SH2 *sh2) @@ -1426,8 +1444,11 @@ static void REGPARM(3) sh2_write8_sdram(u32 a, u32 d, SH2 *sh2) static void REGPARM(3) sh2_write8_sdram_wt(u32 a, u32 d, SH2 *sh2) { // xmen sync hack.. - if (a < 0x26000200) + if (a < 0x26000200) { + DRC_SAVE_SR(sh2); sh2_end_run(sh2, 32); + DRC_RESTORE_SR(sh2); + } sh2_write8_sdram(a, d, sh2); } @@ -1453,6 +1474,7 @@ static void REGPARM(3) sh2_write16_unmapped(u32 a, u32 d, SH2 *sh2) static void REGPARM(3) sh2_write16_cs0(u32 a, u32 d, SH2 *sh2) { + DRC_SAVE_SR(sh2); if (((EL_LOGMASK & EL_PWM) || (a & 0x30) != 0x30)) // hide PWM elprintf_sh2(sh2, EL_32X, "w16 [%08x] %04x @%06x", a, d & 0xffff, sh2_pc(sh2)); @@ -1461,22 +1483,24 @@ static void REGPARM(3) sh2_write16_cs0(u32 a, u32 d, SH2 *sh2) if ((a & 0x3fff0) == 0x4100) { sh2->poll_addr = 0; p32x_vdp_write16(a, d, sh2); - return; + goto out; } if ((a & 0x3fe00) == 0x4200) { Pico32xMem->pal[(a & 0x1ff) / 2] = d; Pico32x.dirty_pal = 1; - return; + goto out; } } if ((a & 0x3ffc0) == 0x4000) { p32x_sh2reg_write16(a, d, sh2); - return; + goto out; } sh2_write16_unmapped(a, d, sh2); +out: + DRC_RESTORE_SR(sh2); } static void REGPARM(3) sh2_write16_dram0(u32 a, u32 d, SH2 *sh2) diff --git a/pico/32x/sh2soc.c b/pico/32x/sh2soc.c index 0f75d9b49..f8e657f5e 100644 --- a/pico/32x/sh2soc.c +++ b/pico/32x/sh2soc.c @@ -25,6 +25,9 @@ #include "../pico_int.h" #include "../memory.h" +#include "../../cpu/sh2/compiler.h" +DRC_DECLARE_SR; + // DMAC handling struct dma_chan { unsigned int sar, dar; // src, dst addr @@ -413,10 +416,12 @@ void REGPARM(3) sh2_peripheral_write32(u32 a, u32 d, SH2 *sh2) if (!(dmac->dmaor & DMA_DME)) return; + DRC_SAVE_SR(sh2); if 
((dmac->chan[0].chcr & (DMA_TE|DMA_DE)) == DMA_DE) dmac_trigger(sh2, &dmac->chan[0]); if ((dmac->chan[1].chcr & (DMA_TE|DMA_DE)) == DMA_DE) dmac_trigger(sh2, &dmac->chan[1]); + DRC_RESTORE_SR(sh2); } } From 2d133c17d6ff2488d4defd6feaf167feaeecc58a Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 29 Mar 2019 18:36:44 +0100 Subject: [PATCH 023/174] debug stuff, bug fixing --- cpu/drc/emit_arm.c | 10 +-- cpu/sh2/compiler.c | 215 +++++++++++++++++++++++++++++++++++---------- cpu/sh2/compiler.h | 4 +- pico/32x/memory.c | 14 +-- 4 files changed, 183 insertions(+), 60 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 86d8a41d0..632d476e9 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -510,12 +510,12 @@ static int emith_xbranch(int cond, void *target, int is_call) emith_top_imm(A_COND_AL, A_OP_TST, r, imm) #define emith_cmp_r_imm(r, imm) { \ - u32 op = A_OP_CMP, imm_ = imm; \ - if (~imm_ < 0x100) { \ - imm_ = -imm_; \ - op = A_OP_CMN; \ + u32 op_ = A_OP_CMP, imm_ = (u8)imm; \ + if ((s8)imm_ < 0) { \ + imm_ = (u8)-imm_; \ + op_ = A_OP_CMN; \ } \ - emith_top_imm(A_COND_AL, op, r, imm); \ + emith_top_imm(A_COND_AL, op_, r, imm_); \ } #define emith_subf_r_imm(r, imm) \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 3c5ce5b9e..800e9d329 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -53,6 +53,9 @@ // 04 - asm // 08 - runtime block entry log // 10 - smc self-check +// 100 - write trace +// 200 - compare trace +// 400 - print block entry backtrace // { #ifndef DRC_DEBUG #define DRC_DEBUG 0 @@ -73,6 +76,7 @@ static int insns_compiled, hash_collisions, host_insn_count; #define dbg(...) #endif + /// #define FETCH_OP(pc) \ dr_pc_base[(pc) / 2] @@ -147,13 +151,86 @@ static char sh2dasm_buff[64]; #define do_host_disasm(x) #endif -#if (DRC_DEBUG & 8) || defined(PDB) +#if (DRC_DEBUG & (8|256|512|1024)) || defined(PDB) + +#define SH2_DUMP(sh2, reason) { \ + char ms = (sh2)->is_slave ? 
's' : 'm'; \ + printf("%csh2 %s %08x\n", ms, reason, (sh2)->pc); \ + printf("%csh2 r0-7 %08x %08x %08x %08x %08x %08x %08x %08x\n", ms, \ + (sh2)->r[0], (sh2)->r[1], (sh2)->r[2], (sh2)->r[3], \ + (sh2)->r[4], (sh2)->r[5], (sh2)->r[6], (sh2)->r[7]); \ + printf("%csh2 r8-15 %08x %08x %08x %08x %08x %08x %08x %08x\n", ms, \ + (sh2)->r[8], (sh2)->r[9], (sh2)->r[10], (sh2)->r[11], \ + (sh2)->r[12], (sh2)->r[13], (sh2)->r[14], (sh2)->r[15]); \ + printf("%csh2 pc-ml %08x %08x %08x %08x %08x %08x %08x %08x\n", ms, \ + (sh2)->pc, (sh2)->ppc, (sh2)->pr, (sh2)->sr&0x3ff, \ + (sh2)->gbr, (sh2)->vbr, (sh2)->mach, (sh2)->macl); \ + printf("%csh2 tmp-p %08x %08x %08x %08x %08x %08x %08x %08x\n", ms, \ + (sh2)->drc_tmp, (sh2)->irq_cycles, \ + (sh2)->pdb_io_csum[0], (sh2)->pdb_io_csum[1], (sh2)->state, \ + (sh2)->poll_addr, (sh2)->poll_cycles, (sh2)->poll_cnt); \ +} +static SH2 csh2[2][4]; static void REGPARM(3) *sh2_drc_log_entry(void *block, SH2 *sh2, u32 sr) { if (block != NULL) { dbg(8, "= %csh2 enter %08x %p, c=%d", sh2->is_slave ? 
's' : 'm', sh2->pc, block, (signed int)sr >> 12); +#if defined PDB pdb_step(sh2, sh2->pc); +#elif (DRC_DEBUG & 256) + { + static FILE *trace[2]; + int idx = sh2->is_slave; +if (sh2 != &sh2s[0] && sh2 != &sh2s[1]) printf("sh2 %p?\n",sh2); + if (!trace[0]) { + truncate("pico.trace", 0); + trace[0] = fopen("pico.trace0", "wb"); + trace[1] = fopen("pico.trace1", "wb"); + } + if (csh2[idx][0].pc != sh2->pc) { + fwrite(sh2, offsetof(SH2, read8_map), 1, trace[idx]); + fwrite(&sh2->pdb_io_csum, sizeof(sh2->pdb_io_csum), 1, trace[idx]); + memcpy(&csh2[idx][0], sh2, offsetof(SH2, icount)); + } + } +#elif (DRC_DEBUG & 512) + { + static FILE *trace[2]; + static SH2 fsh2; + int idx = sh2->is_slave; + if (!trace[0]) { + trace[0] = fopen("pico.trace0", "rb"); + trace[1] = fopen("pico.trace1", "rb"); + } + if (csh2[idx][0].pc != sh2->pc) { + if (!fread(&fsh2, offsetof(SH2, read8_map), 1, trace[idx]) || + !fread(&fsh2.pdb_io_csum, sizeof(sh2->pdb_io_csum), 1, trace[idx])) { + printf("trace eof at %08lx\n",ftell(trace[idx])); + exit(1); + } + fsh2.sr = (fsh2.sr & 0xfff) | (sh2->sr & ~0xfff); + fsh2.is_slave = idx; + if (memcmp(&fsh2, sh2, offsetof(SH2, read8_map)) || + 0)//memcmp(&fsh2.pdb_io_csum, &sh2->pdb_io_csum, sizeof(sh2->pdb_io_csum))) + { + printf("difference at %08lx!\n",ftell(trace[idx])); + SH2_DUMP(&fsh2, "file"); + SH2_DUMP(sh2, "current"); + SH2_DUMP(&csh2[idx][0], "previous"); + exit(1); + } + csh2[idx][0] = fsh2; + } + } +#elif (DRC_DEBUG & 1024) + { + int x = sh2->is_slave, i; + for (i = 0; i < ARRAY_SIZE(csh2[x]); i++) + memcpy(&csh2[x][i], &csh2[x][i+1], offsetof(SH2, icount)); + memcpy(&csh2[x][3], sh2, offsetof(SH2, icount)); + } +#endif } return block; } @@ -759,13 +836,18 @@ static u32 dr_gcregs_mask; static u32 dr_gcregs_dirty; #if PROPAGATE_CONSTANTS +static void gconst_set(sh2_reg_e r, u32 val) +{ + dr_gcregs_mask |= 1 << r; + dr_gcregs[r] = val; +} + static void gconst_new(sh2_reg_e r, u32 val) { int i; - dr_gcregs_mask |= 1 << r; + gconst_set(r, val); 
dr_gcregs_dirty |= 1 << r; - dr_gcregs[r] = val; // throw away old r that we might have cached for (i = ARRAY_SIZE(reg_temp) - 1; i >= 0; i--) { @@ -818,6 +900,17 @@ static void gconst_kill(sh2_reg_e r) dr_gcregs_dirty &= ~(1 << r); } +#if PROPAGATE_CONSTANTS +static void gconst_copy(sh2_reg_e rd, sh2_reg_e rs) +{ + u32 val; + + gconst_kill(rd); + if (gconst_get(rs, &val)) + gconst_set(rd, val); +} +#endif + static void gconst_clean(void) { int i; @@ -1104,7 +1197,7 @@ static void rcache_unlock_all(void) reg_temp[i].flags &= ~HRF_LOCKED; } -#ifdef DRC_CMP +#if (DRC_DEBUG & (8|256|512|1024)) || defined(DRC_CMP) static u32 rcache_used_hreg_mask(void) { u32 mask = 0; @@ -1202,18 +1295,13 @@ static void emit_move_r_imm32(sh2_reg_e dst, u32 imm) static void emit_move_r_r(sh2_reg_e dst, sh2_reg_e src) { int hr_d, hr_s; - u32 val; + hr_s = rcache_get_reg(src, RC_GR_READ); + hr_d = rcache_get_reg(dst, RC_GR_WRITE); + emith_move_r_r(hr_d, hr_s); #if PROPAGATE_CONSTANTS - if (gconst_get(src, &val)) - gconst_new(dst, val); - else + gconst_copy(dst, src); #endif - { - hr_s = rcache_get_reg(src, RC_GR_READ); - hr_d = rcache_get_reg(dst, RC_GR_WRITE); - emith_move_r_r(hr_d, hr_s); - } } // T must be clear, and comparison done just before this @@ -1231,7 +1319,7 @@ static int emit_memhandler_read(int size) rcache_clean(); -#ifndef DCR_SR_REG +#ifndef DRC_SR_REG // must writeback cycles for poll detection stuff if (reg_map_g2h[SHR_SR] != -1) emith_ctx_write(reg_map_g2h[SHR_SR], SHR_SR * 4); @@ -1244,7 +1332,7 @@ static int emit_memhandler_read(int size) case 2: emith_call(sh2_drc_read32); break; // 32 } rcache_invalidate(); -#ifndef DCR_SR_REG +#ifndef DRC_SR_REG if (reg_map_g2h[SHR_SR] != -1) emith_ctx_read(reg_map_g2h[SHR_SR], SHR_SR * 4); #endif @@ -1255,7 +1343,7 @@ static int emit_memhandler_read(int size) static void emit_memhandler_write(int size) { int arg2; -#ifndef DCR_SR_REG +#ifndef DRC_SR_REG if (reg_map_g2h[SHR_SR] != -1) emith_ctx_write(reg_map_g2h[SHR_SR], SHR_SR * 
4); #endif @@ -1270,7 +1358,7 @@ static void emit_memhandler_write(int size) } rcache_invalidate(); -#ifndef DCR_SR_REG +#ifndef DRC_SR_REG if (reg_map_g2h[SHR_SR] != -1) emith_ctx_read(reg_map_g2h[SHR_SR], SHR_SR * 4); #endif @@ -1287,8 +1375,8 @@ static int emit_memhandler_read_rr(sh2_reg_e rd, sh2_reg_e rs, u32 offs, int siz hr2 = rcache_get_tmp(); emith_move_r_imm(hr2, val); } else { - gconst_new(rd, val); - hr2 = rcache_get_reg(rd, RC_GR_RMW); + emit_move_r_imm32(rd, val); + hr2 = rcache_get_reg(rd, RC_GR_READ); } return hr2; } @@ -1296,7 +1384,10 @@ static int emit_memhandler_read_rr(sh2_reg_e rd, sh2_reg_e rs, u32 offs, int siz if (gconst_get(rs, &val)) { hr = emit_get_rbase_and_offs(val + offs, &offs2); if (hr != -1) { - hr2 = rcache_get_reg(rd, RC_GR_WRITE); + if (rd == SHR_TMP) + hr2 = rcache_get_tmp(); + else + hr2 = rcache_get_reg(rd, RC_GR_WRITE); switch (size) { case 0: // 8 emith_read8s_r_r_offs(hr2, hr, offs2 ^ 1); @@ -1323,13 +1414,18 @@ static int emit_memhandler_read_rr(sh2_reg_e rd, sh2_reg_e rs, u32 offs, int siz emith_add_r_imm(hr, offs); } hr = emit_memhandler_read(size); - hr2 = rcache_get_reg(rd, RC_GR_WRITE); - if (size != 2) { + if (rd == SHR_TMP) + hr2 = hr; + else + hr2 = rcache_get_reg(rd, RC_GR_WRITE); + + if (rd != SHR_TMP && size != 2) { emith_sext(hr2, hr, (size == 1) ? 16 : 8); - } else + } else if (hr != hr2) emith_move_r_r(hr2, hr); - rcache_free_tmp(hr); + if (hr != hr2) + rcache_free_tmp(hr); return hr2; } @@ -1339,6 +1435,7 @@ static void emit_memhandler_write_rr(sh2_reg_e rd, sh2_reg_e rs, u32 offs, int s int hr; u32 val; + rcache_clean(); // XXX rcache_get_reg_arg(1, rd); if (gconst_get(rs, &val)) { @@ -1375,7 +1472,7 @@ static int emit_indirect_indexed_read(sh2_reg_e rd, sh2_reg_e rx, sh2_reg_e ry, else hr2 = hr; - if (size != 2) { // 16, 8 + if (rd != SHR_TMP && size != 2) { // 16, 8 emith_sext(hr2, hr, size ? 
16 : 8); } else if (hr != hr2) // 32 emith_move_r_r(hr2, hr); @@ -1397,6 +1494,7 @@ static void emit_indirect_indexed_write(sh2_reg_e rd, sh2_reg_e rx, sh2_reg_e ry if (gconst_get(rx, &offs)) return emit_memhandler_write_rr(rd, ry, offs, size); #endif + rcache_clean(); // XXX rcache_get_reg_arg(1, rd); a0 = rcache_get_reg_arg(0, rx); t = rcache_get_reg(ry, RC_GR_READ); @@ -1459,17 +1557,6 @@ static void emit_do_static_regs(int is_write, int tmpr) /* just after lookup function, jump to address returned */ static void emit_block_entry(void) { -#if (DRC_DEBUG & 8) || defined(PDB) - int arg1, arg2; - host_arg2reg(arg1, 1); - host_arg2reg(arg2, 2); - - emit_do_static_regs(1, arg2); - emith_move_r_r_ptr(arg1, CONTEXT_REG); - emith_move_r_r(arg2, rcache_get_reg(SHR_SR, RC_GR_READ)); - emith_call(sh2_drc_log_entry); - rcache_invalidate(); -#endif emith_tst_r_r_ptr(RET_REG, RET_REG); EMITH_SJMP_START(DCOND_EQ); emith_jump_reg_c(DCOND_NE, RET_REG); @@ -1675,6 +1762,24 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_jump_cond(DCOND_LE, sh2_drc_exit); do_host_disasm(tcache_id); rcache_unlock_all(); + +#if (DRC_DEBUG & (8|256|512|1024)) + emit_move_r_imm32(SHR_PC, pc); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + FLUSH_CYCLES(sr); + rcache_clean(); + tmp = rcache_used_hreg_mask(); + emith_save_caller_regs(tmp); + emit_do_static_regs(1, 0); + rcache_get_reg_arg(2, SHR_SR); + tmp2 = rcache_get_tmp_arg(0); + tmp3 = rcache_get_tmp_arg(1); + emith_move_r_imm(tmp2, (u32)tcache_ptr); + emith_move_r_r_ptr(tmp3,CONTEXT_REG); + emith_call(sh2_drc_log_entry); + emith_restore_caller_regs(tmp); + rcache_invalidate(); +#endif } #ifdef DRC_CMP @@ -1729,7 +1834,10 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_move_r_imm(tmp, pc); emith_tst_r_imm(sr, T); tmp2 = ops[i-1].op == OP_BRANCH_CT ? DCOND_NE : DCOND_EQ; + tmp3 = ops[i-1].op == OP_BRANCH_CT ? 
DCOND_EQ : DCOND_NE; + EMITH_SJMP_START(tmp3); emith_move_r_imm_c(tmp2, tmp, ops[i-1].imm); + EMITH_SJMP_END(tmp3); break; case OP_BRANCH_N: emit_move_r_imm32(SHR_PC, pc); @@ -1765,7 +1873,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) drcf.pending_branch_direct = 1; } else { emit_move_r_r(SHR_PC, opd->rm); - ops[i+1].source |= SHR_PC; // need PC for jump after delay slot drcf.pending_branch_indirect = 1; } goto end_op; @@ -1785,7 +1892,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_move_r_r(tmp3, tmp); } emith_add_r_r(tmp, tmp2); - ops[i+1].source |= SHR_PC; // need PC for jump after delay slot drcf.pending_branch_indirect = 1; } goto end_op; @@ -1813,7 +1919,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case OP_UNDEFINED: elprintf_sh2(sh2, EL_ANOMALY, "drc: illegal op %04x @ %08x", op, pc - 2); - opd->imm = 4; + opd->imm = (op_flags[i] & OF_B_IN_DS) ? 6 : 4; // fallthrough case OP_TRAPA: tmp = rcache_get_reg(SHR_SP, RC_GR_RMW); @@ -1827,7 +1933,13 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // push PC rcache_get_reg_arg(0, SHR_SP); tmp = rcache_get_tmp_arg(1); - emith_move_r_imm(tmp, pc); + if (op == OP_TRAPA) + emith_move_r_imm(tmp, pc); + else if (drcf.pending_branch_indirect) { + tmp2 = rcache_get_reg(SHR_PC, RC_GR_READ); + emith_move_r_r(tmp, tmp2); + } else + emith_move_r_imm(tmp, pc - 2); emit_memhandler_write(2); // obtain new PC emit_memhandler_read_rr(SHR_PC, SHR_VBR, opd->imm * 4, 2); @@ -1988,7 +2100,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto end_op; case 0x0f: // MAC.L @Rm+,@Rn+ 0000nnnnmmmm1111 emit_indirect_read_double(&tmp, &tmp2, GET_Rn(), GET_Rm(), 2); - sr = rcache_get_reg(SHR_SR, RC_GR_READ); tmp3 = rcache_get_reg(SHR_MACL, RC_GR_RMW); tmp4 = rcache_get_reg(SHR_MACH, RC_GR_RMW); @@ -2087,12 +2198,12 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x0e: // MULU.W Rm,Rn 0010nnnnmmmm1110 case 0x0f: // 
MULS.W Rm,Rn 0010nnnnmmmm1111 tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ); + tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ); tmp = rcache_get_reg(SHR_MACL, RC_GR_WRITE); if (op & 1) { emith_sext(tmp, tmp2, 16); } else emith_clear_msb(tmp, tmp2, 16); - tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ); tmp2 = rcache_get_tmp(); if (op & 1) { emith_sext(tmp2, tmp3, 16); @@ -2308,7 +2419,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW); emith_sub_r_imm(tmp2, 4); - rcache_clean(); + rcache_clean(); // XXX rcache_get_reg_arg(0, GET_Rn()); tmp3 = rcache_get_reg_arg(1, tmp); if (tmp == SHR_SR) @@ -2444,6 +2555,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_bic_r_imm(sr, T); emith_cmp_r_imm(tmp, 0); emit_or_t_if_eq(sr); + rcache_clean(); // XXX emith_or_r_imm(tmp, 0x80); tmp2 = rcache_get_tmp_arg(1); // assuming it differs to tmp emith_move_r_r(tmp2, tmp); @@ -2596,7 +2708,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto end_op; case 0x0800: // CMP/EQ #imm,R0 10001000iiiiiiii // XXX: could use cmn - tmp2 = rcache_get_reg(0, RC_GR_READ); + tmp2 = rcache_get_reg(SHR_R0, RC_GR_READ); sr = rcache_get_reg(SHR_SR, RC_GR_RMW); emith_bic_r_imm(sr, T); emith_cmp_r_imm(tmp2, (s8)(op & 0xff)); @@ -2679,10 +2791,11 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) default: default_: - if (!(op_flags[i] & OF_B_IN_DS)) + if (!(op_flags[i] & OF_B_IN_DS)) { elprintf_sh2(sh2, EL_ANOMALY, "drc: illegal op %04x @ %08x", op, pc - 2); exit(1); + } } end_op: @@ -3268,6 +3381,15 @@ void block_stats(void) void sh2_drc_flush_all(void) { +#if (DRC_DEBUG & 1024) + int i; + printf("backtrace master:\n"); + for (i = 0; i < ARRAY_SIZE(csh2[0]); i++) + SH2_DUMP(&csh2[0][i], "bt msh2"); + printf("backtrace slave:\n"); + for (i = 0; i < ARRAY_SIZE(csh2[1]); i++) + SH2_DUMP(&csh2[1][i], "bt ssh2"); +#endif block_stats(); flush_tcache(0); flush_tcache(1); @@ -4200,13 +4322,14 @@ void 
scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, if (op_flags[i] & OF_DELAY_OP) { switch (opd->op) { case OP_BRANCH: + case OP_BRANCH_N: case OP_BRANCH_CT: case OP_BRANCH_CF: case OP_BRANCH_R: case OP_BRANCH_RF: elprintf(EL_ANOMALY, "%csh2 drc: branch in DS @ %08x", is_slave ? 's' : 'm', pc); - opd->op = OP_UNHANDLED; + opd->op = OP_UNDEFINED; op_flags[i] |= OF_B_IN_DS; next_is_delay = 0; break; diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 70fdbf4ea..c9cf7ab0b 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -29,9 +29,9 @@ void scan_block(unsigned int base_pc, int is_slave, #if defined(DRC_SH2) // direct access to some host CPU registers used by the DRC // XXX MUST match definitions in cpu/sh2/compiler.c -#if defined(_arm__) +#if defined(__arm__) #define DRC_SR_REG r10 -#elif defined(__i386__) +#elif defined(__i386__) || defined(__x86_64__) #define DRC_SR_REG edi #else #warning "direct DRC register access not available for this host" diff --git a/pico/32x/memory.c b/pico/32x/memory.c index f82b9f99e..8f2a7c2fa 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -1344,7 +1344,7 @@ static u32 sh2_read16_cs0(u32 a, SH2 *sh2) static u32 sh2_read16_da(u32 a, SH2 *sh2) { - return ((u16 *)sh2->data_array)[(a & 0xfff) / 2]; + return ((u16 *)sh2->data_array)[(a & 0xffe) / 2]; } static u32 sh2_read16_rom(u32 a, SH2 *sh2) @@ -1367,7 +1367,7 @@ static u32 sh2_read32_cs0(u32 a, SH2 *sh2) static u32 sh2_read32_da(u32 a, SH2 *sh2) { - u32 d = *(u32 *)(sh2->data_array + (a & 0xfff)); + u32 d = *((u32 *)sh2->data_array + (a & 0xffc)/4); return (d << 16) | (d >> 16); } @@ -1587,28 +1587,28 @@ static void REGPARM(3) sh2_write32_dram1(u32 a, u32 d, SH2 *sh2) static void REGPARM(3) sh2_write32_sdram(u32 a, u32 d, SH2 *sh2) { - u32 a1 = a & 0x3ffff; + u32 a1 = a & 0x3fffc; *(u32 *)(sh2->p_sdram + a1) = (d << 16) | (d >> 16); #ifdef DRC_SH2 unsigned short *p = &Pico32xMem->drcblk_ram[a1 >> SH2_DRCBLK_RAM_SHIFT]; if (p[0]) 
sh2_drc_wcheck_ram(a, p[0], sh2->is_slave); if (p[1]) - sh2_drc_wcheck_ram(a, p[1], sh2->is_slave); + sh2_drc_wcheck_ram(a+2, p[1], sh2->is_slave); #endif } static void REGPARM(3) sh2_write32_da(u32 a, u32 d, SH2 *sh2) { - u32 a1 = a & 0xfff; - *(u32 *)(sh2->data_array + a1) = (d << 16) | (d >> 16); + u32 a1 = a & 0xffc; + *((u32 *)sh2->data_array + a1/4) = (d << 16) | (d >> 16); #ifdef DRC_SH2 int id = sh2->is_slave; unsigned short *p = &Pico32xMem->drcblk_da[id][a1 >> SH2_DRCBLK_DA_SHIFT]; if (p[0]) sh2_drc_wcheck_da(a, p[0], id); if (p[1]) - sh2_drc_wcheck_da(a, p[1], id); + sh2_drc_wcheck_da(a+2, p[1], id); #endif } From 24f21f3b8ac3187a3bfe0921653a50d4261e2484 Mon Sep 17 00:00:00 2001 From: kub Date: Mon, 1 Apr 2019 23:39:58 +0200 Subject: [PATCH 024/174] overhaul of the register cache (improves generated code by some 10+%) --- cpu/drc/emit_arm.c | 55 +- cpu/drc/emit_x86.c | 91 +- cpu/sh2/compiler.c | 1972 +++++++++++++++++++++++++++++--------------- cpu/sh2/compiler.h | 4 +- 4 files changed, 1455 insertions(+), 667 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 632d476e9..4421c6411 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -5,6 +5,7 @@ * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. 
*/ +#define HOST_REGS 16 #define CONTEXT_REG 11 #define RET_REG 0 @@ -406,9 +407,24 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_add_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_ADD_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) +#define emith_addf_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_ADD_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSL,lslimm) + #define emith_addf_r_r_r_lsr(d, s1, s2, lslimm) \ EOP_ADD_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSR,lslimm) +#define emith_adcf_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_ADC_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSL,lslimm) + +#define emith_sub_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_SUB_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) + +#define emith_subf_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_SUB_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSL,lslimm) + +#define emith_sbcf_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_SBC_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSL,lslimm) + #define emith_or_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_ORR_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) @@ -418,6 +434,9 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_eor_r_r_r_lsr(d, s1, s2, lsrimm) \ EOP_EOR_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSR,lsrimm) +#define emith_and_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_AND_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) + #define emith_or_r_r_lsl(d, s, lslimm) \ emith_or_r_r_r_lsl(d, d, s, lslimm) @@ -427,12 +446,30 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_add_r_r_r(d, s1, s2) \ emith_add_r_r_r_lsl(d, s1, s2, 0) +#define emith_addf_r_r_r(d, s1, s2) \ + emith_addf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_adcf_r_r_r(d, s1, s2) \ + emith_adcf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_sub_r_r_r(d, s1, s2) \ + emith_sub_r_r_r_lsl(d, s1, s2, 0) + +#define emith_subf_r_r_r(d, s1, s2) \ + emith_subf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_sbcf_r_r_r(d, s1, s2) \ + emith_sbcf_r_r_r_lsl(d, s1, s2, 0) + #define emith_or_r_r_r(d, s1, s2) \ emith_or_r_r_r_lsl(d, s1, s2, 0) #define emith_eor_r_r_r(d, s1, s2) \ 
emith_eor_r_r_r_lsl(d, s1, s2, 0) +#define emith_and_r_r_r(d, s1, s2) \ + emith_and_r_r_r_lsl(d, s1, s2, 0) + #define emith_add_r_r(d, s) \ emith_add_r_r_r(d, d, s) @@ -539,11 +576,14 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_bic_r_imm_c(cond, r, imm) \ emith_op_imm(cond, 0, A_OP_BIC, r, imm) +#define emith_tst_r_imm_c(cond, r, imm) \ + emith_top_imm(cond, A_OP_TST, r, imm) + #define emith_move_r_imm_s8(r, imm) { \ - if ((imm) & 0x80) \ - EOP_MVN_IMM(r, 0, ((imm) ^ 0xff)); \ + if ((s8)(imm) < 0) \ + EOP_MVN_IMM(r, 0, ((u8)(imm) ^ 0xff)); \ else \ - EOP_MOV_IMM(r, 0, imm); \ + EOP_MOV_IMM(r, 0, (u8)imm); \ } #define emith_and_r_r_imm(d, s, imm) \ @@ -558,6 +598,15 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_sub_r_r_imm(d, s, imm) \ emith_op_imm2(A_COND_AL, 0, A_OP_SUB, d, s, imm) +#define emith_subf_r_r_imm(d, s, imm) \ + emith_op_imm2(A_COND_AL, 1, A_OP_SUB, d, s, (imm)) + +#define emith_or_r_r_imm(d, s, imm) \ + emith_op_imm2(A_COND_AL, 0, A_OP_ORR, d, s, (imm)) + +#define emith_eor_r_r_imm(d, s, imm) \ + emith_op_imm2(A_COND_AL, 0, A_OP_EOR, d, s, (imm)) + #define emith_neg_r_r(d, s) \ EOP_RSB_IMM(d, s, 0, 0) diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 01702e0c2..4f9dd5a71 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -15,6 +15,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; +#define HOST_REGS 8 #define CONTEXT_REG xBP #define RET_REG xAX @@ -185,6 +186,61 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; } \ } while (0) +#define emith_sub_r_r_r(d, s1, s2) do { \ + if (d == s1) { \ + emith_sub_r_r(d, s2); \ + } else if (d == s2) { \ + emith_sub_r_r(d, s1); \ + } else { \ + emith_move_r_r(d, s1); \ + emith_sub_r_r(d, s2); \ + } \ +} while (0) + +#define emith_adc_r_r_r(d, s1, s2) do { \ + if (d == s1) { \ + emith_adc_r_r(d, s2); \ + } else if (d == s2) { \ + emith_adc_r_r(d, s1); \ + } else { \ + emith_move_r_r(d, s1); \ + emith_adc_r_r(d, s2); \ + 
} \ +} while (0) + +#define emith_sbc_r_r_r(d, s1, s2) do { \ + if (d == s1) { \ + emith_sbc_r_r(d, s2); \ + } else if (d == s2) { \ + emith_sbc_r_r(d, s1); \ + } else { \ + emith_move_r_r(d, s1); \ + emith_sbc_r_r(d, s2); \ + } \ +} while (0) + +#define emith_and_r_r_r(d, s1, s2) do { \ + if (d == s1) { \ + emith_and_r_r(d, s2); \ + } else if (d == s2) { \ + emith_and_r_r(d, s1); \ + } else { \ + emith_move_r_r(d, s1); \ + emith_and_r_r(d, s2); \ + } \ +} while (0) + +#define emith_or_r_r_r(d, s1, s2) do { \ + if (d == s1) { \ + emith_or_r_r(d, s2); \ + } else if (d == s2) { \ + emith_or_r_r(d, s1); \ + } else { \ + emith_move_r_r(d, s1); \ + emith_or_r_r(d, s2); \ + } \ +} while (0) + #define emith_eor_r_r_r(d, s1, s2) do { \ if (d == s1) { \ emith_eor_r_r(d, s2); \ @@ -281,6 +337,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; emith_eor_r_imm(r, imm) #define emith_bic_r_imm_c(cond, r, imm) \ emith_bic_r_imm(r, imm) +#define emith_tst_r_imm_c(cond, r, imm) \ + emith_tst_r_imm(r, imm) #define emith_ror_c(cond, d, s, cnt) \ emith_ror(d, s, cnt) @@ -324,12 +382,33 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; EMIT(imm, s32); \ } while (0) +#define emith_sub_r_r_imm(d, s, imm) do { \ + if (d != s) \ + emith_move_r_r(d, s); \ + if (imm) \ + emith_sub_r_imm(d, imm); \ +} while (0) + #define emith_and_r_r_imm(d, s, imm) do { \ if (d != s) \ emith_move_r_r(d, s); \ emith_and_r_imm(d, imm); \ } while (0) +#define emith_or_r_r_imm(d, s, imm) do { \ + if (d != s) \ + emith_move_r_r(d, s); \ + if ((s32)imm != 0) \ + emith_or_r_imm(d, imm); \ +} while (0) + +#define emith_eor_r_r_imm(d, s, imm) do { \ + if (d != s) \ + emith_move_r_r(d, s); \ + if ((s32)imm != 0) \ + emith_eor_r_imm(d, imm); \ +} while (0) + // shift #define emith_shift(op, d, s, cnt) do { \ if (d != s) \ @@ -456,6 +535,14 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_eorf_r_r emith_eor_r_r #define emith_negcf_r_r emith_negc_r_r +#define emith_subf_r_r_imm 
emith_sub_r_r_imm +#define emith_addf_r_r_r emith_add_r_r_r +#define emith_subf_r_r_r emith_sub_r_r_r +#define emith_adcf_r_r_r emith_adc_r_r_r +#define emith_sbcf_r_r_r emith_sbc_r_r_r +#define emith_eorf_r_r_r emith_eor_r_r_r +#define emith_addf_r_r_r_lsr emith_add_r_r_r_lsr + #define emith_lslf emith_lsl #define emith_lsrf emith_lsr #define emith_asrf emith_asr @@ -705,7 +792,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; case 0: rd = xDI; break; \ case 1: rd = xSI; break; \ case 2: rd = xDX; break; \ - case 3: rd = xBX; break; \ + default: rd = xCX; break; \ } #define emith_sh2_drc_entry() { \ @@ -728,6 +815,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; case 0: rd = xCX; break; \ case 1: rd = xDX; break; \ case 2: rd = 8; break; \ + default: rd = 9; break; \ } #define emith_sh2_drc_entry() { \ @@ -764,6 +852,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; case 0: rd = xAX; break; \ case 1: rd = xDX; break; \ case 2: rd = xCX; break; \ + default: rd = xBX; break; \ } #define emith_sh2_drc_entry() { \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 800e9d329..1b300cc3b 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -38,6 +38,8 @@ // features #define PROPAGATE_CONSTANTS 1 #define LINK_BRANCHES 1 +#define ALIAS_REGISTERS 1 +#define REMAP_REGISTER 1 // limits (per block) #define MAX_BLOCK_SIZE (BLOCK_INSN_LIMIT * 6 * 6) @@ -103,6 +105,7 @@ static int insns_compiled, hash_collisions, host_insn_count; #define BITMASK4(v0,v1,v2,v3) (BITMASK3(v0,v1,v2) | (1 << (v3))) #define BITMASK5(v0,v1,v2,v3,v4) (BITMASK4(v0,v1,v2,v3) | (1 << (v4))) #define BITMASK6(v0,v1,v2,v3,v4,v5) (BITMASK5(v0,v1,v2,v3,v4) | (1 << (v5))) +#define BITRANGE(v0,v1) (BITMASK1(v1+1)-BITMASK1(v0)) // set with v0..v1 #define SHR_T SHR_SR // might make them separate someday #define SHR_MEM 31 @@ -137,6 +140,11 @@ enum op_types { OP_UNDEFINED, }; +#define OP_ISBRANCH(op) (BITRANGE(OP_BRANCH, OP_BRANCH_RF) & BITMASK1(op)) +#define OP_ISBRAUC(op) 
(BITMASK4(OP_BRANCH, OP_BRANCH_R, OP_BRANCH_RF, OP_RTE) \ + & BITMASK1(op)) +#define OP_ISBRACND(op) (BITMASK2(OP_BRANCH_CT, OP_BRANCH_CF) & BITMASK1(op)) + #ifdef DRC_SH2 static int literal_disabled_frames; @@ -294,9 +302,9 @@ static int block_counts[TCACHE_BUFFERS]; // we have block_link_pool to avoid using mallocs static const int block_link_pool_max_counts[TCACHE_BUFFERS] = { - 4*1024, - 256, - 256, + 16*1024, + 4*256, + 4*256, }; static struct block_link *block_link_pool[TCACHE_BUFFERS]; static int block_link_pool_counts[TCACHE_BUFFERS]; @@ -332,119 +340,148 @@ static struct block_entry **hash_tables[TCACHE_BUFFERS]; // host register tracking enum { HR_FREE, - HR_CACHED, // 'val' has sh2_reg_e -// HR_CONST, // 'val' has a constant + HR_STATIC, // vreg has a static mapping + HR_CACHED, // vreg has sh2_reg_e HR_TEMP, // reg used for temp storage -}; +} cach_reg_type; enum { - HRF_DIRTY = 1 << 0, // reg has "dirty" value to be written to ctx - HRF_LOCKED = 1 << 1, // HR_CACHED can't be evicted -}; + HRF_DIRTY = 1 << 0, // has "dirty" value to be written to ctx + HRF_LOCKED = 1 << 1, // can't be evicted + HRF_TEMP = 1 << 2, // is for temps and args + HRF_REG = 1 << 3, // is for sh2 regs +} cache_reg_flags; typedef struct { - u32 hreg:5; // "host" reg - u32 greg:5; // "guest" reg - u32 type:3; - u32 flags:3; - u32 stamp:16; // kind of a timestamp -} temp_reg_t; - -// note: reg_temp[] must have at least the amount of -// registers used by handlers in worst case (currently 4) -#ifdef __arm__ -#include "../drc/emit_arm.c" + u8 hreg; // "host" reg + u8 flags:4; // TEMP or REG? 
+ u8 type:4; + u16 stamp; // kind of a timestamp + u32 gregs; // "guest" reg mask +} cache_reg_t; + +// guest register tracking +enum { + GRF_DIRTY = 1 << 0, // reg has "dirty" value to be written to ctx + GRF_CONST = 1 << 1, // reg has a constant + GRF_CDIRTY = 1 << 2, // constant not yet written to ctx + GRF_STATIC = 1 << 3, // reg has static mapping to vreg +} guest_reg_flags; -#ifndef __MACH__ +typedef struct { + u16 flags; // guest flags: is constant, is dirty? + s8 sreg; // cache reg for static mapping + s8 vreg; // cache_reg this is currently mapped to, -1 if not mapped + u32 val; // value if this is constant +} guest_reg_t; -static const int reg_map_g2h[] = { - 4, 5, 6, 7, - 8, -1, -1, -1, - -1, -1, -1, -1, - -1, -1, -1, 9, // r12 .. sp - -1, -1, -1, 10, // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, - -1, -1, -1, -1, // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, -}; -#else +// note: cache_regs[] must have at least the amount of +// HRF_REG registers used by handlers in worst case (currently 4) +#ifdef __arm__ +#include "../drc/emit_arm.c" -// no r9.. -static const int reg_map_g2h[] = { - 4, 5, 6, 7, - -1, -1, -1, -1, - -1, -1, -1, -1, - -1, -1, -1, 8, // r12 .. sp - -1, -1, -1, 10, // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, - -1, -1, -1, -1, // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, +// register assigment goes by ABI convention. All caller save registers are TEMP +// the others are either static or REG. SR must be static, R0 very recommended +static guest_reg_t guest_regs[] = { + // SHR_R0 .. SHR_SP +#ifndef __MACH__ // no r9.. + { GRF_STATIC, 8 }, { GRF_STATIC, 9 }, { 0 } , { 0 } , +#else + { GRF_STATIC, 8 }, { 0 } , { 0 } , { 0 } , +#endif + { 0 } , { 0 } , { 0 } , { 0 } , + { 0 } , { 0 } , { 0 } , { 0 } , + { 0 } , { 0 } , { 0 } , { 0 } , + // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, + // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, + { 0 } , { 0 } , { 0 } , { GRF_STATIC, 10 }, + { 0 } , { 0 } , { 0 } , { 0 } , }; +// NB first TEMP, then REG. 
alloc/evict algorithm depends on this +static cache_reg_t cache_regs[] = { + { 12, HRF_TEMP }, + { 14, HRF_TEMP }, + { 0, HRF_TEMP }, + { 1, HRF_TEMP }, + { 2, HRF_TEMP }, + { 3, HRF_TEMP }, + { 8, HRF_LOCKED }, +#ifndef __MACH__ // no r9.. + { 9, HRF_LOCKED }, #endif - -static temp_reg_t reg_temp[] = { - { 0, }, - { 1, }, - { 12, }, - { 14, }, - { 2, }, - { 3, }, + { 10, HRF_LOCKED }, + { 4, HRF_REG }, + { 5, HRF_REG }, + { 6, HRF_REG }, + { 7, HRF_REG }, }; #elif defined(__i386__) #include "../drc/emit_x86.c" -static const int reg_map_g2h[] = { - xSI,-1, -1, -1, - -1, -1, -1, -1, - -1, -1, -1, -1, - -1, -1, -1, -1, // r12 .. sp - -1, -1, -1, xDI, // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, - -1, -1, -1, -1, // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, +static guest_reg_t guest_regs[] = { + // SHR_R0 .. SHR_SP + {GRF_STATIC, xSI}, { 0 } , { 0 } , { 0 } , + { 0 } , { 0 } , { 0 } , { 0 } , + { 0 } , { 0 } , { 0 } , { 0 } , + { 0 } , { 0 } , { 0 } , { 0 } , + // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, + // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, + { 0 } , { 0 } , { 0 } , {GRF_STATIC, xDI}, + { 0 } , { 0 } , { 0 } , { 0 } , }; // ax, cx, dx are usually temporaries by convention -static temp_reg_t reg_temp[] = { - { xAX, }, - { xBX, }, - { xCX, }, - { xDX, }, +static cache_reg_t cache_regs[] = { + { xBX, HRF_REG|HRF_TEMP }, + { xCX, HRF_REG|HRF_TEMP }, + { xDX, HRF_REG|HRF_TEMP }, + { xAX, HRF_REG|HRF_TEMP }, + { xSI, HRF_LOCKED }, + { xDI, HRF_LOCKED }, }; #elif defined(__x86_64__) #include "../drc/emit_x86.c" -static const int reg_map_g2h[] = { +static guest_reg_t guest_regs[] = { + // SHR_R0 .. SHR_SP #ifndef _WIN32 - -1, -1, -1, -1, - -1, -1, -1, -1, - -1, -1, -1, -1, - -1, -1, -1, -1, // r12 .. sp - -1, -1, -1, xBX, // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, - -1, -1, -1, -1, // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, + { 0 } , { 0 } , { 0 } , { 0 } , #else - xDI,-1, -1, -1, - -1, -1, -1, -1, - -1, -1, -1, -1, - -1, -1, -1, -1, // r12 .. 
sp - -1, -1, -1, xBX, // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, - -1, -1, -1, -1, // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, + {GRF_STATIC, xDI}, { 0 } , { 0 } , { 0 } , #endif + { 0 } , { 0 } , { 0 } , { 0 } , + { 0 } , { 0 } , { 0 } , { 0 } , + { 0 } , { 0 } , { 0 } , { 0 } , + // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, + // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, + { 0 } , { 0 } , { 0 } , {GRF_STATIC, xBX}, + { 0 } , { 0 } , { 0 } , { 0 } , }; // ax, cx, dx are usually temporaries by convention -static temp_reg_t reg_temp[] = { - { xAX, }, - { xCX, }, - { xDX, }, - { xSI, }, +static cache_reg_t cache_regs[] = { + { xCX, HRF_REG|HRF_TEMP }, + { xDX, HRF_REG|HRF_TEMP }, + { xAX, HRF_REG|HRF_TEMP }, + { xSI, HRF_REG|HRF_TEMP }, #ifndef _WIN32 - { xDI, }, + { xDI, HRF_REG|HRF_TEMP }, +#else + { xDI, HRF_LOCKED }, #endif + { xBX, HRF_LOCKED }, }; #else #error unsupported arch #endif +static signed char reg_map_host[HOST_REGS]; + #define T 0x00000001 #define S 0x00000002 #define I 0x000000f0 @@ -468,6 +505,11 @@ static void REGPARM(2) (*sh2_drc_write8)(u32 a, u32 d); static void REGPARM(2) (*sh2_drc_write16)(u32 a, u32 d); static void REGPARM(3) (*sh2_drc_write32)(u32 a, u32 d, SH2 *sh2); +// flags for memory access +#define MF_SIZEMASK 0x03 // size of access +#define MF_POSTINCR 0x10 // post increment (for read_rr) +#define MF_PREDECR MF_POSTINCR // pre decrement (for write_rr) + // address space stuff static int dr_is_rom(u32 a) { @@ -801,12 +843,13 @@ static void dr_link_blocks(struct block_entry *be, int tcache_id) #endif } -#define ADD_TO_ARRAY(array, count, item, failcode) \ +#define ADD_TO_ARRAY(array, count, item, failcode) { \ if (count >= ARRAY_SIZE(array)) { \ dbg(1, "warning: " #array " overflow"); \ failcode; \ - } \ - array[count++] = item; + } else \ + array[count++] = item; \ +} static int find_in_array(u32 *array, size_t size, u32 what) { @@ -820,6 +863,11 @@ static int find_in_array(u32 *array, size_t size, u32 what) // 
--------------------------------------------------------------- +// NB rcache allocation dependencies: +// - get_reg_arg/get_tmp_arg first (might evict other regs just allocated) +// - get_reg(..., NULL) before get_reg(..., &x) if it might get the same reg +// - get_reg(..., RC_GR_READ/RMW, ...) before WRITE (might evict needed reg) + // register cache / constant propagation stuff typedef enum { RC_GR_READ, @@ -827,43 +875,57 @@ typedef enum { RC_GR_RMW, } rc_gr_mode; -static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking); - -// guest regs with constants -static u32 dr_gcregs[24]; -// a mask of constant/dirty regs -static u32 dr_gcregs_mask; -static u32 dr_gcregs_dirty; +static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr); +static void rcache_remove_vreg_alias(int x, sh2_reg_e r); + +#define RCACHE_DUMP(msg) { \ + cache_reg_t *cp; \ + guest_reg_t *gp; \ + int i; \ + printf("cache dump %s:\n",msg); \ + printf("cache_regs:\n"); \ + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { \ + cp = &cache_regs[i]; \ + if (cp->type != HR_FREE || cp->gregs) \ + printf("%d: hr=%d t=%d f=%x m=%x\n", i, cp->hreg, cp->type, cp->flags, cp->gregs); \ + } \ + printf("guest_regs:\n"); \ + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { \ + gp = &guest_regs[i]; \ + if (gp->vreg != -1 || gp->sreg >= 0) \ + printf("%d: v=%d f=%x s=%d\n", i, gp->vreg, gp->flags, gp->sreg); \ + } \ +} #if PROPAGATE_CONSTANTS static void gconst_set(sh2_reg_e r, u32 val) { - dr_gcregs_mask |= 1 << r; - dr_gcregs[r] = val; + guest_regs[r].flags |= GRF_CONST; + guest_regs[r].val = val; } static void gconst_new(sh2_reg_e r, u32 val) { - int i; - gconst_set(r, val); - dr_gcregs_dirty |= 1 << r; + guest_regs[r].flags |= GRF_CDIRTY; // throw away old r that we might have cached - for (i = ARRAY_SIZE(reg_temp) - 1; i >= 0; i--) { - if ((reg_temp[i].type == HR_CACHED) && - reg_temp[i].greg == r) { - reg_temp[i].type = HR_FREE; - reg_temp[i].flags = 0; - } - } + if 
(guest_regs[r].vreg >= 0) + rcache_remove_vreg_alias(guest_regs[r].vreg, r); +} + +static void gconst_copy(sh2_reg_e rd, sh2_reg_e rs) +{ + guest_regs[rd].flags &= ~(GRF_CONST|GRF_CDIRTY); + if (guest_regs[rs].flags & GRF_CONST) + gconst_set(rd, guest_regs[rs].val); } #endif static int gconst_get(sh2_reg_e r, u32 *val) { - if (dr_gcregs_mask & (1 << r)) { - *val = dr_gcregs[r]; + if (guest_regs[r].flags & GRF_CONST) { + *val = guest_regs[r].val; return 1; } return 0; @@ -871,7 +933,7 @@ static int gconst_get(sh2_reg_e r, u32 *val) static int gconst_check(sh2_reg_e r) { - if ((dr_gcregs_mask | dr_gcregs_dirty) & (1 << r)) + if (guest_regs[r].flags & (GRF_CONST|GRF_CDIRTY)) return 1; return 0; } @@ -879,68 +941,182 @@ static int gconst_check(sh2_reg_e r) // update hr if dirty, else do nothing static int gconst_try_read(int hr, sh2_reg_e r) { - if (dr_gcregs_dirty & (1 << r)) { - emith_move_r_imm(hr, dr_gcregs[r]); - dr_gcregs_dirty &= ~(1 << r); + if (guest_regs[r].flags & GRF_CDIRTY) { + emith_move_r_imm(hr, guest_regs[r].val); + guest_regs[r].flags &= ~GRF_CDIRTY; return 1; } return 0; } -static void gconst_check_evict(sh2_reg_e r) +static u32 gconst_dirty_mask(void) { - if (dr_gcregs_mask & (1 << r)) - // no longer cached in reg, make dirty again - dr_gcregs_dirty |= 1 << r; + u32 mask = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) + if (guest_regs[i].flags & GRF_CDIRTY) + mask |= (1 << i); + return mask; } static void gconst_kill(sh2_reg_e r) { - dr_gcregs_mask &= ~(1 << r); - dr_gcregs_dirty &= ~(1 << r); + guest_regs[r].flags &= ~(GRF_CONST|GRF_CDIRTY); } -#if PROPAGATE_CONSTANTS -static void gconst_copy(sh2_reg_e rd, sh2_reg_e rs) -{ - u32 val; - - gconst_kill(rd); - if (gconst_get(rs, &val)) - gconst_set(rd, val); -} -#endif - static void gconst_clean(void) { int i; - for (i = 0; i < ARRAY_SIZE(dr_gcregs); i++) - if (dr_gcregs_dirty & (1 << i)) { + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) + if (guest_regs[i].flags & GRF_CDIRTY) { // using 
RC_GR_READ here: it will call gconst_try_read, // cache the reg and mark it dirty. - rcache_get_reg_(i, RC_GR_READ, 0); + rcache_get_reg_(i, RC_GR_READ, 0, NULL); } } static void gconst_invalidate(void) { - dr_gcregs_mask = dr_gcregs_dirty = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) + guest_regs[i].flags &= ~(GRF_CONST|GRF_CDIRTY); } static u16 rcache_counter; +static u32 rcache_static; +static u32 rcache_locked; +static u32 rcache_hint_soon; +static u32 rcache_hint_late; +#define rcache_hint (rcache_hint_soon|rcache_hint_late) + +// binary search approach, since we don't have CLZ on ARM920T +#define FOR_ALL_BITS_SET_DO(mask, bit, code) { \ + u32 __mask = mask; \ + for (bit = 31; bit >= 0 && mask; bit--, __mask <<= 1) { \ + if (!(__mask & (0xffff << 16))) \ + bit -= 16, __mask <<= 16; \ + if (!(__mask & (0xff << 24))) \ + bit -= 8, __mask <<= 8; \ + if (!(__mask & (0xf << 28))) \ + bit -= 4, __mask <<= 4; \ + if (!(__mask & (0x3 << 30))) \ + bit -= 2, __mask <<= 2; \ + if (!(__mask & (0x1 << 31))) \ + bit -= 1, __mask <<= 1; \ + if (__mask & (0x1 << 31)) { \ + code; \ + } \ + } \ +} + +static void rcache_unmap_vreg(int x) +{ + int i; + + FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, i, + guest_regs[i].vreg = -1); + if (cache_regs[x].type != HR_STATIC) + cache_regs[x].type = HR_FREE; + cache_regs[x].gregs = 0; + cache_regs[x].flags &= (HRF_REG|HRF_TEMP); +} + +static void rcache_clean_vreg(int x) +{ + int r; + + if (cache_regs[x].flags & HRF_DIRTY) { // writeback + cache_regs[x].flags &= ~HRF_DIRTY; + FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, r, + if (guest_regs[r].flags & GRF_DIRTY) { + if (guest_regs[r].flags & GRF_STATIC) { + if (guest_regs[r].vreg != guest_regs[r].sreg) { + if (!(cache_regs[guest_regs[r].sreg].flags & HRF_LOCKED)) { + // statically mapped reg not in its sreg. 
move back to sreg + rcache_clean_vreg(guest_regs[r].sreg); + rcache_unmap_vreg(guest_regs[r].sreg); + emith_move_r_r(cache_regs[guest_regs[r].sreg].hreg, cache_regs[guest_regs[r].vreg].hreg); + rcache_remove_vreg_alias(x, r); + cache_regs[guest_regs[r].sreg].gregs = (1 << r); + guest_regs[r].vreg = guest_regs[r].sreg; + } else { + // must evict since sreg is locked + emith_ctx_write(cache_regs[x].hreg, r * 4); + guest_regs[r].vreg = -1; + } + } + } else + emith_ctx_write(cache_regs[x].hreg, r * 4); + } + guest_regs[r].flags &= ~GRF_DIRTY;) + } +} + +static void rcache_remove_vreg_alias(int x, sh2_reg_e r) +{ + cache_regs[x].gregs &= ~(1 << r); + if (!cache_regs[x].gregs) { + // no reg mapped -> free vreg + if (cache_regs[x].type != HR_STATIC) + cache_regs[x].type = HR_FREE; + cache_regs[x].flags &= (HRF_REG|HRF_TEMP); + } + guest_regs[r].vreg = -1; +} + +static void rcache_evict_vreg(int x) +{ + rcache_clean_vreg(x); + rcache_unmap_vreg(x); +} -static temp_reg_t *rcache_evict(void) +static void rcache_evict_vreg_aliases(int x, sh2_reg_e r) { - // evict reg with oldest stamp - int i, oldest = -1; + cache_regs[x].gregs &= ~(1 << r); + rcache_evict_vreg(x); + cache_regs[x].gregs = (1 << r); + if (cache_regs[x].type != HR_STATIC) + cache_regs[x].type = HR_CACHED; + if (guest_regs[r].flags & GRF_DIRTY) + cache_regs[x].flags |= HRF_DIRTY; +} + +static cache_reg_t *rcache_evict(void) +{ + // evict reg with oldest stamp (only for HRF_REG, no temps) + int i, i_prio, oldest = -1, prio = 0; u16 min_stamp = (u16)-1; - for (i = 0; i < ARRAY_SIZE(reg_temp); i++) { - if (reg_temp[i].type == HR_CACHED && !(reg_temp[i].flags & HRF_LOCKED) && - reg_temp[i].stamp <= min_stamp) { - min_stamp = reg_temp[i].stamp; + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { + // consider only unlocked REG + if (!(cache_regs[i].flags & HRF_REG) || (cache_regs[i].flags & HRF_LOCKED)) + continue; + if (cache_regs[i].type == HR_FREE || (cache_regs[i].type == HR_TEMP)) { oldest = i; + break; + } + if 
(cache_regs[i].type == HR_CACHED) { + if (rcache_locked & cache_regs[i].gregs) + // REGs needed for the current insn + i_prio = 1; + else if (rcache_hint_soon & cache_regs[i].gregs) + // REGs needed in some future insn + i_prio = 2; + else if (rcache_hint_late & cache_regs[i].gregs) + // REGs needed in some future insn + i_prio = 3; + else + // REGs not needed soon + i_prio = 4; + + if (prio < i_prio || (prio == i_prio && cache_regs[i].stamp < min_stamp)) { + min_stamp = cache_regs[i].stamp; + oldest = i; + prio = i_prio; + } } } @@ -949,110 +1125,254 @@ static temp_reg_t *rcache_evict(void) exit(1); } - i = oldest; - if (reg_temp[i].type == HR_CACHED) { - if (reg_temp[i].flags & HRF_DIRTY) - // writeback - emith_ctx_write(reg_temp[i].hreg, reg_temp[i].greg * 4); - gconst_check_evict(reg_temp[i].greg); + if (cache_regs[oldest].type == HR_CACHED) + rcache_evict_vreg(oldest); + cache_regs[oldest].type = HR_FREE; + cache_regs[oldest].flags &= (HRF_TEMP|HRF_REG); + cache_regs[oldest].gregs = 0; + + return &cache_regs[oldest]; +} + +#if REMAP_REGISTER +// maps a host register to a REG +static int rcache_map_reg(sh2_reg_e r, int hr, int mode) +{ + int i; + + gconst_kill(r); + + // lookup the TEMP hr maps to + i = reg_map_host[hr]; + if (i < 0) { + // must not happen + printf("invalid host register %d\n", hr); + exit(1); + } + + // deal with statically mapped regs + if (mode == RC_GR_RMW && (guest_regs[r].flags & GRF_STATIC)) { + if (guest_regs[r].vreg == guest_regs[r].sreg) { + // STATIC in its sreg with no aliases, and some processing pending + if (cache_regs[guest_regs[r].vreg].gregs == 1 << r) + return cache_regs[guest_regs[r].vreg].hreg; + } else if (!cache_regs[guest_regs[r].sreg].gregs) + // STATIC not in its sreg, with sreg available -> move it + i = guest_regs[r].sreg; } - reg_temp[i].type = HR_FREE; - reg_temp[i].flags = 0; - return &reg_temp[i]; + // remove old mappings of r and i if one exists + if (guest_regs[r].vreg >= 0) +
rcache_remove_vreg_alias(guest_regs[r].vreg, r); + if (cache_regs[i].type == HR_CACHED) + rcache_unmap_vreg(i); + // set new mappping + if (cache_regs[i].type != HR_STATIC) + cache_regs[i].type = HR_CACHED; + cache_regs[i].gregs = 1 << r; + cache_regs[i].flags &= (HRF_TEMP|HRF_REG); + cache_regs[i].stamp = ++rcache_counter; + cache_regs[i].flags |= HRF_DIRTY|HRF_LOCKED; + guest_regs[r].flags |= GRF_DIRTY; + guest_regs[r].vreg = i; + return cache_regs[i].hreg; } -static int get_reg_static(sh2_reg_e r, rc_gr_mode mode) +// remap vreg from a TEMP to a REG if it is hinted (upcoming TEMP invalidation) +static void rcache_remap_vreg(int r) { - int i = reg_map_g2h[r]; - if (i != -1) { - if (mode != RC_GR_WRITE) - gconst_try_read(i, r); + int i, j, free = -1, cached = -1, hinted = -1; + u16 min_stamp_cached = (u16)-1, min_stamp_hinted = -1; + + // r must be a vreg + if (cache_regs[r].type != HR_CACHED) + return; + // if r is already a REG or isn't used, clean here to avoid data loss on inval + if ((cache_regs[r].flags & HRF_REG) || !(rcache_hint & cache_regs[r].gregs)) { + rcache_clean_vreg(r); + return; } - return i; + + // find REG, either free or unused temp or oldest cached + for (i = 0; i < ARRAY_SIZE(cache_regs) && free < 0; i++) { + if ((cache_regs[i].flags & HRF_TEMP) || (cache_regs[i].flags & HRF_LOCKED)) + continue; + if (cache_regs[i].type == HR_FREE || cache_regs[i].type == HR_TEMP) + free = i; + if (cache_regs[i].type == HR_CACHED && !(rcache_hint & cache_regs[i].gregs)) { + if (cache_regs[i].stamp < min_stamp_cached) { + min_stamp_cached = cache_regs[i].stamp; + cached = i; + } + } + if (cache_regs[i].type == HR_CACHED && !(rcache_hint_soon & cache_regs[i].gregs) + && (rcache_hint_soon & cache_regs[r].gregs)) + if (cache_regs[i].stamp < min_stamp_hinted) { + min_stamp_hinted = cache_regs[i].stamp; + hinted = i; + } + } + + if (free >= 0) { + i = free; + } else if (cached >= 0 && cached != r) { + i = cached; + rcache_evict_vreg(i); + } else if (hinted >= 0 && 
hinted != r) { + i = hinted; + rcache_evict_vreg(i); + } else { + rcache_clean_vreg(r); + return; + } + + // set new mapping and remove old one + cache_regs[i].type = HR_CACHED; + cache_regs[i].gregs = cache_regs[r].gregs; + cache_regs[i].flags &= (HRF_TEMP|HRF_REG); + cache_regs[i].flags |= cache_regs[r].flags & ~(HRF_TEMP|HRF_REG); + cache_regs[i].stamp = cache_regs[r].stamp; + emith_move_r_r(cache_regs[i].hreg, cache_regs[r].hreg); + for (j = 0; j < ARRAY_SIZE(guest_regs); j++) + if (guest_regs[j].vreg == r) + guest_regs[j].vreg = i; + cache_regs[r].type = HR_FREE; + cache_regs[r].flags &= (HRF_TEMP|HRF_REG); + cache_regs[r].gregs = 0; } +#endif // note: must not be called when doing conditional code -static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking) +static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr) { - temp_reg_t *tr; - int i, ret; - - // maybe statically mapped? - ret = get_reg_static(r, mode); - if (ret != -1) - goto end; + cache_reg_t *tr = NULL; + int i, h, split = -1; rcache_counter++; // maybe already cached? 
// if so, prefer against gconst (they must be in sync) - for (i = ARRAY_SIZE(reg_temp) - 1; i >= 0; i--) { - if (reg_temp[i].type == HR_CACHED && reg_temp[i].greg == r) { - reg_temp[i].stamp = rcache_counter; - if (mode != RC_GR_READ) - reg_temp[i].flags |= HRF_DIRTY; - ret = reg_temp[i].hreg; + i = guest_regs[r].vreg; + if ((guest_regs[r].flags & GRF_STATIC) && i != guest_regs[r].sreg && + !(cache_regs[guest_regs[r].sreg].flags & HRF_LOCKED) && + (i < 0 || mode != RC_GR_READ) && + !((rcache_hint_soon|rcache_locked) & cache_regs[guest_regs[r].sreg].gregs)) { + // good opportunity to relocate a remapped STATIC + h = guest_regs[r].sreg; + rcache_evict_vreg(h); + tr = &cache_regs[h]; + if (i >= 0) { + if (mode != RC_GR_WRITE) { + if (hr) + *hr = cache_regs[i].hreg; + else + emith_move_r_r(cache_regs[h].hreg, cache_regs[i].hreg); + hr = NULL; + } + rcache_remove_vreg_alias(guest_regs[r].vreg, r); + } else if (mode != RC_GR_WRITE) { + if (gconst_try_read(tr->hreg, r)) { + tr->flags |= HRF_DIRTY; + guest_regs[r].flags |= GRF_DIRTY; + } else + emith_ctx_read(tr->hreg, r * 4); + } + guest_regs[r].vreg = guest_regs[r].sreg; + tr->gregs = 1 << r; + goto end; + } else if (i >= 0) { + if (mode == RC_GR_READ || !(cache_regs[i].gregs & ~(1 << r))) { + // either only reading, or no multiple mapping + tr = &cache_regs[i]; + goto end; + } + // split if aliases needed rsn, or already locked, or r is STATIC in sreg + if (((rcache_hint|rcache_locked) & cache_regs[i].gregs & ~(1 << r)) || + (cache_regs[i].flags & HRF_LOCKED) || + (cache_regs[i].type == HR_STATIC && !(guest_regs[r].flags & GRF_STATIC))) { + // need to split up. take reg out here to avoid unnecessary writebacks + cache_regs[i].gregs &= ~(1 << r); + split = i; + } else { + // aliases not needed anytime soon, remove them + // XXX split aliases away if writing and static and not locked and hinted? 
+ rcache_evict_vreg_aliases(i, r); + tr = &cache_regs[i]; goto end; } } - // use any free reg - for (i = ARRAY_SIZE(reg_temp) - 1; i >= 0; i--) { - if (reg_temp[i].type == HR_FREE) { - tr = ®_temp[i]; - goto do_alloc; + // get a free reg, but use temps only if r is not needed soon + for (i = ARRAY_SIZE(cache_regs) - 1; i >= 0; i--) { + if ((cache_regs[i].type == HR_FREE || + (cache_regs[i].type == HR_TEMP && !(cache_regs[i].flags & HRF_LOCKED))) && + (!(rcache_hint & (1 << r)) || (cache_regs[i].flags & HRF_REG))) { + tr = &cache_regs[i]; + break; } } - tr = rcache_evict(); + if (!tr) + tr = rcache_evict(); -do_alloc: tr->type = HR_CACHED; - if (do_locking) - tr->flags |= HRF_LOCKED; - if (mode != RC_GR_READ) - tr->flags |= HRF_DIRTY; - tr->greg = r; - tr->stamp = rcache_counter; - ret = tr->hreg; + tr->gregs = 1 << r; + guest_regs[r].vreg = tr - cache_regs; if (mode != RC_GR_WRITE) { - if (gconst_check(r)) { - if (gconst_try_read(ret, r)) - tr->flags |= HRF_DIRTY; - } - else + if (gconst_try_read(tr->hreg, r)) { + tr->flags |= HRF_DIRTY; + guest_regs[r].flags |= GRF_DIRTY; + } else if (split >= 0) { + if (hr) { + cache_regs[split].flags |= HRF_LOCKED; + *hr = cache_regs[split].hreg; + hr = NULL; + } else if (tr->hreg != cache_regs[split].hreg) + emith_move_r_r(tr->hreg, cache_regs[split].hreg); + } else emith_ctx_read(tr->hreg, r * 4); } end: - if (mode != RC_GR_READ) + if (hr) + *hr = tr->hreg; + if (do_locking) + tr->flags |= HRF_LOCKED; + tr->stamp = rcache_counter; + if (mode != RC_GR_READ) { + tr->flags |= HRF_DIRTY; + guest_regs[r].flags |= GRF_DIRTY; gconst_kill(r); + } - return ret; + return tr->hreg; } -static int rcache_get_reg(sh2_reg_e r, rc_gr_mode mode) +static int rcache_get_reg(sh2_reg_e r, rc_gr_mode mode, int *hr) { - return rcache_get_reg_(r, mode, 1); + return rcache_get_reg_(r, mode, 1, hr); } static int rcache_get_tmp(void) { - temp_reg_t *tr; + cache_reg_t *tr = NULL; int i; - for (i = 0; i < ARRAY_SIZE(reg_temp); i++) - if (reg_temp[i].type 
== HR_FREE) { - tr = ®_temp[i]; - goto do_alloc; + // use any free reg, but prefer TEMP regs + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { + if (cache_regs[i].type == HR_FREE || + (cache_regs[i].type == HR_TEMP && !(cache_regs[i].flags & HRF_LOCKED))) { + tr = &cache_regs[i]; + break; } + } - tr = rcache_evict(); + if (!tr) + tr = rcache_evict(); -do_alloc: tr->type = HR_TEMP; + tr->flags |= HRF_LOCKED; return tr->hreg; } @@ -1060,192 +1380,421 @@ static int rcache_get_hr_id(int hr) { int i; - for (i = 0; i < ARRAY_SIZE(reg_temp); i++) - if (reg_temp[i].hreg == hr) - break; - - if (i == ARRAY_SIZE(reg_temp)) // can't happen + i = reg_map_host[hr]; + if (i < 0) // can't happen exit(1); - if (reg_temp[i].type == HR_CACHED) { - // writeback - if (reg_temp[i].flags & HRF_DIRTY) - emith_ctx_write(reg_temp[i].hreg, reg_temp[i].greg * 4); - gconst_check_evict(reg_temp[i].greg); - } - else if (reg_temp[i].type == HR_TEMP) { +#if REMAP_REGISTER + if (cache_regs[i].type == HR_CACHED) + rcache_remap_vreg(i); +#endif + if (cache_regs[i].type == HR_CACHED) + rcache_evict_vreg(i); + else if (cache_regs[i].type == HR_TEMP && (cache_regs[i].flags & HRF_LOCKED)) { printf("host reg %d already used, aborting\n", hr); exit(1); } - reg_temp[i].type = HR_FREE; - reg_temp[i].flags = 0; - return i; } static int rcache_get_arg_id(int arg) { - int r = 0; - host_arg2reg(r, arg); - return rcache_get_hr_id(r); + int hr = 0; + + host_arg2reg(hr, arg); + return rcache_get_hr_id(hr); } // get a reg to be used as function arg static int rcache_get_tmp_arg(int arg) { int id = rcache_get_arg_id(arg); - reg_temp[id].type = HR_TEMP; + cache_regs[id].type = HR_TEMP; + cache_regs[id].flags |= HRF_LOCKED; - return reg_temp[id].hreg; + return cache_regs[id].hreg; } // ... 
as return value after a call static int rcache_get_tmp_ret(void) { int id = rcache_get_hr_id(RET_REG); - reg_temp[id].type = HR_TEMP; + cache_regs[id].type = HR_TEMP; + cache_regs[id].flags |= HRF_LOCKED; - return reg_temp[id].hreg; + return cache_regs[id].hreg; } -// same but caches a reg. RC_GR_READ only. -static int rcache_get_reg_arg(int arg, sh2_reg_e r) +// same but caches a reg if access is readonly (announced by hr being NULL) +static int rcache_get_reg_arg(int arg, sh2_reg_e r, int *hr) { int i, srcr, dstr, dstid; - int dirty = 0, src_dirty = 0; - - dstid = rcache_get_arg_id(arg); - dstr = reg_temp[dstid].hreg; + int dirty = 0, src_dirty = 0, is_const = 0, is_cached = 0; + u32 val; + host_arg2reg(dstr, arg); - // maybe already statically mapped? - srcr = get_reg_static(r, RC_GR_READ); - if (srcr != -1) - goto do_cache; + i = guest_regs[r].vreg; + if (i >= 0 && cache_regs[i].type == HR_CACHED && cache_regs[i].hreg == dstr) + // r is already in arg + dstid = i; + else + dstid = rcache_get_arg_id(arg); + dstr = cache_regs[dstid].hreg; + + if (rcache_hint & (1 << r)) { + // r is needed later on anyway + srcr = rcache_get_reg_(r, RC_GR_READ, 0, NULL); + is_cached = (cache_regs[reg_map_host[srcr]].type == HR_CACHED); + } else if ((guest_regs[r].flags & GRF_CDIRTY) && gconst_get(r, &val)) { + // r has an uncomitted const - load into arg, but keep constant uncomitted + srcr = dstr; + is_const = 1; + } else if ((i = guest_regs[r].vreg) >= 0) { + // maybe already cached? + srcr = cache_regs[i].hreg; + is_cached = (cache_regs[reg_map_host[srcr]].type == HR_CACHED); + } else { + // must read either const or from ctx + srcr = dstr; + if (rcache_static & (1 << r)) + srcr = rcache_get_reg_(r, RC_GR_READ, 0, NULL); + else if (gconst_try_read(srcr, r)) + dirty = 1; + else + emith_ctx_read(srcr, r * 4); + } - // maybe already cached? 
- for (i = ARRAY_SIZE(reg_temp) - 1; i >= 0; i--) { - if ((reg_temp[i].type == HR_CACHED) && - reg_temp[i].greg == r) - { - srcr = reg_temp[i].hreg; - if (reg_temp[i].flags & HRF_DIRTY) - src_dirty = 1; - goto do_cache; + if (is_cached) { + i = reg_map_host[srcr]; + if (srcr == dstr) { // evict aliases here since it is reallocated below + if (guest_regs[r].flags & GRF_STATIC) // move STATIC back to its sreg + rcache_clean_vreg(guest_regs[r].vreg); +#if REMAP_REGISTER + rcache_remap_vreg(i); +#endif + if (cache_regs[i].type == HR_CACHED) + rcache_evict_vreg(i); } + else if (hr != NULL) // must lock srcr if not copied here + cache_regs[i].flags |= HRF_LOCKED; + if (guest_regs[r].flags & GRF_DIRTY) + src_dirty = 1; } - // must read - srcr = dstr; - if (gconst_check(r)) { - if (gconst_try_read(srcr, r)) - dirty = 1; + cache_regs[dstid].type = HR_TEMP; + if (is_const) { + // uncomitted constant + emith_move_r_imm(srcr, val); + } else if (dstr != srcr) { + // arg is a copy of cached r + if (hr == NULL) + emith_move_r_r(dstr, srcr); + } else if (hr != NULL) { + // caller will modify arg, so it will soon be out of sync with r + if (dirty || src_dirty) + emith_ctx_write(dstr, r * 4); // must clean since arg will be modified + } else if (guest_regs[r].vreg < 0) { + // keep arg as vreg for r + cache_regs[dstid].type = HR_CACHED; + cache_regs[dstid].gregs = 1 << r; + guest_regs[r].vreg = dstid; + if (dirty || src_dirty) { // mark as modifed for cleaning later on + cache_regs[dstid].flags |= HRF_DIRTY; + guest_regs[r].flags |= GRF_DIRTY; + } } - else - emith_ctx_read(srcr, r * 4); - -do_cache: - if (dstr != srcr) - emith_move_r_r(dstr, srcr); -#if 1 - else - dirty |= src_dirty; - if (dirty) - // must clean, callers might want to modify the arg before call - emith_ctx_write(dstr, r * 4); -#else - if (dirty) - reg_temp[dstid].flags |= HRF_DIRTY; -#endif + if (hr) + *hr = srcr; - reg_temp[dstid].stamp = ++rcache_counter; - reg_temp[dstid].type = HR_CACHED; - reg_temp[dstid].greg = 
r; - reg_temp[dstid].flags |= HRF_LOCKED; + cache_regs[dstid].stamp = ++rcache_counter; + cache_regs[dstid].flags |= HRF_LOCKED; return dstr; } static void rcache_free_tmp(int hr) { - int i; - for (i = 0; i < ARRAY_SIZE(reg_temp); i++) - if (reg_temp[i].hreg == hr) - break; - - if (i == ARRAY_SIZE(reg_temp) || reg_temp[i].type != HR_TEMP) { - printf("rcache_free_tmp fail: #%i hr %d, type %d\n", i, hr, reg_temp[i].type); + int i = reg_map_host[hr]; + if (i < 0 || cache_regs[i].type != HR_TEMP) { + printf("rcache_free_tmp fail: #%i hr %d, type %d\n", i, hr, cache_regs[i].type); return; } - reg_temp[i].type = HR_FREE; - reg_temp[i].flags = 0; + cache_regs[i].type = HR_FREE; + cache_regs[i].flags &= (HRF_REG|HRF_TEMP); +} + +// saves temporary result either in REG or in drctmp +static int rcache_save_tmp(int hr) +{ + int i, free = -1, cached = -1; + u16 min_stamp = (u16)-1; + + // find REG, either free or unlocked temp or oldest non-hinted cached + for (i = 0; i < ARRAY_SIZE(cache_regs) && free < 0; i++) { + if ((cache_regs[i].flags & HRF_TEMP) || (cache_regs[i].flags & HRF_LOCKED)) + continue; + if (cache_regs[i].type == HR_FREE || cache_regs[i].type == HR_TEMP) + free = i; + if (cache_regs[i].type == HR_CACHED && + !((rcache_hint | rcache_locked) & cache_regs[i].gregs)) { + if (cache_regs[i].stamp < min_stamp) { + min_stamp = cache_regs[i].stamp; + cached = i; + } + } + } + + if (free >= 0) + i = free; + else if (cached >= 0) { + i = cached; + rcache_evict_vreg(i); + } else { + // if none is available, store in drctmp + emith_ctx_write(hr, offsetof(SH2, drc_tmp)); + rcache_free_tmp(hr); + return -1; + } + + cache_regs[i].type = HR_CACHED; + cache_regs[i].gregs = 0; // not storing any guest register + cache_regs[i].flags &= (HRF_TEMP|HRF_REG); + cache_regs[i].flags |= HRF_LOCKED; + cache_regs[i].stamp = ++rcache_counter; + emith_move_r_r(cache_regs[i].hreg, hr); + rcache_free_tmp(hr); + return i; +} + +static int rcache_restore_tmp(int r) +{ + int hr; + + // find REG 
with tmp store: cached but with no gregs + if (r >= 0) { + if (cache_regs[r].type != HR_CACHED || cache_regs[r].gregs) { + printf("invalid tmp storage %d\n", r); + exit(1); + } + // found, transform to a TEMP + cache_regs[r].type = HR_TEMP; + cache_regs[r].flags |= HRF_LOCKED; + return cache_regs[r].hreg; + } + + // if not available, create a TEMP store and fetch from drctmp + hr = rcache_get_tmp(); + emith_ctx_read(hr, offsetof(SH2, drc_tmp)); + + return hr; } static void rcache_unlock(int hr) { - int i; - for (i = 0; i < ARRAY_SIZE(reg_temp); i++) - if (reg_temp[i].type == HR_CACHED && reg_temp[i].hreg == hr) - reg_temp[i].flags &= ~HRF_LOCKED; + if (hr >= 0) { + cache_regs[hr].flags &= ~HRF_LOCKED; + rcache_locked &= ~cache_regs[hr].gregs; + } } static void rcache_unlock_all(void) { int i; - for (i = 0; i < ARRAY_SIZE(reg_temp); i++) - reg_temp[i].flags &= ~HRF_LOCKED; + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) + cache_regs[i].flags &= ~HRF_LOCKED; +} + +static inline void rcache_set_locked(u32 mask) +{ + rcache_locked = mask & ~rcache_static; +} + +static inline void rcache_set_hint_soon(u32 mask) +{ + rcache_hint_soon = mask & ~rcache_static; +} + +static inline void rcache_set_hint_late(u32 mask) +{ + rcache_hint_late = mask & ~rcache_static; +} + +static inline int rcache_is_hinted(sh2_reg_e r) +{ + // consider static REGs as always hinted, since they are always there + return ((rcache_hint | rcache_static) & (1 << r)); +} + +static inline int rcache_is_cached(sh2_reg_e r) +{ + // consider static REGs as always hinted, since they are always there + return (guest_regs[r].vreg >= 0); +} + +static inline u32 rcache_used_hreg_mask(void) +{ + u32 mask = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) + if (cache_regs[i].type != HR_FREE) + mask |= 1 << cache_regs[i].hreg; + + return mask & ~rcache_static; +} + +static inline u32 rcache_dirty_mask(void) +{ + u32 mask = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) + if 
(guest_regs[i].flags & GRF_DIRTY) + mask |= 1 << i; + mask |= gconst_dirty_mask(); + + return mask; } -#if (DRC_DEBUG & (8|256|512|1024)) || defined(DRC_CMP) -static u32 rcache_used_hreg_mask(void) +static inline u32 rcache_reg_mask(void) { u32 mask = 0; int i; - for (i = 0; i < ARRAY_SIZE(reg_temp); i++) - if (reg_temp[i].type != HR_FREE) - mask |= 1 << reg_temp[i].hreg; + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) + if (cache_regs[i].type == HR_CACHED) + mask |= cache_regs[i].gregs; return mask; } + +static void rcache_clean_tmp(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) + if (cache_regs[i].type == HR_CACHED && (cache_regs[i].flags & HRF_TEMP)) +#if REMAP_REGISTER + rcache_remap_vreg(i); +#else + rcache_clean_vreg(i); #endif +} + +static void rcache_clean_mask(u32 mask) +{ + int i; + + // XXX consider gconst? + if (!(mask &= ~rcache_static & ~gconst_dirty_mask())) + return; + + // clean only vregs where all aliases are covered by the mask + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) + if (cache_regs[i].type == HR_CACHED && + (cache_regs[i].gregs & mask) && !(cache_regs[i].gregs & ~mask)) + rcache_clean_vreg(i); +} static void rcache_clean(void) { int i; gconst_clean(); - for (i = 0; i < ARRAY_SIZE(reg_temp); i++) - if (reg_temp[i].type == HR_CACHED && (reg_temp[i].flags & HRF_DIRTY)) { - // writeback - emith_ctx_write(reg_temp[i].hreg, reg_temp[i].greg * 4); - reg_temp[i].flags &= ~HRF_DIRTY; + for (i = ARRAY_SIZE(cache_regs)-1; i >= 0; i--) + if (cache_regs[i].type == HR_CACHED || cache_regs[i].type == HR_STATIC) + rcache_clean_vreg(i); +} + +static void rcache_invalidate_tmp(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { + if (cache_regs[i].flags & HRF_TEMP) { + if (cache_regs[i].type == HR_CACHED) + rcache_unmap_vreg(i); + cache_regs[i].type = HR_FREE; + cache_regs[i].flags &= (HRF_TEMP|HRF_REG); + cache_regs[i].gregs = 0; } + } } static void rcache_invalidate(void) { int i; - for (i = 0; i < 
ARRAY_SIZE(reg_temp); i++) { - reg_temp[i].type = HR_FREE; - reg_temp[i].flags = 0; + + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { + cache_regs[i].flags &= (HRF_TEMP|HRF_REG); + if (cache_regs[i].type != HR_STATIC) + cache_regs[i].type = HR_FREE; + cache_regs[i].gregs = 0; } + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { + guest_regs[i].flags &= GRF_STATIC; + if (!(guest_regs[i].flags & GRF_STATIC)) + guest_regs[i].vreg = -1; + else { + if (guest_regs[i].vreg < 0) + emith_ctx_read(cache_regs[guest_regs[i].sreg].hreg, i*4); + else if (guest_regs[i].vreg != guest_regs[i].sreg) + emith_move_r_r(cache_regs[guest_regs[i].sreg].hreg, + cache_regs[guest_regs[i].vreg].hreg); + cache_regs[guest_regs[i].sreg].gregs = 1 << i; + guest_regs[i].vreg = guest_regs[i].sreg; + } + }; rcache_counter = 0; + rcache_hint_soon = rcache_hint_late = 0; gconst_invalidate(); } static void rcache_flush(void) { + rcache_unlock_all(); rcache_clean(); rcache_invalidate(); } +static void rcache_init(void) +{ + static int once = 1; + int i; + + // init is executed on every rom load, but this must only be executed once... 
+ if (once) { + memset(reg_map_host, -1, sizeof(reg_map_host)); + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) + reg_map_host[cache_regs[i].hreg] = i; + + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) + if (guest_regs[i].flags & GRF_STATIC) { + rcache_static |= (1 << i); + guest_regs[i].sreg = reg_map_host[guest_regs[i].sreg]; + cache_regs[guest_regs[i].sreg].type = HR_STATIC; + } else + guest_regs[i].sreg = -1; + once = 0; + } + + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) + if (guest_regs[i].flags & GRF_STATIC) { + guest_regs[i].vreg = guest_regs[i].sreg; + cache_regs[guest_regs[i].sreg].gregs = (1 << i); + } + + rcache_invalidate(); +} + // --------------------------------------------------------------- -static int emit_get_rbase_and_offs(u32 a, u32 *offs) +static int emit_get_rbase_and_offs(SH2 *sh2, u32 a, u32 *offs) { + u32 omask = 0xff; // offset mask, XXX: ARM oriented.. u32 mask = 0; int poffs; int hr; @@ -1254,11 +1803,19 @@ static int emit_get_rbase_and_offs(u32 a, u32 *offs) if (poffs == -1) return -1; - // XXX: could use some related reg hr = rcache_get_tmp(); - emith_ctx_read_ptr(hr, poffs); - emith_add_r_r_ptr_imm(hr, hr, a & mask & ~0xff); - *offs = a & 0xff; // XXX: ARM oriented.. + if (mask < 0x1000) { + // can't access data array or BIOS directly from ROM or SDRAM, + // since code may run on both SH2s (if the tcache_id would be known...) 
+ emith_ctx_read(hr, poffs); + if (a & mask & ~omask) + emith_add_r_imm(hr, a & mask & ~omask); + } else { + // known fixed host address + a = (a & mask) + *(u32 *)((char *)sh2 + poffs); + emith_move_r_imm(hr, (a & ~omask)); + } + *offs = a & omask; return hr; } @@ -1271,7 +1828,7 @@ static int emit_get_rom_data(sh2_reg_e r, u32 offs, int size, u32 *val) if (gconst_get(r, &tmp)) { tmp += offs; if (dr_is_rom(tmp)) { - switch (size) { + switch (size & MF_SIZEMASK) { case 0: *val = (s8)p32x_sh2_read8(tmp, sh2s); break; // 8 case 1: *val = (s16)p32x_sh2_read16(tmp, sh2s); break; // 16 case 2: *val = p32x_sh2_read32(tmp, sh2s); break; // 32 @@ -1287,7 +1844,7 @@ static void emit_move_r_imm32(sh2_reg_e dst, u32 imm) #if PROPAGATE_CONSTANTS gconst_new(dst, imm); #else - int hr = rcache_get_reg(dst, RC_GR_WRITE); + int hr = rcache_get_reg(dst, RC_GR_WRITE, NULL); emith_move_r_imm(hr, imm); #endif } @@ -1296,12 +1853,36 @@ static void emit_move_r_r(sh2_reg_e dst, sh2_reg_e src) { int hr_d, hr_s; - hr_s = rcache_get_reg(src, RC_GR_READ); - hr_d = rcache_get_reg(dst, RC_GR_WRITE); - emith_move_r_r(hr_d, hr_s); + if (guest_regs[src].vreg >= 0 || gconst_check(src) || rcache_is_hinted(src)) { + hr_s = rcache_get_reg(src, RC_GR_READ, NULL); +#if ALIAS_REGISTERS + // check for aliasing + int i = guest_regs[src].vreg; + if (guest_regs[dst].vreg != i) { + // remove possible old mapping of dst + if (guest_regs[dst].vreg >= 0) + rcache_remove_vreg_alias(guest_regs[dst].vreg, dst); + // make dst an alias of src + cache_regs[i].gregs |= (1 << dst); + cache_regs[i].flags |= HRF_DIRTY; + guest_regs[dst].flags |= GRF_DIRTY; + guest_regs[dst].vreg = i; + gconst_kill(dst); +#if PROPAGATE_CONSTANTS + gconst_copy(dst, src); +#endif + return; + } +#endif + hr_d = rcache_get_reg(dst, RC_GR_WRITE, NULL); + emith_move_r_r(hr_d, hr_s); #if PROPAGATE_CONSTANTS - gconst_copy(dst, src); + gconst_copy(dst, src); #endif + } else { + hr_d = rcache_get_reg(dst, RC_GR_WRITE, NULL); + emith_ctx_read(hr_d, 
src * 4); + } } // T must be clear, and comparison done just before this @@ -1317,25 +1898,22 @@ static int emit_memhandler_read(int size) { int arg1; - rcache_clean(); - + rcache_clean_tmp(); #ifndef DRC_SR_REG // must writeback cycles for poll detection stuff - if (reg_map_g2h[SHR_SR] != -1) - emith_ctx_write(reg_map_g2h[SHR_SR], SHR_SR * 4); + if (guest_regs[SHR_SR].vreg != -1) + rcache_evict_vreg(guest_regs[SHR_SR].vreg); #endif + arg1 = rcache_get_tmp_arg(1); emith_move_r_r_ptr(arg1, CONTEXT_REG); - switch (size) { + switch (size & MF_SIZEMASK) { case 0: emith_call(sh2_drc_read8); break; // 8 case 1: emith_call(sh2_drc_read16); break; // 16 case 2: emith_call(sh2_drc_read32); break; // 32 } - rcache_invalidate(); -#ifndef DRC_SR_REG - if (reg_map_g2h[SHR_SR] != -1) - emith_ctx_read(reg_map_g2h[SHR_SR], SHR_SR * 4); -#endif + + rcache_invalidate_tmp(); return rcache_get_tmp_ret(); } @@ -1343,52 +1921,52 @@ static int emit_memhandler_read(int size) static void emit_memhandler_write(int size) { int arg2; + + rcache_clean_tmp(); #ifndef DRC_SR_REG - if (reg_map_g2h[SHR_SR] != -1) - emith_ctx_write(reg_map_g2h[SHR_SR], SHR_SR * 4); + if (guest_regs[SHR_SR].vreg != -1) + rcache_evict_vreg(guest_regs[SHR_SR].vreg); #endif - rcache_clean(); arg2 = rcache_get_tmp_arg(2); emith_move_r_r_ptr(arg2, CONTEXT_REG); - switch (size) { + switch (size & MF_SIZEMASK) { case 0: emith_call(sh2_drc_write8); break; // 8 case 1: emith_call(sh2_drc_write16); break; // 16 case 2: emith_call(sh2_drc_write32); break; // 32 } - rcache_invalidate(); -#ifndef DRC_SR_REG - if (reg_map_g2h[SHR_SR] != -1) - emith_ctx_read(reg_map_g2h[SHR_SR], SHR_SR * 4); -#endif + rcache_invalidate_tmp(); } -// rd = @(Rs,#offs) -static int emit_memhandler_read_rr(sh2_reg_e rd, sh2_reg_e rs, u32 offs, int size) +// rd = @(Rs,#offs); rd < 0 -> return a temp +static int emit_memhandler_read_rr(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rs, u32 offs, int size) { int hr, hr2; u32 val, offs2; +#if PROPAGATE_CONSTANTS if 
(emit_get_rom_data(rs, offs, size, &val)) { if (rd == SHR_TMP) { hr2 = rcache_get_tmp(); emith_move_r_imm(hr2, val); } else { emit_move_r_imm32(rd, val); - hr2 = rcache_get_reg(rd, RC_GR_READ); + hr2 = rcache_get_reg(rd, RC_GR_READ, NULL); } + if ((size & MF_POSTINCR) && gconst_get(rs, &val)) + gconst_new(rs, val + (1 << (size & MF_SIZEMASK))); return hr2; } if (gconst_get(rs, &val)) { - hr = emit_get_rbase_and_offs(val + offs, &offs2); + hr = emit_get_rbase_and_offs(sh2, val + offs, &offs2); if (hr != -1) { if (rd == SHR_TMP) hr2 = rcache_get_tmp(); else - hr2 = rcache_get_reg(rd, RC_GR_WRITE); - switch (size) { + hr2 = rcache_get_reg(rd, RC_GR_WRITE, NULL); + switch (size & MF_SIZEMASK) { case 0: // 8 emith_read8s_r_r_offs(hr2, hr, offs2 ^ 1); break; @@ -1401,142 +1979,170 @@ static int emit_memhandler_read_rr(sh2_reg_e rd, sh2_reg_e rs, u32 offs, int siz break; } rcache_free_tmp(hr); + if (size & MF_POSTINCR) + gconst_new(rs, val + (1 << (size & MF_SIZEMASK))); return hr2; } } - - if (gconst_get(rs, &val)) { +#endif + if (gconst_get(rs, &val) && (!(size & MF_POSTINCR) /*|| !(rcache_hint_soon & (1 << rs))*/)) { hr = rcache_get_tmp_arg(0); emith_move_r_imm(hr, val + offs); - } else { - hr = rcache_get_reg_arg(0, rs); - if (offs) - emith_add_r_imm(hr, offs); - } - hr = emit_memhandler_read(size); + if (size & MF_POSTINCR) + gconst_new(rs, val + (1 << (size & MF_SIZEMASK))); + } else if (offs || (size & MF_POSTINCR)) { + hr = rcache_get_reg_arg(0, rs, &hr2); + if (offs || hr != hr2) + emith_add_r_r_imm(hr, hr2, offs); + if (size & MF_POSTINCR) { + hr = rcache_get_reg(rs, RC_GR_WRITE, NULL); + emith_add_r_r_imm(hr, hr2, 1 << (size & MF_SIZEMASK)); + } + } else + rcache_get_reg_arg(0, rs, NULL); + hr = emit_memhandler_read(size); + + size &= MF_SIZEMASK; if (rd == SHR_TMP) hr2 = hr; else - hr2 = rcache_get_reg(rd, RC_GR_WRITE); +#if REMAP_REGISTER + hr2 = rcache_map_reg(rd, hr, size != 2 ? 
RC_GR_RMW : RC_GR_WRITE); +#else + hr2 = rcache_get_reg(rd, RC_GR_WRITE, NULL); +#endif - if (rd != SHR_TMP && size != 2) { - emith_sext(hr2, hr, (size == 1) ? 16 : 8); - } else if (hr != hr2) + if (rd != SHR_TMP && size != 2) { // 16, 8 + emith_sext(hr2, hr, size ? 16 : 8); + } else if (hr != hr2) // 32 emith_move_r_r(hr2, hr); - if (hr != hr2) rcache_free_tmp(hr); return hr2; } -// @(Rs,#offs) = rd -static void emit_memhandler_write_rr(sh2_reg_e rd, sh2_reg_e rs, u32 offs, int size) +// @(Rs,#offs) = rd; rd < 0 -> write arg1 +static void emit_memhandler_write_rr(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rs, u32 offs, int size) { - int hr; + int hr, hr2; u32 val; - rcache_clean(); // XXX - rcache_get_reg_arg(1, rd); + if (rd == SHR_TMP) { + host_arg2reg(hr2, 1); + } else if ((size & MF_PREDECR) && rd == rs) { // must avoid caching rd in arg1 + hr2 = rcache_get_reg_arg(1, rd, &hr); + if (hr != hr2) emith_move_r_r(hr2, hr); + } else + hr2 = rcache_get_reg_arg(1, rd, NULL); - if (gconst_get(rs, &val)) { + if (gconst_get(rs, &val) && (!(size & MF_PREDECR) /*|| !(rcache_hint_soon & (1 << rs))*/)) { + if (size & MF_PREDECR) { + val -= 1 << (size & MF_SIZEMASK); + gconst_new(rs, val); + } hr = rcache_get_tmp_arg(0); emith_move_r_imm(hr, val + offs); - } else if (offs) { - hr = rcache_get_reg_arg(0, rs); - emith_add_r_imm(hr, offs); + } else if (offs || (size & MF_PREDECR)) { + if (size & MF_PREDECR) { + hr = rcache_get_reg(rs, RC_GR_RMW, &hr2); + emith_sub_r_r_imm(hr, hr2, 1 << (size & MF_SIZEMASK)); + } + hr = rcache_get_reg_arg(0, rs, &hr2); + if (offs || hr != hr2) + emith_add_r_r_imm(hr, hr2, offs); } else - rcache_get_reg_arg(0, rs); + rcache_get_reg_arg(0, rs, NULL); emit_memhandler_write(size); } -// rd = @(Rx,Ry) -static int emit_indirect_indexed_read(sh2_reg_e rd, sh2_reg_e rx, sh2_reg_e ry, int size) +// rd = @(Rx,Ry); rd < 0 -> return a temp +static int emit_indirect_indexed_read(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rx, sh2_reg_e ry, int size) { int hr, hr2; - int a0, t; 
+ int tx, ty; #if PROPAGATE_CONSTANTS u32 offs; if (gconst_get(ry, &offs)) - return emit_memhandler_read_rr(rd, rx, offs, size); + return emit_memhandler_read_rr(sh2, rd, rx, offs, size); if (gconst_get(rx, &offs)) - return emit_memhandler_read_rr(rd, ry, offs, size); + return emit_memhandler_read_rr(sh2, rd, ry, offs, size); #endif - a0 = rcache_get_reg_arg(0, rx); - t = rcache_get_reg(ry, RC_GR_READ); - emith_add_r_r(a0, t); + hr = rcache_get_reg_arg(0, rx, &tx); + ty = rcache_get_reg(ry, RC_GR_READ, NULL); + emith_add_r_r_r(hr, tx, ty); hr = emit_memhandler_read(size); + + size &= MF_SIZEMASK; if (rd != SHR_TMP) - hr2 = rcache_get_reg(rd, RC_GR_WRITE); +#if REMAP_REGISTER + hr2 = rcache_map_reg(rd, hr, size != 2 ? RC_GR_RMW : RC_GR_WRITE); +#else + hr2 = rcache_get_reg(rd, RC_GR_WRITE, NULL); +#endif else hr2 = hr; - if (rd != SHR_TMP && size != 2) { // 16, 8 + if (rd != SHR_TMP && size != 2) { // 16, 8 emith_sext(hr2, hr, size ? 16 : 8); } else if (hr != hr2) // 32 emith_move_r_r(hr2, hr); - if (hr != hr2) rcache_free_tmp(hr); return hr2; } -// @(Rx,Ry) = rd -static void emit_indirect_indexed_write(sh2_reg_e rd, sh2_reg_e rx, sh2_reg_e ry, int size) +// @(Rx,Ry) = rd; rd < 0 -> write arg1 +static void emit_indirect_indexed_write(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rx, sh2_reg_e ry, int size) { - int a0, t; + int hr, tx, ty; #if PROPAGATE_CONSTANTS u32 offs; if (gconst_get(ry, &offs)) - return emit_memhandler_write_rr(rd, rx, offs, size); + return emit_memhandler_write_rr(sh2, rd, rx, offs, size); if (gconst_get(rx, &offs)) - return emit_memhandler_write_rr(rd, ry, offs, size); + return emit_memhandler_write_rr(sh2, rd, ry, offs, size); #endif - rcache_clean(); // XXX - rcache_get_reg_arg(1, rd); - a0 = rcache_get_reg_arg(0, rx); - t = rcache_get_reg(ry, RC_GR_READ); - emith_add_r_r(a0, t); + if (rd != SHR_TMP) + rcache_get_reg_arg(1, rd, NULL); + hr = rcache_get_reg_arg(0, rx, &tx); + ty = rcache_get_reg(ry, RC_GR_READ, NULL); + emith_add_r_r_r(hr, tx, ty); 
emit_memhandler_write(size); } // @Rn+,@Rm+ -static void emit_indirect_read_double(u32 *rnr, u32 *rmr, int rn, int rm, int size) +static void emit_indirect_read_double(SH2 *sh2, int *rnr, int *rmr, sh2_reg_e rn, sh2_reg_e rm, int size) { int tmp; - rcache_get_reg_arg(0, rn); - tmp = emit_memhandler_read(size); - emith_ctx_write(tmp, offsetof(SH2, drc_tmp)); - rcache_free_tmp(tmp); - tmp = rcache_get_reg(rn, RC_GR_RMW); - emith_add_r_imm(tmp, 1 << size); - rcache_unlock(tmp); - - rcache_get_reg_arg(0, rm); - *rmr = emit_memhandler_read(size); - *rnr = rcache_get_tmp(); - emith_ctx_read(*rnr, offsetof(SH2, drc_tmp)); - tmp = rcache_get_reg(rm, RC_GR_RMW); - emith_add_r_imm(tmp, 1 << size); - rcache_unlock(tmp); + // unlock rn, rm here to avoid REG shortage in MAC operation + tmp = emit_memhandler_read_rr(sh2, SHR_TMP, rn, 0, size | MF_POSTINCR); + rcache_unlock(guest_regs[rn].vreg); + tmp = rcache_save_tmp(tmp); + *rmr = emit_memhandler_read_rr(sh2, SHR_TMP, rm, 0, size | MF_POSTINCR); + rcache_unlock(guest_regs[rm].vreg); + *rnr = rcache_restore_tmp(tmp); } static void emit_do_static_regs(int is_write, int tmpr) { int i, r, count; - for (i = 0; i < ARRAY_SIZE(reg_map_g2h); i++) { - r = reg_map_g2h[i]; - if (r == -1) + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { + if (guest_regs[i].flags & GRF_STATIC) + r = cache_regs[guest_regs[i].vreg].hreg; + else continue; - for (count = 1; i < ARRAY_SIZE(reg_map_g2h) - 1; i++, r++) { - if (reg_map_g2h[i + 1] != r + 1) + for (count = 1; i < ARRAY_SIZE(guest_regs) - 1; i++, r++) { + if ((guest_regs[i + 1].flags & GRF_STATIC) && + cache_regs[guest_regs[i + 1].vreg].hreg == r + 1) + count++; + else break; - count++; } if (count > 1) { @@ -1606,9 +2212,10 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) struct op_data *opd; int blkid_main = 0; int skip_op = 0; - u32 tmp, tmp2; + int tmp, tmp2; int cycles; int i, v; + u32 u; int op; base_pc = sh2->pc; @@ -1625,8 +2232,8 @@ static void REGPARM(2) *sh2_translate(SH2 
*sh2, int tcache_id) tcache_ptr = tcache_ptrs[tcache_id]; // predict tcache overflow - tmp = tcache_ptr - tcache_bases[tcache_id]; - if (tmp > tcache_sizes[tcache_id] - MAX_BLOCK_SIZE) { + u = tcache_ptr - tcache_bases[tcache_id]; + if (u > tcache_sizes[tcache_id] - MAX_BLOCK_SIZE) { dbg(1, "tcache %d overflow", tcache_id); return NULL; } @@ -1673,7 +2280,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) for (i = 0; pc < end_pc; i++) { u32 delay_dep_fw = 0, delay_dep_bk = 0; - u32 tmp3, tmp4, sr; + int tmp3, tmp4; + u32 sr; opd = &ops[i]; op = FETCH_OP(pc); @@ -1691,7 +2299,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { if (pc != base_pc) { - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); rcache_flush(); @@ -1741,7 +2349,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_clean(); #if (DRC_DEBUG & 0x10) - rcache_get_reg_arg(0, SHR_PC); + rcache_get_reg_arg(0, SHR_PC, NULL); tmp = emit_memhandler_read(2); tmp2 = rcache_get_tmp(); tmp3 = rcache_get_tmp(); @@ -1757,7 +2365,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #endif // check cycles - sr = rcache_get_reg(SHR_SR, RC_GR_READ); + sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); emith_cmp_r_imm(sr, 0); emith_jump_cond(DCOND_LE, sh2_drc_exit); do_host_disasm(tcache_id); @@ -1765,27 +2373,27 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #if (DRC_DEBUG & (8|256|512|1024)) emit_move_r_imm32(SHR_PC, pc); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); rcache_clean(); tmp = rcache_used_hreg_mask(); emith_save_caller_regs(tmp); emit_do_static_regs(1, 0); - rcache_get_reg_arg(2, SHR_SR); + rcache_get_reg_arg(2, SHR_SR, NULL); tmp2 = rcache_get_tmp_arg(0); tmp3 = rcache_get_tmp_arg(1); emith_move_r_imm(tmp2, (u32)tcache_ptr); emith_move_r_r_ptr(tmp3,CONTEXT_REG); emith_call(sh2_drc_log_entry); 
emith_restore_caller_regs(tmp); - rcache_invalidate(); + rcache_invalidate_tmp(); #endif } #ifdef DRC_CMP if (!(op_flags[i] & OF_DELAY_OP)) { emit_move_r_imm32(SHR_PC, pc); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); rcache_clean(); @@ -1811,7 +2419,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) delay_dep_fw = opd->dest & ops[i-1].source; delay_dep_bk = opd->source & ops[i-1].dest; if (delay_dep_fw & BITMASK1(SHR_T)) { - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); DELAY_SAVE_T(sr); } if (delay_dep_bk & BITMASK1(SHR_PC)) { @@ -1820,8 +2428,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) elprintf_sh2(sh2, EL_ANOMALY, "drc: illegal slot insn %04x @ %08x?", op, pc - 2); } + // store PC for MOVA/MOV @PC address calculation if (opd->imm != 0) - ; // addr already resolved somehow + ; // case OP_BRANCH - addr already resolved in scan_block else { switch (ops[i-1].op) { case OP_BRANCH: @@ -1829,8 +2438,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) break; case OP_BRANCH_CT: case OP_BRANCH_CF: - sr = rcache_get_reg(SHR_SR, RC_GR_READ); - tmp = rcache_get_reg(SHR_PC, RC_GR_WRITE); + sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); + tmp = rcache_get_reg(SHR_PC, RC_GR_WRITE, NULL); emith_move_r_imm(tmp, pc); emith_tst_r_imm(sr, T); tmp2 = ops[i-1].op == OP_BRANCH_CT ? DCOND_NE : DCOND_EQ; @@ -1839,7 +2448,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_move_r_imm_c(tmp2, tmp, ops[i-1].imm); EMITH_SJMP_END(tmp3); break; - case OP_BRANCH_N: + case OP_BRANCH_N: // BT/BF known not to be taken + // XXX could modify opd->imm instead? 
emit_move_r_imm32(SHR_PC, pc); break; // case OP_BRANCH_R OP_BRANCH_RF - PC already loaded @@ -1850,13 +2460,46 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // dbg(1, "unhandled delay_dep_fw: %x", delay_dep_fw & ~BITMASK1(SHR_T)); if (delay_dep_bk & ~BITMASK2(SHR_PC, SHR_PR)) dbg(1, "unhandled delay_dep_bk: %x", delay_dep_bk); + rcache_set_hint_soon(0); + rcache_set_hint_late(0); } + else + { + // inform cache about future register usage + u32 late = 0; // regs read by future ops + u32 write = 0; // regs written to (to detect write before read) + u32 soon = 0; // regs read soon + tmp = OP_ISBRANCH(opd[0].op); // branch insn detected + for (v = 1; v <= 9; v++) { + // no sense in looking any further than the next rcache flush + if (pc + 2*v < end_pc && !(op_flags[i+v] & OF_BTARGET) && + (!tmp || (op_flags[i+v] & OF_DELAY_OP))) { + late |= opd[v].source & ~write; + // ignore source regs after they have been written to + write |= opd[v].dest; + } else { + // upcoming rcache_flush, start writing back unused dirty stuff + tmp2 = write|opd[0].source|opd[0].dest; // insn may change reg aliases + rcache_clean_mask(rcache_dirty_mask() & ~tmp2); + break; + } + // XXX must also include test-irq locations! 
+ tmp |= (OP_ISBRANCH(opd[v].op) || opd[v].op == OP_RTE || + opd[v].op == OP_TRAPA || opd[v].op == OP_UNDEFINED); + // regs needed in the next few instructions + if (v <= 4) + soon = late; + } + rcache_set_hint_soon(late); // insns 1-3 + rcache_set_hint_late(late & ~soon); // insns 4-9 + } + rcache_set_locked(opd[0].source); // try not to evict src regs for this op switch (opd->op) { case OP_BRANCH_N: + // never taken, just use up cycles goto end_op; - case OP_BRANCH: case OP_BRANCH_CT: case OP_BRANCH_CF: @@ -1868,8 +2511,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case OP_BRANCH_R: if (opd->dest & BITMASK1(SHR_PR)) emit_move_r_imm32(SHR_PR, pc + 2); - if (gconst_get(opd->rm, &tmp)) { - opd->imm = tmp; + if (gconst_get(opd->rm, &u)) { + opd->imm = u; drcf.pending_branch_direct = 1; } else { emit_move_r_r(SHR_PC, opd->rm); @@ -1878,17 +2521,17 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto end_op; case OP_BRANCH_RF: - if (gconst_get(GET_Rn(), &tmp)) { + if (gconst_get(GET_Rn(), &u)) { if (opd->dest & BITMASK1(SHR_PR)) emit_move_r_imm32(SHR_PR, pc + 2); - opd->imm = pc + 2 + tmp; + opd->imm = pc + 2 + u; drcf.pending_branch_direct = 1; } else { - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ); - tmp = rcache_get_reg(SHR_PC, RC_GR_WRITE); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp = rcache_get_reg(SHR_PC, RC_GR_WRITE, NULL); emith_move_r_imm(tmp, pc + 2); if (opd->dest & BITMASK1(SHR_PR)) { - tmp3 = rcache_get_reg(SHR_PR, RC_GR_WRITE); + tmp3 = rcache_get_reg(SHR_PR, RC_GR_WRITE, NULL); emith_move_r_r(tmp3, tmp); } emith_add_r_r(tmp, tmp2); @@ -1896,22 +2539,18 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } goto end_op; - case OP_SLEEP: + case OP_SLEEP: // SLEEP 0000000000011011 printf("TODO sleep\n"); goto end_op; - case OP_RTE: + case OP_RTE: // RTE 0000000000101011 // pop PC - emit_memhandler_read_rr(SHR_PC, SHR_SP, 0, 2); + emit_memhandler_read_rr(sh2, SHR_PC, SHR_SP, 0, 2 | 
MF_POSTINCR); // pop SR - tmp = rcache_get_reg_arg(0, SHR_SP); - emith_add_r_imm(tmp, 4); - tmp = emit_memhandler_read(2); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + tmp = emit_memhandler_read_rr(sh2, SHR_TMP, SHR_SP, 0, 2 | MF_POSTINCR); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_write_sr(sr, tmp); rcache_free_tmp(tmp); - tmp = rcache_get_reg(SHR_SP, RC_GR_RMW); - emith_add_r_imm(tmp, 4*2); drcf.test_irq = 1; drcf.pending_branch_indirect = 1; goto end_op; @@ -1921,30 +2560,27 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) "drc: illegal op %04x @ %08x", op, pc - 2); opd->imm = (op_flags[i] & OF_B_IN_DS) ? 6 : 4; // fallthrough - case OP_TRAPA: - tmp = rcache_get_reg(SHR_SP, RC_GR_RMW); - emith_sub_r_imm(tmp, 4*2); + case OP_TRAPA: // TRAPA #imm 11000011iiiiiiii // push SR - tmp = rcache_get_reg_arg(0, SHR_SP); - emith_add_r_imm(tmp, 4); - tmp = rcache_get_reg_arg(1, SHR_SR); - emith_clear_msb(tmp, tmp, 22); - emit_memhandler_write(2); + tmp = rcache_get_reg_arg(1, SHR_SR, &tmp2); + emith_clear_msb(tmp, tmp2, 22); + emit_memhandler_write_rr(sh2, SHR_TMP, SHR_SP, 0, 2 | MF_PREDECR); // push PC - rcache_get_reg_arg(0, SHR_SP); - tmp = rcache_get_tmp_arg(1); - if (op == OP_TRAPA) + if (op == OP_TRAPA) { + tmp = rcache_get_tmp_arg(1); emith_move_r_imm(tmp, pc); - else if (drcf.pending_branch_indirect) { - tmp2 = rcache_get_reg(SHR_PC, RC_GR_READ); - emith_move_r_r(tmp, tmp2); - } else + } else if (drcf.pending_branch_indirect) { + tmp = rcache_get_reg_arg(1, SHR_PC, NULL); + } else { + tmp = rcache_get_tmp_arg(1); emith_move_r_imm(tmp, pc - 2); - emit_memhandler_write(2); + } + emith_move_r_imm(tmp, pc); + emit_memhandler_write_rr(sh2, SHR_TMP, SHR_SP, 0, 2 | MF_PREDECR); // obtain new PC - emit_memhandler_read_rr(SHR_PC, SHR_VBR, opd->imm * 4, 2); + emit_memhandler_read_rr(sh2, SHR_PC, SHR_VBR, opd->imm * 4, 2); // indirect jump -> back to dispatcher - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); 
FLUSH_CYCLES(sr); rcache_flush(); emith_jump(sh2_drc_dispatcher); @@ -1952,25 +2588,27 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case OP_LOAD_POOL: #if PROPAGATE_CONSTANTS - if (opd->imm != 0 && opd->imm < end_literals - && literal_addr_count < MAX_LITERALS) + if ((opd->imm && opd->imm >= base_pc && opd->imm < end_literals) || + dr_is_rom(opd->imm)) { ADD_TO_ARRAY(literal_addr, literal_addr_count, opd->imm,); if (opd->size == 2) - tmp = FETCH32(opd->imm); + u = FETCH32(opd->imm); else - tmp = (s16)FETCH_OP(opd->imm); - gconst_new(GET_Rn(), tmp); + u = (s16)FETCH_OP(opd->imm); + // tweak for Blackthorne: avoid stack overwriting + if (GET_Rn() == SHR_SP && u == 0x0603f800) u = 0x0603f880; + gconst_new(GET_Rn(), u); } else #endif { - tmp = rcache_get_tmp_arg(0); - if (opd->imm != 0) + if (opd->imm != 0) { + tmp = rcache_get_tmp_arg(0); emith_move_r_imm(tmp, opd->imm); - else { - // have to calculate read addr from PC - tmp2 = rcache_get_reg(SHR_PC, RC_GR_READ); + } else { + // have to calculate read addr from PC for delay slot + tmp = rcache_get_reg_arg(0, SHR_PC, &tmp2); if (opd->size == 2) { emith_add_r_r_imm(tmp, tmp2, 2 + (op & 0xff) * 4); emith_bic_r_imm(tmp, 3); @@ -1979,21 +2617,27 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_add_r_r_imm(tmp, tmp2, 2 + (op & 0xff) * 2); } tmp2 = emit_memhandler_read(opd->size); - tmp3 = rcache_get_reg(GET_Rn(), RC_GR_WRITE); - if (opd->size == 2) - emith_move_r_r(tmp3, tmp2); - else +#if REMAP_REGISTER + tmp3 = rcache_map_reg(GET_Rn(), tmp2, opd->size != 2 ? 
RC_GR_RMW : RC_GR_WRITE); +#else + tmp3 = rcache_get_reg(GET_Rn(), RC_GR_WRITE, NULL); +#endif + if (opd->size != 2) { emith_sext(tmp3, tmp2, 16); - rcache_free_tmp(tmp2); + } else if (tmp3 != tmp2) + emith_move_r_r(tmp3, tmp2); + if (tmp3 != tmp2) + rcache_free_tmp(tmp2); } goto end_op; - case OP_MOVA: + case OP_MOVA: // MOVA @(disp,PC),R0 11000111dddddddd if (opd->imm != 0) emit_move_r_imm32(SHR_R0, opd->imm); - else { // delay slot case, pc can have either value - tmp2 = rcache_get_reg(SHR_PC, RC_GR_READ); - tmp = rcache_get_reg(SHR_R0, RC_GR_WRITE); + else { + // have to calculate addr from PC for delay slot + tmp2 = rcache_get_reg(SHR_PC, RC_GR_READ, NULL); + tmp = rcache_get_reg(SHR_R0, RC_GR_WRITE, NULL); emith_add_r_r_imm(tmp, tmp2, 2 + (op & 0xff) * 4); emith_bic_r_imm(tmp, 3); } @@ -2021,33 +2665,34 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) default: goto default_; } - tmp3 = rcache_get_reg(tmp2, RC_GR_READ); - tmp = rcache_get_reg(GET_Rn(), RC_GR_WRITE); - emith_move_r_r(tmp, tmp3); - if (tmp2 == SHR_SR) - emith_clear_msb(tmp, tmp, 22); // reserved bits defined by ISA as 0 + if (tmp2 == SHR_SR) { + sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); + tmp = rcache_get_reg(GET_Rn(), RC_GR_WRITE, NULL); + emith_clear_msb(tmp, sr, 22); // reserved bits defined by ISA as 0 + } else + emit_move_r_r(GET_Rn(), tmp2); goto end_op; case 0x04: // MOV.B Rm,@(R0,Rn) 0000nnnnmmmm0100 case 0x05: // MOV.W Rm,@(R0,Rn) 0000nnnnmmmm0101 case 0x06: // MOV.L Rm,@(R0,Rn) 0000nnnnmmmm0110 - emit_indirect_indexed_write(GET_Rm(), SHR_R0, GET_Rn(), op & 3); + emit_indirect_indexed_write(sh2, GET_Rm(), SHR_R0, GET_Rn(), op & 3); goto end_op; case 0x07: // MUL.L Rm,Rn 0000nnnnmmmm0111 - tmp = rcache_get_reg(GET_Rn(), RC_GR_READ); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); - tmp3 = rcache_get_reg(SHR_MACL, RC_GR_WRITE); + tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp3 = rcache_get_reg(SHR_MACL, 
RC_GR_WRITE, NULL); emith_mul(tmp3, tmp2, tmp); goto end_op; case 0x08: switch (GET_Fx()) { case 0: // CLRT 0000000000001000 - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_bic_r_imm(sr, T); break; case 1: // SETT 0000000000011000 - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_or_r_imm(sr, T); break; case 2: // CLRMAC 0000000000101000 @@ -2064,12 +2709,12 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0: // NOP 0000000000001001 break; case 1: // DIV0U 0000000000011001 - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_bic_r_imm(sr, M|Q|T); break; case 2: // MOVT Rn 0000nnnn00101001 - sr = rcache_get_reg(SHR_SR, RC_GR_READ); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_WRITE); + sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_WRITE, NULL); emith_clear_msb(tmp2, sr, 31); break; default: @@ -2096,13 +2741,13 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x0c: // MOV.B @(R0,Rm),Rn 0000nnnnmmmm1100 case 0x0d: // MOV.W @(R0,Rm),Rn 0000nnnnmmmm1101 case 0x0e: // MOV.L @(R0,Rm),Rn 0000nnnnmmmm1110 - emit_indirect_indexed_read(GET_Rn(), SHR_R0, GET_Rm(), op & 3); + emit_indirect_indexed_read(sh2, GET_Rn(), SHR_R0, GET_Rm(), op & 3); goto end_op; case 0x0f: // MAC.L @Rm+,@Rn+ 0000nnnnmmmm1111 - emit_indirect_read_double(&tmp, &tmp2, GET_Rn(), GET_Rm(), 2); - sr = rcache_get_reg(SHR_SR, RC_GR_READ); - tmp3 = rcache_get_reg(SHR_MACL, RC_GR_RMW); - tmp4 = rcache_get_reg(SHR_MACH, RC_GR_RMW); + emit_indirect_read_double(sh2, &tmp, &tmp2, GET_Rn(), GET_Rm(), 2); + sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); + tmp3 = rcache_get_reg(SHR_MACL, RC_GR_RMW, NULL); + tmp4 = rcache_get_reg(SHR_MACH, RC_GR_RMW, NULL); emith_sh2_macl(tmp3, tmp4, tmp, tmp2, sr); rcache_free_tmp(tmp2); rcache_free_tmp(tmp); @@ -2113,7 +2758,7 @@ static void REGPARM(2) 
*sh2_translate(SH2 *sh2, int tcache_id) ///////////////////////////////////////////// case 0x01: // MOV.L Rm,@(disp,Rn) 0001nnnnmmmmdddd - emit_memhandler_write_rr(GET_Rm(), GET_Rn(), (op & 0x0f) * 4, 2); + emit_memhandler_write_rr(sh2, GET_Rm(), GET_Rn(), (op & 0x0f) * 4, 2); goto end_op; case 0x02: @@ -2122,19 +2767,17 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x00: // MOV.B Rm,@Rn 0010nnnnmmmm0000 case 0x01: // MOV.W Rm,@Rn 0010nnnnmmmm0001 case 0x02: // MOV.L Rm,@Rn 0010nnnnmmmm0010 - emit_memhandler_write_rr(GET_Rm(), GET_Rn(), 0, op & 3); + emit_memhandler_write_rr(sh2, GET_Rm(), GET_Rn(), 0, op & 3); goto end_op; case 0x04: // MOV.B Rm,@-Rn 0010nnnnmmmm0100 case 0x05: // MOV.W Rm,@-Rn 0010nnnnmmmm0101 case 0x06: // MOV.L Rm,@-Rn 0010nnnnmmmm0110 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - emith_sub_r_imm(tmp, (1 << (op & 3))); - emit_memhandler_write_rr(GET_Rm(), GET_Rn(), 0, op & 3); + emit_memhandler_write_rr(sh2, GET_Rm(), GET_Rn(), 0, (op & 3) | MF_PREDECR); goto end_op; case 0x07: // DIV0S Rm,Rn 0010nnnnmmmm0111 - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ); - tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); emith_bic_r_imm(sr, M|Q|T); emith_tst_r_imm(tmp2, (1<<31)); EMITH_SJMP_START(DCOND_EQ); @@ -2150,56 +2793,69 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) EMITH_SJMP_END(DCOND_PL); goto end_op; case 0x08: // TST Rm,Rn 0010nnnnmmmm1000 - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ); - tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); emith_bic_r_imm(sr, T); emith_tst_r_r(tmp2, tmp3); emit_or_t_if_eq(sr); goto end_op; case 
0x09: // AND Rm,Rn 0010nnnnmmmm1001 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); - emith_and_r_r(tmp, tmp2); + if (GET_Rm() != GET_Rn()) { + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); + emith_and_r_r_r(tmp, tmp3, tmp2); + } goto end_op; case 0x0a: // XOR Rm,Rn 0010nnnnmmmm1010 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); - emith_eor_r_r(tmp, tmp2); +#if PROPAGATE_CONSTANTS + if (GET_Rn() == GET_Rm()) { + gconst_new(GET_Rn(), 0); + goto end_op; + } +#endif + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); + emith_eor_r_r_r(tmp, tmp3, tmp2); goto end_op; case 0x0b: // OR Rm,Rn 0010nnnnmmmm1011 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); - emith_or_r_r(tmp, tmp2); + if (GET_Rm() != GET_Rn()) { + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); + emith_or_r_r_r(tmp, tmp3, tmp2); + } goto end_op; case 0x0c: // CMP/STR Rm,Rn 0010nnnnmmmm1100 tmp = rcache_get_tmp(); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ); - tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); emith_eor_r_r_r(tmp, tmp2, tmp3); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_bic_r_imm(sr, T); emith_tst_r_imm(tmp, 0x000000ff); - emit_or_t_if_eq(sr); - emith_tst_r_imm(tmp, 0x0000ff00); - emit_or_t_if_eq(sr); - emith_tst_r_imm(tmp, 0x00ff0000); - emit_or_t_if_eq(sr); - emith_tst_r_imm(tmp, 0xff000000); + EMITH_SJMP_START(DCOND_EQ); + emith_tst_r_imm_c(DCOND_NE, tmp, 0x0000ff00); + EMITH_SJMP_START(DCOND_EQ); + emith_tst_r_imm_c(DCOND_NE, tmp, 0x00ff0000); + EMITH_SJMP_START(DCOND_EQ); + emith_tst_r_imm_c(DCOND_NE, tmp, 0xff000000); + 
EMITH_SJMP_END(DCOND_EQ); + EMITH_SJMP_END(DCOND_EQ); + EMITH_SJMP_END(DCOND_EQ); emit_or_t_if_eq(sr); rcache_free_tmp(tmp); goto end_op; case 0x0d: // XTRCT Rm,Rn 0010nnnnmmmm1101 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); - emith_lsr(tmp, tmp, 16); + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); + emith_lsr(tmp, tmp3, 16); emith_or_r_r_lsl(tmp, tmp2, 16); goto end_op; case 0x0e: // MULU.W Rm,Rn 0010nnnnmmmm1110 case 0x0f: // MULS.W Rm,Rn 0010nnnnmmmm1111 - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ); - tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ); - tmp = rcache_get_reg(SHR_MACL, RC_GR_WRITE); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_reg(SHR_MACL, RC_GR_WRITE, NULL); if (op & 1) { emith_sext(tmp, tmp2, 16); } else @@ -2224,9 +2880,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x03: // CMP/GE Rm,Rn 0011nnnnmmmm0011 case 0x06: // CMP/HI Rm,Rn 0011nnnnmmmm0110 case 0x07: // CMP/GT Rm,Rn 0011nnnnmmmm0111 - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ); - tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); emith_bic_r_imm(sr, T); emith_cmp_r_r(tmp2, tmp3); switch (op & 0x07) @@ -2264,11 +2920,11 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // Q2 = carry(Rn -= Rm) // Q = M ^ Q1 ^ Q2 // T = (Q == M) = !(Q ^ M) = !(Q1 ^ Q2) - tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_tpop_carry(sr, 0); - emith_adcf_r_r(tmp2, 
tmp2); + emith_adcf_r_r_r(tmp2, tmp, tmp); emith_tpush_carry(sr, 0); // keep Q1 in T for now tmp4 = rcache_get_tmp(); emith_and_r_r_imm(tmp4, sr, M); @@ -2289,55 +2945,61 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_eor_r_imm(sr, T); // T = !(Q1 ^ Q2) goto end_op; case 0x05: // DMULU.L Rm,Rn 0011nnnnmmmm0101 - tmp = rcache_get_reg(GET_Rn(), RC_GR_READ); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); - tmp3 = rcache_get_reg(SHR_MACL, RC_GR_WRITE); - tmp4 = rcache_get_reg(SHR_MACH, RC_GR_WRITE); + tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp3 = rcache_get_reg(SHR_MACL, RC_GR_WRITE, NULL); + tmp4 = rcache_get_reg(SHR_MACH, RC_GR_WRITE, NULL); emith_mul_u64(tmp3, tmp4, tmp, tmp2); goto end_op; case 0x08: // SUB Rm,Rn 0011nnnnmmmm1000 +#if PROPAGATE_CONSTANTS + if (GET_Rn() == GET_Rm()) { + gconst_new(GET_Rn(), 0); + goto end_op; + } +#endif case 0x0c: // ADD Rm,Rn 0011nnnnmmmm1100 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); if (op & 4) { - emith_add_r_r(tmp, tmp2); + emith_add_r_r_r(tmp, tmp3, tmp2); } else - emith_sub_r_r(tmp, tmp2); + emith_sub_r_r_r(tmp, tmp3, tmp2); goto end_op; case 0x0a: // SUBC Rm,Rn 0011nnnnmmmm1010 case 0x0e: // ADDC Rm,Rn 0011nnnnmmmm1110 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); if (op & 4) { // adc emith_tpop_carry(sr, 0); - emith_adcf_r_r(tmp, tmp2); + emith_adcf_r_r_r(tmp, tmp3, tmp2); emith_tpush_carry(sr, 0); } else { emith_tpop_carry(sr, 1); - emith_sbcf_r_r(tmp, tmp2); + emith_sbcf_r_r_r(tmp, tmp3, tmp2); emith_tpush_carry(sr, 1); } goto end_op; case 
0x0b: // SUBV Rm,Rn 0011nnnnmmmm1011 case 0x0f: // ADDV Rm,Rn 0011nnnnmmmm1111 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_bic_r_imm(sr, T); if (op & 4) { - emith_addf_r_r(tmp, tmp2); + emith_addf_r_r_r(tmp, tmp3, tmp2); } else - emith_subf_r_r(tmp, tmp2); + emith_subf_r_r_r(tmp, tmp3, tmp2); EMITH_SJMP_START(DCOND_VC); emith_or_r_imm_c(DCOND_VS, sr, T); EMITH_SJMP_END(DCOND_VC); goto end_op; case 0x0d: // DMULS.L Rm,Rn 0011nnnnmmmm1101 - tmp = rcache_get_reg(GET_Rn(), RC_GR_READ); - tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ); - tmp3 = rcache_get_reg(SHR_MACL, RC_GR_WRITE); - tmp4 = rcache_get_reg(SHR_MACH, RC_GR_WRITE); + tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp3 = rcache_get_reg(SHR_MACL, RC_GR_WRITE, NULL); + tmp4 = rcache_get_reg(SHR_MACH, RC_GR_WRITE, NULL); emith_mul_s64(tmp3, tmp4, tmp, tmp2); goto end_op; } @@ -2352,17 +3014,17 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { case 0: // SHLL Rn 0100nnnn00000000 case 2: // SHAL Rn 0100nnnn00100000 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_tpop_carry(sr, 0); // dummy - emith_lslf(tmp, tmp, 1); + emith_lslf(tmp, tmp2, 1); emith_tpush_carry(sr, 0); goto end_op; case 1: // DT Rn 0100nnnn00010000 - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_bic_r_imm(sr, T); - emith_subf_r_imm(tmp, 1); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); + emith_subf_r_r_imm(tmp, tmp2, 1); emit_or_t_if_eq(sr); goto end_op; } @@ -2372,18 +3034,18 @@ static 
void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { case 0: // SHLR Rn 0100nnnn00000001 case 2: // SHAR Rn 0100nnnn00100001 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_tpop_carry(sr, 0); // dummy if (op & 0x20) { - emith_asrf(tmp, tmp, 1); + emith_asrf(tmp, tmp2, 1); } else - emith_lsrf(tmp, tmp, 1); + emith_lsrf(tmp, tmp2, 1); emith_tpush_carry(sr, 0); goto end_op; case 1: // CMP/PZ Rn 0100nnnn00010001 - tmp = rcache_get_reg(GET_Rn(), RC_GR_READ); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_bic_r_imm(sr, T); emith_cmp_r_imm(tmp, 0); EMITH_SJMP_START(DCOND_LT); @@ -2417,14 +3079,12 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) default: goto default_; } - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW); - emith_sub_r_imm(tmp2, 4); - rcache_clean(); // XXX - rcache_get_reg_arg(0, GET_Rn()); - tmp3 = rcache_get_reg_arg(1, tmp); - if (tmp == SHR_SR) - emith_clear_msb(tmp3, tmp3, 22); // reserved bits defined by ISA as 0 - emit_memhandler_write(2); + tmp3 = rcache_get_reg_arg(1, tmp, &tmp4); + if (tmp == SHR_SR) { + emith_clear_msb(tmp3, tmp4, 22); // reserved bits defined by ISA as 0 + } else if (tmp3 != tmp4) + emith_move_r_r(tmp3, tmp4); + emit_memhandler_write_rr(sh2, SHR_TMP, GET_Rn(), 0, 2 | MF_PREDECR); goto end_op; case 0x04: case 0x05: @@ -2432,19 +3092,19 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { case 0x04: // ROTL Rn 0100nnnn00000100 case 0x05: // ROTR Rn 0100nnnn00000101 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_tpop_carry(sr, 0); // dummy if (op & 1) { - emith_rorf(tmp, tmp, 1); + emith_rorf(tmp, tmp2, 1); } else - 
emith_rolf(tmp, tmp, 1); + emith_rolf(tmp, tmp2, 1); emith_tpush_carry(sr, 0); goto end_op; case 0x24: // ROTCL Rn 0100nnnn00100100 case 0x25: // ROTCR Rn 0100nnnn00100101 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, NULL); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_tpop_carry(sr, 0); if (op & 1) { emith_rorcf(tmp); @@ -2453,8 +3113,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_tpush_carry(sr, 0); goto end_op; case 0x15: // CMP/PL Rn 0100nnnn00010101 - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_bic_r_imm(sr, T); emith_cmp_r_imm(tmp, 0); EMITH_SJMP_START(DCOND_LE); @@ -2489,15 +3149,13 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto default_; } if (tmp == SHR_SR) { - tmp2 = emit_memhandler_read_rr(SHR_TMP, GET_Rn(), 0, 2); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + tmp2 = emit_memhandler_read_rr(sh2, SHR_TMP, GET_Rn(), 0, 2 | MF_POSTINCR); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_write_sr(sr, tmp2); rcache_free_tmp(tmp2); drcf.test_irq = 1; } else - emit_memhandler_read_rr(tmp, GET_Rn(), 0, 2); - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); - emith_add_r_imm(tmp, 4); + emit_memhandler_read_rr(sh2, tmp, GET_Rn(), 0, 2 | MF_POSTINCR); goto end_op; case 0x08: case 0x09: @@ -2521,11 +3179,11 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) default: goto default_; } - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); if (op & 1) { - emith_lsr(tmp2, tmp2, tmp); + emith_lsr(tmp2, tmp3, tmp); } else - emith_lsl(tmp2, tmp2, tmp); + emith_lsl(tmp2, tmp3, tmp); goto end_op; case 0x0a: switch (GET_Fx()) @@ -2549,18 +3207,17 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { case 1: // TAS.B @Rn 
0100nnnn00011011 // XXX: is TAS working on 32X? - rcache_get_reg_arg(0, GET_Rn()); + rcache_get_reg_arg(0, GET_Rn(), NULL); tmp = emit_memhandler_read(0); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_bic_r_imm(sr, T); emith_cmp_r_imm(tmp, 0); emit_or_t_if_eq(sr); - rcache_clean(); // XXX emith_or_r_imm(tmp, 0x80); tmp2 = rcache_get_tmp_arg(1); // assuming it differs to tmp emith_move_r_r(tmp2, tmp); rcache_free_tmp(tmp); - rcache_get_reg_arg(0, GET_Rn()); + rcache_get_reg_arg(0, GET_Rn(), NULL); emit_memhandler_write(0); break; default: @@ -2568,7 +3225,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } goto end_op; case 0x0e: - tmp = rcache_get_reg(GET_Rn(), RC_GR_READ); switch (GET_Fx()) { case 0: // LDC Rm,SR 0100mmmm00001110 @@ -2584,20 +3240,19 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto default_; } if (tmp2 == SHR_SR) { - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); emith_write_sr(sr, tmp); drcf.test_irq = 1; - } else { - tmp2 = rcache_get_reg(tmp2, RC_GR_WRITE); - emith_move_r_r(tmp2, tmp); - } + } else + emit_move_r_r(tmp2, GET_Rn()); goto end_op; case 0x0f: // MAC.W @Rm+,@Rn+ 0100nnnnmmmm1111 - emit_indirect_read_double(&tmp, &tmp2, GET_Rn(), GET_Rm(), 1); - sr = rcache_get_reg(SHR_SR, RC_GR_READ); - tmp3 = rcache_get_reg(SHR_MACL, RC_GR_RMW); - tmp4 = rcache_get_reg(SHR_MACH, RC_GR_RMW); + emit_indirect_read_double(sh2, &tmp, &tmp2, GET_Rn(), GET_Rm(), 1); + sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); + tmp3 = rcache_get_reg(SHR_MACL, RC_GR_RMW, NULL); + tmp4 = rcache_get_reg(SHR_MACH, RC_GR_RMW, NULL); emith_sh2_macw(tmp3, tmp4, tmp, tmp2, sr); rcache_free_tmp(tmp2); rcache_free_tmp(tmp); @@ -2608,7 +3263,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) ///////////////////////////////////////////// case 0x05: // MOV.L @(disp,Rm),Rn 
0101nnnnmmmmdddd - emit_memhandler_read_rr(GET_Rn(), GET_Rm(), (op & 0x0f) * 4, 2); + emit_memhandler_read_rr(sh2, GET_Rn(), GET_Rm(), (op & 0x0f) * 4, 2); goto end_op; ///////////////////////////////////////////// @@ -2621,21 +3276,17 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x04: // MOV.B @Rm+,Rn 0110nnnnmmmm0100 case 0x05: // MOV.W @Rm+,Rn 0110nnnnmmmm0101 case 0x06: // MOV.L @Rm+,Rn 0110nnnnmmmm0110 - emit_memhandler_read_rr(GET_Rn(), GET_Rm(), 0, op & 3); - if ((op & 7) >= 4 && GET_Rn() != GET_Rm()) { - tmp = rcache_get_reg(GET_Rm(), RC_GR_RMW); - emith_add_r_imm(tmp, (1 << (op & 3))); - } + tmp = ((op & 7) >= 4 && GET_Rn() != GET_Rm()) ? MF_POSTINCR : 0; + emit_memhandler_read_rr(sh2, GET_Rn(), GET_Rm(), 0, (op & 3) | tmp); + goto end_op; + case 0x03: // MOV Rm,Rn 0110nnnnmmmm0011 + emit_move_r_r(GET_Rn(), GET_Rm()); goto end_op; - case 0x03: case 0x07 ... 0x0f: - tmp = rcache_get_reg(GET_Rm(), RC_GR_READ); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_WRITE); + tmp = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_WRITE, NULL); switch (op & 0x0f) { - case 0x03: // MOV Rm,Rn 0110nnnnmmmm0011 - emith_move_r_r(tmp2, tmp); - break; case 0x07: // NOT Rm,Rn 0110nnnnmmmm0111 emith_mvn_r_r(tmp2, tmp); break; @@ -2657,7 +3308,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_rol(tmp2, tmp, 16); break; case 0x0a: // NEGC Rm,Rn 0110nnnnmmmm1010 - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_tpop_carry(sr, 1); emith_negcf_r_r(tmp2, tmp); emith_tpush_carry(sr, 1); @@ -2685,11 +3336,11 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) ///////////////////////////////////////////// case 0x07: // ADD #imm,Rn 0111nnnniiiiiiii - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW); + tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); if (op & 0x80) { // adding negative - emith_sub_r_imm(tmp, -op & 0xff); + emith_sub_r_r_imm(tmp, tmp2, -op & 
0xff); } else - emith_add_r_imm(tmp, op & 0xff); + emith_add_r_r_imm(tmp, tmp2, op & 0xff); goto end_op; ///////////////////////////////////////////// @@ -2699,17 +3350,16 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x0000: // MOV.B R0,@(disp,Rn) 10000000nnnndddd case 0x0100: // MOV.W R0,@(disp,Rn) 10000001nnnndddd tmp = (op & 0x100) >> 8; - emit_memhandler_write_rr(SHR_R0, GET_Rm(), (op & 0x0f) << tmp, tmp); + emit_memhandler_write_rr(sh2, SHR_R0, GET_Rm(), (op & 0x0f) << tmp, tmp); goto end_op; case 0x0400: // MOV.B @(disp,Rm),R0 10000100mmmmdddd case 0x0500: // MOV.W @(disp,Rm),R0 10000101mmmmdddd tmp = (op & 0x100) >> 8; - emit_memhandler_read_rr(SHR_R0, GET_Rm(), (op & 0x0f) << tmp, tmp); + emit_memhandler_read_rr(sh2, SHR_R0, GET_Rm(), (op & 0x0f) << tmp, tmp); goto end_op; case 0x0800: // CMP/EQ #imm,R0 10001000iiiiiiii - // XXX: could use cmn - tmp2 = rcache_get_reg(SHR_R0, RC_GR_READ); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + tmp2 = rcache_get_reg(SHR_R0, RC_GR_READ, NULL); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_bic_r_imm(sr, T); emith_cmp_r_imm(tmp2, (s8)(op & 0xff)); emit_or_t_if_eq(sr); @@ -2725,60 +3375,62 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x0100: // MOV.W R0,@(disp,GBR) 11000001dddddddd case 0x0200: // MOV.L R0,@(disp,GBR) 11000010dddddddd tmp = (op & 0x300) >> 8; - emit_memhandler_write_rr(SHR_R0, SHR_GBR, (op & 0xff) << tmp, tmp); + emit_memhandler_write_rr(sh2, SHR_R0, SHR_GBR, (op & 0xff) << tmp, tmp); goto end_op; case 0x0400: // MOV.B @(disp,GBR),R0 11000100dddddddd case 0x0500: // MOV.W @(disp,GBR),R0 11000101dddddddd case 0x0600: // MOV.L @(disp,GBR),R0 11000110dddddddd tmp = (op & 0x300) >> 8; - emit_memhandler_read_rr(SHR_R0, SHR_GBR, (op & 0xff) << tmp, tmp); + emit_memhandler_read_rr(sh2, SHR_R0, SHR_GBR, (op & 0xff) << tmp, tmp); goto end_op; case 0x0800: // TST #imm,R0 11001000iiiiiiii - tmp = rcache_get_reg(SHR_R0, RC_GR_READ); - sr = 
rcache_get_reg(SHR_SR, RC_GR_RMW); + tmp = rcache_get_reg(SHR_R0, RC_GR_READ, NULL); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_bic_r_imm(sr, T); emith_tst_r_imm(tmp, op & 0xff); emit_or_t_if_eq(sr); goto end_op; case 0x0900: // AND #imm,R0 11001001iiiiiiii - tmp = rcache_get_reg(SHR_R0, RC_GR_RMW); - emith_and_r_imm(tmp, op & 0xff); + tmp = rcache_get_reg(SHR_R0, RC_GR_RMW, &tmp2); + emith_and_r_r_imm(tmp, tmp2, (op & 0xff)); goto end_op; case 0x0a00: // XOR #imm,R0 11001010iiiiiiii - tmp = rcache_get_reg(SHR_R0, RC_GR_RMW); - emith_eor_r_imm(tmp, op & 0xff); + if (op & 0xff) { + tmp = rcache_get_reg(SHR_R0, RC_GR_RMW, &tmp2); + emith_eor_r_r_imm(tmp, tmp2, (op & 0xff)); + } goto end_op; case 0x0b00: // OR #imm,R0 11001011iiiiiiii - tmp = rcache_get_reg(SHR_R0, RC_GR_RMW); - emith_or_r_imm(tmp, op & 0xff); + if (op & 0xff) { + tmp = rcache_get_reg(SHR_R0, RC_GR_RMW, &tmp2); + emith_or_r_r_imm(tmp, tmp2, (op & 0xff)); + } goto end_op; case 0x0c00: // TST.B #imm,@(R0,GBR) 11001100iiiiiiii - tmp = emit_indirect_indexed_read(SHR_TMP, SHR_R0, SHR_GBR, 0); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + tmp = emit_indirect_indexed_read(sh2, SHR_TMP, SHR_R0, SHR_GBR, 0); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_bic_r_imm(sr, T); emith_tst_r_imm(tmp, op & 0xff); emit_or_t_if_eq(sr); rcache_free_tmp(tmp); goto end_op; case 0x0d00: // AND.B #imm,@(R0,GBR) 11001101iiiiiiii - tmp = emit_indirect_indexed_read(SHR_TMP, SHR_R0, SHR_GBR, 0); - emith_and_r_imm(tmp, op & 0xff); + tmp = emit_indirect_indexed_read(sh2, SHR_TMP, SHR_R0, SHR_GBR, 0); + tmp2 = rcache_get_tmp_arg(1); + emith_and_r_r_imm(tmp2, tmp, (op & 0xff)); goto end_rmw_op; case 0x0e00: // XOR.B #imm,@(R0,GBR) 11001110iiiiiiii - tmp = emit_indirect_indexed_read(SHR_TMP, SHR_R0, SHR_GBR, 0); - emith_eor_r_imm(tmp, op & 0xff); + tmp = emit_indirect_indexed_read(sh2, SHR_TMP, SHR_R0, SHR_GBR, 0); + tmp2 = rcache_get_tmp_arg(1); + emith_eor_r_r_imm(tmp2, tmp, (op & 0xff)); goto end_rmw_op; case 
0x0f00: // OR.B #imm,@(R0,GBR) 11001111iiiiiiii - tmp = emit_indirect_indexed_read(SHR_TMP, SHR_R0, SHR_GBR, 0); - emith_or_r_imm(tmp, op & 0xff); - end_rmw_op: + tmp = emit_indirect_indexed_read(sh2, SHR_TMP, SHR_R0, SHR_GBR, 0); tmp2 = rcache_get_tmp_arg(1); - emith_move_r_r(tmp2, tmp); + emith_or_r_r_imm(tmp2, tmp, (op & 0xff)); + end_rmw_op: rcache_free_tmp(tmp); - tmp3 = rcache_get_reg_arg(0, SHR_GBR); - tmp4 = rcache_get_reg(SHR_R0, RC_GR_READ); - emith_add_r_r(tmp3, tmp4); - emit_memhandler_write(0); + emit_indirect_indexed_write(sh2, SHR_TMP, SHR_R0, SHR_GBR, 0); goto end_op; } goto default_; @@ -2786,7 +3438,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) ///////////////////////////////////////////// case 0x0e: // MOV #imm,Rn 1110nnnniiiiiiii - emit_move_r_imm32(GET_Rn(), (u32)(signed int)(signed char)op); + emit_move_r_imm32(GET_Rn(), (s8)op); goto end_op; default: @@ -2810,7 +3462,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // test irq? if (drcf.test_irq && !drcf.pending_branch_direct) { - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); if (!drcf.pending_branch_indirect) emit_move_r_imm32(SHR_PC, pc); @@ -2831,16 +3483,16 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) void *target = NULL; int ctaken = 0; - if (opd_b->op == OP_BRANCH_CT || opd_b->op == OP_BRANCH_CF) { + if (OP_ISBRACND(opd_b->op)) { ctaken = (op_flags[i] & OF_DELAY_OP) ? 1 : 2; } cycles += ctaken; // assume branch taken - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); rcache_clean(); // emit condition test for conditional branch - if (opd_b->op == OP_BRANCH_CT || opd_b->op == OP_BRANCH_CF) { + if (OP_ISBRACND(opd_b->op)) { cond = (opd_b->op == OP_BRANCH_CF) ? 
DCOND_EQ : DCOND_NE; if (delay_dep_fw & BITMASK1(SHR_T)) emith_tst_r_imm(sr, T_save); @@ -2893,7 +3545,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) drcf.pending_branch_direct = 0; } else if (drcf.pending_branch_indirect) { - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); rcache_flush(); emith_jump(sh2_drc_dispatcher); @@ -2903,22 +3555,17 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) do_host_disasm(tcache_id); } - tmp = rcache_get_reg(SHR_SR, RC_GR_RMW); - FLUSH_CYCLES(tmp); - rcache_flush(); - // check the last op if (op_flags[i-1] & OF_DELAY_OP) opd = &ops[i-2]; else opd = &ops[i-1]; - if (opd->op != OP_BRANCH && opd->op != OP_BRANCH_R - && opd->op != OP_BRANCH_RF && opd->op != OP_RTE) + if (! OP_ISBRAUC(opd->op)) { void *target; - s32 tmp = rcache_get_reg(SHR_SR, RC_GR_RMW); + s32 tmp = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(tmp); emit_move_r_imm32(SHR_PC, pc); @@ -2975,8 +3622,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // mark literals for (i = 0; i < literal_addr_count; i++) { - tmp = literal_addr[i]; - drc_ram_blk[(tmp & mask) >> shift] = 1; + u = literal_addr[i]; + drc_ram_blk[(u & mask) >> shift] = 1; } // add to invalidation lookup lists @@ -3037,7 +3684,7 @@ static void sh2_generate_utils(void) // sh2_drc_dispatcher(void) sh2_drc_dispatcher = (void *)tcache_ptr; - sr = rcache_get_reg(SHR_SR, RC_GR_READ); + sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); emith_cmp_r_imm(sr, 0); emith_jump_cond(DCOND_LT, sh2_drc_exit); rcache_invalidate(); @@ -3065,7 +3712,7 @@ static void sh2_generate_utils(void) // assumes it's called from main function (may jump to dispatcher) sh2_drc_test_irq = (void *)tcache_ptr; emith_ctx_read(arg1, offsetof(SH2, pending_level)); - sr = rcache_get_reg(SHR_SR, RC_GR_READ); + sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); emith_lsr(arg0, sr, I_SHIFT); emith_and_r_imm(arg0, 0x0f); 
emith_cmp_r_r(arg1, arg0); // pending_level > ((sr >> 4) & 0x0f)? @@ -3073,26 +3720,26 @@ static void sh2_generate_utils(void) emith_ret_c(DCOND_LE); // nope, return EMITH_SJMP_END(DCOND_GT); // adjust SP - tmp = rcache_get_reg(SHR_SP, RC_GR_RMW); + tmp = rcache_get_reg(SHR_SP, RC_GR_RMW, NULL); emith_sub_r_imm(tmp, 4*2); rcache_clean(); // push SR - tmp = rcache_get_reg_arg(0, SHR_SP); + tmp = rcache_get_reg_arg(0, SHR_SP, NULL); emith_add_r_imm(tmp, 4); - tmp = rcache_get_reg_arg(1, SHR_SR); + tmp = rcache_get_reg_arg(1, SHR_SR, NULL); emith_clear_msb(tmp, tmp, 22); emith_move_r_r_ptr(arg2, CONTEXT_REG); emith_call(p32x_sh2_write32); // XXX: use sh2_drc_write32? rcache_invalidate(); // push PC - rcache_get_reg_arg(0, SHR_SP); + rcache_get_reg_arg(0, SHR_SP, NULL); emith_ctx_read(arg1, SHR_PC * 4); emith_move_r_r_ptr(arg2, CONTEXT_REG); emith_call(p32x_sh2_write32); rcache_invalidate(); // update I, cycles, do callback emith_ctx_read(arg1, offsetof(SH2, pending_level)); - sr = rcache_get_reg(SHR_SR, RC_GR_RMW); + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_bic_r_imm(sr, I); emith_or_r_r_lsl(sr, arg1, I_SHIFT); emith_sub_r_imm(sr, 13 << 12); // at least 13 cycles @@ -3441,6 +4088,7 @@ int sh2_drc_init(SH2 *sh2) memset(block_link_pool_counts, 0, sizeof(block_link_pool_counts)); drc_cmn_init(); + rcache_init(); tcache_ptr = tcache; sh2_generate_utils(); host_instructions_updated(tcache, tcache_ptr); @@ -4102,7 +4750,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x07: // ADD #imm,Rn 0111nnnniiiiiiii opd->source = opd->dest = BITMASK1(GET_Rn()); - opd->imm = (int)(signed char)op; + opd->imm = (s8)op; break; ///////////////////////////////////////////// @@ -4132,7 +4780,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x0800: // CMP/EQ #imm,R0 10001000iiiiiiii opd->source = BITMASK1(SHR_R0); opd->dest = BITMASK1(SHR_T); - opd->imm = (int)(signed char)op; + opd->imm = (s8)op; break; case 
0x0d00: // BT/S label 10001101dddddddd case 0x0f00: // BF/S label 10001111dddddddd @@ -4305,7 +4953,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x0e: // MOV #imm,Rn 1110nnnniiiiiiii opd->dest = BITMASK1(GET_Rn()); - opd->imm = (u32)(signed int)(signed char)op; + opd->imm = (s8)op; break; default: @@ -4369,7 +5017,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, } // "overscan" detection: unreachable code after unconditional branch // this can happen if the insn after a forward branch isn't a local target - if (opd->op == OP_BRANCH || opd->op == OP_BRANCH_R || opd->op == OP_BRANCH_RF) { + if (OP_ISBRAUC(opd->op)) { if (op_flags[i + 1] & OF_DELAY_OP) { if (i_end > i + 2 && !(op_flags[i + 2] & OF_BTARGET)) i_end = i + 2; diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index c9cf7ab0b..b690435ce 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -31,8 +31,10 @@ void scan_block(unsigned int base_pc, int is_slave, // XXX MUST match definitions in cpu/sh2/compiler.c #if defined(__arm__) #define DRC_SR_REG r10 -#elif defined(__i386__) || defined(__x86_64__) +#elif defined(__i386__) #define DRC_SR_REG edi +#elif defined(__x86_64__) +#define DRC_SR_REG ebx #else #warning "direct DRC register access not available for this host" #endif From 5f166c638c9d01854bcd0cba2691dce53e2edea6 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 3 Apr 2019 23:21:20 +0200 Subject: [PATCH 025/174] sh2 memory interface optimizations --- cpu/drc/emit_arm.c | 38 +++++- cpu/drc/emit_x86.c | 68 +++++++++- cpu/sh2/compiler.c | 123 ++++++++++------- cpu/sh2/compiler.h | 4 +- cpu/sh2/sh2.h | 3 + pico/32x/memory.c | 180 ++++++++++++------------- pico/32x/memory_arm.S | 305 ++++++++++++++++++++++++++++++++++++++++++ pico/32x/sh2soc.c | 6 +- pico/pico_int.h | 6 +- tools/mkoffsets.sh | 9 ++ 10 files changed, 585 insertions(+), 157 deletions(-) create mode 100644 pico/32x/memory_arm.S diff --git a/cpu/drc/emit_arm.c
b/cpu/drc/emit_arm.c index 4421c6411..c255a8b82 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -65,6 +65,9 @@ #define DCOND_VS A_COND_VS #define DCOND_VC A_COND_VC +#define DCOND_CS A_COND_HS +#define DCOND_CC A_COND_LO + /* addressing mode 1 */ #define A_AM1_LSL 0 #define A_AM1_LSR 1 @@ -184,8 +187,10 @@ #define EOP_STR_SIMPLE(rd,rn) EOP_C_AM2_IMM(A_COND_AL,1,0,0,rn,rd,0) #define EOP_LDR_REG_LSL(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,0,1,rn,rd,shift_imm,A_AM1_LSL,rm) +#define EOP_LDRB_REG_LSL(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,1,1,rn,rd,shift_imm,A_AM1_LSL,rm); #define EOP_LDRH_IMM2(cond,rd,rn,offset_8) EOP_C_AM3_IMM(cond,1,1,rn,rd,0,1,offset_8) +#define EOP_LDRH_REG2(cond,rd,rn,rm) EOP_C_AM3_REG(cond,1,1,rn,rd,0,1,rm) #define EOP_LDRH_IMM( rd,rn,offset_8) EOP_C_AM3_IMM(A_COND_AL,1,1,rn,rd,0,1,offset_8) #define EOP_LDRH_SIMPLE(rd,rn) EOP_C_AM3_IMM(A_COND_AL,1,1,rn,rd,0,1,0) @@ -479,6 +484,8 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_adc_r_r(d, s) \ EOP_ADC_REG(A_COND_AL,0,d,d,s,A_AM1_LSL,0) +#define emith_and_r_r_c(cond, d, s) \ + EOP_AND_REG(cond,0,d,d,s,A_AM1_LSL,0) #define emith_and_r_r(d, s) \ EOP_AND_REG(A_COND_AL,0,d,d,s,A_AM1_LSL,0) @@ -677,12 +684,24 @@ static int emith_xbranch(int cond, void *target, int is_call) // misc #define emith_read_r_r_offs_c(cond, r, rs, offs) \ EOP_LDR_IMM2(cond, r, rs, offs) +#define emith_read_r_r_r_c(cond, r, rs, rm) \ + EOP_LDR_REG_LSL(cond, r, rs, rm, 0) +#define emith_read_r_r_r(r, rs, rm) \ + EOP_LDR_REG_LSL(A_COND_AL, r, rs, rm, 0) #define emith_read8_r_r_offs_c(cond, r, rs, offs) \ EOP_LDRB_IMM2(cond, r, rs, offs) +#define emith_read8_r_r_r_c(cond, r, rs, rm) \ + EOP_LDRB_REG_LSL(cond, r, rs, rm, 0) +#define emith_read8_r_r_r(r, rs, rm) \ + EOP_LDRB_REG_LSL(A_COND_AL, r, rs, rm, 0) #define emith_read16_r_r_offs_c(cond, r, rs, offs) \ EOP_LDRH_IMM2(cond, r, rs, offs) +#define emith_read16_r_r_r_c(cond, r, rs, rm) \ + EOP_LDRH_REG2(cond, r, rs, rm) 
+#define emith_read16_r_r_r(r, rs, rm) \ + EOP_LDRH_REG2(A_COND_AL, r, rs, rm) #define emith_read_r_r_offs(r, rs, offs) \ emith_read_r_r_offs_c(A_COND_AL, r, rs, offs) @@ -844,11 +863,20 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_sh2_drc_exit() \ EOP_LDMFD_SP(A_R4M|A_R5M|A_R6M|A_R7M|A_R8M|A_R9M|A_R10M|A_R11M|A_R12M|A_R15M) -#define emith_sh2_wcall(a, tab) { \ - emith_lsr(12, a, SH2_WRITE_SHIFT); \ - EOP_LDR_REG_LSL(A_COND_AL,12,tab,12,2); \ - emith_move_r_r(2, CONTEXT_REG); \ - emith_jump_reg(12); \ +// assumes a is in arg0, tab, func and mask are temp +#define emith_sh2_rcall(a, tab, func, mask) { \ + emith_lsr(mask, a, SH2_READ_SHIFT); \ + EOP_ADD_REG_LSL(tab, tab, mask, 3); \ + EOP_LDMIA(tab, (1<> WRT_SHIFT */ \ + EMIT_REX_IF(1, func, tab); \ + EMIT_OP_MODRM64(0x8b, 0, func, 4); \ + EMIT_SIB64(PTR_SCALE, func, tab); /* mov tmp, [tab + tmp * {4,8}] */ \ emith_move_r_r_ptr(arg2_, CONTEXT_REG); \ - emith_jump_reg(NA_TMP_REG); \ + emith_jump_reg(func); \ } #define emith_sh2_dtbf_loop() { \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 1b300cc3b..bfd98e2be 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -328,7 +328,7 @@ struct block_list { static struct block_list **inval_lookup[TCACHE_BUFFERS]; static const int hash_table_sizes[TCACHE_BUFFERS] = { - 0x1000, + 0x4000, 0x100, 0x100, }; @@ -498,12 +498,12 @@ static void (*sh2_drc_dispatcher)(void); static void (*sh2_drc_exit)(void); static void (*sh2_drc_test_irq)(void); -static u32 REGPARM(2) (*sh2_drc_read8)(u32 a, SH2 *sh2); -static u32 REGPARM(2) (*sh2_drc_read16)(u32 a, SH2 *sh2); -static u32 REGPARM(2) (*sh2_drc_read32)(u32 a, SH2 *sh2); +static u32 REGPARM(1) (*sh2_drc_read8)(u32 a); +static u32 REGPARM(1) (*sh2_drc_read16)(u32 a); +static u32 REGPARM(1) (*sh2_drc_read32)(u32 a); static void REGPARM(2) (*sh2_drc_write8)(u32 a, u32 d); static void REGPARM(2) (*sh2_drc_write16)(u32 a, u32 d); -static void REGPARM(3) (*sh2_drc_write32)(u32 a, u32 d, SH2 
*sh2); +static void REGPARM(2) (*sh2_drc_write32)(u32 a, u32 d); // flags for memory access #define MF_SIZEMASK 0x03 // size of access @@ -787,7 +787,7 @@ static void *dr_prepare_ext_branch(u32 pc, int is_slave, int tcache_id) cnt = i + 1; if (cnt >= block_link_pool_max_counts[tcache_id]) { dbg(1, "bl overflow for tcache %d", tcache_id); - return NULL; + return sh2_drc_dispatcher; } bl += cnt; block_link_pool_counts[tcache_id]++; @@ -848,7 +848,7 @@ static void dr_link_blocks(struct block_entry *be, int tcache_id) dbg(1, "warning: " #array " overflow"); \ failcode; \ } else \ - array[count++] = item; \ + array[count++] = item; \ } static int find_in_array(u32 *array, size_t size, u32 what) @@ -1806,7 +1806,7 @@ static int emit_get_rbase_and_offs(SH2 *sh2, u32 a, u32 *offs) hr = rcache_get_tmp(); if (mask < 0x1000) { // can't access data array or BIOS directly from ROM or SDRAM, - // since code may run on both SH2s (if the tcache_id would be known...) + // since code may run on both SH2s (tcache_id of translation block needed)) emith_ctx_read(hr, poffs); if (a & mask & ~omask) emith_add_r_imm(hr, a & mask & ~omask); @@ -1896,8 +1896,6 @@ static void emit_or_t_if_eq(int srr) // rd = @(arg0) static int emit_memhandler_read(int size) { - int arg1; - rcache_clean_tmp(); #ifndef DRC_SR_REG // must writeback cycles for poll detection stuff @@ -1905,8 +1903,6 @@ static int emit_memhandler_read(int size) rcache_evict_vreg(guest_regs[SHR_SR].vreg); #endif - arg1 = rcache_get_tmp_arg(1); - emith_move_r_r_ptr(arg1, CONTEXT_REG); switch (size & MF_SIZEMASK) { case 0: emith_call(sh2_drc_read8); break; // 8 case 1: emith_call(sh2_drc_read16); break; // 16 @@ -1920,16 +1916,12 @@ static int emit_memhandler_read(int size) // @(arg0) = arg1 static void emit_memhandler_write(int size) { - int arg2; - rcache_clean_tmp(); #ifndef DRC_SR_REG if (guest_regs[SHR_SR].vreg != -1) rcache_evict_vreg(guest_regs[SHR_SR].vreg); #endif - arg2 = rcache_get_tmp_arg(2); - emith_move_r_r_ptr(arg2, 
CONTEXT_REG); switch (size & MF_SIZEMASK) { case 0: emith_call(sh2_drc_write8); break; // 8 case 1: emith_call(sh2_drc_write16); break; // 16 @@ -2372,7 +2364,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_unlock_all(); #if (DRC_DEBUG & (8|256|512|1024)) - emit_move_r_imm32(SHR_PC, pc); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); rcache_clean(); @@ -2392,7 +2383,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #ifdef DRC_CMP if (!(op_flags[i] & OF_DELAY_OP)) { - emit_move_r_imm32(SHR_PC, pc); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); rcache_clean(); @@ -3666,16 +3656,69 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) static void sh2_generate_utils(void) { - int arg0, arg1, arg2, sr, tmp; - - sh2_drc_read8 = p32x_sh2_read8; - sh2_drc_read16 = p32x_sh2_read16; - sh2_drc_read32 = p32x_sh2_read32; + int arg0, arg1, arg2, arg3, sr, tmp; host_arg2reg(arg0, 0); host_arg2reg(arg1, 1); host_arg2reg(arg2, 2); + host_arg2reg(arg3, 3); emith_move_r_r(arg0, arg0); // nop + emith_move_r_r(arg1, arg1); // nop + emith_move_r_r(arg2, arg2); // nop + emith_move_r_r(arg3, arg3); // nop + + // sh2_drc_write8(u32 a, u32 d) + sh2_drc_write8 = (void *)tcache_ptr; + emith_ctx_read_ptr(arg2, offsetof(SH2, write8_tab)); + emith_sh2_wcall(arg0, arg1, arg2, arg3); + + // sh2_drc_write16(u32 a, u32 d) + sh2_drc_write16 = (void *)tcache_ptr; + emith_ctx_read_ptr(arg2, offsetof(SH2, write16_tab)); + emith_sh2_wcall(arg0, arg1, arg2, arg3); + + // sh2_drc_write32(u32 a, u32 d) + sh2_drc_write32 = (void *)tcache_ptr; + emith_ctx_read_ptr(arg2, offsetof(SH2, write32_tab)); + emith_sh2_wcall(arg0, arg1, arg2, arg3); + + // d = sh2_drc_read8(u32 a) + sh2_drc_read8 = (void *)tcache_ptr; + emith_ctx_read_ptr(arg1, offsetof(SH2, read8_map)); + emith_sh2_rcall(arg0, arg1, arg2, arg3); + EMITH_SJMP_START(DCOND_CS); + emith_and_r_r_c(DCOND_CC, arg0, arg3); + emith_eor_r_imm_c(DCOND_CC, arg0, 1); + 
emith_read8_r_r_r_c(DCOND_CC, RET_REG, arg0, arg2); + emith_ret_c(DCOND_CC); + EMITH_SJMP_END(DCOND_CS); + emith_move_r_r_ptr(arg1, CONTEXT_REG); + emith_jump_reg(arg2); + + // d = sh2_drc_read16(u32 a) + sh2_drc_read16 = (void *)tcache_ptr; + emith_ctx_read_ptr(arg1, offsetof(SH2, read16_map)); + emith_sh2_rcall(arg0, arg1, arg2, arg3); + EMITH_SJMP_START(DCOND_CS); + emith_and_r_r_c(DCOND_CC, arg0, arg3); + emith_read16_r_r_r_c(DCOND_CC, RET_REG, arg0, arg2); + emith_ret_c(DCOND_CC); + EMITH_SJMP_END(DCOND_CS); + emith_move_r_r_ptr(arg1, CONTEXT_REG); + emith_jump_reg(arg2); + + // d = sh2_drc_read32(u32 a) + sh2_drc_read32 = (void *)tcache_ptr; + emith_ctx_read_ptr(arg1, offsetof(SH2, read32_map)); + emith_sh2_rcall(arg0, arg1, arg2, arg3); + EMITH_SJMP_START(DCOND_CS); + emith_and_r_r_c(DCOND_CC, arg0, arg3); + emith_read_r_r_r_c(DCOND_CC, RET_REG, arg0, arg2); + emith_ror_c(DCOND_CC, RET_REG, RET_REG, 16); + emith_ret_c(DCOND_CC); + EMITH_SJMP_END(DCOND_CS); + emith_move_r_r_ptr(arg1, CONTEXT_REG); + emith_jump_reg(arg2); // sh2_drc_exit(void) sh2_drc_exit = (void *)tcache_ptr; @@ -3766,21 +3809,6 @@ static void sh2_generate_utils(void) emith_call(sh2_drc_test_irq); emith_jump(sh2_drc_dispatcher); - // sh2_drc_write8(u32 a, u32 d) - sh2_drc_write8 = (void *)tcache_ptr; - emith_ctx_read_ptr(arg2, offsetof(SH2, write8_tab)); - emith_sh2_wcall(arg0, arg2); - - // sh2_drc_write16(u32 a, u32 d) - sh2_drc_write16 = (void *)tcache_ptr; - emith_ctx_read_ptr(arg2, offsetof(SH2, write16_tab)); - emith_sh2_wcall(arg0, arg2); - - // sh2_drc_write32(u32 a, u32 d) - sh2_drc_write32 = (void *)tcache_ptr; - emith_ctx_read_ptr(arg2, offsetof(SH2, write32_tab)); - emith_sh2_wcall(arg0, arg2); - #ifdef PDB_NET // debug #define MAKE_READ_WRAPPER(func) { \ @@ -3815,11 +3843,6 @@ static void sh2_generate_utils(void) MAKE_WRITE_WRAPPER(sh2_drc_write8); MAKE_WRITE_WRAPPER(sh2_drc_write16); MAKE_WRITE_WRAPPER(sh2_drc_write32); -#if (DRC_DEBUG & 4) - 
host_dasm_new_symbol(sh2_drc_read8); - host_dasm_new_symbol(sh2_drc_read16); - host_dasm_new_symbol(sh2_drc_read32); -#endif #endif rcache_invalidate(); @@ -3831,6 +3854,9 @@ static void sh2_generate_utils(void) host_dasm_new_symbol(sh2_drc_write8); host_dasm_new_symbol(sh2_drc_write16); host_dasm_new_symbol(sh2_drc_write32); + host_dasm_new_symbol(sh2_drc_read8); + host_dasm_new_symbol(sh2_drc_read16); + host_dasm_new_symbol(sh2_drc_read32); #endif } @@ -3955,14 +3981,15 @@ static void sh2_smc_rm_blocks(u32 a, u16 *drc_ram_blk, int tcache_id, u32 shift, } } -void sh2_drc_wcheck_ram(unsigned int a, int val, int cpuid) +void sh2_drc_wcheck_ram(unsigned int a, int val, SH2 *sh2) { - dbg(2, "%csh2 smc check @%08x", cpuid ? 's' : 'm', a); + dbg(2, "%csh2 smc check @%08x", sh2->is_slave ? 's' : 'm', a); sh2_smc_rm_blocks(a, Pico32xMem->drcblk_ram, 0, SH2_DRCBLK_RAM_SHIFT, 0x3ffff); } -void sh2_drc_wcheck_da(unsigned int a, int val, int cpuid) +void sh2_drc_wcheck_da(unsigned int a, int val, SH2 *sh2) { + int cpuid = sh2->is_slave; dbg(2, "%csh2 smc check @%08x", cpuid ? 
's' : 'm', a); sh2_smc_rm_blocks(a, Pico32xMem->drcblk_da[cpuid], 1 + cpuid, SH2_DRCBLK_DA_SHIFT, 0xfff); @@ -4051,6 +4078,9 @@ void sh2_drc_mem_setup(SH2 *sh2) sh2->p_da = sh2->data_array; sh2->p_sdram = Pico32xMem->sdram; sh2->p_rom = Pico.rom; + // sh2->p_dram filled in dram bank switching + sh2->p_drcblk_da = Pico32xMem->drcblk_da[!!sh2->is_slave]; + sh2->p_drcblk_ram = Pico32xMem->drcblk_ram; } void sh2_drc_frame(void) @@ -4103,6 +4133,7 @@ int sh2_drc_init(SH2 *sh2) // disasm the utils tcache_dsm_ptrs[0] = tcache; do_host_disasm(0); + fflush(stdout); #endif #if (DRC_DEBUG & 1) hash_collisions = 0; diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index b690435ce..6a8596b83 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -1,7 +1,7 @@ int sh2_drc_init(SH2 *sh2); void sh2_drc_finish(SH2 *sh2); -void sh2_drc_wcheck_ram(unsigned int a, int val, int cpuid); -void sh2_drc_wcheck_da(unsigned int a, int val, int cpuid); +void sh2_drc_wcheck_ram(unsigned int a, int val, SH2 *sh2); +void sh2_drc_wcheck_da(unsigned int a, int val, SH2 *sh2); #ifdef DRC_SH2 void sh2_drc_mem_setup(SH2 *sh2); diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index 7faa844b3..a073d43f2 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -36,6 +36,9 @@ typedef struct SH2_ void *p_da; void *p_sdram; void *p_rom; + void *p_dram; + void *p_drcblk_da; + void *p_drcblk_ram; unsigned int pdb_io_csum[2]; #define SH2_STATE_RUN (1 << 0) // to prevent recursion diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 8f2a7c2fa..47329835b 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -1066,41 +1066,41 @@ void PicoWrite16_32x(u32 a, u32 d) } /* quirk: in both normal and overwrite areas only nonzero values go through */ -#define sh2_write8_dramN(n) \ +#define sh2_write8_dramN(p, a, d) \ if ((d & 0xff) != 0) { \ - u8 *dram = (u8 *)Pico32xMem->dram[n]; \ + u8 *dram = (u8 *)p; \ dram[(a & 0x1ffff) ^ 1] = d; \ } static void m68k_write8_dram0_ow(u32 a, u32 d) { - sh2_write8_dramN(0); + 
sh2_write8_dramN(Pico32xMem->dram[0], a, d); } static void m68k_write8_dram1_ow(u32 a, u32 d) { - sh2_write8_dramN(1); + sh2_write8_dramN(Pico32xMem->dram[1], a, d); } -#define sh2_write16_dramN(n) \ - u16 *pd = &Pico32xMem->dram[n][(a & 0x1ffff) / 2]; \ +#define sh2_write16_dramN(p, a, d) \ + u16 *pd = &((u16 *)p)[(a & 0x1ffff) / 2]; \ if (!(a & 0x20000)) { \ *pd = d; \ - return; \ - } \ - /* overwrite */ \ - if (!(d & 0x00ff)) d |= *pd & 0x00ff; \ - if (!(d & 0xff00)) d |= *pd & 0xff00; \ - *pd = d; + } else { \ + u16 v = *pd; /* overwrite */ \ + if (!(d & 0x00ff)) d |= v & 0x00ff; \ + if (!(d & 0xff00)) d |= v & 0xff00; \ + *pd = d; \ + } static void m68k_write16_dram0_ow(u32 a, u32 d) { - sh2_write16_dramN(0); + sh2_write16_dramN(Pico32xMem->dram[0], a, d); } static void m68k_write16_dram1_ow(u32 a, u32 d) { - sh2_write16_dramN(1); + sh2_write16_dramN(Pico32xMem->dram[1], a, d); } // ----------------------------------------------------------------- @@ -1229,14 +1229,14 @@ static void bank_switch_rom_68k(int b) // ----------------------------------------------------------------- // read8 -static u32 sh2_read8_unmapped(u32 a, SH2 *sh2) +static REGPARM(2) u32 sh2_read8_unmapped(u32 a, SH2 *sh2) { elprintf_sh2(sh2, EL_32X, "unmapped r8 [%08x] %02x @%06x", a, 0, sh2_pc(sh2)); return 0; } -static u32 sh2_read8_cs0(u32 a, SH2 *sh2) +static u32 REGPARM(2) sh2_read8_cs0(u32 a, SH2 *sh2) { u32 d = 0; DRC_SAVE_SR(sh2); @@ -1282,27 +1282,28 @@ static u32 sh2_read8_cs0(u32 a, SH2 *sh2) return d; } -static u32 sh2_read8_da(u32 a, SH2 *sh2) +static u32 REGPARM(2) sh2_read8_da(u32 a, SH2 *sh2) { return sh2->data_array[(a & 0xfff) ^ 1]; } // for ssf2 -static u32 sh2_read8_rom(u32 a, SH2 *sh2) +static u32 REGPARM(2) sh2_read8_rom(u32 a, SH2 *sh2) { u32 bank = carthw_ssf2_banks[(a >> 19) & 7] << 19; - return Pico.rom[(bank + (a & 0x7ffff)) ^ 1]; + u8 *p = sh2->p_rom; + return p[(bank + (a & 0x7ffff)) ^ 1]; } // read16 -static u32 sh2_read16_unmapped(u32 a, SH2 *sh2) +static u32 
REGPARM(2) sh2_read16_unmapped(u32 a, SH2 *sh2) { elprintf_sh2(sh2, EL_32X, "unmapped r16 [%08x] %04x @%06x", a, 0, sh2_pc(sh2)); return 0; } -static u32 sh2_read16_cs0(u32 a, SH2 *sh2) +static u32 REGPARM(2) sh2_read16_cs0(u32 a, SH2 *sh2) { u32 d = 0; DRC_SAVE_SR(sh2); @@ -1342,39 +1343,41 @@ static u32 sh2_read16_cs0(u32 a, SH2 *sh2) return d; } -static u32 sh2_read16_da(u32 a, SH2 *sh2) +static u32 REGPARM(2) sh2_read16_da(u32 a, SH2 *sh2) { return ((u16 *)sh2->data_array)[(a & 0xffe) / 2]; } -static u32 sh2_read16_rom(u32 a, SH2 *sh2) +static u32 REGPARM(2) sh2_read16_rom(u32 a, SH2 *sh2) { u32 bank = carthw_ssf2_banks[(a >> 19) & 7] << 19; - return *(u16 *)(Pico.rom + bank + (a & 0x7fffe)); + u16 *p = sh2->p_rom; + return p[(bank + (a & 0x7fffe)) / 2]; } -static u32 sh2_read32_unmapped(u32 a, SH2 *sh2) +static u32 REGPARM(2) sh2_read32_unmapped(u32 a, SH2 *sh2) { elprintf_sh2(sh2, EL_32X, "unmapped r32 [%08x] %08x @%06x", a, 0, sh2_pc(sh2)); return 0; } -static u32 sh2_read32_cs0(u32 a, SH2 *sh2) +static u32 REGPARM(2) sh2_read32_cs0(u32 a, SH2 *sh2) { return (sh2_read16_cs0(a, sh2) << 16) | sh2_read16_cs0(a + 2, sh2); } -static u32 sh2_read32_da(u32 a, SH2 *sh2) +static u32 REGPARM(2) sh2_read32_da(u32 a, SH2 *sh2) { u32 d = *((u32 *)sh2->data_array + (a & 0xffc)/4); return (d << 16) | (d >> 16); } -static u32 sh2_read32_rom(u32 a, SH2 *sh2) +static u32 REGPARM(2) sh2_read32_rom(u32 a, SH2 *sh2) { u32 bank = carthw_ssf2_banks[(a >> 19) & 7] << 19; - u32 d = *(u32 *)(Pico.rom + bank + (a & 0x7fffc)); + u32 *p = sh2->p_rom; + u32 d = p[(bank + (a & 0x7fffc)) / 4]; return (d << 16) | (d >> 16); } @@ -1420,25 +1423,21 @@ static void REGPARM(3) sh2_write8_cs0(u32 a, u32 d, SH2 *sh2) DRC_RESTORE_SR(sh2); } -static void REGPARM(3) sh2_write8_dram0(u32 a, u32 d, SH2 *sh2) -{ - sh2_write8_dramN(0); -} - -static void REGPARM(3) sh2_write8_dram1(u32 a, u32 d, SH2 *sh2) +static void REGPARM(3) sh2_write8_dram(u32 a, u32 d, SH2 *sh2) { - sh2_write8_dramN(1); + 
sh2_write8_dramN(sh2->p_dram, a, d); } static void REGPARM(3) sh2_write8_sdram(u32 a, u32 d, SH2 *sh2) { u32 a1 = a & 0x3ffff; #ifdef DRC_SH2 - int t = Pico32xMem->drcblk_ram[a1 >> SH2_DRCBLK_RAM_SHIFT]; + u16 *p = sh2->p_drcblk_ram; + int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; if (t) - sh2_drc_wcheck_ram(a, t, sh2->is_slave); + sh2_drc_wcheck_ram(a, t, sh2); #endif - Pico32xMem->sdram[a1 ^ 1] = d; + ((u8 *)sh2->p_sdram)[a1 ^ 1] = d; } static void REGPARM(3) sh2_write8_sdram_wt(u32 a, u32 d, SH2 *sh2) @@ -1457,10 +1456,10 @@ static void REGPARM(3) sh2_write8_da(u32 a, u32 d, SH2 *sh2) { u32 a1 = a & 0xfff; #ifdef DRC_SH2 - int id = sh2->is_slave; - int t = Pico32xMem->drcblk_da[id][a1 >> SH2_DRCBLK_DA_SHIFT]; + u16 *p = sh2->p_drcblk_da; + int t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; if (t) - sh2_drc_wcheck_da(a, t, id); + sh2_drc_wcheck_da(a, t, sh2); #endif sh2->data_array[a1 ^ 1] = d; } @@ -1503,42 +1502,38 @@ static void REGPARM(3) sh2_write16_cs0(u32 a, u32 d, SH2 *sh2) DRC_RESTORE_SR(sh2); } -static void REGPARM(3) sh2_write16_dram0(u32 a, u32 d, SH2 *sh2) +static void REGPARM(3) sh2_write16_dram(u32 a, u32 d, SH2 *sh2) { - sh2_write16_dramN(0); -} - -static void REGPARM(3) sh2_write16_dram1(u32 a, u32 d, SH2 *sh2) -{ - sh2_write16_dramN(1); + sh2_write16_dramN(sh2->p_dram, a, d); } static void REGPARM(3) sh2_write16_sdram(u32 a, u32 d, SH2 *sh2) { - u32 a1 = a & 0x3ffff; + u32 a1 = a & 0x3fffe; #ifdef DRC_SH2 - int t = Pico32xMem->drcblk_ram[a1 >> SH2_DRCBLK_RAM_SHIFT]; + u16 *p = sh2->p_drcblk_ram; + int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; if (t) - sh2_drc_wcheck_ram(a, t, sh2->is_slave); + sh2_drc_wcheck_ram(a, t, sh2); #endif - ((u16 *)Pico32xMem->sdram)[a1 / 2] = d; + ((u16 *)sh2->p_sdram)[a1 / 2] = d; } static void REGPARM(3) sh2_write16_da(u32 a, u32 d, SH2 *sh2) { - u32 a1 = a & 0xfff; + u32 a1 = a & 0xffe; #ifdef DRC_SH2 - int id = sh2->is_slave; - int t = Pico32xMem->drcblk_da[id][a1 >> SH2_DRCBLK_DA_SHIFT]; + u16 *p = sh2->p_drcblk_da; + int t = p[a1 >> 
SH2_DRCBLK_DA_SHIFT]; if (t) - sh2_drc_wcheck_da(a, t, id); + sh2_drc_wcheck_da(a, t, sh2); #endif ((u16 *)sh2->data_array)[a1 / 2] = d; } static void REGPARM(3) sh2_write16_rom(u32 a, u32 d, SH2 *sh2) { - u32 a1 = a & 0x3fffff; + u32 a1 = a & 0x3ffffe; // tweak for WWF Raw: does writes to ROM area, and it doesn't work without // allowing this. // Presumably the write goes to the CPU cache and is read back from there, @@ -1562,54 +1557,53 @@ static void REGPARM(3) sh2_write32_cs0(u32 a, u32 d, SH2 *sh2) sh2_write16_cs0(a + 2, d, sh2); } -#define sh2_write32_dramN(n) \ - u32 *pd = (u32 *)&Pico32xMem->dram[n][(a & 0x1ffff) / 2]; \ +#define sh2_write32_dramN(p, a, d) \ + u32 *pd = &((u32 *)p)[(a & 0x1ffff) / 4]; \ if (!(a & 0x20000)) { \ *pd = (d << 16) | (d >> 16); \ - return; \ - } \ - /* overwrite */ \ - u8 *pb = (u8 *)pd; \ - if (d & 0x000000ff) pb[2] = d; \ - if (d & 0x0000ff00) pb[3] = d >> 8; \ - if (d & 0x00ff0000) pb[0] = d >> 16; \ - if (d & 0xff000000) pb[1] = d >> 24; \ - -static void REGPARM(3) sh2_write32_dram0(u32 a, u32 d, SH2 *sh2) -{ - sh2_write32_dramN(0); -} + } else { \ + /* overwrite */ \ + u32 v = *pd, m = 0; d = (d << 16) | (d >> 16) ; \ + if (!(d & 0x000000ff)) m |= 0x000000ff; \ + if (!(d & 0x0000ff00)) m |= 0x0000ff00; \ + if (!(d & 0x00ff0000)) m |= 0x00ff0000; \ + if (!(d & 0xff000000)) m |= 0xff000000; \ + *pd = d | (v&m); \ + } -static void REGPARM(3) sh2_write32_dram1(u32 a, u32 d, SH2 *sh2) +static void REGPARM(3) sh2_write32_dram(u32 a, u32 d, SH2 *sh2) { - sh2_write32_dramN(1); + sh2_write32_dramN(sh2->p_dram, a, d); } static void REGPARM(3) sh2_write32_sdram(u32 a, u32 d, SH2 *sh2) { u32 a1 = a & 0x3fffc; - *(u32 *)(sh2->p_sdram + a1) = (d << 16) | (d >> 16); #ifdef DRC_SH2 - unsigned short *p = &Pico32xMem->drcblk_ram[a1 >> SH2_DRCBLK_RAM_SHIFT]; - if (p[0]) - sh2_drc_wcheck_ram(a, p[0], sh2->is_slave); - if (p[1]) - sh2_drc_wcheck_ram(a+2, p[1], sh2->is_slave); + u16 *p = sh2->p_drcblk_ram; + int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; 
+ if (t) + sh2_drc_wcheck_ram(a, t, sh2); + int u = p[(a1+2) >> SH2_DRCBLK_RAM_SHIFT]; + if (u) + sh2_drc_wcheck_ram(a+2, u, sh2); #endif + *(u32 *)(sh2->p_sdram + a1) = (d << 16) | (d >> 16); } static void REGPARM(3) sh2_write32_da(u32 a, u32 d, SH2 *sh2) { u32 a1 = a & 0xffc; - *((u32 *)sh2->data_array + a1/4) = (d << 16) | (d >> 16); #ifdef DRC_SH2 - int id = sh2->is_slave; - unsigned short *p = &Pico32xMem->drcblk_da[id][a1 >> SH2_DRCBLK_DA_SHIFT]; - if (p[0]) - sh2_drc_wcheck_da(a, p[0], id); - if (p[1]) - sh2_drc_wcheck_da(a+2, p[1], id); + u16 *p = sh2->p_drcblk_da; + int t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; + if (t) + sh2_drc_wcheck_da(a, t, sh2); + int u = p[(a1+2) >> SH2_DRCBLK_DA_SHIFT]; + if (u) + sh2_drc_wcheck_da(a+2, u, sh2); #endif + *((u32 *)sh2->data_array + a1/4) = (d << 16) | (d >> 16); } static void REGPARM(3) sh2_write32_rom(u32 a, u32 d, SH2 *sh2) @@ -1919,9 +1913,7 @@ void Pico32xSwapDRAM(int b) sh2_read16_map[0x04/2].addr = sh2_read16_map[0x24/2].addr = sh2_read32_map[0x04/2].addr = sh2_read32_map[0x24/2].addr = MAP_MEMORY(Pico32xMem->dram[b]); - sh2_write8_map[0x04/2] = sh2_write8_map[0x24/2] = b ? sh2_write8_dram1 : sh2_write8_dram0; - sh2_write16_map[0x04/2] = sh2_write16_map[0x24/2] = b ? sh2_write16_dram1 : sh2_write16_dram0; - sh2_write32_map[0x04/2] = sh2_write32_map[0x24/2] = b ? 
sh2_write32_dram1 : sh2_write32_dram0; + msh2.p_dram = ssh2.p_dram = Pico32xMem->dram[b]; // DRC convenience ptr } static void bank_switch_rom_sh2(void) @@ -2035,10 +2027,14 @@ void PicoMemSetup32x(void) sh2_read32_map[0x02/2].mask = sh2_read32_map[0x22/2].mask = 0x3ffffc; // FIXME sh2_write16_map[0x02/2] = sh2_write16_map[0x22/2] = sh2_write16_rom; sh2_write32_map[0x02/2] = sh2_write32_map[0x22/2] = sh2_write32_rom; - // CS2 - DRAM - done by Pico32xSwapDRAM() + // CS2 - DRAM sh2_read8_map[0x04/2].mask = sh2_read8_map[0x24/2].mask = 0x01ffff; sh2_read16_map[0x04/2].mask = sh2_read16_map[0x24/2].mask = 0x01fffe; sh2_read32_map[0x04/2].mask = sh2_read32_map[0x24/2].mask = 0x01fffc; + sh2_write8_map[0x04/2] = sh2_write8_map[0x24/2] = sh2_write8_dram; + sh2_write16_map[0x04/2] = sh2_write16_map[0x24/2] = sh2_write16_dram; + sh2_write32_map[0x04/2] = sh2_write32_map[0x24/2] = sh2_write32_dram; + // CS3 - SDRAM sh2_read8_map[0x06/2].addr = sh2_read8_map[0x26/2].addr = sh2_read16_map[0x06/2].addr = sh2_read16_map[0x26/2].addr = diff --git a/pico/32x/memory_arm.S b/pico/32x/memory_arm.S new file mode 100644 index 000000000..90c86ddf6 --- /dev/null +++ b/pico/32x/memory_arm.S @@ -0,0 +1,305 @@ +/* + * PicoDrive 32X memory access functions, assembler version + * (C) KUB, 2018 + * + * This work is licensed under the terms of MAME license. + * See COPYING file in the top-level directory. + */ + +#include "../pico_int_o32.h" + +@ 32X bank sizes...
TODO this should somehow come from an include file +.equ SH2_ROM_SHIFT, 10 @ 0x003fffff +.equ SH2_RAM_SHIFT, 14 @ 0x0003ffff +.equ SH2_DRAM_SHIFT,15 @ 0x0001ffff +.equ SH2_DA_SHIFT, 20 @ 0x00000fff + +.equ SH2_DRAM_OW, 1<<(32-SH2_DRAM_SHIFT) @ DRAM overwrite mode bit + +.text + +@ u32 a +.global sh2_read8_rom +.global sh2_read8_sdram +.global sh2_read8_da +.global sh2_read8_dram +.global sh2_read16_rom +.global sh2_read16_sdram +.global sh2_read16_da +.global sh2_read16_dram +.global sh2_read32_rom +.global sh2_read32_sdram +.global sh2_read32_da +.global sh2_read32_dram + +@ u32 a, u32 d +.global sh2_write8_sdram +.global sh2_write8_da +.global sh2_write8_dram +.global sh2_write16_sdram +.global sh2_write16_da +.global sh2_write16_dram +.global sh2_write32_sdram +.global sh2_write32_da +.global sh2_write32_dram + +sh2_read8_rom: + ldr ip, [r1, #OFS_SH2_p_rom] + eor r0, r0, #1 + lsl r0, #SH2_ROM_SHIFT + ldrb r0, [ip, r0, lsr #SH2_ROM_SHIFT] + bx lr + +sh2_read8_sdram: + ldr ip, [r1, #OFS_SH2_p_sdram] + eor r0, r0, #1 + lsl r0, #SH2_RAM_SHIFT + ldrb r0, [ip, r0, lsr #SH2_RAM_SHIFT] + bx lr + +sh2_read8_da: + ldr ip, [r1, #OFS_SH2_p_da] + eor r0, r0, #1 + lsl r0, #SH2_DA_SHIFT + ldrb r0, [ip, r0, lsr #SH2_DA_SHIFT] + bx lr + +sh2_read8_dram: + ldr ip, [r1, #OFS_SH2_p_dram] + eor r0, r0, #1 + lsl r0, #SH2_DRAM_SHIFT + ldrb r0, [ip, r0, lsr #SH2_DRAM_SHIFT] + bx lr + +sh2_read16_rom: + ldr ip, [r1, #OFS_SH2_p_rom] + lsl r0, #SH2_ROM_SHIFT + lsr r0, #SH2_ROM_SHIFT + ldrh r0, [ip, r0] + bx lr + +sh2_read16_sdram: + ldr ip, [r1, #OFS_SH2_p_sdram] + lsl r0, #SH2_RAM_SHIFT + lsr r0, #SH2_RAM_SHIFT + ldrh r0, [ip, r0] + bx lr + +sh2_read16_da: + ldr ip, [r1, #OFS_SH2_p_da] + lsl r0, #SH2_DA_SHIFT + lsr r0, #SH2_DA_SHIFT + ldrh r0, [ip, r0] + bx lr + +sh2_read16_dram: + ldr ip, [r1, #OFS_SH2_p_dram] + lsl r0, #SH2_DRAM_SHIFT + lsr r0, #SH2_DRAM_SHIFT + ldrh r0, [ip, r0] + bx lr + +sh2_read32_rom: + ldr ip, [r1, #OFS_SH2_p_rom] + lsl r0, #SH2_ROM_SHIFT + ldr r0, [ip, r0, lsr 
#SH2_ROM_SHIFT] + ror r0, r0, #16 + bx lr + +sh2_read32_sdram: + ldr ip, [r1, #OFS_SH2_p_sdram] + lsl r0, #SH2_RAM_SHIFT + ldr r0, [ip, r0, lsr #SH2_RAM_SHIFT] + ror r0, r0, #16 + bx lr + +sh2_read32_da: + ldr ip, [r1, #OFS_SH2_p_da] + lsl r0, #SH2_DA_SHIFT + ldr r0, [ip, r0, lsr #SH2_DA_SHIFT] + ror r0, r0, #16 + bx lr + +sh2_read32_dram: + ldr ip, [r1, #OFS_SH2_p_dram] + lsl r0, #SH2_DRAM_SHIFT + ldr r0, [ip, r0, lsr #SH2_DRAM_SHIFT] + ror r0, r0, #16 + bx lr + +sh2_write8_sdram: + @ preserve r0 and r2 for tail call + ldr ip, [r2, #OFS_SH2_p_sdram] + eor r3, r0, #1 + lsl r3, #SH2_RAM_SHIFT + strb r1, [ip, r3, lsr #SH2_RAM_SHIFT] +#ifdef DRC_SH2 + ldr ip, [r2, #OFS_SH2_p_drcblk_ram] + ldrb r1, [ip, r3, lsr #SH2_RAM_SHIFT+1] + bic r0, r0, #1 + cmp r1, #0 + bxeq lr + b sh2_drc_wcheck_ram +#else + bx lr +#endif + +sh2_write8_da: + @ preserve r0 and r2 for tail call + ldr ip, [r2, #OFS_SH2_p_da] + eor r3, r0, #1 + lsl r3, #SH2_DA_SHIFT + strb r1, [ip, r3, lsr #SH2_DA_SHIFT] +#ifdef DRC_SH2 + ldr ip, [r2, #OFS_SH2_p_drcblk_da] + ldrb r1, [ip, r3, lsr #SH2_DA_SHIFT+1] + bic r0, r0, #1 + cmp r1, #0 + bxeq lr + b sh2_drc_wcheck_da +#else + bx lr +#endif + +sh2_write8_dram: + tst r1, #0xff + ldrne ip, [r2, #OFS_SH2_p_dram] + eorne r3, r0, #1 + lslne r3, #SH2_DRAM_SHIFT + strneb r1, [ip, r3, lsr #SH2_DRAM_SHIFT] + bx lr + +sh2_write16_sdram: + @ preserve r0 and r2 for tail call + ldr ip, [r2, #OFS_SH2_p_sdram] + lsl r3, r0, #SH2_RAM_SHIFT + lsr r3, r3, #SH2_RAM_SHIFT + strh r1, [ip, r3] +#ifdef DRC_SH2 + ldr ip, [r2, #OFS_SH2_p_drcblk_ram] + ldrb r1, [ip, r3, lsr #1] + cmp r1, #0 + bxeq lr + b sh2_drc_wcheck_ram +#else + bx lr +#endif + +sh2_write16_da: + @ preserve r0 and r2 for tail call + ldr ip, [r2, #OFS_SH2_p_da] + lsl r3, r0, #SH2_DA_SHIFT + lsr r3, r3, #SH2_DA_SHIFT + strh r1, [ip, r3] +#ifdef DRC_SH2 + ldr ip, [r2, #OFS_SH2_p_drcblk_da] + ldrb r1, [ip, r3, lsr #1] + cmp r1, #0 + bxeq lr + b sh2_drc_wcheck_da +#else + bx lr +#endif + +sh2_write16_dram: + ldr ip, 
[r2, #OFS_SH2_p_dram] + tst r0, #SH2_DRAM_OW + lsl r3, r0, #SH2_DRAM_SHIFT + lsr r3, r3, #SH2_DRAM_SHIFT + streqh r1, [ip, r3] + bxeq lr + add ip, ip, r3 + tst r1, #0xff + strneb r1, [ip, #0] + tst r1, #0xff00 + lsrne r1, r1, #8 + strneb r1, [ip, #1] + bx lr + +sh2_write32_sdram: + @ preserve r0 and r2 for tail call + ldr ip, [r2, #OFS_SH2_p_sdram] + ror r1, r1, #16 + lsl r3, r0, #SH2_RAM_SHIFT + str r1, [ip, r3, lsr #SH2_RAM_SHIFT] +#ifdef DRC_SH2 + ldr ip, [r2, #OFS_SH2_p_drcblk_ram] + ldrb r1, [ip, r3, lsr #SH2_RAM_SHIFT+1]! + cmp r1, #0 + beq 1f + stmfd sp!, {r0, r1, r2, ip} + bl sh2_drc_wcheck_ram + ldmfd sp!, {r0, r1, r2, ip} +1: ldrb r1, [ip, #1] + cmp r1, #0 + bxeq lr + add r0, r0, #2 + b sh2_drc_wcheck_ram +#else + bx lr +#endif + +sh2_write32_da: + @ preserve r0 and r2 for tail call + ldr ip, [r2, #OFS_SH2_p_da] + ror r1, r1, #16 + lsl r3, r0, #SH2_DA_SHIFT + str r1, [ip, r3, lsr #SH2_DA_SHIFT] +#ifdef DRC_SH2 + ldr ip, [r2, #OFS_SH2_p_drcblk_da] + ldrb r1, [ip, r3, lsr #SH2_DA_SHIFT+1]! 
+ cmp r1, #0 + beq 1f + stmfd sp!, {r0, r1, r2, ip} + bl sh2_drc_wcheck_da + ldmfd sp!, {r0, r1, r2, ip} +1: ldrb r1, [ip, #1] + cmp r1, #0 + bxeq lr + add r0, r0, #2 + b sh2_drc_wcheck_da +#else + bx lr +#endif + +sh2_write32_dram: + ldr ip, [r2, #OFS_SH2_p_dram] + tst r0, #SH2_DRAM_OW + lsl r3, r0, #SH2_DRAM_SHIFT + roreq r1, r1, #16 + streq r1, [ip, r3, lsr #SH2_DRAM_SHIFT] + bxeq lr +#if 1 + ldr r0, [ip, r3, lsr #SH2_DRAM_SHIFT] + ror r1, r1, #16 + mov r2, #0 + tst r1, #0x00ff0000 + orrne r2, r2, #0x00ff0000 + tst r1, #0xff000000 + orrne r2, r2, #0xff000000 + tst r1, #0x000000ff + orrne r2, r2, #0x000000ff + tst r1, #0x0000ff00 + orrne r2, r2, #0x0000ff00 + bic r0, r0, r2 + orr r0, r0, r1 + str r0, [ip, r3, lsr #SH2_DRAM_SHIFT] +#else + add ip, ip, r3, lsr #SH2_DRAM_SHIFT + tst r1, #0x00ff0000 + lsrne r3, r1, #16 + strneb r3, [ip, #0] + tst r1, #0xff000000 + lsrne r3, r1, #24 + strneb r3, [ip, #1] + tst r1, #0x000000ff + strneb r1, [ip, #2] + tst r1, #0x0000ff00 + lsrne r3, r1, #8 + strneb r3, [ip, #3] +#endif + bx lr + +.pool + +@ vim:filetype=armasm diff --git a/pico/32x/sh2soc.c b/pico/32x/sh2soc.c index f8e657f5e..4aae2a045 100644 --- a/pico/32x/sh2soc.c +++ b/pico/32x/sh2soc.c @@ -229,7 +229,7 @@ void sh2_peripheral_reset(SH2 *sh2) // SH2 internal peripheral memhandlers // we keep them in little endian format -u32 sh2_peripheral_read8(u32 a, SH2 *sh2) +u32 REGPARM(2) sh2_peripheral_read8(u32 a, SH2 *sh2) { u8 *r = (void *)sh2->peri_regs; u32 d; @@ -242,7 +242,7 @@ u32 sh2_peripheral_read8(u32 a, SH2 *sh2) return d; } -u32 sh2_peripheral_read16(u32 a, SH2 *sh2) +u32 REGPARM(2) sh2_peripheral_read16(u32 a, SH2 *sh2) { u16 *r = (void *)sh2->peri_regs; u32 d; @@ -255,7 +255,7 @@ u32 sh2_peripheral_read16(u32 a, SH2 *sh2) return d; } -u32 sh2_peripheral_read32(u32 a, SH2 *sh2) +u32 REGPARM(2) sh2_peripheral_read32(u32 a, SH2 *sh2) { u32 d; a &= 0x1fc; diff --git a/pico/pico_int.h b/pico/pico_int.h index 4d599ce8e..497649b6d 100644 --- a/pico/pico_int.h +++ 
b/pico/pico_int.h @@ -958,9 +958,9 @@ void p32x_dreq1_trigger(void); void p32x_timers_recalc(void); void p32x_timers_do(unsigned int m68k_slice); void sh2_peripheral_reset(SH2 *sh2); -unsigned int sh2_peripheral_read8(unsigned int a, SH2 *sh2); -unsigned int sh2_peripheral_read16(unsigned int a, SH2 *sh2); -unsigned int sh2_peripheral_read32(unsigned int a, SH2 *sh2); +unsigned int REGPARM(2) sh2_peripheral_read8(unsigned int a, SH2 *sh2); +unsigned int REGPARM(2) sh2_peripheral_read16(unsigned int a, SH2 *sh2); +unsigned int REGPARM(2) sh2_peripheral_read32(unsigned int a, SH2 *sh2); void REGPARM(3) sh2_peripheral_write8(unsigned int a, unsigned int d, SH2 *sh2); void REGPARM(3) sh2_peripheral_write16(unsigned int a, unsigned int d, SH2 *sh2); void REGPARM(3) sh2_peripheral_write32(unsigned int a, unsigned int d, SH2 *sh2); diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index 90e658677..13e554955 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ -87,3 +87,12 @@ get_define OFS_EST_ PicoEState HighPal ; echo "$line" >>$fn get_define OFS_PMEM_ PicoMem vram ; echo "$line" >>$fn get_define OFS_PMEM_ PicoMem vsram ; echo "$line" >>$fn + +get_define OFS_SH2_ SH2_ is_slave ; echo "$line" >>$fn +get_define OFS_SH2_ SH2_ p_bios ; echo "$line" >>$fn +get_define OFS_SH2_ SH2_ p_da ; echo "$line" >>$fn +get_define OFS_SH2_ SH2_ p_sdram ; echo "$line" >>$fn +get_define OFS_SH2_ SH2_ p_rom ; echo "$line" >>$fn +get_define OFS_SH2_ SH2_ p_dram ; echo "$line" >>$fn +get_define OFS_SH2_ SH2_ p_drcblk_da ; echo "$line" >>$fn +get_define OFS_SH2_ SH2_ p_drcblk_ram ; echo "$line" >>$fn From 65072b81811c45ca5499f1efed7f473dc5e44636 Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 4 Apr 2019 20:29:39 +0200 Subject: [PATCH 026/174] added branch cache to sh2 drc to improve cross-tcache jump speed --- cpu/drc/emit_arm.c | 15 +++++- cpu/drc/emit_x86.c | 29 +++++++++++- cpu/sh2/compiler.c | 112 +++++++++++++++++++++++++++++++++++++++------ cpu/sh2/sh2.h | 3 ++ 4 files changed, 
143 insertions(+), 16 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index c255a8b82..3f782bb60 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -179,6 +179,7 @@ /* ldr and str */ #define EOP_LDR_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,1,0,1,rn,rd,offset_12) #define EOP_LDRB_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,1,1,1,rn,rd,offset_12) +#define EOP_STR_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,(offset_12) >= 0,0,0,rn,rd,abs(offset_12)) #define EOP_LDR_IMM( rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,1,0,1,rn,rd,offset_12) #define EOP_LDR_NEGIMM(rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,0,0,1,rn,rd,offset_12) @@ -478,6 +479,9 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_add_r_r(d, s) \ emith_add_r_r_r(d, d, s) +#define emith_add_r_r_ptr(d, s) \ + emith_add_r_r_r(d, d, s) + #define emith_sub_r_r(d, s) \ EOP_SUB_REG(A_COND_AL,0,d,d,s,A_AM1_LSL,0) @@ -684,6 +688,8 @@ static int emith_xbranch(int cond, void *target, int is_call) // misc #define emith_read_r_r_offs_c(cond, r, rs, offs) \ EOP_LDR_IMM2(cond, r, rs, offs) +#define emith_read_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_read_r_r_offs_c(cond, r, rs, offs) #define emith_read_r_r_r_c(cond, r, rs, rm) \ EOP_LDR_REG_LSL(cond, r, rs, rm, 0) #define emith_read_r_r_r(r, rs, rm) \ @@ -716,8 +722,15 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_read16_r_r_offs(r, rs, offs) \ emith_read16_r_r_offs_c(A_COND_AL, r, rs, offs) +#define emith_write_r_r_offs_c(cond, r, rs, offs) \ + EOP_STR_IMM2(cond, r, rs, offs) +#define emith_write_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_write_r_r_offs_c(cond, r, rs, offs) + +#define emith_ctx_read_c(cond, r, offs) \ + emith_read_r_r_offs_c(cond, r, CONTEXT_REG, offs) #define emith_ctx_read(r, offs) \ - emith_read_r_r_offs(r, CONTEXT_REG, offs) + emith_ctx_read_c(A_COND_AL, r, offs) #define emith_ctx_read_ptr(r, offs) \ emith_ctx_read(r, offs) diff --git a/cpu/drc/emit_x86.c 
b/cpu/drc/emit_x86.c index 816e92949..58476a943 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -122,7 +122,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; EMIT_OP_MODRM(0x01, 3, s, d) #define emith_add_r_r_ptr(d, s) do { \ - EMIT_REX_IF(1, dst, src); \ + EMIT_REX_IF(1, s, d); \ EMIT_OP_MODRM64(0x01, 3, s, d); \ } while (0) @@ -260,6 +260,21 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; } \ } while (0) +// _r_r_r_shift +#define emith_add_r_r_r_lsl(d, s1, s2, lslimm) do { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsl(tmp_, s2, lslimm); \ + emith_add_r_r_r(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ +} while (0) + +#define emith_add_r_r_r_lsr(d, s1, s2, lslimm) do { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsr(tmp_, s2, lslimm); \ + emith_add_r_r_r(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ +} while (0) + // _r_r_shift #define emith_or_r_r_lsl(d, s, lslimm) do { \ int tmp_ = rcache_get_tmp(); \ @@ -361,8 +376,12 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_read_r_r_offs_c(cond, r, rs, offs) \ emith_read_r_r_offs(r, rs, offs) +#define emith_read_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_read_r_r_offs_ptr(r, rs, offs) #define emith_write_r_r_offs_c(cond, r, rs, offs) \ emith_write_r_r_offs(r, rs, offs) +#define emith_write_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_write_r_r_offs_ptr(r, rs, offs) #define emith_read8_r_r_offs_c(cond, r, rs, offs) \ emith_read8_r_r_offs(r, rs, offs) #define emith_write8_r_r_offs_c(cond, r, rs, offs) \ @@ -583,9 +602,15 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_read_r_r_offs(r, rs, offs) \ emith_deref_op(0x8b, r, rs, offs) +#define emith_read_r_r_offs_ptr(r, rs, offs) \ + EMIT_REX_IF(1, r, rs); \ + emith_deref_op(0x8b, r, rs, offs) #define emith_write_r_r_offs(r, rs, offs) \ emith_deref_op(0x89, r, rs, offs) +#define emith_write_r_r_offs_ptr(r, rs, offs) \ + EMIT_REX_IF(1, r, rs); \ + emith_deref_op(0x89, r, rs, offs) // note: don't use prefixes on 
this #define emith_read8_r_r_offs(r, rs, offs) do { \ @@ -664,6 +689,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_ctx_read(r, offs) \ emith_read_r_r_offs(r, CONTEXT_REG, offs) +#define emith_ctx_read_c(cond, r, offs) \ + emith_ctx_read(r, offs) #define emith_ctx_read_ptr(r, offs) do { \ EMIT_REX_IF(1, r, CONTEXT_REG); \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index bfd98e2be..d54d204ea 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -38,6 +38,7 @@ // features #define PROPAGATE_CONSTANTS 1 #define LINK_BRANCHES 1 +#define BRANCH_CACHE 1 #define ALIAS_REGISTERS 1 #define REMAP_REGISTER 1 @@ -57,10 +58,11 @@ // 10 - smc self-check // 100 - write trace // 200 - compare trace -// 400 - print block entry backtrace +// 400 - block entry backtraceA on exit +// 800 - state dump on exit // { #ifndef DRC_DEBUG -#define DRC_DEBUG 0 +#define DRC_DEBUG 0x800 #endif #if DRC_DEBUG @@ -159,8 +161,6 @@ static char sh2dasm_buff[64]; #define do_host_disasm(x) #endif -#if (DRC_DEBUG & (8|256|512|1024)) || defined(PDB) - #define SH2_DUMP(sh2, reason) { \ char ms = (sh2)->is_slave ? 
's' : 'm'; \ printf("%csh2 %s %08x\n", ms, reason, (sh2)->pc); \ @@ -178,6 +178,8 @@ static char sh2dasm_buff[64]; (sh2)->pdb_io_csum[0], (sh2)->pdb_io_csum[1], (sh2)->state, \ (sh2)->poll_addr, (sh2)->poll_cycles, (sh2)->poll_cnt); \ } + +#if (DRC_DEBUG & (8|256|512|1024)) || defined(PDB) static SH2 csh2[2][4]; static void REGPARM(3) *sh2_drc_log_entry(void *block, SH2 *sh2, u32 sr) { @@ -631,6 +633,14 @@ static void REGPARM(1) flush_tcache(int tcid) memset(Pico32xMem->drcblk_da[tcid - 1], 0, sizeof(Pico32xMem->drcblk_da[0])); } +#if BRANCH_CACHE + if (tcid) + memset32(sh2s[tcid-1].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + else { + memset32(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4); + } +#endif #if (DRC_DEBUG & 4) tcache_dsm_ptrs[tcid] = tcache_bases[tcid]; #endif @@ -3727,14 +3737,35 @@ static void sh2_generate_utils(void) // sh2_drc_dispatcher(void) sh2_drc_dispatcher = (void *)tcache_ptr; - sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); - emith_cmp_r_imm(sr, 0); - emith_jump_cond(DCOND_LT, sh2_drc_exit); - rcache_invalidate(); emith_ctx_read(arg0, SHR_PC * 4); +#if BRANCH_CACHE + // check if PC is in branch target cache + emith_and_r_r_imm(arg1, arg0, (ARRAY_SIZE(sh2s->branch_cache)-1)*4); + // TODO implement emith_add_r_r_r_lsl_ptr, saves one insn on 32bit ARM + emith_lsl(arg1, arg1, sizeof(void *) == 8 ? 
2 : 1); + emith_add_r_r_ptr(arg1, CONTEXT_REG); + emith_read_r_r_offs(arg2, arg1, offsetof(SH2, branch_cache)); + emith_cmp_r_r(arg2, arg0); + EMITH_SJMP_START(DCOND_NE); + emith_read_r_r_offs_ptr_c(DCOND_EQ, RET_REG, arg1, offsetof(SH2, branch_cache) + sizeof(void *)); + emith_jump_reg_c(DCOND_EQ, RET_REG); + EMITH_SJMP_END(DCOND_NE); +#endif emith_ctx_read(arg1, offsetof(SH2, is_slave)); emith_add_r_r_ptr_imm(arg2, CONTEXT_REG, offsetof(SH2, drc_tmp)); emith_call(dr_lookup_block); +#if BRANCH_CACHE + // store PC and block entry ptr (in arg0) in branch target cache + emith_tst_r_r_ptr(RET_REG, RET_REG); + EMITH_SJMP_START(DCOND_EQ); + emith_ctx_read_c(DCOND_NE, arg2, SHR_PC * 4); + emith_and_r_r_imm(arg1, arg2, (ARRAY_SIZE(sh2s->branch_cache)-1)*4); + emith_lsl(arg1, arg1, sizeof(void *) == 8 ? 2 : 1); + emith_add_r_r_ptr(arg1, CONTEXT_REG); + emith_write_r_r_offs_c(DCOND_NE, arg2, arg1, offsetof(SH2, branch_cache)); + emith_write_r_r_offs_ptr_c(DCOND_NE, RET_REG, arg1, offsetof(SH2, branch_cache) + sizeof(void *)); + EMITH_SJMP_END(DCOND_EQ); +#endif emit_block_entry(); // lookup failed, call sh2_translate() emith_move_r_r_ptr(arg0, CONTEXT_REG); @@ -3904,6 +3935,15 @@ static void sh2_smc_rm_block(struct block_desc *bd, int tcache_id, u32 ram_mask) bd->addr = bd->size = bd->size_nolit = 0; bd->entry_count = 0; + +#if BRANCH_CACHE + if (tcache_id) + memset32(sh2s[tcache_id-1].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + else { + memset32(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4); + } +#endif } /* @@ -4015,9 +4055,9 @@ int sh2_execute_drc(SH2 *sh2c, int cycles) return ret_cycles; } -#if (DRC_DEBUG & 2) -void block_stats(void) +static void block_stats(void) { +#if (DRC_DEBUG & 2) int c, b, i, total = 0; printf("block stats:\n"); @@ -4048,12 +4088,10 @@ void block_stats(void) for (b = 0; b < ARRAY_SIZE(block_tables); b++) for (i = 0; i < block_counts[b]; i++) 
block_tables[b][i].refcount = 0; -} -#else -#define block_stats() #endif +} -void sh2_drc_flush_all(void) +static void backtrace(void) { #if (DRC_DEBUG & 1024) int i; @@ -4064,6 +4102,52 @@ void sh2_drc_flush_all(void) for (i = 0; i < ARRAY_SIZE(csh2[1]); i++) SH2_DUMP(&csh2[1][i], "bt ssh2"); #endif +} + +static void state_dump(void) +{ +#if (DRC_DEBUG & 2048) + int i; + + SH2_DUMP(&sh2s[0], "master"); + printf("VBR msh2: %x\n", sh2s[0].vbr); + for (i = 0; i < 0x60; i++) { + printf("%08x ",p32x_sh2_read32(sh2s[0].vbr + i*4, &sh2s[0])); + if ((i+1) % 8 == 0) printf("\n"); + } + printf("stack msh2: %x\n", sh2s[0].r[15]); + for (i = -0x30; i < 0x30; i++) { + printf("%08x ",p32x_sh2_read32(sh2s[0].r[15] + i*4, &sh2s[0])); + if ((i+1) % 8 == 0) printf("\n"); + } + printf("branch cache master:\n"); + for (i = 0; i < ARRAY_SIZE(sh2s[0].branch_cache); i++) { + printf("%08x ",sh2s[0].branch_cache[i].pc); + if ((i+1) % 8 == 0) printf("\n"); + } + SH2_DUMP(&sh2s[1], "slave"); + printf("VBR ssh2: %x\n", sh2s[1].vbr); + for (i = 0; i < 0x60; i++) { + printf("%08x ",p32x_sh2_read32(sh2s[1].vbr + i*4, &sh2s[1])); + if ((i+1) % 8 == 0) printf("\n"); + } + printf("stack ssh2: %x\n", sh2s[1].r[15]); + for (i = -0x30; i < 0x30; i++) { + printf("%08x ",p32x_sh2_read32(sh2s[1].r[15] + i*4, &sh2s[1])); + if ((i+1) % 8 == 0) printf("\n"); + } + printf("branch cache slave:\n"); + for (i = 0; i < ARRAY_SIZE(sh2s[1].branch_cache); i++) { + printf("%08x ",sh2s[1].branch_cache[i].pc); + if ((i+1) % 8 == 0) printf("\n"); + } +#endif +} + +void sh2_drc_flush_all(void) +{ + backtrace(); + state_dump(); block_stats(); flush_tcache(0); flush_tcache(1); diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index a073d43f2..e53bbf057 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -50,6 +50,9 @@ typedef struct SH2_ int poll_cycles; int poll_cnt; + // DRC branch cache. 
size must be 2^n and <=128 + struct { unsigned int pc; void *code; } branch_cache[128]; + // interpreter stuff int icount; // cycles left in current timeslice unsigned int ea; From 48fdcb03901676ecd1a674934898654b9a05834f Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 5 Apr 2019 21:01:03 +0200 Subject: [PATCH 027/174] overhaul of translation cache and sh2 literals handling --- cpu/sh2/compiler.c | 855 +++++++++++++++++++++++++++------------------ cpu/sh2/compiler.h | 12 +- pico/32x/memory.c | 12 +- pico/pico_int.h | 6 +- 4 files changed, 523 insertions(+), 362 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index d54d204ea..e6ce34745 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -62,7 +62,7 @@ // 800 - state dump on exit // { #ifndef DRC_DEBUG -#define DRC_DEBUG 0x800 +#define DRC_DEBUG 0 #endif #if DRC_DEBUG @@ -149,8 +149,6 @@ enum op_types { #ifdef DRC_SH2 -static int literal_disabled_frames; - #if (DRC_DEBUG & 4) static u8 *tcache_dsm_ptrs[3]; static char sh2dasm_buff[64]; @@ -261,6 +259,7 @@ static const int tcache_sizes[TCACHE_BUFFERS] = { static u8 *tcache_bases[TCACHE_BUFFERS]; static u8 *tcache_ptrs[TCACHE_BUFFERS]; +static u8 *tcache_limit[TCACHE_BUFFERS]; // ptr for code emiters static u8 *tcache_ptr; @@ -270,14 +269,21 @@ static u8 *tcache_ptr; struct block_link { u32 target_pc; void *jump; // insn address - struct block_link *next; // either in block_entry->links or + struct block_link *next; // either in block_entry->links or unresolved + struct block_link *o_next; // ...in block_entry->o_links + struct block_link *prev; + struct block_link *o_prev; + struct block_entry *target;// target block this is linked in (be->links) + int tcache_id; }; struct block_entry { u32 pc; - void *tcache_ptr; // translated block for above PC - struct block_entry *next; // next block in hash_table with same pc hash - struct block_link *links; // links to this entry + u8 *tcache_ptr; // translated block for above PC + struct block_entry 
*next; // chain in hash_table with same pc hash + struct block_entry *prev; + struct block_link *links; // incoming links to this entry + struct block_link *o_links;// outgoing links from this entry #if (DRC_DEBUG & 2) struct block_desc *block; #endif @@ -285,8 +291,12 @@ struct block_entry { struct block_desc { u32 addr; // block start SH2 PC address - u16 size; // ..of recompiled insns+lit. pool - u16 size_nolit; // same without literals + u32 addr_lit; // block start SH2 literal pool addr + int size; // ..of recompiled insns + int size_lit; // ..of (insns+)literal pool + u8 *tcache_ptr; // start address of block in cache + u16 active; // actively used or deactivated? + struct block_list *list; #if (DRC_DEBUG & 2) int refcount; #endif @@ -301,6 +311,7 @@ static const int block_max_counts[TCACHE_BUFFERS] = { }; static struct block_desc *block_tables[TCACHE_BUFFERS]; static int block_counts[TCACHE_BUFFERS]; +static int block_limit[TCACHE_BUFFERS]; // we have block_link_pool to avoid using mallocs static const int block_link_pool_max_counts[TCACHE_BUFFERS] = { @@ -310,7 +321,8 @@ static const int block_link_pool_max_counts[TCACHE_BUFFERS] = { }; static struct block_link *block_link_pool[TCACHE_BUFFERS]; static int block_link_pool_counts[TCACHE_BUFFERS]; -static struct block_link *unresolved_links[TCACHE_BUFFERS]; +static struct block_link **unresolved_links[TCACHE_BUFFERS]; +static struct block_link *blink_free[TCACHE_BUFFERS]; // used for invalidation static const int ram_sizes[TCACHE_BUFFERS] = { @@ -323,7 +335,11 @@ static const int ram_sizes[TCACHE_BUFFERS] = { struct block_list { struct block_desc *block; struct block_list *next; + struct block_list *prev; + struct block_list **head; + struct block_list *l_next; }; +struct block_list *blist_free; // array of pointers to block_lists for RAM and 2 data arrays // each array has len: sizeof(mem) / INVAL_PAGE_SIZE @@ -573,41 +589,59 @@ static struct block_entry *dr_get_entry(u32 pc, int is_slave, int *tcache_id) // 
block management static void add_to_block_list(struct block_list **blist, struct block_desc *block) { - struct block_list *added = malloc(sizeof(*added)); + struct block_list *added; + + if (blist_free) { + added = blist_free; + blist_free = added->next; + } else { + added = malloc(sizeof(*added)); + } if (!added) { elprintf(EL_ANOMALY, "drc OOM (1)"); return; } added->block = block; + added->l_next = block->list; + block->list = added; + added->head = blist; + + added->prev = NULL; + if (*blist) + (*blist)->prev = added; added->next = *blist; *blist = added; } -static void rm_from_block_list(struct block_list **blist, struct block_desc *block) +static void rm_from_block_lists(struct block_desc *block) { - struct block_list *prev = NULL, *current = *blist; - for (; current != NULL; current = current->next) { - if (current->block == block) { - if (prev == NULL) - *blist = current->next; - else - prev->next = current->next; - free(current); - return; - } - prev = current; + struct block_list *entry; + + entry = block->list; + while (entry != NULL) { + if (entry->prev != NULL) + entry->prev->next = entry->next; + else + *(entry->head) = entry->next; + if (entry->next != NULL) + entry->next->prev = entry->prev; + + entry->next = blist_free; + blist_free = entry; + + entry = entry->l_next; } - dbg(1, "can't rm block %p (%08x-%08x)", - block, block->addr, block->addr + block->size); + block->list = NULL; } static void rm_block_list(struct block_list **blist) { - struct block_list *tmp, *current = *blist; + struct block_list *next, *current = *blist; while (current != NULL) { - tmp = current; - current = current->next; - free(tmp); + next = current->next; + current->next = blist_free; + blist_free = current; + current = next; } *blist = NULL; } @@ -615,32 +649,37 @@ static void rm_block_list(struct block_list **blist) static void REGPARM(1) flush_tcache(int tcid) { int i; +#if (DRC_DEBUG & 1) + int tc_used, bl_used; - dbg(1, "tcache #%d flush! 
(%d/%d, bds %d/%d)", tcid, - tcache_ptrs[tcid] - tcache_bases[tcid], tcache_sizes[tcid], - block_counts[tcid], block_max_counts[tcid]); + tc_used = tcache_sizes[tcid] - (tcache_limit[tcid] - tcache_ptrs[tcid]); + bl_used = block_max_counts[tcid] - (block_limit[tcid] - block_counts[tcid]); + elprintf(EL_STATUS, "tcache #%d flush! (%d/%d, bds %d/%d)", tcid, tc_used, + tcache_sizes[tcid], bl_used, block_max_counts[tcid]); +#endif block_counts[tcid] = 0; + block_limit[tcid] = block_max_counts[tcid] - 1; block_link_pool_counts[tcid] = 0; - unresolved_links[tcid] = NULL; + blink_free[tcid] = NULL; + memset(unresolved_links[tcid], 0, sizeof(*unresolved_links[0]) * hash_table_sizes[tcid]); memset(hash_tables[tcid], 0, sizeof(*hash_tables[0]) * hash_table_sizes[tcid]); tcache_ptrs[tcid] = tcache_bases[tcid]; - if (Pico32xMem != NULL) { - if (tcid == 0) // ROM, RAM - memset(Pico32xMem->drcblk_ram, 0, - sizeof(Pico32xMem->drcblk_ram)); - else - memset(Pico32xMem->drcblk_da[tcid - 1], 0, - sizeof(Pico32xMem->drcblk_da[0])); - } -#if BRANCH_CACHE - if (tcid) - memset32(sh2s[tcid-1].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); - else { - memset32(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); - memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4); + tcache_limit[tcid] = tcache_bases[tcid] + tcache_sizes[tcid]; + if (Pico32xMem->sdram != NULL) { + if (tcid == 0) { // ROM, RAM + memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram)); + memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); + memset(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)); + memset(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)); + } else { + memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram)); + memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); + memset(Pico32xMem->drcblk_da[tcid - 1], 0, sizeof(Pico32xMem->drcblk_da[tcid - 1])); + memset(Pico32xMem->drclit_da[tcid - 1], 0, 
sizeof(Pico32xMem->drclit_da[tcid - 1])); + memset(sh2s[tcid - 1].branch_cache, -1, sizeof(sh2s[0].branch_cache)); } -#endif + } #if (DRC_DEBUG & 4) tcache_dsm_ptrs[tcid] = tcache_bases[tcid]; #endif @@ -652,69 +691,222 @@ static void REGPARM(1) flush_tcache(int tcid) static void add_to_hashlist(struct block_entry *be, int tcache_id) { u32 tcmask = hash_table_sizes[tcache_id] - 1; + struct block_entry **head = &HASH_FUNC(hash_tables[tcache_id], be->pc, tcmask); - be->next = HASH_FUNC(hash_tables[tcache_id], be->pc, tcmask); - HASH_FUNC(hash_tables[tcache_id], be->pc, tcmask) = be; + be->prev = NULL; + if (*head) + (*head)->prev = be; + be->next = *head; + *head = be; +} -#if (DRC_DEBUG & 2) - if (be->next != NULL) { - printf(" %08x: hash collision with %08x\n", - be->pc, be->next->pc); - hash_collisions++; - } +static void rm_from_hashlist(struct block_entry *be, int tcache_id) +{ + u32 tcmask = hash_table_sizes[tcache_id] - 1; + struct block_entry **head = &HASH_FUNC(hash_tables[tcache_id], be->pc, tcmask); + +#if DRC_DEBUG & 1 + struct block_entry *current = be; + while (current->prev != NULL) + current = current->prev; + if (current != *head) + dbg(1, "rm_from_hashlist @%p: be %p %08x missing?", head, be, be->pc); #endif + + if (be->prev != NULL) + be->prev->next = be->next; + else + *head = be->next; + if (be->next != NULL) + be->next->prev = be->prev; } -static void rm_from_hashlist(struct block_entry *be, int tcache_id) + +static void add_to_hashlist_unresolved(struct block_link *bl, int tcache_id) { u32 tcmask = hash_table_sizes[tcache_id] - 1; - struct block_entry *cur, *prev; - - cur = HASH_FUNC(hash_tables[tcache_id], be->pc, tcmask); - if (cur == NULL) - goto missing; - - if (be == cur) { // first - HASH_FUNC(hash_tables[tcache_id], be->pc, tcmask) = be->next; - return; + struct block_link **head = &HASH_FUNC(unresolved_links[tcache_id], bl->target_pc, tcmask); + + bl->target = NULL; // marker for not resolved + bl->prev = NULL; + if (*head) + 
(*head)->prev = bl; + bl->next = *head; + *head = bl; +} + +static void rm_from_hashlist_unresolved(struct block_link *bl, int tcache_id) +{ + u32 tcmask = hash_table_sizes[tcache_id] - 1; + struct block_link **head = &HASH_FUNC(unresolved_links[tcache_id], bl->target_pc, tcmask); + +#if DRC_DEBUG & 1 + struct block_link *current = bl; + while (current->prev != NULL) + current = current->prev; + if (current != *head) + dbg(1, "rm_from_hashlist unresolved @%p: bl %p %p %08x missing?", head, bl, bl->target, bl->target_pc); +#endif + + if (bl->prev != NULL) + bl->prev->next = bl->next; + else + *head = bl->next; + if (bl->next != NULL) + bl->next->prev = bl->prev; +} + +static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nolit); +static void dr_free_oldest_block(int tcache_id) +{ + struct block_desc *bd; + + if (block_limit[tcache_id] >= block_max_counts[tcache_id]) { + // block desc wrap around + block_limit[tcache_id] = 0; } + bd = &block_tables[tcache_id][block_limit[tcache_id]]; - for (prev = cur, cur = cur->next; cur != NULL; cur = cur->next) { - if (cur == be) { - prev->next = cur->next; - return; - } + if (bd->tcache_ptr && bd->tcache_ptr < tcache_ptrs[tcache_id]) { + // cache wrap around + tcache_ptrs[tcache_id] = bd->tcache_ptr; } -missing: - dbg(1, "rm_from_hashlist: be %p %08x missing?", be, be->pc); + if (bd->addr && bd->entry_count) + sh2_smc_rm_block_entry(bd, tcache_id, 0); + + block_limit[tcache_id]++; + if (block_limit[tcache_id] >= block_max_counts[tcache_id]) + block_limit[tcache_id] = 0; + bd = &block_tables[tcache_id][block_limit[tcache_id]]; + if (bd->tcache_ptr >= tcache_ptrs[tcache_id]) + tcache_limit[tcache_id] = bd->tcache_ptr; + else + tcache_limit[tcache_id] = tcache_bases[tcache_id] + tcache_sizes[tcache_id]; } -static void unregister_links(struct block_entry *be, int tcache_id) +static u8 *dr_prepare_cache(int tcache_id, int insn_count) { - struct block_link *bl_unresolved = unresolved_links[tcache_id]; - struct 
block_link *bl, *bl_next; - - for (bl = be->links; bl != NULL; ) { - bl_next = bl->next; - bl->next = bl_unresolved; - bl_unresolved = bl; - bl = bl_next; +#if BRANCH_CACHE + u8 *limit = tcache_limit[tcache_id]; +#endif + + // if no block desc available + if (block_counts[tcache_id] == block_limit[tcache_id]) + dr_free_oldest_block(tcache_id); + + // while not enough cache space left (limit - tcache_ptr < max space needed) + while (tcache_limit[tcache_id] - tcache_ptrs[tcache_id] < insn_count * 128) + dr_free_oldest_block(tcache_id); + +#if BRANCH_CACHE + if (limit != tcache_limit[tcache_id]) { + if (tcache_id) + memset32(sh2s[tcache_id-1].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + else { + memset32(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4); + } } - be->links = NULL; - unresolved_links[tcache_id] = bl_unresolved; +#endif + return (u8 *)tcache_ptrs[tcache_id]; } -// unlike sh2_smc_rm_block, the block stays and can still be accessed -// by other already directly linked blocks, just not preferred -static void kill_block_entry(struct block_entry *be, int tcache_id) +static void dr_mark_memory(int mark, struct block_desc *block, int tcache_id, u32 nolit) { - rm_from_hashlist(be, tcache_id); - unregister_links(be, tcache_id); + u8 *drc_ram_blk = NULL, *lit_ram_blk = NULL; + u32 addr, end, mask = 0, shift = 0, idx; + + // mark memory blocks as containing compiled code + if ((block->addr & 0xc7fc0000) == 0x06000000 + || (block->addr & 0xfffff000) == 0xc0000000) + { + if (tcache_id != 0) { + // data array + drc_ram_blk = Pico32xMem->drcblk_da[tcache_id-1]; + lit_ram_blk = Pico32xMem->drclit_da[tcache_id-1]; + shift = SH2_DRCBLK_DA_SHIFT; + } + else { + // SDRAM + drc_ram_blk = Pico32xMem->drcblk_ram; + lit_ram_blk = Pico32xMem->drclit_ram; + shift = SH2_DRCBLK_RAM_SHIFT; + } + mask = ram_sizes[tcache_id] - 1; + + // mark recompiled insns + addr = block->addr & ~((1 << shift) - 
1); + end = block->addr + block->size; + for (idx = (addr & mask) >> shift; addr < end; addr += (1 << shift)) + drc_ram_blk[idx++] += mark; + + // mark literal pool + if (addr < (block->addr_lit & ~((1 << shift) - 1))) + addr = block->addr_lit & ~((1 << shift) - 1); + end = block->addr_lit + block->size_lit; + for (idx = (addr & mask) >> shift; addr < end; addr += (1 << shift)) + drc_ram_blk[idx++] += mark; + + // mark for literals disabled + if (nolit) { + addr = nolit & ~((1 << shift) - 1); + end = block->addr_lit + block->size_lit; + for (idx = (addr & mask) >> shift; addr < end; addr += (1 << shift)) + lit_ram_blk[idx++] = 1; + } + + if (mark < 0) + rm_from_block_lists(block); + else { + // add to invalidation lookup lists + addr = block->addr & ~(INVAL_PAGE_SIZE - 1); + end = block->addr + block->size; + for (idx = (addr & mask) / INVAL_PAGE_SIZE; addr < end; addr += INVAL_PAGE_SIZE) + add_to_block_list(&inval_lookup[tcache_id][idx++], block); + + if (addr < (block->addr_lit & ~(INVAL_PAGE_SIZE - 1))) + addr = block->addr_lit & ~(INVAL_PAGE_SIZE - 1); + end = block->addr_lit + block->size_lit; + for (idx = (addr & mask) / INVAL_PAGE_SIZE; addr < end; addr += INVAL_PAGE_SIZE) + add_to_block_list(&inval_lookup[tcache_id][idx++], block); + } + } } -static struct block_desc *dr_add_block(u32 addr, u16 size_lit, - u16 size_nolit, int is_slave, int *blk_id) +static u32 dr_check_nolit(u32 start, u32 end, int tcache_id) +{ + u8 *lit_ram_blk = NULL; + u32 mask = 0, shift = 0, addr, idx; + + if ((start & 0xc7fc0000) == 0x06000000 + || (start & 0xfffff000) == 0xc0000000) + { + if (tcache_id != 0) { + // data array + lit_ram_blk = Pico32xMem->drclit_da[tcache_id-1]; + shift = SH2_DRCBLK_DA_SHIFT; + } + else { + // SDRAM + lit_ram_blk = Pico32xMem->drclit_ram; + shift = SH2_DRCBLK_RAM_SHIFT; + } + mask = ram_sizes[tcache_id] - 1; + + addr = start & ~((1 << shift) - 1); + for (idx = (addr & mask) >> shift; addr < end; addr += (1 << shift)) + if (lit_ram_blk[idx++]) + break; 
+ + return (addr < start ? start : addr > end ? end : addr); + } + + return end; +} + +static struct block_desc *dr_add_block(u32 addr, int size, + u32 addr_lit, int size_lit, int is_slave, int *blk_id) { struct block_entry *be; struct block_desc *bd; @@ -723,26 +915,27 @@ static struct block_desc *dr_add_block(u32 addr, u16 size_lit, // do a lookup to get tcache_id and override check be = dr_get_entry(addr, is_slave, &tcache_id); - if (be != NULL) { - dbg(1, "block override for %08x, was %p", addr, be->tcache_ptr); - kill_block_entry(be, tcache_id); - } + if (be != NULL) + dbg(1, "block override for %08x", addr); bcount = &block_counts[tcache_id]; - if (*bcount >= block_max_counts[tcache_id]) { + if (*bcount == block_limit[tcache_id]) { dbg(1, "bd overflow for tcache %d", tcache_id); return NULL; } bd = &block_tables[tcache_id][*bcount]; bd->addr = addr; - bd->size = size_lit; - bd->size_nolit = size_nolit; + bd->size = size; + bd->addr_lit = addr_lit; + bd->size_lit = size_lit; + bd->tcache_ptr = tcache_ptr; + bd->active = 1; bd->entry_count = 1; bd->entryp[0].pc = addr; bd->entryp[0].tcache_ptr = tcache_ptr; - bd->entryp[0].links = NULL; + bd->entryp[0].links = bd->entryp[0].o_links = NULL; #if (DRC_DEBUG & 2) bd->entryp[0].block = bd; bd->refcount = 0; @@ -751,6 +944,8 @@ static struct block_desc *dr_add_block(u32 addr, u16 size_lit, *blk_id = *bcount; (*bcount)++; + if (*bcount >= block_max_counts[tcache_id]) + *bcount = 0; return bd; } @@ -777,43 +972,47 @@ static void *dr_failure(void) exit(1); } -static void *dr_prepare_ext_branch(u32 pc, int is_slave, int tcache_id) +static void *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_slave, int tcache_id) { #if LINK_BRANCHES struct block_link *bl = block_link_pool[tcache_id]; int cnt = block_link_pool_counts[tcache_id]; struct block_entry *be = NULL; int target_tcache_id; - int i; be = dr_get_entry(pc, is_slave, &target_tcache_id); - if (target_tcache_id != tcache_id) + if (target_tcache_id && 
target_tcache_id != tcache_id) return sh2_drc_dispatcher; - // if pool has been freed, reuse - for (i = cnt - 1; i >= 0; i--) - if (bl[i].target_pc != 0) - break; - cnt = i + 1; - if (cnt >= block_link_pool_max_counts[tcache_id]) { + if (blink_free[tcache_id] != NULL) { + bl = blink_free[tcache_id]; + blink_free[tcache_id] = bl->next; + } else if (cnt >= block_link_pool_max_counts[tcache_id]) { dbg(1, "bl overflow for tcache %d", tcache_id); return sh2_drc_dispatcher; + } else { + bl += cnt; + block_link_pool_counts[tcache_id] = cnt+1; } - bl += cnt; - block_link_pool_counts[tcache_id]++; + bl->tcache_id = tcache_id; bl->target_pc = pc; bl->jump = tcache_ptr; + bl->o_next = owner->o_links; + owner->o_links = bl; if (be != NULL) { - dbg(2, "- early link from %p to pc %08x", bl->jump, pc); + dbg(2, "- early link from %p to pc %08x entry %p", bl->jump, pc, be->tcache_ptr); + bl->target = be; + bl->prev = NULL; + if (be->links) + be->links->prev = bl; bl->next = be->links; be->links = bl; return be->tcache_ptr; } else { - bl->next = unresolved_links[tcache_id]; - unresolved_links[tcache_id] = bl; + add_to_hashlist_unresolved(bl, tcache_id); return sh2_drc_dispatcher; } #else @@ -824,30 +1023,28 @@ static void *dr_prepare_ext_branch(u32 pc, int is_slave, int tcache_id) static void dr_link_blocks(struct block_entry *be, int tcache_id) { #if LINK_BRANCHES - struct block_link *first = unresolved_links[tcache_id]; - struct block_link *bl, *prev, *tmp; + u32 tcmask = hash_table_sizes[tcache_id] - 1; u32 pc = be->pc; + struct block_link **head = &HASH_FUNC(unresolved_links[tcache_id], pc, tcmask); + struct block_link *bl = *head, *next; - for (bl = prev = first; bl != NULL; ) { + while (bl != NULL) { + next = bl->next; if (bl->target_pc == pc) { - dbg(2, "- link from %p to pc %08x", bl->jump, pc); - emith_jump_patch(bl->jump, tcache_ptr); - + dbg(2, "- link from %p to pc %08x entry %p", bl->jump, pc, be->tcache_ptr); // move bl from unresolved_links to block_entry - tmp = 
bl->next; + rm_from_hashlist_unresolved(bl, tcache_id); + + emith_jump_patch(bl->jump, be->tcache_ptr); + bl->target = be; + bl->prev = NULL; + if (be->links) + be->links->prev = bl; bl->next = be->links; be->links = bl; - - if (bl == first) - first = prev = bl = tmp; - else - prev->next = bl = tmp; - continue; } - prev = bl; - bl = bl->next; + bl = next; } - unresolved_links[tcache_id] = first; // could sync arm caches here, but that's unnecessary #endif @@ -1954,7 +2151,7 @@ static int emit_memhandler_read_rr(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rs, u32 off emith_move_r_imm(hr2, val); } else { emit_move_r_imm32(rd, val); - hr2 = rcache_get_reg(rd, RC_GR_READ, NULL); + hr2 = rcache_get_reg(rd, RC_GR_RMW, NULL); } if ((size & MF_POSTINCR) && gconst_get(rs, &val)) gconst_new(rs, val + (1 << (size & MF_SIZEMASK))); @@ -2202,14 +2399,14 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) u32 test_irq:1; u32 pending_branch_direct:1; u32 pending_branch_indirect:1; - u32 literals_disabled:1; } drcf = { 0, }; // PC of current, first, last SH2 insn u32 pc, base_pc, end_pc; - u32 end_literals; + u32 base_literals, end_literals; void *block_entry_ptr; struct block_desc *block; + struct block_entry *entry; u16 *dr_pc_base; struct op_data *opd; int blkid_main = 0; @@ -2221,7 +2418,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) int op; base_pc = sh2->pc; - drcf.literals_disabled = literal_disabled_frames != 0; // get base/validate PC dr_pc_base = dr_get_pc_base(base_pc, sh2->is_slave); @@ -2231,31 +2427,11 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) exit(1); } - tcache_ptr = tcache_ptrs[tcache_id]; - - // predict tcache overflow - u = tcache_ptr - tcache_bases[tcache_id]; - if (u > tcache_sizes[tcache_id] - MAX_BLOCK_SIZE) { - dbg(1, "tcache %d overflow", tcache_id); - return NULL; - } - // initial passes to disassemble and analyze the block - scan_block(base_pc, sh2->is_slave, op_flags, &end_pc, &end_literals); - - if 
(drcf.literals_disabled) - end_literals = end_pc; - - block = dr_add_block(base_pc, end_literals - base_pc, - end_pc - base_pc, sh2->is_slave, &blkid_main); - if (block == NULL) - return NULL; - - block_entry_ptr = tcache_ptr; - dbg(2, "== %csh2 block #%d,%d %08x-%08x -> %p", sh2->is_slave ? 's' : 'm', - tcache_id, blkid_main, base_pc, end_pc, block_entry_ptr); - - dr_link_blocks(&block->entryp[0], tcache_id); + scan_block(base_pc, sh2->is_slave, op_flags, &end_pc, &base_literals, &end_literals); + end_literals = dr_check_nolit(base_literals, end_literals, tcache_id); + if (base_literals == end_literals) // map empty lit section to end of code + base_literals = end_literals = end_pc; // collect branch_targets that don't land on delay slots for (pc = base_pc, i = 0; pc < end_pc; i++, pc += 2) { @@ -2272,6 +2448,20 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) memset(branch_target_ptr, 0, sizeof(branch_target_ptr[0]) * branch_target_count); } + tcache_ptr = dr_prepare_cache(tcache_id, (end_pc - base_pc) / 2); +#if (DRC_DEBUG & 4) + tcache_dsm_ptrs[tcache_id] = tcache_ptr; +#endif + + block = dr_add_block(base_pc, end_pc - base_pc, base_literals, + end_literals - base_literals, sh2->is_slave, &blkid_main); + if (block == NULL) + return NULL; + + block_entry_ptr = tcache_ptr; + dbg(2, "== %csh2 block #%d,%d %08x-%08x -> %p", sh2->is_slave ? 
's' : 'm', + tcache_id, blkid_main, base_pc, end_pc, block_entry_ptr); + // clear stale state after compile errors rcache_invalidate(); @@ -2307,41 +2497,36 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // make block entry v = block->entry_count; + entry = &block->entryp[v]; if (v < ARRAY_SIZE(block->entryp)) { - struct block_entry *be_old; - - block->entryp[v].pc = pc; - block->entryp[v].tcache_ptr = tcache_ptr; - block->entryp[v].links = NULL; + entry = &block->entryp[v]; + entry->pc = pc; + entry->tcache_ptr = tcache_ptr; + entry->links = entry->o_links = NULL; #if (DRC_DEBUG & 2) - block->entryp[v].block = block; + entry->block = block; #endif - be_old = dr_get_entry(pc, sh2->is_slave, &tcache_id); - if (be_old != NULL) { - dbg(1, "entry override for %08x, was %p", pc, be_old->tcache_ptr); - kill_block_entry(be_old, tcache_id); - } - - add_to_hashlist(&block->entryp[v], tcache_id); + add_to_hashlist(entry, tcache_id); block->entry_count++; dbg(2, "-- %csh2 block #%d,%d entry %08x -> %p", sh2->is_slave ? 
's' : 'm', tcache_id, blkid_main, pc, tcache_ptr); - - // since we made a block entry, link any other blocks - // that jump to current pc - dr_link_blocks(&block->entryp[v], tcache_id); } else { dbg(1, "too many entryp for block #%d,%d pc=%08x", tcache_id, blkid_main, pc); } - - do_host_disasm(tcache_id); + } else { + entry = block->entryp; } + // since we made a block entry, link any other blocks that jump to it + dr_link_blocks(entry, tcache_id); + if (!tcache_id) // can safely link from cpu-local to global memory + dr_link_blocks(entry, sh2->is_slave?2:1); + v = find_in_array(branch_target_pc, branch_target_count, pc); if (v >= 0) branch_target_ptr[v] = tcache_ptr; @@ -2370,8 +2555,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); emith_cmp_r_imm(sr, 0); emith_jump_cond(DCOND_LE, sh2_drc_exit); - do_host_disasm(tcache_id); - rcache_unlock_all(); #if (DRC_DEBUG & (8|256|512|1024)) sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); @@ -2389,6 +2572,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_restore_caller_regs(tmp); rcache_invalidate_tmp(); #endif + + do_host_disasm(tcache_id); + rcache_unlock_all(); } #ifdef DRC_CMP @@ -2556,8 +2742,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto end_op; case OP_UNDEFINED: - elprintf_sh2(sh2, EL_ANOMALY, - "drc: illegal op %04x @ %08x", op, pc - 2); + elprintf_sh2(sh2, EL_ANOMALY, "drc: unhandled op %04x @ %08x", op, pc-2); opd->imm = (op_flags[i] & OF_B_IN_DS) ? 
6 : 4; // fallthrough case OP_TRAPA: // TRAPA #imm 11000011iiiiiiii @@ -3525,7 +3710,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emit_move_r_imm32(SHR_PC, target_pc); rcache_clean(); - target = dr_prepare_ext_branch(target_pc, sh2->is_slave, tcache_id); + target = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id); if (target == NULL) return NULL; } @@ -3571,7 +3756,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emit_move_r_imm32(SHR_PC, pc); rcache_flush(); - target = dr_prepare_ext_branch(pc, sh2->is_slave, tcache_id); + target = dr_prepare_ext_branch(block->entryp, pc, sh2->is_slave, tcache_id); if (target == NULL) return NULL; emith_jump_patchable(target); @@ -3594,45 +3779,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_jump_patch(branch_patch_ptr[i], target); } - // mark memory blocks as containing compiled code - // override any overlay blocks as they become unreachable anyway - if ((block->addr & 0xc7fc0000) == 0x06000000 - || (block->addr & 0xfffff000) == 0xc0000000) - { - u16 *drc_ram_blk = NULL; - u32 addr, mask = 0, shift = 0; - - if (tcache_id != 0) { - // data array, BIOS - drc_ram_blk = Pico32xMem->drcblk_da[sh2->is_slave]; - shift = SH2_DRCBLK_DA_SHIFT; - mask = 0xfff; - } - else { - // SDRAM - drc_ram_blk = Pico32xMem->drcblk_ram; - shift = SH2_DRCBLK_RAM_SHIFT; - mask = 0x3ffff; - } - - // mark recompiled insns - drc_ram_blk[(base_pc & mask) >> shift] = 1; - for (pc = base_pc; pc < end_pc; pc += 2) - drc_ram_blk[(pc & mask) >> shift] = 1; - - // mark literals - for (i = 0; i < literal_addr_count; i++) { - u = literal_addr[i]; - drc_ram_blk[(u & mask) >> shift] = 1; - } - - // add to invalidation lookup lists - addr = base_pc & ~(INVAL_PAGE_SIZE - 1); - for (; addr < end_literals; addr += INVAL_PAGE_SIZE) { - i = (addr & mask) / INVAL_PAGE_SIZE; - add_to_block_list(&inval_lookup[tcache_id][i], block); - } - } + dr_mark_memory(1, block, tcache_id, 0); 
tcache_ptrs[tcache_id] = tcache_ptr; @@ -3640,10 +3787,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) do_host_disasm(tcache_id); - if (drcf.literals_disabled && literal_addr_count) - dbg(1, "literals_disabled && literal_addr_count?"); - dbg(2, " block #%d,%d tcache %d/%d, insns %d -> %d %.3f", - tcache_id, blkid_main, + dbg(2, " block #%d,%d -> %p tcache %d/%d, insns %d -> %d %.3f", + tcache_id, blkid_main, tcache_ptr, tcache_ptr - tcache_bases[tcache_id], tcache_sizes[tcache_id], insns_compiled, host_insn_count, (float)host_insn_count / insns_compiled); if ((sh2->pc & 0xc6000000) == 0x02000000) { // ROM @@ -3657,7 +3802,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) printf("~~~\n"); */ -#if (DRC_DEBUG & 4) +#if (DRC_DEBUG) fflush(stdout); #endif @@ -3772,13 +3917,6 @@ static void sh2_generate_utils(void) emith_ctx_read(arg1, offsetof(SH2, drc_tmp)); // tcache_id emith_call(sh2_translate); emit_block_entry(); - // sh2_translate() failed, flush cache and retry - emith_ctx_read(arg0, offsetof(SH2, drc_tmp)); - emith_call(flush_tcache); - emith_move_r_r_ptr(arg0, CONTEXT_REG); - emith_ctx_read(arg1, offsetof(SH2, drc_tmp)); - emith_call(sh2_translate); - emit_block_entry(); // XXX: can't translate, fail emith_call(dr_failure); @@ -3891,148 +4029,126 @@ static void sh2_generate_utils(void) #endif } -static void sh2_smc_rm_block(struct block_desc *bd, int tcache_id, u32 ram_mask) +static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nolit) { - u32 i, addr, end_addr; - void *tmp; + struct block_link *bl; + u32 i; - dbg(2, " killing block %08x-%08x-%08x, blkid %d,%d", - bd->addr, bd->addr + bd->size_nolit, bd->addr + bd->size, + dbg(2, " killing entry %08x-%08x,%08x-%08x, blkid %d,%d", + bd->addr, bd->addr + bd->size, bd->addr_lit, bd->addr_lit + bd->size_lit, tcache_id, bd - block_tables[tcache_id]); if (bd->addr == 0 || bd->entry_count == 0) { dbg(1, " killing dead block!? 
%08x", bd->addr); return; } - // remove from inval_lookup - addr = bd->addr & ~(INVAL_PAGE_SIZE - 1); - end_addr = bd->addr + bd->size; - for (; addr < end_addr; addr += INVAL_PAGE_SIZE) { - i = (addr & ram_mask) / INVAL_PAGE_SIZE; - rm_from_block_list(&inval_lookup[tcache_id][i], bd); - } - - tmp = tcache_ptr; - - // remove from hash table, make incoming links unresolved - // XXX: maybe patch branches w/flush instead? + // remove from hash table, make incoming links unresolved, revoke outgoing links for (i = 0; i < bd->entry_count; i++) { - rm_from_hashlist(&bd->entryp[i], tcache_id); - - // since we never reuse tcache space of dead blocks, - // insert jump to dispatcher for blocks that are linked to this - tcache_ptr = bd->entryp[i].tcache_ptr; - emit_move_r_imm32(SHR_PC, bd->entryp[i].pc); - rcache_flush(); - emith_jump(sh2_drc_dispatcher); + if (bd->active) + rm_from_hashlist(&bd->entryp[i], tcache_id); + + for (bl = bd->entryp[i].o_links; bl != NULL; ) { + struct block_link *bl_next = bl->o_next; + if (bl->target) { + if (bl->prev) + bl->prev->next = bl->next; + else + bl->target->links = bl->next; + if (bl->next) + bl->next->prev = bl->prev; + bl->target = NULL; + } else if (bd->active) + rm_from_hashlist_unresolved(bl, tcache_id); + // free bl + bl->jump = NULL; + bl->next = blink_free[bl->tcache_id]; + blink_free[bl->tcache_id] = bl; + bl = bl_next; + } + bd->entryp[i].o_links = NULL; - host_instructions_updated(bd->entryp[i].tcache_ptr, tcache_ptr); + for (bl = bd->entryp[i].links; bl != NULL; ) { + struct block_link *bl_next = bl->next; + dbg(2, "- unlink from %p to pc %08x", bl->jump, bl->target_pc); + emith_jump_patch(bl->jump, sh2_drc_dispatcher); + // update cpu caches since the previous jump target doesn't exist anymore + host_instructions_updated(bl->jump, bl->jump+4); - unregister_links(&bd->entryp[i], tcache_id); + add_to_hashlist_unresolved(bl, tcache_id); + bl = bl_next; + } + bd->entryp[i].links = NULL; } - tcache_ptr = tmp; + if (bd->active) + 
dr_mark_memory(-1, bd, tcache_id, nolit); - bd->addr = bd->size = bd->size_nolit = 0; + bd->addr = bd->size = bd->addr_lit = bd->size_lit = 0; bd->entry_count = 0; - -#if BRANCH_CACHE - if (tcache_id) - memset32(sh2s[tcache_id-1].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); - else { - memset32(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); - memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4); - } -#endif + bd->active = 0; + rm_from_block_lists(bd); } -/* -04205:243: == msh2 block #0,200 060017a8-060017f0 -> 0x27cb9c - 060017a8 d11c MOV.L @($70,PC),R1 ; @$0600181c - -04230:261: msh2 xsh w32 [260017a8] d225e304 -04230:261: msh2 smc check @260017a8 -04239:226: = ssh2 enter 060017a8 0x27cb9c, c=173 -*/ -static void sh2_smc_rm_blocks(u32 a, u16 *drc_ram_blk, int tcache_id, u32 shift, u32 mask) +static void sh2_smc_rm_blocks(u32 a, int tcache_id, u32 shift) { - struct block_list **blist = NULL, *entry; + struct block_list **blist, *entry, *next; + u32 mask = ram_sizes[tcache_id] - 1; + u32 wtmask = ~0x20000000; // writethrough area mask + u32 start_addr, end_addr; + u32 start_lit, end_lit; struct block_desc *block; - u32 start_addr, end_addr, taddr, i; - u32 from = ~0, to = 0; - - // ignore cache-through - a &= ~0x20000000; +#if (DRC_DEBUG & 2) + int removed = 0; +#endif + // need to check cached and writethrough area + a &= wtmask; blist = &inval_lookup[tcache_id][(a & mask) / INVAL_PAGE_SIZE]; entry = *blist; while (entry != NULL) { + next = entry->next; block = entry->block; - start_addr = block->addr & ~0x20000000; + start_addr = block->addr & wtmask; end_addr = start_addr + block->size; - if (start_addr <= a && a < end_addr) { - // get addr range that includes all removed blocks - if (from > start_addr) - from = start_addr; - if (to < end_addr) - to = end_addr; - - if (a >= start_addr + block->size_nolit) - literal_disabled_frames = 3; - sh2_smc_rm_block(block, tcache_id, mask); - - // entry lost, restart search - entry = *blist; - 
continue; - } - entry = entry->next; - } - - if (from >= to) - return; - - // update range around a to match latest state - from &= ~(INVAL_PAGE_SIZE - 1); - to |= (INVAL_PAGE_SIZE - 1); - for (taddr = from; taddr < to; taddr += INVAL_PAGE_SIZE) { - i = (taddr & mask) / INVAL_PAGE_SIZE; - entry = inval_lookup[tcache_id][i]; - - for (; entry != NULL; entry = entry->next) { - block = entry->block; - - start_addr = block->addr & ~0x20000000; - if (start_addr > a) { - if (to > start_addr) - to = start_addr; - } - else { - end_addr = start_addr + block->size; - if (from < end_addr) - from = end_addr; - } + start_lit = block->addr_lit & wtmask; + end_lit = start_lit + block->size_lit; + if ((start_addr <= a && a < end_addr) || + (start_lit <= a && a < end_lit)) + { + dbg(2, "smc remove @%08x", a); + end_addr = (start_lit <= a && block->size_lit ? a : 0); + sh2_smc_rm_block_entry(block, tcache_id, end_addr); +#if (DRC_DEBUG & 2) + removed = 1; +#endif } + entry = next; } - - // clear code marks - if (from < to) { - u16 *p = drc_ram_blk + ((from & mask) >> shift); - memset(p, 0, (to - from) >> (shift - 1)); +#if (DRC_DEBUG & 2) + if (!removed) + dbg(2, "rm_blocks called @%08x, no work?", a); +#endif +#if BRANCH_CACHE + if (tcache_id) + memset32(sh2s[tcache_id-1].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + else { + memset32(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4); } +#endif } void sh2_drc_wcheck_ram(unsigned int a, int val, SH2 *sh2) { - dbg(2, "%csh2 smc check @%08x", sh2->is_slave ? 's' : 'm', a); - sh2_smc_rm_blocks(a, Pico32xMem->drcblk_ram, 0, SH2_DRCBLK_RAM_SHIFT, 0x3ffff); + dbg(2, "%csh2 smc check @%08x v=%d", sh2->is_slave ? 's' : 'm', a, val); + sh2_smc_rm_blocks(a, 0, SH2_DRCBLK_RAM_SHIFT); } void sh2_drc_wcheck_da(unsigned int a, int val, SH2 *sh2) { int cpuid = sh2->is_slave; - dbg(2, "%csh2 smc check @%08x", cpuid ? 
's' : 'm', a); - sh2_smc_rm_blocks(a, Pico32xMem->drcblk_da[cpuid], - 1 + cpuid, SH2_DRCBLK_DA_SHIFT, 0xfff); + dbg(2, "%csh2 smc check @%08x v=%d", cpuid ? 's' : 'm', a, val); + sh2_smc_rm_blocks(a, 1 + cpuid, SH2_DRCBLK_DA_SHIFT); } int sh2_execute_drc(SH2 *sh2c, int cycles) @@ -4061,10 +4177,14 @@ static void block_stats(void) int c, b, i, total = 0; printf("block stats:\n"); - for (b = 0; b < ARRAY_SIZE(block_tables); b++) + for (b = 0; b < ARRAY_SIZE(block_tables); b++) { for (i = 0; i < block_counts[b]; i++) if (block_tables[b][i].addr != 0) total += block_tables[b][i].refcount; + for (i = block_limit[b]; i < block_max_counts[b]; i++) + if (block_tables[b][i].addr != 0) + total += block_tables[b][i].refcount; + } for (c = 0; c < 10; c++) { struct block_desc *blk, *maxb = NULL; @@ -4077,17 +4197,27 @@ static void block_stats(void) maxb = blk; } } + for (i = block_limit[b]; i < block_max_counts[b]; i++) { + blk = &block_tables[b][i]; + if (blk->addr != 0 && blk->refcount > max) { + max = blk->refcount; + maxb = blk; + } + } } if (maxb == NULL) break; - printf("%08x %9d %2.3f%%\n", maxb->addr, maxb->refcount, + printf("%08x %p %9d %2.3f%%\n", maxb->addr, maxb->tcache_ptr, maxb->refcount, (double)maxb->refcount / total * 100.0); maxb->refcount = 0; } - for (b = 0; b < ARRAY_SIZE(block_tables); b++) + for (b = 0; b < ARRAY_SIZE(block_tables); b++) { for (i = 0; i < block_counts[b]; i++) block_tables[b][i].refcount = 0; + for (i = block_limit[b]; i < block_max_counts[b]; i++) + block_tables[b][i].refcount = 0; + } #endif } @@ -4169,8 +4299,6 @@ void sh2_drc_mem_setup(SH2 *sh2) void sh2_drc_frame(void) { - if (literal_disabled_frames > 0) - literal_disabled_frames--; } int sh2_drc_init(SH2 *sh2) @@ -4197,9 +4325,19 @@ int sh2_drc_init(SH2 *sh2) hash_tables[i] = calloc(hash_table_sizes[i], sizeof(*hash_tables[0])); if (hash_tables[i] == NULL) goto fail; + + unresolved_links[i] = calloc(hash_table_sizes[i], sizeof(*unresolved_links[0])); + if (unresolved_links[i] == 
NULL) + goto fail; } memset(block_counts, 0, sizeof(block_counts)); + for (i = 0; i < ARRAY_SIZE(block_counts); i++) { + block_limit[i] = block_max_counts[i] - 1; + } memset(block_link_pool_counts, 0, sizeof(block_link_pool_counts)); + for (i = 0; i < ARRAY_SIZE(blink_free); i++) { + blink_free[i] = NULL; + } drc_cmn_init(); rcache_init(); @@ -4208,8 +4346,11 @@ int sh2_drc_init(SH2 *sh2) host_instructions_updated(tcache, tcache_ptr); tcache_bases[0] = tcache_ptrs[0] = tcache_ptr; - for (i = 1; i < ARRAY_SIZE(tcache_bases); i++) + tcache_limit[0] = tcache_bases[0] + tcache_sizes[0] - (tcache_ptr-tcache); + for (i = 1; i < ARRAY_SIZE(tcache_bases); i++) { tcache_bases[i] = tcache_ptrs[i] = tcache_bases[i - 1] + tcache_sizes[i - 1]; + tcache_limit[i] = tcache_bases[i] + tcache_sizes[i]; + } #if (DRC_DEBUG & 4) for (i = 0; i < ARRAY_SIZE(block_tables); i++) @@ -4233,6 +4374,7 @@ int sh2_drc_init(SH2 *sh2) void sh2_drc_finish(SH2 *sh2) { + struct block_list *bl, *bn; int i; if (block_tables[0] == NULL) @@ -4243,19 +4385,28 @@ void sh2_drc_finish(SH2 *sh2) for (i = 0; i < TCACHE_BUFFERS; i++) { #if (DRC_DEBUG & 4) printf("~~~ tcache %d\n", i); +#if 0 tcache_dsm_ptrs[i] = tcache_bases[i]; tcache_ptr = tcache_ptrs[i]; do_host_disasm(i); + if (tcache_limit[i] < tcache_bases[i] + tcache_sizes[i]) { + tcache_dsm_ptrs[i] = tcache_limit[i]; + tcache_ptr = tcache_bases[i] + tcache_sizes[i]; + do_host_disasm(i); + } +#endif + printf("max links: %d\n", block_link_pool_counts[i]); #endif if (block_tables[i] != NULL) free(block_tables[i]); block_tables[i] = NULL; - if (block_link_pool[i] == NULL) + if (block_link_pool[i] != NULL) free(block_link_pool[i]); block_link_pool[i] = NULL; + blink_free[i] = NULL; - if (inval_lookup[i] == NULL) + if (inval_lookup[i] != NULL) free(inval_lookup[i]); inval_lookup[i] = NULL; @@ -4265,6 +4416,12 @@ void sh2_drc_finish(SH2 *sh2) } } + for (bl = blist_free; bl; bl = bn) { + bn = bl->next; + free(bl); + } + blist_free = NULL; + drc_cmn_cleanup(); } 
@@ -4304,7 +4461,7 @@ static void *dr_get_pc_base(u32 pc, int is_slave) } void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, - u32 *end_literals_out) + u32 *base_literals_out, u32 *end_literals_out) { u16 *dr_pc_base; u32 pc, op, tmp; @@ -5073,8 +5230,6 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, default: undefined: - elprintf(EL_ANOMALY, "%csh2 drc: unhandled op %04x @ %08x", - is_slave ? 's' : 'm', op, pc); opd->op = OP_UNDEFINED; // an unhandled instruction is probably not code if it's not the 1st insn if (!(op_flags[i] & OF_DELAY_OP) && pc != base_pc) @@ -5187,6 +5342,8 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, lowest_literal = end_literals; *end_pc_out = end_pc; + if (base_literals_out != NULL) + *base_literals_out = (lowest_literal ?: end_pc); if (end_literals_out != NULL) *end_literals_out = (end_literals ?: end_pc); } diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 6a8596b83..36dfd9456 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -24,7 +24,7 @@ void sh2_drc_frame(void); void scan_block(unsigned int base_pc, int is_slave, unsigned char *op_flags, unsigned int *end_pc, - unsigned int *end_literals); + unsigned int *base_literals, unsigned int *end_literals); #if defined(DRC_SH2) // direct access to some host CPU registers used by the DRC @@ -39,13 +39,15 @@ void scan_block(unsigned int base_pc, int is_slave, #warning "direct DRC register access not available for this host" #endif -#ifdef DCR_SR_REG -#define DRC_DECLARE_SR register int sh2_sr asm(#DCR_SR_REG) +#ifdef DRC_SR_REG +#define __DRC_DECLARE_SR(SR) register int sh2_sr asm(#SR) +#define _DRC_DECLARE_SR(SR) __DRC_DECLARE_SR(SR) +#define DRC_DECLARE_SR _DRC_DECLARE_SR(DRC_SR_REG) #define DRC_SAVE_SR(sh2) \ - if ((sh2->state & (SH2_STATE_RUN|SH2_STATE_BUSY)) == SH2_STATE_RUN) \ + if ((sh2->state & (SH2_STATE_RUN)) == SH2_STATE_RUN) \ sh2->sr = sh2_sr; #define DRC_RESTORE_SR(sh2) \ - if 
((sh2->state & (SH2_STATE_RUN|SH2_STATE_BUSY)) == SH2_STATE_RUN) \ + if ((sh2->state & (SH2_STATE_RUN)) == SH2_STATE_RUN) \ sh2_sr = sh2->sr; #else #define DRC_DECLARE_SR diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 47329835b..30d0e4d54 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -1432,7 +1432,7 @@ static void REGPARM(3) sh2_write8_sdram(u32 a, u32 d, SH2 *sh2) { u32 a1 = a & 0x3ffff; #ifdef DRC_SH2 - u16 *p = sh2->p_drcblk_ram; + u8 *p = sh2->p_drcblk_ram; int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; if (t) sh2_drc_wcheck_ram(a, t, sh2); @@ -1456,7 +1456,7 @@ static void REGPARM(3) sh2_write8_da(u32 a, u32 d, SH2 *sh2) { u32 a1 = a & 0xfff; #ifdef DRC_SH2 - u16 *p = sh2->p_drcblk_da; + u8 *p = sh2->p_drcblk_da; int t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; if (t) sh2_drc_wcheck_da(a, t, sh2); @@ -1511,7 +1511,7 @@ static void REGPARM(3) sh2_write16_sdram(u32 a, u32 d, SH2 *sh2) { u32 a1 = a & 0x3fffe; #ifdef DRC_SH2 - u16 *p = sh2->p_drcblk_ram; + u8 *p = sh2->p_drcblk_ram; int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; if (t) sh2_drc_wcheck_ram(a, t, sh2); @@ -1523,7 +1523,7 @@ static void REGPARM(3) sh2_write16_da(u32 a, u32 d, SH2 *sh2) { u32 a1 = a & 0xffe; #ifdef DRC_SH2 - u16 *p = sh2->p_drcblk_da; + u8 *p = sh2->p_drcblk_da; int t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; if (t) sh2_drc_wcheck_da(a, t, sh2); @@ -1580,7 +1580,7 @@ static void REGPARM(3) sh2_write32_sdram(u32 a, u32 d, SH2 *sh2) { u32 a1 = a & 0x3fffc; #ifdef DRC_SH2 - u16 *p = sh2->p_drcblk_ram; + u8 *p = sh2->p_drcblk_ram; int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; if (t) sh2_drc_wcheck_ram(a, t, sh2); @@ -1595,7 +1595,7 @@ static void REGPARM(3) sh2_write32_da(u32 a, u32 d, SH2 *sh2) { u32 a1 = a & 0xffc; #ifdef DRC_SH2 - u16 *p = sh2->p_drcblk_da; + u8 *p = sh2->p_drcblk_da; int t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; if (t) sh2_drc_wcheck_da(a, t, sh2); diff --git a/pico/pico_int.h b/pico/pico_int.h index 497649b6d..133382420 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -599,7 +599,8 @@ struct 
Pico32xMem { unsigned char sdram[0x40000]; #ifdef DRC_SH2 - unsigned short drcblk_ram[1 << (18 - SH2_DRCBLK_RAM_SHIFT)]; + unsigned char drcblk_ram[1 << (18 - SH2_DRCBLK_RAM_SHIFT)]; + unsigned char drclit_ram[1 << (18 - SH2_DRCBLK_RAM_SHIFT)]; #endif unsigned short dram[2][0x20000/2]; // AKA fb union { @@ -607,7 +608,8 @@ struct Pico32xMem unsigned char m68k_rom_bank[0x10000]; // M68K_BANK_SIZE }; #ifdef DRC_SH2 - unsigned short drcblk_da[2][1 << (12 - SH2_DRCBLK_DA_SHIFT)]; + unsigned char drcblk_da[2][1 << (12 - SH2_DRCBLK_DA_SHIFT)]; + unsigned char drclit_da[2][1 << (12 - SH2_DRCBLK_DA_SHIFT)]; #endif union { unsigned char b[0x800]; From 1f8cc9c081b366496aa34726f5625433032dc16b Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 16 Apr 2019 20:37:52 +0200 Subject: [PATCH 028/174] various small improvements and fixes --- Makefile | 6 +- config.caanoo | 6 +- config.caanoo47 | 4 +- config.dingux | 6 +- config.dingux54 | 6 +- config.gp2x | 4 +- config.gp2x47 | 4 +- config.i386 | 14 +++ config.x86 | 8 +- cpu/cz80/cz80.c | 1 + cpu/drc/cmn.h | 8 -- cpu/drc/emit_arm.c | 56 +++++++--- cpu/drc/emit_x86.c | 68 +++++++----- cpu/sh2/compiler.c | 194 ++++++++++++++++++++++++--------- cpu/sh2/mame/sh2pico.c | 2 +- cpu/sh2/sh2.h | 4 +- pico/32x/32x.c | 28 ++--- pico/32x/draw_arm.S | 20 ++-- pico/32x/memory.c | 8 +- pico/32x/memory_arm.S | 76 ++++++------- pico/cd/gfx_dma.c | 4 - pico/cd/memory_arm.S | 2 +- pico/draw2_arm.S | 2 +- pico/draw_arm.S | 2 +- pico/memory.h | 7 -- pico/memory_amips.S | 2 +- pico/memory_arm.S | 2 +- pico/pico_int.h | 18 ++- platform/common/common.mak | 1 + platform/common/memcpy.c | 37 ++++--- platform/gp2x/code940/memcpy.s | 12 +- tools/mkoffsets.sh | 5 +- 32 files changed, 372 insertions(+), 245 deletions(-) create mode 100644 config.i386 diff --git a/Makefile b/Makefile index 76649ffd1..45fde98fb 100644 --- a/Makefile +++ b/Makefile @@ -209,10 +209,10 @@ LDFLAGS += -Wl,-Map=$(TARGET).map endif endif -target_: pico/pico_int_o32.h $(TARGET) +target_: 
pico/pico_int_offs.h $(TARGET) clean: - $(RM) $(TARGET) $(OBJS) pico/pico_int_o32.h + $(RM) $(TARGET) $(OBJS) pico/pico_int_offs.h $(RM) -r .opk_data $(TARGET): $(OBJS) @@ -226,7 +226,7 @@ endif pprof: platform/linux/pprof.c $(CC) $(CFLAGS) -O2 -ggdb -DPPROF -DPPROF_TOOL -I../../ -I. $^ -o $@ $(LDFLAGS) $(LDLIBS) -pico/pico_int_o32.h:: tools/mkoffsets.sh +pico/pico_int_offs.h:: tools/mkoffsets.sh make -C tools/ XCC="$(CC)" XCFLAGS="$(CFLAGS)" %.o: %.c diff --git a/config.caanoo b/config.caanoo index 39edb5db0..dd053bc5f 100644 --- a/config.caanoo +++ b/config.caanoo @@ -4,11 +4,11 @@ CC = arm-gph-linux-gnueabi-gcc CXX = arm-gph-linux-gnueabi-g++ AS = arm-gph-linux-gnueabi-as STRIP = arm-gph-linux-gnueabi-strip -CFLAGS += -mfloat-abi=soft -mcpu=arm920t -mtune=arm920t -fno-stack-protector -D__GP2X__ -DGPERF +CFLAGS += -mfloat-abi=soft -mcpu=arm920t -mtune=arm920t -fno-stack-protector -D__GP2X__ CFLAGS += -fno-gcse -funswitch-loops -fweb -ftree-loop-im #-fpredictive-commoning -ftree-loop-distribution -frename-registers -CFLAGS += -I/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/include -I/home/build/src/gp2x/armroot-eabi/include +CFLAGS += -I${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/include -I${HOME}/src/gp2x/armroot-eabi/include ASFLAGS += -mfloat-abi=soft -mcpu=arm920t -LDFLAGS += -B/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/lib/gcc/arm-gph-linux-gnueabi/4.2.4 -B/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L/home/build/src/gp2x/armroot-eabi/lib -static +LDFLAGS += -B${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/lib/gcc/arm-gph-linux-gnueabi/4.2.4 -B${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib 
-L${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L${HOME}/src/gp2x/armroot-eabi/lib -static LDLIBS += -lpng -lm -ldl ARCH = arm diff --git a/config.caanoo47 b/config.caanoo47 index f3efde0f7..2c0ee5aff 100644 --- a/config.caanoo47 +++ b/config.caanoo47 @@ -6,9 +6,9 @@ AS = arm-linux-gnueabi-as STRIP = arm-linux-gnueabi-strip CFLAGS += -mfloat-abi=soft -mcpu=arm920t -mtune=arm920t -Wno-unused-result -fno-stack-protector -D__GP2X__ CFLAGS += -fno-gcse -funswitch-loops -fweb -ftree-loop-im #-fpredictive-commoning -ftree-loop-distribution -frename-registers -CFLAGS += -I/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/include -I/home/build/src/gp2x/armroot-eabi/include +CFLAGS += -I${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/include -I${HOME}/src/gp2x/armroot-eabi/include ASFLAGS += -mfloat-abi=soft -mcpu=arm920t -LDFLAGS += -B/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/lib/gcc/arm-gph-linux-gnueabi/4.2.4 -B/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -static +LDFLAGS += -B${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/lib/gcc/arm-gph-linux-gnueabi/4.2.4 -B${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -static LDLIBS += -lpng -lm -ldl ARCH = arm diff --git a/config.dingux b/config.dingux index 6611991cc..8aca06a63 100644 --- a/config.dingux +++ b/config.dingux @@ -4,12 +4,12 @@ CC = mipsel-linux-gcc CXX = mipsel-linux-g++ AS = mipsel-linux-as STRIP = mipsel-linux-strip -CFLAGS += -I/home/build/opt/opendingux-toolchain/usr/include/ -CFLAGS += -I/home/build/opt/opendingux-toolchain/usr/include/SDL +CFLAGS += -I${HOME}/opt/opendingux-toolchain/usr/include/ 
+CFLAGS += -I${HOME}/opt/opendingux-toolchain/usr/include/SDL CFLAGS += -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -fno-stack-protector ASFLAGS += LDFLAGS += -LDLIBS += -B/home/build/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=/home/build/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=/home/build/opt/opendingux-toolchain/lib -lSDL -lasound -lpng -lm -lstdc++ -ldl +LDLIBS += -B${HOME}/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=${HOME}/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=${HOME}/opt/opendingux-toolchain/lib -lSDL -lasound -lpng -lm -lstdc++ -ldl ARCH = mipsel PLATFORM = opendingux diff --git a/config.dingux54 b/config.dingux54 index 96e550148..5f292652b 100644 --- a/config.dingux54 +++ b/config.dingux54 @@ -4,12 +4,12 @@ CC = mipsel-linux-gnu-gcc CXX = mipsel-linux-gnu-g++ AS = mipsel-linux-gnu-as STRIP = mipsel-linux-gnu-strip -CFLAGS += -I/home/build/opt/opendingux-toolchain/usr/include/ -CFLAGS += -I/home/build/opt/opendingux-toolchain/usr/include/SDL +CFLAGS += -I${HOME}/opt/opendingux-toolchain/usr/include/ +CFLAGS += -I${HOME}/opt/opendingux-toolchain/usr/include/SDL CFLAGS += -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -fno-stack-protector ASFLAGS += LDFLAGS += -LDLIBS += -B/home/build/opt/opendingux-toolchain/usr/lib -B/home/build/opt/opendingux-toolchain/lib -Wl,-rpath-link=/home/build/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=/home/build/opt/opendingux-toolchain/lib -lSDL -lasound -lpng -lz -lm -lstdc++ -ldl +LDLIBS += -B${HOME}/opt/opendingux-toolchain/usr/lib -B${HOME}/opt/opendingux-toolchain/lib -Wl,-rpath-link=${HOME}/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=${HOME}/opt/opendingux-toolchain/lib -lSDL -lasound -lpng -lz -lm -lstdc++ -ldl ARCH = mipsel PLATFORM = opendingux diff --git a/config.gp2x b/config.gp2x index de3e47c40..248d73aa1 100644 --- a/config.gp2x +++ b/config.gp2x @@ -5,10 +5,10 @@ CXX = arm-open2x-linux-g++ AS = arm-open2x-linux-as STRIP = arm-open2x-linux-strip CFLAGS += -msoft-float 
-mcpu=arm920t -mtune=arm920t -D__GP2X__ -CFLAGS += -I/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I/home/build/src/gp2x/armroot/include +CFLAGS += -I${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I${HOME}/src/gp2x/armroot/include CFLAGS += -fno-gcse -funswitch-loops -fweb -ftree-loop-im #-fpredictive-commoning -ftree-loop-distribution -frename-registers ASFLAGS += -mcpu=arm920t -mfloat-abi=soft -LDFLAGS += --sysroot /home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L/home/build/src/gp2x/armroot/lib -static +LDFLAGS += --sysroot ${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L${HOME}/src/gp2x/armroot/lib -static LDLIBS += -lpng -lm -ldl ARCH = arm diff --git a/config.gp2x47 b/config.gp2x47 index 1022166dd..21769ada5 100644 --- a/config.gp2x47 +++ b/config.gp2x47 @@ -5,10 +5,10 @@ CXX = arm-linux-gnueabi-g++ AS = arm-linux-gnueabi-as STRIP = arm-linux-gnueabi-strip CFLAGS += -mabi=apcs-gnu -mno-thumb-interwork -mfloat-abi=soft -mfpu=fpa -mcpu=arm920t -mtune=arm920t -Wno-unused-result -fno-stack-protector -D__GP2X__ -CFLAGS += -I/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I/home/build/src/gp2x/armroot/include +CFLAGS += -I${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I${HOME}/src/gp2x/armroot/include CFLAGS += -fno-gcse -funswitch-loops -fweb -ftree-loop-im #-fpredictive-commoning -ftree-loop-distribution -frename-registers ASFLAGS += -mabi=apcs-gnu -mfloat-abi=soft -mfpu=fpa -mcpu=arm920t -LDFLAGS += -mabi=apcs-gnu -mfpu=fpa -B/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L/home/build/src/gp2x/armroot/lib -static +LDFLAGS += -mabi=apcs-gnu -mfpu=fpa 
-B${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L${HOME}/src/gp2x/armroot/lib -static LDLIBS += -lpng -lm -ldl ARCH = arm diff --git a/config.i386 b/config.i386 new file mode 100644 index 000000000..ce07b103e --- /dev/null +++ b/config.i386 @@ -0,0 +1,14 @@ +# Automatically generated by configure +# Configured with: './configure' '--platform=generic' +CC = gcc +CXX = g++ +AS = as +STRIP = strip +CFLAGS += -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -m32 # -pg +ASFLAGS += +LDFLAGS += -m32 #-pg +LDLIBS += -L/usr/lib/i386-linux-gnu -L${HOME}/opt/lib32 -lSDL-1.2 -lasound -lpng -lz -lm -ldl + +ARCH = i386 +PLATFORM = generic +SOUND_DRIVERS = oss alsa sdl diff --git a/config.x86 b/config.x86 index d463157ea..287b82d32 100644 --- a/config.x86 +++ b/config.x86 @@ -4,11 +4,11 @@ CC = gcc CXX = g++ AS = as STRIP = strip -CFLAGS += -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -m32 # -pg +CFLAGS += -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result # -pg ASFLAGS += -LDFLAGS += -m32 #-pg -LDLIBS += -L/usr/lib/i386-linux-gnu/debug -L/home/build/opt/lib32 -lSDL-1.2 -lasound -lpng -lz -lm -ldl +LDFLAGS += #-pg +LDLIBS += -L/usr/lib/x86_64-linux-gnu -lSDL-1.2 -lasound -lpng -lz -lm -ldl -ARCH = x86 +ARCH = x86_64 PLATFORM = generic SOUND_DRIVERS = oss alsa sdl diff --git a/cpu/cz80/cz80.c b/cpu/cz80/cz80.c index 61ca5f84d..0326b0b84 100644 --- a/cpu/cz80/cz80.c +++ b/cpu/cz80/cz80.c @@ -14,6 +14,7 @@ #include "cz80.h" #if PICODRIVE_HACKS +#include #include #endif diff --git a/cpu/drc/cmn.h b/cpu/drc/cmn.h index 5a44bbb7e..bad02a1b3 100644 --- a/cpu/drc/cmn.h +++ b/cpu/drc/cmn.h @@ -1,11 +1,3 @@ -#ifndef UTYPES_DEFINED -typedef unsigned char u8; -typedef signed char s8; -typedef unsigned short u16; -typedef signed short s16; -typedef unsigned int u32; -typedef signed int 
s32; -#endif #define DRC_TCACHE_SIZE (4*1024*1024) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 3f782bb60..4744b1279 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -177,26 +177,25 @@ #define EOP_C_AM3_REG(cond,u,l,rn,rd,s,h,rm) EOP_C_AM3(cond,u,0,l,rn,rd,s,h,rm) /* ldr and str */ -#define EOP_LDR_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,1,0,1,rn,rd,offset_12) -#define EOP_LDRB_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,1,1,1,rn,rd,offset_12) +#define EOP_LDR_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,(offset_12) >= 0,0,1,rn,rd,abs(offset_12)) +#define EOP_LDRB_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,(offset_12) >= 0,1,1,rn,rd,abs(offset_12)) #define EOP_STR_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,(offset_12) >= 0,0,0,rn,rd,abs(offset_12)) -#define EOP_LDR_IMM( rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,1,0,1,rn,rd,offset_12) -#define EOP_LDR_NEGIMM(rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,0,0,1,rn,rd,offset_12) +#define EOP_LDR_IMM( rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,(offset_12) >= 0,0,1,rn,rd,abs(offset_12)) #define EOP_LDR_SIMPLE(rd,rn) EOP_C_AM2_IMM(A_COND_AL,1,0,1,rn,rd,0) -#define EOP_STR_IMM( rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,1,0,0,rn,rd,offset_12) +#define EOP_STR_IMM( rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,(offset_12) >= 0,0,0,rn,rd,abs(offset_12)) #define EOP_STR_SIMPLE(rd,rn) EOP_C_AM2_IMM(A_COND_AL,1,0,0,rn,rd,0) #define EOP_LDR_REG_LSL(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,0,1,rn,rd,shift_imm,A_AM1_LSL,rm) #define EOP_LDRB_REG_LSL(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,1,1,rn,rd,shift_imm,A_AM1_LSL,rm); -#define EOP_LDRH_IMM2(cond,rd,rn,offset_8) EOP_C_AM3_IMM(cond,1,1,rn,rd,0,1,offset_8) +#define EOP_LDRH_IMM2(cond,rd,rn,offset_8) EOP_C_AM3_IMM(cond,(offset_8) >= 0,1,rn,rd,0,1,abs(offset_8)) #define EOP_LDRH_REG2(cond,rd,rn,rm) EOP_C_AM3_REG(cond,1,1,rn,rd,0,1,rm) -#define EOP_LDRH_IMM( rd,rn,offset_8) EOP_C_AM3_IMM(A_COND_AL,1,1,rn,rd,0,1,offset_8) +#define EOP_LDRH_IMM( 
rd,rn,offset_8) EOP_C_AM3_IMM(A_COND_AL,(offset_8) >= 0,1,rn,rd,0,1,abs(offset_8)) #define EOP_LDRH_SIMPLE(rd,rn) EOP_C_AM3_IMM(A_COND_AL,1,1,rn,rd,0,1,0) #define EOP_LDRH_REG( rd,rn,rm) EOP_C_AM3_REG(A_COND_AL,1,1,rn,rd,0,1,rm) -#define EOP_STRH_IMM( rd,rn,offset_8) EOP_C_AM3_IMM(A_COND_AL,1,0,rn,rd,0,1,offset_8) +#define EOP_STRH_IMM( rd,rn,offset_8) EOP_C_AM3_IMM(A_COND_AL,(offset_8) >= 0,0,rn,rd,0,1,abs(offset_8)) #define EOP_STRH_SIMPLE(rd,rn) EOP_C_AM3_IMM(A_COND_AL,1,0,rn,rd,0,1,0) #define EOP_STRH_REG( rd,rn,rm) EOP_C_AM3_REG(A_COND_AL,1,0,rn,rd,0,1,rm) @@ -285,11 +284,29 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int imm = ~imm; op = A_OP_MVN; } +#ifdef HAVE_ARMV7 + for (v = imm, ror2 = 0; v && !(v & 3); v >>= 2) + ror2--; + if (v >> 8) { + /* 2+ insns needed - prefer movw/movt */ + if (op == A_OP_MVN) + imm = ~imm; + EOP_MOVW(rd, imm); + if (imm & 0xffff0000) + EOP_MOVT(rd, imm); + return; + } +#endif break; - case A_OP_EOR: case A_OP_SUB: case A_OP_ADD: + // count bits in imm and swap ADD and SUB if more bits 1 than 0 + if (s == 0 && count_bits(imm) > 16) { + imm = -imm; + op ^= (A_OP_ADD^A_OP_SUB); + } + case A_OP_EOR: case A_OP_ORR: case A_OP_BIC: if (s == 0 && imm == 0 && rd == rn) @@ -412,6 +429,8 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_add_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_ADD_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) +#define emith_add_r_r_r_lsl_ptr(d, s1, s2, lslimm) \ + emith_add_r_r_r_lsl(d, s1, s2, lslimm) #define emith_addf_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_ADD_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSL,lslimm) @@ -483,7 +502,7 @@ static int emith_xbranch(int cond, void *target, int is_call) emith_add_r_r_r(d, d, s) #define emith_sub_r_r(d, s) \ - EOP_SUB_REG(A_COND_AL,0,d,d,s,A_AM1_LSL,0) + emith_sub_r_r_r(d, d, s) #define emith_adc_r_r(d, s) \ EOP_ADC_REG(A_COND_AL,0,d,d,s,A_AM1_LSL,0) @@ -529,6 +548,9 @@ static int emith_xbranch(int cond, void *target, int is_call) #define 
emith_move_r_imm(r, imm) \ emith_op_imm(A_COND_AL, 0, A_OP_MOV, r, imm) +#define emith_move_r_ptr_imm(r, imm) \ + emith_move_r_imm(r, (u32)(imm)) + #define emith_add_r_imm(r, imm) \ emith_op_imm(A_COND_AL, 0, A_OP_ADD, r, imm) @@ -536,7 +558,7 @@ static int emith_xbranch(int cond, void *target, int is_call) emith_op_imm(A_COND_AL, 0, A_OP_ADC, r, imm) #define emith_adcf_r_imm(r, imm) \ - emith_op_imm(A_COND_AL, 1, A_OP_ADC, r, (imm)) + emith_op_imm(A_COND_AL, 1, A_OP_ADC, r, imm) #define emith_sub_r_imm(r, imm) \ emith_op_imm(A_COND_AL, 0, A_OP_SUB, r, imm) @@ -610,13 +632,13 @@ static int emith_xbranch(int cond, void *target, int is_call) emith_op_imm2(A_COND_AL, 0, A_OP_SUB, d, s, imm) #define emith_subf_r_r_imm(d, s, imm) \ - emith_op_imm2(A_COND_AL, 1, A_OP_SUB, d, s, (imm)) + emith_op_imm2(A_COND_AL, 1, A_OP_SUB, d, s, imm) #define emith_or_r_r_imm(d, s, imm) \ - emith_op_imm2(A_COND_AL, 0, A_OP_ORR, d, s, (imm)) + emith_op_imm2(A_COND_AL, 0, A_OP_ORR, d, s, imm) #define emith_eor_r_r_imm(d, s, imm) \ - emith_op_imm2(A_COND_AL, 0, A_OP_EOR, d, s, (imm)) + emith_op_imm2(A_COND_AL, 0, A_OP_EOR, d, s, imm) #define emith_neg_r_r(d, s) \ EOP_RSB_IMM(d, s, 0, 0) @@ -758,7 +780,7 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_clear_msb_c(cond, d, s, count) { \ u32 t; \ if ((count) <= 8) { \ - t = (count) - 8; \ + t = 8 - (count); \ t = (0xff << t) & 0xff; \ EOP_C_DOP_IMM(cond,A_OP_BIC,0,s,d,8/2,t); \ } else if ((count) >= 24) { \ @@ -880,7 +902,9 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_sh2_rcall(a, tab, func, mask) { \ emith_lsr(mask, a, SH2_READ_SHIFT); \ EOP_ADD_REG_LSL(tab, tab, mask, 3); \ - EOP_LDMIA(tab, (1<is_slave; -if (sh2 != &sh2s[0] && sh2 != &sh2s[1]) printf("sh2 %p?\n",sh2); if (!trace[0]) { truncate("pico.trace", 0); trace[0] = fopen("pico.trace0", "wb"); @@ -199,7 +199,8 @@ if (sh2 != &sh2s[0] && sh2 != &sh2s[1]) printf("sh2 %p?\n",sh2); if (csh2[idx][0].pc != sh2->pc) { fwrite(sh2, 
offsetof(SH2, read8_map), 1, trace[idx]); fwrite(&sh2->pdb_io_csum, sizeof(sh2->pdb_io_csum), 1, trace[idx]); - memcpy(&csh2[idx][0], sh2, offsetof(SH2, icount)); + memcpy(&csh2[idx][0], sh2, offsetof(SH2, poll_cnt)+4); + csh2[idx][0].is_slave = idx; } } #elif (DRC_DEBUG & 512) @@ -234,9 +235,10 @@ if (sh2 != &sh2s[0] && sh2 != &sh2s[1]) printf("sh2 %p?\n",sh2); #elif (DRC_DEBUG & 1024) { int x = sh2->is_slave, i; - for (i = 0; i < ARRAY_SIZE(csh2[x]); i++) - memcpy(&csh2[x][i], &csh2[x][i+1], offsetof(SH2, icount)); - memcpy(&csh2[x][3], sh2, offsetof(SH2, icount)); + for (i = 0; i < ARRAY_SIZE(csh2[x])-1; i++) + memcpy(&csh2[x][i], &csh2[x][i+1], offsetof(SH2, poll_cnt)+4); + memcpy(&csh2[x][ARRAY_SIZE(csh2[x])-1], sh2, offsetof(SH2, poll_cnt)+4); + csh2[x][0].is_slave = x; } #endif } @@ -252,9 +254,9 @@ if (sh2 != &sh2s[0] && sh2 != &sh2s[1]) printf("sh2 %p?\n",sh2); // and can be discarded early // XXX: need to tune sizes static const int tcache_sizes[TCACHE_BUFFERS] = { - DRC_TCACHE_SIZE * 6 / 8, // ROM (rarely used), DRAM - DRC_TCACHE_SIZE / 8, // BIOS, data array in master sh2 - DRC_TCACHE_SIZE / 8, // ... slave + DRC_TCACHE_SIZE * 14 / 16, // ROM (rarely used), DRAM + DRC_TCACHE_SIZE / 16, // BIOS, data array in master sh2 + DRC_TCACHE_SIZE / 16, // ... 
slave }; static u8 *tcache_bases[TCACHE_BUFFERS]; @@ -287,6 +289,9 @@ struct block_entry { #if (DRC_DEBUG & 2) struct block_desc *block; #endif +#if (DRC_DEBUG & 32) + int entry_count; +#endif }; struct block_desc { @@ -698,6 +703,14 @@ static void add_to_hashlist(struct block_entry *be, int tcache_id) (*head)->prev = be; be->next = *head; *head = be; + +#if (DRC_DEBUG & 2) + if (be->next != NULL) { + printf(" %08x: entry hash collision with %08x\n", + be->pc, be->next->pc); + hash_collisions++; + } +#endif } static void rm_from_hashlist(struct block_entry *be, int tcache_id) @@ -727,6 +740,14 @@ static void add_to_hashlist_unresolved(struct block_link *bl, int tcache_id) u32 tcmask = hash_table_sizes[tcache_id] - 1; struct block_link **head = &HASH_FUNC(unresolved_links[tcache_id], bl->target_pc, tcmask); +#if DRC_DEBUG & 1 + struct block_link *current = *head; + while (current != NULL && current != bl) + current = current->next; + if (current == bl) + dbg(1, "add_to_hashlist_unresolved @%p: bl %p %p %08x already in?", head, bl, bl->target, bl->target_pc); +#endif + bl->target = NULL; // marker for not resolved bl->prev = NULL; if (*head) @@ -745,7 +766,7 @@ static void rm_from_hashlist_unresolved(struct block_link *bl, int tcache_id) while (current->prev != NULL) current = current->prev; if (current != *head) - dbg(1, "rm_from_hashlist unresolved @%p: bl %p %p %08x missing?", head, bl, bl->target, bl->target_pc); + dbg(1, "rm_from_hashlist_unresolved @%p: bl %p %p %08x missing?", head, bl, bl->target, bl->target_pc); #endif if (bl->prev != NULL) @@ -980,10 +1001,12 @@ static void *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_sla struct block_entry *be = NULL; int target_tcache_id; + // get the target block entry be = dr_get_entry(pc, is_slave, &target_tcache_id); if (target_tcache_id && target_tcache_id != tcache_id) return sh2_drc_dispatcher; + // get a block link if (blink_free[tcache_id] != NULL) { bl = blink_free[tcache_id]; 
blink_free[tcache_id] = bl->next; @@ -995,6 +1018,7 @@ static void *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_sla block_link_pool_counts[tcache_id] = cnt+1; } + // prepare link and add to ougoing list of owner bl->tcache_id = tcache_id; bl->target_pc = pc; bl->jump = tcache_ptr; @@ -1940,6 +1964,7 @@ static void rcache_invalidate(void) cache_regs[i].type = HR_FREE; cache_regs[i].gregs = 0; } + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { guest_regs[i].flags &= GRF_STATIC; if (!(guest_regs[i].flags & GRF_STATIC)) @@ -1953,7 +1978,8 @@ static void rcache_invalidate(void) cache_regs[guest_regs[i].sreg].gregs = 1 << i; guest_regs[i].vreg = guest_regs[i].sreg; } - }; + } + rcache_counter = 0; rcache_hint_soon = rcache_hint_late = 0; @@ -2005,6 +2031,7 @@ static int emit_get_rbase_and_offs(SH2 *sh2, u32 a, u32 *offs) u32 mask = 0; int poffs; int hr; + unsigned long la; poffs = dr_ctx_get_mem_ptr(a, &mask); if (poffs == -1) @@ -2014,15 +2041,16 @@ static int emit_get_rbase_and_offs(SH2 *sh2, u32 a, u32 *offs) if (mask < 0x1000) { // can't access data array or BIOS directly from ROM or SDRAM, // since code may run on both SH2s (tcache_id of translation block needed)) - emith_ctx_read(hr, poffs); + emith_ctx_read_ptr(hr, poffs); if (a & mask & ~omask) - emith_add_r_imm(hr, a & mask & ~omask); + emith_add_r_r_ptr_imm(hr, hr, a & mask & ~omask); + *offs = a & omask; } else { // known fixed host address - a = (a & mask) + *(u32 *)((char *)sh2 + poffs); - emith_move_r_imm(hr, (a & ~omask)); + la = (unsigned long)*(void **)((char *)sh2 + poffs) + (a & mask); + *offs = la & omask; + emith_move_r_ptr_imm(hr, la & ~omask); } - *offs = a & omask; return hr; } @@ -2392,8 +2420,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) void *branch_patch_ptr[MAX_LOCAL_BRANCHES]; u32 branch_patch_pc[MAX_LOCAL_BRANCHES]; int branch_patch_count = 0; - u32 literal_addr[MAX_LITERALS]; - int literal_addr_count = 0; u8 op_flags[BLOCK_INSN_LIMIT]; struct { u32 
test_irq:1; @@ -2473,7 +2499,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { u32 delay_dep_fw = 0, delay_dep_bk = 0; int tmp3, tmp4; - u32 sr; + int sr; opd = &ops[i]; op = FETCH_OP(pc); @@ -2487,7 +2513,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) pc, op, sh2dasm_buff); #endif - if ((op_flags[i] & OF_BTARGET) || pc == base_pc) + if (op_flags[i] & OF_BTARGET) { if (pc != base_pc) { @@ -2517,6 +2543,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) else { dbg(1, "too many entryp for block #%d,%d pc=%08x", tcache_id, blkid_main, pc); + break; } } else { entry = block->entryp; @@ -2537,10 +2564,10 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #if (DRC_DEBUG & 0x10) rcache_get_reg_arg(0, SHR_PC, NULL); - tmp = emit_memhandler_read(2); + tmp = emit_memhandler_read(1); tmp2 = rcache_get_tmp(); tmp3 = rcache_get_tmp(); - emith_move_r_imm(tmp2, FETCH32(pc)); + emith_move_r_imm(tmp2, (s16)FETCH_OP(pc)); emith_move_r_imm(tmp3, 0); emith_cmp_r_r(tmp, tmp2); EMITH_SJMP_START(DCOND_EQ); @@ -2556,9 +2583,20 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_cmp_r_imm(sr, 0); emith_jump_cond(DCOND_LE, sh2_drc_exit); +#if (DRC_DEBUG & 32) + // block hit counter + tmp = rcache_get_tmp_arg(0); + tmp2 = rcache_get_tmp_arg(1); + emith_move_r_ptr_imm(tmp, (uptr)entry); + emith_read_r_r_offs(tmp2, tmp, offsetof(struct block_entry, entry_count)); + emith_add_r_imm(tmp2, 1); + emith_write_r_r_offs(tmp2, tmp, offsetof(struct block_entry, entry_count)); + rcache_free_tmp(tmp); + rcache_free_tmp(tmp2); +#endif + #if (DRC_DEBUG & (8|256|512|1024)) sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - FLUSH_CYCLES(sr); rcache_clean(); tmp = rcache_used_hreg_mask(); emith_save_caller_regs(tmp); @@ -2566,7 +2604,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_get_reg_arg(2, SHR_SR, NULL); tmp2 = rcache_get_tmp_arg(0); tmp3 = rcache_get_tmp_arg(1); - emith_move_r_imm(tmp2, 
(u32)tcache_ptr); + emith_move_r_ptr_imm(tmp2, tcache_ptr); emith_move_r_r_ptr(tmp3,CONTEXT_REG); emith_call(sh2_drc_log_entry); emith_restore_caller_regs(tmp); @@ -2776,7 +2814,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if ((opd->imm && opd->imm >= base_pc && opd->imm < end_literals) || dr_is_rom(opd->imm)) { - ADD_TO_ARRAY(literal_addr, literal_addr_count, opd->imm,); if (opd->size == 2) u = FETCH32(opd->imm); else @@ -2862,8 +2899,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x06: // MOV.L Rm,@(R0,Rn) 0000nnnnmmmm0110 emit_indirect_indexed_write(sh2, GET_Rm(), SHR_R0, GET_Rn(), op & 3); goto end_op; - case 0x07: - // MUL.L Rm,Rn 0000nnnnmmmm0111 + case 0x07: // MUL.L Rm,Rn 0000nnnnmmmm0111 tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); tmp3 = rcache_get_reg(SHR_MACL, RC_GR_WRITE, NULL); @@ -2941,8 +2977,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto default_; ///////////////////////////////////////////// - case 0x01: - // MOV.L Rm,@(disp,Rn) 0001nnnnmmmmdddd + case 0x01: // MOV.L Rm,@(disp,Rn) 0001nnnnmmmmdddd emit_memhandler_write_rr(sh2, GET_Rm(), GET_Rn(), (op & 0x0f) * 4, 2); goto end_op; @@ -3346,19 +3381,16 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x09: switch (GET_Fx()) { - case 0: - // SHLL2 Rn 0100nnnn00001000 - // SHLR2 Rn 0100nnnn00001001 + case 0: // SHLL2 Rn 0100nnnn00001000 + // SHLR2 Rn 0100nnnn00001001 tmp = 2; break; - case 1: - // SHLL8 Rn 0100nnnn00011000 - // SHLR8 Rn 0100nnnn00011001 + case 1: // SHLL8 Rn 0100nnnn00011000 + // SHLR8 Rn 0100nnnn00011001 tmp = 8; break; - case 2: - // SHLL16 Rn 0100nnnn00101000 - // SHLR16 Rn 0100nnnn00101001 + case 2: // SHLL16 Rn 0100nnnn00101000 + // SHLR16 Rn 0100nnnn00101001 tmp = 16; break; default: @@ -3432,8 +3464,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } else emit_move_r_r(tmp2, GET_Rn()); goto end_op; - case 
0x0f: - // MAC.W @Rm+,@Rn+ 0100nnnnmmmm1111 + case 0x0f: // MAC.W @Rm+,@Rn+ 0100nnnnmmmm1111 emit_indirect_read_double(sh2, &tmp, &tmp2, GET_Rn(), GET_Rm(), 1); sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); tmp3 = rcache_get_reg(SHR_MACL, RC_GR_RMW, NULL); @@ -3446,8 +3477,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto default_; ///////////////////////////////////////////// - case 0x05: - // MOV.L @(disp,Rm),Rn 0101nnnnmmmmdddd + case 0x05: // MOV.L @(disp,Rm),Rn 0101nnnnmmmmdddd emit_memhandler_read_rr(sh2, GET_Rn(), GET_Rm(), (op & 0x0f) * 4, 2); goto end_op; @@ -3519,8 +3549,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto default_; ///////////////////////////////////////////// - case 0x07: - // ADD #imm,Rn 0111nnnniiiiiiii + case 0x07: // ADD #imm,Rn 0111nnnniiiiiiii tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); if (op & 0x80) { // adding negative emith_sub_r_r_imm(tmp, tmp2, -op & 0xff); @@ -3621,8 +3650,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto default_; ///////////////////////////////////////////// - case 0x0e: - // MOV #imm,Rn 1110nnnniiiiiiii + case 0x0e: // MOV #imm,Rn 1110nnnniiiiiiii emit_move_r_imm32(GET_Rn(), (s8)op); goto end_op; @@ -3886,9 +3914,7 @@ static void sh2_generate_utils(void) #if BRANCH_CACHE // check if PC is in branch target cache emith_and_r_r_imm(arg1, arg0, (ARRAY_SIZE(sh2s->branch_cache)-1)*4); - // TODO implement emith_add_r_r_r_lsl_ptr, saves one insn on 32bit ARM - emith_lsl(arg1, arg1, sizeof(void *) == 8 ? 2 : 1); - emith_add_r_r_ptr(arg1, CONTEXT_REG); + emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg1, sizeof(void *) == 8 ? 
2 : 1); emith_read_r_r_offs(arg2, arg1, offsetof(SH2, branch_cache)); emith_cmp_r_r(arg2, arg0); EMITH_SJMP_START(DCOND_NE); @@ -3905,8 +3931,7 @@ static void sh2_generate_utils(void) EMITH_SJMP_START(DCOND_EQ); emith_ctx_read_c(DCOND_NE, arg2, SHR_PC * 4); emith_and_r_r_imm(arg1, arg2, (ARRAY_SIZE(sh2s->branch_cache)-1)*4); - emith_lsl(arg1, arg1, sizeof(void *) == 8 ? 2 : 1); - emith_add_r_r_ptr(arg1, CONTEXT_REG); + emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg1, sizeof(void *) == 8 ? 2 : 1); emith_write_r_r_offs_c(DCOND_NE, arg2, arg1, offsetof(SH2, branch_cache)); emith_write_r_r_offs_ptr_c(DCOND_NE, RET_REG, arg1, offsetof(SH2, branch_cache) + sizeof(void *)); EMITH_SJMP_END(DCOND_EQ); @@ -4174,7 +4199,8 @@ int sh2_execute_drc(SH2 *sh2c, int cycles) static void block_stats(void) { #if (DRC_DEBUG & 2) - int c, b, i, total = 0; + int c, b, i; + long total = 0; printf("block stats:\n"); for (b = 0; b < ARRAY_SIZE(block_tables); b++) { @@ -4185,8 +4211,9 @@ static void block_stats(void) if (block_tables[b][i].addr != 0) total += block_tables[b][i].refcount; } + printf("total: %ld\n",total); - for (c = 0; c < 10; c++) { + for (c = 0; c < 20; c++) { struct block_desc *blk, *maxb = NULL; int max = 0; for (b = 0; b < ARRAY_SIZE(block_tables); b++) { @@ -4221,6 +4248,63 @@ static void block_stats(void) #endif } +void entry_stats(void) +{ +#if (DRC_DEBUG & 32) + int c, b, i, j; + long total = 0; + + printf("block entry stats:\n"); + for (b = 0; b < ARRAY_SIZE(block_tables); b++) { + for (i = 0; i < block_counts[b]; i++) + for (j = 0; j < block_tables[b][i].entry_count; j++) + total += block_tables[b][i].entryp[j].entry_count; + for (i = block_limit[b]; i < block_max_counts[b]; i++) + for (j = 0; j < block_tables[b][i].entry_count; j++) + total += block_tables[b][i].entryp[j].entry_count; + } + printf("total: %ld\n",total); + + for (c = 0; c < 20; c++) { + struct block_desc *blk; + struct block_entry *maxb = NULL; + int max = 0; + for (b = 0; b < 
ARRAY_SIZE(block_tables); b++) { + for (i = 0; i < block_counts[b]; i++) { + blk = &block_tables[b][i]; + for (j = 0; j < blk->entry_count; j++) + if (blk->entryp[j].entry_count > max) { + max = blk->entryp[j].entry_count; + maxb = &blk->entryp[j]; + } + } + for (i = block_limit[b]; i < block_max_counts[b]; i++) { + blk = &block_tables[b][i]; + for (j = 0; j < blk->entry_count; j++) + if (blk->entryp[j].entry_count > max) { + max = blk->entryp[j].entry_count; + maxb = &blk->entryp[j]; + } + } + } + if (maxb == NULL) + break; + printf("%08x %p %9d %2.3f%%\n", maxb->pc, maxb->tcache_ptr, maxb->entry_count, + (double)100 * maxb->entry_count / total); + maxb->entry_count = 0; + } + + for (b = 0; b < ARRAY_SIZE(block_tables); b++) { + for (i = 0; i < block_counts[b]; i++) + for (j = 0; j < block_tables[b][i].entry_count; j++) + block_tables[b][i].entryp[j].entry_count = 0; + for (i = block_limit[b]; i < block_max_counts[b]; i++) + for (j = 0; j < block_tables[b][i].entry_count; j++) + block_tables[b][i].entryp[j].entry_count = 0; + } +#endif +} + static void backtrace(void) { #if (DRC_DEBUG & 1024) @@ -4279,6 +4363,7 @@ void sh2_drc_flush_all(void) backtrace(); state_dump(); block_stats(); + entry_stats(); flush_tcache(0); flush_tcache(1); flush_tcache(2); @@ -4364,6 +4449,7 @@ int sh2_drc_init(SH2 *sh2) hash_collisions = 0; #endif } + memset(sh2->branch_cache, -1, sizeof(sh2->branch_cache)); return 0; diff --git a/cpu/sh2/mame/sh2pico.c b/cpu/sh2/mame/sh2pico.c index 636ebc6f4..f9d30d778 100644 --- a/cpu/sh2/mame/sh2pico.c +++ b/cpu/sh2/mame/sh2pico.c @@ -214,7 +214,7 @@ int sh2_execute_interpreter(SH2 *sh2, int cycles) if (sh2->pc < *base_pc || sh2->pc >= *end_pc) { *base_pc = sh2->pc; scan_block(*base_pc, sh2->is_slave, - op_flags, end_pc, NULL); + op_flags, end_pc, NULL, NULL); } if ((op_flags[(sh2->pc - *base_pc) / 2] & OF_BTARGET) || sh2->pc == *base_pc diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index e53bbf057..5a0661eaf 100644 --- a/cpu/sh2/sh2.h +++ 
b/cpu/sh2/sh2.h @@ -81,9 +81,9 @@ typedef struct SH2_ #define CYCLE_MULT_SHIFT 10 #define C_M68K_TO_SH2(xsh2, c) \ - ((int)((long long)(c) * (xsh2)->mult_m68k_to_sh2) >> CYCLE_MULT_SHIFT) + (int)(((unsigned long long)(c) * (xsh2)->mult_m68k_to_sh2) >> CYCLE_MULT_SHIFT) #define C_SH2_TO_M68K(xsh2, c) \ - ((int)((long long)(c+3) * (xsh2)->mult_sh2_to_m68k) >> CYCLE_MULT_SHIFT) + (int)(((unsigned long long)(c+3U) * (xsh2)->mult_sh2_to_m68k) >> CYCLE_MULT_SHIFT) int sh2_init(SH2 *sh2, int is_slave, SH2 *other_sh2); void sh2_finish(SH2 *sh2); diff --git a/pico/32x/32x.c b/pico/32x/32x.c index a15cb112c..4e8377eb3 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -30,7 +30,7 @@ static int REGPARM(2) sh2_irq_cb(SH2 *sh2, int level) } // MUST specify active_sh2 when called from sh2 memhandlers -void p32x_update_irls(SH2 *active_sh2, int m68k_cycles) +void p32x_update_irls(SH2 *active_sh2, unsigned int m68k_cycles) { int irqs, mlvl = 0, slvl = 0; int mrun, srun; @@ -50,18 +50,18 @@ void p32x_update_irls(SH2 *active_sh2, int m68k_cycles) slvl++; slvl *= 2; - mrun = sh2_irl_irq(&msh2, mlvl, active_sh2 == &msh2); + mrun = sh2_irl_irq(&msh2, mlvl, msh2.state & SH2_STATE_RUN); if (mrun) { p32x_sh2_poll_event(&msh2, SH2_IDLE_STATES, m68k_cycles); - if (active_sh2 == &msh2) - sh2_end_run(active_sh2, 1); + if (msh2.state & SH2_STATE_RUN) + sh2_end_run(&msh2, 1); } - srun = sh2_irl_irq(&ssh2, slvl, active_sh2 == &ssh2); + srun = sh2_irl_irq(&ssh2, slvl, ssh2.state & SH2_STATE_RUN); if (srun) { p32x_sh2_poll_event(&ssh2, SH2_IDLE_STATES, m68k_cycles); - if (active_sh2 == &ssh2) - sh2_end_run(active_sh2, 1); + if (ssh2.state & SH2_STATE_RUN) + sh2_end_run(&ssh2, 1); } elprintf(EL_32X, "update_irls: m %d/%d, s %d/%d", mlvl, mrun, slvl, srun); @@ -70,7 +70,7 @@ void p32x_update_irls(SH2 *active_sh2, int m68k_cycles) // the mask register is inconsistent, CMD is supposed to be a mask, // while others are actually irq trigger enables? // TODO: test on hw.. 
-void p32x_trigger_irq(SH2 *sh2, int m68k_cycles, unsigned int mask) +void p32x_trigger_irq(SH2 *sh2, unsigned int m68k_cycles, unsigned int mask) { Pico32x.sh2irqs |= mask & P32XI_VRES; Pico32x.sh2irqi[0] |= mask & (Pico32x.sh2irq_mask[0] << 3); @@ -79,7 +79,7 @@ void p32x_trigger_irq(SH2 *sh2, int m68k_cycles, unsigned int mask) p32x_update_irls(sh2, m68k_cycles); } -void p32x_update_cmd_irq(SH2 *sh2, int m68k_cycles) +void p32x_update_cmd_irq(SH2 *sh2, unsigned int m68k_cycles) { if ((Pico32x.sh2irq_mask[0] & 2) && (Pico32x.regs[2 / 2] & 1)) Pico32x.sh2irqi[0] |= P32XI_CMD; @@ -207,8 +207,8 @@ void PicoReset32x(void) { if (PicoIn.AHW & PAHW_32X) { p32x_trigger_irq(NULL, SekCyclesDone(), P32XI_VRES); - p32x_sh2_poll_event(&msh2, SH2_IDLE_STATES, 0); - p32x_sh2_poll_event(&ssh2, SH2_IDLE_STATES, 0); + p32x_sh2_poll_event(&msh2, SH2_IDLE_STATES, SekCyclesDone()); + p32x_sh2_poll_event(&ssh2, SH2_IDLE_STATES, SekCyclesDone()); p32x_pwm_ctl_changed(); p32x_timers_recalc(); } @@ -258,7 +258,7 @@ static void p32x_start_blank(void) p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, SekCyclesDone()); } -void p32x_schedule_hint(SH2 *sh2, int m68k_cycles) +void p32x_schedule_hint(SH2 *sh2, unsigned int m68k_cycles) { // rather rough, 32x hint is useless in practice int after; @@ -370,9 +370,9 @@ static void p32x_run_events(unsigned int until) oldest, event_time_next); } -static void run_sh2(SH2 *sh2, int m68k_cycles) +static void run_sh2(SH2 *sh2, unsigned int m68k_cycles) { - int cycles, done; + unsigned int cycles, done; pevt_log_sh2_o(sh2, EVT_RUN_START); sh2->state |= SH2_STATE_RUN; diff --git a/pico/32x/draw_arm.S b/pico/32x/draw_arm.S index e91f9893d..c59fa8f5a 100644 --- a/pico/32x/draw_arm.S +++ b/pico/32x/draw_arm.S @@ -6,7 +6,7 @@ @* See COPYING file in the top-level directory. 
@* -#include "pico/pico_int_o32.h" +#include "pico/pico_int_offs.h" .extern Pico32x .extern Pico @@ -74,7 +74,7 @@ Pico32xNativePal: ldr lr,=Pico ldr r10,=Pico32x ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB] - ldr r10,[r10, #0x40] @ Pico32x.vdp_regs[0] + ldrh r10,[r10, #0x40] @ Pico32x.vdp_regs[0] add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd and r4, r2, #0xff @@ -118,6 +118,8 @@ Pico32xNativePal: mov r7, r7, lsl #1 ldreqh r12,[r9, r7] streqh r12,[r0], #2 @ *dst++ = palmd[*pmd] +.else + addeq r0, r0, #2 .endif beq 2b @ loop_inner @@ -182,8 +184,8 @@ Pico32xNativePal: ldrneb r8, [r5, #2]! @ r7,r8 - pixel 0,1 index subs r6, r6, #1 blt 0b @ loop_outer -@ cmp r7, r8 @ is this really improving things? -@ beq 5f @ check_fill @ +8 + cmp r7, r8 @ is this really improving things? + beq 5f @ check_fill @ +8 3: @ no_fill: mov r12,r7, lsl #1 @@ -242,7 +244,7 @@ Pico32xNativePal: beq 6b 7: @ count_done - sub r5, r5, #4 @ undo readahead + sub r5, r5, #4 @ undo readahead @ fix alignment and check type sub r8, r5, lr @@ -268,14 +270,14 @@ Pico32xNativePal: b 2b @ loop_inner 9: @ bg_mode: - ldrb r12,[r11],#1 @ MD pixel + ldrb r12,[r11],#1 @ MD pixel 0,1 ldrb lr, [r11],#1 - cmp r3, lr, lsl #26 @ MD has bg pixel? + cmp r3, r12,lsl #26 @ MD pixel 0 has bg? .if \do_md mov r12,r12,lsl #1 ldrneh r12,[r9, r12] @ t = palmd[*pmd] moveq r12,r7 - cmp r3, lr, lsl #26 + cmp r3, lr, lsl #26 @ MD pixel 1 has bg? mov lr, lr, lsl #1 ldrneh lr, [r9, lr] moveq lr, r7 @@ -283,7 +285,7 @@ Pico32xNativePal: strh lr, [r0], #2 .else streqh r7, [r0] - cmp r3, lr, lsl #26 + cmp r3, lr, lsl #26 @ MD pixel 1 has bg? 
streqh r7, [r0, #2] add r0, r0, #4 .endif diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 30d0e4d54..6a3b22229 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -398,9 +398,6 @@ static void p32x_reg_write8(u32 a, u32 d) p32x_sh2_poll_event(&sh2s[1], SH2_STATE_CPOLL, cycles); comreg = 1 << (a & 0x0f) / 2; Pico32x.comm_dirty |= comreg; - - if (cycles - (int)msh2.m68krcycles_done > 120) - p32x_sync_sh2s(cycles); return; } } @@ -453,6 +450,9 @@ static void p32x_reg_write16(u32 a, u32 d) int cycles = SekCyclesDone(); int comreg; + if (r[a / 2] == d) + return; + p32x_sync_sh2s(cycles); r[a / 2] = d; @@ -685,7 +685,7 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) case 0x3f: return; pwm_write: - p32x_pwm_write16(a & ~1, d, sh2, 0); + p32x_pwm_write16(a & ~1, d, sh2, sh2_cycles_done_m68k(sh2)); return; } diff --git a/pico/32x/memory_arm.S b/pico/32x/memory_arm.S index 90c86ddf6..1082c7b7f 100644 --- a/pico/32x/memory_arm.S +++ b/pico/32x/memory_arm.S @@ -6,7 +6,7 @@ * See COPYING file in the top-level directory. */ -#include "../pico_int_o32.h" +#include "../pico_int_offs.h" @ 32X bank sizes... 
TODO this should somehow come from an include file .equ SH2_ROM_SHIFT, 10 @ 0x003fffff @@ -46,92 +46,92 @@ sh2_read8_rom: ldr ip, [r1, #OFS_SH2_p_rom] eor r0, r0, #1 - lsl r0, #SH2_ROM_SHIFT + mov r0, r0, lsl #SH2_ROM_SHIFT ldrb r0, [ip, r0, lsr #SH2_ROM_SHIFT] bx lr sh2_read8_sdram: ldr ip, [r1, #OFS_SH2_p_sdram] eor r0, r0, #1 - lsl r0, #SH2_RAM_SHIFT + mov r0, r0, lsl #SH2_RAM_SHIFT ldrb r0, [ip, r0, lsr #SH2_RAM_SHIFT] bx lr sh2_read8_da: ldr ip, [r1, #OFS_SH2_p_da] eor r0, r0, #1 - lsl r0, #SH2_DA_SHIFT + mov r0, r0, lsl #SH2_DA_SHIFT ldrb r0, [ip, r0, lsr #SH2_DA_SHIFT] bx lr sh2_read8_dram: ldr ip, [r1, #OFS_SH2_p_dram] eor r0, r0, #1 - lsl r0, #SH2_DRAM_SHIFT + mov r0, r0, lsl #SH2_DRAM_SHIFT ldrb r0, [ip, r0, lsr #SH2_DRAM_SHIFT] bx lr sh2_read16_rom: ldr ip, [r1, #OFS_SH2_p_rom] - lsl r0, #SH2_ROM_SHIFT - lsr r0, #SH2_ROM_SHIFT + mov r0, r0, lsl #SH2_ROM_SHIFT + mov r0, r0, lsr #SH2_ROM_SHIFT ldrh r0, [ip, r0] bx lr sh2_read16_sdram: ldr ip, [r1, #OFS_SH2_p_sdram] - lsl r0, #SH2_RAM_SHIFT - lsr r0, #SH2_RAM_SHIFT + mov r0, r0, lsl #SH2_RAM_SHIFT + mov r0, r0, lsr #SH2_RAM_SHIFT ldrh r0, [ip, r0] bx lr sh2_read16_da: ldr ip, [r1, #OFS_SH2_p_da] - lsl r0, #SH2_DA_SHIFT - lsr r0, #SH2_DA_SHIFT + mov r0, r0, lsl #SH2_DA_SHIFT + mov r0, r0, lsr #SH2_DA_SHIFT ldrh r0, [ip, r0] bx lr sh2_read16_dram: ldr ip, [r1, #OFS_SH2_p_dram] - lsl r0, #SH2_DRAM_SHIFT - lsr r0, #SH2_DRAM_SHIFT + mov r0, r0, lsl #SH2_DRAM_SHIFT + mov r0, r0, lsr #SH2_DRAM_SHIFT ldrh r0, [ip, r0] bx lr sh2_read32_rom: ldr ip, [r1, #OFS_SH2_p_rom] - lsl r0, #SH2_ROM_SHIFT + mov r0, r0, lsl #SH2_ROM_SHIFT ldr r0, [ip, r0, lsr #SH2_ROM_SHIFT] - ror r0, r0, #16 + mov r0, r0, ror #16 bx lr sh2_read32_sdram: ldr ip, [r1, #OFS_SH2_p_sdram] - lsl r0, #SH2_RAM_SHIFT + mov r0, r0, lsl #SH2_RAM_SHIFT ldr r0, [ip, r0, lsr #SH2_RAM_SHIFT] - ror r0, r0, #16 + mov r0, r0, ror #16 bx lr sh2_read32_da: ldr ip, [r1, #OFS_SH2_p_da] - lsl r0, #SH2_DA_SHIFT + mov r0, r0, lsl #SH2_DA_SHIFT ldr r0, [ip, r0, lsr 
#SH2_DA_SHIFT] - ror r0, r0, #16 + mov r0, r0, ror #16 bx lr sh2_read32_dram: ldr ip, [r1, #OFS_SH2_p_dram] - lsl r0, #SH2_DRAM_SHIFT + mov r0, r0, lsl #SH2_DRAM_SHIFT ldr r0, [ip, r0, lsr #SH2_DRAM_SHIFT] - ror r0, r0, #16 + mov r0, r0, ror #16 bx lr sh2_write8_sdram: @ preserve r0 and r2 for tail call ldr ip, [r2, #OFS_SH2_p_sdram] eor r3, r0, #1 - lsl r3, #SH2_RAM_SHIFT + mov r3, r3, lsl #SH2_RAM_SHIFT strb r1, [ip, r3, lsr #SH2_RAM_SHIFT] #ifdef DRC_SH2 ldr ip, [r2, #OFS_SH2_p_drcblk_ram] @@ -148,7 +148,7 @@ sh2_write8_da: @ preserve r0 and r2 for tail call ldr ip, [r2, #OFS_SH2_p_da] eor r3, r0, #1 - lsl r3, #SH2_DA_SHIFT + mov r3, r3, lsl #SH2_DA_SHIFT strb r1, [ip, r3, lsr #SH2_DA_SHIFT] #ifdef DRC_SH2 ldr ip, [r2, #OFS_SH2_p_drcblk_da] @@ -165,15 +165,15 @@ sh2_write8_dram: tst r1, #0xff ldrne ip, [r2, #OFS_SH2_p_dram] eorne r3, r0, #1 - lslne r3, #SH2_DRAM_SHIFT + movne r3, r3, lsl #SH2_DRAM_SHIFT strneb r1, [ip, r3, lsr #SH2_DRAM_SHIFT] bx lr sh2_write16_sdram: @ preserve r0 and r2 for tail call ldr ip, [r2, #OFS_SH2_p_sdram] - lsl r3, r0, #SH2_RAM_SHIFT - lsr r3, r3, #SH2_RAM_SHIFT + mov r3, r0, lsl #SH2_RAM_SHIFT + mov r3, r3, lsr #SH2_RAM_SHIFT strh r1, [ip, r3] #ifdef DRC_SH2 ldr ip, [r2, #OFS_SH2_p_drcblk_ram] @@ -188,8 +188,8 @@ sh2_write16_sdram: sh2_write16_da: @ preserve r0 and r2 for tail call ldr ip, [r2, #OFS_SH2_p_da] - lsl r3, r0, #SH2_DA_SHIFT - lsr r3, r3, #SH2_DA_SHIFT + mov r3, r0, lsl #SH2_DA_SHIFT + mov r3, r3, lsr #SH2_DA_SHIFT strh r1, [ip, r3] #ifdef DRC_SH2 ldr ip, [r2, #OFS_SH2_p_drcblk_da] @@ -204,23 +204,23 @@ sh2_write16_da: sh2_write16_dram: ldr ip, [r2, #OFS_SH2_p_dram] tst r0, #SH2_DRAM_OW - lsl r3, r0, #SH2_DRAM_SHIFT - lsr r3, r3, #SH2_DRAM_SHIFT + mov r3, r0, lsl #SH2_DRAM_SHIFT + mov r3, r3, lsr #SH2_DRAM_SHIFT streqh r1, [ip, r3] bxeq lr add ip, ip, r3 tst r1, #0xff strneb r1, [ip, #0] tst r1, #0xff00 - lsrne r1, r1, #8 + movne r1, r1, lsr #8 strneb r1, [ip, #1] bx lr sh2_write32_sdram: @ preserve r0 and r2 for tail 
call ldr ip, [r2, #OFS_SH2_p_sdram] - ror r1, r1, #16 - lsl r3, r0, #SH2_RAM_SHIFT + mov r1, r1, ror #16 + mov r3, r0, lsl #SH2_RAM_SHIFT str r1, [ip, r3, lsr #SH2_RAM_SHIFT] #ifdef DRC_SH2 ldr ip, [r2, #OFS_SH2_p_drcblk_ram] @@ -242,8 +242,8 @@ sh2_write32_sdram: sh2_write32_da: @ preserve r0 and r2 for tail call ldr ip, [r2, #OFS_SH2_p_da] - ror r1, r1, #16 - lsl r3, r0, #SH2_DA_SHIFT + mov r1, r1, ror #16 + mov r3, r0, lsl #SH2_DA_SHIFT str r1, [ip, r3, lsr #SH2_DA_SHIFT] #ifdef DRC_SH2 ldr ip, [r2, #OFS_SH2_p_drcblk_da] @@ -265,13 +265,13 @@ sh2_write32_da: sh2_write32_dram: ldr ip, [r2, #OFS_SH2_p_dram] tst r0, #SH2_DRAM_OW - lsl r3, r0, #SH2_DRAM_SHIFT - roreq r1, r1, #16 + mov r3, r0, lsl #SH2_DRAM_SHIFT + moveq r1, r1, ror #16 streq r1, [ip, r3, lsr #SH2_DRAM_SHIFT] bxeq lr #if 1 ldr r0, [ip, r3, lsr #SH2_DRAM_SHIFT] - ror r1, r1, #16 + mov r1, r1, ror #16 mov r2, #0 tst r1, #0x00ff0000 orrne r2, r2, #0x00ff0000 diff --git a/pico/cd/gfx_dma.c b/pico/cd/gfx_dma.c index 7dfe4bc9c..ff93a2dc0 100644 --- a/pico/cd/gfx_dma.c +++ b/pico/cd/gfx_dma.c @@ -10,10 +10,6 @@ #include "cell_map.c" -#ifndef UTYPES_DEFINED -typedef unsigned short u16; -#endif - // check: Heart of the alien, jaguar xj 220 PICO_INTERNAL void DmaSlowCell(unsigned int source, unsigned int a, int len, unsigned char inc) { diff --git a/pico/cd/memory_arm.S b/pico/cd/memory_arm.S index 335f36247..04920b625 100644 --- a/pico/cd/memory_arm.S +++ b/pico/cd/memory_arm.S @@ -6,7 +6,7 @@ @* See COPYING file in the top-level directory. @* -#include "../pico_int_o32.h" +#include "../pico_int_offs.h" .equiv PCM_STEP_SHIFT, 11 diff --git a/pico/draw2_arm.S b/pico/draw2_arm.S index 6b110b320..6b094495a 100644 --- a/pico/draw2_arm.S +++ b/pico/draw2_arm.S @@ -8,7 +8,7 @@ * this is highly specialized, be careful if changing related C code! 
*/ -#include "pico_int_o32.h" +#include "pico_int_offs.h" @ define these constants in your include file: @ .equiv START_ROW, 1 diff --git a/pico/draw_arm.S b/pico/draw_arm.S index 3bc270331..2efc804c4 100644 --- a/pico/draw_arm.S +++ b/pico/draw_arm.S @@ -8,7 +8,7 @@ * this is highly specialized, be careful if changing related C code! */ -#include "pico_int_o32.h" +#include "pico_int_offs.h" .extern DrawStripInterlace diff --git a/pico/memory.h b/pico/memory.h index c878a40f8..d55267ba1 100644 --- a/pico/memory.h +++ b/pico/memory.h @@ -2,13 +2,6 @@ #include "pico_port.h" -#ifndef UTYPES_DEFINED -typedef unsigned char u8; -typedef unsigned short u16; -typedef unsigned int u32; -#endif -typedef uintptr_t uptr; // unsigned pointer-sized int - #define M68K_MEM_SHIFT 16 // minimum size we can map #define M68K_BANK_SIZE (1 << M68K_MEM_SHIFT) diff --git a/pico/memory_amips.S b/pico/memory_amips.S index 7ae259220..7932c2c90 100644 --- a/pico/memory_amips.S +++ b/pico/memory_amips.S @@ -8,7 +8,7 @@ # OUT OF DATE -#include "pico_int_o32.h" +#include "pico_int_offs.h" .set noreorder .set noat diff --git a/pico/memory_arm.S b/pico/memory_arm.S index 117cea0b4..07d6a128c 100644 --- a/pico/memory_arm.S +++ b/pico/memory_arm.S @@ -6,7 +6,7 @@ * See COPYING file in the top-level directory. 
*/ -#include "pico_int_o32.h" +#include "pico_int_offs.h" .equ SRR_MAPPED, (1 << 0) .equ SRR_READONLY, (1 << 1) diff --git a/pico/pico_int.h b/pico/pico_int.h index 133382420..831bfc725 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -33,6 +33,14 @@ extern "C" { #endif +typedef unsigned char u8; +typedef signed char s8; +typedef unsigned short u16; +typedef signed short s16; +typedef unsigned int u32; +typedef signed int s32; +typedef uintptr_t uptr; // unsigned pointer-sized int + // ----------------------- 68000 CPU ----------------------- #ifdef EMU_C68K #include "../cpu/cyclone/Cyclone.h" @@ -427,7 +435,7 @@ struct PicoSound short psg_line; }; -// run tools/mkoffsets pico/pico_int_o32.h if you change these +// run tools/mkoffsets pico/pico_int_offs.h if you change these // careful with savestate compat struct Pico { @@ -905,13 +913,13 @@ void PicoFrame32x(void); void Pico32xStateLoaded(int is_early); void p32x_sync_sh2s(unsigned int m68k_target); void p32x_sync_other_sh2(SH2 *sh2, unsigned int m68k_target); -void p32x_update_irls(SH2 *active_sh2, int m68k_cycles); -void p32x_trigger_irq(SH2 *sh2, int m68k_cycles, unsigned int mask); -void p32x_update_cmd_irq(SH2 *sh2, int m68k_cycles); +void p32x_update_irls(SH2 *active_sh2, unsigned int m68k_cycles); +void p32x_trigger_irq(SH2 *sh2, unsigned int m68k_cycles, unsigned int mask); +void p32x_update_cmd_irq(SH2 *sh2, unsigned int m68k_cycles); void p32x_reset_sh2s(void); void p32x_event_schedule(unsigned int now, enum p32x_event event, int after); void p32x_event_schedule_sh2(SH2 *sh2, enum p32x_event event, int after); -void p32x_schedule_hint(SH2 *sh2, int m68k_cycles); +void p32x_schedule_hint(SH2 *sh2, unsigned int m68k_cycles); // 32x/memory.c extern struct Pico32xMem *Pico32xMem; diff --git a/platform/common/common.mak b/platform/common/common.mak index b1ccbb475..89b72bb81 100644 --- a/platform/common/common.mak +++ b/platform/common/common.mak @@ -9,6 +9,7 @@ asm_render = 0 asm_ym2612 = 0 asm_misc = 
0 asm_cdmemory = 0 +asm_32xdraw = 0 asm_mix = 0 endif diff --git a/platform/common/memcpy.c b/platform/common/memcpy.c index b99de4aec..1cd741759 100644 --- a/platform/common/memcpy.c +++ b/platform/common/memcpy.c @@ -9,7 +9,7 @@ * to avoid under/overstepping the src region). * * ATTN does dirty aliasing tricks with undefined behaviour by standard. - * (however, this was needed to improve the generated code). + * (however, this improved the generated code). * ATTN uses struct assignment, which only works if the compiler is inlining * this (else it would probably call memcpy :-)). */ @@ -33,22 +33,24 @@ void *memcpy(void *dest, const void *src, size_t n) const int lm = sizeof(uint32_t)-1; /* align src to word */ - while (((unsigned)ss.c & lm) && n > 0) + while (((uintptr_t)ss.c & lm) && n > 0) *ds.c++ = *ss.c++, n--; - if (((unsigned)ds.c & lm) == 0) { + if (((uintptr_t)ds.c & lm) == 0) { /* fast copy if pointers have the same aligment */ - while (n >= sizeof(struct _16)) /* copy 16 bytes blocks */ + while (n >= sizeof(struct _16)) /* copy 16 byte blocks */ *ds.s++ = *ss.s++, n -= sizeof(struct _16); if (n >= sizeof(uint64_t)) /* copy leftover 8 byte block */ *ds.l++ = *ss.l++, n -= sizeof(uint64_t); +// if (n >= sizeof(uint32_t)) /* copy leftover 4 byte block */ +// *ds.i++ = *ss.i++, n -= sizeof(uint32_t); } else if (n >= 2*sizeof(uint32_t)) { /* unaligned data big enough to avoid overstepping src */ uint32_t v1, v2, b, s; /* align dest to word */ - while (((unsigned)ds.c & lm) && n > 0) + while (((uintptr_t)ds.c & lm) && n > 0) *ds.c++ = *ss.c++, n--; /* copy loop: load aligned words and store shifted words */ - b = (unsigned)ss.c & lm, s = b*8; ss.c -= b; + b = (uintptr_t)ss.c & lm, s = b*8; ss.c -= b; v1 = *ss.i++, v2 = *ss.i++; while (n >= 3*sizeof(uint32_t)) { *ds.i++ = (v1 _L_ s) | (v2 _U_ (32-s)); v1 = *ss.i++; @@ -78,28 +80,35 @@ void *memmove (void *dest, const void *src, size_t n) struct _16 { uint32_t a[4]; }; union { const void *v; uint8_t *c; 
uint32_t *i; uint64_t *l; struct _16 *s; } ss = { src+n }, ds = { dest+n }; + size_t pd = dest > src ? dest - src : src - dest; const int lm = sizeof(uint32_t)-1; if (dest <= src || dest >= src+n) return memcpy(dest, src, n); /* align src to word */ - while (((unsigned)ss.c & lm) && n > 0) + while (((uintptr_t)ss.c & lm) && n > 0) *--ds.c = *--ss.c, n--; - if (((unsigned)ds.c & lm) == 0) { + /* take care not to copy multi-byte data if it overlaps */ + if (((uintptr_t)ds.c & lm) == 0) { /* fast copy if pointers have the same aligment */ - while (n >= sizeof(struct _16)) /* copy 16 byte blocks */ + while (n >= sizeof(struct _16) && pd >= sizeof(struct _16)) + /* copy 16 bytes blocks if no overlap */ *--ds.s = *--ss.s, n -= sizeof(struct _16); - if (n >= sizeof(uint64_t)) /* copy leftover 8 byte block */ + while (n >= sizeof(uint64_t) && pd >= sizeof(uint64_t)) + /* copy leftover 8 byte blocks if no overlap */ *--ds.l = *--ss.l, n -= sizeof(uint64_t); - } else if (n >= 2*sizeof(uint32_t)) { + while (n >= sizeof(uint32_t) && pd >= sizeof(uint32_t)) + /* copy leftover 4 byte blocks if no overlap */ + *--ds.i = *--ss.i, n -= sizeof(uint32_t); + } else if (n >= 2*sizeof(uint32_t) && pd >= 2*sizeof(uint32_t)) { /* unaligned data big enough to avoid understepping src */ uint32_t v1, v2, b, s; /* align dest to word */ - while (((unsigned)ds.c & lm) && n > 0) + while (((uintptr_t)ds.c & lm) && n > 0) *--ds.c = *--ss.c, n--; /* copy loop: load aligned words and store shifted words */ - b = (unsigned)ss.c & lm, s = b*8; ss.c += b; + b = (uintptr_t)ss.c & lm, s = b*8; ss.c += b; v1 = *--ss.i, v2 = *--ss.i; while (n >= 3*sizeof(uint32_t)) { *--ds.i = (v1 _U_ s) | (v2 _L_ (32-s)); v1 = *--ss.i; @@ -114,7 +123,7 @@ void *memmove (void *dest, const void *src, size_t n) } ss.c -= b - 2*sizeof(uint32_t); } - /* copy 0-7 leftover bytes */ + /* copy 0-7 leftover bytes (or upto everything if ptrs are too close) */ while (n >= 4) { *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; *--ds.c = 
*--ss.c, n--; *--ds.c = *--ss.c, n--; diff --git a/platform/gp2x/code940/memcpy.s b/platform/gp2x/code940/memcpy.s index 282762fd0..1350639a7 100644 --- a/platform/gp2x/code940/memcpy.s +++ b/platform/gp2x/code940/memcpy.s @@ -114,14 +114,12 @@ subs r2, r2, #0x14 blt Lmemcpy_fl32 /* less than 32 bytes (12 from above) */ stmdb sp!, {r4, r7, r8, r9, r10} /* borrow r4 */ -/* blat 64 bytes at a time */ +/* blat 32 bytes at a time */ /* XXX for really big copies perhaps we should use more registers */ Lmemcpy_floop32: ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr} stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr} -ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr} -stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr} -subs r2, r2, #0x40 +subs r2, r2, #0x20 bge Lmemcpy_floop32 cmn r2, #0x10 @@ -314,14 +312,12 @@ stmdb sp!, {r4, r7, r8, r9, r10, lr} subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */ blt Lmemcpy_bl32 -/* blat 64 bytes at a time */ +/* blat 32 bytes at a time */ /* XXX for really big copies perhaps we should use more registers */ Lmemcpy_bloop32: ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr} stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr} -ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr} -stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr} -subs r2, r2, #0x40 +subs r2, r2, #0x20 bge Lmemcpy_bloop32 Lmemcpy_bl32: diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index 13e554955..6d68a1bc2 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ -49,11 +49,8 @@ get_define () # prefix struct member member... 
echo "const int one = 1;" >/tmp/getoffs.c compile_rodata ENDIAN=$(if [ "$rodata" -eq 1 ]; then echo be; else echo le; fi) -# determine output file -echo "const int vsz = sizeof(void *);" >/tmp/getoffs.c -compile_rodata -fn="${1:-.}/pico_int_o$((8*$rodata)).h" # output header +fn="${1:-.}/pico_int_offs.h" echo "/* autogenerated by mkoffset.sh, do not edit */" >$fn echo "/* target endianess: $ENDIAN, compiled with: $CC $CFLAGS */" >>$fn # output offsets From 87316e5941e8200dad02b41a83763e7757d706bc Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 25 Apr 2019 18:56:26 +0200 Subject: [PATCH 029/174] sh2 drc, reuse blocks if already previously compiled (speedup for Virtua *) --- cpu/sh2/compiler.c | 144 ++++++++++++++++++++++++++++++++++++++------- cpu/sh2/compiler.h | 2 +- 2 files changed, 124 insertions(+), 22 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index f8f64ef61..fa0a6b71e 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -300,6 +300,7 @@ struct block_desc { int size; // ..of recompiled insns int size_lit; // ..of (insns+)literal pool u8 *tcache_ptr; // start address of block in cache + u16 crc; // crc of insns and literals u16 active; // actively used or deactivated? 
struct block_list *list; #if (DRC_DEBUG & 2) @@ -346,6 +347,8 @@ struct block_list { }; struct block_list *blist_free; +static struct block_list *inactive_blocks[TCACHE_BUFFERS]; + // array of pointers to block_lists for RAM and 2 data arrays // each array has len: sizeof(mem) / INVAL_PAGE_SIZE static struct block_list **inval_lookup[TCACHE_BUFFERS]; @@ -691,6 +694,7 @@ static void REGPARM(1) flush_tcache(int tcid) for (i = 0; i < ram_sizes[tcid] / INVAL_PAGE_SIZE; i++) rm_block_list(&inval_lookup[tcid][i]); + rm_block_list(&inactive_blocks[tcid]); } static void add_to_hashlist(struct block_entry *be, int tcache_id) @@ -777,7 +781,7 @@ static void rm_from_hashlist_unresolved(struct block_link *bl, int tcache_id) bl->next->prev = bl->prev; } -static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nolit); +static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nolit, int free); static void dr_free_oldest_block(int tcache_id) { struct block_desc *bd; @@ -794,7 +798,7 @@ static void dr_free_oldest_block(int tcache_id) } if (bd->addr && bd->entry_count) - sh2_smc_rm_block_entry(bd, tcache_id, 0); + sh2_smc_rm_block_entry(bd, tcache_id, 0, 1); block_limit[tcache_id]++; if (block_limit[tcache_id] >= block_max_counts[tcache_id]) @@ -926,8 +930,32 @@ static u32 dr_check_nolit(u32 start, u32 end, int tcache_id) return end; } +static struct block_desc *dr_find_inactive_block(int tcache_id, u16 crc, + u32 addr, int size, u32 addr_lit, int size_lit) +{ + struct block_list **head = &inactive_blocks[tcache_id]; + struct block_list *prev = NULL, *current = *head; + + for (; current != NULL; prev = current, current = current->next) { + struct block_desc *block = current->block; + if (block->crc == crc && block->addr == addr && block->size == size && + block->addr_lit == addr_lit && block->size_lit == size_lit) + { + if (prev == NULL) + *head = current->next; + else + prev->next = current->next; + block->list = NULL; // should now be empty 
+ current->next = blist_free; + blist_free = current; + return block; + } + } + return NULL; +} + static struct block_desc *dr_add_block(u32 addr, int size, - u32 addr_lit, int size_lit, int is_slave, int *blk_id) + u32 addr_lit, int size_lit, u16 crc, int is_slave, int *blk_id) { struct block_entry *be; struct block_desc *bd; @@ -951,6 +979,7 @@ static struct block_desc *dr_add_block(u32 addr, int size, bd->addr_lit = addr_lit; bd->size_lit = size_lit; bd->tcache_ptr = tcache_ptr; + bd->crc = crc; bd->active = 1; bd->entry_count = 1; @@ -1074,6 +1103,34 @@ static void dr_link_blocks(struct block_entry *be, int tcache_id) #endif } +static void dr_link_outgoing(struct block_entry *be, int tcache_id, int is_slave) +{ +#if LINK_BRANCHES + struct block_link *bl; + int target_tcache_id; + + for (bl = be->o_links; bl; bl = bl->o_next) { + be = dr_get_entry(bl->target_pc, is_slave, &target_tcache_id); + if (!target_tcache_id || target_tcache_id == tcache_id) { + if (be) { + dbg(2, "- link from %p to pc %08x entry %p", bl->jump, bl->target_pc, be->tcache_ptr); + emith_jump_patch(bl->jump, be->tcache_ptr); + bl->target = be; + bl->prev = NULL; + if (be->links) + be->links->prev = bl; + bl->next = be->links; + be->links = bl; + } else { + emith_jump_patch(bl->jump, sh2_drc_dispatcher); + add_to_hashlist_unresolved(bl, tcache_id); + } + host_instructions_updated(bl->jump, bl->jump+4); + } + } +#endif +} + #define ADD_TO_ARRAY(array, count, item, failcode) { \ if (count >= ARRAY_SIZE(array)) { \ dbg(1, "warning: " #array " overflow"); \ @@ -2442,6 +2499,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) int i, v; u32 u; int op; + u16 crc; base_pc = sh2->pc; @@ -2454,11 +2512,37 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } // initial passes to disassemble and analyze the block - scan_block(base_pc, sh2->is_slave, op_flags, &end_pc, &base_literals, &end_literals); + crc = scan_block(base_pc, sh2->is_slave, op_flags, &end_pc, 
&base_literals, &end_literals); end_literals = dr_check_nolit(base_literals, end_literals, tcache_id); if (base_literals == end_literals) // map empty lit section to end of code base_literals = end_literals = end_pc; + // if there is already a translated but inactive block, reuse it + block = dr_find_inactive_block(tcache_id, crc, base_pc, end_pc - base_pc, + base_literals, end_literals - base_literals); + + if (block) { + // connect branches + dbg(2, "== %csh2 reuse block %08x-%08x,%08x-%08x -> %p", sh2->is_slave ? 's' : 'm', + base_pc, end_pc, base_literals, end_literals, block->entryp->tcache_ptr); + for (i = 0; i < block->entry_count; i++) { + entry = &block->entryp[i]; + add_to_hashlist(entry, tcache_id); +#if LINK_BRANCHES + // incoming branches + dr_link_blocks(entry, tcache_id); + if (!tcache_id) + dr_link_blocks(entry, sh2->is_slave?2:1); + // outgoing branches + dr_link_outgoing(entry, tcache_id, sh2->is_slave); +#endif + } + // mark memory for overwrite detection + dr_mark_memory(1, block, tcache_id, 0); + block->active = 1; + return block->entryp[0].tcache_ptr; + } + // collect branch_targets that don't land on delay slots for (pc = base_pc, i = 0; pc < end_pc; i++, pc += 2) { if (!(op_flags[i] & OF_BTARGET)) @@ -2480,13 +2564,14 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #endif block = dr_add_block(base_pc, end_pc - base_pc, base_literals, - end_literals - base_literals, sh2->is_slave, &blkid_main); + end_literals - base_literals, crc, sh2->is_slave, &blkid_main); if (block == NULL) return NULL; block_entry_ptr = tcache_ptr; - dbg(2, "== %csh2 block #%d,%d %08x-%08x -> %p", sh2->is_slave ? 's' : 'm', - tcache_id, blkid_main, base_pc, end_pc, block_entry_ptr); + dbg(2, "== %csh2 block #%d,%d crc %04x %08x-%08x,%08x-%08x -> %p", sh2->is_slave ? 
's' : 'm', + tcache_id, blkid_main, crc, base_pc, end_pc, base_literals, end_literals, block_entry_ptr); + // clear stale state after compile errors rcache_invalidate(); @@ -4054,7 +4139,7 @@ static void sh2_generate_utils(void) #endif } -static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nolit) +static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nolit, int free) { struct block_link *bl; u32 i; @@ -4066,6 +4151,7 @@ static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nol dbg(1, " killing dead block!? %08x", bd->addr); return; } + free = free || nolit; // block is invalid if literals are overwritten // remove from hash table, make incoming links unresolved, revoke outgoing links for (i = 0; i < bd->entry_count; i++) { @@ -4073,7 +4159,6 @@ static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nol rm_from_hashlist(&bd->entryp[i], tcache_id); for (bl = bd->entryp[i].o_links; bl != NULL; ) { - struct block_link *bl_next = bl->o_next; if (bl->target) { if (bl->prev) bl->prev->next = bl->next; @@ -4084,13 +4169,8 @@ static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nol bl->target = NULL; } else if (bd->active) rm_from_hashlist_unresolved(bl, tcache_id); - // free bl - bl->jump = NULL; - bl->next = blink_free[bl->tcache_id]; - blink_free[bl->tcache_id] = bl; - bl = bl_next; + bl = bl->o_next; } - bd->entryp[i].o_links = NULL; for (bl = bd->entryp[i].links; bl != NULL; ) { struct block_link *bl_next = bl->next; @@ -4108,10 +4188,21 @@ static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nol if (bd->active) dr_mark_memory(-1, bd, tcache_id, nolit); - bd->addr = bd->size = bd->addr_lit = bd->size_lit = 0; - bd->entry_count = 0; + if (free) { + while ((bl = bd->entryp[0].o_links) != NULL) { + bd->entryp[0].o_links = bl->next; + bl->jump = NULL; + bl->next = blink_free[bl->tcache_id]; + blink_free[bl->tcache_id] = bl; + } + 
bd->entryp[0].o_links = NULL; + rm_from_block_lists(bd); + bd->addr = bd->size = bd->addr_lit = bd->size_lit = 0; + bd->entry_count = 0; + } else { + add_to_block_list(&inactive_blocks[tcache_id], bd); + } bd->active = 0; - rm_from_block_lists(bd); } static void sh2_smc_rm_blocks(u32 a, int tcache_id, u32 shift) @@ -4142,7 +4233,7 @@ static void sh2_smc_rm_blocks(u32 a, int tcache_id, u32 shift) { dbg(2, "smc remove @%08x", a); end_addr = (start_lit <= a && block->size_lit ? a : 0); - sh2_smc_rm_block_entry(block, tcache_id, end_addr); + sh2_smc_rm_block_entry(block, tcache_id, end_addr, 0); #if (DRC_DEBUG & 2) removed = 1; #endif @@ -4546,7 +4637,7 @@ static void *dr_get_pc_base(u32 pc, int is_slave) return (char *)ret - (pc & ~mask); } -void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, +u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, u32 *base_literals_out, u32 *end_literals_out) { u16 *dr_pc_base; @@ -4558,6 +4649,7 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, int next_is_delay = 0; int end_block = 0; int i, i_end; + u32 crc = 0; memset(op_flags, 0, sizeof(*op_flags) * BLOCK_INSN_LIMIT); op_flags[0] |= OF_BTARGET; // block start is always a target @@ -5346,8 +5438,9 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, // 2nd pass: some analysis lowest_literal = end_literals = lowest_mova = 0; - for (i = 0; i < i_end; i++) { + for (i = 0, pc = base_pc; i < i_end; i++, pc += 2) { opd = &ops[i]; + crc += FETCH_OP(pc); // propagate T (TODO: DIV0U) if ((opd->op == OP_SETCLRT && !opd->imm) || opd->op == OP_BRANCH_CT) @@ -5427,11 +5520,20 @@ void scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, if (lowest_literal >= end_literals) lowest_literal = end_literals; + if (lowest_literal && end_literals) + for (pc = lowest_literal; pc < end_literals; pc += 2) + crc += FETCH_OP(pc); + *end_pc_out = end_pc; if (base_literals_out != NULL) 
*base_literals_out = (lowest_literal ?: end_pc); if (end_literals_out != NULL) *end_literals_out = (end_literals ?: end_pc); + + // crc overflow handling, twice to collect all overflows + crc = (crc & 0xffff) + (crc >> 16); + crc = (crc & 0xffff) + (crc >> 16); + return crc; } // vim:shiftwidth=2:ts=2:expandtab diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 36dfd9456..07e76cca8 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -22,7 +22,7 @@ void sh2_drc_frame(void); #define OF_T_CLEAR (1 << 3) // ... clear #define OF_B_IN_DS (1 << 4) -void scan_block(unsigned int base_pc, int is_slave, +unsigned short scan_block(unsigned int base_pc, int is_slave, unsigned char *op_flags, unsigned int *end_pc, unsigned int *base_literals, unsigned int *end_literals); From c77e3bf5e75d585087fec904a556bfb5fb65ed31 Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 25 Apr 2019 18:57:18 +0200 Subject: [PATCH 030/174] add literal pool to sh2 drc (for armv[456] without MOVT/W) --- cpu/drc/emit_arm.c | 125 ++++++++++++++++++++++++++++++------- cpu/drc/emit_x86.c | 3 + cpu/sh2/compiler.c | 10 ++- pico/carthw/svp/compiler.c | 1 + 4 files changed, 114 insertions(+), 25 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 4744b1279..d8674a030 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -261,13 +261,30 @@ #define EOP_MOVT(rd,imm) \ EMIT(0xe3400000 | ((rd)<<12) | (((imm)>>16)&0xfff) | (((imm)>>12)&0xf0000)) -static int count_bits(unsigned val) +static inline int count_bits(unsigned val) { - val = (val & 0x55555555) + ((val >> 1) & 0x55555555); + val = val - ((val >> 1) & 0x55555555); val = (val & 0x33333333) + ((val >> 2) & 0x33333333); - val = (val & 0x0f0f0f0f) + ((val >> 4) & 0x0f0f0f0f); - val = (val & 0x00ff00ff) + ((val >> 8) & 0x00ff00ff); - return (val & 0xffff) + (val >> 16); + return (((val + (val >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; +} + +// host literal pool; must be significantly smaller than 1024 (max LDR offset = 4096) 
+#define MAX_HOST_LITERALS 128 +static u32 literal_pool[MAX_HOST_LITERALS]; +static u32 *literal_insn[MAX_HOST_LITERALS]; +static int literal_pindex, literal_iindex; + +static int emith_pool_literal(u32 imm, int *offs) +{ + int idx = literal_pindex - 8; // max look behind in pool + // see if one of the last literals was the same (or close enough) + for (idx = (idx < 0 ? 0 : idx); idx < literal_pindex; idx++) + if (abs((int)(imm - literal_pool[idx])) <= 0xff) + break; + if (idx == literal_pindex) // store new literal + literal_pool[literal_pindex++] = imm; + *offs = imm - literal_pool[idx]; + return idx; } // XXX: RSB, *S will break if 1 insn is not enough @@ -275,6 +292,7 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int { int ror2; u32 v; + int i; switch (op) { case A_OP_MOV: @@ -284,19 +302,48 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int imm = ~imm; op = A_OP_MVN; } -#ifdef HAVE_ARMV7 - for (v = imm, ror2 = 0; v && !(v & 3); v >>= 2) - ror2--; - if (v >> 8) { - /* 2+ insns needed - prefer movw/movt */ + // count insns needed for mov/orr #imm + for (v = imm, ror2 = 0; (v >> 24) && ror2 < 32/2; ror2++) + v = (v << 2) | (v >> 30); + for (i = 2; i > 0; i--, v >>= 8) + while (v > 0xff && !(v & 3)) + v >>= 2; + if (v) { // 3+ insns needed... if (op == A_OP_MVN) imm = ~imm; +#ifdef HAVE_ARMV7 + // ...prefer movw/movt EOP_MOVW(rd, imm); if (imm & 0xffff0000) EOP_MOVT(rd, imm); +#else + // ...emit literal load + int idx, o; + if (literal_iindex >= MAX_HOST_LITERALS) { + elprintf(EL_STATUS|EL_SVP|EL_ANOMALY, + "pool overflow"); + exit(1); + } + idx = emith_pool_literal(imm, &o); + literal_insn[literal_iindex++] = (u32 *)tcache_ptr; + EOP_LDR_IMM2(cond, rd, 15, idx * sizeof(u32)); + if (o > 0) + EOP_C_DOP_IMM(cond, A_OP_ADD, 0, rd, rd, 0, o); + else if (o < 0) + EOP_C_DOP_IMM(cond, A_OP_SUB, 0, rd, rd, 0, -o); +#endif return; } -#endif + break; + + case A_OP_AND: + // AND must fit into 1 insn. 
if not, use BIC + for (v = imm, ror2 = 0; (v >> 8) && ror2 < 32/2; ror2++) + v = (v << 2) | (v >> 30); + if (v >> 8) { + imm = ~imm; + op = A_OP_BIC; + } break; case A_OP_SUB: @@ -314,20 +361,13 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int break; } - again: - v = imm, ror2 = 32/2; // arm imm shift is ROR, so rotate for best fit - while ((v >> 24) && !(v & 0xc0)) - v = (v << 2) | (v >> 30), ror2++; + // try to get the topmost byte empty to possibly save an insn + for (v = imm, ror2 = 0; (v >> 24) && ror2 < 32/2; ror2++) + v = (v << 2) | (v >> 30); do { // shift down to get 'best' rot2 while (v > 0xff && !(v & 3)) v >>= 2, ror2--; - // AND must fit into 1 insn. if not, use BIC - if (op == A_OP_AND && v != (v & 0xff)) { - imm = ~imm; - op = A_OP_BIC; - goto again; - } EOP_C_DOP_IMM(cond, op, s, rn, rd, ror2 & 0xf, v & 0xff); switch (op) { @@ -385,6 +425,47 @@ static int emith_xbranch(int cond, void *target, int is_call) return (u32 *)tcache_ptr - start_ptr; } +static void emith_pool_commit(int jumpover) +{ + int i, sz = literal_pindex * sizeof(u32); + u8 *pool = (u8 *)tcache_ptr; + + // nothing to commit if pool is empty + if (sz == 0) + return; + // need branch over pool if not at block end + if (jumpover) { + pool += sizeof(u32); + emith_xbranch(A_COND_AL, (u8 *)pool + sz, 0); + } + // safety check - pool must be after insns and reachable + if ((u32)(pool - (u8 *)literal_insn[0] + 8) > 0xfff) { + elprintf(EL_STATUS|EL_SVP|EL_ANOMALY, + "pool offset out of range"); + exit(1); + } + // copy pool and adjust addresses in insns accessing the pool + memcpy(pool, literal_pool, sz); + for (i = 0; i < literal_iindex; i++) { + *literal_insn[i] += (u8 *)pool - ((u8 *)literal_insn[i] + 8); + } + // count pool constants as insns for statistics + for (i = 0; i < literal_pindex; i++) + COUNT_OP; + + tcache_ptr = (void *)((u8 *)pool + sz); + literal_pindex = literal_iindex = 0; +} + +static inline void emith_pool_check(void) +{ + // check if 
pool must be committed + if (literal_iindex > MAX_HOST_LITERALS-4 || + (u8 *)tcache_ptr - (u8 *)literal_insn[0] > 0xe00) + // pool full, or displacement is approaching the limit + emith_pool_commit(1); +} + #define JMP_POS(ptr) \ ptr = tcache_ptr; \ tcache_ptr += sizeof(u32) @@ -769,7 +850,7 @@ static int emith_xbranch(int cond, void *target, int is_call) b_ = tmpr; \ } \ op(b_,v_); \ -} while(0) +} while (0) #define emith_ctx_read_multiple(r, offs, count, tmpr) \ emith_ctx_do_multiple(EOP_LDMIA, r, offs, count, tmpr) diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index ce13c618c..1ac4ee013 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -1104,3 +1104,6 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; EMITH_SJMP_END(DCOND_EQ); \ EMITH_SJMP_END(DCOND_EQ); \ } while (0) + +#define emith_pool_check() /**/ +#define emith_pool_commit(j) /**/ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index fa0a6b71e..bc63e18bd 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -369,7 +369,7 @@ enum { HR_STATIC, // vreg has a static mapping HR_CACHED, // vreg has sh2_reg_e HR_TEMP, // reg used for temp storage -} cach_reg_type; +} cache_reg_type; enum { HRF_DIRTY = 1 << 0, // has "dirty" value to be written to ctx @@ -2569,8 +2569,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) return NULL; block_entry_ptr = tcache_ptr; - dbg(2, "== %csh2 block #%d,%d crc %04x %08x-%08x,%08x-%08x -> %p", sh2->is_slave ? 's' : 'm', - tcache_id, blkid_main, crc, base_pc, end_pc, base_literals, end_literals, block_entry_ptr); + dbg(2, "== %csh2 block #%d,%d %08x-%08x,%08x-%08x -> %p", sh2->is_slave ? 
's' : 'm', + tcache_id, blkid_main, base_pc, end_pc, base_literals, end_literals, block_entry_ptr); // clear stale state after compile errors @@ -2715,6 +2715,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } #endif + emith_pool_check(); pc += 2; if (skip_op > 0) { @@ -3892,6 +3893,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_jump_patch(branch_patch_ptr[i], target); } + emith_pool_commit(0); + dr_mark_memory(1, block, tcache_id, 0); tcache_ptrs[tcache_id] = tcache_ptr; @@ -4124,6 +4127,7 @@ static void sh2_generate_utils(void) MAKE_WRITE_WRAPPER(sh2_drc_write32); #endif + emith_pool_commit(0); rcache_invalidate(); #if (DRC_DEBUG & 4) host_dasm_new_symbol(sh2_drc_entry); diff --git a/pico/carthw/svp/compiler.c b/pico/carthw/svp/compiler.c index b31197c2a..1ec71e754 100644 --- a/pico/carthw/svp/compiler.c +++ b/pico/carthw/svp/compiler.c @@ -1795,6 +1795,7 @@ void *ssp_translate_block(int pc) tr_flush_dirty_ST(); tr_flush_dirty_pmcrs(); block_end = emit_block_epilogue(ccount, end_cond, jump_pc, pc); + emith_pool_commit(0); if (tcache_ptr - (u32 *)tcache > DRC_TCACHE_SIZE/4) { elprintf(EL_ANOMALY|EL_STATUS|EL_SVP, "tcache overflow!\n"); From 77569b214f596d00294971f86b3963efc4732df4 Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 25 Apr 2019 19:02:29 +0200 Subject: [PATCH 031/174] speed improvement and fixes for 32x ARM asm draw --- pico/32x/draw.c | 5 -- pico/32x/draw_arm.S | 156 +++++++++++++++++++++++++++++++------------- pico/draw.c | 4 +- platform/gp2x/emu.c | 2 +- tools/mkoffsets.sh | 1 + 5 files changed, 113 insertions(+), 55 deletions(-) diff --git a/pico/32x/draw.c b/pico/32x/draw.c index 3dd3d62f7..229ed914d 100644 --- a/pico/32x/draw.c +++ b/pico/32x/draw.c @@ -316,11 +316,6 @@ void PicoDraw32xLayerMdOnly(int offs, int lines) void PicoDrawSetOutFormat32x(pdso_t which, int use_32x_line_mode) { -#ifdef _ASM_32X_DRAW - extern void *Pico32xNativePal; - Pico32xNativePal = Pico32xMem->pal_native; -#endif - if (which 
== PDF_RGB555) { // need CLUT pixels in PicoDraw2FB for layer transparency PicoDrawSetInternalBuf(Pico.est.Draw2FB, 328); diff --git a/pico/32x/draw_arm.S b/pico/32x/draw_arm.S index c59fa8f5a..e0cdcbe50 100644 --- a/pico/32x/draw_arm.S +++ b/pico/32x/draw_arm.S @@ -13,12 +13,6 @@ .equiv P32XV_PRI, (1<< 7) -.bss -.align 2 -.global Pico32xNativePal -Pico32xNativePal: - .word 0 - .text .align 2 @@ -82,8 +76,8 @@ Pico32xNativePal: mov r3, r3, lsl #26 @ mdbg << 26 mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data tst r10,#P32XV_PRI - moveq r10,#0 - movne r10,#0x8000 @ r10 = inv_bit + movne r10,#0 + moveq r10,#0x8000 @ r10 = inv_bit call_scan_prep \call_scan lr mov r4, #0 @ line @@ -92,7 +86,6 @@ Pico32xNativePal: 0: @ loop_outer: call_scan_end \call_scan add r4, r4, #1 - sub r11,r11,#1 @ adjust for prev read cmp r4, r2, lsr #16 call_scan_fin_ge \call_scan ldmgefd sp!, {r4-r11,pc} @@ -106,31 +99,86 @@ Pico32xNativePal: add r5, r1, r12, lsl #1 @ p32x = dram + dram[l] 2: @ loop_inner: - ldrb r7, [r11], #1 @ MD pixel - subs r6, r6, #1 + ldrh r8, [r5], #2 + subs lr, r6, #1 blt 0b @ loop_outer - ldrh r8, [r5], #2 @ 32x pixel - cmp r3, r7, lsl #26 @ MD has bg pixel? - beq 3f @ draw32x + +3: @ loop_innermost: + ldrh r7, [r5], #2 @ 32x pixel + subs lr, lr, #1 + cmpge r7, r8 + beq 3b @ loop_innermost + + sub r5, r5, #2 + add lr, lr, #1 + sub lr, r6, lr + sub r6, r6, lr + eor r12,r8, r10 - ands r12,r12,#0x8000 @ !((t ^ inv) & 0x8000) + tst r12, #0x8000 @ !((t ^ inv) & 0x8000) + bne 5f @ draw_md + + and r7 ,r8, #0x03e0 + mov r8, r8, lsl #11 + orr r8, r8, r8, lsr #(10+11) + orr r8, r8, r7 ,lsl #1 + bic r8, r8, #0x0020 @ kill prio bit + + add r11,r11,lr + tst r0, #2 @ dst unaligned? 
+ strneh r8, [r0], #2 + subne lr, lr, #1 + cmp lr, #0 + beq 2b @ loop_inner + mov r8, r8, lsl #16 + orr r12,r8, r8, lsr #16 + mov r8 ,r12 +4: @ draw_32x: + subs lr, lr, #4 @ store 4 pixels + stmgeia r0!, {r8, r12} + bgt 4b @ draw_32x + beq 2b @ loop_inner + adds lr, lr, #2 @ store 1-3 leftover pixels + strge r8, [r0], #4 + strneh r8, [r0], #2 + b 2b @ loop_inner + +5: @ draw_md: + subs lr, lr, #1 + ldrgeb r7, [r11], #1 @ MD pixel + blt 2b @ loop_inner + cmp r3, r7, lsl #26 @ MD has bg pixel? .if \do_md mov r7, r7, lsl #1 - ldreqh r12,[r9, r7] - streqh r12,[r0], #2 @ *dst++ = palmd[*pmd] + ldrneh r7 ,[r9, r7] + strneh r7 ,[r0], #2 @ *dst++ = palmd[*pmd] .else - addeq r0, r0, #2 + addne r0, r0, #2 .endif - beq 2b @ loop_inner + bne 5b @ draw_md -3: @ draw32x: - and r12,r8, #0x03e0 + and r7 ,r8, #0x03e0 mov r8, r8, lsl #11 orr r8, r8, r8, lsr #(10+11) - orr r8, r8, r12,lsl #1 + orr r8, r8, r7 ,lsl #1 bic r8, r8, #0x0020 @ kill prio bit strh r8, [r0], #2 @ *dst++ = bgr2rgb(*p32x++) - b 2b @ loop_inner + +6: @ draw_md_32x: + subs lr, lr, #1 + ldrgeb r7, [r11], #1 @ MD pixel + blt 2b @ loop_inner + cmp r3, r7, lsl #26 @ MD has bg pixel? +.if \do_md + mov r7, r7, lsl #1 + ldrneh r7 ,[r9, r7] @ *dst++ = palmd[*pmd] + moveq r7 ,r8 @ *dst++ = bgr2rgb(*p32x++) + strh r7 ,[r0], #2 +.else + streqh r8, [r0] @ *dst++ = bgr2rgb(*p32x++) + add r0, r0, #2 +.endif + b 6b @ draw_md_32x .endm @@ -144,9 +192,11 @@ Pico32xNativePal: stmfd sp!, {r4-r11,lr} ldr lr,=Pico - ldr r10,=Pico32xNativePal + ldr r10,=Pico32xMem + ldr r9,=OFS_PMEM32x_pal_native + ldr r10, [r10] ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB] - ldr r10,[r10] + add r10,r10,r9 add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd and r4, r2, #0xff @@ -184,7 +234,7 @@ Pico32xNativePal: ldrneb r8, [r5, #2]! @ r7,r8 - pixel 0,1 index subs r6, r6, #1 blt 0b @ loop_outer - cmp r7, r8 @ is this really improving things? 
+ cmp r7, r8 beq 5f @ check_fill @ +8 3: @ no_fill: @@ -204,11 +254,11 @@ Pico32xNativePal: ldrneh r7, [r9, r12] @ t = palmd[pmd[0]] tst lr, #0x20 ldrneb lr, [r11,#-1] @ MD pixel 1 - strh r7, [r0], #2 cmpne r3, lr, lsl #26 @ MD has bg pixel? mov lr, lr, lsl #1 ldrneh r8, [r9, lr] @ t = palmd[pmd[1]] - strh r8, [r0], #2 + orr r7, r7, r8, lsl #16 @ combine 2 pixels to optimize memory bandwidth + str r7, [r0], #4 @ (no write combining on ARM9) .else streqh r7, [r0] tst lr, #0x20 @@ -219,18 +269,21 @@ Pico32xNativePal: .endif b 2b @ loop_inner -5: @ check_fill +5: @ check_fill: @ count pixels, align if needed bic r12,r5, #1 + ldrh lr ,[r12, #2] @ only do this for at least 4 pixels ldrh r12,[r12] + orr r12,lr,r12, lsl #16 orr lr, r7, r7, lsl #8 + orr lr, lr, lr, lsl #16 cmp r12,lr bne 3b @ no_fill tst r5, #1 sub lr, r5, #2 @ starting r5 (32x render data start) - addeq r5, r5, #2 - addne r5, r5, #1 @ add for the check above + addeq r5, r5, #4 + addne r5, r5, #3 @ add for the check above add r6, r6, #1 @ restore from dec orr r7, r7, r7, lsl #8 6: @@ -240,11 +293,12 @@ Pico32xNativePal: ldrh r12,[r5], #2 bge 7f @ count_done cmp r8, r7 + subne r5, r5, #2 @ undo readahead cmpeq r12,r7 beq 6b -7: @ count_done - sub r5, r5, #4 @ undo readahead +7: @ count_done: + sub r5, r5, #2 @ undo readahead @ fix alignment and check type sub r8, r5, lr @@ -262,11 +316,15 @@ Pico32xNativePal: beq 9f @ bg_mode add r11,r11,r8 -8: - subs r8, r8, #2 - strgeh r7, [r0], #2 - strgeh r7, [r0], #2 - bgt 8b + orr r12,r7, r7, lsl #16 + mov r7 ,r12 +8: @ 32x_loop: + subs r8, r8, #4 @ store 4 pixels + stmgeia r0!, {r7, r12} + bgt 8b @ 32x_loop + beq 2b @ loop_inner + adds r8, r8, #2 + strge r7, [r0], #4 @ store 2 leftover pixels b 2b @ loop_inner 9: @ bg_mode: @@ -281,8 +339,8 @@ Pico32xNativePal: mov lr, lr, lsl #1 ldrneh lr, [r9, lr] moveq lr, r7 - strh r12,[r0], #2 - strh lr, [r0], #2 + orr r12,r12,lr, lsl #16 @ combine 2 pixels to optimize memory bandwidth + str r12,[r0], #4 @ (no write combining on 
ARM9) .else streqh r7, [r0] cmp r3, lr, lsl #26 @ MD pixel 1 has bg? @@ -303,9 +361,11 @@ Pico32xNativePal: stmfd sp!, {r4-r11,lr} ldr lr,=Pico - ldr r10,=Pico32xNativePal + ldr r10,=Pico32xMem + ldr r9,=OFS_PMEM32x_pal_native + ldr r10, [r10] ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB] - ldr r10,[r10] + add r10,r10,r9 add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd and r4, r2, #0xff @@ -320,7 +380,6 @@ Pico32xNativePal: 0: @ loop_outer: call_scan_end \call_scan add r4, r4, #1 - sub r11,r11,#1 @ adjust for prev read cmp r4, r2, lsr #16 call_scan_fin_ge \call_scan ldmgefd sp!, {r4-r11,pc} @@ -341,13 +400,13 @@ Pico32xNativePal: eor lr, lr, #0x20 3: @ loop_innermost: - ldrb r7, [r11], #1 @ MD pixel subs r6, r6, #1 + ldrgeb r7, [r11], #1 @ MD pixel blt 0b @ loop_outer - cmp r3, r7, lsl #26 @ MD has bg pixel? - mov r7, r7, lsl #1 - tstne lr, #0x20 + tst lr, #0x20 + cmpne r3, r7, lsl #26 @ MD has bg pixel? .if \do_md + mov r7, r7, lsl #1 ldrneh r12,[r9, r7] @ t = palmd[*pmd] streqh lr, [r0], #2 strneh r12,[r0], #2 @ *dst++ = t @@ -365,15 +424,18 @@ make_do_loop_dc do_loop_dc, 0, 0 make_do_loop_dc do_loop_dc_md, 0, 1 make_do_loop_dc do_loop_dc_scan, 1, 0 make_do_loop_dc do_loop_dc_scan_md, 1, 1 +.pool make_do_loop_pp do_loop_pp, 0, 0 make_do_loop_pp do_loop_pp_md, 0, 1 make_do_loop_pp do_loop_pp_scan, 1, 0 make_do_loop_pp do_loop_pp_scan_md, 1, 1 +.pool make_do_loop_rl do_loop_rl, 0, 0 make_do_loop_rl do_loop_rl_md, 0, 1 make_do_loop_rl do_loop_rl_scan, 1, 0 make_do_loop_rl do_loop_rl_scan_md, 1, 1 +.pool @ vim:filetype=armasm diff --git a/pico/draw.c b/pico/draw.c index 6fa17988d..7326aec56 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -1364,8 +1364,8 @@ static void FinalizeLine8bit(int sh, int line, struct PicoEState *est) { // a hack for mid-frame palette changes if (!(est->rendstatus & PDRAW_SONIC_MODE) || line - dirty_line > 4) { - // store a maximum of 3 additional palettes in SonicPal - if (est->SonicPalCount < 3) + // store a maximum of 2 additional palettes 
in SonicPal + if (est->SonicPalCount < 2) est->SonicPalCount ++; dirty_line = line; est->rendstatus |= PDRAW_SONIC_MODE; diff --git a/platform/gp2x/emu.c b/platform/gp2x/emu.c index 450ac0803..4ad90b83c 100644 --- a/platform/gp2x/emu.c +++ b/platform/gp2x/emu.c @@ -328,7 +328,7 @@ static int make_local_pal_md(int fast_mode) localPal[0xe0] = 0x00000000; // reserved pixels for OSD localPal[0xf0] = 0x00ffffff; - if (Pico.m.dirtyPal == 2) + if (Pico.m.dirtyPal == 2) Pico.m.dirtyPal = 0; return pallen; } diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index 6d68a1bc2..461fbfa7d 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ -84,6 +84,7 @@ get_define OFS_EST_ PicoEState HighPal ; echo "$line" >>$fn get_define OFS_PMEM_ PicoMem vram ; echo "$line" >>$fn get_define OFS_PMEM_ PicoMem vsram ; echo "$line" >>$fn +get_define OFS_PMEM32x_ Pico32xMem pal_native ; echo "$line" >>$fn get_define OFS_SH2_ SH2_ is_slave ; echo "$line" >>$fn get_define OFS_SH2_ SH2_ p_bios ; echo "$line" >>$fn From 74385d04c3168e4a8a6c573aa56e0f11286e55b5 Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 25 Apr 2019 19:03:58 +0200 Subject: [PATCH 032/174] sh2 drc, improved constant handling and register allocator --- cpu/sh2/compiler.c | 151 ++++++++++++++++++++++++++++++++------------- 1 file changed, 108 insertions(+), 43 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index bc63e18bd..cd85b3737 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -395,10 +395,10 @@ enum { } guest_reg_flags; typedef struct { - u16 flags; // guest flags: is constant, is dirty? + u8 flags; // guest flags: is constant, is dirty? 
s8 sreg; // cache reg for static mapping s8 vreg; // cache_reg this is currently mapped to, -1 if not mapped - u32 val; // value if this is constant + s8 cnst; // const index if this is constant } guest_reg_t; @@ -1153,7 +1153,7 @@ static int find_in_array(u32 *array, size_t size, u32 what) // NB rcache allocation dependencies: // - get_reg_arg/get_tmp_arg first (might evict other regs just allocated) -// - get_reg(..., NULL) before get_reg(..., &x) if it might get the same reg +// - get_reg(..., NULL) before get_reg(..., &hr) if it might get the same reg // - get_reg(..., RC_GR_READ/RMW, ...) before WRITE (might evict needed reg) // register cache / constant propagation stuff @@ -1163,7 +1163,15 @@ typedef enum { RC_GR_RMW, } rc_gr_mode; +typedef struct { + u32 gregs; + u32 val; +} gconst_t; + +gconst_t gconsts[ARRAY_SIZE(guest_regs)]; + static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr); +static void rcache_add_vreg_alias(int x, sh2_reg_e r); static void rcache_remove_vreg_alias(int x, sh2_reg_e r); #define RCACHE_DUMP(msg) { \ @@ -1185,11 +1193,51 @@ static void rcache_remove_vreg_alias(int x, sh2_reg_e r); } \ } +// binary search approach, since we don't have CLZ on ARM920T +#define FOR_ALL_BITS_SET_DO(mask, bit, code) { \ + u32 __mask = mask; \ + for (bit = 31; bit >= 0 && mask; bit--, __mask <<= 1) { \ + if (!(__mask & (0xffff << 16))) \ + bit -= 16, __mask <<= 16; \ + if (!(__mask & (0xff << 24))) \ + bit -= 8, __mask <<= 8; \ + if (!(__mask & (0xf << 28))) \ + bit -= 4, __mask <<= 4; \ + if (!(__mask & (0x3 << 30))) \ + bit -= 2, __mask <<= 2; \ + if (!(__mask & (0x1 << 31))) \ + bit -= 1, __mask <<= 1; \ + if (__mask & (0x1 << 31)) { \ + code; \ + } \ + } \ +} + #if PROPAGATE_CONSTANTS +static inline int gconst_alloc(sh2_reg_e r) +{ + int i, n = -1; + + for (i = 0; i < ARRAY_SIZE(gconsts); i++) { + if (gconsts[i].gregs & (1 << r)) + gconsts[i].gregs &= ~(1 << r); + if (gconsts[i].gregs == 0 && n < 0) + n = i; + } + if (n >= 
0) + gconsts[n].gregs = (1 << r); + else + exit(1); // cannot happen - more constants than guest regs? + return n; +} + static void gconst_set(sh2_reg_e r, u32 val) { + int i = gconst_alloc(r); + guest_regs[r].flags |= GRF_CONST; - guest_regs[r].val = val; + guest_regs[r].cnst = i; + gconsts[i].val = val; } static void gconst_new(sh2_reg_e r, u32 val) @@ -1204,16 +1252,22 @@ static void gconst_new(sh2_reg_e r, u32 val) static void gconst_copy(sh2_reg_e rd, sh2_reg_e rs) { - guest_regs[rd].flags &= ~(GRF_CONST|GRF_CDIRTY); - if (guest_regs[rs].flags & GRF_CONST) - gconst_set(rd, guest_regs[rs].val); + if (guest_regs[rd].flags & GRF_CONST) { + guest_regs[rd].flags &= ~(GRF_CONST|GRF_CDIRTY); + gconsts[guest_regs[rd].cnst].gregs &= ~(1 << rd); + } + if (guest_regs[rs].flags & GRF_CONST) { + guest_regs[rd].flags |= GRF_CONST; + guest_regs[rd].cnst = guest_regs[rs].cnst; + gconsts[guest_regs[rd].cnst].gregs |= (1 << rd); + } } #endif static int gconst_get(sh2_reg_e r, u32 *val) { if (guest_regs[r].flags & GRF_CONST) { - *val = guest_regs[r].val; + *val = gconsts[guest_regs[r].cnst].val; return 1; } return 0; @@ -1227,11 +1281,20 @@ static int gconst_check(sh2_reg_e r) } // update hr if dirty, else do nothing -static int gconst_try_read(int hr, sh2_reg_e r) +static int gconst_try_read(int vreg, sh2_reg_e r) { + int i, x; if (guest_regs[r].flags & GRF_CDIRTY) { - emith_move_r_imm(hr, guest_regs[r].val); - guest_regs[r].flags &= ~GRF_CDIRTY; + x = guest_regs[r].cnst; + emith_move_r_imm(cache_regs[vreg].hreg, gconsts[x].val); + FOR_ALL_BITS_SET_DO(gconsts[x].gregs, i, + { + if (guest_regs[i].vreg >= 0 && i != r) + rcache_remove_vreg_alias(guest_regs[i].vreg, i); + rcache_add_vreg_alias(vreg, i); + guest_regs[i].flags &= ~GRF_CDIRTY; + guest_regs[i].flags |= GRF_DIRTY; + }); return 1; } return 0; @@ -1250,6 +1313,8 @@ static u32 gconst_dirty_mask(void) static void gconst_kill(sh2_reg_e r) { + if (guest_regs[r].flags &= ~(GRF_CONST|GRF_CDIRTY)) + 
gconsts[guest_regs[r].cnst].gregs &= ~(1 << r); guest_regs[r].flags &= ~(GRF_CONST|GRF_CDIRTY); } @@ -1269,8 +1334,11 @@ static void gconst_invalidate(void) { int i; - for (i = 0; i < ARRAY_SIZE(guest_regs); i++) + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { + if (guest_regs[i].flags & (GRF_CONST|GRF_CDIRTY)) + gconsts[guest_regs[i].cnst].gregs &= ~(1 << i); guest_regs[i].flags &= ~(GRF_CONST|GRF_CDIRTY); + } } static u16 rcache_counter; @@ -1278,28 +1346,9 @@ static u32 rcache_static; static u32 rcache_locked; static u32 rcache_hint_soon; static u32 rcache_hint_late; +static u32 rcache_hint_write; #define rcache_hint (rcache_hint_soon|rcache_hint_late) -// binary search approach, since we don't have CLZ on ARM920T -#define FOR_ALL_BITS_SET_DO(mask, bit, code) { \ - u32 __mask = mask; \ - for (bit = 31; bit >= 0 && mask; bit--, __mask <<= 1) { \ - if (!(__mask & (0xffff << 16))) \ - bit -= 16, __mask <<= 16; \ - if (!(__mask & (0xff << 24))) \ - bit -= 8, __mask <<= 8; \ - if (!(__mask & (0xf << 28))) \ - bit -= 4, __mask <<= 4; \ - if (!(__mask & (0x3 << 30))) \ - bit -= 2, __mask <<= 2; \ - if (!(__mask & (0x1 << 31))) \ - bit -= 1, __mask <<= 1; \ - if (__mask & (0x1 << 31)) { \ - code; \ - } \ - } \ -} - static void rcache_unmap_vreg(int x) { int i; @@ -1328,8 +1377,7 @@ static void rcache_clean_vreg(int x) rcache_unmap_vreg(guest_regs[r].sreg); emith_move_r_r(cache_regs[guest_regs[r].sreg].hreg, cache_regs[guest_regs[r].vreg].hreg); rcache_remove_vreg_alias(x, r); - cache_regs[guest_regs[r].sreg].gregs = (1 << r); - guest_regs[r].vreg = guest_regs[r].sreg; + rcache_add_vreg_alias(guest_regs[r].sreg, r); } else { // must evict since sreg is locked emith_ctx_write(cache_regs[x].hreg, r * 4); @@ -1343,6 +1391,12 @@ static void rcache_clean_vreg(int x) } } +static void rcache_add_vreg_alias(int x, sh2_reg_e r) +{ + cache_regs[x].gregs |= (1 << r); + guest_regs[r].vreg = x; +} + static void rcache_remove_vreg_alias(int x, sh2_reg_e r) { cache_regs[x].gregs &= 
~(1 << r); @@ -1396,9 +1450,12 @@ static cache_reg_t *rcache_evict(void) else if (rcache_hint_late & cache_regs[i].gregs) // REGs needed in some future insn i_prio = 3; - else + else if ((rcache_hint_write & cache_regs[i].gregs) != cache_regs[i].gregs) // REGs not needed soon i_prio = 4; + else + // REGs soon overwritten anyway + i_prio = 5; if (prio < i_prio || (prio == i_prio && cache_regs[i].stamp < min_stamp)) { min_stamp = cache_regs[i].stamp; @@ -1549,6 +1606,7 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr h = guest_regs[r].sreg; rcache_evict_vreg(h); tr = &cache_regs[h]; + tr->gregs = 1 << r; if (i >= 0) { if (mode != RC_GR_WRITE) { if (hr) @@ -1559,14 +1617,13 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr } rcache_remove_vreg_alias(guest_regs[r].vreg, r); } else if (mode != RC_GR_WRITE) { - if (gconst_try_read(tr->hreg, r)) { + if (gconst_try_read(h, r)) { tr->flags |= HRF_DIRTY; guest_regs[r].flags |= GRF_DIRTY; } else emith_ctx_read(tr->hreg, r * 4); } guest_regs[r].vreg = guest_regs[r].sreg; - tr->gregs = 1 << r; goto end; } else if (i >= 0) { if (mode == RC_GR_READ || !(cache_regs[i].gregs & ~(1 << r))) { @@ -1608,7 +1665,7 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr guest_regs[r].vreg = tr - cache_regs; if (mode != RC_GR_WRITE) { - if (gconst_try_read(tr->hreg, r)) { + if (gconst_try_read(guest_regs[r].vreg, r)) { tr->flags |= HRF_DIRTY; guest_regs[r].flags |= GRF_DIRTY; } else if (split >= 0) { @@ -1747,7 +1804,7 @@ static int rcache_get_reg_arg(int arg, sh2_reg_e r, int *hr) srcr = dstr; if (rcache_static & (1 << r)) srcr = rcache_get_reg_(r, RC_GR_READ, 0, NULL); - else if (gconst_try_read(srcr, r)) + else if (gconst_try_read(guest_regs[r].vreg, r)) dirty = 1; else emith_ctx_read(srcr, r * 4); @@ -1780,8 +1837,10 @@ static int rcache_get_reg_arg(int arg, sh2_reg_e r, int *hr) emith_move_r_r(dstr, srcr); } else if (hr != NULL) { // 
caller will modify arg, so it will soon be out of sync with r - if (dirty || src_dirty) + if (dirty || src_dirty) { emith_ctx_write(dstr, r * 4); // must clean since arg will be modified + guest_regs[r].flags &= ~GRF_DIRTY; + } } else if (guest_regs[r].vreg < 0) { // keep arg as vreg for r cache_regs[dstid].type = HR_CACHED; @@ -1909,6 +1968,11 @@ static inline void rcache_set_hint_late(u32 mask) rcache_hint_late = mask & ~rcache_static; } +static inline void rcache_set_hint_write(u32 mask) +{ + rcache_hint_write = mask & ~rcache_static; +} + static inline int rcache_is_hinted(sh2_reg_e r) { // consider static REGs as always hinted, since they are always there @@ -2038,7 +2102,7 @@ static void rcache_invalidate(void) } rcache_counter = 0; - rcache_hint_soon = rcache_hint_late = 0; + rcache_hint_soon = rcache_hint_late = rcache_hint_write = 0; gconst_invalidate(); } @@ -2155,10 +2219,9 @@ static void emit_move_r_r(sh2_reg_e dst, sh2_reg_e src) if (guest_regs[dst].vreg >= 0) rcache_remove_vreg_alias(guest_regs[dst].vreg, dst); // make dst an alias of src - cache_regs[i].gregs |= (1 << dst); + rcache_add_vreg_alias(i, dst); cache_regs[i].flags |= HRF_DIRTY; guest_regs[dst].flags |= GRF_DIRTY; - guest_regs[dst].vreg = i; gconst_kill(dst); #if PROPAGATE_CONSTANTS gconst_copy(dst, src); @@ -2772,6 +2835,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) dbg(1, "unhandled delay_dep_bk: %x", delay_dep_bk); rcache_set_hint_soon(0); rcache_set_hint_late(0); + rcache_set_hint_write(0); } else { @@ -2802,6 +2866,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } rcache_set_hint_soon(late); // insns 1-3 rcache_set_hint_late(late & ~soon); // insns 4-9 + rcache_set_hint_write(write & ~(late|soon)); // next access is write } rcache_set_locked(opd[0].source); // try not to evict src regs for this op From 6caa1fa6e165e2051e246d63e3d2b061ed8b89e1 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 26 Apr 2019 18:53:21 +0200 Subject: [PATCH 033/174] sh2 
drc, make B/W read functions signed (reduces generated code size) --- cpu/drc/emit_arm.c | 44 ++++++++++++++++++++++++++++-------------- cpu/drc/emit_x86.c | 22 ++++++++++++++++++--- cpu/sh2/compiler.c | 34 ++++++++++++++------------------ cpu/sh2/compiler.h | 4 ++-- cpu/sh2/mame/sh2.c | 10 +++++----- cpu/sh2/mame/sh2pico.c | 8 ++++---- pico/32x/memory.c | 19 +++++++++--------- 7 files changed, 83 insertions(+), 58 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index d8674a030..586f0a540 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -795,6 +795,8 @@ static inline void emith_pool_check(void) emith_read_r_r_offs_c(cond, r, rs, offs) #define emith_read_r_r_r_c(cond, r, rs, rm) \ EOP_LDR_REG_LSL(cond, r, rs, rm, 0) +#define emith_read_r_r_offs(r, rs, offs) \ + emith_read_r_r_offs_c(A_COND_AL, r, rs, offs) #define emith_read_r_r_r(r, rs, rm) \ EOP_LDR_REG_LSL(A_COND_AL, r, rs, rm, 0) @@ -802,28 +804,37 @@ static inline void emith_pool_check(void) EOP_LDRB_IMM2(cond, r, rs, offs) #define emith_read8_r_r_r_c(cond, r, rs, rm) \ EOP_LDRB_REG_LSL(cond, r, rs, rm, 0) +#define emith_read8_r_r_offs(r, rs, offs) \ + emith_read8_r_r_offs_c(A_COND_AL, r, rs, offs) #define emith_read8_r_r_r(r, rs, rm) \ - EOP_LDRB_REG_LSL(A_COND_AL, r, rs, rm, 0) + emith_read8_r_r_r_c(A_COND_AL, r, rs, rm) #define emith_read16_r_r_offs_c(cond, r, rs, offs) \ EOP_LDRH_IMM2(cond, r, rs, offs) #define emith_read16_r_r_r_c(cond, r, rs, rm) \ EOP_LDRH_REG2(cond, r, rs, rm) +#define emith_read16_r_r_offs(r, rs, offs) \ + emith_read16_r_r_offs_c(A_COND_AL, r, rs, offs) #define emith_read16_r_r_r(r, rs, rm) \ - EOP_LDRH_REG2(A_COND_AL, r, rs, rm) - -#define emith_read_r_r_offs(r, rs, offs) \ - emith_read_r_r_offs_c(A_COND_AL, r, rs, offs) + emith_read16_r_r_r_c(A_COND_AL, r, rs, rm) +#define emith_read8s_r_r_offs_c(cond, r, rs, offs) \ + EOP_LDRSB_IMM2(cond, r, rs, offs) +#define emith_read8s_r_r_r_c(cond, r, rs, rm) \ + EOP_LDRSB_REG2(cond, r, rs, rm) #define 
emith_read8s_r_r_offs(r, rs, offs) \ - EOP_LDRSB_IMM2(A_COND_AL, r, rs, offs) -#define emith_read8_r_r_offs(r, rs, offs) \ - emith_read8_r_r_offs_c(A_COND_AL, r, rs, offs) - + emith_read8s_r_r_offs_c(A_COND_AL, r, rs, offs) +#define emith_read8s_r_r_r(r, rs, rm) \ + emith_read8s_r_r_r_c(A_COND_AL, r, rs, rm) + +#define emith_read16s_r_r_offs_c(cond, r, rs, offs) \ + EOP_LDRSH_IMM2(cond, r, rs, offs) +#define emith_read16s_r_r_r_c(cond, r, rs, rm) \ + EOP_LDRSH_REG2(cond, r, rs, rm) #define emith_read16s_r_r_offs(r, rs, offs) \ - EOP_LDRSH_IMM2(A_COND_AL, r, rs, offs) -#define emith_read16_r_r_offs(r, rs, offs) \ - emith_read16_r_r_offs_c(A_COND_AL, r, rs, offs) + emith_read16s_r_r_offs_c(A_COND_AL, r, rs, offs) +#define emith_read16s_r_r_r(r, rs, rm) \ + emith_read16s_r_r_r_c(A_COND_AL, r, rs, rm) #define emith_write_r_r_offs_c(cond, r, rs, offs) \ EOP_STR_IMM2(cond, r, rs, offs) @@ -945,6 +956,11 @@ static inline void emith_pool_check(void) #define emith_call(target) \ emith_call_cond(A_COND_AL, target) +#define emith_call_reg(r) { \ + emith_move_r_r(14, 15); \ + EOP_C_BX(A_COND_AL, r); \ +} + #define emith_call_ctx(offs) { \ emith_move_r_r(14, 15); \ emith_jump_ctx(offs); \ @@ -1091,9 +1107,7 @@ static inline void emith_pool_check(void) } while (0) /* mh:ml += rn*rm, does saturation if required by S bit. 
rn, rm must be TEMP */ -#define emith_sh2_macw(ml, mh, rn, rm, sr) do { \ - emith_sext(rn, rn, 16); \ - emith_sext(rm, rm, 16); \ +#define emith_sh2_macw(ml, mh, rn, rm, sr) do { \ emith_tst_r_imm(sr, S); \ EMITH_SJMP2_START(DCOND_NE); \ emith_mula_s64_c(DCOND_EQ, ml, mh, rn, rm); \ diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 1ac4ee013..5805aadd9 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -397,8 +397,12 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_read8_r_r_r_c(cond, r, rs, rm) \ emith_read8_r_r_r(r, rs, rm) +#define emith_read8s_r_r_r_c(cond, r, rs, rm) \ + emith_read8s_r_r_r(r, rs, rm) #define emith_read16_r_r_r_c(cond, r, rs, rm) \ emith_read16_r_r_r(r, rs, rm) +#define emith_read16s_r_r_r_c(cond, r, rs, rm) \ + emith_read16s_r_r_r(r, rs, rm) #define emith_read_r_r_r_c(cond, r, rs, rm) \ emith_read_r_r_r(r, rs, rm) @@ -684,12 +688,24 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; EMIT_SIB(0, rs, rm); /* mov r, [rm + rs * 1] */ \ } while (0) +#define emith_read8s_r_r_r(r, rs, rm) do { \ + EMIT(0x0f, u8); \ + EMIT_OP_MODRM(0xbe, 0, r, 4); \ + EMIT_SIB(0, rs, rm); /* mov r, [rm + rs * 1] */ \ +} while (0) + #define emith_read16_r_r_r(r, rs, rm) do { \ EMIT(0x0f, u8); \ EMIT_OP_MODRM(0xb7, 0, r, 4); \ EMIT_SIB(0, rs, rm); /* mov r, [rm + rs * 1] */ \ } while (0) +#define emith_read16s_r_r_r(r, rs, rm) do { \ + EMIT(0x0f, u8); \ + EMIT_OP_MODRM(0xbf, 0, r, 4); \ + EMIT_SIB(0, rs, rm); /* mov r, [rm + rs * 1] */ \ +} while (0) + #define emith_read_r_r_r(r, rs, rm) do { \ EMIT_OP_MODRM(0x8b, 0, r, 4); \ EMIT_SIB(0, rs, rm); /* mov r, [rm + rs * 1] */ \ @@ -785,9 +801,11 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; EMIT(offs, u32); \ } while (0) -#define emith_push_ret() +#define emith_push_ret() \ + emith_push(xSI); /* to align */ #define emith_pop_and_ret() \ + emith_pop(xSI); \ emith_ret() #define EMITH_JMP_START(cond) { \ @@ -1080,8 +1098,6 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, 
xDI }; /* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ #define emith_sh2_macw(ml, mh, rn, rm, sr) do { \ - emith_sext(rn, rn, 16); \ - emith_sext(rm, rm, 16); \ emith_tst_r_imm(sr, S); \ EMITH_SJMP_START(DCOND_EQ); \ /* XXX: MACH should be untouched when S is set? */ \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index cd85b3737..517be81c2 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -2354,17 +2354,15 @@ static int emit_memhandler_read_rr(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rs, u32 off hr2 = hr; else #if REMAP_REGISTER - hr2 = rcache_map_reg(rd, hr, size != 2 ? RC_GR_RMW : RC_GR_WRITE); + hr2 = rcache_map_reg(rd, hr, RC_GR_WRITE); #else hr2 = rcache_get_reg(rd, RC_GR_WRITE, NULL); #endif - if (rd != SHR_TMP && size != 2) { // 16, 8 - emith_sext(hr2, hr, size ? 16 : 8); - } else if (hr != hr2) // 32 + if (hr != hr2) { emith_move_r_r(hr2, hr); - if (hr != hr2) rcache_free_tmp(hr); + } return hr2; } @@ -2422,21 +2420,19 @@ static int emit_indirect_indexed_read(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rx, sh2_ hr = emit_memhandler_read(size); size &= MF_SIZEMASK; - if (rd != SHR_TMP) + if (rd == SHR_TMP) + hr2 = hr; + else #if REMAP_REGISTER - hr2 = rcache_map_reg(rd, hr, size != 2 ? RC_GR_RMW : RC_GR_WRITE); + hr2 = rcache_map_reg(rd, hr, RC_GR_WRITE); #else hr2 = rcache_get_reg(rd, RC_GR_WRITE, NULL); #endif - else - hr2 = hr; - if (rd != SHR_TMP && size != 2) { // 16, 8 - emith_sext(hr2, hr, size ? 16 : 8); - } else if (hr != hr2) // 32 + if (hr != hr2) { emith_move_r_r(hr2, hr); - if (hr != hr2) rcache_free_tmp(hr); + } return hr2; } @@ -2991,16 +2987,14 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } tmp2 = emit_memhandler_read(opd->size); #if REMAP_REGISTER - tmp3 = rcache_map_reg(GET_Rn(), tmp2, opd->size != 2 ? 
RC_GR_RMW : RC_GR_WRITE); + tmp3 = rcache_map_reg(GET_Rn(), tmp2, RC_GR_WRITE); #else tmp3 = rcache_get_reg(GET_Rn(), RC_GR_WRITE, NULL); #endif - if (opd->size != 2) { - emith_sext(tmp3, tmp2, 16); - } else if (tmp3 != tmp2) + if (tmp3 != tmp2) { emith_move_r_r(tmp3, tmp2); - if (tmp3 != tmp2) rcache_free_tmp(tmp2); + } } goto end_op; @@ -4025,7 +4019,7 @@ static void sh2_generate_utils(void) EMITH_SJMP_START(DCOND_CS); emith_and_r_r_c(DCOND_CC, arg0, arg3); emith_eor_r_imm_c(DCOND_CC, arg0, 1); - emith_read8_r_r_r_c(DCOND_CC, RET_REG, arg0, arg2); + emith_read8s_r_r_r_c(DCOND_CC, RET_REG, arg0, arg2); emith_ret_c(DCOND_CC); EMITH_SJMP_END(DCOND_CS); emith_move_r_r_ptr(arg1, CONTEXT_REG); @@ -4037,7 +4031,7 @@ static void sh2_generate_utils(void) emith_sh2_rcall(arg0, arg1, arg2, arg3); EMITH_SJMP_START(DCOND_CS); emith_and_r_r_c(DCOND_CC, arg0, arg3); - emith_read16_r_r_r_c(DCOND_CC, RET_REG, arg0, arg2); + emith_read16s_r_r_r_c(DCOND_CC, RET_REG, arg0, arg2); emith_ret_c(DCOND_CC); EMITH_SJMP_END(DCOND_CS); emith_move_r_r_ptr(arg1, CONTEXT_REG); diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 07e76cca8..d5cde5200 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -44,10 +44,10 @@ unsigned short scan_block(unsigned int base_pc, int is_slave, #define _DRC_DECLARE_SR(SR) __DRC_DECLARE_SR(SR) #define DRC_DECLARE_SR _DRC_DECLARE_SR(DRC_SR_REG) #define DRC_SAVE_SR(sh2) \ - if ((sh2->state & (SH2_STATE_RUN)) == SH2_STATE_RUN) \ + if ((sh2->state & (SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN) \ sh2->sr = sh2_sr; #define DRC_RESTORE_SR(sh2) \ - if ((sh2->state & (SH2_STATE_RUN)) == SH2_STATE_RUN) \ + if ((sh2->state & (SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN) \ sh2_sr = sh2->sr; #else #define DRC_DECLARE_SR diff --git a/cpu/sh2/mame/sh2.c b/cpu/sh2/mame/sh2.c index 2fb964b6c..fa49153aa 100644 --- a/cpu/sh2/mame/sh2.c +++ b/cpu/sh2/mame/sh2.c @@ -372,7 +372,7 @@ INLINE void BRA(sh2_state *sh2, UINT32 d) #if BUSY_LOOP_HACKS if (disp == 
-2) { - UINT32 next_opcode = RW( sh2, sh2->ppc & AM ); + UINT32 next_opcode = (UINT32)(UINT16)RW( sh2, sh2->ppc & AM ); /* BRA $ * NOP */ @@ -802,7 +802,7 @@ INLINE void DT(sh2_state *sh2, UINT32 n) sh2->sr &= ~T; #if BUSY_LOOP_HACKS { - UINT32 next_opcode = RW( sh2, sh2->ppc & AM ); + UINT32 next_opcode = (UINT32)(UINT16)RW( sh2, sh2->ppc & AM ); /* DT Rn * BF $-2 */ @@ -1049,12 +1049,12 @@ INLINE void MAC_W(sh2_state *sh2, UINT32 m, UINT32 n) INT32 tempm, tempn, dest, src, ans; UINT32 templ; - tempn = (INT32) RW( sh2, sh2->r[n] ); + tempn = (INT32)(INT16) RW( sh2, sh2->r[n] ); sh2->r[n] += 2; - tempm = (INT32) RW( sh2, sh2->r[m] ); + tempm = (INT32)(INT16) RW( sh2, sh2->r[m] ); sh2->r[m] += 2; templ = sh2->macl; - tempm = ((INT32) (short) tempn * (INT32) (short) tempm); + tempm = (tempn * tempm); if ((INT32) sh2->macl >= 0) dest = 0; else diff --git a/cpu/sh2/mame/sh2pico.c b/cpu/sh2/mame/sh2pico.c index f9d30d778..467b2adc9 100644 --- a/cpu/sh2/mame/sh2pico.c +++ b/cpu/sh2/mame/sh2pico.c @@ -121,7 +121,7 @@ int sh2_execute_interpreter(SH2 *sh2, int cycles) if (sh2->delay) { sh2->ppc = sh2->delay; - opcode = RW(sh2, sh2->delay); + opcode = (UINT32)(UINT16)RW(sh2, sh2->delay); // TODO: more branch types if ((opcode >> 13) == 5) { // BRA/BSR @@ -139,7 +139,7 @@ int sh2_execute_interpreter(SH2 *sh2, int cycles) else { sh2->ppc = sh2->pc; - opcode = RW(sh2, sh2->pc); + opcode = (UINT32)(UINT16)RW(sh2, sh2->pc); } sh2->delay = 0; @@ -232,13 +232,13 @@ int sh2_execute_interpreter(SH2 *sh2, int cycles) if (sh2->delay) { sh2->ppc = sh2->delay; - opcode = RW(sh2, sh2->delay); + opcode = (UINT32)(UINT16)RW(sh2, sh2->delay); sh2->pc -= 2; } else { sh2->ppc = sh2->pc; - opcode = RW(sh2, sh2->pc); + opcode = (UINT32)(UINT16)RW(sh2, sh2->pc); } sh2->delay = 0; diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 6a3b22229..8a4b53654 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -1279,19 +1279,19 @@ static u32 REGPARM(2) sh2_read8_cs0(u32 a, SH2 *sh2) 
elprintf_sh2(sh2, EL_32X, "r8 [%08x] %02x @%06x", a, d, sh2_pc(sh2)); DRC_RESTORE_SR(sh2); - return d; + return (s8)d; } static u32 REGPARM(2) sh2_read8_da(u32 a, SH2 *sh2) { - return sh2->data_array[(a & 0xfff) ^ 1]; + return (s8)sh2->data_array[(a & 0xfff) ^ 1]; } // for ssf2 static u32 REGPARM(2) sh2_read8_rom(u32 a, SH2 *sh2) { u32 bank = carthw_ssf2_banks[(a >> 19) & 7] << 19; - u8 *p = sh2->p_rom; + s8 *p = sh2->p_rom; return p[(bank + (a & 0x7ffff)) ^ 1]; } @@ -1340,18 +1340,18 @@ static u32 REGPARM(2) sh2_read16_cs0(u32 a, SH2 *sh2) a, d, sh2_pc(sh2)); out_noprint: DRC_RESTORE_SR(sh2); - return d; + return (s16)d; } static u32 REGPARM(2) sh2_read16_da(u32 a, SH2 *sh2) { - return ((u16 *)sh2->data_array)[(a & 0xffe) / 2]; + return ((s16 *)sh2->data_array)[(a & 0xffe) / 2]; } static u32 REGPARM(2) sh2_read16_rom(u32 a, SH2 *sh2) { u32 bank = carthw_ssf2_banks[(a >> 19) & 7] << 19; - u16 *p = sh2->p_rom; + s16 *p = sh2->p_rom; return p[(bank + (a & 0x7fffe)) / 2]; } @@ -1364,7 +1364,8 @@ static u32 REGPARM(2) sh2_read32_unmapped(u32 a, SH2 *sh2) static u32 REGPARM(2) sh2_read32_cs0(u32 a, SH2 *sh2) { - return (sh2_read16_cs0(a, sh2) << 16) | sh2_read16_cs0(a + 2, sh2); + u32 d1 = sh2_read16_cs0(a, sh2) << 16, d2 = sh2_read16_cs0(a + 2, sh2) << 16; + return d1 | (d2 >> 16); } static u32 REGPARM(2) sh2_read32_da(u32 a, SH2 *sh2) @@ -1631,7 +1632,7 @@ u32 REGPARM(2) p32x_sh2_read8(u32 a, SH2 *sh2) if (map_flag_set(p)) return ((sh2_read_handler *)(p << 1))(a, sh2); else - return *(u8 *)((p << 1) + ((a & sh2_map->mask) ^ 1)); + return *(s8 *)((p << 1) + ((a & sh2_map->mask) ^ 1)); } u32 REGPARM(2) p32x_sh2_read16(u32 a, SH2 *sh2) @@ -1644,7 +1645,7 @@ u32 REGPARM(2) p32x_sh2_read16(u32 a, SH2 *sh2) if (map_flag_set(p)) return ((sh2_read_handler *)(p << 1))(a, sh2); else - return *(u16 *)((p << 1) + (a & sh2_map->mask)); + return *(s16 *)((p << 1) + (a & sh2_map->mask)); } u32 REGPARM(2) p32x_sh2_read32(u32 a, SH2 *sh2) From a0bef37586c6d51cf5858bf4a7d29030466aa8cd 
Mon Sep 17 00:00:00 2001 From: kub Date: Sun, 28 Apr 2019 23:42:02 +0200 Subject: [PATCH 034/174] sh2 drc, code emitter cleanup, add ARM reorder stage to reduce interlock --- cpu/drc/emit_arm.c | 316 ++++++++++++++++++++++++++----------- cpu/drc/emit_x86.c | 74 ++++----- cpu/sh2/compiler.c | 22 ++- pico/carthw/svp/compiler.c | 1 + 4 files changed, 285 insertions(+), 128 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 586f0a540..bfce29fae 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -14,22 +14,130 @@ do { \ *(u32 *)ptr = x; \ ptr = (void *)((u8 *)ptr + sizeof(u32)); \ - COUNT_OP; \ } while (0) -#define EMIT(x) EMIT_PTR(tcache_ptr, x) +// ARM special registers and peephole optimization flags +#define SP 13 // stack pointer +#define LR 14 // link (return address) +#define PC 15 // program counter +#define SR 16 // CPSR, status register +#define MEM 17 // memory access (src=LDR, dst=STR) +#define CYC1 20 // 1 cycle interlock (LDR, reg-cntrld shift) +#define CYC2 21 // 2+ cycles interlock (LDR[BH], MUL/MLA etc) +#define SWAP 31 // swapped +#define NO 32 // token for "no register" + +// bitmask builders +#define M1(x) (u32)(1ULL<<(x)) // u32 to have NO evaluate to 0 +#define M2(x,y) (M1(x)|M1(y)) +#define M3(x,y,z) (M2(x,y)|M1(z)) +#define M4(x,y,z,a) (M3(x,y,z)|M1(a)) +#define M5(x,y,z,a,b) (M4(x,y,z,a)|M1(b)) +#define M10(a,b,c,d,e,f,g,h,i,j) (M5(a,b,c,d,e)|M5(f,g,h,i,j)) + +// peephole optimizer. 
ATM only tries to reduce interlock +#define EMIT_CACHE_SIZE 3 +struct emit_op { + u32 op; + u32 src, dst; +}; + +// peephole cache, last commited insn + cache + next insn + empty insn = size+3 +static struct emit_op emit_cache[EMIT_CACHE_SIZE+3]; +static int emit_index; +#define emith_insn_ptr() (u8 *)((u32 *)tcache_ptr-emit_index) + +static int emith_pool_index(int tcache_offs); +static void emith_pool_adjust(int pool_index, int move_offs); + +static NOINLINE void EMIT(u32 op, u32 dst, u32 src) +{ + void *emit_ptr = (u32 *)tcache_ptr - emit_index; + int i; -#define A_R4M (1 << 4) -#define A_R5M (1 << 5) -#define A_R6M (1 << 6) -#define A_R7M (1 << 7) -#define A_R8M (1 << 8) -#define A_R9M (1 << 9) -#define A_R10M (1 << 10) -#define A_R11M (1 << 11) -#define A_R12M (1 << 12) -#define A_R14M (1 << 14) -#define A_R15M (1 << 15) + EMIT_PTR(tcache_ptr, op); // emit to keep tcache_ptr current + COUNT_OP; + // for conditional execution SR is always source + if (op < 0xe0000000 /*A_COND_AL << 28*/) + src |= M1(SR); + // put insn on back of queue + emit_cache[emit_index+1].op = op; + emit_cache[emit_index+1].src = src & ~M1(NO); // mask away the NO token + emit_cache[emit_index+1].dst = dst & ~M1(NO); + // move insn down in the queue as long as permitted by dependencies + for (i = emit_index-1; i > 0; i--) { + struct emit_op *ptr = &emit_cache[i]; + int deps = 0; + // never swap branch insns (changes semantics) + if ((ptr[0].dst | ptr[1].dst) & M1(PC)) + continue; + // dst deps between 0 and 1 must not be swapped, since any deps + // but [0].src & [1].src lead to changed semantics if swapped. 
+ if ((ptr[0].dst & ptr[1].src) || (ptr[1].dst & ptr[0].src) || + (ptr[0].dst & ptr[1].dst)) + continue; +#if 1 + // just move loads as far up as possible + deps -= !!(ptr[1].src & M1(MEM)); + deps += !!(ptr[0].src & M1(MEM)); +#elif 0 + // treat all dest->src deps as a potential interlock +#define DEP_INSN(x,y) !!(ptr[x].dst & ptr[y].src) + // insn sequence: -1, 0, 1, 2 + deps -= DEP_INSN(1,2) + DEP_INSN(-1,0); + deps -= !!(ptr[1].src & M1(MEM)); // favour moving LDR's down + // insn sequence: -1, 1, 0, 2 + deps += DEP_INSN(0,2) + DEP_INSN(-1,1); + deps += !!(ptr[0].src & M1(SWAP)); // penalise if swapped +#else + // calculate ARM920T interlock cycles +#define DEP_CYC1(x,y) ((ptr[x].dst & ptr[y].src)&&(ptr[x].src & M1(CYC1))) +#define DEP_CYC2(x,y) ((ptr[x].dst & ptr[y].src)&&(ptr[x].src & M1(CYC2))) +#define DEP_INSN(x,y,z) DEP_CYC1(x,y)+DEP_CYC1(y,z)+2*DEP_CYC2(x,y)+DEP_CYC2(x,z) + // insn sequence: -1, 0, 1, 2 + deps -= DEP_INSN(0,1,2) + DEP_INSN(-1,0,1); + deps -= !!(ptr[1].src & M1(MEM)); // favour moving LDR's down + // insn sequence: -1, 1, 0, 2 + deps += DEP_INSN(0,2,1) + DEP_INSN(-1,1,0); + deps += !!(ptr[0].src & M1(SWAP)); // penalise multiple swaps +#endif + // swap if fewer depencies + if (deps < 0) { + // swap insn reading PC only if uncomitted pool load + struct emit_op tmp; + int i0 = -1, i1 = -1; + if ((!(ptr[0].src & M1(PC)) || + (i0 = emith_pool_index(emit_index+2 - i)) >= 0) && + (!(ptr[1].src & M1(PC)) || + (i1 = emith_pool_index(emit_index+1 - i)) >= 0)) { + // not using PC, or pool load + emith_pool_adjust(i0, 1); + emith_pool_adjust(i1, -1); + tmp = ptr[0], ptr[0] = ptr[1], ptr[1] = tmp; + ptr[0].src |= M1(SWAP); + } + } + } + if (emit_index <= EMIT_CACHE_SIZE) { + // queue not yet full + emit_index++; + } else { + // commit oldest insn from cache + EMIT_PTR(emit_ptr, emit_cache[1].op); + for (i = 0; i <= emit_index; i++) + emit_cache[i] = emit_cache[i+1]; + } +} + +static void emith_flush(void) +{ + int i; + void *emit_ptr = tcache_ptr - 
emit_index*sizeof(u32); + + for (i = 1; i <= emit_index; i++) + EMIT_PTR(emit_ptr, emit_cache[i].op); + emit_index = 0; +} #define A_COND_AL 0xe #define A_COND_EQ 0x0 @@ -96,12 +204,20 @@ #define A_OP_BIC 0xe #define A_OP_MVN 0xf -#define EOP_C_DOP_X(cond,op,s,rn,rd,shifter_op) \ - EMIT(((cond)<<28) | ((op)<< 21) | ((s)<<20) | ((rn)<<16) | ((rd)<<12) | (shifter_op)) +// operation specific register usage in DOP +#define A_Rn(op,rn) (((op)&0xd)!=0xd ? rn:NO) // no rn for MOV,MVN +#define A_Rd(op,rd) (((op)&0xc)!=0x8 ? rd:NO) // no rd for TST,TEQ,CMP,CMN +// CSPR is dst if S set, CSPR is src if op is ADC/SBC/RSC or shift is RRX +#define A_Sd(s) ((s) ? SR:NO) +#define A_Sr(op,sop) (((op)>=0x5 && (op)<=0x7) || (sop)>>4==A_AM1_ROR<<1 ? SR:NO) + +#define EOP_C_DOP_X(cond,op,s,rn,rd,sop,rm,rs) \ + EMIT(((cond)<<28) | ((op)<< 21) | ((s)<<20) | ((rn)<<16) | ((rd)<<12) | (sop), \ + M2(A_Rd(op,rd),A_Sd(s)), M5(A_Sr(op,sop),A_Rn(op,rn),rm,rs,rs==NO?NO:CYC1)) -#define EOP_C_DOP_IMM( cond,op,s,rn,rd,ror2,imm8) EOP_C_DOP_X(cond,op,s,rn,rd,A_AM1_IMM(ror2,imm8)) -#define EOP_C_DOP_REG_XIMM(cond,op,s,rn,rd,shift_imm,shift_op,rm) EOP_C_DOP_X(cond,op,s,rn,rd,A_AM1_REG_XIMM(shift_imm,shift_op,rm)) -#define EOP_C_DOP_REG_XREG(cond,op,s,rn,rd,rs, shift_op,rm) EOP_C_DOP_X(cond,op,s,rn,rd,A_AM1_REG_XREG(rs, shift_op,rm)) +#define EOP_C_DOP_IMM( cond,op,s,rn,rd,ror2,imm8) EOP_C_DOP_X(cond,op,s,rn,rd,A_AM1_IMM(ror2,imm8), NO, NO) +#define EOP_C_DOP_REG_XIMM(cond,op,s,rn,rd,shift_imm,shift_op,rm) EOP_C_DOP_X(cond,op,s,rn,rd,A_AM1_REG_XIMM(shift_imm,shift_op,rm), rm, NO) +#define EOP_C_DOP_REG_XREG(cond,op,s,rn,rd,rs, shift_op,rm) EOP_C_DOP_X(cond,op,s,rn,rd,A_AM1_REG_XREG(rs, shift_op,rm), rm, rs) #define EOP_MOV_IMM(rd, ror2,imm8) EOP_C_DOP_IMM(A_COND_AL,A_OP_MOV,0, 0,rd,ror2,imm8) #define EOP_MVN_IMM(rd, ror2,imm8) EOP_C_DOP_IMM(A_COND_AL,A_OP_MVN,0, 0,rd,ror2,imm8) @@ -161,16 +277,17 @@ /* addressing mode 2 */ #define EOP_C_AM2_IMM(cond,u,b,l,rn,rd,offset_12) \ - EMIT(((cond)<<28) | 
0x05000000 | ((u)<<23) | ((b)<<22) | ((l)<<20) | ((rn)<<16) | ((rd)<<12) | (offset_12)) + EMIT(((cond)<<28) | 0x05000000 | ((u)<<23) | ((b)<<22) | ((l)<<20) | ((rn)<<16) | ((rd)<<12) | \ + ((offset_12) & 0xfff), M1(l?rd:MEM), M3(rn,l?MEM:rd,l?b?CYC2:CYC1:NO)) #define EOP_C_AM2_REG(cond,u,b,l,rn,rd,shift_imm,shift_op,rm) \ EMIT(((cond)<<28) | 0x07000000 | ((u)<<23) | ((b)<<22) | ((l)<<20) | ((rn)<<16) | ((rd)<<12) | \ - ((shift_imm)<<7) | ((shift_op)<<5) | (rm)) + A_AM1_REG_XIMM(shift_imm, shift_op, rm), M1(l?rd:MEM), M4(rn,rm,l?MEM:rd,l?b?CYC2:CYC1:NO)) /* addressing mode 3 */ #define EOP_C_AM3(cond,u,r,l,rn,rd,s,h,immed_reg) \ EMIT(((cond)<<28) | 0x01000090 | ((u)<<23) | ((r)<<22) | ((l)<<20) | ((rn)<<16) | ((rd)<<12) | \ - ((s)<<6) | ((h)<<5) | (immed_reg)) + ((s)<<6) | ((h)<<5) | (immed_reg), M1(l?rd:MEM), M4(rn,r?NO:immed_reg,l?MEM:rd,l?CYC2:NO)) #define EOP_C_AM3_IMM(cond,u,l,rn,rd,s,h,offset_8) EOP_C_AM3(cond,u,1,l,rn,rd,s,h,(((offset_8)&0xf0)<<4)|((offset_8)&0xf)) @@ -206,60 +323,61 @@ /* ldm and stm */ #define EOP_XXM(cond,p,u,s,w,l,rn,list) \ - EMIT(((cond)<<28) | (1<<27) | ((p)<<24) | ((u)<<23) | ((s)<<22) | ((w)<<21) | ((l)<<20) | ((rn)<<16) | (list)) + EMIT(((cond)<<28) | (1<<27) | ((p)<<24) | ((u)<<23) | ((s)<<22) | ((w)<<21) | ((l)<<20) | ((rn)<<16) | (list), \ + M2(rn,l?NO:MEM)|(l?list:0), M3(rn,l?MEM:NO,l?CYC2:NO)|(l?0:list)) #define EOP_STMIA(rb,list) EOP_XXM(A_COND_AL,0,1,0,0,0,rb,list) #define EOP_LDMIA(rb,list) EOP_XXM(A_COND_AL,0,1,0,0,1,rb,list) -#define EOP_STMFD_SP(list) EOP_XXM(A_COND_AL,1,0,0,1,0,13,list) -#define EOP_LDMFD_SP(list) EOP_XXM(A_COND_AL,0,1,0,1,1,13,list) +#define EOP_STMFD_SP(list) EOP_XXM(A_COND_AL,1,0,0,1,0,SP,list) +#define EOP_LDMFD_SP(list) EOP_XXM(A_COND_AL,0,1,0,1,1,SP,list) /* branches */ #define EOP_C_BX(cond,rm) \ - EMIT(((cond)<<28) | 0x012fff10 | (rm)) + EMIT(((cond)<<28) | 0x012fff10 | (rm), M1(PC), M1(rm)) #define EOP_C_B_PTR(ptr,cond,l,signed_immed_24) \ EMIT_PTR(ptr, ((cond)<<28) | 0x0a000000 | ((l)<<24) | 
(signed_immed_24)) #define EOP_C_B(cond,l,signed_immed_24) \ - EOP_C_B_PTR(tcache_ptr,cond,l,signed_immed_24) + EMIT(((cond)<<28) | 0x0a000000 | ((l)<<24) | (signed_immed_24), M2(PC,l?LR:NO), M1(PC)) #define EOP_B( signed_immed_24) EOP_C_B(A_COND_AL,0,signed_immed_24) #define EOP_BL(signed_immed_24) EOP_C_B(A_COND_AL,1,signed_immed_24) /* misc */ #define EOP_C_MUL(cond,s,rd,rs,rm) \ - EMIT(((cond)<<28) | ((s)<<20) | ((rd)<<16) | ((rs)<<8) | 0x90 | (rm)) + EMIT(((cond)<<28) | ((s)<<20) | ((rd)<<16) | ((rs)<<8) | 0x90 | (rm), M2(rd,s?SR:NO), M3(rs,rm,CYC2)) #define EOP_C_UMULL(cond,s,rdhi,rdlo,rs,rm) \ - EMIT(((cond)<<28) | 0x00800000 | ((s)<<20) | ((rdhi)<<16) | ((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm)) + EMIT(((cond)<<28) | 0x00800000 | ((s)<<20) | ((rdhi)<<16) | ((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm), M3(rdhi,rdlo,s?SR:NO), M3(rs,rm,CYC2)) #define EOP_C_SMULL(cond,s,rdhi,rdlo,rs,rm) \ - EMIT(((cond)<<28) | 0x00c00000 | ((s)<<20) | ((rdhi)<<16) | ((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm)) + EMIT(((cond)<<28) | 0x00c00000 | ((s)<<20) | ((rdhi)<<16) | ((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm), M3(rdhi,rdlo,s?SR:NO), M3(rs,rm,CYC2)) #define EOP_C_SMLAL(cond,s,rdhi,rdlo,rs,rm) \ - EMIT(((cond)<<28) | 0x00e00000 | ((s)<<20) | ((rdhi)<<16) | ((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm)) + EMIT(((cond)<<28) | 0x00e00000 | ((s)<<20) | ((rdhi)<<16) | ((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm), M3(rdhi,rdlo,s?SR:NO), M5(rs,rm,rdlo,rdhi,CYC2)) #define EOP_MUL(rd,rm,rs) EOP_C_MUL(A_COND_AL,0,rd,rs,rm) // note: rd != rm #define EOP_C_MRS(cond,rd) \ - EMIT(((cond)<<28) | 0x010f0000 | ((rd)<<12)) + EMIT(((cond)<<28) | 0x010f0000 | ((rd)<<12), M1(rd), M1(SR)) #define EOP_C_MSR_IMM(cond,ror2,imm) \ - EMIT(((cond)<<28) | 0x0328f000 | ((ror2)<<8) | (imm)) // cpsr_f + EMIT(((cond)<<28) | 0x0328f000 | ((ror2)<<8) | (imm), M1(SR), 0) // cpsr_f #define EOP_C_MSR_REG(cond,rm) \ - EMIT(((cond)<<28) | 0x0128f000 | (rm)) // cpsr_f + EMIT(((cond)<<28) | 0x0128f000 | (rm), M1(SR), M1(rm)) // cpsr_f #define 
EOP_MRS(rd) EOP_C_MRS(A_COND_AL,rd) #define EOP_MSR_IMM(ror2,imm) EOP_C_MSR_IMM(A_COND_AL,ror2,imm) #define EOP_MSR_REG(rm) EOP_C_MSR_REG(A_COND_AL,rm) #define EOP_MOVW(rd,imm) \ - EMIT(0xe3000000 | ((rd)<<12) | ((imm)&0xfff) | (((imm)<<4)&0xf0000)) + EMIT(0xe3000000 | ((rd)<<12) | ((imm)&0xfff) | (((imm)<<4)&0xf0000), M1(rd), NO) #define EOP_MOVT(rd,imm) \ - EMIT(0xe3400000 | ((rd)<<12) | (((imm)>>16)&0xfff) | (((imm)>>12)&0xf0000)) + EMIT(0xe3400000 | ((rd)<<12) | (((imm)>>16)&0xfff) | (((imm)>>12)&0xf0000), M1(rd), NO) static inline int count_bits(unsigned val) { @@ -326,7 +444,7 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int } idx = emith_pool_literal(imm, &o); literal_insn[literal_iindex++] = (u32 *)tcache_ptr; - EOP_LDR_IMM2(cond, rd, 15, idx * sizeof(u32)); + EOP_LDR_IMM2(cond, rd, PC, idx * sizeof(u32)); if (o > 0) EOP_C_DOP_IMM(cond, A_OP_ADD, 0, rd, rd, 0, o); else if (o < 0) @@ -411,10 +529,10 @@ static int emith_xbranch(int cond, void *target, int is_call) #ifdef __EPOC32__ // elprintf(EL_SVP, "emitting indirect jmp %08x->%08x", tcache_ptr, target); if (is_call) - EOP_ADD_IMM(14,15,0,8); // add lr,pc,#8 - EOP_C_AM2_IMM(cond,1,0,1,15,15,0); // ldrcc pc,[pc] - EOP_MOV_REG_SIMPLE(15,15); // mov pc, pc - EMIT((u32)target); + EOP_ADD_IMM(LR,PC,0,8); // add lr,pc,#8 + EOP_C_AM2_IMM(cond,1,0,1,PC,PC,0); // ldrcc pc,[pc] + EOP_MOV_REG_SIMPLE(PC,PC); // mov pc, pc + EMIT((u32)target,M1(PC),0); #else // should never happen elprintf(EL_STATUS|EL_SVP|EL_ANOMALY, "indirect jmp %08x->%08x", target, tcache_ptr); @@ -438,6 +556,7 @@ static void emith_pool_commit(int jumpover) pool += sizeof(u32); emith_xbranch(A_COND_AL, (u8 *)pool + sz, 0); } + emith_flush(); // safety check - pool must be after insns and reachable if ((u32)(pool - (u8 *)literal_insn[0] + 8) > 0xfff) { elprintf(EL_STATUS|EL_SVP|EL_ANOMALY, @@ -466,12 +585,30 @@ static inline void emith_pool_check(void) emith_pool_commit(1); } +static inline int emith_pool_index(int 
tcache_offs) +{ + u32 *ptr = (u32 *)tcache_ptr - tcache_offs; + int i; + + for (i = literal_iindex-1; i >= 0 && literal_insn[i] >= ptr; i--) + if (literal_insn[i] == ptr) + return i; + return -1; +} + +static inline void emith_pool_adjust(int pool_index, int move_offs) +{ + if (pool_index >= 0) + literal_insn[pool_index] += move_offs; +} + #define JMP_POS(ptr) \ ptr = tcache_ptr; \ - tcache_ptr += sizeof(u32) + EMIT(0,M1(PC),0); #define JMP_EMIT(cond, ptr) { \ u32 val_ = (u32 *)tcache_ptr - (u32 *)(ptr) - 2; \ + emith_flush(); \ EOP_C_B_PTR(ptr, cond, 0, val_ & 0xffffff); \ } @@ -660,14 +797,14 @@ static inline void emith_pool_check(void) #define emith_tst_r_imm(r, imm) \ emith_top_imm(A_COND_AL, A_OP_TST, r, imm) -#define emith_cmp_r_imm(r, imm) { \ +#define emith_cmp_r_imm(r, imm) do { \ u32 op_ = A_OP_CMP, imm_ = (u8)imm; \ if ((s8)imm_ < 0) { \ imm_ = (u8)-imm_; \ op_ = A_OP_CMN; \ } \ emith_top_imm(A_COND_AL, op_, r, imm_); \ -} +} while (0) #define emith_subf_r_imm(r, imm) \ emith_op_imm(A_COND_AL, 1, A_OP_SUB, r, imm) @@ -693,12 +830,12 @@ static inline void emith_pool_check(void) #define emith_tst_r_imm_c(cond, r, imm) \ emith_top_imm(cond, A_OP_TST, r, imm) -#define emith_move_r_imm_s8(r, imm) { \ +#define emith_move_r_imm_s8(r, imm) do { \ if ((s8)(imm) < 0) \ EOP_MVN_IMM(r, 0, ((u8)(imm) ^ 0xff)); \ else \ EOP_MOV_IMM(r, 0, (u8)imm); \ -} +} while (0) #define emith_and_r_r_imm(d, s, imm) \ emith_op_imm2(A_COND_AL, 0, A_OP_AND, d, s, imm) @@ -752,11 +889,11 @@ static inline void emith_pool_check(void) EOP_MOV_REG(A_COND_AL,1,d,s,A_AM1_ASR,cnt) // note: only C flag updated correctly -#define emith_rolf(d, s, cnt) { \ +#define emith_rolf(d, s, cnt) do { \ EOP_MOV_REG(A_COND_AL,1,d,s,A_AM1_ROR,32-(cnt)); \ /* we don't have ROL so we shift to get the right carry */ \ EOP_TST_REG(A_COND_AL,d,d,A_AM1_LSR,1); \ -} +} while (0) #define emith_rorf(d, s, cnt) \ EOP_MOV_REG(A_COND_AL,1,d,s,A_AM1_ROR,cnt) @@ -770,12 +907,12 @@ static inline void 
emith_pool_check(void) #define emith_negcf_r_r(d, s) \ EOP_C_DOP_IMM(A_COND_AL,A_OP_RSC,1,s,d,0,0) -#define emith_mul(d, s1, s2) { \ +#define emith_mul(d, s1, s2) do { \ if ((d) != (s1)) /* rd != rm limitation */ \ EOP_MUL(d, s1, s2); \ else \ EOP_MUL(d, s2, s1); \ -} +} while (0) #define emith_mul_u64(dlo, dhi, s1, s2) \ EOP_C_UMULL(A_COND_AL,0,dhi,dlo,s1,s2) @@ -855,7 +992,7 @@ static inline void emith_pool_check(void) #define emith_ctx_do_multiple(op, r, offs, count, tmpr) do { \ int v_, r_ = r, c_ = count, b_ = CONTEXT_REG; \ for (v_ = 0; c_; c_--, r_++) \ - v_ |= 1 << r_; \ + v_ |= M1(r_); \ if ((offs) != 0) { \ EOP_ADD_IMM(tmpr,CONTEXT_REG,30/2,(offs)>>2);\ b_ = tmpr; \ @@ -869,7 +1006,7 @@ static inline void emith_pool_check(void) #define emith_ctx_write_multiple(r, offs, count, tmpr) \ emith_ctx_do_multiple(EOP_STMIA, r, offs, count, tmpr) -#define emith_clear_msb_c(cond, d, s, count) { \ +#define emith_clear_msb_c(cond, d, s, count) do { \ u32 t; \ if ((count) <= 8) { \ t = 8 - (count); \ @@ -883,24 +1020,24 @@ static inline void emith_pool_check(void) EOP_MOV_REG(cond,0,d,s,A_AM1_LSL,count); \ EOP_MOV_REG(cond,0,d,d,A_AM1_LSR,count); \ } \ -} +} while (0) #define emith_clear_msb(d, s, count) \ emith_clear_msb_c(A_COND_AL, d, s, count) -#define emith_sext(d, s, bits) { \ +#define emith_sext(d, s, bits) do { \ EOP_MOV_REG_LSL(d,s,32 - (bits)); \ EOP_MOV_REG_ASR(d,d,32 - (bits)); \ -} +} while (0) -#define emith_do_caller_regs(mask, func) { \ +#define emith_do_caller_regs(mask, func) do { \ u32 _reg_mask = (mask) & 0x500f; \ if (_reg_mask) { \ if (__builtin_parity(_reg_mask) == 1) \ _reg_mask |= 0x10; /* eabi align */ \ func(_reg_mask); \ } \ -} +} while (0) #define emith_save_caller_regs(mask) \ emith_do_caller_regs(mask, EOP_STMFD_SP) @@ -933,10 +1070,11 @@ static inline void emith_pool_check(void) *ptr_ = (*ptr_ & 0xff000000) | (val_ & 0x00ffffff); \ } while (0) -#define emith_jump_at(ptr, target) { \ +#define emith_jump_at(ptr, target) do { \ u32 val_ = 
(u32 *)(target) - (u32 *)(ptr) - 2; \ + emith_flush(); \ EOP_C_B_PTR(ptr, A_COND_AL, 0, val_ & 0xffffff); \ -} +} while (0) #define emith_jump_reg_c(cond, r) \ EOP_C_BX(cond, r) @@ -945,7 +1083,7 @@ static inline void emith_pool_check(void) emith_jump_reg_c(A_COND_AL, r) #define emith_jump_ctx_c(cond, offs) \ - EOP_LDR_IMM2(cond,15,CONTEXT_REG,offs) + EOP_LDR_IMM2(cond,PC,CONTEXT_REG,offs) #define emith_jump_ctx(offs) \ emith_jump_ctx_c(A_COND_AL, offs) @@ -956,30 +1094,30 @@ static inline void emith_pool_check(void) #define emith_call(target) \ emith_call_cond(A_COND_AL, target) -#define emith_call_reg(r) { \ - emith_move_r_r(14, 15); \ +#define emith_call_reg(r) do { \ + emith_move_r_r(LR, PC); \ EOP_C_BX(A_COND_AL, r); \ -} +} while (0) -#define emith_call_ctx(offs) { \ - emith_move_r_r(14, 15); \ +#define emith_call_ctx(offs) do { \ + emith_move_r_r(LR, PC); \ emith_jump_ctx(offs); \ -} +} while (0) #define emith_ret_c(cond) \ - emith_jump_reg_c(cond, 14) + emith_jump_reg_c(cond, LR) #define emith_ret() \ emith_ret_c(A_COND_AL) #define emith_ret_to_ctx(offs) \ - emith_ctx_write(14, offs) + emith_ctx_write(LR, offs) #define emith_push_ret() \ - EOP_STMFD_SP(A_R14M) + EOP_STMFD_SP(M1(LR)) #define emith_pop_and_ret() \ - EOP_LDMFD_SP(A_R15M) + EOP_LDMFD_SP(M1(PC)) #define host_instructions_updated(base, end) \ cache_flush_d_inval_i(base, end) @@ -990,30 +1128,30 @@ static inline void emith_pool_check(void) /* SH2 drc specific */ /* pushes r12 for eabi alignment */ #define emith_sh2_drc_entry() \ - EOP_STMFD_SP(A_R4M|A_R5M|A_R6M|A_R7M|A_R8M|A_R9M|A_R10M|A_R11M|A_R12M|A_R14M) + EOP_STMFD_SP(M10(4,5,6,7,8,9,10,11,12,LR)) #define emith_sh2_drc_exit() \ - EOP_LDMFD_SP(A_R4M|A_R5M|A_R6M|A_R7M|A_R8M|A_R9M|A_R10M|A_R11M|A_R12M|A_R15M) + EOP_LDMFD_SP(M10(4,5,6,7,8,9,10,11,12,PC)) // assumes a is in arg0, tab, func and mask are temp -#define emith_sh2_rcall(a, tab, func, mask) { \ +#define emith_sh2_rcall(a, tab, func, mask) do { \ emith_lsr(mask, a, SH2_READ_SHIFT); \ 
EOP_ADD_REG_LSL(tab, tab, mask, 3); \ - if (func < mask) EOP_LDMIA(tab, (1<>= count; \ if (d != s) \ emith_move_r_r(d, s); \ emith_and_r_imm(d, t); \ -} +} while (0) -#define emith_clear_msb_c(cond, d, s, count) { \ +#define emith_clear_msb_c(cond, d, s, count) do { \ (void)(cond); \ emith_clear_msb(d, s, count); \ -} +} while (0) -#define emith_sext(d, s, bits) { \ +#define emith_sext(d, s, bits) do { \ emith_lsl(d, s, 32 - (bits)); \ emith_asr(d, d, 32 - (bits)); \ -} +} while (0) #define emith_setc(r) do { \ assert(is_abcdx(r)); \ @@ -737,16 +737,16 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; } while (0) // assumes EBX is free -#define emith_ret_to_ctx(offs) { \ +#define emith_ret_to_ctx(offs) do { \ emith_pop(xBX); \ emith_ctx_write(xBX, offs); \ -} +} while (0) -#define emith_jump(ptr) { \ +#define emith_jump(ptr) do { \ u32 disp = (u8 *)(ptr) - ((u8 *)tcache_ptr + 5); \ EMIT_OP(0xe9); \ EMIT(disp, u32); \ -} +} while (0) #define emith_jump_patchable(target) \ emith_jump(target) @@ -767,17 +767,17 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; EMIT_PTR((u8 *)(ptr) + offs_, disp_ - offs_, u32); \ } while (0) -#define emith_jump_at(ptr, target) { \ +#define emith_jump_at(ptr, target) do { \ u32 disp_ = (u8 *)(target) - ((u8 *)(ptr) + 5); \ EMIT_PTR(ptr, 0xe9, u8); \ EMIT_PTR((u8 *)(ptr) + 1, disp_, u32); \ -} +} while (0) -#define emith_call(ptr) { \ +#define emith_call(ptr) do { \ u32 disp = (u8 *)(ptr) - ((u8 *)tcache_ptr + 5); \ EMIT_OP(0xe8); \ EMIT(disp, u32); \ -} +} while (0) #define emith_call_cond(cond, ptr) \ emith_call(ptr) @@ -889,18 +889,18 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; default: rd = xCX; break; \ } -#define emith_sh2_drc_entry() { \ +#define emith_sh2_drc_entry() do { \ emith_push(xBX); \ emith_push(xBP); \ emith_push(xSI); /* to align */ \ -} +} while (0) -#define emith_sh2_drc_exit() { \ +#define emith_sh2_drc_exit() do { \ emith_pop(xSI); \ emith_pop(xBP); \ emith_pop(xBX); \ emith_ret(); \ -} +} 
while (0) #else // _WIN32 @@ -912,22 +912,22 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; default: rd = 9; break; \ } -#define emith_sh2_drc_entry() { \ +#define emith_sh2_drc_entry() do { \ emith_push(xBX); \ emith_push(xBP); \ emith_push(xSI); \ emith_push(xDI); \ emith_add_r_r_ptr_imm(xSP, xSP, -8*5); \ -} +} while (0) -#define emith_sh2_drc_exit() { \ +#define emith_sh2_drc_exit() do { \ emith_add_r_r_ptr_imm(xSP, xSP, 8*5); \ emith_pop(xDI); \ emith_pop(xSI); \ emith_pop(xBP); \ emith_pop(xBX); \ emith_ret(); \ -} +} while (0) #endif // _WIN32 @@ -949,20 +949,20 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; default: rd = xBX; break; \ } -#define emith_sh2_drc_entry() { \ +#define emith_sh2_drc_entry() do { \ emith_push(xBX); \ emith_push(xBP); \ emith_push(xSI); \ emith_push(xDI); \ -} +} while (0) -#define emith_sh2_drc_exit() { \ +#define emith_sh2_drc_exit() do { \ emith_pop(xDI); \ emith_pop(xSI); \ emith_pop(xBP); \ emith_pop(xBX); \ emith_ret(); \ -} +} while (0) #endif @@ -982,7 +982,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; if ((mask) & (1 << xAX)) emith_pop(xAX); \ } while (0) -#define emith_sh2_rcall(a, tab, func, mask) { \ +#define emith_sh2_rcall(a, tab, func, mask) do { \ emith_lsr(mask, a, SH2_READ_SHIFT); \ EMIT_REX_IF(1, mask, tab); \ EMIT_OP_MODRM64(0x8d, 0, tab, 4); \ @@ -995,9 +995,9 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; EMIT_OP_MODRM64(0x8b, 1, mask, tab); \ EMIT(1 << PTR_SCALE, u8); /* mov mask, [tab + {4,8}] */ \ emith_add_r_r_ptr(func, func); \ -} +} while (0) -#define emith_sh2_wcall(a, val, tab, func) { \ +#define emith_sh2_wcall(a, val, tab, func) do { \ int arg2_; \ host_arg2reg(arg2_, 2); \ emith_lsr(func, a, SH2_WRITE_SHIFT); /* tmp = a >> WRT_SHIFT */ \ @@ -1006,9 +1006,9 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; EMIT_SIB64(PTR_SCALE, func, tab); /* mov tmp, [tab + tmp * {4,8}] */ \ emith_move_r_r_ptr(arg2_, CONTEXT_REG); \ emith_jump_reg(func); \ -} +} while (0) 
-#define emith_sh2_dtbf_loop() { \ +#define emith_sh2_dtbf_loop() do { \ u8 *jmp0; /* negative cycles check */ \ u8 *jmp1; /* unsinged overflow check */ \ int cr, rn; \ @@ -1032,15 +1032,15 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; emith_move_r_imm(rn, 0); \ JMP8_EMIT(ICOND_JA, jmp1); \ rcache_free_tmp(tmp_); \ -} +} while (0) -#define emith_write_sr(sr, srcr) { \ +#define emith_write_sr(sr, srcr) do { \ int tmp_ = rcache_get_tmp(); \ emith_clear_msb(tmp_, srcr, 22); \ emith_bic_r_imm(sr, 0x3ff); \ emith_or_r_r(sr, tmp_); \ rcache_free_tmp(tmp_); \ -} +} while (0) #define emith_tpop_carry(sr, is_sub) \ emith_lsr(sr, sr, 1) @@ -1055,7 +1055,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; * t = carry(Rn -= Rm) * T ^= t */ -#define emith_sh2_div1_step(rn, rm, sr) { \ +#define emith_sh2_div1_step(rn, rm, sr) do { \ u8 *jmp0, *jmp1; \ int tmp_ = rcache_get_tmp(); \ emith_eor_r_r(tmp_, tmp_); \ @@ -1069,7 +1069,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; emith_adc_r_r(tmp_, tmp_); \ emith_eor_r_r(sr, tmp_); \ rcache_free_tmp(tmp_); \ -} +} while (0) /* mh:ml += rn*rm, does saturation if required by S bit. 
rn, rm must be TEMP */ #define emith_sh2_macl(ml, mh, rn, rm, sr) do { \ @@ -1123,3 +1123,5 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_pool_check() /**/ #define emith_pool_commit(j) /**/ +#define emith_insn_ptr() ((u8 *)tcache_ptr) +#define emith_flush() /**/ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 517be81c2..85ce799bf 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -154,8 +154,8 @@ enum op_types { static u8 *tcache_dsm_ptrs[3]; static char sh2dasm_buff[64]; #define do_host_disasm(tcid) \ - host_dasm(tcache_dsm_ptrs[tcid], tcache_ptr - tcache_dsm_ptrs[tcid]); \ - tcache_dsm_ptrs[tcid] = tcache_ptr + host_dasm(tcache_dsm_ptrs[tcid], emith_insn_ptr() - tcache_dsm_ptrs[tcid]); \ + tcache_dsm_ptrs[tcid] = emith_insn_ptr() #else #define do_host_disasm(x) #endif @@ -2664,6 +2664,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); rcache_flush(); + emith_flush(); // make block entry v = block->entry_count; @@ -3933,7 +3934,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (target == NULL) return NULL; emith_jump_patchable(target); - } + } else + rcache_flush(); + emith_flush(); // link local branches for (i = 0; i < branch_patch_count; i++) { @@ -3996,21 +3999,25 @@ static void sh2_generate_utils(void) emith_move_r_r(arg1, arg1); // nop emith_move_r_r(arg2, arg2); // nop emith_move_r_r(arg3, arg3); // nop + emith_flush(); // sh2_drc_write8(u32 a, u32 d) sh2_drc_write8 = (void *)tcache_ptr; emith_ctx_read_ptr(arg2, offsetof(SH2, write8_tab)); emith_sh2_wcall(arg0, arg1, arg2, arg3); + emith_flush(); // sh2_drc_write16(u32 a, u32 d) sh2_drc_write16 = (void *)tcache_ptr; emith_ctx_read_ptr(arg2, offsetof(SH2, write16_tab)); emith_sh2_wcall(arg0, arg1, arg2, arg3); + emith_flush(); // sh2_drc_write32(u32 a, u32 d) sh2_drc_write32 = (void *)tcache_ptr; emith_ctx_read_ptr(arg2, offsetof(SH2, write32_tab)); 
emith_sh2_wcall(arg0, arg1, arg2, arg3); + emith_flush(); // d = sh2_drc_read8(u32 a) sh2_drc_read8 = (void *)tcache_ptr; @@ -4024,6 +4031,7 @@ static void sh2_generate_utils(void) EMITH_SJMP_END(DCOND_CS); emith_move_r_r_ptr(arg1, CONTEXT_REG); emith_jump_reg(arg2); + emith_flush(); // d = sh2_drc_read16(u32 a) sh2_drc_read16 = (void *)tcache_ptr; @@ -4036,6 +4044,7 @@ static void sh2_generate_utils(void) EMITH_SJMP_END(DCOND_CS); emith_move_r_r_ptr(arg1, CONTEXT_REG); emith_jump_reg(arg2); + emith_flush(); // d = sh2_drc_read32(u32 a) sh2_drc_read32 = (void *)tcache_ptr; @@ -4049,11 +4058,13 @@ static void sh2_generate_utils(void) EMITH_SJMP_END(DCOND_CS); emith_move_r_r_ptr(arg1, CONTEXT_REG); emith_jump_reg(arg2); + emith_flush(); // sh2_drc_exit(void) sh2_drc_exit = (void *)tcache_ptr; emit_do_static_regs(1, arg2); emith_sh2_drc_exit(); + emith_flush(); // sh2_drc_dispatcher(void) sh2_drc_dispatcher = (void *)tcache_ptr; @@ -4091,6 +4102,7 @@ static void sh2_generate_utils(void) emit_block_entry(); // XXX: can't translate, fail emith_call(dr_failure); + emith_flush(); // sh2_drc_test_irq(void) // assumes it's called from main function (may jump to dispatcher) @@ -4141,6 +4153,7 @@ static void sh2_generate_utils(void) #endif emith_jump(sh2_drc_dispatcher); rcache_invalidate(); + emith_flush(); // sh2_drc_entry(SH2 *sh2) sh2_drc_entry = (void *)tcache_ptr; @@ -4149,6 +4162,7 @@ static void sh2_generate_utils(void) emit_do_static_regs(0, arg2); emith_call(sh2_drc_test_irq); emith_jump(sh2_drc_dispatcher); + emith_flush(); #ifdef PDB_NET // debug @@ -4163,6 +4177,7 @@ static void sh2_generate_utils(void) emith_adc_r_imm(arg2, 0x01000000); \ emith_ctx_write(arg2, offsetof(SH2, pdb_io_csum[1])); \ emith_pop_and_ret(); \ + emith_flush(); \ func = tmp; \ } #define MAKE_WRITE_WRAPPER(func) { \ @@ -4175,6 +4190,7 @@ static void sh2_generate_utils(void) emith_ctx_write(arg2, offsetof(SH2, pdb_io_csum[1])); \ emith_move_r_r_ptr(arg2, CONTEXT_REG); \ emith_jump(func); \ + 
emith_flush(); \ func = tmp; \ } diff --git a/pico/carthw/svp/compiler.c b/pico/carthw/svp/compiler.c index 1ec71e754..06aa17919 100644 --- a/pico/carthw/svp/compiler.c +++ b/pico/carthw/svp/compiler.c @@ -1796,6 +1796,7 @@ void *ssp_translate_block(int pc) tr_flush_dirty_pmcrs(); block_end = emit_block_epilogue(ccount, end_cond, jump_pc, pc); emith_pool_commit(0); + emith_flush(); if (tcache_ptr - (u32 *)tcache > DRC_TCACHE_SIZE/4) { elprintf(EL_ANOMALY|EL_STATUS|EL_SVP, "tcache overflow!\n"); From 0b520c10140567d88654f220b6e17a4152e1e596 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 30 Apr 2019 21:18:12 +0200 Subject: [PATCH 035/174] sh2 drc, add loop detector, handle delay/idle loops --- cpu/drc/emit_arm.c | 35 +++++++++ cpu/drc/emit_x86.c | 56 ++++++++++++++ cpu/sh2/compiler.c | 177 ++++++++++++++++++++++++++++++++++++++------- cpu/sh2/compiler.h | 9 ++- 4 files changed, 247 insertions(+), 30 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index bfce29fae..37d5cf1b8 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -846,6 +846,9 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_add_r_r_ptr_imm(d, s, imm) \ emith_add_r_r_imm(d, s, imm) +#define emith_sub_r_r_imm_c(cond, d, s, imm) \ + emith_op_imm2(cond, 0, A_OP_SUB, d, s, (imm)) + #define emith_sub_r_r_imm(d, s, imm) \ emith_op_imm2(A_COND_AL, 0, A_OP_SUB, d, s, imm) @@ -1172,6 +1175,38 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) rcache_free_tmp(tmp_); \ } while (0) +#define emith_sh2_delay_loop(cycles, reg) do { \ + int sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); \ + int t1 = rcache_get_tmp(); \ + int t2 = rcache_get_tmp(); \ + int t3 = rcache_get_tmp(); \ + /* if (sr < 0) return */ \ + emith_asrf(t2, sr, 12); \ + EMITH_JMP_START(DCOND_LE); \ + /* turns = sr.cycles / cycles */ \ + emith_move_r_imm(t3, (u32)((1ULL<<32) / (cycles)) + 1); \ + emith_mul_u64(t1, t2, t2, t3); /* multiply by 1/x */ \ + rcache_free_tmp(t3); \ + 
if (reg >= 0) { \ + /* if (reg <= turns) turns = reg-1 */ \ + t3 = rcache_get_reg(reg, RC_GR_RMW, NULL); \ + emith_cmp_r_r(t3, t2); \ + emith_sub_r_r_imm_c(DCOND_LE, t2, t3, 1); \ + /* if (reg <= 1) turns = 0 */ \ + emith_cmp_r_imm(t3, 1); \ + emith_move_r_imm_c(DCOND_LE, t2, 0); \ + /* reg -= turns */ \ + emith_sub_r_r(t3, t2); \ + } \ + /* sr.cycles -= turns * cycles; */ \ + emith_move_r_imm(t1, cycles); \ + emith_mul(t1, t2, t1); \ + emith_sub_r_r_r_lsl(sr, sr, t1, 12); \ + EMITH_JMP_END(DCOND_LE); \ + rcache_free_tmp(t1); \ + rcache_free_tmp(t2); \ +} while (0) + #define emith_write_sr(sr, srcr) do { \ emith_lsr(sr, sr, 10); \ emith_or_r_r_r_lsl(sr, sr, srcr, 22); \ diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 10528abd3..b8354789c 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -293,6 +293,20 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; rcache_free_tmp(tmp_); \ } while (0) +#define emith_sub_r_r_r_lsl(d, s1, s2, lslimm) do { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsl(tmp_, s2, lslimm); \ + emith_sub_r_r_r(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ +} while (0) + +#define emith_or_r_r_r_lsl(d, s1, s2, lslimm) do { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsl(tmp_, s2, lslimm); \ + emith_or_r_r_r(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ +} while (0) + // _r_r_shift #define emith_or_r_r_lsl(d, s, lslimm) do { \ int tmp_ = rcache_get_tmp(); \ @@ -394,6 +408,10 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; emith_ror(d, s, cnt) #define emith_and_r_r_c(cond, d, s) \ emith_and_r_r(d, s); +#define emith_add_r_r_imm_c(cond, d, s, imm) \ + emith_add_r_r_imm(d, s, imm); +#define emith_sub_r_r_imm_c(cond, d, s, imm) \ + emith_sub_r_r_imm(d, s, imm); #define emith_read8_r_r_r_c(cond, r, rs, rm) \ emith_read8_r_r_r(r, rs, rm) @@ -1034,6 +1052,44 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; rcache_free_tmp(tmp_); \ } while (0) +#define emith_sh2_delay_loop(cycles, reg) do { \ + int sr = rcache_get_reg(SHR_SR, 
RC_GR_RMW, NULL); \ + int t1 = rcache_get_tmp(); \ + int t2 = rcache_get_tmp(); \ + int t3 = rcache_get_tmp(); \ + if (t3 == xAX) { t3 = t1; t1 = xAX; } /* for MUL */ \ + if (t3 == xDX) { t3 = t2; t2 = xDX; } \ + /* if (sr < 0) return */ \ + emith_asrf(t2, sr, 12); \ + EMITH_JMP_START(DCOND_LE); \ + /* turns = sr.cycles / cycles */ \ + emith_move_r_imm(t3, (u32)((1ULL<<32) / (cycles)) + 1); \ + emith_mul_u64(t1, t2, t2, t3); /* multiply by 1/x */ \ + rcache_free_tmp(t3); \ + if (reg >= 0) { \ + /* if (reg <= turns) turns = reg-1 */ \ + t3 = rcache_get_reg(reg, RC_GR_RMW, NULL); \ + emith_cmp_r_r(t3, t2); \ + EMITH_SJMP_START(DCOND_GT); \ + emith_sub_r_r_imm_c(DCOND_LE, t2, t3, 1); \ + EMITH_SJMP_END(DCOND_GT); \ + /* if (reg <= 1) turns = 0 */ \ + emith_cmp_r_imm(t3, 1); \ + EMITH_SJMP_START(DCOND_GT); \ + emith_move_r_imm_c(DCOND_LE, t2, 0); \ + EMITH_SJMP_END(DCOND_GT); \ + /* reg -= turns */ \ + emith_sub_r_r(t3, t2); \ + } \ + /* sr.cycles -= turns * cycles; */ \ + emith_move_r_imm(t1, cycles); \ + emith_mul_u64(t1, t2, t1, t2); \ + emith_sub_r_r_r_lsl(sr, sr, t1, 12); \ + EMITH_JMP_END(DCOND_LE); \ + rcache_free_tmp(t1); \ + rcache_free_tmp(t2); \ +} while (0) + #define emith_write_sr(sr, srcr) do { \ int tmp_ = rcache_get_tmp(); \ emith_clear_msb(tmp_, srcr, 22); \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 85ce799bf..fd75cc44f 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -41,6 +41,7 @@ #define BRANCH_CACHE 1 #define ALIAS_REGISTERS 1 #define REMAP_REGISTER 1 +#define LOOP_DETECTION 1 // limits (per block) #define MAX_BLOCK_SIZE (BLOCK_INSN_LIMIT * 6 * 6) @@ -135,6 +136,7 @@ enum op_types { OP_BRANCH_RF, // indirect far (PC + Rm) OP_SETCLRT, // T flag set/clear OP_MOVE, // register move + OP_LOAD_CONST,// load const to register OP_LOAD_POOL, // literal pool load, imm is address OP_MOVA, OP_SLEEP, @@ -147,6 +149,9 @@ enum op_types { #define OP_ISBRAUC(op) (BITMASK4(OP_BRANCH, OP_BRANCH_R, OP_BRANCH_RF, OP_RTE) \ & BITMASK1(op)) 
#define OP_ISBRACND(op) (BITMASK2(OP_BRANCH_CT, OP_BRANCH_CF) & BITMASK1(op)) +#define OP_ISBRAIMM(op) (BITMASK3(OP_BRANCH, OP_BRANCH_CT, OP_BRANCH_CF) \ + & BITMASK1(op)) +#define OP_ISBRAIND(op) (BITMASK2(OP_BRANCH_R, OP_BRANCH_RF) & BITMASK1(op)) #ifdef DRC_SH2 @@ -2537,7 +2542,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) u32 branch_patch_pc[MAX_LOCAL_BRANCHES]; int branch_patch_count = 0; u8 op_flags[BLOCK_INSN_LIMIT]; - struct { + struct drcf { + int delay_reg:8; + u32 loop_type:8; u32 test_irq:1; u32 pending_branch_direct:1; u32 pending_branch_indirect:1; @@ -2556,7 +2563,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) int tmp, tmp2; int cycles; int i, v; - u32 u; + u32 u, m1, m2; int op; u16 crc; @@ -2603,14 +2610,64 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } // collect branch_targets that don't land on delay slots + m1 = m2 = v = op = 0; for (pc = base_pc, i = 0; pc < end_pc; i++, pc += 2) { - if (!(op_flags[i] & OF_BTARGET)) - continue; - if (op_flags[i] & OF_DELAY_OP) { + if (op_flags[i] & OF_DELAY_OP) op_flags[i] &= ~OF_BTARGET; - continue; + if (op_flags[i] & OF_BTARGET) + ADD_TO_ARRAY(branch_target_pc, branch_target_count, pc, ); +#if LOOP_DETECTION + // loop types detected: + // 1. target: ... BRA target -> idle loop + // 2. target: ... delay insn ... BF target -> delay loop + // 3. target: ... poll insn ... BF/BT target -> poll loop + // 4. target: ... poll insn ... BF/BT exit ... BRA target, exit: -> poll + // conditions: + // a. no further branch targets between target and back jump. + // b. no unconditional branch insn inside the loop. + // c. exactly one poll or delay insn is allowed inside a delay/poll loop + // (scan_block marks loops only if they meet conditions a through c) + // d. idle loops do not modify anything but PC,SR and contain no branches + // e. delay/poll loops do not modify anything but the concerned reg,PC,SR + // f. 
loading constants into registers inside the loop is allowed + // g. a delay/poll loop must have a conditional branch somewhere + // h. an idle loop must not have a conditional branch + if (op_flags[i] & OF_BTARGET) { + // possible loop entry point + drcf.loop_type = op_flags[i] & OF_LOOP; + drcf.pending_branch_direct = drcf.pending_branch_indirect = 0; + op = OF_IDLE_LOOP; // loop type + v = i; + m1 = m2 = 0; + } + if (drcf.loop_type) { + // detect loop type, and store poll/delay register + if (op_flags[i] & OF_POLL_INSN) { + op = OF_POLL_LOOP; + m1 |= ops[i].dest; // loop poll/delay regs + } else if (op_flags[i] & OF_DELAY_INSN) { + op = OF_DELAY_LOOP; + m1 |= ops[i].dest; + } else if (ops[i].op != OP_LOAD_POOL && ops[i].op != OP_LOAD_CONST + && (ops[i].op != OP_MOVE || op != OF_POLL_LOOP)) { + // not (MOV @(PC) or MOV # or (MOV reg and poll)), condition f + m2 |= ops[i].dest; // regs modified by other insns + } + // branch detector + if (OP_ISBRAIMM(ops[i].op) && ops[i].imm == base_pc + 2*v) + drcf.pending_branch_direct = 1; // backward branch detected + if (OP_ISBRACND(ops[i].op)) + drcf.pending_branch_indirect = 1; // conditions g,h - cond.branch + // poll/idle loops terminate with their backwards branch to the loop start + if (drcf.pending_branch_direct && !(op_flags[i+1] & OF_DELAY_OP)) { + m2 &= ~(m1 | BITMASK2(SHR_PC, SHR_SR)); // conditions d,e + g,h + if (m2 || ((op == OF_IDLE_LOOP) == (drcf.pending_branch_indirect))) + op = 0; // conditions not met + op_flags[v] = (op_flags[v] & ~OF_LOOP) | op; // set loop type + drcf.loop_type = 0; + } } - ADD_TO_ARRAY(branch_target_pc, branch_target_count, pc, break); +#endif } if (branch_target_count > 0) { @@ -2634,6 +2691,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // clear stale state after compile errors rcache_invalidate(); + drcf = (struct drcf) { 0 }; // ------------------------------------------------- // 3rd pass: actual compilation @@ -2653,8 +2711,14 @@ static void REGPARM(2) 
*sh2_translate(SH2 *sh2, int tcache_id) #endif #if (DRC_DEBUG & 4) DasmSH2(sh2dasm_buff, pc, op); - printf("%c%08x %04x %s\n", (op_flags[i] & OF_BTARGET) ? '*' : ' ', - pc, op, sh2dasm_buff); + if (op_flags[i] & OF_BTARGET) { + if ((op_flags[i] & OF_LOOP) == OF_DELAY_LOOP) tmp3 = '+'; + else if ((op_flags[i] & OF_LOOP) == OF_POLL_LOOP) tmp3 = '='; + else if ((op_flags[i] & OF_LOOP) == OF_IDLE_LOOP) tmp3 = '~'; + else tmp3 = '*'; + } else if (drcf.loop_type) tmp3 = '.'; + else tmp3 = ' '; + printf("%c%08x %04x %s\n", tmp3, pc, op, sh2dasm_buff); #endif if (op_flags[i] & OF_BTARGET) @@ -2702,6 +2766,10 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) v = find_in_array(branch_target_pc, branch_target_count, pc); if (v >= 0) branch_target_ptr[v] = tcache_ptr; +#if LOOP_DETECTION + drcf.loop_type = op_flags[i] & OF_LOOP; + drcf.delay_reg = -1; +#endif // must update PC emit_move_r_imm32(SHR_PC, pc); @@ -3388,6 +3456,14 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto end_op; case 1: // DT Rn 0100nnnn00010000 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); +#if LOOP_DETECTION + if (drcf.loop_type == OF_DELAY_LOOP) { + if (drcf.delay_reg == -1) + drcf.delay_reg = GET_Rn(); + else + drcf.loop_type = 0; + } +#endif emith_bic_r_imm(sr, T); tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); emith_subf_r_r_imm(tmp, tmp2, 1); @@ -3832,7 +3908,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) drcf.test_irq = 0; } - // branch handling (with/without delay) + // branch handling if (drcf.pending_branch_direct) { struct op_data *opd_b = @@ -3846,6 +3922,16 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) ctaken = (op_flags[i] & OF_DELAY_OP) ? 
1 : 2; } cycles += ctaken; // assume branch taken +#if LOOP_DETECTION + if ((drcf.loop_type == OF_IDLE_LOOP || + (drcf.loop_type == OF_DELAY_LOOP && drcf.delay_reg >= 0))) + { + // idle or delay loop + emith_sh2_delay_loop(cycles, drcf.delay_reg); + drcf.loop_type = 0; + } +#endif + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); rcache_clean(); @@ -3902,6 +3988,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_add_r_imm(sr, ctaken << 12); drcf.pending_branch_direct = 0; + if (target_pc >= base_pc && target_pc < pc) + drcf.loop_type = 0; } else if (drcf.pending_branch_indirect) { sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); @@ -3909,6 +3997,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_flush(); emith_jump(sh2_drc_dispatcher); drcf.pending_branch_indirect = 0; + drcf.loop_type = 0; } do_host_disasm(tcache_id); @@ -4729,6 +4818,9 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, int end_block = 0; int i, i_end; u32 crc = 0; + // 2nd pass stuff + int last_btarget; // loop detector + enum { T_UNKNOWN, T_CLEAR, T_SET } t; // T propagation state memset(op_flags, 0, sizeof(*op_flags) * BLOCK_INSN_LIMIT); op_flags[0] |= OF_BTARGET; // block start is always a target @@ -4903,6 +4995,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x0e: // MOV.L @(R0,Rm),Rn 0000nnnnmmmm1110 opd->source = BITMASK3(GET_Rm(), SHR_R0, SHR_MEM); opd->dest = BITMASK1(GET_Rn()); + op_flags[i] |= OF_POLL_INSN; break; case 0x0f: // MAC.L @Rm+,@Rn+ 0000nnnnmmmm1111 opd->source = BITMASK6(GET_Rm(), GET_Rn(), SHR_SR, SHR_MACL, SHR_MACH, SHR_MEM); @@ -5027,6 +5120,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 1: // DT Rn 0100nnnn00010000 opd->source = BITMASK1(GET_Rn()); opd->dest = BITMASK2(GET_Rn(), SHR_T); + op_flags[i] |= OF_DELAY_INSN; break; default: goto undefined; @@ -5235,6 +5329,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 
*op_flags, u32 *end_pc_out, opd->source = BITMASK2(GET_Rm(), SHR_MEM); opd->dest = BITMASK1(GET_Rn()); opd->imm = (op & 0x0f) * 4; + op_flags[i] |= OF_POLL_INSN; break; ///////////////////////////////////////////// @@ -5252,6 +5347,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x02: // MOV.L @Rm,Rn 0110nnnnmmmm0010 opd->dest = BITMASK1(GET_Rn()); opd->source = BITMASK2(GET_Rm(), SHR_MEM); + op_flags[i] |= OF_POLL_INSN; break; case 0x0a: // NEGC Rm,Rn 0110nnnnmmmm1010 opd->source = BITMASK2(GET_Rm(), SHR_T); @@ -5394,6 +5490,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, opd->dest = BITMASK1(SHR_R0); opd->size = (op & 0x300) >> 8; opd->imm = (op & 0xff) << opd->size; + op_flags[i] |= OF_POLL_INSN; break; case 0x0300: // TRAPA #imm 11000011iiiiiiii opd->op = OP_TRAPA; @@ -5481,6 +5578,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, ///////////////////////////////////////////// case 0x0e: // MOV #imm,Rn 1110nnnniiiiiiii + opd->op = OP_LOAD_CONST; opd->dest = BITMASK1(GET_Rn()); opd->imm = (s8)op; break; @@ -5517,32 +5615,29 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, // 2nd pass: some analysis lowest_literal = end_literals = lowest_mova = 0; + t = T_UNKNOWN; + last_btarget = 0; + op = 0; // delay/poll insns counter for (i = 0, pc = base_pc; i < i_end; i++, pc += 2) { opd = &ops[i]; crc += FETCH_OP(pc); // propagate T (TODO: DIV0U) - if ((opd->op == OP_SETCLRT && !opd->imm) || opd->op == OP_BRANCH_CT) - op_flags[i + 1] |= OF_T_CLEAR; - else if ((opd->op == OP_SETCLRT && opd->imm) || opd->op == OP_BRANCH_CF) - op_flags[i + 1] |= OF_T_SET; - if ((op_flags[i] & OF_BTARGET) || (opd->dest & BITMASK1(SHR_T))) - op_flags[i] &= ~(OF_T_SET | OF_T_CLEAR); - else - op_flags[i + 1] |= op_flags[i] & (OF_T_SET | OF_T_CLEAR); + t = T_UNKNOWN; - if ((opd->op == OP_BRANCH_CT && (op_flags[i] & OF_T_CLEAR)) || - (opd->op == OP_BRANCH_CF && (op_flags[i] & 
OF_T_SET))) - opd->op = OP_BRANCH_N; - else if ((opd->op == OP_BRANCH_CT && (op_flags[i] & OF_T_SET)) || - (opd->op == OP_BRANCH_CF && (op_flags[i] & OF_T_CLEAR))) { + if ((opd->op == OP_BRANCH_CT && t == T_SET) || + (opd->op == OP_BRANCH_CF && t == T_CLEAR)) { opd->op = OP_BRANCH; - if (op_flags[i + 1] & OF_DELAY_OP) - opd->cycles = 2; - else - opd->cycles = 3; - } + opd->cycles = (op_flags[i + 1] & OF_DELAY_OP) ? 2 : 3; + } else if ((opd->op == OP_BRANCH_CT && t == T_CLEAR) || + (opd->op == OP_BRANCH_CF && t == T_SET)) + opd->op = OP_BRANCH_N; + else if ((opd->op == OP_SETCLRT && !opd->imm) || opd->op == OP_BRANCH_CT) + t = T_CLEAR; + else if ((opd->op == OP_SETCLRT && opd->imm) || opd->op == OP_BRANCH_CF) + t = T_SET; + // "overscan" detection: unreachable code after unconditional branch // this can happen if the insn after a forward branch isn't a local target if (OP_ISBRAUC(opd->op)) { @@ -5575,6 +5670,32 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, } } } +#if LOOP_DETECTION + // inner loop detection + // 1. a loop always starts with a branch target (for the backwards jump) + // 2. it doesn't contain more than one polling and/or delaying insn + // 3. it doesn't contain unconditional jumps + // 4. no overlapping of loops + if (op_flags[i] & OF_BTARGET) { + last_btarget = i; // possible loop starting point + op = 0; + } + // XXX let's hope nobody is putting a delay or poll insn in a delay slot :-/ + if (OP_ISBRAIMM(opd->op)) { + // BSR, BRA, BT, BF with immediate target + int i_tmp = (opd->imm - base_pc) / 2; // branch target, index in ops + if (i_tmp == last_btarget && op <= 1) { + op_flags[i_tmp] |= OF_LOOP; // conditions met -> mark loop + last_btarget = i+1; // condition 4 + } else if (opd->op == OP_BRANCH) + last_btarget = i+1; // condition 3 + } + else if (OP_ISBRAIND(opd->op)) + // BRAF, BSRF, JMP, JSR, register indirect. 
treat it as off-limits jump + last_btarget = i+1; // condition 3 + else if (op_flags[i] & (OF_POLL_INSN|OF_DELAY_INSN)) + op ++; // condition 2 +#endif } end_pc = base_pc + i_end * 2; diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index d5cde5200..b098f6c6b 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -18,9 +18,14 @@ void sh2_drc_frame(void); /* op_flags */ #define OF_DELAY_OP (1 << 0) #define OF_BTARGET (1 << 1) -#define OF_T_SET (1 << 2) // T is known to be set -#define OF_T_CLEAR (1 << 3) // ... clear +#define OF_LOOP (3 << 2) // NONE, IDLE, DELAY, POLL loop #define OF_B_IN_DS (1 << 4) +#define OF_DELAY_INSN (1 << 5) // DT, (TODO ADD+CMP?) +#define OF_POLL_INSN (1 << 6) // MOV @(...),Rn (no post increment), TST @(...) + +#define OF_IDLE_LOOP (1 << 2) +#define OF_DELAY_LOOP (2 << 2) +#define OF_POLL_LOOP (3 << 2) unsigned short scan_block(unsigned int base_pc, int is_slave, unsigned char *op_flags, unsigned int *end_pc, From 835adf871d2aae1f991c8be5f2402172747bbb2f Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 2 May 2019 23:16:55 +0200 Subject: [PATCH 036/174] sh2 drc, add detection for in-memory polling --- cpu/drc/emit_arm.c | 19 ++++-- cpu/drc/emit_x86.c | 37 +++++----- cpu/sh2/compiler.c | 94 +++++++++++++++++++++++--- cpu/sh2/sh2.h | 3 +- pico/32x/32x.c | 2 +- pico/32x/memory.c | 152 ++++++++++++++++++++++++++---------------- pico/32x/memory_arm.S | 23 ++----- pico/32x/sh2soc.c | 6 ++ pico/pico_int.h | 1 + 9 files changed, 224 insertions(+), 113 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 37d5cf1b8..1b429b352 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -636,9 +636,13 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define EMITH_SJMP3_MID(cond) EMITH_NOTHING1(cond) #define EMITH_SJMP3_END() +#define emith_move_r_r_c(cond, d, s) \ + EOP_MOV_REG(cond,0,d,s,A_AM1_LSL,0) #define emith_move_r_r(d, s) \ - EOP_MOV_REG_SIMPLE(d, s) + emith_move_r_r_c(A_COND_AL, d, s) +#define 
emith_move_r_r_ptr_c(cond, d, s) \ + emith_move_r_r_c(cond, d, s) #define emith_move_r_r_ptr(d, s) \ emith_move_r_r(d, s) @@ -1116,11 +1120,16 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_ret_to_ctx(offs) \ emith_ctx_write(LR, offs) -#define emith_push_ret() \ - EOP_STMFD_SP(M1(LR)) +/* pushes r12 for eabi alignment */ +#define emith_push_ret(r) do { \ + int r_ = (r >= 0 ? r : 12); \ + EOP_STMFD_SP(M2(r_,LR)); \ +} while (0) -#define emith_pop_and_ret() \ - EOP_LDMFD_SP(M1(PC)) +#define emith_pop_and_ret(r) do { \ + int r_ = (r >= 0 ? r : 12); \ + EOP_LDMFD_SP(M2(r_,PC)); \ +} while (0) #define host_instructions_updated(base, end) \ cache_flush_d_inval_i(base, end) diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index b8354789c..9dd062624 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -381,21 +381,12 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; emith_arith_r_imm(4, r, ~(imm)) // fake conditionals (using SJMP instead) -#define emith_move_r_imm_c(cond, r, imm) do { \ - (void)(cond); \ - emith_move_r_imm(r, imm); \ -} while (0) - -#define emith_add_r_imm_c(cond, r, imm) do { \ - (void)(cond); \ - emith_add_r_imm(r, imm); \ -} while (0) - -#define emith_sub_r_imm_c(cond, r, imm) do { \ - (void)(cond); \ - emith_sub_r_imm(r, imm); \ -} while (0) - +#define emith_move_r_imm_c(cond, r, imm) \ + emith_move_r_imm(r, imm); +#define emith_add_r_imm_c(cond, r, imm) \ + emith_add_r_imm(r, imm); +#define emith_sub_r_imm_c(cond, r, imm) \ + emith_sub_r_imm(r, imm); #define emith_or_r_imm_c(cond, r, imm) \ emith_or_r_imm(r, imm) #define emith_eor_r_imm_c(cond, r, imm) \ @@ -404,6 +395,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; emith_bic_r_imm(r, imm) #define emith_tst_r_imm_c(cond, r, imm) \ emith_tst_r_imm(r, imm) +#define emith_move_r_r_ptr_c(cond, d, s) \ + emith_move_r_r_ptr(d, s) #define emith_ror_c(cond, d, s, cnt) \ emith_ror(d, s, cnt) #define emith_and_r_r_c(cond, d, s) \ @@ -819,12 
+812,16 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; EMIT(offs, u32); \ } while (0) -#define emith_push_ret() \ - emith_push(xSI); /* to align */ +#define emith_push_ret(r) do { \ + int r_ = (r >= 0 ? r : xSI); \ + emith_push(r_); /* always push to align */ \ +} while (0) -#define emith_pop_and_ret() \ - emith_pop(xSI); \ - emith_ret() +#define emith_pop_and_ret(r) do { \ + int r_ = (r >= 0 ? r : xSI); \ + emith_pop(r_); \ + emith_ret(); \ +} while (0) #define EMITH_JMP_START(cond) { \ u8 *cond_ptr; \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index fd75cc44f..b7f54dd9e 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -532,6 +532,9 @@ static void (*sh2_drc_test_irq)(void); static u32 REGPARM(1) (*sh2_drc_read8)(u32 a); static u32 REGPARM(1) (*sh2_drc_read16)(u32 a); static u32 REGPARM(1) (*sh2_drc_read32)(u32 a); +static u32 REGPARM(1) (*sh2_drc_read8_poll)(u32 a); +static u32 REGPARM(1) (*sh2_drc_read16_poll)(u32 a); +static u32 REGPARM(1) (*sh2_drc_read32_poll)(u32 a); static void REGPARM(2) (*sh2_drc_write8)(u32 a, u32 d); static void REGPARM(2) (*sh2_drc_write16)(u32 a, u32 d); static void REGPARM(2) (*sh2_drc_write32)(u32 a, u32 d); @@ -540,6 +543,7 @@ static void REGPARM(2) (*sh2_drc_write32)(u32 a, u32 d); #define MF_SIZEMASK 0x03 // size of access #define MF_POSTINCR 0x10 // post increment (for read_rr) #define MF_PREDECR MF_POSTINCR // pre decrement (for write_rr) +#define MF_POLLING 0x20 // include polling check in read // address space stuff static int dr_is_rom(u32 a) @@ -2263,11 +2267,18 @@ static int emit_memhandler_read(int size) rcache_evict_vreg(guest_regs[SHR_SR].vreg); #endif - switch (size & MF_SIZEMASK) { - case 0: emith_call(sh2_drc_read8); break; // 8 - case 1: emith_call(sh2_drc_read16); break; // 16 - case 2: emith_call(sh2_drc_read32); break; // 32 - } + if (size & MF_POLLING) + switch (size & MF_SIZEMASK) { + case 0: emith_call(sh2_drc_read8_poll); break; // 8 + case 1: emith_call(sh2_drc_read16_poll); 
break; // 16 + case 2: emith_call(sh2_drc_read32_poll); break; // 32 + } + else + switch (size & MF_SIZEMASK) { + case 0: emith_call(sh2_drc_read8); break; // 8 + case 1: emith_call(sh2_drc_read16); break; // 16 + case 2: emith_call(sh2_drc_read32); break; // 32 + } rcache_invalidate_tmp(); return rcache_get_tmp_ret(); @@ -2545,6 +2556,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) struct drcf { int delay_reg:8; u32 loop_type:8; + u32 polling:8; u32 test_irq:1; u32 pending_branch_direct:1; u32 pending_branch_indirect:1; @@ -2769,6 +2781,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #if LOOP_DETECTION drcf.loop_type = op_flags[i] & OF_LOOP; drcf.delay_reg = -1; + drcf.polling = (drcf.loop_type == OF_POLL_LOOP ? MF_POLLING : 0); #endif // must update PC @@ -3176,7 +3189,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x0c: // MOV.B @(R0,Rm),Rn 0000nnnnmmmm1100 case 0x0d: // MOV.W @(R0,Rm),Rn 0000nnnnmmmm1101 case 0x0e: // MOV.L @(R0,Rm),Rn 0000nnnnmmmm1110 - emit_indirect_indexed_read(sh2, GET_Rn(), SHR_R0, GET_Rm(), op & 3); + emit_indirect_indexed_read(sh2, GET_Rn(), SHR_R0, GET_Rm(), (op & 3) | drcf.polling); goto end_op; case 0x0f: // MAC.L @Rm+,@Rn+ 0000nnnnmmmm1111 emit_indirect_read_double(sh2, &tmp, &tmp2, GET_Rn(), GET_Rm(), 2); @@ -3700,7 +3713,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) ///////////////////////////////////////////// case 0x05: // MOV.L @(disp,Rm),Rn 0101nnnnmmmmdddd - emit_memhandler_read_rr(sh2, GET_Rn(), GET_Rm(), (op & 0x0f) * 4, 2); + emit_memhandler_read_rr(sh2, GET_Rn(), GET_Rm(), (op & 0x0f) * 4, 2 | drcf.polling); goto end_op; ///////////////////////////////////////////// @@ -3713,7 +3726,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x04: // MOV.B @Rm+,Rn 0110nnnnmmmm0100 case 0x05: // MOV.W @Rm+,Rn 0110nnnnmmmm0101 case 0x06: // MOV.L @Rm+,Rn 0110nnnnmmmm0110 - tmp = ((op & 7) >= 4 && GET_Rn() != GET_Rm()) ? 
MF_POSTINCR : 0; + tmp = ((op & 7) >= 4 && GET_Rn() != GET_Rm()) ? MF_POSTINCR : drcf.polling; emit_memhandler_read_rr(sh2, GET_Rn(), GET_Rm(), 0, (op & 3) | tmp); goto end_op; case 0x03: // MOV Rm,Rn 0110nnnnmmmm0011 @@ -3791,7 +3804,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x0400: // MOV.B @(disp,Rm),R0 10000100mmmmdddd case 0x0500: // MOV.W @(disp,Rm),R0 10000101mmmmdddd tmp = (op & 0x100) >> 8; - emit_memhandler_read_rr(sh2, SHR_R0, GET_Rm(), (op & 0x0f) << tmp, tmp); + emit_memhandler_read_rr(sh2, SHR_R0, GET_Rm(), (op & 0x0f) << tmp, tmp | drcf.polling); goto end_op; case 0x0800: // CMP/EQ #imm,R0 10001000iiiiiiii tmp2 = rcache_get_reg(SHR_R0, RC_GR_READ, NULL); @@ -3817,7 +3830,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x0500: // MOV.W @(disp,GBR),R0 11000101dddddddd case 0x0600: // MOV.L @(disp,GBR),R0 11000110dddddddd tmp = (op & 0x300) >> 8; - emit_memhandler_read_rr(sh2, SHR_R0, SHR_GBR, (op & 0xff) << tmp, tmp); + emit_memhandler_read_rr(sh2, SHR_R0, SHR_GBR, (op & 0xff) << tmp, tmp | drcf.polling); goto end_op; case 0x0800: // TST #imm,R0 11001000iiiiiiii tmp = rcache_get_reg(SHR_R0, RC_GR_READ, NULL); @@ -3843,7 +3856,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } goto end_op; case 0x0c00: // TST.B #imm,@(R0,GBR) 11001100iiiiiiii - tmp = emit_indirect_indexed_read(sh2, SHR_TMP, SHR_R0, SHR_GBR, 0); + tmp = emit_indirect_indexed_read(sh2, SHR_TMP, SHR_R0, SHR_GBR, 0 | drcf.polling); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_bic_r_imm(sr, T); emith_tst_r_imm(tmp, op & 0xff); @@ -4149,6 +4162,56 @@ static void sh2_generate_utils(void) emith_jump_reg(arg2); emith_flush(); + // d = sh2_drc_read8_poll(u32 a) + sh2_drc_read8_poll = (void *)tcache_ptr; + emith_ctx_read_ptr(arg1, offsetof(SH2, read8_map)); + emith_sh2_rcall(arg0, arg1, arg2, arg3); + EMITH_SJMP_START(DCOND_CC); + emith_move_r_r_ptr_c(DCOND_CS, arg1, CONTEXT_REG); + emith_jump_reg_c(DCOND_CS, arg2); 
+ EMITH_SJMP_END(DCOND_CC); + emith_and_r_r_r(arg1, arg0, arg3); + emith_eor_r_imm(arg1, 1); + emith_read8s_r_r_r(arg1, arg1, arg2); + emith_push_ret(arg1); + emith_move_r_r_ptr(arg1, CONTEXT_REG); + emith_call(p32x_sh2_poll_memory); + emith_pop_and_ret(RET_REG); + emith_flush(); + + // d = sh2_drc_read16_poll(u32 a) + sh2_drc_read16_poll = (void *)tcache_ptr; + emith_ctx_read_ptr(arg1, offsetof(SH2, read16_map)); + emith_sh2_rcall(arg0, arg1, arg2, arg3); + EMITH_SJMP_START(DCOND_CC); + emith_move_r_r_ptr_c(DCOND_CS, arg1, CONTEXT_REG); + emith_jump_reg_c(DCOND_CS, arg2); + EMITH_SJMP_END(DCOND_CC); + emith_and_r_r_r(arg1, arg0, arg3); + emith_read16s_r_r_r(arg1, arg1, arg2); + emith_push_ret(arg1); + emith_move_r_r_ptr(arg1, CONTEXT_REG); + emith_call(p32x_sh2_poll_memory); + emith_pop_and_ret(RET_REG); + emith_flush(); + + // d = sh2_drc_read32_poll(u32 a) + sh2_drc_read32_poll = (void *)tcache_ptr; + emith_ctx_read_ptr(arg1, offsetof(SH2, read32_map)); + emith_sh2_rcall(arg0, arg1, arg2, arg3); + EMITH_SJMP_START(DCOND_CC); + emith_move_r_r_ptr_c(DCOND_CS, arg1, CONTEXT_REG); + emith_jump_reg_c(DCOND_CS, arg2); + EMITH_SJMP_END(DCOND_CC); + emith_and_r_r_r(arg1, arg0, arg3); + emith_read_r_r_r(arg1, arg1, arg2); + emith_ror(arg1, arg1, 16); + emith_push_ret(arg1); + emith_move_r_r_ptr(arg1, CONTEXT_REG); + emith_call(p32x_sh2_poll_memory); + emith_pop_and_ret(RET_REG); + emith_flush(); + // sh2_drc_exit(void) sh2_drc_exit = (void *)tcache_ptr; emit_do_static_regs(1, arg2); @@ -4289,6 +4352,9 @@ static void sh2_generate_utils(void) MAKE_WRITE_WRAPPER(sh2_drc_write8); MAKE_WRITE_WRAPPER(sh2_drc_write16); MAKE_WRITE_WRAPPER(sh2_drc_write32); + MAKE_READ_WRAPPER(sh2_drc_read8_poll); + MAKE_READ_WRAPPER(sh2_drc_read16_poll); + MAKE_READ_WRAPPER(sh2_drc_read32_poll); #endif emith_pool_commit(0); @@ -4304,6 +4370,9 @@ static void sh2_generate_utils(void) host_dasm_new_symbol(sh2_drc_read8); host_dasm_new_symbol(sh2_drc_read16); host_dasm_new_symbol(sh2_drc_read32); + 
host_dasm_new_symbol(sh2_drc_read8_poll); + host_dasm_new_symbol(sh2_drc_read16_poll); + host_dasm_new_symbol(sh2_drc_read32_poll); #endif } @@ -5396,11 +5465,13 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, opd->source = BITMASK2(GET_Rm(), SHR_MEM); opd->dest = BITMASK1(SHR_R0); opd->imm = (op & 0x0f); + op_flags[i] |= OF_POLL_INSN; break; case 0x0500: // MOV.W @(disp,Rm),R0 10000101mmmmdddd opd->source = BITMASK2(GET_Rm(), SHR_MEM); opd->dest = BITMASK1(SHR_R0); opd->imm = (op & 0x0f) * 2; + op_flags[i] |= OF_POLL_INSN; break; case 0x0800: // CMP/EQ #imm,R0 10001000iiiiiiii opd->source = BITMASK1(SHR_R0); @@ -5539,6 +5610,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, opd->source = BITMASK3(SHR_GBR, SHR_R0, SHR_MEM); opd->dest = BITMASK1(SHR_T); opd->imm = op & 0xff; + op_flags[i] |= OF_POLL_INSN; opd->cycles = 3; break; case 0x0d00: // AND.B #imm,@(R0,GBR) 11001101iiiiiiii diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index 5a0661eaf..a3eb5b12b 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -42,9 +42,10 @@ typedef struct SH2_ unsigned int pdb_io_csum[2]; #define SH2_STATE_RUN (1 << 0) // to prevent recursion -#define SH2_STATE_SLEEP (1 << 1) +#define SH2_STATE_SLEEP (1 << 1) // temporarily stopped (DMA, IO, ...) 
#define SH2_STATE_CPOLL (1 << 2) // polling comm regs #define SH2_STATE_VPOLL (1 << 3) // polling VDP +#define SH2_STATE_RPOLL (1 << 4) // polling address in SDRAM unsigned int state; unsigned int poll_addr; int poll_cycles; diff --git a/pico/32x/32x.c b/pico/32x/32x.c index 4e8377eb3..19c6e0a6a 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -12,7 +12,7 @@ struct Pico32x Pico32x; SH2 sh2s[2]; -#define SH2_IDLE_STATES (SH2_STATE_CPOLL|SH2_STATE_VPOLL|SH2_STATE_SLEEP) +#define SH2_IDLE_STATES (SH2_STATE_CPOLL|SH2_STATE_VPOLL|SH2_STATE_RPOLL|SH2_STATE_SLEEP) static int REGPARM(2) sh2_irq_cb(SH2 *sh2, int level) { diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 8a4b53654..c385d1417 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -61,29 +61,37 @@ static void (*m68k_write16_io)(u32 a, u32 d); #define POLL_THRESHOLD 3 static struct { - u32 addr, cycles; + u32 addr1, addr2, cycles; int cnt; } m68k_poll; static int m68k_poll_detect(u32 a, u32 cycles, u32 flags) { int ret = 0; + // support polling on 2 addresses - seen in Wolfenstein + int match = (a - m68k_poll.addr1 <= 2 || a - m68k_poll.addr2 <= 2); - if (a - 2 <= m68k_poll.addr && m68k_poll.addr <= a + 2 - && cycles - m68k_poll.cycles <= 64 && !SekNotPolling) + if (match && cycles - m68k_poll.cycles <= 64 && !SekNotPolling) { - if (m68k_poll.cnt++ > POLL_THRESHOLD) { + // detect split 32bit access by same cycle count, and ignore those + if (cycles != m68k_poll.cycles && m68k_poll.cnt++ > POLL_THRESHOLD) { if (!(Pico32x.emu_flags & flags)) { elprintf(EL_32X, "m68k poll addr %08x, cyc %u", a, cycles - m68k_poll.cycles); - ret = 1; } Pico32x.emu_flags |= flags; + ret = 1; } } else { + // reset poll state in case of restart by interrupt + Pico32x.emu_flags &= ~(P32XF_68KCPOLL|P32XF_68KVPOLL); + SekSetStop(0); m68k_poll.cnt = 0; - m68k_poll.addr = a; + if (!match) { + m68k_poll.addr2 = m68k_poll.addr1; + m68k_poll.addr1 = a; + } SekNotPolling = 0; } m68k_poll.cycles = cycles; @@ -99,15 +107,15 @@ void 
p32x_m68k_poll_event(u32 flags) Pico32x.emu_flags &= ~flags; SekSetStop(0); } - m68k_poll.addr = m68k_poll.cnt = 0; + m68k_poll.addr1 = m68k_poll.addr2 = m68k_poll.cnt = 0; } -static void sh2_poll_detect(SH2 *sh2, u32 a, u32 flags, int maxcnt) +static void NOINLINE sh2_poll_detect(u32 a, SH2 *sh2, u32 flags, int maxcnt) { - int cycles_left = sh2_cycles_left(sh2); + u32 cycles_done = sh2_cycles_done_t(sh2); - if (a == sh2->poll_addr && sh2->poll_cycles - cycles_left <= 10) { - if (sh2->poll_cnt++ > maxcnt) { + if (a - sh2->poll_addr <= 2 && CYCLES_GE(sh2->poll_cycles+20, cycles_done)) { + if (sh2->poll_cycles != cycles_done && ++sh2->poll_cnt >= maxcnt) { if (!(sh2->state & flags)) elprintf_sh2(sh2, EL_32X, "state: %02x->%02x", sh2->state, sh2->state | flags); @@ -115,16 +123,22 @@ static void sh2_poll_detect(SH2 *sh2, u32 a, u32 flags, int maxcnt) sh2->state |= flags; sh2_end_run(sh2, 1); pevt_log_sh2(sh2, EVT_POLL_START); - return; +#ifdef DRC_SH2 + if ((a & 0xc6000000) == 0x06000000) { + unsigned char *p = sh2->p_drcblk_ram; + p[(a & 0x3ffff) >> SH2_DRCBLK_RAM_SHIFT] |= 0x80; + } +#endif } } - else + else if (!(sh2->state & (SH2_STATE_CPOLL|SH2_STATE_VPOLL|SH2_STATE_RPOLL))) { sh2->poll_cnt = 0; - sh2->poll_addr = a; - sh2->poll_cycles = cycles_left; + sh2->poll_addr = a; + } + sh2->poll_cycles = cycles_done; } -void p32x_sh2_poll_event(SH2 *sh2, u32 flags, u32 m68k_cycles) +void NOINLINE p32x_sh2_poll_event(SH2 *sh2, u32 flags, u32 m68k_cycles) { if (sh2->state & flags) { elprintf_sh2(sh2, EL_32X, "state: %02x->%02x", sh2->state, @@ -134,10 +148,17 @@ void p32x_sh2_poll_event(SH2 *sh2, u32 flags, u32 m68k_cycles) sh2->m68krcycles_done = m68k_cycles; pevt_log_sh2_o(sh2, EVT_POLL_END); + sh2->state &= ~flags; +#ifdef DRC_SH2 + if ((sh2->poll_addr & 0xc6000000) == 0x06000000) { + unsigned char *p = sh2->p_drcblk_ram; + p[(sh2->poll_addr & 0x3ffff) >> SH2_DRCBLK_RAM_SHIFT] &= ~0x80; + } +#endif } - sh2->state &= ~flags; - sh2->poll_addr = sh2->poll_cycles = 
sh2->poll_cnt = 0; + if (!(sh2->state & (SH2_STATE_CPOLL|SH2_STATE_VPOLL|SH2_STATE_RPOLL))) + sh2->poll_addr = sh2->poll_cycles = sh2->poll_cnt = 0; } static void sh2s_sync_on_read(SH2 *sh2) @@ -151,6 +172,14 @@ static void sh2s_sync_on_read(SH2 *sh2) p32x_sync_other_sh2(sh2, sh2->m68krcycles_done + C_SH2_TO_M68K(sh2, cycles)); } +void p32x_sh2_poll_memory(unsigned int a, SH2 *sh2) +{ + DRC_SAVE_SR(sh2); + sh2_poll_detect(a, sh2, SH2_STATE_RPOLL, 5); + sh2s_sync_on_read(sh2); + DRC_RESTORE_SR(sh2); +} + // SH2 faking //#define FAKE_SH2 #ifdef FAKE_SH2 @@ -567,7 +596,7 @@ static u32 p32x_sh2reg_read16(u32 a, SH2 *sh2) return (r[0] & P32XS_FM) | Pico32x.sh2_regs[0] | Pico32x.sh2irq_mask[sh2->is_slave]; case 0x04: // H count (often as comm too) - sh2_poll_detect(sh2, a, SH2_STATE_CPOLL, 3); + sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 7); sh2s_sync_on_read(sh2); return Pico32x.sh2_regs[4 / 2]; case 0x06: @@ -596,7 +625,7 @@ static u32 p32x_sh2reg_read16(u32 a, SH2 *sh2) // comm port if ((a & 0x30) == 0x20) { - sh2_poll_detect(sh2, a, SH2_STATE_CPOLL, 3); + sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 7); sh2s_sync_on_read(sh2); return r[a / 2]; } @@ -614,7 +643,7 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) u32 old; a &= 0x3f; - sh2->poll_addr = 0; + sh2->poll_cnt = 0; switch (a) { case 0x00: // FM @@ -695,6 +724,7 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) return; REG8IN16(r, a) = d; + sh2_end_run(sh2, 1); p32x_m68k_poll_event(P32XF_68KCPOLL); p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, sh2_cycles_done_m68k(sh2)); @@ -711,7 +741,7 @@ static void p32x_sh2reg_write16(u32 a, u32 d, SH2 *sh2) { a &= 0x3e; - sh2->poll_addr = 0; + sh2->poll_cnt = 0; // comm if ((a & 0x30) == 0x20) { @@ -720,6 +750,7 @@ static void p32x_sh2reg_write16(u32 a, u32 d, SH2 *sh2) return; Pico32x.regs[a / 2] = d; + sh2_end_run(sh2, 1); p32x_m68k_poll_event(P32XF_68KCPOLL); p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, sh2_cycles_done_m68k(sh2)); @@ -1251,7 
+1282,7 @@ static u32 REGPARM(2) sh2_read8_cs0(u32 a, SH2 *sh2) if ((a & 0x3fff0) == 0x4100) { d = p32x_vdp_read16(a); - sh2_poll_detect(sh2, a, SH2_STATE_VPOLL, 7); + sh2_poll_detect(a, sh2, SH2_STATE_VPOLL, 9); goto out_16to8; } @@ -1319,7 +1350,7 @@ static u32 REGPARM(2) sh2_read16_cs0(u32 a, SH2 *sh2) if ((a & 0x3fff0) == 0x4100) { d = p32x_vdp_read16(a); - sh2_poll_detect(sh2, a, SH2_STATE_VPOLL, 7); + sh2_poll_detect(a, sh2, SH2_STATE_VPOLL, 9); goto out; } @@ -1383,6 +1414,28 @@ static u32 REGPARM(2) sh2_read32_rom(u32 a, SH2 *sh2) } // writes +#ifdef DRC_SH2 +void NOINLINE sh2_sdram_checks(u32 a, int t, SH2 *sh2) +{ + int v = t & ~0x80; + + if (v) + sh2_drc_wcheck_ram(a, v, sh2); + if (t & 0x80) { + DRC_SAVE_SR(sh2); + sh2_end_run(sh2, 1); + p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_RPOLL, sh2_cycles_done_m68k(sh2)); + DRC_RESTORE_SR(sh2); + } +} + +void inline sh2_da_checks(u32 a, int t, SH2 *sh2) +{ + if (t) + sh2_drc_wcheck_da(a, t, sh2); +} +#endif + static void REGPARM(3) sh2_write_ignore(u32 a, u32 d, SH2 *sh2) { } @@ -1402,7 +1455,7 @@ static void REGPARM(3) sh2_write8_cs0(u32 a, u32 d, SH2 *sh2) if (Pico32x.regs[0] & P32XS_FM) { if ((a & 0x3fff0) == 0x4100) { - sh2->poll_addr = 0; + sh2->poll_cnt = 0; p32x_vdp_write8(a, d); goto out; } @@ -1431,38 +1484,26 @@ static void REGPARM(3) sh2_write8_dram(u32 a, u32 d, SH2 *sh2) static void REGPARM(3) sh2_write8_sdram(u32 a, u32 d, SH2 *sh2) { - u32 a1 = a & 0x3ffff; + u32 a1 = (a & 0x3ffff) ^ 1; + ((u8 *)sh2->p_sdram)[a1] = d; #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_ram; int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; if (t) - sh2_drc_wcheck_ram(a, t, sh2); + sh2_sdram_checks(a, t, sh2); #endif - ((u8 *)sh2->p_sdram)[a1 ^ 1] = d; -} - -static void REGPARM(3) sh2_write8_sdram_wt(u32 a, u32 d, SH2 *sh2) -{ - // xmen sync hack.. 
- if (a < 0x26000200) { - DRC_SAVE_SR(sh2); - sh2_end_run(sh2, 32); - DRC_RESTORE_SR(sh2); - } - - sh2_write8_sdram(a, d, sh2); } static void REGPARM(3) sh2_write8_da(u32 a, u32 d, SH2 *sh2) { - u32 a1 = a & 0xfff; + u32 a1 = (a & 0xfff) ^ 1; + sh2->data_array[a1] = d; #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_da; int t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; if (t) - sh2_drc_wcheck_da(a, t, sh2); + sh2_da_checks(a, t, sh2); #endif - sh2->data_array[a1 ^ 1] = d; } // write16 @@ -1481,7 +1522,7 @@ static void REGPARM(3) sh2_write16_cs0(u32 a, u32 d, SH2 *sh2) if (Pico32x.regs[0] & P32XS_FM) { if ((a & 0x3fff0) == 0x4100) { - sh2->poll_addr = 0; + sh2->poll_cnt = 0; p32x_vdp_write16(a, d, sh2); goto out; } @@ -1511,25 +1552,25 @@ static void REGPARM(3) sh2_write16_dram(u32 a, u32 d, SH2 *sh2) static void REGPARM(3) sh2_write16_sdram(u32 a, u32 d, SH2 *sh2) { u32 a1 = a & 0x3fffe; + ((u16 *)sh2->p_sdram)[a1 / 2] = d; #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_ram; int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; if (t) - sh2_drc_wcheck_ram(a, t, sh2); + sh2_sdram_checks(a, t, sh2); #endif - ((u16 *)sh2->p_sdram)[a1 / 2] = d; } static void REGPARM(3) sh2_write16_da(u32 a, u32 d, SH2 *sh2) { u32 a1 = a & 0xffe; + ((u16 *)sh2->data_array)[a1 / 2] = d; #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_da; int t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; if (t) - sh2_drc_wcheck_da(a, t, sh2); + sh2_da_checks(a, t, sh2); #endif - ((u16 *)sh2->data_array)[a1 / 2] = d; } static void REGPARM(3) sh2_write16_rom(u32 a, u32 d, SH2 *sh2) @@ -1580,31 +1621,31 @@ static void REGPARM(3) sh2_write32_dram(u32 a, u32 d, SH2 *sh2) static void REGPARM(3) sh2_write32_sdram(u32 a, u32 d, SH2 *sh2) { u32 a1 = a & 0x3fffc; + *(u32 *)(sh2->p_sdram + a1) = (d << 16) | (d >> 16); #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_ram; int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; if (t) - sh2_drc_wcheck_ram(a, t, sh2); + sh2_sdram_checks(a, t, sh2); int u = p[(a1+2) >> SH2_DRCBLK_RAM_SHIFT]; if (u) - sh2_drc_wcheck_ram(a+2, u, sh2); + sh2_sdram_checks(a+2, u, sh2); 
#endif - *(u32 *)(sh2->p_sdram + a1) = (d << 16) | (d >> 16); } static void REGPARM(3) sh2_write32_da(u32 a, u32 d, SH2 *sh2) { u32 a1 = a & 0xffc; + *((u32 *)sh2->data_array + a1/4) = (d << 16) | (d >> 16); #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_da; int t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; if (t) - sh2_drc_wcheck_da(a, t, sh2); + sh2_da_checks(a, t, sh2); int u = p[(a1+2) >> SH2_DRCBLK_DA_SHIFT]; if (u) - sh2_drc_wcheck_da(a+2, u, sh2); + sh2_da_checks(a+2, u, sh2); #endif - *((u32 *)sh2->data_array + a1/4) = (d << 16) | (d >> 16); } static void REGPARM(3) sh2_write32_rom(u32 a, u32 d, SH2 *sh2) @@ -2040,8 +2081,7 @@ void PicoMemSetup32x(void) sh2_read8_map[0x06/2].addr = sh2_read8_map[0x26/2].addr = sh2_read16_map[0x06/2].addr = sh2_read16_map[0x26/2].addr = sh2_read32_map[0x06/2].addr = sh2_read32_map[0x26/2].addr = MAP_MEMORY(Pico32xMem->sdram); - sh2_write8_map[0x06/2] = sh2_write8_sdram; - sh2_write8_map[0x26/2] = sh2_write8_sdram_wt; + sh2_write8_map[0x06/2] = sh2_write8_map[0x26/2] = sh2_write8_sdram; sh2_write16_map[0x06/2] = sh2_write16_map[0x26/2] = sh2_write16_sdram; sh2_write32_map[0x06/2] = sh2_write32_map[0x26/2] = sh2_write32_sdram; sh2_read8_map[0x06/2].mask = sh2_read8_map[0x26/2].mask = 0x03ffff; diff --git a/pico/32x/memory_arm.S b/pico/32x/memory_arm.S index 1082c7b7f..b449370b1 100644 --- a/pico/32x/memory_arm.S +++ b/pico/32x/memory_arm.S @@ -227,9 +227,9 @@ sh2_write32_sdram: ldrb r1, [ip, r3, lsr #SH2_RAM_SHIFT+1]! cmp r1, #0 beq 1f - stmfd sp!, {r0, r1, r2, ip} + stmfd sp!, {r0, r2, ip, lr} bl sh2_drc_wcheck_ram - ldmfd sp!, {r0, r1, r2, ip} + ldmfd sp!, {r0, r2, ip, lr} 1: ldrb r1, [ip, #1] cmp r1, #0 bxeq lr @@ -250,9 +250,9 @@ sh2_write32_da: ldrb r1, [ip, r3, lsr #SH2_DA_SHIFT+1]! 
cmp r1, #0 beq 1f - stmfd sp!, {r0, r1, r2, ip} + stmfd sp!, {r0, r2, ip, lr} bl sh2_drc_wcheck_da - ldmfd sp!, {r0, r1, r2, ip} + ldmfd sp!, {r0, r2, ip, lr} 1: ldrb r1, [ip, #1] cmp r1, #0 bxeq lr @@ -269,7 +269,6 @@ sh2_write32_dram: moveq r1, r1, ror #16 streq r1, [ip, r3, lsr #SH2_DRAM_SHIFT] bxeq lr -#if 1 ldr r0, [ip, r3, lsr #SH2_DRAM_SHIFT] mov r1, r1, ror #16 mov r2, #0 @@ -284,20 +283,6 @@ sh2_write32_dram: bic r0, r0, r2 orr r0, r0, r1 str r0, [ip, r3, lsr #SH2_DRAM_SHIFT] -#else - add ip, ip, r3, lsr #SH2_DRAM_SHIFT - tst r1, #0x00ff0000 - lsrne r3, r1, #16 - strneb r3, [ip, #0] - tst r1, #0xff000000 - lsrne r3, r1, #24 - strneb r3, [ip, #1] - tst r1, #0x000000ff - strneb r1, [ip, #2] - tst r1, #0x0000ff00 - lsrne r3, r1, #8 - strneb r3, [ip, #3] -#endif bx lr .pool diff --git a/pico/32x/sh2soc.c b/pico/32x/sh2soc.c index 4aae2a045..dd61a93bb 100644 --- a/pico/32x/sh2soc.c +++ b/pico/32x/sh2soc.c @@ -138,6 +138,7 @@ static void dmac_trigger(SH2 *sh2, struct dma_chan *chan) if (chan->chcr & DMA_AR) { // auto-request transfer + sh2->state |= SH2_STATE_SLEEP; while ((int)chan->tcr > 0) dmac_transfer_one(sh2, chan); dmac_transfer_complete(sh2, chan); @@ -237,6 +238,7 @@ u32 REGPARM(2) sh2_peripheral_read8(u32 a, SH2 *sh2) a &= 0x1ff; d = PREG8(r, a); + sh2->poll_cnt = 0; elprintf_sh2(sh2, EL_32XP, "peri r8 [%08x] %02x @%06x", a | ~0x1ff, d, sh2_pc(sh2)); return d; @@ -250,6 +252,7 @@ u32 REGPARM(2) sh2_peripheral_read16(u32 a, SH2 *sh2) a &= 0x1fe; d = r[(a / 2) ^ 1]; + sh2->poll_cnt = 0; elprintf_sh2(sh2, EL_32XP, "peri r16 [%08x] %04x @%06x", a | ~0x1ff, d, sh2_pc(sh2)); return d; @@ -258,9 +261,11 @@ u32 REGPARM(2) sh2_peripheral_read16(u32 a, SH2 *sh2) u32 REGPARM(2) sh2_peripheral_read32(u32 a, SH2 *sh2) { u32 d; + a &= 0x1fc; d = sh2->peri_regs[a / 4]; + sh2->poll_cnt = 0; elprintf_sh2(sh2, EL_32XP, "peri r32 [%08x] %08x @%06x", a | ~0x1ff, d, sh2_pc(sh2)); return d; @@ -472,6 +477,7 @@ static void dreq1_do(SH2 *sh2, struct dma_chan *chan) if 
((chan->dar & ~0xf) != 0x20004030) elprintf(EL_32XP|EL_ANOMALY, "dreq1: bad dar?: %08x\n", chan->dar); + sh2->state |= SH2_STATE_SLEEP; dmac_transfer_one(sh2, chan); if (chan->tcr == 0) dmac_transfer_complete(sh2, chan); diff --git a/pico/pico_int.h b/pico/pico_int.h index 831bfc725..2c55c941a 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -932,6 +932,7 @@ void Pico32xSwapDRAM(int b); void Pico32xMemStateLoaded(void); void p32x_update_banks(void); void p32x_m68k_poll_event(unsigned int flags); +void p32x_sh2_poll_memory(unsigned int a, SH2 *sh2); void p32x_sh2_poll_event(SH2 *sh2, unsigned int flags, unsigned int m68k_cycles); // 32x/draw.c From e9a3de1ed4a95d0da94425b94bddcebdb043b895 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 22 May 2019 21:01:00 +0200 Subject: [PATCH 037/174] sh2 drc, block management bugfixes and cleanup --- cpu/sh2/compiler.c | 159 ++++++++++++++++++++++----------------------- 1 file changed, 77 insertions(+), 82 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index b7f54dd9e..2e6aa7c9b 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -943,20 +943,14 @@ static struct block_desc *dr_find_inactive_block(int tcache_id, u16 crc, u32 addr, int size, u32 addr_lit, int size_lit) { struct block_list **head = &inactive_blocks[tcache_id]; - struct block_list *prev = NULL, *current = *head; + struct block_list *current; - for (; current != NULL; prev = current, current = current->next) { + for (current = *head; current != NULL; current = current->next) { struct block_desc *block = current->block; if (block->crc == crc && block->addr == addr && block->size == size && block->addr_lit == addr_lit && block->size_lit == size_lit) { - if (prev == NULL) - *head = current->next; - else - prev->next = current->next; - block->list = NULL; // should now be empty - current->next = blist_free; - blist_free = current; + rm_from_block_lists(block); return block; } } @@ -1031,6 +1025,47 @@ static void *dr_failure(void) exit(1); } 
+#if LINK_BRANCHES +static void dr_block_link(struct block_entry *be, struct block_link *bl, int emit_jump) +{ + dbg(2, "- %slink from %p to pc %08x entry %p", emit_jump ? "":"early ", + bl->jump, bl->target_pc, be->tcache_ptr); + + if (emit_jump) + emith_jump_patch(bl->jump, be->tcache_ptr); + // could sync arm caches here, but that's unnecessary + + // move bl to block_entry + bl->target = be; + bl->prev = NULL; + if (be->links) + be->links->prev = bl; + bl->next = be->links; + be->links = bl; +} + +static void dr_block_unlink(struct block_link *bl, int emit_jump) +{ + dbg(2,"- unlink from %p to pc %08x", bl->jump, bl->target_pc); + + if (bl->target) { + if (emit_jump) { + emith_jump_patch(bl->jump, sh2_drc_dispatcher); + // update cpu caches since the previous jump target doesn't exist anymore + host_instructions_updated(bl->jump, bl->jump+4); + } + + if (bl->prev) + bl->prev->next = bl->next; + else + bl->target->links = bl->next; + if (bl->next) + bl->next->prev = bl->prev; + bl->target = NULL; + } +} +#endif + static void *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_slave, int tcache_id) { #if LINK_BRANCHES @@ -1064,13 +1099,7 @@ static void *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_sla owner->o_links = bl; if (be != NULL) { - dbg(2, "- early link from %p to pc %08x entry %p", bl->jump, pc, be->tcache_ptr); - bl->target = be; - bl->prev = NULL; - if (be->links) - be->links->prev = bl; - bl->next = be->links; - be->links = bl; + dr_block_link(be, bl, 0); // jump not yet emitted by translate() return be->tcache_ptr; } else { @@ -1092,23 +1121,12 @@ static void dr_link_blocks(struct block_entry *be, int tcache_id) while (bl != NULL) { next = bl->next; - if (bl->target_pc == pc) { - dbg(2, "- link from %p to pc %08x entry %p", bl->jump, pc, be->tcache_ptr); - // move bl from unresolved_links to block_entry - rm_from_hashlist_unresolved(bl, tcache_id); - - emith_jump_patch(bl->jump, be->tcache_ptr); - bl->target = be; - 
bl->prev = NULL; - if (be->links) - be->links->prev = bl; - bl->next = be->links; - be->links = bl; + if (bl->target_pc == pc && (!bl->tcache_id || bl->tcache_id == tcache_id)) { + rm_from_hashlist_unresolved(bl, bl->tcache_id); + dr_block_link(be, bl, 1); } bl = next; } - - // could sync arm caches here, but that's unnecessary #endif } @@ -1119,22 +1137,13 @@ static void dr_link_outgoing(struct block_entry *be, int tcache_id, int is_slave int target_tcache_id; for (bl = be->o_links; bl; bl = bl->o_next) { - be = dr_get_entry(bl->target_pc, is_slave, &target_tcache_id); - if (!target_tcache_id || target_tcache_id == tcache_id) { - if (be) { - dbg(2, "- link from %p to pc %08x entry %p", bl->jump, bl->target_pc, be->tcache_ptr); - emith_jump_patch(bl->jump, be->tcache_ptr); - bl->target = be; - bl->prev = NULL; - if (be->links) - be->links->prev = bl; - bl->next = be->links; - be->links = bl; - } else { - emith_jump_patch(bl->jump, sh2_drc_dispatcher); - add_to_hashlist_unresolved(bl, tcache_id); + if (bl->target == NULL) { + be = dr_get_entry(bl->target_pc, is_slave, &target_tcache_id); + if (be != NULL && (!target_tcache_id || target_tcache_id == tcache_id)) { + // remove bl from unresolved_links (must've been since target was NULL) + rm_from_hashlist_unresolved(bl, bl->tcache_id); + dr_block_link(be, bl, 1); } - host_instructions_updated(bl->jump, bl->jump+4); } } #endif @@ -4381,65 +4390,48 @@ static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nol struct block_link *bl; u32 i; - dbg(2, " killing entry %08x-%08x,%08x-%08x, blkid %d,%d", + free = free || nolit; // block is invalid if literals are overwritten + dbg(2," %sing block %08x-%08x,%08x-%08x, blkid %d,%d", free?"delet":"disabl", bd->addr, bd->addr + bd->size, bd->addr_lit, bd->addr_lit + bd->size_lit, tcache_id, bd - block_tables[tcache_id]); if (bd->addr == 0 || bd->entry_count == 0) { dbg(1, " killing dead block!? 
%08x", bd->addr); return; } - free = free || nolit; // block is invalid if literals are overwritten - // remove from hash table, make incoming links unresolved, revoke outgoing links - for (i = 0; i < bd->entry_count; i++) { - if (bd->active) + // remove from hash table, make incoming links unresolved + if (bd->active) { + for (i = 0; i < bd->entry_count; i++) { rm_from_hashlist(&bd->entryp[i], tcache_id); - for (bl = bd->entryp[i].o_links; bl != NULL; ) { - if (bl->target) { - if (bl->prev) - bl->prev->next = bl->next; - else - bl->target->links = bl->next; - if (bl->next) - bl->next->prev = bl->prev; - bl->target = NULL; - } else if (bd->active) - rm_from_hashlist_unresolved(bl, tcache_id); - bl = bl->o_next; - } - - for (bl = bd->entryp[i].links; bl != NULL; ) { - struct block_link *bl_next = bl->next; - dbg(2, "- unlink from %p to pc %08x", bl->jump, bl->target_pc); - emith_jump_patch(bl->jump, sh2_drc_dispatcher); - // update cpu caches since the previous jump target doesn't exist anymore - host_instructions_updated(bl->jump, bl->jump+4); - - add_to_hashlist_unresolved(bl, tcache_id); - bl = bl_next; + while ((bl = bd->entryp[i].links) != NULL) { + dr_block_unlink(bl, 1); + add_to_hashlist_unresolved(bl, tcache_id); + } } - bd->entryp[i].links = NULL; - } - if (bd->active) dr_mark_memory(-1, bd, tcache_id, nolit); + add_to_block_list(&inactive_blocks[tcache_id], bd); + } + bd->active = 0; if (free) { - while ((bl = bd->entryp[0].o_links) != NULL) { - bd->entryp[0].o_links = bl->next; + // revoke outgoing links + for (bl = bd->entryp[0].o_links; bl != NULL; bl = bl->o_next) { + if (bl->target) + dr_block_unlink(bl, 0); + else + rm_from_hashlist_unresolved(bl, tcache_id); bl->jump = NULL; bl->next = blink_free[bl->tcache_id]; blink_free[bl->tcache_id] = bl; } bd->entryp[0].o_links = NULL; + // invalidate block rm_from_block_lists(bd); bd->addr = bd->size = bd->addr_lit = bd->size_lit = 0; bd->entry_count = 0; - } else { - 
add_to_block_list(&inactive_blocks[tcache_id], bd); } - bd->active = 0; } static void sh2_smc_rm_blocks(u32 a, int tcache_id, u32 shift) @@ -4454,10 +4446,12 @@ static void sh2_smc_rm_blocks(u32 a, int tcache_id, u32 shift) int removed = 0; #endif - // need to check cached and writethrough area + // ignore cache-through a &= wtmask; + blist = &inval_lookup[tcache_id][(a & mask) / INVAL_PAGE_SIZE]; entry = *blist; + // go through the block list for this range while (entry != NULL) { next = entry->next; block = entry->block; @@ -4465,6 +4459,7 @@ static void sh2_smc_rm_blocks(u32 a, int tcache_id, u32 shift) end_addr = start_addr + block->size; start_lit = block->addr_lit & wtmask; end_lit = start_lit + block->size_lit; + // disable/delete block if it covers the modified address if ((start_addr <= a && a < end_addr) || (start_lit <= a && a < end_lit)) { From 79f45561feaeb0ae264f146dc3c00820b51f99df Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 22 May 2019 21:04:59 +0200 Subject: [PATCH 038/174] sh2 drc, register cache optimisations --- cpu/sh2/compiler.c | 317 ++++++++++++++++++++++++--------------------- pico/32x/memory.c | 26 ++++ pico/pico_int.h | 1 + 3 files changed, 195 insertions(+), 149 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 2e6aa7c9b..9160c90c6 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -138,10 +138,11 @@ enum op_types { OP_MOVE, // register move OP_LOAD_CONST,// load const to register OP_LOAD_POOL, // literal pool load, imm is address - OP_MOVA, - OP_SLEEP, - OP_RTE, - OP_TRAPA, + OP_MOVA, // MOVA instruction + OP_SLEEP, // SLEEP instruction + OP_RTE, // RTE instruction + OP_TRAPA, // TRAPA instruction + OP_LDC, // LDC instruction OP_UNDEFINED, }; @@ -552,31 +553,25 @@ static int dr_is_rom(u32 a) return (a & 0xc6000000) == 0x02000000 && (a & 0x3f0000) < 0x3e0000; } -static int dr_ctx_get_mem_ptr(u32 a, u32 *mask) +static int dr_ctx_get_mem_ptr(SH2 *sh2, u32 a, u32 *mask) { + void *memptr; int poffs = -1; - if 
((a & ~0x7ff) == 0) { - // BIOS + // check if region is mapped memory + memptr = p32x_sh2_get_mem_ptr(a, mask, sh2); + if (memptr == NULL /*|| (a & ((1 << SH2_READ_SHIFT)-1) & ~*mask) != 0*/) + return poffs; + + if (memptr == sh2->p_bios) // BIOS poffs = offsetof(SH2, p_bios); - *mask = 0x7ff; - } - else if ((a & 0xfffff000) == 0xc0000000) { - // data array + else if (memptr == sh2->p_da) // data array // FIXME: access sh2->data_array instead poffs = offsetof(SH2, p_da); - *mask = 0xfff; - } - else if ((a & 0xc6000000) == 0x06000000) { - // SDRAM + else if (memptr == sh2->p_sdram) // SDRAM poffs = offsetof(SH2, p_sdram); - *mask = 0x03ffff; - } - else if ((a & 0xc6000000) == 0x02000000) { - // ROM + else if (memptr == sh2->p_rom) // ROM poffs = offsetof(SH2, p_rom); - *mask = 0x3fffff; - } return poffs; } @@ -1365,6 +1360,7 @@ static u32 rcache_locked; static u32 rcache_hint_soon; static u32 rcache_hint_late; static u32 rcache_hint_write; +static u32 rcache_hint_clean; #define rcache_hint (rcache_hint_soon|rcache_hint_late) static void rcache_unmap_vreg(int x) @@ -1396,16 +1392,19 @@ static void rcache_clean_vreg(int x) emith_move_r_r(cache_regs[guest_regs[r].sreg].hreg, cache_regs[guest_regs[r].vreg].hreg); rcache_remove_vreg_alias(x, r); rcache_add_vreg_alias(guest_regs[r].sreg, r); + cache_regs[guest_regs[r].sreg].flags |= HRF_DIRTY; } else { // must evict since sreg is locked emith_ctx_write(cache_regs[x].hreg, r * 4); + guest_regs[r].flags &= ~GRF_DIRTY; guest_regs[r].vreg = -1; } } - } else + } else if (~rcache_hint_write & (1 << r)) { emith_ctx_write(cache_regs[x].hreg, r * 4); - } - guest_regs[r].flags &= ~GRF_DIRTY;) + guest_regs[r].flags &= ~GRF_DIRTY; + } + }) } } @@ -1654,7 +1653,7 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr (cache_regs[i].flags & HRF_LOCKED) || (cache_regs[i].type == HR_STATIC && !(guest_regs[r].flags & GRF_STATIC))) { // need to split up. 
take reg out here to avoid unnecessary writebacks - cache_regs[i].gregs &= ~(1 << r); + rcache_remove_vreg_alias(i, r); split = i; } else { // aliases not needed anytime soon, remove them @@ -1809,7 +1808,8 @@ static int rcache_get_reg_arg(int arg, sh2_reg_e r, int *hr) // r is needed later on anyway srcr = rcache_get_reg_(r, RC_GR_READ, 0, NULL); is_cached = (cache_regs[reg_map_host[srcr]].type == HR_CACHED); - } else if ((guest_regs[r].flags & GRF_CDIRTY) && gconst_get(r, &val)) { + } else if (!(rcache_hint_clean & (1 << r)) && + (guest_regs[r].flags & GRF_CDIRTY) && gconst_get(r, &val)) { // r has an uncomitted const - load into arg, but keep constant uncomitted srcr = dstr; is_const = 1; @@ -1822,7 +1822,7 @@ static int rcache_get_reg_arg(int arg, sh2_reg_e r, int *hr) srcr = dstr; if (rcache_static & (1 << r)) srcr = rcache_get_reg_(r, RC_GR_READ, 0, NULL); - else if (gconst_try_read(guest_regs[r].vreg, r)) + else if (gconst_try_read(dstid, r)) dirty = 1; else emith_ctx_read(srcr, r * 4); @@ -1856,14 +1856,18 @@ static int rcache_get_reg_arg(int arg, sh2_reg_e r, int *hr) } else if (hr != NULL) { // caller will modify arg, so it will soon be out of sync with r if (dirty || src_dirty) { - emith_ctx_write(dstr, r * 4); // must clean since arg will be modified - guest_regs[r].flags &= ~GRF_DIRTY; + if (~rcache_hint_write & (1 << r)) { + emith_ctx_write(dstr, r * 4); // must clean since arg will be modified + guest_regs[r].flags &= ~GRF_DIRTY; + } } - } else if (guest_regs[r].vreg < 0) { + } else { // keep arg as vreg for r cache_regs[dstid].type = HR_CACHED; - cache_regs[dstid].gregs = 1 << r; - guest_regs[r].vreg = dstid; + if (guest_regs[r].vreg < 0) { + cache_regs[dstid].gregs = 1 << r; + guest_regs[r].vreg = dstid; + } if (dirty || src_dirty) { // mark as modifed for cleaning later on cache_regs[dstid].flags |= HRF_DIRTY; guest_regs[r].flags |= GRF_DIRTY; @@ -2057,9 +2061,9 @@ static void rcache_clean_mask(u32 mask) { int i; - // XXX consider gconst? 
- if (!(mask &= ~rcache_static & ~gconst_dirty_mask())) + if (!(mask &= ~rcache_static)) return; + rcache_hint_clean |= mask; // clean only vregs where all aliases are covered by the mask for (i = 0; i < ARRAY_SIZE(cache_regs); i++) @@ -2120,7 +2124,7 @@ static void rcache_invalidate(void) } rcache_counter = 0; - rcache_hint_soon = rcache_hint_late = rcache_hint_write = 0; + rcache_hint_soon = rcache_hint_late = rcache_hint_write = rcache_hint_clean = 0; gconst_invalidate(); } @@ -2164,48 +2168,76 @@ static void rcache_init(void) // --------------------------------------------------------------- -static int emit_get_rbase_and_offs(SH2 *sh2, u32 a, u32 *offs) +// NB may return either REG or TEMP +static int emit_get_rbase_and_offs(SH2 *sh2, sh2_reg_e r, int rmod, u32 *offs) { - u32 omask = 0xff; // offset mask, XXX: ARM oriented.. + uptr omask = 0xff; // offset mask, XXX: ARM oriented.. u32 mask = 0; + u32 a; int poffs; - int hr; - unsigned long la; + int hr, hr2; + uptr la; - poffs = dr_ctx_get_mem_ptr(a, &mask); + // is r constant and points to a memory region? + if (! gconst_get(r, &a)) + return -1; + poffs = dr_ctx_get_mem_ptr(sh2, a, &mask); if (poffs == -1) return -1; - hr = rcache_get_tmp(); if (mask < 0x1000) { - // can't access data array or BIOS directly from ROM or SDRAM, - // since code may run on both SH2s (tcache_id of translation block needed)) + // data array or BIOS, can't safely access directly since translated code + // may run on both SH2s + hr = rcache_get_tmp(); emith_ctx_read_ptr(hr, poffs); + a += *offs; if (a & mask & ~omask) emith_add_r_r_ptr_imm(hr, hr, a & mask & ~omask); *offs = a & omask; + return hr; + } + + la = (uptr)*(void **)((char *)sh2 + poffs); + // accessing ROM or SDRAM, code location doesn't matter. The host address + // for these should be mmapped to be equal to the SH2 address. 
+ // if r is in rcache or needed soon anyway, and offs is relative to region + // use rcached const to avoid loading a literal on ARM + if ((guest_regs[r].vreg >= 0 || ((guest_regs[r].flags & GRF_CDIRTY) && + ((rcache_hint_soon|rcache_hint_clean) & (1 << r)))) && !(*offs & ~mask)) { + u32 odd = a & 1; // need to fix odd address for correct byte addressing + la -= (s32)((a & ~mask) - *offs - odd); // diff between reg and memory + // if reg is modified later on, allocate it RMW to remove aliases here + // else the aliases vreg stays locked and a vreg shortage may occur. + hr = hr2 = rcache_get_reg(r, rmod ? RC_GR_RMW : RC_GR_READ, NULL); + if ((la & ~omask) - odd) { + hr = rcache_get_tmp(); + emith_add_r_r_ptr_imm(hr, hr2, (la & ~omask) - odd); + } + *offs = (la & omask); } else { // known fixed host address - la = (unsigned long)*(void **)((char *)sh2 + poffs) + (a & mask); - *offs = la & omask; + la += (a + *offs) & mask; + hr = rcache_get_tmp(); emith_move_r_ptr_imm(hr, la & ~omask); + *offs = la & omask; } return hr; } // read const data from const ROM address -static int emit_get_rom_data(sh2_reg_e r, u32 offs, int size, u32 *val) +static int emit_get_rom_data(SH2 *sh2, sh2_reg_e r, u32 offs, int size, u32 *val) { - u32 tmp; + u32 a, mask; *val = 0; - if (gconst_get(r, &tmp)) { - tmp += offs; - if (dr_is_rom(tmp)) { + if (gconst_get(r, &a)) { + a += offs; + // check if rom is memory mapped (not bank switched), and address is in rom + if (dr_is_rom(a) && p32x_sh2_get_mem_ptr(a, &mask, sh2)) { switch (size & MF_SIZEMASK) { - case 0: *val = (s8)p32x_sh2_read8(tmp, sh2s); break; // 8 - case 1: *val = (s16)p32x_sh2_read16(tmp, sh2s); break; // 16 - case 2: *val = p32x_sh2_read32(tmp, sh2s); break; // 32 + case 0: *val = (s8)p32x_sh2_read8(a, sh2s); break; // 8 + case 1: *val = (s16)p32x_sh2_read16(a, sh2s); break; // 16 + case 2: *val = p32x_sh2_read32(a, sh2s); break; // 32 } return 1; } @@ -2315,10 +2347,10 @@ static void emit_memhandler_write(int size) static int 
emit_memhandler_read_rr(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rs, u32 offs, int size) { int hr, hr2; - u32 val, offs2; + u32 val; #if PROPAGATE_CONSTANTS - if (emit_get_rom_data(rs, offs, size, &val)) { + if (emit_get_rom_data(sh2, rs, offs, size, &val)) { if (rd == SHR_TMP) { hr2 = rcache_get_tmp(); emith_move_r_imm(hr2, val); @@ -2331,47 +2363,49 @@ static int emit_memhandler_read_rr(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rs, u32 off return hr2; } - if (gconst_get(rs, &val)) { - hr = emit_get_rbase_and_offs(sh2, val + offs, &offs2); - if (hr != -1) { - if (rd == SHR_TMP) - hr2 = rcache_get_tmp(); - else - hr2 = rcache_get_reg(rd, RC_GR_WRITE, NULL); - switch (size & MF_SIZEMASK) { - case 0: // 8 - emith_read8s_r_r_offs(hr2, hr, offs2 ^ 1); - break; - case 1: // 16 - emith_read16s_r_r_offs(hr2, hr, offs2); - break; - case 2: // 32 - emith_read_r_r_offs(hr2, hr, offs2); - emith_ror(hr2, hr2, 16); - break; - } + hr = emit_get_rbase_and_offs(sh2, rs, size & MF_POSTINCR, &offs); + if (hr != -1) { + if (rd == SHR_TMP) + hr2 = rcache_get_tmp(); + else + hr2 = rcache_get_reg(rd, RC_GR_WRITE, NULL); + switch (size & MF_SIZEMASK) { + case 0: emith_read8s_r_r_offs(hr2, hr, offs ^ 1); break; // 8 + case 1: emith_read16s_r_r_offs(hr2, hr, offs); break; // 16 + case 2: emith_read_r_r_offs(hr2, hr, offs); emith_ror(hr2, hr2, 16); break; + } + if (cache_regs[reg_map_host[hr]].type == HR_TEMP) // may also return REG rcache_free_tmp(hr); - if (size & MF_POSTINCR) + if (size & MF_POSTINCR) { + int isgc = gconst_get(rs, &val); + if (!isgc || guest_regs[rs].vreg >= 0) { + // already loaded + hr = rcache_get_reg(rs, RC_GR_RMW, NULL); + emith_add_r_r_imm(hr, hr, 1 << (size & MF_SIZEMASK)); + if (isgc) + gconst_set(rs, val + (1 << (size & MF_SIZEMASK))); + } else gconst_new(rs, val + (1 << (size & MF_SIZEMASK))); - return hr2; } + return hr2; } #endif - if (gconst_get(rs, &val) && (!(size & MF_POSTINCR) /*|| !(rcache_hint_soon & (1 << rs))*/)) { + + if (gconst_get(rs, &val) && 
guest_regs[rs].vreg < 0 && !(rcache_hint_soon & (1 << rs))) { hr = rcache_get_tmp_arg(0); emith_move_r_imm(hr, val + offs); if (size & MF_POSTINCR) gconst_new(rs, val + (1 << (size & MF_SIZEMASK))); - } else if (offs || (size & MF_POSTINCR)) { + } else if (size & MF_POSTINCR) { + hr = rcache_get_tmp_arg(0); + hr2 = rcache_get_reg(rs, RC_GR_RMW, NULL); + emith_add_r_r_imm(hr, hr2, offs); + emith_add_r_imm(hr2, 1 << (size & MF_SIZEMASK)); + } else { hr = rcache_get_reg_arg(0, rs, &hr2); if (offs || hr != hr2) emith_add_r_r_imm(hr, hr2, offs); - if (size & MF_POSTINCR) { - hr = rcache_get_reg(rs, RC_GR_WRITE, NULL); - emith_add_r_r_imm(hr, hr2, 1 << (size & MF_SIZEMASK)); - } - } else - rcache_get_reg_arg(0, rs, NULL); + } hr = emit_memhandler_read(size); size &= MF_SIZEMASK; @@ -2405,7 +2439,7 @@ static void emit_memhandler_write_rr(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rs, u32 o } else hr2 = rcache_get_reg_arg(1, rd, NULL); - if (gconst_get(rs, &val) && (!(size & MF_PREDECR) /*|| !(rcache_hint_soon & (1 << rs))*/)) { + if (gconst_get(rs, &val) && guest_regs[rs].vreg < 0 && !(rcache_hint_soon & (1 << rs))) { if (size & MF_PREDECR) { val -= 1 << (size & MF_SIZEMASK); gconst_new(rs, val); @@ -2551,7 +2585,7 @@ static void emit_block_entry(void) cycles = 0; \ } -static void *dr_get_pc_base(u32 pc, int is_slave); +static void *dr_get_pc_base(u32 pc, SH2 *sh2); static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { @@ -2591,7 +2625,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) base_pc = sh2->pc; // get base/validate PC - dr_pc_base = dr_get_pc_base(base_pc, sh2->is_slave); + dr_pc_base = dr_get_pc_base(base_pc, sh2); if (dr_pc_base == (void *)-1) { printf("invalid PC, aborting: %08x\n", base_pc); // FIXME: be less destructive @@ -2637,6 +2671,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) op_flags[i] &= ~OF_BTARGET; if (op_flags[i] & OF_BTARGET) ADD_TO_ARRAY(branch_target_pc, branch_target_count, pc, ); + if (ops[i].op == 
OP_LDC && (ops[i].dest & BITMASK1(SHR_SR)) && pc+2 < end_pc) + op_flags[i+1] |= OF_BTARGET; // RTE entrypoint in case of SR(IMASK) change #if LOOP_DETECTION // loop types detected: // 1. target: ... BRA target -> idle loop @@ -2930,7 +2966,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) u32 late = 0; // regs read by future ops u32 write = 0; // regs written to (to detect write before read) u32 soon = 0; // regs read soon - tmp = OP_ISBRANCH(opd[0].op); // branch insn detected + tmp = (OP_ISBRANCH(opd[0].op) || opd[0].op == OP_RTE || // branching insns + opd[0].op == OP_TRAPA || opd[0].op == OP_UNDEFINED); for (v = 1; v <= 9; v++) { // no sense in looking any further than the next rcache flush if (pc + 2*v < end_pc && !(op_flags[i+v] & OF_BTARGET) && @@ -2944,7 +2981,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_clean_mask(rcache_dirty_mask() & ~tmp2); break; } - // XXX must also include test-irq locations! tmp |= (OP_ISBRANCH(opd[v].op) || opd[v].op == OP_RTE || opd[v].op == OP_TRAPA || opd[v].op == OP_UNDEFINED); // regs needed in the next few instructions @@ -2953,7 +2989,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } rcache_set_hint_soon(late); // insns 1-3 rcache_set_hint_late(late & ~soon); // insns 4-9 - rcache_set_hint_write(write & ~(late|soon)); // next access is write + rcache_set_hint_write(write & ~(late|soon) & ~opd[0].source); + // overwritten without being used } rcache_set_locked(opd[0].source); // try not to evict src regs for this op @@ -2973,32 +3010,22 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case OP_BRANCH_R: if (opd->dest & BITMASK1(SHR_PR)) emit_move_r_imm32(SHR_PR, pc + 2); - if (gconst_get(opd->rm, &u)) { - opd->imm = u; - drcf.pending_branch_direct = 1; - } else { - emit_move_r_r(SHR_PC, opd->rm); - drcf.pending_branch_indirect = 1; - } + emit_move_r_r(SHR_PC, opd->rm); + drcf.pending_branch_indirect = 1; goto end_op; case OP_BRANCH_RF: - if 
(gconst_get(GET_Rn(), &u)) { - if (opd->dest & BITMASK1(SHR_PR)) - emit_move_r_imm32(SHR_PR, pc + 2); - opd->imm = pc + 2 + u; - drcf.pending_branch_direct = 1; - } else { - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); - tmp = rcache_get_reg(SHR_PC, RC_GR_WRITE, NULL); - emith_move_r_imm(tmp, pc + 2); - if (opd->dest & BITMASK1(SHR_PR)) { - tmp3 = rcache_get_reg(SHR_PR, RC_GR_WRITE, NULL); - emith_move_r_r(tmp3, tmp); - } - emith_add_r_r(tmp, tmp2); - drcf.pending_branch_indirect = 1; + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); + tmp = rcache_get_reg(SHR_PC, RC_GR_WRITE, NULL); + emith_move_r_imm(tmp, pc + 2); + if (opd->dest & BITMASK1(SHR_PR)) { + tmp3 = rcache_get_reg(SHR_PR, RC_GR_WRITE, NULL); + emith_move_r_r(tmp3, tmp); } + emith_add_r_r(tmp, tmp2); + if (gconst_get(GET_Rn(), &u)) + gconst_set(SHR_PC, pc + 2 + u); + drcf.pending_branch_indirect = 1; goto end_op; case OP_SLEEP: // SLEEP 0000000000011011 @@ -3041,10 +3068,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // obtain new PC emit_memhandler_read_rr(sh2, SHR_PC, SHR_VBR, opd->imm * 4, 2); // indirect jump -> back to dispatcher - sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - FLUSH_CYCLES(sr); - rcache_flush(); - emith_jump(sh2_drc_dispatcher); + drcf.pending_branch_indirect = 1; goto end_op; case OP_LOAD_POOL: @@ -3483,7 +3507,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (drcf.delay_reg == -1) drcf.delay_reg = GET_Rn(); else - drcf.loop_type = 0; + drcf.polling = drcf.loop_type = 0; } #endif emith_bic_r_imm(sr, T); @@ -3925,8 +3949,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emit_move_r_imm32(SHR_PC, pc); rcache_flush(); emith_call(sh2_drc_test_irq); - if (pc < end_pc) // mark next insns as entry point for RTE - op_flags[i+1] |= OF_BTARGET; drcf.test_irq = 0; } @@ -3950,7 +3972,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { // idle or delay loop emith_sh2_delay_loop(cycles, drcf.delay_reg); - 
drcf.loop_type = 0; + drcf.polling = drcf.loop_type = 0; } #endif @@ -4011,15 +4033,30 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) drcf.pending_branch_direct = 0; if (target_pc >= base_pc && target_pc < pc) - drcf.loop_type = 0; + drcf.polling = drcf.loop_type = 0; } else if (drcf.pending_branch_indirect) { + struct op_data *opd_b = + (op_flags[i] & OF_DELAY_OP) ? opd-1 : opd; + void *target; + u32 target_pc; + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); - rcache_flush(); - emith_jump(sh2_drc_dispatcher); + rcache_clean(); + if (gconst_get(SHR_PC, &target_pc)) { + // JMP const, treat like unconditional direct branch + target = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id); + if (target == NULL) + return NULL; + emith_jump_patchable(target); + } else { + // JMP + emith_jump(sh2_drc_dispatcher); + } + rcache_invalidate(); drcf.pending_branch_indirect = 0; - drcf.loop_type = 0; + drcf.polling = drcf.loop_type = 0; } do_host_disasm(tcache_id); @@ -4836,33 +4873,12 @@ void sh2_drc_finish(SH2 *sh2) #endif /* DRC_SH2 */ -static void *dr_get_pc_base(u32 pc, int is_slave) +static void *dr_get_pc_base(u32 pc, SH2 *sh2) { void *ret = NULL; u32 mask = 0; - if ((pc & ~0x7ff) == 0) { - // BIOS - ret = is_slave ? 
Pico32xMem->sh2_rom_s.w : Pico32xMem->sh2_rom_m.w; - mask = 0x7ff; - } - else if ((pc & 0xfffff000) == 0xc0000000) { - // data array - ret = sh2s[is_slave].data_array; - mask = 0xfff; - } - else if ((pc & 0xc6000000) == 0x06000000) { - // SDRAM - ret = Pico32xMem->sdram; - mask = 0x03ffff; - } - else if ((pc & 0xc6000000) == 0x02000000) { - // ROM - if ((pc & 0x3fffff) < Pico.romsize) - ret = Pico.rom; - mask = 0x3fffff; - } - + ret = p32x_sh2_get_mem_ptr(pc, &mask, sh2); if (ret == NULL) return (void *)-1; // NULL is valid value @@ -4889,7 +4905,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, memset(op_flags, 0, sizeof(*op_flags) * BLOCK_INSN_LIMIT); op_flags[0] |= OF_BTARGET; // block start is always a target - dr_pc_base = dr_get_pc_base(base_pc, is_slave); + dr_pc_base = dr_get_pc_base(base_pc, &sh2s[!!is_slave]); // 1st pass: disassemble for (i = 0, pc = base_pc; ; i++, pc += 2) { @@ -5274,14 +5290,17 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, break; case 0x07: // LDC.L @Rm+,SR 0100mmmm00000111 tmp = SHR_SR; + opd->op = OP_LDC; opd->cycles = 3; break; case 0x17: // LDC.L @Rm+,GBR 0100mmmm00010111 tmp = SHR_GBR; + opd->op = OP_LDC; opd->cycles = 3; break; case 0x27: // LDC.L @Rm+,VBR 0100mmmm00100111 tmp = SHR_VBR; + opd->op = OP_LDC; opd->cycles = 3; break; default: @@ -5372,7 +5391,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, default: goto undefined; } - opd->op = OP_MOVE; + opd->op = OP_LDC; opd->source = BITMASK1(GET_Rn()); opd->dest = BITMASK1(tmp); break; diff --git a/pico/32x/memory.c b/pico/32x/memory.c index c385d1417..578c72f83 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -1730,6 +1730,32 @@ void REGPARM(3) p32x_sh2_write32(u32 a, u32 d, SH2 *sh2) wh(a, d, sh2); } +void *p32x_sh2_get_mem_ptr(u32 a, u32 *mask, SH2 *sh2) +{ + const sh2_memmap *mm = sh2->read8_map; + void *ret = (void *)-1; + u32 am; + + mm += a >> SH2_READ_SHIFT; + am = a & ((1 
<< SH2_READ_SHIFT)-1); + if (!map_flag_set(mm->addr) && !(am & ~mm->mask)) { + // directly mapped memory (SDRAM, ROM, data array) + ret = (void *)(mm->addr << 1); + *mask = mm->mask; + } else if ((a & ~0x7ff) == 0) { + // BIOS, has handler function since it shares its segment with I/O + ret = sh2->is_slave ? Pico32xMem->sh2_rom_s.w : Pico32xMem->sh2_rom_m.w; + *mask = 0x7ff; + } else if ((a & 0xc6000000) == 0x02000000) { + // banked ROM. Return bank address + u32 bank = carthw_ssf2_banks[(a >> 19) & 7] << 19; + ret = sh2->p_rom + bank; + *mask = 0x07ffff; + } + + return ret; +} + // ----------------------------------------------------------------- static void z80_md_bank_write_32x(unsigned int a, unsigned char d) diff --git a/pico/pico_int.h b/pico/pico_int.h index 2c55c941a..4139e816f 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -933,6 +933,7 @@ void Pico32xMemStateLoaded(void); void p32x_update_banks(void); void p32x_m68k_poll_event(unsigned int flags); void p32x_sh2_poll_memory(unsigned int a, SH2 *sh2); +void *p32x_sh2_get_mem_ptr(unsigned int a, unsigned int *mask, SH2 *sh2); void p32x_sh2_poll_event(SH2 *sh2, unsigned int flags, unsigned int m68k_cycles); // 32x/draw.c From f08d47500b5aa7f95ea4038a2d2fd39e6bd8a3ad Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 22 May 2019 21:33:23 +0200 Subject: [PATCH 039/174] sh2 memory access improvements, revive ARM asm memory functions --- Makefile | 1 + pico/32x/memory.c | 211 +++++++++++++++++++++---------------- pico/32x/memory_arm.S | 23 ++-- platform/common/common.mak | 5 + 4 files changed, 140 insertions(+), 100 deletions(-) diff --git a/Makefile b/Makefile index 45fde98fb..4bc48780d 100644 --- a/Makefile +++ b/Makefile @@ -54,6 +54,7 @@ asm_misc ?= 1 asm_cdmemory ?= 1 asm_mix ?= 1 asm_32xdraw ?= 1 +asm_32xmemory ?= 1 else # if not arm use_fame ?= 1 use_cz80 ?= 1 diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 578c72f83..e05d74c91 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -1313,11 
+1313,6 @@ static u32 REGPARM(2) sh2_read8_cs0(u32 a, SH2 *sh2) return (s8)d; } -static u32 REGPARM(2) sh2_read8_da(u32 a, SH2 *sh2) -{ - return (s8)sh2->data_array[(a & 0xfff) ^ 1]; -} - // for ssf2 static u32 REGPARM(2) sh2_read8_rom(u32 a, SH2 *sh2) { @@ -1374,11 +1369,6 @@ static u32 REGPARM(2) sh2_read16_cs0(u32 a, SH2 *sh2) return (s16)d; } -static u32 REGPARM(2) sh2_read16_da(u32 a, SH2 *sh2) -{ - return ((s16 *)sh2->data_array)[(a & 0xffe) / 2]; -} - static u32 REGPARM(2) sh2_read16_rom(u32 a, SH2 *sh2) { u32 bank = carthw_ssf2_banks[(a >> 19) & 7] << 19; @@ -1399,12 +1389,6 @@ static u32 REGPARM(2) sh2_read32_cs0(u32 a, SH2 *sh2) return d1 | (d2 >> 16); } -static u32 REGPARM(2) sh2_read32_da(u32 a, SH2 *sh2) -{ - u32 d = *((u32 *)sh2->data_array + (a & 0xffc)/4); - return (d << 16) | (d >> 16); -} - static u32 REGPARM(2) sh2_read32_rom(u32 a, SH2 *sh2) { u32 bank = carthw_ssf2_banks[(a >> 19) & 7] << 19; @@ -1429,12 +1413,14 @@ void NOINLINE sh2_sdram_checks(u32 a, int t, SH2 *sh2) } } -void inline sh2_da_checks(u32 a, int t, SH2 *sh2) +#ifndef _ASM_32X_MEMORY_C +static void sh2_da_checks(u32 a, int t, SH2 *sh2) { if (t) sh2_drc_wcheck_da(a, t, sh2); } #endif +#endif static void REGPARM(3) sh2_write_ignore(u32 a, u32 d, SH2 *sh2) { @@ -1477,6 +1463,11 @@ static void REGPARM(3) sh2_write8_cs0(u32 a, u32 d, SH2 *sh2) DRC_RESTORE_SR(sh2); } +#ifdef _ASM_32X_MEMORY_C +extern void REGPARM(3) sh2_write8_dram(u32 a, u32 d, SH2 *sh2); +extern void REGPARM(3) sh2_write8_sdram(u32 a, u32 d, SH2 *sh2); +extern void REGPARM(3) sh2_write8_da(u32 a, u32 d, SH2 *sh2); +#else static void REGPARM(3) sh2_write8_dram(u32 a, u32 d, SH2 *sh2) { sh2_write8_dramN(sh2->p_dram, a, d); @@ -1505,6 +1496,7 @@ static void REGPARM(3) sh2_write8_da(u32 a, u32 d, SH2 *sh2) sh2_da_checks(a, t, sh2); #endif } +#endif // write16 static void REGPARM(3) sh2_write16_unmapped(u32 a, u32 d, SH2 *sh2) @@ -1544,6 +1536,11 @@ static void REGPARM(3) sh2_write16_cs0(u32 a, u32 d, SH2 *sh2) 
DRC_RESTORE_SR(sh2); } +#ifdef _ASM_32X_MEMORY_C +extern void REGPARM(3) sh2_write16_dram(u32 a, u32 d, SH2 *sh2); +extern void REGPARM(3) sh2_write16_sdram(u32 a, u32 d, SH2 *sh2); +extern void REGPARM(3) sh2_write16_da(u32 a, u32 d, SH2 *sh2); +#else static void REGPARM(3) sh2_write16_dram(u32 a, u32 d, SH2 *sh2) { sh2_write16_dramN(sh2->p_dram, a, d); @@ -1572,6 +1569,7 @@ static void REGPARM(3) sh2_write16_da(u32 a, u32 d, SH2 *sh2) sh2_da_checks(a, t, sh2); #endif } +#endif static void REGPARM(3) sh2_write16_rom(u32 a, u32 d, SH2 *sh2) { @@ -1613,6 +1611,11 @@ static void REGPARM(3) sh2_write32_cs0(u32 a, u32 d, SH2 *sh2) *pd = d | (v&m); \ } +#ifdef _ASM_32X_MEMORY_C +extern void REGPARM(3) sh2_write32_dram(u32 a, u32 d, SH2 *sh2); +extern void REGPARM(3) sh2_write32_sdram(u32 a, u32 d, SH2 *sh2); +extern void REGPARM(3) sh2_write32_da(u32 a, u32 d, SH2 *sh2); +#else static void REGPARM(3) sh2_write32_dram(u32 a, u32 d, SH2 *sh2) { sh2_write32_dramN(sh2->p_dram, a, d); @@ -1647,6 +1650,7 @@ static void REGPARM(3) sh2_write32_da(u32 a, u32 d, SH2 *sh2) sh2_da_checks(a+2, u, sh2); #endif } +#endif static void REGPARM(3) sh2_write32_rom(u32 a, u32 d, SH2 *sh2) { @@ -1670,10 +1674,10 @@ u32 REGPARM(2) p32x_sh2_read8(u32 a, SH2 *sh2) sh2_map += SH2MAP_ADDR2OFFS_R(a); p = sh2_map->addr; - if (map_flag_set(p)) - return ((sh2_read_handler *)(p << 1))(a, sh2); - else + if (!map_flag_set(p)) return *(s8 *)((p << 1) + ((a & sh2_map->mask) ^ 1)); + else + return ((sh2_read_handler *)(p << 1))(a, sh2); } u32 REGPARM(2) p32x_sh2_read16(u32 a, SH2 *sh2) @@ -1683,10 +1687,10 @@ u32 REGPARM(2) p32x_sh2_read16(u32 a, SH2 *sh2) sh2_map += SH2MAP_ADDR2OFFS_R(a); p = sh2_map->addr; - if (map_flag_set(p)) - return ((sh2_read_handler *)(p << 1))(a, sh2); - else + if (!map_flag_set(p)) return *(s16 *)((p << 1) + (a & sh2_map->mask)); + else + return ((sh2_read_handler *)(p << 1))(a, sh2); } u32 REGPARM(2) p32x_sh2_read32(u32 a, SH2 *sh2) @@ -1961,9 +1965,11 @@ static void 
get_bios(void) #define MAP_MEMORY(m) ((uptr)(m) >> 1) #define MAP_HANDLER(h) ( ((uptr)(h) >> 1) | ((uptr)1 << (sizeof(uptr) * 8 - 1)) ) -static sh2_memmap sh2_read8_map[0x80], sh2_read16_map[0x80], sh2_read32_map[0x80]; +static sh2_memmap msh2_read8_map[0x80], msh2_read16_map[0x80], msh2_read32_map[0x80]; +static sh2_memmap ssh2_read8_map[0x80], ssh2_read16_map[0x80], ssh2_read32_map[0x80]; // for writes we are using handlers only -static sh2_write_handler *sh2_write8_map[0x80], *sh2_write16_map[0x80], *sh2_write32_map[0x80]; +static sh2_write_handler *msh2_write8_map[0x80], *msh2_write16_map[0x80], *msh2_write32_map[0x80]; +static sh2_write_handler *ssh2_write8_map[0x80], *ssh2_write16_map[0x80], *ssh2_write32_map[0x80]; void Pico32xSwapDRAM(int b) { @@ -1977,25 +1983,35 @@ void Pico32xSwapDRAM(int b) b ? m68k_write16_dram1_ow : m68k_write16_dram0_ow, 1); // SH2 - sh2_read8_map[0x04/2].addr = sh2_read8_map[0x24/2].addr = - sh2_read16_map[0x04/2].addr = sh2_read16_map[0x24/2].addr = - sh2_read32_map[0x04/2].addr = sh2_read32_map[0x24/2].addr = MAP_MEMORY(Pico32xMem->dram[b]); + msh2_read8_map[0x04/2].addr = msh2_read8_map[0x24/2].addr = + msh2_read16_map[0x04/2].addr = msh2_read16_map[0x24/2].addr = + msh2_read32_map[0x04/2].addr = msh2_read32_map[0x24/2].addr = MAP_MEMORY(Pico32xMem->dram[b]); + ssh2_read8_map[0x04/2].addr = ssh2_read8_map[0x24/2].addr = + ssh2_read16_map[0x04/2].addr = ssh2_read16_map[0x24/2].addr = + ssh2_read32_map[0x04/2].addr = ssh2_read32_map[0x24/2].addr = MAP_MEMORY(Pico32xMem->dram[b]); msh2.p_dram = ssh2.p_dram = Pico32xMem->dram[b]; // DRC conveniance ptr + msh2.p_rom = ssh2.p_rom = Pico.rom; } static void bank_switch_rom_sh2(void) { if (!carthw_ssf2_active) { // easy - sh2_read8_map[0x02/2].addr = sh2_read8_map[0x22/2].addr = - sh2_read16_map[0x02/2].addr = sh2_read16_map[0x22/2].addr = - sh2_read32_map[0x02/2].addr = sh2_read32_map[0x22/2].addr = MAP_MEMORY(Pico.rom); + msh2_read8_map[0x02/2].addr = msh2_read8_map[0x22/2].addr = + 
msh2_read16_map[0x02/2].addr = msh2_read16_map[0x22/2].addr = + msh2_read32_map[0x02/2].addr = msh2_read32_map[0x22/2].addr = MAP_MEMORY(Pico.rom); + ssh2_read8_map[0x02/2].addr = ssh2_read8_map[0x22/2].addr = + ssh2_read16_map[0x02/2].addr = ssh2_read16_map[0x22/2].addr = + ssh2_read32_map[0x02/2].addr = ssh2_read32_map[0x22/2].addr = MAP_MEMORY(Pico.rom); } else { - sh2_read8_map[0x02/2].addr = sh2_read8_map[0x22/2].addr = MAP_HANDLER(sh2_read8_rom); - sh2_read16_map[0x02/2].addr = sh2_read16_map[0x22/2].addr = MAP_HANDLER(sh2_read16_rom); - sh2_read32_map[0x02/2].addr = sh2_read32_map[0x22/2].addr = MAP_HANDLER(sh2_read32_rom); + msh2_read8_map[0x02/2].addr = msh2_read8_map[0x22/2].addr = MAP_HANDLER(sh2_read8_rom); + msh2_read16_map[0x02/2].addr = msh2_read16_map[0x22/2].addr = MAP_HANDLER(sh2_read16_rom); + msh2_read32_map[0x02/2].addr = msh2_read32_map[0x22/2].addr = MAP_HANDLER(sh2_read32_rom); + ssh2_read8_map[0x02/2].addr = ssh2_read8_map[0x22/2].addr = MAP_HANDLER(sh2_read8_rom); + ssh2_read16_map[0x02/2].addr = ssh2_read16_map[0x22/2].addr = MAP_HANDLER(sh2_read16_rom); + ssh2_read32_map[0x02/2].addr = ssh2_read32_map[0x22/2].addr = MAP_HANDLER(sh2_read32_rom); } } @@ -2062,81 +2078,98 @@ void PicoMemSetup32x(void) // SH2 maps: A31,A30,A29,CS1,CS0 // all unmapped by default - for (i = 0; i < ARRAY_SIZE(sh2_read8_map); i++) { - sh2_read8_map[i].addr = MAP_HANDLER(sh2_read8_unmapped); - sh2_read16_map[i].addr = MAP_HANDLER(sh2_read16_unmapped); - sh2_read32_map[i].addr = MAP_HANDLER(sh2_read32_unmapped); + for (i = 0; i < ARRAY_SIZE(msh2_read8_map); i++) { + msh2_read8_map[i].addr = MAP_HANDLER(sh2_read8_unmapped); + msh2_read16_map[i].addr = MAP_HANDLER(sh2_read16_unmapped); + msh2_read32_map[i].addr = MAP_HANDLER(sh2_read32_unmapped); } - for (i = 0; i < ARRAY_SIZE(sh2_write8_map); i++) { - sh2_write8_map[i] = sh2_write8_unmapped; - sh2_write16_map[i] = sh2_write16_unmapped; - sh2_write32_map[i] = sh2_write32_unmapped; + for (i = 0; i < 
ARRAY_SIZE(msh2_write8_map); i++) { + msh2_write8_map[i] = sh2_write8_unmapped; + msh2_write16_map[i] = sh2_write16_unmapped; + msh2_write32_map[i] = sh2_write32_unmapped; } // "purge area" for (i = 0x40; i <= 0x5f; i++) { - sh2_write8_map[i >> 1] = - sh2_write16_map[i >> 1] = - sh2_write32_map[i >> 1] = sh2_write_ignore; + msh2_write8_map[i >> 1] = + msh2_write16_map[i >> 1] = + msh2_write32_map[i >> 1] = sh2_write_ignore; } // CS0 - sh2_read8_map[0x00/2].addr = sh2_read8_map[0x20/2].addr = MAP_HANDLER(sh2_read8_cs0); - sh2_read16_map[0x00/2].addr = sh2_read16_map[0x20/2].addr = MAP_HANDLER(sh2_read16_cs0); - sh2_read32_map[0x00/2].addr = sh2_read32_map[0x20/2].addr = MAP_HANDLER(sh2_read32_cs0); - sh2_write8_map[0x00/2] = sh2_write8_map[0x20/2] = sh2_write8_cs0; - sh2_write16_map[0x00/2] = sh2_write16_map[0x20/2] = sh2_write16_cs0; - sh2_write32_map[0x00/2] = sh2_write32_map[0x20/2] = sh2_write32_cs0; + msh2_read8_map[0x00/2].addr = msh2_read8_map[0x20/2].addr = MAP_HANDLER(sh2_read8_cs0); + msh2_read16_map[0x00/2].addr = msh2_read16_map[0x20/2].addr = MAP_HANDLER(sh2_read16_cs0); + msh2_read32_map[0x00/2].addr = msh2_read32_map[0x20/2].addr = MAP_HANDLER(sh2_read32_cs0); + msh2_write8_map[0x00/2] = msh2_write8_map[0x20/2] = sh2_write8_cs0; + msh2_write16_map[0x00/2] = msh2_write16_map[0x20/2] = sh2_write16_cs0; + msh2_write32_map[0x00/2] = msh2_write32_map[0x20/2] = sh2_write32_cs0; // CS1 - ROM bank_switch_rom_sh2(); - sh2_read8_map[0x02/2].mask = sh2_read8_map[0x22/2].mask = 0x3fffff; // FIXME - sh2_read16_map[0x02/2].mask = sh2_read16_map[0x22/2].mask = 0x3ffffe; // FIXME - sh2_read32_map[0x02/2].mask = sh2_read32_map[0x22/2].mask = 0x3ffffc; // FIXME - sh2_write16_map[0x02/2] = sh2_write16_map[0x22/2] = sh2_write16_rom; - sh2_write32_map[0x02/2] = sh2_write32_map[0x22/2] = sh2_write32_rom; + msh2_read8_map[0x02/2].mask = msh2_read8_map[0x22/2].mask = 0x3fffff; // FIXME + msh2_read16_map[0x02/2].mask = msh2_read16_map[0x22/2].mask = 0x3ffffe; // FIXME + 
msh2_read32_map[0x02/2].mask = msh2_read32_map[0x22/2].mask = 0x3ffffc; // FIXME + msh2_write16_map[0x02/2] = msh2_write16_map[0x22/2] = sh2_write16_rom; + msh2_write32_map[0x02/2] = msh2_write32_map[0x22/2] = sh2_write32_rom; // CS2 - DRAM - sh2_read8_map[0x04/2].mask = sh2_read8_map[0x24/2].mask = 0x01ffff; - sh2_read16_map[0x04/2].mask = sh2_read16_map[0x24/2].mask = 0x01fffe; - sh2_read32_map[0x04/2].mask = sh2_read32_map[0x24/2].mask = 0x01fffc; - sh2_write8_map[0x04/2] = sh2_write8_map[0x24/2] = sh2_write8_dram; - sh2_write16_map[0x04/2] = sh2_write16_map[0x24/2] = sh2_write16_dram; - sh2_write32_map[0x04/2] = sh2_write32_map[0x24/2] = sh2_write32_dram; + msh2_read8_map[0x04/2].mask = msh2_read8_map[0x24/2].mask = 0x01ffff; + msh2_read16_map[0x04/2].mask = msh2_read16_map[0x24/2].mask = 0x01fffe; + msh2_read32_map[0x04/2].mask = msh2_read32_map[0x24/2].mask = 0x01fffc; + msh2_write8_map[0x04/2] = msh2_write8_map[0x24/2] = sh2_write8_dram; + msh2_write16_map[0x04/2] = msh2_write16_map[0x24/2] = sh2_write16_dram; + msh2_write32_map[0x04/2] = msh2_write32_map[0x24/2] = sh2_write32_dram; // CS3 - SDRAM - sh2_read8_map[0x06/2].addr = sh2_read8_map[0x26/2].addr = - sh2_read16_map[0x06/2].addr = sh2_read16_map[0x26/2].addr = - sh2_read32_map[0x06/2].addr = sh2_read32_map[0x26/2].addr = MAP_MEMORY(Pico32xMem->sdram); - sh2_write8_map[0x06/2] = sh2_write8_map[0x26/2] = sh2_write8_sdram; - sh2_write16_map[0x06/2] = sh2_write16_map[0x26/2] = sh2_write16_sdram; - sh2_write32_map[0x06/2] = sh2_write32_map[0x26/2] = sh2_write32_sdram; - sh2_read8_map[0x06/2].mask = sh2_read8_map[0x26/2].mask = 0x03ffff; - sh2_read16_map[0x06/2].mask = sh2_read16_map[0x26/2].mask = 0x03fffe; - sh2_read32_map[0x06/2].mask = sh2_read32_map[0x26/2].mask = 0x03fffc; + msh2_read8_map[0x06/2].addr = msh2_read8_map[0x26/2].addr = + msh2_read16_map[0x06/2].addr = msh2_read16_map[0x26/2].addr = + msh2_read32_map[0x06/2].addr = msh2_read32_map[0x26/2].addr = MAP_MEMORY(Pico32xMem->sdram); + 
msh2_write8_map[0x06/2] = msh2_write8_map[0x26/2] = sh2_write8_sdram; + msh2_write16_map[0x06/2] = msh2_write16_map[0x26/2] = sh2_write16_sdram; + msh2_write32_map[0x06/2] = msh2_write32_map[0x26/2] = sh2_write32_sdram; + msh2_read8_map[0x06/2].mask = msh2_read8_map[0x26/2].mask = 0x03ffff; + msh2_read16_map[0x06/2].mask = msh2_read16_map[0x26/2].mask = 0x03fffe; + msh2_read32_map[0x06/2].mask = msh2_read32_map[0x26/2].mask = 0x03fffc; // SH2 data array - sh2_read8_map[0xc0/2].addr = MAP_HANDLER(sh2_read8_da); - sh2_read16_map[0xc0/2].addr = MAP_HANDLER(sh2_read16_da); - sh2_read32_map[0xc0/2].addr = MAP_HANDLER(sh2_read32_da); - sh2_write8_map[0xc0/2] = sh2_write8_da; - sh2_write16_map[0xc0/2] = sh2_write16_da; - sh2_write32_map[0xc0/2] = sh2_write32_da; + msh2_read8_map[0xc0/2].mask = 0x0fff; + msh2_read16_map[0xc0/2].mask = 0x0ffe; + msh2_read32_map[0xc0/2].mask = 0x0ffc; + msh2_write8_map[0xc0/2] = sh2_write8_da; + msh2_write16_map[0xc0/2] = sh2_write16_da; + msh2_write32_map[0xc0/2] = sh2_write32_da; // SH2 IO - sh2_read8_map[0xff/2].addr = MAP_HANDLER(sh2_peripheral_read8); - sh2_read16_map[0xff/2].addr = MAP_HANDLER(sh2_peripheral_read16); - sh2_read32_map[0xff/2].addr = MAP_HANDLER(sh2_peripheral_read32); - sh2_write8_map[0xff/2] = sh2_peripheral_write8; - sh2_write16_map[0xff/2] = sh2_peripheral_write16; - sh2_write32_map[0xff/2] = sh2_peripheral_write32; + msh2_read8_map[0xff/2].addr = MAP_HANDLER(sh2_peripheral_read8); + msh2_read16_map[0xff/2].addr = MAP_HANDLER(sh2_peripheral_read16); + msh2_read32_map[0xff/2].addr = MAP_HANDLER(sh2_peripheral_read32); + msh2_write8_map[0xff/2] = sh2_peripheral_write8; + msh2_write16_map[0xff/2] = sh2_peripheral_write16; + msh2_write32_map[0xff/2] = sh2_peripheral_write32; + + memcpy(ssh2_read8_map, msh2_read8_map, sizeof(msh2_read8_map)); + memcpy(ssh2_read16_map, msh2_read16_map, sizeof(msh2_read16_map)); + memcpy(ssh2_read32_map, msh2_read32_map, sizeof(msh2_read32_map)); + memcpy(ssh2_write8_map, msh2_write8_map, 
sizeof(msh2_write8_map)); + memcpy(ssh2_write16_map, msh2_write16_map, sizeof(msh2_write16_map)); + memcpy(ssh2_write32_map, msh2_write32_map, sizeof(msh2_write32_map)); + + msh2_read8_map[0xc0/2].addr = + msh2_read16_map[0xc0/2].addr = + msh2_read32_map[0xc0/2].addr = MAP_MEMORY(msh2.data_array); + ssh2_read8_map[0xc0/2].addr = + ssh2_read16_map[0xc0/2].addr = + ssh2_read32_map[0xc0/2].addr = MAP_MEMORY(ssh2.data_array); // map DRAM area, both 68k and SH2 Pico32xSwapDRAM(1); - msh2.read8_map = ssh2.read8_map = sh2_read8_map; - msh2.read16_map = ssh2.read16_map = sh2_read16_map; - msh2.read32_map = ssh2.read32_map = sh2_read32_map; - msh2.write8_tab = ssh2.write8_tab = (const void **)(void *)sh2_write8_map; - msh2.write16_tab = ssh2.write16_tab = (const void **)(void *)sh2_write16_map; - msh2.write32_tab = ssh2.write32_tab = (const void **)(void *)sh2_write32_map; + msh2.read8_map = msh2_read8_map; ssh2.read8_map = ssh2_read8_map; + msh2.read16_map = msh2_read16_map; ssh2.read16_map = ssh2_read16_map; + msh2.read32_map = msh2_read32_map; ssh2.read32_map = ssh2_read32_map; + msh2.write8_tab = (const void **)(void *)msh2_write8_map; + msh2.write16_tab = (const void **)(void *)msh2_write16_map; + msh2.write32_tab = (const void **)(void *)msh2_write32_map; + ssh2.write8_tab = (const void **)(void *)ssh2_write8_map; + ssh2.write16_tab = (const void **)(void *)ssh2_write16_map; + ssh2.write32_tab = (const void **)(void *)ssh2_write32_map; sh2_drc_mem_setup(&msh2); sh2_drc_mem_setup(&ssh2); diff --git a/pico/32x/memory_arm.S b/pico/32x/memory_arm.S index b449370b1..379906a0b 100644 --- a/pico/32x/memory_arm.S +++ b/pico/32x/memory_arm.S @@ -43,6 +43,7 @@ .global sh2_write32_da .global sh2_write32_dram +#if 0 sh2_read8_rom: ldr ip, [r1, #OFS_SH2_p_rom] eor r0, r0, #1 @@ -126,9 +127,10 @@ sh2_read32_dram: ldr r0, [ip, r0, lsr #SH2_DRAM_SHIFT] mov r0, r0, ror #16 bx lr +#endif sh2_write8_sdram: - @ preserve r0 and r2 for tail call + @ preserve r0,r2 for tail call ldr ip, 
[r2, #OFS_SH2_p_sdram] eor r3, r0, #1 mov r3, r3, lsl #SH2_RAM_SHIFT @@ -139,7 +141,7 @@ sh2_write8_sdram: bic r0, r0, #1 cmp r1, #0 bxeq lr - b sh2_drc_wcheck_ram + b sh2_sdram_checks #else bx lr #endif @@ -170,7 +172,7 @@ sh2_write8_dram: bx lr sh2_write16_sdram: - @ preserve r0 and r2 for tail call + @ preserve r0,r2 for tail call ldr ip, [r2, #OFS_SH2_p_sdram] mov r3, r0, lsl #SH2_RAM_SHIFT mov r3, r3, lsr #SH2_RAM_SHIFT @@ -180,7 +182,7 @@ sh2_write16_sdram: ldrb r1, [ip, r3, lsr #1] cmp r1, #0 bxeq lr - b sh2_drc_wcheck_ram + b sh2_sdram_checks #else bx lr #endif @@ -217,7 +219,7 @@ sh2_write16_dram: bx lr sh2_write32_sdram: - @ preserve r0 and r2 for tail call + @ preserve r0,r2 for tail call ldr ip, [r2, #OFS_SH2_p_sdram] mov r1, r1, ror #16 mov r3, r0, lsl #SH2_RAM_SHIFT @@ -228,13 +230,13 @@ sh2_write32_sdram: cmp r1, #0 beq 1f stmfd sp!, {r0, r2, ip, lr} - bl sh2_drc_wcheck_ram + b sh2_sdram_checks ldmfd sp!, {r0, r2, ip, lr} 1: ldrb r1, [ip, #1] + add r0, r0, #2 cmp r1, #0 bxeq lr - add r0, r0, #2 - b sh2_drc_wcheck_ram + b sh2_sdram_checks #else bx lr #endif @@ -254,9 +256,9 @@ sh2_write32_da: bl sh2_drc_wcheck_da ldmfd sp!, {r0, r2, ip, lr} 1: ldrb r1, [ip, #1] + add r0, r0, #2 cmp r1, #0 bxeq lr - add r0, r0, #2 b sh2_drc_wcheck_da #else bx lr @@ -266,11 +268,10 @@ sh2_write32_dram: ldr ip, [r2, #OFS_SH2_p_dram] tst r0, #SH2_DRAM_OW mov r3, r0, lsl #SH2_DRAM_SHIFT - moveq r1, r1, ror #16 + mov r1, r1, ror #16 streq r1, [ip, r3, lsr #SH2_DRAM_SHIFT] bxeq lr ldr r0, [ip, r3, lsr #SH2_DRAM_SHIFT] - mov r1, r1, ror #16 mov r2, #0 tst r1, #0x00ff0000 orrne r2, r2, #0x00ff0000 diff --git a/platform/common/common.mak b/platform/common/common.mak index 89b72bb81..0c7e349c3 100644 --- a/platform/common/common.mak +++ b/platform/common/common.mak @@ -10,6 +10,7 @@ asm_ym2612 = 0 asm_misc = 0 asm_cdmemory = 0 asm_32xdraw = 0 +asm_32xmemory = 0 asm_mix = 0 endif @@ -73,6 +74,10 @@ ifeq "$(asm_32xdraw)" "1" DEFINES += _ASM_32X_DRAW SRCS_COMMON += 
$(R)pico/32x/draw_arm.S endif +ifeq "$(asm_32xmemory)" "1" +DEFINES += _ASM_32X_MEMORY_C +SRCS_COMMON += $(R)pico/32x/memory_arm.s +endif ifeq "$(asm_mix)" "1" SRCS_COMMON += $(R)pico/sound/mix_arm.S endif From ad4aa3e9faf8ff60421931c2ca78523ed79a928e Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 22 May 2019 21:38:59 +0200 Subject: [PATCH 040/174] polling detection: communication poll fifo to avoid comm data loss --- cpu/sh2/compiler.c | 18 ++-- pico/32x/32x.c | 4 +- pico/32x/memory.c | 241 +++++++++++++++++++++++++++++++----------- pico/32x/memory_arm.S | 44 ++++---- pico/pico_int.h | 4 +- 5 files changed, 214 insertions(+), 97 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 9160c90c6..6d8e5118b 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -4220,9 +4220,9 @@ static void sh2_generate_utils(void) emith_eor_r_imm(arg1, 1); emith_read8s_r_r_r(arg1, arg1, arg2); emith_push_ret(arg1); - emith_move_r_r_ptr(arg1, CONTEXT_REG); - emith_call(p32x_sh2_poll_memory); - emith_pop_and_ret(RET_REG); + emith_move_r_r_ptr(arg2, CONTEXT_REG); + emith_call(p32x_sh2_poll_memory8); + emith_pop_and_ret(arg1); emith_flush(); // d = sh2_drc_read16_poll(u32 a) @@ -4236,9 +4236,9 @@ static void sh2_generate_utils(void) emith_and_r_r_r(arg1, arg0, arg3); emith_read16s_r_r_r(arg1, arg1, arg2); emith_push_ret(arg1); - emith_move_r_r_ptr(arg1, CONTEXT_REG); - emith_call(p32x_sh2_poll_memory); - emith_pop_and_ret(RET_REG); + emith_move_r_r_ptr(arg2, CONTEXT_REG); + emith_call(p32x_sh2_poll_memory16); + emith_pop_and_ret(arg1); emith_flush(); // d = sh2_drc_read32_poll(u32 a) @@ -4253,9 +4253,9 @@ static void sh2_generate_utils(void) emith_read_r_r_r(arg1, arg1, arg2); emith_ror(arg1, arg1, 16); emith_push_ret(arg1); - emith_move_r_r_ptr(arg1, CONTEXT_REG); - emith_call(p32x_sh2_poll_memory); - emith_pop_and_ret(RET_REG); + emith_move_r_r_ptr(arg2, CONTEXT_REG); + emith_call(p32x_sh2_poll_memory32); + emith_pop_and_ret(arg1); emith_flush(); // 
sh2_drc_exit(void) diff --git a/pico/32x/32x.c b/pico/32x/32x.c index 19c6e0a6a..1511f3f7c 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -471,7 +471,7 @@ void sync_sh2s_normal(unsigned int m68k_target) if (!(ssh2.state & SH2_IDLE_STATES)) { cycles = target - ssh2.m68krcycles_done; if (cycles > 0) { - run_sh2(&ssh2, cycles); + run_sh2(&ssh2, cycles > 20 ? cycles : 20); if (event_time_next && CYCLES_GT(target, event_time_next)) target = event_time_next; @@ -483,7 +483,7 @@ void sync_sh2s_normal(unsigned int m68k_target) if (!(msh2.state & SH2_IDLE_STATES)) { cycles = target - msh2.m68krcycles_done; if (cycles > 0) { - run_sh2(&msh2, cycles); + run_sh2(&msh2, cycles > 20 ? cycles : 20); if (event_time_next && CYCLES_GT(target, event_time_next)) target = event_time_next; diff --git a/pico/32x/memory.c b/pico/32x/memory.c index e05d74c91..a1ef42c2b 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -58,7 +58,7 @@ static void (*m68k_write16_io)(u32 a, u32 d); #define REG8IN16(ptr, offs) ((u8 *)ptr)[(offs) ^ 1] // poll detection -#define POLL_THRESHOLD 3 +#define POLL_THRESHOLD 5 static struct { u32 addr1, addr2, cycles; @@ -74,7 +74,7 @@ static int m68k_poll_detect(u32 a, u32 cycles, u32 flags) if (match && cycles - m68k_poll.cycles <= 64 && !SekNotPolling) { // detect split 32bit access by same cycle count, and ignore those - if (cycles != m68k_poll.cycles && m68k_poll.cnt++ > POLL_THRESHOLD) { + if (cycles != m68k_poll.cycles && ++m68k_poll.cnt > POLL_THRESHOLD) { if (!(Pico32x.emu_flags & flags)) { elprintf(EL_32X, "m68k poll addr %08x, cyc %u", a, cycles - m68k_poll.cycles); @@ -114,8 +114,11 @@ static void NOINLINE sh2_poll_detect(u32 a, SH2 *sh2, u32 flags, int maxcnt) { u32 cycles_done = sh2_cycles_done_t(sh2); + // reading 2 consecutive 16bit values is probably a 32bit access. detect this + // by checking address (max 2 bytes away) and cycles (max 2 cycles later). + // no polling if more than 20 cycles have passed since last detect call. 
if (a - sh2->poll_addr <= 2 && CYCLES_GE(sh2->poll_cycles+20, cycles_done)) { - if (sh2->poll_cycles != cycles_done && ++sh2->poll_cnt >= maxcnt) { + if (CYCLES_GT(cycles_done,sh2->poll_cycles+2) && ++sh2->poll_cnt > maxcnt) { if (!(sh2->state & flags)) elprintf_sh2(sh2, EL_32X, "state: %02x->%02x", sh2->state, sh2->state | flags); @@ -124,6 +127,7 @@ static void NOINLINE sh2_poll_detect(u32 a, SH2 *sh2, u32 flags, int maxcnt) sh2_end_run(sh2, 1); pevt_log_sh2(sh2, EVT_POLL_START); #ifdef DRC_SH2 + // mark this as an address used for polling if SDRAM if ((a & 0xc6000000) == 0x06000000) { unsigned char *p = sh2->p_drcblk_ram; p[(a & 0x3ffff) >> SH2_DRCBLK_RAM_SHIFT] |= 0x80; @@ -149,12 +153,6 @@ void NOINLINE p32x_sh2_poll_event(SH2 *sh2, u32 flags, u32 m68k_cycles) pevt_log_sh2_o(sh2, EVT_POLL_END); sh2->state &= ~flags; -#ifdef DRC_SH2 - if ((sh2->poll_addr & 0xc6000000) == 0x06000000) { - unsigned char *p = sh2->p_drcblk_ram; - p[(sh2->poll_addr & 0x3ffff) >> SH2_DRCBLK_RAM_SHIFT] &= ~0x80; - } -#endif } if (!(sh2->state & (SH2_STATE_CPOLL|SH2_STATE_VPOLL|SH2_STATE_RPOLL))) @@ -172,12 +170,123 @@ static void sh2s_sync_on_read(SH2 *sh2) p32x_sync_other_sh2(sh2, sh2->m68krcycles_done + C_SH2_TO_M68K(sh2, cycles)); } -void p32x_sh2_poll_memory(unsigned int a, SH2 *sh2) +// poll fifo, stores writes to potential addresses used for polling. +// This is used to correctly deliver syncronisation data to the 3 cpus. The +// fifo stores 16 bit values, 8/32 bit accesses must be adapted accordingly. +#define PFIFO_SZ 4 +#define PFIFO_CNT 4 +struct sh2_poll_fifo { + u32 cycles; + u32 a; + u16 d; + u16 cpu; +} sh2_poll_fifo[PFIFO_CNT][PFIFO_SZ]; +unsigned sh2_poll_rd[PFIFO_CNT], sh2_poll_wr[PFIFO_CNT]; // ringbuffer pointers + +static NOINLINE u32 sh2_poll_read(u32 a, u32 d, unsigned int cycles, SH2* sh2) +{ + int hix = (a >> 1) % PFIFO_CNT; + struct sh2_poll_fifo *fifo = sh2_poll_fifo[hix]; + struct sh2_poll_fifo *p; + int cpu = sh2 ? 
sh2->is_slave+1 : 0; + unsigned idx; + + // fetch oldest write to address from fifo, but stop when reaching the present + idx = sh2_poll_rd[hix]; + while (idx != sh2_poll_wr[hix] && CYCLES_GE(cycles, fifo[idx].cycles)) { +// int oidx = idx; + p = &fifo[idx]; + idx = (idx+1) % PFIFO_SZ; + + if (CYCLES_GT(cycles, p->cycles+80)) { + // drop older fifo stores that may cause synchronisation problems. + // NB unfortunately this cycle diff is quite sensitive: + // observed in Brutal Unleashed: min 80, observed in Afterburner: max 110 + sh2_poll_rd[hix] = idx; + } else if (p->a == a) { + // replace current data with fifo value and discard fifo entry + if (cpu != p->cpu) { + d = p->d; + p->a = -1; +// if (oidx == sh2_poll_rd[hix]) +// sh2_poll_rd[hix] = idx; + } + break; + } + } + return d; +} + +static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2) +{ + int hix = (a >> 1) % PFIFO_CNT; + struct sh2_poll_fifo *fifo = sh2_poll_fifo[hix]; + struct sh2_poll_fifo *p = &fifo[sh2_poll_wr[hix]]; + struct sh2_poll_fifo *q = &fifo[(sh2_poll_wr[hix]-1) % PFIFO_SZ]; + int cpu = sh2 ? sh2->is_slave+1 : 0; + + // fold 2 consecutive writes to the same address to avoid reading of + // intermediate values that may cause synchronisation problems. + // NB this can take an eternity on m68k: mov.b , needs + // 28 m68k-cycles (~80 sh2-cycles) to complete (observed in Metal Head) + if (q->a == a && !CYCLES_GT(cycles,q->cycles+30)) { + q->d = d; + } else { + // store write to poll address in fifo + sh2_poll_wr[hix] = (sh2_poll_wr[hix]+1) % PFIFO_SZ; + if (sh2_poll_wr[hix] == sh2_poll_rd[hix]) + // fifo overflow, discard oldest value + sh2_poll_rd[hix] = (sh2_poll_rd[hix]+1) % PFIFO_SZ; + *p = (struct sh2_poll_fifo){ .cycles = cycles, .a = a, .d = d, .cpu = cpu }; + } +} + +u32 REGPARM(3) p32x_sh2_poll_memory8(unsigned int a, u32 d, SH2 *sh2) { + int shift = (a & 1 ? 
0 : 8); + d = (s8)(p32x_sh2_poll_memory16(a & ~1, d << shift, sh2) >> shift); + return d; +} + +u32 REGPARM(3) p32x_sh2_poll_memory16(unsigned int a, u32 d, SH2 *sh2) +{ + unsigned char *p = sh2->p_drcblk_ram; + unsigned int cycles; + DRC_SAVE_SR(sh2); + // is this a synchronisation address? + if(p[(a & 0x3ffff) >> SH2_DRCBLK_RAM_SHIFT] & 0x80) { + sh2s_sync_on_read(sh2); + cycles = sh2_cycles_done_m68k(sh2); + // check poll fifo and sign-extend the result correctly + d = (s16)sh2_poll_read(a, d, cycles, sh2); + } + sh2_poll_detect(a, sh2, SH2_STATE_RPOLL, 5); - sh2s_sync_on_read(sh2); + DRC_RESTORE_SR(sh2); + return d; +} + +u32 REGPARM(3) p32x_sh2_poll_memory32(unsigned int a, u32 d, SH2 *sh2) +{ + unsigned char *p = sh2->p_drcblk_ram; + unsigned int cycles; + + DRC_SAVE_SR(sh2); + // is this a synchronisation address? + if(p[(a & 0x3ffff) >> SH2_DRCBLK_RAM_SHIFT] & 0x80) { + sh2s_sync_on_read(sh2); + cycles = sh2_cycles_done_m68k(sh2); + // check poll fifo and sign-extend the result correctly + d = sh2_poll_read(a, d, cycles, sh2) | + (sh2_poll_read(a+2, d >> 16, cycles, sh2) << 16); + } + + sh2_poll_detect(a, sh2, SH2_STATE_RPOLL, 5); + + DRC_RESTORE_SR(sh2); + return d; } // SH2 faking @@ -222,17 +331,15 @@ static u32 p32x_reg_read16(u32 a) #else if ((a & 0x30) == 0x20) { unsigned int cycles = SekCyclesDone(); - int comreg = 1 << (a & 0x0f) / 2; - if (cycles - msh2.m68krcycles_done > 244 - || (Pico32x.comm_dirty & comreg)) + if (cycles - msh2.m68krcycles_done > 244) p32x_sync_sh2s(cycles); if (m68k_poll_detect(a, cycles, P32XF_68KCPOLL)) { SekSetStop(1); SekEndRun(16); } - goto out; + return sh2_poll_read(a, Pico32x.regs[a / 2], cycles, NULL); } #endif @@ -415,18 +522,17 @@ static void p32x_reg_write8(u32 a, u32 d) if ((a & 0x30) == 0x20) { int cycles = SekCyclesDone(); - int comreg; if (REG8IN16(r, a) == d) return; - p32x_sync_sh2s(cycles); + if (cycles - (int)msh2.m68krcycles_done > 30) + p32x_sync_sh2s(cycles); REG8IN16(r, a) = d; 
p32x_sh2_poll_event(&sh2s[0], SH2_STATE_CPOLL, cycles); p32x_sh2_poll_event(&sh2s[1], SH2_STATE_CPOLL, cycles); - comreg = 1 << (a & 0x0f) / 2; - Pico32x.comm_dirty |= comreg; + sh2_poll_write(a & ~1, r[a / 2], cycles, NULL); return; } } @@ -477,18 +583,17 @@ static void p32x_reg_write16(u32 a, u32 d) // comm port if ((a & 0x30) == 0x20) { int cycles = SekCyclesDone(); - int comreg; - + if (r[a / 2] == d) return; - p32x_sync_sh2s(cycles); + if (cycles - (int)msh2.m68krcycles_done > 30) + p32x_sync_sh2s(cycles); r[a / 2] = d; p32x_sh2_poll_event(&sh2s[0], SH2_STATE_CPOLL, cycles); p32x_sh2_poll_event(&sh2s[1], SH2_STATE_CPOLL, cycles); - comreg = 1 << (a & 0x0f) / 2; - Pico32x.comm_dirty |= comreg; + sh2_poll_write(a, (u16)d, cycles, NULL); return; } // PWM @@ -596,9 +701,9 @@ static u32 p32x_sh2reg_read16(u32 a, SH2 *sh2) return (r[0] & P32XS_FM) | Pico32x.sh2_regs[0] | Pico32x.sh2irq_mask[sh2->is_slave]; case 0x04: // H count (often as comm too) - sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 7); + sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 9); sh2s_sync_on_read(sh2); - return Pico32x.sh2_regs[4 / 2]; + return sh2_poll_read(a, Pico32x.sh2_regs[4 / 2], sh2_cycles_done_m68k(sh2), sh2); case 0x06: return (r[a / 2] & ~P32XS_FULL) | 0x4000; case 0x08: // DREQ src @@ -625,9 +730,9 @@ static u32 p32x_sh2reg_read16(u32 a, SH2 *sh2) // comm port if ((a & 0x30) == 0x20) { - sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 7); + sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 9); sh2s_sync_on_read(sh2); - return r[a / 2]; + return sh2_poll_read(a, r[a / 2], sh2_cycles_done_m68k(sh2), sh2); } if ((a & 0x30) == 0x30) return p32x_pwm_read16(a, sh2, sh2_cycles_done_m68k(sh2)); @@ -671,10 +776,11 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) case 0x05: // H count d &= 0xff; if (Pico32x.sh2_regs[4 / 2] != d) { + unsigned int cycles = sh2_cycles_done_m68k(sh2); Pico32x.sh2_regs[4 / 2] = d; - p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, - sh2_cycles_done_m68k(sh2)); sh2_end_run(sh2, 
4); + p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); + sh2_poll_write(a & ~1, d, cycles, sh2); } return; case 0x30: @@ -719,17 +825,16 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) } if ((a & 0x30) == 0x20) { - int comreg; + unsigned int cycles; if (REG8IN16(r, a) == d) return; REG8IN16(r, a) = d; + cycles = sh2_cycles_done_m68k(sh2); sh2_end_run(sh2, 1); p32x_m68k_poll_event(P32XF_68KCPOLL); - p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, - sh2_cycles_done_m68k(sh2)); - comreg = 1 << (a & 0x0f) / 2; - Pico32x.comm_dirty |= comreg; + p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); + sh2_poll_write(a & ~1, r[a / 2], cycles, sh2); return; } @@ -745,17 +850,16 @@ static void p32x_sh2reg_write16(u32 a, u32 d, SH2 *sh2) // comm if ((a & 0x30) == 0x20) { - int comreg; + unsigned int cycles; if (Pico32x.regs[a / 2] == d) return; Pico32x.regs[a / 2] = d; + cycles = sh2_cycles_done_m68k(sh2); sh2_end_run(sh2, 1); p32x_m68k_poll_event(P32XF_68KCPOLL); - p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, - sh2_cycles_done_m68k(sh2)); - comreg = 1 << (a & 0x0f) / 2; - Pico32x.comm_dirty |= comreg; + p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); + sh2_poll_write(a, d, cycles, sh2); return; } // PWM @@ -1399,25 +1503,42 @@ static u32 REGPARM(2) sh2_read32_rom(u32 a, SH2 *sh2) // writes #ifdef DRC_SH2 -void NOINLINE sh2_sdram_checks(u32 a, int t, SH2 *sh2) +static void NOINLINE sh2_sdram_poll(u32 a, u16 d, SH2 *sh2) { - int v = t & ~0x80; + unsigned cycles; - if (v) - sh2_drc_wcheck_ram(a, v, sh2); - if (t & 0x80) { - DRC_SAVE_SR(sh2); - sh2_end_run(sh2, 1); - p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_RPOLL, sh2_cycles_done_m68k(sh2)); - DRC_RESTORE_SR(sh2); - } + DRC_SAVE_SR(sh2); + sh2_end_run(sh2, 1); + cycles = sh2_cycles_done_m68k(sh2); + sh2_poll_write(a, d, cycles, sh2); + p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_RPOLL, cycles); + DRC_RESTORE_SR(sh2); +} + +void NOINLINE sh2_sdram_checks(u32 a, u32 
d, SH2 *sh2, int t) +{ + if (t & 0x80) + sh2_sdram_poll(a, d, sh2); + if (t & 0x7f) + sh2_drc_wcheck_ram(a, t & 0x7f, sh2); +} + +void NOINLINE sh2_sdram_checks_l(u32 a, u32 d, SH2 *sh2, int t) +{ + sh2_sdram_checks(a, d, sh2, t); + sh2_sdram_checks(a+2, d>>16, sh2, t>>16); } #ifndef _ASM_32X_MEMORY_C static void sh2_da_checks(u32 a, int t, SH2 *sh2) { - if (t) - sh2_drc_wcheck_da(a, t, sh2); + sh2_drc_wcheck_da(a, t, sh2); +} + +static void NOINLINE sh2_da_checks_l(u32 a, int t, SH2 *sh2) +{ + sh2_da_checks(a, t, sh2); + sh2_da_checks(a+2, t>>16, sh2); } #endif #endif @@ -1481,7 +1602,7 @@ static void REGPARM(3) sh2_write8_sdram(u32 a, u32 d, SH2 *sh2) u8 *p = sh2->p_drcblk_ram; int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; if (t) - sh2_sdram_checks(a, t, sh2); + sh2_sdram_checks(a & ~1, ((u16 *)sh2->p_sdram)[a1 / 2], sh2, t); #endif } @@ -1554,7 +1675,7 @@ static void REGPARM(3) sh2_write16_sdram(u32 a, u32 d, SH2 *sh2) u8 *p = sh2->p_drcblk_ram; int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; if (t) - sh2_sdram_checks(a, t, sh2); + sh2_sdram_checks(a, d, sh2, t); #endif } @@ -1628,11 +1749,9 @@ static void REGPARM(3) sh2_write32_sdram(u32 a, u32 d, SH2 *sh2) #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_ram; int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; - if (t) - sh2_sdram_checks(a, t, sh2); int u = p[(a1+2) >> SH2_DRCBLK_RAM_SHIFT]; - if (u) - sh2_sdram_checks(a+2, u, sh2); + if (t|(u<<16)) + sh2_sdram_checks_l(a, d, sh2, t|(u<<16)); #endif } @@ -1643,11 +1762,9 @@ static void REGPARM(3) sh2_write32_da(u32 a, u32 d, SH2 *sh2) #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_da; int t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; - if (t) - sh2_da_checks(a, t, sh2); int u = p[(a1+2) >> SH2_DRCBLK_DA_SHIFT]; - if (u) - sh2_da_checks(a+2, u, sh2); + if (t|(u<<16)) + sh2_da_checks_l(a, t|(u<<16), sh2); #endif } #endif diff --git a/pico/32x/memory_arm.S b/pico/32x/memory_arm.S index 379906a0b..48143ba9f 100644 --- a/pico/32x/memory_arm.S +++ b/pico/32x/memory_arm.S @@ -130,17 +130,21 @@ sh2_read32_dram: #endif 
sh2_write8_sdram: - @ preserve r0,r2 for tail call + @ preserve r0-r2 for tail call ldr ip, [r2, #OFS_SH2_p_sdram] eor r3, r0, #1 mov r3, r3, lsl #SH2_RAM_SHIFT strb r1, [ip, r3, lsr #SH2_RAM_SHIFT] #ifdef DRC_SH2 ldr ip, [r2, #OFS_SH2_p_drcblk_ram] - ldrb r1, [ip, r3, lsr #SH2_RAM_SHIFT+1] - bic r0, r0, #1 - cmp r1, #0 + ldrb r3, [ip, r3, lsr #SH2_RAM_SHIFT+1] + cmp r3, #0 bxeq lr + ldr ip, [r2, #OFS_SH2_p_sdram] + bic r0, r0, #1 + mov r3, r0, lsl #SH2_RAM_SHIFT + mov r3, r3, lsr #SH2_RAM_SHIFT + ldrh r1, [ip, r3] b sh2_sdram_checks #else bx lr @@ -172,15 +176,15 @@ sh2_write8_dram: bx lr sh2_write16_sdram: - @ preserve r0,r2 for tail call + @ preserve r0-r2 for tail call ldr ip, [r2, #OFS_SH2_p_sdram] mov r3, r0, lsl #SH2_RAM_SHIFT mov r3, r3, lsr #SH2_RAM_SHIFT strh r1, [ip, r3] #ifdef DRC_SH2 ldr ip, [r2, #OFS_SH2_p_drcblk_ram] - ldrb r1, [ip, r3, lsr #1] - cmp r1, #0 + ldrb r3, [ip, r3, lsr #1] + cmp r3, #0 bxeq lr b sh2_sdram_checks #else @@ -219,24 +223,19 @@ sh2_write16_dram: bx lr sh2_write32_sdram: - @ preserve r0,r2 for tail call + @ preserve r0-r2 for tail call ldr ip, [r2, #OFS_SH2_p_sdram] mov r1, r1, ror #16 mov r3, r0, lsl #SH2_RAM_SHIFT str r1, [ip, r3, lsr #SH2_RAM_SHIFT] #ifdef DRC_SH2 ldr ip, [r2, #OFS_SH2_p_drcblk_ram] - ldrb r1, [ip, r3, lsr #SH2_RAM_SHIFT+1]! - cmp r1, #0 - beq 1f - stmfd sp!, {r0, r2, ip, lr} - b sh2_sdram_checks - ldmfd sp!, {r0, r2, ip, lr} -1: ldrb r1, [ip, #1] - add r0, r0, #2 - cmp r1, #0 + ldrb r3, [ip, r3, lsr #SH2_RAM_SHIFT+1]! + ldrb ip, [ip, #1] + orrs r3, r3, ip, lsl #16 bxeq lr - b sh2_sdram_checks + mov r1, r1, ror #16 + b sh2_sdram_checks_l #else bx lr #endif @@ -250,15 +249,14 @@ sh2_write32_da: #ifdef DRC_SH2 ldr ip, [r2, #OFS_SH2_p_drcblk_da] ldrb r1, [ip, r3, lsr #SH2_DA_SHIFT+1]! 
- cmp r1, #0 - beq 1f + ldrb ip, [ip, #1] + orrs r3, r1, ip, lsl #16 + bxeq lr stmfd sp!, {r0, r2, ip, lr} bl sh2_drc_wcheck_da ldmfd sp!, {r0, r2, ip, lr} -1: ldrb r1, [ip, #1] add r0, r0, #2 - cmp r1, #0 - bxeq lr + mov r1, ip b sh2_drc_wcheck_da #else bx lr diff --git a/pico/pico_int.h b/pico/pico_int.h index 4139e816f..31fc702ce 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -932,7 +932,9 @@ void Pico32xSwapDRAM(int b); void Pico32xMemStateLoaded(void); void p32x_update_banks(void); void p32x_m68k_poll_event(unsigned int flags); -void p32x_sh2_poll_memory(unsigned int a, SH2 *sh2); +unsigned int REGPARM(3) p32x_sh2_poll_memory8(unsigned int a, unsigned int d, SH2 *sh2); +unsigned int REGPARM(3) p32x_sh2_poll_memory16(unsigned int a, unsigned int d, SH2 *sh2); +unsigned int REGPARM(3) p32x_sh2_poll_memory32(unsigned int a, unsigned int d, SH2 *sh2); void *p32x_sh2_get_mem_ptr(unsigned int a, unsigned int *mask, SH2 *sh2); void p32x_sh2_poll_event(SH2 *sh2, unsigned int flags, unsigned int m68k_cycles); From 1cf16a7c514d667a7572b5568e1059dc1986fd8c Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 22 May 2019 21:45:31 +0200 Subject: [PATCH 041/174] add xSR/RTS call stack cache to sh2 drc --- cpu/drc/emit_arm.c | 17 +++++ cpu/drc/emit_x86.c | 22 ++++++ cpu/sh2/compiler.c | 173 +++++++++++++++++++++++++++++++++++++++------ cpu/sh2/sh2.h | 2 + 4 files changed, 192 insertions(+), 22 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 1b429b352..9af2f4538 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -304,7 +304,9 @@ static void emith_flush(void) #define EOP_STR_SIMPLE(rd,rn) EOP_C_AM2_IMM(A_COND_AL,1,0,0,rn,rd,0) #define EOP_LDR_REG_LSL(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,0,1,rn,rd,shift_imm,A_AM1_LSL,rm) +#define EOP_LDR_REG_LSL_WB(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,0,3,rn,rd,shift_imm,A_AM1_LSL,rm) #define EOP_LDRB_REG_LSL(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,1,1,rn,rd,shift_imm,A_AM1_LSL,rm); 
+#define EOP_STR_REG_LSL_WB(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,0,2,rn,rd,shift_imm,A_AM1_LSL,rm) #define EOP_LDRH_IMM2(cond,rd,rn,offset_8) EOP_C_AM3_IMM(cond,(offset_8) >= 0,1,rn,rd,0,1,abs(offset_8)) #define EOP_LDRH_REG2(cond,rd,rn,rm) EOP_C_AM3_REG(cond,1,1,rn,rd,0,1,rm) @@ -941,8 +943,12 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) EOP_LDR_REG_LSL(cond, r, rs, rm, 0) #define emith_read_r_r_offs(r, rs, offs) \ emith_read_r_r_offs_c(A_COND_AL, r, rs, offs) +#define emith_read_r_r_offs_ptr(r, rs, offs) \ + emith_read_r_r_offs_c(A_COND_AL, r, rs, offs) #define emith_read_r_r_r(r, rs, rm) \ EOP_LDR_REG_LSL(A_COND_AL, r, rs, rm, 0) +#define emith_read_r_r_r_wb(r, rs, rm) \ + EOP_LDR_REG_LSL_WB(A_COND_AL, r, rs, rm, 0) #define emith_read8_r_r_offs_c(cond, r, rs, offs) \ EOP_LDRB_IMM2(cond, r, rs, offs) @@ -984,6 +990,12 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) EOP_STR_IMM2(cond, r, rs, offs) #define emith_write_r_r_offs_ptr_c(cond, r, rs, offs) \ emith_write_r_r_offs_c(cond, r, rs, offs) +#define emith_write_r_r_offs(r, rs, offs) \ + emith_write_r_r_offs_c(A_COND_AL, r, rs, offs) +#define emith_write_r_r_offs_ptr(r, rs, offs) \ + emith_write_r_r_offs_c(A_COND_AL, r, rs, offs) +#define emith_write_r_r_r_wb(r, rs, rm) \ + EOP_STR_REG_LSL_WB(A_COND_AL, r, rs, rm, 0) #define emith_ctx_read_c(cond, r, offs) \ emith_read_r_r_offs_c(cond, r, CONTEXT_REG, offs) @@ -1111,6 +1123,11 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) emith_jump_ctx(offs); \ } while (0) +#define emith_call_link(r, target) do { \ + emith_move_r_r(r, PC); \ + emith_jump(target); \ +} while (0) + #define emith_ret_c(cond) \ emith_jump_reg_c(cond, LR) diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 9dd062624..edb34521d 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -721,6 +721,20 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; EMIT_OP_MODRM(0x8b, 0, r, 4); \ EMIT_SIB(0, rs, rm); /* 
mov r, [rm + rs * 1] */ \ } while (0) +#define emith_read_r_r_r_wb(r, rs, rm) do { \ + emith_read_r_r_r(r, rs, rm); \ + emith_add_r_r_ptr(rs, rm); \ +} while (0) + +#define emith_write_r_r_r(r, rs, rm) do { \ + EMIT_OP_MODRM(0x89, 0, r, 4); \ + EMIT_SIB(0, rs, rm); /* mov [rm + rs * 1], r */ \ +} while (0) +#define emith_write_r_r_r_wb(r, rs, rm) do { \ + emith_write_r_r_r(r, rs, rm); \ + emith_add_r_r_ptr(rs, rm); \ +} while (0) + #define emith_ctx_read(r, offs) \ emith_read_r_r_offs(r, CONTEXT_REG, offs) @@ -801,6 +815,14 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; EMIT(offs, u32); \ } while (0) +#define emith_call_link(r, target) do { \ + EMIT_OP(0xe8); \ + EMIT(0, u32); /* call pc+0 */ \ + emith_pop(r); \ + emith_add_r_r_ptr_imm(r, r, 13); \ + emith_jump(target); \ +} while (0) + #define emith_ret() \ EMIT_OP(0xc3) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 6d8e5118b..be6e3ee14 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -39,6 +39,7 @@ #define PROPAGATE_CONSTANTS 1 #define LINK_BRANCHES 1 #define BRANCH_CACHE 1 +#define CALL_STACK 0 #define ALIAS_REGISTERS 1 #define REMAP_REGISTER 1 #define LOOP_DETECTION 1 @@ -58,13 +59,14 @@ // 08 - runtime block entry log // 10 - smc self-check // 20 - runtime block entry counter +// 80 - branch cache statistics // 100 - write trace // 200 - compare trace // 400 - block entry backtrace on exit // 800 - state dump on exit // { #ifndef DRC_DEBUG -#define DRC_DEBUG 0 +#define DRC_DEBUG 0x0 #endif #if DRC_DEBUG @@ -369,6 +371,15 @@ static struct block_entry **hash_tables[TCACHE_BUFFERS]; #define HASH_FUNC(hash_tab, addr, mask) \ (hash_tab)[(((addr) >> 20) ^ ((addr) >> 2)) & (mask)] +#if (DRC_DEBUG & 128) +#if BRANCH_CACHE +int bchit, bcmiss; +#endif +#if CALL_STACK +int rchit, rcmiss; +#endif +#endif + // host register tracking enum { HR_FREE, @@ -527,6 +538,10 @@ static signed char reg_map_host[HOST_REGS]; static void REGPARM(1) (*sh2_drc_entry)(SH2 *sh2); static void 
(*sh2_drc_dispatcher)(void); +#if CALL_STACK +static void REGPARM(1) (*sh2_drc_dispatcher_call)(uptr host_pc); +static void (*sh2_drc_dispatcher_return)(void); +#endif static void (*sh2_drc_exit)(void); static void (*sh2_drc_test_irq)(void); @@ -684,12 +699,17 @@ static void REGPARM(1) flush_tcache(int tcid) memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); memset(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)); memset(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)); + memset(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)); + memset(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)); + sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0; } else { memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram)); memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); memset(Pico32xMem->drcblk_da[tcid - 1], 0, sizeof(Pico32xMem->drcblk_da[tcid - 1])); memset(Pico32xMem->drclit_da[tcid - 1], 0, sizeof(Pico32xMem->drclit_da[tcid - 1])); memset(sh2s[tcid - 1].branch_cache, -1, sizeof(sh2s[0].branch_cache)); + memset(sh2s[tcid - 1].rts_cache, -1, sizeof(sh2s[0].rts_cache)); + sh2s[tcid - 1].rts_cache_idx = 0; } } #if (DRC_DEBUG & 4) @@ -816,9 +836,7 @@ static void dr_free_oldest_block(int tcache_id) static u8 *dr_prepare_cache(int tcache_id, int insn_count) { -#if BRANCH_CACHE u8 *limit = tcache_limit[tcache_id]; -#endif // if no block desc available if (block_counts[tcache_id] == block_limit[tcache_id]) @@ -828,16 +846,26 @@ static u8 *dr_prepare_cache(int tcache_id, int insn_count) while (tcache_limit[tcache_id] - tcache_ptrs[tcache_id] < insn_count * 128) dr_free_oldest_block(tcache_id); -#if BRANCH_CACHE if (limit != tcache_limit[tcache_id]) { +#if BRANCH_CACHE if (tcache_id) memset32(sh2s[tcache_id-1].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); else { memset32(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4); } - } #endif +#if CALL_STACK + 
if (tcache_id) { + memset32(sh2s[tcache_id-1].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4); + sh2s[tcache_id-1].rts_cache_idx = 0; + } else { + memset32(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4); + memset32(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)/4); + sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0; + } +#endif + } return (u8 *)tcache_ptrs[tcache_id]; } @@ -3955,16 +3983,14 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // branch handling if (drcf.pending_branch_direct) { - struct op_data *opd_b = - (op_flags[i] & OF_DELAY_OP) ? opd-1 : opd; + struct op_data *opd_b = (op_flags[i] & OF_DELAY_OP) ? opd-1 : opd; u32 target_pc = opd_b->imm; int cond = -1; void *target = NULL; int ctaken = 0; - if (OP_ISBRACND(opd_b->op)) { + if (OP_ISBRACND(opd_b->op)) ctaken = (op_flags[i] & OF_DELAY_OP) ? 1 : 2; - } cycles += ctaken; // assume branch taken #if LOOP_DETECTION if ((drcf.loop_type == OF_IDLE_LOOP || @@ -4014,15 +4040,21 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emit_move_r_imm32(SHR_PC, target_pc); rcache_clean(); - target = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id); - if (target == NULL) - return NULL; +#if CALL_STACK + if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) { + // BSR + tmp = rcache_get_tmp_arg(0); + emith_call_link(tmp, sh2_drc_dispatcher_call); + rcache_free_tmp(tmp); + } else +#endif + target = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id); } if (cond != -1) { emith_jump_cond_patchable(cond, target); } - else { + else if (target != NULL) { emith_jump_patchable(target); rcache_invalidate(); } @@ -4036,19 +4068,26 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) drcf.polling = drcf.loop_type = 0; } else if (drcf.pending_branch_indirect) { - struct op_data *opd_b = - (op_flags[i] & OF_DELAY_OP) ? 
opd-1 : opd; void *target; u32 target_pc; sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); rcache_clean(); +#if CALL_STACK + struct op_data *opd_b = (op_flags[i] & OF_DELAY_OP) ? opd-1 : opd; + if (opd_b->rm == SHR_PR) { + // RTS + emith_jump(sh2_drc_dispatcher_return); + } else if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) { + // JSR/BSRF + tmp = rcache_get_tmp_arg(0); + emith_call_link(tmp, sh2_drc_dispatcher_call); + } else +#endif if (gconst_get(SHR_PC, &target_pc)) { // JMP const, treat like unconditional direct branch target = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id); - if (target == NULL) - return NULL; emith_jump_patchable(target); } else { // JMP @@ -4264,6 +4303,20 @@ static void sh2_generate_utils(void) emith_sh2_drc_exit(); emith_flush(); +#if CALL_STACK + // sh2_drc_dispatcher_call(uptr host_pc) + sh2_drc_dispatcher_call = (void *)tcache_ptr; + emith_ctx_read(arg2, offsetof(SH2, rts_cache_idx)); + emith_add_r_imm(arg2, 2*sizeof(void *)); + emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *)); + emith_ctx_write(arg2, offsetof(SH2, rts_cache_idx)); + emith_add_r_r_ptr_imm(arg1, CONTEXT_REG, offsetof(SH2, rts_cache)); + emith_ctx_read(arg3, offsetof(SH2, pr)); + emith_write_r_r_r_wb(arg3, arg1, arg2); + emith_write_r_r_offs_ptr(arg0, arg1, sizeof(void *)); + emith_flush(); + // FALLTHROUGH +#endif // sh2_drc_dispatcher(void) sh2_drc_dispatcher = (void *)tcache_ptr; emith_ctx_read(arg0, SHR_PC * 4); @@ -4274,6 +4327,12 @@ static void sh2_generate_utils(void) emith_read_r_r_offs(arg2, arg1, offsetof(SH2, branch_cache)); emith_cmp_r_r(arg2, arg0); EMITH_SJMP_START(DCOND_NE); +#if (DRC_DEBUG & 128) + emith_move_r_ptr_imm(arg2, (uptr)&bchit); + emith_read_r_r_offs_c(DCOND_EQ, arg3, arg2, 0); + emith_add_r_imm_c(DCOND_EQ, arg3, 1); + emith_write_r_r_offs_c(DCOND_EQ, arg3, arg2, 0); +#endif emith_read_r_r_offs_ptr_c(DCOND_EQ, RET_REG, arg1, offsetof(SH2, branch_cache) + sizeof(void 
*)); emith_jump_reg_c(DCOND_EQ, RET_REG); EMITH_SJMP_END(DCOND_NE); @@ -4285,6 +4344,12 @@ static void sh2_generate_utils(void) // store PC and block entry ptr (in arg0) in branch target cache emith_tst_r_r_ptr(RET_REG, RET_REG); EMITH_SJMP_START(DCOND_EQ); +#if (DRC_DEBUG & 128) + emith_move_r_ptr_imm(arg2, (uptr)&bcmiss); + emith_read_r_r_offs_c(DCOND_NE, arg3, arg2, 0); + emith_add_r_imm_c(DCOND_NE, arg3, 1); + emith_write_r_r_offs_c(DCOND_NE, arg3, arg2, 0); +#endif emith_ctx_read_c(DCOND_NE, arg2, SHR_PC * 4); emith_and_r_r_imm(arg1, arg2, (ARRAY_SIZE(sh2s->branch_cache)-1)*4); emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg1, sizeof(void *) == 8 ? 2 : 1); @@ -4302,6 +4367,37 @@ static void sh2_generate_utils(void) emith_call(dr_failure); emith_flush(); +#if CALL_STACK + // sh2_drc_dispatcher_return(void) + sh2_drc_dispatcher_return = (void *)tcache_ptr; + emith_ctx_read(arg2, offsetof(SH2, rts_cache_idx)); + emith_add_r_r_ptr_imm(arg1, CONTEXT_REG, offsetof(SH2, rts_cache)); + emith_ctx_read(arg0, offsetof(SH2, pc)); + emith_read_r_r_r_wb(arg3, arg1, arg2); + emith_cmp_r_r(arg0, arg3); +#if (DRC_DEBUG & 128) + EMITH_SJMP_START(DCOND_EQ); + emith_move_r_ptr_imm(arg2, (uptr)&rcmiss); + emith_read_r_r_offs_c(DCOND_NE, arg1, arg2, 0); + emith_add_r_imm_c(DCOND_NE, arg1, 1); + emith_write_r_r_offs_c(DCOND_NE, arg1, arg2, 0); + EMITH_SJMP_END(DCOND_EQ); +#endif + emith_jump_cond(DCOND_NE, sh2_drc_dispatcher); + emith_read_r_r_offs_ptr(arg0, arg1, sizeof(void *)); + emith_sub_r_imm(arg2, 2*sizeof(void *)); + emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *)); + emith_ctx_write(arg2, offsetof(SH2, rts_cache_idx)); +#if (DRC_DEBUG & 128) + emith_move_r_ptr_imm(arg2, (uptr)&rchit); + emith_read_r_r_offs(arg1, arg2, 0); + emith_add_r_imm(arg1, 1); + emith_write_r_r_offs(arg1, arg2, 0); +#endif + emith_jump_reg(arg0); + emith_flush(); +#endif + // sh2_drc_test_irq(void) // assumes it's called from main function (may jump to dispatcher) 
sh2_drc_test_irq = (void *)tcache_ptr; @@ -4408,6 +4504,10 @@ static void sh2_generate_utils(void) #if (DRC_DEBUG & 4) host_dasm_new_symbol(sh2_drc_entry); host_dasm_new_symbol(sh2_drc_dispatcher); +#if CALL_STACK + host_dasm_new_symbol(sh2_drc_dispatcher_call); + host_dasm_new_symbol(sh2_drc_dispatcher_return); +#endif host_dasm_new_symbol(sh2_drc_exit); host_dasm_new_symbol(sh2_drc_test_irq); host_dasm_new_symbol(sh2_drc_write8); @@ -4521,6 +4621,16 @@ static void sh2_smc_rm_blocks(u32 a, int tcache_id, u32 shift) memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4); } #endif +#if CALL_STACK + if (tcache_id) { + memset32(sh2s[tcache_id-1].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4); + sh2s[tcache_id-1].rts_cache_idx = 0; + } else { + memset32(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4); + memset32(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)/4); + sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0; + } +#endif } void sh2_drc_wcheck_ram(unsigned int a, int val, SH2 *sh2) @@ -4694,11 +4804,6 @@ static void state_dump(void) printf("%08x ",p32x_sh2_read32(sh2s[0].r[15] + i*4, &sh2s[0])); if ((i+1) % 8 == 0) printf("\n"); } - printf("branch cache master:\n"); - for (i = 0; i < ARRAY_SIZE(sh2s[0].branch_cache); i++) { - printf("%08x ",sh2s[0].branch_cache[i].pc); - if ((i+1) % 8 == 0) printf("\n"); - } SH2_DUMP(&sh2s[1], "slave"); printf("VBR ssh2: %x\n", sh2s[1].vbr); for (i = 0; i < 0x60; i++) { @@ -4710,12 +4815,33 @@ static void state_dump(void) printf("%08x ",p32x_sh2_read32(sh2s[1].r[15] + i*4, &sh2s[1])); if ((i+1) % 8 == 0) printf("\n"); } +#endif +} + +static void bcache_stats(void) +{ +#if (DRC_DEBUG & 128) + int i; +#if CALL_STACK + for (i = 1; i < ARRAY_SIZE(sh2s->rts_cache); i++) + if (sh2s[0].rts_cache[i].pc == -1 && sh2s[1].rts_cache[i].pc == -1) break; + + printf("return cache hits:%d misses:%d depth: %d\n", rchit, rcmiss, i); +#endif +#if BRANCH_CACHE + printf("branch cache hits:%d misses:%d\n", bchit, bcmiss); + printf("branch 
cache master:\n"); + for (i = 0; i < ARRAY_SIZE(sh2s[0].branch_cache); i++) { + printf("%08x ",sh2s[0].branch_cache[i].pc); + if ((i+1) % 8 == 0) printf("\n"); + } printf("branch cache slave:\n"); for (i = 0; i < ARRAY_SIZE(sh2s[1].branch_cache); i++) { printf("%08x ",sh2s[1].branch_cache[i].pc); if ((i+1) % 8 == 0) printf("\n"); } #endif +#endif } void sh2_drc_flush_all(void) @@ -4724,6 +4850,7 @@ void sh2_drc_flush_all(void) state_dump(); block_stats(); entry_stats(); + bcache_stats(); flush_tcache(0); flush_tcache(1); flush_tcache(2); @@ -4810,6 +4937,8 @@ int sh2_drc_init(SH2 *sh2) #endif } memset(sh2->branch_cache, -1, sizeof(sh2->branch_cache)); + memset(sh2->rts_cache, -1, sizeof(sh2->rts_cache)); + sh2->rts_cache_idx = 0; return 0; diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index a3eb5b12b..cf830dfca 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -52,6 +52,8 @@ typedef struct SH2_ int poll_cnt; // DRC branch cache. size must be 2^n and <=128 + int rts_cache_idx; + struct { unsigned int pc; void *code; } rts_cache[16]; struct { unsigned int pc; void *code; } branch_cache[128]; // interpreter stuff From 57f2c6a5c7f565018343ae81de56fb3a2afc1eef Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 23 May 2019 19:04:31 +0200 Subject: [PATCH 042/174] sh2 drc, keep T bit in host flags as long as possible --- cpu/drc/emit_arm.c | 58 +++++++++++++++- cpu/drc/emit_x86.c | 38 +++++++++++ cpu/sh2/compiler.c | 165 ++++++++++++++++++++++++++------------------- 3 files changed, 190 insertions(+), 71 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 9af2f4538..b7922a984 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -128,7 +128,7 @@ static NOINLINE void EMIT(u32 op, u32 dst, u32 src) emit_cache[i] = emit_cache[i+1]; } } - + static void emith_flush(void) { int i; @@ -156,6 +156,7 @@ static void emith_flush(void) #define A_COND_LE 0xd #define A_COND_CS A_COND_HS #define A_COND_CC A_COND_LO +#define A_COND_NV 0xf // Not Valid (aka NeVer :-) - 
ATTN: not a real condition! /* unified conditions */ #define DCOND_EQ A_COND_EQ @@ -414,6 +415,9 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int u32 v; int i; + if (cond == A_COND_NV) + return; + switch (op) { case A_OP_MOV: rn = 0; @@ -522,6 +526,9 @@ static int emith_xbranch(int cond, void *target, int is_call) int direct = is_offset_24(val); u32 *start_ptr = (u32 *)tcache_ptr; + if (cond == A_COND_NV) + return 0; // never taken + if (direct) { EOP_C_B(cond,is_call,val & 0xffffff); // b, bl target @@ -1328,3 +1335,52 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) EMITH_SJMP2_END(DCOND_NE); \ } while (0) +#ifdef T +// T bit handling +static int tcond = -1; + +#define emith_invert_cond(cond) \ + ((cond) ^ 1) + +#define emith_clr_t_cond(sr) \ + (void)sr + +#define emith_set_t_cond(sr, cond) \ + tcond = cond + +#define emith_get_t_cond() \ + tcond + +#define emith_invalidate_t() \ + tcond = -1 + +#define emith_set_t(sr, val) \ + tcond = ((val) ? A_COND_AL: A_COND_NV) + +static void emith_sync_t(sr) +{ + if (tcond == A_COND_AL) + emith_or_r_imm(sr, T); + else if (tcond == A_COND_NV) + emith_bic_r_imm(sr, T); + else if (tcond >= 0) { + emith_bic_r_imm_c(emith_invert_cond(tcond),sr, T); + emith_or_r_imm_c(tcond, sr, T); + } + tcond = -1; +} + +static int emith_tst_t(int sr, int tf) +{ + if (tcond < 0) { + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; + } else if (tcond >= A_COND_AL) { + // MUST sync because A_COND_NV isn't a real condition + emith_sync_t(sr); + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; + } else + return tf ? 
tcond : emith_invert_cond(tcond); +} +#endif diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index edb34521d..325694046 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -1200,3 +1200,41 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_pool_commit(j) /**/ #define emith_insn_ptr() ((u8 *)tcache_ptr) #define emith_flush() /**/ + +#ifdef T +// T bit handling +#define emith_invert_cond(cond) \ + ((cond) ^ 1) + +static void emith_clr_t_cond(int sr) +{ + emith_bic_r_imm(sr, T); +} + +static void emith_set_t_cond(int sr, int cond) +{ + EMITH_SJMP_START(emith_invert_cond(cond)); + emith_or_r_imm_c(cond, sr, T); + EMITH_SJMP_END(emith_invert_cond(cond)); +} + +#define emith_get_t_cond() -1 + +#define emith_sync_t(sr) ((void)sr) + +#define emith_invalidate_t() + +static void emith_set_t(int sr, int val) +{ + if (val) + emith_or_r_imm(sr, T); + else + emith_bic_r_imm(sr, T); +} + +static int emith_tst_t(int sr, int tf) +{ + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; +} +#endif diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index be6e3ee14..d441039be 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -117,6 +117,17 @@ static int insns_compiled, hash_collisions, host_insn_count; #define SHR_MEM 31 #define SHR_TMP -1 +#define T 0x00000001 +#define S 0x00000002 +#define I 0x000000f0 +#define Q 0x00000100 +#define M 0x00000200 +#define T_save 0x00000800 + +#define I_SHIFT 4 +#define Q_SHIFT 8 +#define M_SHIFT 9 + static struct op_data { u8 op; u8 cycles; @@ -525,17 +536,6 @@ static cache_reg_t cache_regs[] = { static signed char reg_map_host[HOST_REGS]; -#define T 0x00000001 -#define S 0x00000002 -#define I 0x000000f0 -#define Q 0x00000100 -#define M 0x00000200 -#define T_save 0x00000800 - -#define I_SHIFT 4 -#define Q_SHIFT 8 -#define M_SHIFT 9 - static void REGPARM(1) (*sh2_drc_entry)(SH2 *sh2); static void (*sh2_drc_dispatcher)(void); #if CALL_STACK @@ -2318,17 +2318,19 @@ static void emit_move_r_r(sh2_reg_e dst, 
sh2_reg_e src) } } -// T must be clear, and comparison done just before this -static void emit_or_t_if_eq(int srr) +static void emit_sync_t_to_sr(void) { - EMITH_SJMP_START(DCOND_NE); - emith_or_r_imm_c(DCOND_EQ, srr, T); - EMITH_SJMP_END(DCOND_NE); + // avoid reloading SR from context if there's nothing to do + if (emith_get_t_cond() >= 0) { + int sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_sync_t(sr); + } } // rd = @(arg0) static int emit_memhandler_read(int size) { + emit_sync_t_to_sr(); rcache_clean_tmp(); #ifndef DRC_SR_REG // must writeback cycles for poll detection stuff @@ -2356,6 +2358,7 @@ static int emit_memhandler_read(int size) // @(arg0) = arg1 static void emit_memhandler_write(int size) { + emit_sync_t_to_sr(); rcache_clean_tmp(); #ifndef DRC_SR_REG if (guest_regs[SHR_SR].vreg != -1) @@ -2776,6 +2779,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // clear stale state after compile errors rcache_invalidate(); + emith_invalidate_t(); drcf = (struct drcf) { 0 }; // ------------------------------------------------- @@ -2812,6 +2816,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); + emith_sync_t(sr); rcache_flush(); emith_flush(); @@ -2896,6 +2901,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #if (DRC_DEBUG & (8|256|512|1024)) sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_sync_t(sr); rcache_clean(); tmp = rcache_used_hreg_mask(); emith_save_caller_regs(tmp); @@ -2918,6 +2924,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (!(op_flags[i] & OF_DELAY_OP)) { sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); + emith_sync_t(sr); rcache_clean(); tmp = rcache_used_hreg_mask(); @@ -2944,6 +2951,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) delay_dep_bk = opd->source & ops[i-1].dest; if (delay_dep_fw & BITMASK1(SHR_T)) { sr = rcache_get_reg(SHR_SR, RC_GR_RMW, 
NULL); + emith_sync_t(sr); DELAY_SAVE_T(sr); } if (delay_dep_bk & BITMASK1(SHR_PC)) { @@ -2965,9 +2973,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); tmp = rcache_get_reg(SHR_PC, RC_GR_WRITE, NULL); emith_move_r_imm(tmp, pc); - emith_tst_r_imm(sr, T); - tmp2 = ops[i-1].op == OP_BRANCH_CT ? DCOND_NE : DCOND_EQ; - tmp3 = ops[i-1].op == OP_BRANCH_CT ? DCOND_EQ : DCOND_NE; + tmp2 = emith_tst_t(sr, (ops[i-1].op == OP_BRANCH_CT)); + tmp3 = emith_invert_cond(tmp2); EMITH_SJMP_START(tmp3); emith_move_r_imm_c(tmp2, tmp, ops[i-1].imm); EMITH_SJMP_END(tmp3); @@ -3061,6 +3068,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto end_op; case OP_RTE: // RTE 0000000000101011 + emith_invalidate_t(); // pop PC emit_memhandler_read_rr(sh2, SHR_PC, SHR_SP, 0, 2 | MF_POSTINCR); // pop SR @@ -3079,6 +3087,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case OP_TRAPA: // TRAPA #imm 11000011iiiiiiii // push SR tmp = rcache_get_reg_arg(1, SHR_SR, &tmp2); + emith_sync_t(tmp2); emith_clear_msb(tmp, tmp2, 22); emit_memhandler_write_rr(sh2, SHR_TMP, SHR_SP, 0, 2 | MF_PREDECR); // push PC @@ -3177,6 +3186,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } if (tmp2 == SHR_SR) { sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); + emith_sync_t(sr); tmp = rcache_get_reg(GET_Rn(), RC_GR_WRITE, NULL); emith_clear_msb(tmp, sr, 22); // reserved bits defined by ISA as 0 } else @@ -3198,11 +3208,11 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { case 0: // CLRT 0000000000001000 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_bic_r_imm(sr, T); + emith_set_t(sr, 0); break; case 1: // SETT 0000000000011000 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_or_r_imm(sr, T); + emith_set_t(sr, 1); break; case 2: // CLRMAC 0000000000101000 emit_move_r_imm32(SHR_MACL, 0); @@ -3219,10 +3229,12 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) 
break; case 1: // DIV0U 0000000000011001 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_invalidate_t(); emith_bic_r_imm(sr, M|Q|T); break; case 2: // MOVT Rn 0000nnnn00101001 sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); + emith_sync_t(sr); tmp2 = rcache_get_reg(GET_Rn(), RC_GR_WRITE, NULL); emith_clear_msb(tmp2, sr, 31); break; @@ -3286,6 +3298,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + emith_invalidate_t(); emith_bic_r_imm(sr, M|Q|T); emith_tst_r_imm(tmp2, (1<<31)); EMITH_SJMP_START(DCOND_EQ); @@ -3304,9 +3317,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); - emith_bic_r_imm(sr, T); + emith_clr_t_cond(sr); emith_tst_r_r(tmp2, tmp3); - emit_or_t_if_eq(sr); + emith_set_t_cond(sr, DCOND_EQ); goto end_op; case 0x09: // AND Rm,Rn 0010nnnnmmmm1001 if (GET_Rm() != GET_Rn()) { @@ -3339,7 +3352,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); emith_eor_r_r_r(tmp, tmp2, tmp3); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_bic_r_imm(sr, T); + emith_clr_t_cond(sr); emith_tst_r_imm(tmp, 0x000000ff); EMITH_SJMP_START(DCOND_EQ); emith_tst_r_imm_c(DCOND_NE, tmp, 0x0000ff00); @@ -3350,7 +3363,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) EMITH_SJMP_END(DCOND_EQ); EMITH_SJMP_END(DCOND_EQ); EMITH_SJMP_END(DCOND_EQ); - emit_or_t_if_eq(sr); + emith_set_t_cond(sr, DCOND_EQ); rcache_free_tmp(tmp); goto end_op; case 0x0d: // XTRCT Rm,Rn 0010nnnnmmmm1101 @@ -3391,32 +3404,24 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); tmp3 = 
rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); - emith_bic_r_imm(sr, T); + emith_clr_t_cond(sr); emith_cmp_r_r(tmp2, tmp3); switch (op & 0x07) { case 0x00: // CMP/EQ - emit_or_t_if_eq(sr); + emith_set_t_cond(sr, DCOND_EQ); break; case 0x02: // CMP/HS - EMITH_SJMP_START(DCOND_LO); - emith_or_r_imm_c(DCOND_HS, sr, T); - EMITH_SJMP_END(DCOND_LO); + emith_set_t_cond(sr, DCOND_HS); break; case 0x03: // CMP/GE - EMITH_SJMP_START(DCOND_LT); - emith_or_r_imm_c(DCOND_GE, sr, T); - EMITH_SJMP_END(DCOND_LT); + emith_set_t_cond(sr, DCOND_GE); break; case 0x06: // CMP/HI - EMITH_SJMP_START(DCOND_LS); - emith_or_r_imm_c(DCOND_HI, sr, T); - EMITH_SJMP_END(DCOND_LS); + emith_set_t_cond(sr, DCOND_HI); break; case 0x07: // CMP/GT - EMITH_SJMP_START(DCOND_LE); - emith_or_r_imm_c(DCOND_GT, sr, T); - EMITH_SJMP_END(DCOND_LE); + emith_set_t_cond(sr, DCOND_GT); break; } goto end_op; @@ -3431,6 +3436,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_sync_t(sr); emith_tpop_carry(sr, 0); emith_adcf_r_r_r(tmp2, tmp, tmp); emith_tpush_carry(sr, 0); // keep Q1 in T for now @@ -3479,6 +3485,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_sync_t(sr); if (op & 4) { // adc emith_tpop_carry(sr, 0); emith_adcf_r_r_r(tmp, tmp3, tmp2); @@ -3494,14 +3501,12 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_bic_r_imm(sr, T); + emith_clr_t_cond(sr); if (op & 4) { emith_addf_r_r_r(tmp, tmp3, tmp2); } else emith_subf_r_r_r(tmp, tmp3, tmp2); - EMITH_SJMP_START(DCOND_VC); - emith_or_r_imm_c(DCOND_VS, 
sr, T); - EMITH_SJMP_END(DCOND_VC); + emith_set_t_cond(sr, DCOND_VS); goto end_op; case 0x0d: // DMULS.L Rm,Rn 0011nnnnmmmm1101 tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); @@ -3524,6 +3529,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 2: // SHAL Rn 0100nnnn00100000 tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_sync_t(sr); emith_tpop_carry(sr, 0); // dummy emith_lslf(tmp, tmp2, 1); emith_tpush_carry(sr, 0); @@ -3538,10 +3544,10 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) drcf.polling = drcf.loop_type = 0; } #endif - emith_bic_r_imm(sr, T); tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); + emith_clr_t_cond(sr); emith_subf_r_r_imm(tmp, tmp2, 1); - emit_or_t_if_eq(sr); + emith_set_t_cond(sr, DCOND_EQ); goto end_op; } goto default_; @@ -3552,6 +3558,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 2: // SHAR Rn 0100nnnn00100001 tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_sync_t(sr); emith_tpop_carry(sr, 0); // dummy if (op & 0x20) { emith_asrf(tmp, tmp2, 1); @@ -3562,11 +3569,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 1: // CMP/PZ Rn 0100nnnn00010001 tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_bic_r_imm(sr, T); + emith_clr_t_cond(sr); emith_cmp_r_imm(tmp, 0); - EMITH_SJMP_START(DCOND_LT); - emith_or_r_imm_c(DCOND_GE, sr, T); - EMITH_SJMP_END(DCOND_LT); + emith_set_t_cond(sr, DCOND_GE); goto end_op; } goto default_; @@ -3597,6 +3602,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } tmp3 = rcache_get_reg_arg(1, tmp, &tmp4); if (tmp == SHR_SR) { + emith_sync_t(tmp4); emith_clear_msb(tmp3, tmp4, 22); // reserved bits defined by ISA as 0 } else if (tmp3 != tmp4) emith_move_r_r(tmp3, tmp4); @@ -3610,6 +3616,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int 
tcache_id) case 0x05: // ROTR Rn 0100nnnn00000101 tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_sync_t(sr); emith_tpop_carry(sr, 0); // dummy if (op & 1) { emith_rorf(tmp, tmp2, 1); @@ -3621,6 +3628,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x25: // ROTCR Rn 0100nnnn00100101 tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, NULL); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_sync_t(sr); emith_tpop_carry(sr, 0); if (op & 1) { emith_rorcf(tmp); @@ -3631,11 +3639,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x15: // CMP/PL Rn 0100nnnn00010101 tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_bic_r_imm(sr, T); + emith_clr_t_cond(sr); emith_cmp_r_imm(tmp, 0); - EMITH_SJMP_START(DCOND_LE); - emith_or_r_imm_c(DCOND_GT, sr, T); - EMITH_SJMP_END(DCOND_LE); + emith_set_t_cond(sr, DCOND_GT); goto end_op; } goto default_; @@ -3665,6 +3671,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto default_; } if (tmp == SHR_SR) { + emith_invalidate_t(); tmp2 = emit_memhandler_read_rr(sh2, SHR_TMP, GET_Rn(), 0, 2 | MF_POSTINCR); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_write_sr(sr, tmp2); @@ -3723,9 +3730,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_get_reg_arg(0, GET_Rn(), NULL); tmp = emit_memhandler_read(0); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_bic_r_imm(sr, T); + emith_clr_t_cond(sr); emith_cmp_r_imm(tmp, 0); - emit_or_t_if_eq(sr); + emith_set_t_cond(sr, DCOND_EQ); emith_or_r_imm(tmp, 0x80); tmp2 = rcache_get_tmp_arg(1); // assuming it differs to tmp emith_move_r_r(tmp2, tmp); @@ -3753,6 +3760,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto default_; } if (tmp2 == SHR_SR) { + emith_invalidate_t(); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); emith_write_sr(sr, 
tmp); @@ -3820,6 +3828,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) break; case 0x0a: // NEGC Rm,Rn 0110nnnnmmmm1010 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + emith_sync_t(sr); emith_tpop_carry(sr, 1); emith_negcf_r_r(tmp2, tmp); emith_tpush_carry(sr, 1); @@ -3870,9 +3879,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x0800: // CMP/EQ #imm,R0 10001000iiiiiiii tmp2 = rcache_get_reg(SHR_R0, RC_GR_READ, NULL); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_bic_r_imm(sr, T); + emith_clr_t_cond(sr); emith_cmp_r_imm(tmp2, (s8)(op & 0xff)); - emit_or_t_if_eq(sr); + emith_set_t_cond(sr, DCOND_EQ); goto end_op; } goto default_; @@ -3896,9 +3905,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x0800: // TST #imm,R0 11001000iiiiiiii tmp = rcache_get_reg(SHR_R0, RC_GR_READ, NULL); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_bic_r_imm(sr, T); + emith_clr_t_cond(sr); emith_tst_r_imm(tmp, op & 0xff); - emit_or_t_if_eq(sr); + emith_set_t_cond(sr, DCOND_EQ); goto end_op; case 0x0900: // AND #imm,R0 11001001iiiiiiii tmp = rcache_get_reg(SHR_R0, RC_GR_RMW, &tmp2); @@ -3919,9 +3928,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x0c00: // TST.B #imm,@(R0,GBR) 11001100iiiiiiii tmp = emit_indirect_indexed_read(sh2, SHR_TMP, SHR_R0, SHR_GBR, 0 | drcf.polling); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_bic_r_imm(sr, T); + emith_clr_t_cond(sr); emith_tst_r_imm(tmp, op & 0xff); - emit_or_t_if_eq(sr); + emith_set_t_cond(sr, DCOND_EQ); rcache_free_tmp(tmp); goto end_op; case 0x0d00: // AND.B #imm,@(R0,GBR) 11001101iiiiiiii @@ -3955,7 +3964,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (!(op_flags[i] & OF_B_IN_DS)) { elprintf_sh2(sh2, EL_ANOMALY, "drc: illegal op %04x @ %08x", op, pc - 2); - exit(1); + exit(1); } } @@ -3973,6 +3982,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (drcf.test_irq && 
!drcf.pending_branch_direct) { sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); + emith_sync_t(sr); if (!drcf.pending_branch_indirect) emit_move_r_imm32(SHR_PC, pc); rcache_flush(); @@ -3997,6 +4007,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) (drcf.loop_type == OF_DELAY_LOOP && drcf.delay_reg >= 0))) { // idle or delay loop + emit_sync_t_to_sr(); emith_sh2_delay_loop(cycles, drcf.delay_reg); drcf.polling = drcf.loop_type = 0; } @@ -4009,11 +4020,20 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // emit condition test for conditional branch if (OP_ISBRACND(opd_b->op)) { cond = (opd_b->op == OP_BRANCH_CF) ? DCOND_EQ : DCOND_NE; - if (delay_dep_fw & BITMASK1(SHR_T)) + if (delay_dep_fw & BITMASK1(SHR_T)) { + emith_sync_t(sr); emith_tst_r_imm(sr, T_save); - else - emith_tst_r_imm(sr, T); - } + } else { + cond = emith_tst_t(sr, (opd_b->op == OP_BRANCH_CT)); + if (emith_get_t_cond() >= 0) { + if (opd_b->op == OP_BRANCH_CT) + emith_or_r_imm_c(cond, sr, T); + else + emith_bic_r_imm_c(cond, sr, T); + } + } + } else + emith_sync_t(sr); // no modification of host status/flags between here and branching! #if LINK_BRANCHES @@ -4062,6 +4082,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // branch not taken, correct cycle count if (ctaken) emith_add_r_imm(sr, ctaken << 12); + // set T bit to reflect branch not taken for OP_BRANCH_CT/CF + if (emith_get_t_cond() >= 0) // T is synced for all other cases + emith_set_t(sr, opd_b->op == OP_BRANCH_CF); drcf.pending_branch_direct = 0; if (target_pc >= base_pc && target_pc < pc) @@ -4073,6 +4096,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); + emith_sync_t(sr); rcache_clean(); #if CALL_STACK struct op_data *opd_b = (op_flags[i] & OF_DELAY_OP) ? 
opd-1 : opd; @@ -4113,6 +4137,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) s32 tmp = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(tmp); + emith_sync_t(tmp); emit_move_r_imm32(SHR_PC, pc); rcache_flush(); @@ -5553,7 +5578,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x06: // MOV.L @Rm+,Rn 0110nnnnmmmm0110 opd->dest = BITMASK2(GET_Rm(), GET_Rn()); opd->source = BITMASK2(GET_Rm(), SHR_MEM); - break; + break; case 0x00: // MOV.B @Rm,Rn 0110nnnnmmmm0000 case 0x01: // MOV.W @Rm,Rn 0110nnnnmmmm0001 case 0x02: // MOV.L @Rm,Rn 0110nnnnmmmm0010 @@ -5596,12 +5621,12 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, { case 0x0000: // MOV.B R0,@(disp,Rn) 10000000nnnndddd opd->source = BITMASK2(GET_Rm(), SHR_R0); - opd->dest = BITMASK1(SHR_MEM); + opd->dest = BITMASK1(SHR_MEM); opd->imm = (op & 0x0f); break; case 0x0100: // MOV.W R0,@(disp,Rn) 10000001nnnndddd opd->source = BITMASK2(GET_Rm(), SHR_R0); - opd->dest = BITMASK1(SHR_MEM); + opd->dest = BITMASK1(SHR_MEM); opd->imm = (op & 0x0f) * 2; break; case 0x0400: // MOV.B @(disp,Rm),R0 10000100mmmmdddd @@ -5760,7 +5785,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 0x0e00: // XOR.B #imm,@(R0,GBR) 11001110iiiiiiii case 0x0f00: // OR.B #imm,@(R0,GBR) 11001111iiiiiiii opd->source = BITMASK3(SHR_GBR, SHR_R0, SHR_MEM); - opd->dest = BITMASK1(SHR_MEM); + opd->dest = BITMASK1(SHR_MEM); opd->imm = op & 0xff; opd->cycles = 3; break; From 862f2f2defc89f3307a8c8396dd7fb14e4bbaa16 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 24 May 2019 21:52:03 +0200 Subject: [PATCH 043/174] sh2 drc, change utils abi to pass sh2 PC in arg0 (reduces compiled code size) --- cpu/drc/emit_arm.c | 4 +++ cpu/drc/emit_x86.c | 18 ++++++++++++ cpu/sh2/compiler.c | 68 +++++++++++++++++++++++++++------------------- 3 files changed, 62 insertions(+), 28 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 
b7922a984..c85a3d713 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -956,6 +956,8 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) EOP_LDR_REG_LSL(A_COND_AL, r, rs, rm, 0) #define emith_read_r_r_r_wb(r, rs, rm) \ EOP_LDR_REG_LSL_WB(A_COND_AL, r, rs, rm, 0) +#define emith_read_r_r_r_ptr_wb(r, rs, rm) \ + emith_read_r_r_r_wb(r, rs, rm) #define emith_read8_r_r_offs_c(cond, r, rs, offs) \ EOP_LDRB_IMM2(cond, r, rs, offs) @@ -1003,6 +1005,8 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) emith_write_r_r_offs_c(A_COND_AL, r, rs, offs) #define emith_write_r_r_r_wb(r, rs, rm) \ EOP_STR_REG_LSL_WB(A_COND_AL, r, rs, rm, 0) +#define emith_write_r_r_r_ptr_wb(r, rs, rm) \ + emith_write_r_r_r_wb(r, rs, rm) #define emith_ctx_read_c(cond, r, offs) \ emith_read_r_r_offs_c(cond, r, CONTEXT_REG, offs) diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 325694046..f71c5d429 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -721,19 +721,37 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; EMIT_OP_MODRM(0x8b, 0, r, 4); \ EMIT_SIB(0, rs, rm); /* mov r, [rm + rs * 1] */ \ } while (0) +#define emith_read_r_r_r_ptr(r, rs, rm) do { \ + EMIT_REX_IF(1, r, rs); \ + EMIT_OP_MODRM64(0x8b, 0, r, 4); \ + EMIT_SIB(0, rs, rm); /* mov r, [rm + rs * 1] */ \ +} while (0) #define emith_read_r_r_r_wb(r, rs, rm) do { \ emith_read_r_r_r(r, rs, rm); \ emith_add_r_r_ptr(rs, rm); \ } while (0) +#define emith_read_r_r_r_ptr_wb(r, rs, rm) do { \ + emith_read_r_r_r_ptr(r, rs, rm); \ + emith_add_r_r_ptr(rs, rm); \ +} while (0) #define emith_write_r_r_r(r, rs, rm) do { \ EMIT_OP_MODRM(0x89, 0, r, 4); \ EMIT_SIB(0, rs, rm); /* mov [rm + rs * 1], r */ \ } while (0) +#define emith_write_r_r_r_ptr(r, rs, rm) do { \ + EMIT_REX_IF(1, r, rs); \ + EMIT_OP_MODRM64(0x89, 0, r, 4); \ + EMIT_SIB(0, rs, rm); /* mov [rm + rs * 1], r */ \ +} while (0) #define emith_write_r_r_r_wb(r, rs, rm) do { \ emith_write_r_r_r(r, rs, rm); \ emith_add_r_r_ptr(rs, 
rm); \ } while (0) +#define emith_write_r_r_r_ptr_wb(r, rs, rm) do { \ + emith_write_r_r_r_ptr(r, rs, rm); \ + emith_add_r_r_ptr(rs, rm); \ +} while (0) #define emith_ctx_read(r, offs) \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index d441039be..f2a1f95ba 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -537,12 +537,12 @@ static cache_reg_t cache_regs[] = { static signed char reg_map_host[HOST_REGS]; static void REGPARM(1) (*sh2_drc_entry)(SH2 *sh2); -static void (*sh2_drc_dispatcher)(void); +static void REGPARM(1) (*sh2_drc_dispatcher)(u32 pc); #if CALL_STACK -static void REGPARM(1) (*sh2_drc_dispatcher_call)(uptr host_pc); -static void (*sh2_drc_dispatcher_return)(void); +static void REGPARM(2) (*sh2_drc_dispatcher_call)(u32 pc, uptr host_pr); +static void REGPARM(1) (*sh2_drc_dispatcher_return)(u32 pc); #endif -static void (*sh2_drc_exit)(void); +static void REGPARM(1) (*sh2_drc_exit)(u32 pc); static void (*sh2_drc_test_irq)(void); static u32 REGPARM(1) (*sh2_drc_read8)(u32 a); @@ -2862,8 +2862,10 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) drcf.polling = (drcf.loop_type == OF_POLL_LOOP ? 
MF_POLLING : 0); #endif +#if DRC_DEBUG // must update PC emit_move_r_imm32(SHR_PC, pc); +#endif rcache_clean(); #if (DRC_DEBUG & 0x10) @@ -2883,9 +2885,12 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #endif // check cycles + tmp = rcache_get_tmp_arg(0); sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); emith_cmp_r_imm(sr, 0); + emith_move_r_imm(tmp, pc); emith_jump_cond(DCOND_LE, sh2_drc_exit); + rcache_free_tmp(tmp); #if (DRC_DEBUG & 32) // block hit counter @@ -4057,13 +4062,15 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (target == NULL) { // can't resolve branch locally, make a block exit - emit_move_r_imm32(SHR_PC, target_pc); rcache_clean(); + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, target_pc); + rcache_free_tmp(tmp); #if CALL_STACK if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) { // BSR - tmp = rcache_get_tmp_arg(0); + tmp = rcache_get_tmp_arg(1); emith_call_link(tmp, sh2_drc_dispatcher_call); rcache_free_tmp(tmp); } else @@ -4098,6 +4105,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) FLUSH_CYCLES(sr); emith_sync_t(sr); rcache_clean(); + tmp = rcache_get_reg_arg(0, SHR_PC, NULL); #if CALL_STACK struct op_data *opd_b = (op_flags[i] & OF_DELAY_OP) ? 
opd-1 : opd; if (opd_b->rm == SHR_PR) { @@ -4105,7 +4113,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_jump(sh2_drc_dispatcher_return); } else if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) { // JSR/BSRF - tmp = rcache_get_tmp_arg(0); + tmp = rcache_get_tmp_arg(1); emith_call_link(tmp, sh2_drc_dispatcher_call); } else #endif @@ -4139,13 +4147,15 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) FLUSH_CYCLES(tmp); emith_sync_t(tmp); - emit_move_r_imm32(SHR_PC, pc); - rcache_flush(); + rcache_clean(); + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, pc); target = dr_prepare_ext_branch(block->entryp, pc, sh2->is_slave, tcache_id); if (target == NULL) return NULL; emith_jump_patchable(target); + rcache_invalidate(); } else rcache_flush(); emith_flush(); @@ -4160,7 +4170,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // flush pc and go back to dispatcher (this should no longer happen) dbg(1, "stray branch to %08x %p", branch_patch_pc[i], tcache_ptr); target = tcache_ptr; - emit_move_r_imm32(SHR_PC, branch_patch_pc[i]); + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, branch_patch_pc[i]); rcache_flush(); emith_jump(sh2_drc_dispatcher); } @@ -4322,33 +4333,34 @@ static void sh2_generate_utils(void) emith_pop_and_ret(arg1); emith_flush(); - // sh2_drc_exit(void) + // sh2_drc_exit(u32 pc) sh2_drc_exit = (void *)tcache_ptr; + emith_ctx_write(arg0, SHR_PC * 4); emit_do_static_regs(1, arg2); emith_sh2_drc_exit(); emith_flush(); #if CALL_STACK - // sh2_drc_dispatcher_call(uptr host_pc) + // sh2_drc_dispatcher_call(u32 pc, uptr host_pr) sh2_drc_dispatcher_call = (void *)tcache_ptr; emith_ctx_read(arg2, offsetof(SH2, rts_cache_idx)); emith_add_r_imm(arg2, 2*sizeof(void *)); emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *)); emith_ctx_write(arg2, offsetof(SH2, rts_cache_idx)); - emith_add_r_r_ptr_imm(arg1, CONTEXT_REG, offsetof(SH2, rts_cache)); - emith_ctx_read(arg3, 
offsetof(SH2, pr)); - emith_write_r_r_r_wb(arg3, arg1, arg2); - emith_write_r_r_offs_ptr(arg0, arg1, sizeof(void *)); + emith_add_r_r_ptr_imm(arg3, CONTEXT_REG, offsetof(SH2, rts_cache) + sizeof(void *)); + emith_write_r_r_r_ptr_wb(arg1, arg2, arg3); + emith_ctx_read(arg3, SHR_PR * 4); + emith_write_r_r_offs(arg3, arg2, (s8)-sizeof(void *)); emith_flush(); // FALLTHROUGH #endif - // sh2_drc_dispatcher(void) + // sh2_drc_dispatcher(u32 pc) sh2_drc_dispatcher = (void *)tcache_ptr; - emith_ctx_read(arg0, SHR_PC * 4); + emith_ctx_write(arg0, SHR_PC * 4); #if BRANCH_CACHE // check if PC is in branch target cache - emith_and_r_r_imm(arg1, arg0, (ARRAY_SIZE(sh2s->branch_cache)-1)*4); - emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg1, sizeof(void *) == 8 ? 2 : 1); + emith_and_r_r_imm(arg1, arg0, (ARRAY_SIZE(sh2s->branch_cache)-1)*8); + emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg1, sizeof(void *) == 8 ? 1 : 0); emith_read_r_r_offs(arg2, arg1, offsetof(SH2, branch_cache)); emith_cmp_r_r(arg2, arg0); EMITH_SJMP_START(DCOND_NE); @@ -4376,8 +4388,8 @@ static void sh2_generate_utils(void) emith_write_r_r_offs_c(DCOND_NE, arg3, arg2, 0); #endif emith_ctx_read_c(DCOND_NE, arg2, SHR_PC * 4); - emith_and_r_r_imm(arg1, arg2, (ARRAY_SIZE(sh2s->branch_cache)-1)*4); - emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg1, sizeof(void *) == 8 ? 2 : 1); + emith_and_r_r_imm(arg1, arg2, (ARRAY_SIZE(sh2s->branch_cache)-1)*8); + emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg1, sizeof(void *) == 8 ? 
1 : 0); emith_write_r_r_offs_c(DCOND_NE, arg2, arg1, offsetof(SH2, branch_cache)); emith_write_r_r_offs_ptr_c(DCOND_NE, RET_REG, arg1, offsetof(SH2, branch_cache) + sizeof(void *)); EMITH_SJMP_END(DCOND_EQ); @@ -4393,11 +4405,10 @@ static void sh2_generate_utils(void) emith_flush(); #if CALL_STACK - // sh2_drc_dispatcher_return(void) + // sh2_drc_dispatcher_return(u32 pc) sh2_drc_dispatcher_return = (void *)tcache_ptr; emith_ctx_read(arg2, offsetof(SH2, rts_cache_idx)); emith_add_r_r_ptr_imm(arg1, CONTEXT_REG, offsetof(SH2, rts_cache)); - emith_ctx_read(arg0, offsetof(SH2, pc)); emith_read_r_r_r_wb(arg3, arg1, arg2); emith_cmp_r_r(arg0, arg3); #if (DRC_DEBUG & 128) @@ -4462,11 +4473,11 @@ static void sh2_generate_utils(void) emith_move_r_r_ptr(arg0, CONTEXT_REG); emith_call_ctx(offsetof(SH2, irq_callback)); // vector = sh2->irq_callback(sh2, level); // obtain new PC - emith_lsl(arg0, RET_REG, 2); emith_ctx_read(arg1, SHR_VBR * 4); - emith_add_r_r(arg0, arg1); - tmp = emit_memhandler_read(2); - emith_ctx_write(tmp, SHR_PC * 4); + emith_add_r_r_r_lsl(arg0, arg1, RET_REG, 2); + emith_call(sh2_drc_read32); + if (arg0 != RET_REG) + emith_move_r_r(arg0, RET_REG); #if defined(__i386__) || defined(__x86_64__) emith_add_r_r_ptr_imm(xSP, xSP, sizeof(void *)); // fix stack #endif @@ -4480,6 +4491,7 @@ static void sh2_generate_utils(void) emith_move_r_r_ptr(CONTEXT_REG, arg0); // move ctx, arg0 emit_do_static_regs(0, arg2); emith_call(sh2_drc_test_irq); + emith_ctx_read(arg0, SHR_PC * 4); emith_jump(sh2_drc_dispatcher); emith_flush(); From e2015483a1519a8f144fe642ba0d33e1e36b6617 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 28 May 2019 23:16:45 +0200 Subject: [PATCH 044/174] 32x DMA memory copy performance optimisation --- cpu/sh2/compiler.c | 9 ++--- pico/32x/memory.c | 85 ++++++++++++++++++++++++++++++++++++++++++---- pico/32x/sh2soc.c | 23 +++++++++++++ pico/pico_int.h | 1 + tools/mkoffsets.sh | 2 +- 5 files changed, 105 insertions(+), 15 deletions(-) diff --git 
a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index f2a1f95ba..2a147a15b 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -2261,7 +2261,7 @@ static int emit_get_rom_data(SH2 *sh2, sh2_reg_e r, u32 offs, int size, u32 *val if (gconst_get(r, &a)) { a += offs; // check if rom is memory mapped (not bank switched), and address is in rom - if (dr_is_rom(a) && p32x_sh2_get_mem_ptr(a, &mask, sh2)) { + if (dr_is_rom(a) && p32x_sh2_get_mem_ptr(a, &mask, sh2) != (void *)-1) { switch (size & MF_SIZEMASK) { case 0: *val = (s8)p32x_sh2_read8(a, sh2s); break; // 8 case 1: *val = (s16)p32x_sh2_read16(a, sh2s); break; // 16 @@ -4896,12 +4896,7 @@ void sh2_drc_flush_all(void) void sh2_drc_mem_setup(SH2 *sh2) { - // fill the convenience pointers - sh2->p_bios = sh2->is_slave ? Pico32xMem->sh2_rom_s.w : Pico32xMem->sh2_rom_m.w; - sh2->p_da = sh2->data_array; - sh2->p_sdram = Pico32xMem->sdram; - sh2->p_rom = Pico.rom; - // sh2->p_dram filled in dram bank switching + // fill the DRC-only convenience pointers sh2->p_drcblk_da = Pico32xMem->drcblk_da[!!sh2->is_slave]; sh2->p_drcblk_ram = Pico32xMem->drcblk_ram; } diff --git a/pico/32x/memory.c b/pico/32x/memory.c index a1ef42c2b..70287a2cb 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -1855,17 +1855,15 @@ void *p32x_sh2_get_mem_ptr(u32 a, u32 *mask, SH2 *sh2) { const sh2_memmap *mm = sh2->read8_map; void *ret = (void *)-1; - u32 am; - mm += a >> SH2_READ_SHIFT; - am = a & ((1 << SH2_READ_SHIFT)-1); - if (!map_flag_set(mm->addr) && !(am & ~mm->mask)) { + mm += SH2MAP_ADDR2OFFS_R(a); + if (!map_flag_set(mm->addr)) { // directly mapped memory (SDRAM, ROM, data array) ret = (void *)(mm->addr << 1); *mask = mm->mask; } else if ((a & ~0x7ff) == 0) { // BIOS, has handler function since it shares its segment with I/O - ret = sh2->is_slave ? Pico32xMem->sh2_rom_s.w : Pico32xMem->sh2_rom_m.w; + ret = sh2->p_bios; *mask = 0x7ff; } else if ((a & 0xc6000000) == 0x02000000) { // banked ROM. 
Return bank address @@ -1877,6 +1875,75 @@ void *p32x_sh2_get_mem_ptr(u32 a, u32 *mask, SH2 *sh2) return ret; } +int p32x_sh2_memcpy(u32 dst, u32 src, int count, int size, SH2 *sh2) +{ + u32 mask; + void *ps, *pd; + int len, i; + + // check if src and dst points to memory (rom/sdram/dram/da) + if ((pd = p32x_sh2_get_mem_ptr(dst, &mask, sh2)) == (void *)-1) + return 0; + if ((ps = p32x_sh2_get_mem_ptr(src, &mask, sh2)) == (void *)-1) + return 0; + ps += src & mask; + len = count * size; + + // DRAM in byte access is always in overwrite mode + if (pd == sh2->p_dram && size == 1) + dst |= 0x20000; + + // align dst to halfword + if (dst & 1) { + p32x_sh2_write8(dst, *(u8 *)((uptr)ps ^ 1), sh2); + ps++, dst++, len --; + } + + // copy data + if ((uptr)ps & 1) { + // unaligned, use halfword copy mode to reduce memory bandwidth + u16 *sp = (u16 *)(ps - 1); + u16 dl, dh = *sp++; + for (i = 0; i < (len & ~1); i += 2, dst += 2, sp++) { + dl = dh, dh = *sp; + p32x_sh2_write16(dst, (dh >> 8) | (dl << 8), sh2); + } + if (len & 1) + p32x_sh2_write8(dst, dh, sh2); + } else { + // dst and src at least halfword aligned + u16 *sp = (u16 *)ps; + // align dst to word + if ((dst & 2) && len >= 2) { + p32x_sh2_write16(dst, *sp++, sh2); + dst += 2, len -= 2; + } + if ((uptr)sp & 2) { + // halfword copy, using word writes to reduce memory bandwidth + u16 dl, dh; + for (i = 0; i < (len & ~3); i += 4, dst += 4, sp += 2) { + dl = sp[0], dh = sp[1]; + p32x_sh2_write32(dst, (dl << 16) | dh, sh2); + } + } else { + // word copy + u32 d; + for (i = 0; i < (len & ~3); i += 4, dst += 4, sp += 2) { + d = *(u32 *)sp; + p32x_sh2_write32(dst, (d << 16) | (d >> 16), sh2); + } + } + if (len & 2) { + p32x_sh2_write16(dst, *sp++, sh2); + dst += 2; + } + if (len & 1) + p32x_sh2_write8(dst, *sp >> 8, sh2); + } + + return count; +} + // ----------------------------------------------------------------- static void z80_md_bank_write_32x(unsigned int a, unsigned char d) @@ -2107,8 +2174,12 @@ void 
Pico32xSwapDRAM(int b) ssh2_read16_map[0x04/2].addr = ssh2_read16_map[0x24/2].addr = ssh2_read32_map[0x04/2].addr = ssh2_read32_map[0x24/2].addr = MAP_MEMORY(Pico32xMem->dram[b]); - msh2.p_dram = ssh2.p_dram = Pico32xMem->dram[b]; // DRC conveniance ptr - msh2.p_rom = ssh2.p_rom = Pico.rom; + // convenience ptrs + msh2.p_sdram = ssh2.p_sdram = Pico32xMem->sdram; + msh2.p_dram = ssh2.p_dram = Pico32xMem->dram[b]; + msh2.p_rom = ssh2.p_rom = Pico.rom; + msh2.p_bios = Pico32xMem->sh2_rom_m.w; msh2.p_da = msh2.data_array; + ssh2.p_bios = Pico32xMem->sh2_rom_s.w; ssh2.p_da = ssh2.data_array; } static void bank_switch_rom_sh2(void) diff --git a/pico/32x/sh2soc.c b/pico/32x/sh2soc.c index dd61a93bb..66bdc478d 100644 --- a/pico/32x/sh2soc.c +++ b/pico/32x/sh2soc.c @@ -129,6 +129,24 @@ static void dmac_transfer_one(SH2 *sh2, struct dma_chan *chan) chan->sar += size; } +// optimization for copying around memory with SH2 DMA +static void dmac_memcpy(struct dma_chan *chan, SH2 *sh2) +{ + u32 size = (chan->chcr >> 10) & 3, up = chan->chcr & (1 << 14); + int count; + + if (!up || chan->tcr < 4) + return; + if (size == 3) size = 2; // 4-word xfer mode still counts in words + // XXX check TCR being a multiple of 4 in 4-word xfer mode? + // XXX check alignment of sar/dar, generating a bus error if unaligned? + count = p32x_sh2_memcpy(chan->dar, chan->sar, chan->tcr, 1 << size, sh2); + + chan->sar += count << size; + chan->dar += count << size; + chan->tcr -= count; +} + // DMA trigger by SH2 register write static void dmac_trigger(SH2 *sh2, struct dma_chan *chan) { @@ -139,6 +157,11 @@ static void dmac_trigger(SH2 *sh2, struct dma_chan *chan) if (chan->chcr & DMA_AR) { // auto-request transfer sh2->state |= SH2_STATE_SLEEP; + if ((((chan->chcr >> 12) ^ (chan->chcr >> 14)) & 3) == 0 && + (((chan->chcr >> 14) ^ (chan->chcr >> 15)) & 1) == 1) { + // SM == DM and either DM0 or DM1 are set. 
check for mem to mem copy + dmac_memcpy(chan, sh2); + } while ((int)chan->tcr > 0) dmac_transfer_one(sh2, chan); dmac_transfer_complete(sh2, chan); diff --git a/pico/pico_int.h b/pico/pico_int.h index 31fc702ce..36b36144d 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -937,6 +937,7 @@ unsigned int REGPARM(3) p32x_sh2_poll_memory16(unsigned int a, unsigned int d, S unsigned int REGPARM(3) p32x_sh2_poll_memory32(unsigned int a, unsigned int d, SH2 *sh2); void *p32x_sh2_get_mem_ptr(unsigned int a, unsigned int *mask, SH2 *sh2); void p32x_sh2_poll_event(SH2 *sh2, unsigned int flags, unsigned int m68k_cycles); +int p32x_sh2_memcpy(unsigned int dst, unsigned int src, int count, int size, SH2 *sh2); // 32x/draw.c void PicoDrawSetOutFormat32x(pdso_t which, int use_32x_line_mode); diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index 461fbfa7d..a573f7a43 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ -89,7 +89,7 @@ get_define OFS_PMEM32x_ Pico32xMem pal_native ; echo "$line" >>$fn get_define OFS_SH2_ SH2_ is_slave ; echo "$line" >>$fn get_define OFS_SH2_ SH2_ p_bios ; echo "$line" >>$fn get_define OFS_SH2_ SH2_ p_da ; echo "$line" >>$fn -get_define OFS_SH2_ SH2_ p_sdram ; echo "$line" >>$fn +get_define OFS_SH2_ SH2_ p_sdram ; echo "$line" >>$fn get_define OFS_SH2_ SH2_ p_rom ; echo "$line" >>$fn get_define OFS_SH2_ SH2_ p_dram ; echo "$line" >>$fn get_define OFS_SH2_ SH2_ p_drcblk_da ; echo "$line" >>$fn From 721f9c33856e1e115f50f55acfadd60472231ff0 Mon Sep 17 00:00:00 2001 From: kub Date: Mon, 24 Jun 2019 20:09:15 +0200 Subject: [PATCH 045/174] sh2 drc, x86 code emitter: use x86-64 registers R8-R15 --- cpu/drc/emit_x86.c | 351 ++++++++++++++++++++++++++++----------------- cpu/sh2/compiler.c | 64 +++++---- 2 files changed, 254 insertions(+), 161 deletions(-) diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index f71c5d429..652b49898 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -13,9 +13,9 @@ */ #include -enum { xAX = 0, xCX, xDX, 
xBX, xSP, xBP, xSI, xDI }; +enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common + xR8, xR9, xR10, xR11, xR12, xR13, xR14, xR15 }; // x86-64 only -#define HOST_REGS 8 #define CONTEXT_REG xBP #define RET_REG xAX @@ -65,7 +65,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define EMIT_OP(op) do { \ COUNT_OP; \ - EMIT(op, u8); \ + if ((op) > 0xff) EMIT((op) >> 8, u8); \ + EMIT((u8)(op), u8); \ } while (0) #define EMIT_MODRM(mod, r, rm) do { \ @@ -110,50 +111,70 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; EMIT_PTR(ptr + 1, (tcache_ptr - (ptr+2)), u8) // _r_r -#define emith_move_r_r(dst, src) \ - EMIT_OP_MODRM(0x8b, 3, dst, src) +#define emith_move_r_r(dst, src) do {\ + EMIT_REX_IF(0, dst, src); \ + EMIT_OP_MODRM64(0x8b, 3, dst, src); \ +} while (0) #define emith_move_r_r_ptr(dst, src) do { \ EMIT_REX_IF(1, dst, src); \ EMIT_OP_MODRM64(0x8b, 3, dst, src); \ } while (0) -#define emith_add_r_r(d, s) \ - EMIT_OP_MODRM(0x01, 3, s, d) +#define emith_add_r_r(d, s) do { \ + EMIT_REX_IF(0, s, d); \ + EMIT_OP_MODRM64(0x01, 3, s, d); \ +} while (0) #define emith_add_r_r_ptr(d, s) do { \ EMIT_REX_IF(1, s, d); \ EMIT_OP_MODRM64(0x01, 3, s, d); \ } while (0) -#define emith_sub_r_r(d, s) \ - EMIT_OP_MODRM(0x29, 3, s, d) +#define emith_sub_r_r(d, s) do {\ + EMIT_REX_IF(0, s, d); \ + EMIT_OP_MODRM64(0x29, 3, s, d); \ +} while (0) -#define emith_adc_r_r(d, s) \ - EMIT_OP_MODRM(0x11, 3, s, d) +#define emith_adc_r_r(d, s) do { \ + EMIT_REX_IF(0, s, d); \ + EMIT_OP_MODRM64(0x11, 3, s, d); \ +} while (0) -#define emith_sbc_r_r(d, s) \ - EMIT_OP_MODRM(0x19, 3, s, d) /* SBB */ +#define emith_sbc_r_r(d, s) do { \ + EMIT_REX_IF(0, s, d); \ + EMIT_OP_MODRM64(0x19, 3, s, d); /* SBB */ \ +} while (0) -#define emith_or_r_r(d, s) \ - EMIT_OP_MODRM(0x09, 3, s, d) +#define emith_or_r_r(d, s) do { \ + EMIT_REX_IF(0, s, d); \ + EMIT_OP_MODRM64(0x09, 3, s, d); \ +} while (0) -#define emith_and_r_r(d, s) \ - EMIT_OP_MODRM(0x21, 3, s, d) +#define emith_and_r_r(d, 
s) do { \ + EMIT_REX_IF(0, s, d); \ + EMIT_OP_MODRM64(0x21, 3, s, d); \ +} while (0) -#define emith_eor_r_r(d, s) \ - EMIT_OP_MODRM(0x31, 3, s, d) /* XOR */ +#define emith_eor_r_r(d, s) do { \ + EMIT_REX_IF(0, s, d); \ + EMIT_OP_MODRM64(0x31, 3, s, d); /* XOR */ \ +} while (0) -#define emith_tst_r_r(d, s) \ - EMIT_OP_MODRM(0x85, 3, s, d) /* TEST */ +#define emith_tst_r_r(d, s) do { \ + EMIT_REX_IF(0, s, d); \ + EMIT_OP_MODRM64(0x85, 3, s, d); /* TEST */ \ +} while (0) #define emith_tst_r_r_ptr(d, s) do { \ EMIT_REX_IF(1, s, d); \ EMIT_OP_MODRM64(0x85, 3, s, d); /* TEST */ \ } while (0) -#define emith_cmp_r_r(d, s) \ - EMIT_OP_MODRM(0x39, 3, s, d) +#define emith_cmp_r_r(d, s) do { \ + EMIT_REX_IF(0, s, d); \ + EMIT_OP_MODRM64(0x39, 3, s, d); \ +} while (0) // fake teq - test equivalence - get_flags(d ^ s) #define emith_teq_r_r(d, s) do { \ @@ -165,7 +186,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_mvn_r_r(d, s) do { \ if (d != s) \ emith_move_r_r(d, s); \ - EMIT_OP_MODRM(0xf7, 3, 2, d); /* NOT d */ \ + EMIT_REX_IF(0, 0, d); \ + EMIT_OP_MODRM64(0xf7, 3, 2, d); /* NOT d */ \ } while (0) #define emith_negc_r_r(d, s) do { \ @@ -179,7 +201,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_neg_r_r(d, s) do { \ if (d != s) \ emith_move_r_r(d, s); \ - EMIT_OP_MODRM(0xf7, 3, 3, d); /* NEG d */ \ + EMIT_REX_IF(0, 0, d); \ + EMIT_OP_MODRM64(0xf7, 3, 3, d); /* NEG d */ \ } while (0) // _r_r_r @@ -325,17 +348,18 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; // _r_imm #define emith_move_r_imm(r, imm) do { \ - EMIT_OP(0xb8 + (r)); \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP(0xb8 + ((r)&7)); \ EMIT(imm, u32); \ } while (0) #define emith_move_r_ptr_imm(r, imm) do { \ - if ((uint64_t)(imm) <= UINT32_MAX) \ + if ((uintptr_t)(imm) <= UINT32_MAX) \ emith_move_r_imm(r, (uintptr_t)(imm)); \ else { \ EMIT_REX_IF(1, 0, r); \ - EMIT_OP(0xb8 + (r)); \ - EMIT((uint64_t)(imm), uint64_t); \ + EMIT_OP(0xb8 + ((r)&7)); \ + 
EMIT((uintptr_t)(imm), uint64_t); \ } \ } while (0) @@ -343,7 +367,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; emith_move_r_imm(r, (u32)(signed int)(signed char)(imm)) #define emith_arith_r_imm(op, r, imm) do { \ - EMIT_OP_MODRM(0x81, 3, op, r); \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP_MODRM64(0x81, 3, op, r); \ EMIT(imm, u32); \ } while (0) @@ -372,7 +397,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; emith_arith_r_imm(7, r, imm) #define emith_tst_r_imm(r, imm) do { \ - EMIT_OP_MODRM(0xf7, 3, 0, r); \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP_MODRM64(0xf7, 3, 0, r); \ EMIT(imm, u32); \ } while (0) @@ -442,22 +468,14 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; // _r_r_imm - use lea #define emith_add_r_r_imm(d, s, imm) do { \ - assert(s != xSP); \ - EMIT_OP_MODRM(0x8d, 2, d, s); /* lea */ \ + EMIT_REX_IF(0, d, s); \ + emith_deref_modrm(0x8d, 2, d, s); \ EMIT(imm, s32); \ } while (0) #define emith_add_r_r_ptr_imm(d, s, imm) do { \ - if ((s) != xSP) { \ - EMIT_REX_IF(1, d, s); \ - EMIT_OP_MODRM64(0x8d, 2, d, s); /* lea */ \ - } \ - else { \ - if (d != s) \ - emith_move_r_r_ptr(d, s); \ - EMIT_REX_IF(1, 0, d); \ - EMIT_OP_MODRM64(0x81, 3, 0, d); /* add */ \ - } \ + EMIT_REX_IF(1, d, s); \ + emith_deref_modrm(0x8d, 2, d, s); \ EMIT(imm, s32); \ } while (0) @@ -493,7 +511,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_shift(op, d, s, cnt) do { \ if (d != s) \ emith_move_r_r(d, s); \ - EMIT_OP_MODRM(0xc1, 3, op, d); \ + EMIT_REX_IF(0, 0, d); \ + EMIT_OP_MODRM64(0xc1, 3, op, d); \ EMIT(cnt, u8); \ } while (0) @@ -512,26 +531,36 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_ror(d, s, cnt) \ emith_shift(1, d, s, cnt) -#define emith_rolc(r) \ - EMIT_OP_MODRM(0xd1, 3, 2, r) +#define emith_rolc(r) do { \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP_MODRM64(0xd1, 3, 2, r); \ +} while (0) -#define emith_rorc(r) \ - EMIT_OP_MODRM(0xd1, 3, 3, r) +#define emith_rorc(r) do { \ + EMIT_REX_IF(0, 0, r); \ + 
EMIT_OP_MODRM64(0xd1, 3, 3, r); \ +} while (0) // misc -#define emith_push(r) \ - EMIT_OP(0x50 + (r)) +#define emith_push(r) do { \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP(0x50 + ((r)&7)); \ +} while (0) #define emith_push_imm(imm) do { \ EMIT_OP(0x68); \ EMIT(imm, u32); \ } while (0) -#define emith_pop(r) \ - EMIT_OP(0x58 + (r)) +#define emith_pop(r) do { \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP(0x58 + ((r)&7)); \ +} while (0) -#define emith_neg_r(r) \ - EMIT_OP_MODRM(0xf7, 3, 3, r) +#define emith_neg_r(r) do { \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP_MODRM64(0xf7, 3, 3, r); \ +} while (0) #define emith_clear_msb(d, s, count) do { \ u32 t = (u32)-1; \ @@ -553,8 +582,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_setc(r) do { \ assert(is_abcdx(r)); \ - EMIT_OP(0x0f); \ - EMIT_OP_MODRM(0x92, 3, 0, r); /* SETC r */ \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP_MODRM64(0x0f92, 3, 0, r); /* SETC r */ \ } while (0) // XXX: stupid mess @@ -572,9 +601,12 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; emith_move_r_r(xAX, s1); \ rmr = s2; \ } \ - EMIT_OP_MODRM(0xf7, 3, op, rmr); /* xMUL rmr */ \ - if (dlo != xAX) \ - EMIT_OP(0x90 + (dlo)); /* XCHG eax, dlo */ \ + EMIT_REX_IF(0, 0, rmr); \ + EMIT_OP_MODRM64(0xf7, 3, op, rmr); /* xMUL rmr */ \ + if (dlo != xAX) { \ + EMIT_REX_IF(0, 0, dlo); \ + EMIT_OP(0x90 + ((dlo)&7)); /* XCHG eax, dlo */ \ + } \ if (dhi != xDX && dhi != -1 && !(dhi == xAX && dlo == xDX)) \ emith_move_r_r(dhi, (dlo == xDX ? 
xAX : xDX)); \ if (dlo != xDX && dhi != xDX) \ @@ -589,19 +621,30 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_mul_s64(dlo, dhi, s1, s2) \ emith_mul_(5, dlo, dhi, s1, s2) /* IMUL */ -#define emith_mul(d, s1, s2) \ - emith_mul_(4, d, -1, s1, s2) +#define emith_mul(d, s1, s2) do { \ + if (d == s1) { \ + EMIT_REX_IF(0, d, s2); \ + EMIT_OP_MODRM64(0x0faf, 3, d, s2); \ + } else if (d == s2) { \ + EMIT_REX_IF(0, d, s1); \ + EMIT_OP_MODRM64(0x0faf, 3, d, s1); \ + } else { \ + emith_move_r_r(d, s1); \ + EMIT_REX_IF(0, d, s2); \ + EMIT_OP_MODRM64(0x0faf, 3, d, s2); \ + } \ +} while (0) // (dlo,dhi) += signed(s1) * signed(s2) #define emith_mula_s64(dlo, dhi, s1, s2) do { \ emith_push(dhi); \ emith_push(dlo); \ emith_mul_(5, dlo, dhi, s1, s2); \ - EMIT_OP_MODRM(0x03, 0, dlo, 4); \ - EMIT_SIB(0, 4, 4); /* add dlo, [xsp] */ \ - EMIT_OP_MODRM(0x13, 1, dhi, 4); \ - EMIT_SIB(0, 4, 4); \ - EMIT(sizeof(void *), u8); /* adc dhi, [xsp+{4,8}] */ \ + EMIT_REX_IF(0, dlo, xSP); \ + emith_deref_modrm(0x03, 0, dlo, xSP); /* add dlo, [xsp] */ \ + EMIT_REX_IF(0, dhi, xSP); \ + emith_deref_modrm(0x13, 1, dhi, xSP); /* adc dhi, [xsp+{4,8}] */ \ + EMIT(sizeof(void *), u8); \ emith_add_r_r_ptr_imm(xSP, xSP, sizeof(void *) * 2); \ } while (0) @@ -631,100 +674,114 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_rolcf emith_rolc #define emith_rorcf emith_rorc +#define emith_deref_modrm(op, m, r, rs) do { \ + if (((rs) & 7) == 5 && m == 0) { /* xBP,xR13 not in mod 0, use mod 1 */\ + EMIT_OP_MODRM64(op, 1, r, rs); \ + EMIT(0, u8); \ + } else if (((rs) & 7) == 4) { /* xSP,xR12 must use SIB */ \ + EMIT_OP_MODRM64(op, m, r, 4); \ + EMIT_SIB64(0, 4, rs); \ + } else \ + EMIT_OP_MODRM64(op, m, r, rs); \ +} while (0) + #define emith_deref_op(op, r, rs, offs) do { \ /* mov r <-> [ebp+#offs] */ \ - if (abs(offs) >= 0x80) { \ - EMIT_OP_MODRM64(op, 2, r, rs); \ + if ((offs) == 0) { \ + emith_deref_modrm(op, 0, r, rs); \ + } else if (abs(offs) >= 0x80) { \ + 
emith_deref_modrm(op, 2, r, rs); \ EMIT(offs, u32); \ } else { \ - EMIT_OP_MODRM64(op, 1, r, rs); \ + emith_deref_modrm(op, 1, r, rs); \ EMIT((u8)offs, u8); \ } \ } while (0) -#define is_abcdx(r) (xAX <= (r) && (r) <= xDX) +#define is_abcdx(r) !((r) & ~0x3) -#define emith_read_r_r_offs(r, rs, offs) \ - emith_deref_op(0x8b, r, rs, offs) -#define emith_read_r_r_offs_ptr(r, rs, offs) \ +#define emith_read_r_r_offs(r, rs, offs) do { \ + EMIT_REX_IF(0, r, rs); \ + emith_deref_op(0x8b, r, rs, offs); \ +} while (0) +#define emith_read_r_r_offs_ptr(r, rs, offs) do { \ EMIT_REX_IF(1, r, rs); \ - emith_deref_op(0x8b, r, rs, offs) + emith_deref_op(0x8b, r, rs, offs); \ +} while (0) -#define emith_write_r_r_offs(r, rs, offs) \ - emith_deref_op(0x89, r, rs, offs) -#define emith_write_r_r_offs_ptr(r, rs, offs) \ +#define emith_write_r_r_offs(r, rs, offs) do { \ + EMIT_REX_IF(0, r, rs); \ + emith_deref_op(0x89, r, rs, offs); \ +} while (0) +#define emith_write_r_r_offs_ptr(r, rs, offs) do { \ EMIT_REX_IF(1, r, rs); \ - emith_deref_op(0x89, r, rs, offs) + emith_deref_op(0x89, r, rs, offs); \ +} while (0) #define emith_read8_r_r_offs(r, rs, offs) do { \ - EMIT(0x0f, u8); \ - emith_deref_op(0xb6, r, rs, offs); \ + EMIT_REX_IF(0, r, rs); \ + emith_deref_op(0x0fb6, r, rs, offs); \ } while (0) #define emith_read8s_r_r_offs(r, rs, offs) do { \ - EMIT(0x0f, u8); \ - emith_deref_op(0xbe, r, rs, offs); \ + EMIT_REX_IF(0, r, rs); \ + emith_deref_op(0x0fbe, r, rs, offs); \ } while (0) -// note: don't use prefixes on this #define emith_write8_r_r_offs(r, rs, offs) do {\ - int r_ = r; \ - if (!is_abcdx(r)) { \ - r_ = rcache_get_tmp(); \ - emith_move_r_r(r_, r); \ - } \ - emith_deref_op(0x88, r_, rs, offs); \ - if ((r) != r_) \ - rcache_free_tmp(r_); \ + EMIT_REX_IF(0, r, rs); \ + emith_deref_op(0x88, r, rs, offs); \ } while (0) #define emith_read16_r_r_offs(r, rs, offs) do { \ - EMIT(0x0f, u8); \ - emith_deref_op(0xb7, r, rs, offs); \ + EMIT_REX_IF(0, r, rs); \ + emith_deref_op(0x0fb7, r, rs, 
offs); \ } while (0) #define emith_read16s_r_r_offs(r, rs, offs) do { \ - EMIT(0x0f, u8); \ - emith_deref_op(0xbf, r, rs, offs); \ + EMIT_REX_IF(0, r, rs); \ + emith_deref_op(0x0fbf, r, rs, offs); \ } while (0) #define emith_write16_r_r_offs(r, rs, offs) do { \ - EMIT(0x66, u8); \ - emith_write_r_r_offs(r, rs, offs); \ + EMIT(0x66, u8); /* Intel SDM Vol 2a: REX must be closest to opcode */ \ + EMIT_REX_IF(0, r, rs); \ + emith_deref_op(0x89, r, rs, offs); \ } while (0) #define emith_read8_r_r_r(r, rs, rm) do { \ - EMIT(0x0f, u8); \ - EMIT_OP_MODRM(0xb6, 0, r, 4); \ - EMIT_SIB(0, rs, rm); /* mov r, [rm + rs * 1] */ \ + EMIT_XREX_IF(0, r, rm, rs); \ + EMIT_OP_MODRM64(0x0fb6, 0, r, 4); \ + EMIT_SIB64(0, rs, rm); /* mov r, [rm + rs * 1] */ \ } while (0) #define emith_read8s_r_r_r(r, rs, rm) do { \ - EMIT(0x0f, u8); \ - EMIT_OP_MODRM(0xbe, 0, r, 4); \ - EMIT_SIB(0, rs, rm); /* mov r, [rm + rs * 1] */ \ + EMIT_XREX_IF(0, r, rm, rs); \ + EMIT_OP_MODRM64(0x0fbe, 0, r, 4); \ + EMIT_SIB64(0, rs, rm); /* mov r, [rm + rs * 1] */ \ } while (0) #define emith_read16_r_r_r(r, rs, rm) do { \ - EMIT(0x0f, u8); \ - EMIT_OP_MODRM(0xb7, 0, r, 4); \ - EMIT_SIB(0, rs, rm); /* mov r, [rm + rs * 1] */ \ + EMIT_XREX_IF(0, r, rm, rs); \ + EMIT_OP_MODRM64(0x0fb7, 0, r, 4); \ + EMIT_SIB64(0, rs, rm); /* mov r, [rm + rs * 1] */ \ } while (0) #define emith_read16s_r_r_r(r, rs, rm) do { \ - EMIT(0x0f, u8); \ - EMIT_OP_MODRM(0xbf, 0, r, 4); \ - EMIT_SIB(0, rs, rm); /* mov r, [rm + rs * 1] */ \ + EMIT_XREX_IF(0, r, rm, rs); \ + EMIT_OP_MODRM64(0x0fbf, 0, r, 4); \ + EMIT_SIB64(0, rs, rm); /* mov r, [rm + rs * 1] */ \ } while (0) #define emith_read_r_r_r(r, rs, rm) do { \ - EMIT_OP_MODRM(0x8b, 0, r, 4); \ - EMIT_SIB(0, rs, rm); /* mov r, [rm + rs * 1] */ \ + EMIT_XREX_IF(0, r, rm, rs); \ + EMIT_OP_MODRM64(0x8b, 0, r, 4); \ + EMIT_SIB64(0, rs, rm); /* mov r, [rm + rs * 1] */ \ } while (0) #define emith_read_r_r_r_ptr(r, rs, rm) do { \ - EMIT_REX_IF(1, r, rs); \ + EMIT_XREX_IF(1, r, rm, rs); \ 
EMIT_OP_MODRM64(0x8b, 0, r, 4); \ - EMIT_SIB(0, rs, rm); /* mov r, [rm + rs * 1] */ \ + EMIT_SIB64(0, rs, rm); /* mov r, [rm + rs * 1] */ \ } while (0) #define emith_read_r_r_r_wb(r, rs, rm) do { \ emith_read_r_r_r(r, rs, rm); \ @@ -736,13 +793,14 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; } while (0) #define emith_write_r_r_r(r, rs, rm) do { \ - EMIT_OP_MODRM(0x89, 0, r, 4); \ - EMIT_SIB(0, rs, rm); /* mov [rm + rs * 1], r */ \ + EMIT_XREX_IF(0, r, rm, rs); \ + EMIT_OP_MODRM64(0x89, 0, r, 4); \ + EMIT_SIB64(0, rs, rm); /* mov [rm + rs * 1], r */ \ } while (0) #define emith_write_r_r_r_ptr(r, rs, rm) do { \ - EMIT_REX_IF(1, r, rs); \ + EMIT_XREX_IF(1, r, rm, rs); \ EMIT_OP_MODRM64(0x89, 0, r, 4); \ - EMIT_SIB(0, rs, rm); /* mov [rm + rs * 1], r */ \ + EMIT_SIB64(0, rs, rm); /* mov [rm + rs * 1], r */ \ } while (0) #define emith_write_r_r_r_wb(r, rs, rm) do { \ emith_write_r_r_r(r, rs, rm); \ @@ -796,8 +854,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_jump_cond(cond, ptr) do { \ u32 disp = (u8 *)(ptr) - ((u8 *)tcache_ptr + 6); \ - EMIT(0x0f, u8); \ - EMIT_OP(0x80 | (cond)); \ + EMIT_OP(0x0f80 | (cond)); \ EMIT(disp, u32); \ } while (0) @@ -924,15 +981,20 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #ifdef __x86_64__ +#define HOST_REGS 16 #define PTR_SCALE 3 #define NA_TMP_REG xAX // non-arg tmp from reg_temp[] -#define EMIT_REX_IF(w, r, rm) do { \ - int r_ = (r) > 7 ? 1 : 0; \ - int rm_ = (rm) > 7 ? 1 : 0; \ - if ((w) | r_ | rm_) \ - EMIT_REX(1, r_, 0, rm_); \ +#define EMIT_XREX_IF(w, r, rm, rs) do { \ + int xr_ = (r) > 7 ? 1 : 0; \ + int xb_ = (rm) > 7 ? 1 : 0; \ + int xx_ = (rs) > 7 ? 
1 : 0; \ + if ((w) | xr_ | xx_ | xb_) \ + EMIT_REX(w, xr_, xx_, xb_); \ } while (0) + +#define EMIT_REX_IF(w, r, rm) \ + EMIT_XREX_IF(w, r, rm, 0) #ifndef _WIN32 @@ -947,11 +1009,19 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_sh2_drc_entry() do { \ emith_push(xBX); \ emith_push(xBP); \ + emith_push(xR12); \ + emith_push(xR13); \ + emith_push(xR14); \ + emith_push(xR15); \ emith_push(xSI); /* to align */ \ } while (0) #define emith_sh2_drc_exit() do { \ emith_pop(xSI); \ + emith_pop(xR15); \ + emith_pop(xR14); \ + emith_pop(xR13); \ + emith_pop(xR12); \ emith_pop(xBP); \ emith_pop(xBX); \ emith_ret(); \ @@ -963,22 +1033,30 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; switch (arg) { \ case 0: rd = xCX; break; \ case 1: rd = xDX; break; \ - case 2: rd = 8; break; \ - default: rd = 9; break; \ + case 2: rd = xR8; break; \ + default: rd = xR9; break; \ } #define emith_sh2_drc_entry() do { \ emith_push(xBX); \ emith_push(xBP); \ + emith_push(xR12); \ + emith_push(xR13); \ + emith_push(xR14); \ + emith_push(xR15); \ emith_push(xSI); \ emith_push(xDI); \ - emith_add_r_r_ptr_imm(xSP, xSP, -8*5); \ + emith_add_r_r_ptr_imm(xSP, xSP, -8*5); /* align + ABI param area */ \ } while (0) #define emith_sh2_drc_exit() do { \ emith_add_r_r_ptr_imm(xSP, xSP, 8*5); \ emith_pop(xDI); \ emith_pop(xSI); \ + emith_pop(xR15); \ + emith_pop(xR14); \ + emith_pop(xR13); \ + emith_pop(xR12); \ emith_pop(xBP); \ emith_pop(xBX); \ emith_ret(); \ @@ -988,6 +1066,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #else // !__x86_64__ +#define HOST_REGS 8 #define PTR_SCALE 2 #define NA_TMP_REG xBX // non-arg tmp from reg_temp[] @@ -995,6 +1074,11 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; assert((u32)(r) < 8u); \ assert((u32)(rm) < 8u); \ } while (0) +#define EMIT_XREX_IF(w, r, rs, rm) do { \ + assert((u32)(r) < 8u); \ + assert((u32)(rs) < 8u); \ + assert((u32)(rm) < 8u); \ +} while (0) #define host_arg2reg(rd, arg) \ switch (arg) { \ @@ 
-1039,15 +1123,16 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; #define emith_sh2_rcall(a, tab, func, mask) do { \ emith_lsr(mask, a, SH2_READ_SHIFT); \ - EMIT_REX_IF(1, mask, tab); \ + EMIT_XREX_IF(1, tab, tab, mask); \ EMIT_OP_MODRM64(0x8d, 0, tab, 4); \ EMIT_SIB64(PTR_SCALE, mask, tab); /* lea tab, [tab + mask * {4,8}] */ \ - EMIT_REX_IF(1, mask, tab); \ + EMIT_XREX_IF(1, tab, tab, mask); \ EMIT_OP_MODRM64(0x8d, 0, tab, 4); \ EMIT_SIB64(PTR_SCALE, mask, tab); /* lea tab, [tab + mask * {4,8}] */ \ - EMIT_REX_IF(1, func, tab); \ - EMIT_OP_MODRM64(0x8b, 0, func, tab); /* mov func, [tab] */ \ - EMIT_OP_MODRM64(0x8b, 1, mask, tab); \ + EMIT_REX_IF(1, func, tab); \ + emith_deref_modrm(0x8b, 0, func, tab); /* mov func, [tab] */ \ + EMIT_REX_IF(0, mask, tab); \ + emith_deref_modrm(0x8b, 1, mask, tab); \ EMIT(1 << PTR_SCALE, u8); /* mov mask, [tab + {4,8}] */ \ emith_add_r_r_ptr(func, func); \ } while (0) @@ -1056,7 +1141,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI }; int arg2_; \ host_arg2reg(arg2_, 2); \ emith_lsr(func, a, SH2_WRITE_SHIFT); /* tmp = a >> WRT_SHIFT */ \ - EMIT_REX_IF(1, func, tab); \ + EMIT_XREX_IF(1, func, tab, func); \ EMIT_OP_MODRM64(0x8b, 0, func, 4); \ EMIT_SIB64(PTR_SCALE, func, tab); /* mov tmp, [tab + tmp * {4,8}] */ \ emith_move_r_r_ptr(arg2_, CONTEXT_REG); \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 2a147a15b..9932ce6ff 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -1,6 +1,7 @@ /* * SH2 recompiler * (C) notaz, 2009,2010,2013 + * (C) kub, 2018,2019 * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. @@ -430,13 +431,16 @@ typedef struct { } guest_reg_t; -// note: cache_regs[] must have at least the amount of -// HRF_REG registers used by handlers in worst case (currently 4) +// Note: cache_regs[] must have at least the amount of REG and TEMP registers +// used by handlers in worst case (currently 4). 
+// Register assignment goes by ABI convention. Caller save registers are TEMP, +// the others are either static or REG. SR must be static, R0 very recommended. +// TEMP registers first, REG last. alloc/evict algorithm depends on this. +// The 1st TEMP must not be RET_REG on x86 (it uses temps for some insns). +// XXX shouldn't this be somehow defined in the code emitters? #ifdef __arm__ #include "../drc/emit_arm.c" -// register assigment goes by ABI convention. All caller save registers are TEMP -// the others are either static or REG. SR must be static, R0 very recommended static guest_reg_t guest_regs[] = { // SHR_R0 .. SHR_SP #ifndef __MACH__ // no r9.. @@ -453,20 +457,21 @@ static guest_reg_t guest_regs[] = { { 0 } , { 0 } , { 0 } , { 0 } , }; -// NB first TEMP, then REG. alloc/evict algorithm depends on this +// OABI/EABI: params: r0-r3, return: r0-r1, temp: r12,r14, saved: r4-r8,r10,r11 +// SP,PC: r13,r15 must not be used. saved: r9 (for platform use, e.g. on OSx) static cache_reg_t cache_regs[] = { - { 12, HRF_TEMP }, + { 12, HRF_TEMP }, // temps { 14, HRF_TEMP }, - { 0, HRF_TEMP }, - { 1, HRF_TEMP }, + { 3, HRF_TEMP }, // params { 2, HRF_TEMP }, - { 3, HRF_TEMP }, - { 8, HRF_LOCKED }, + { 1, HRF_TEMP }, + { 0, HRF_TEMP }, // RET_REG + { 8, HRF_LOCKED }, // statics #ifndef __MACH__ // no r9.. { 9, HRF_LOCKED }, #endif { 10, HRF_LOCKED }, - { 4, HRF_REG }, + { 4, HRF_REG }, // other regs { 5, HRF_REG }, { 6, HRF_REG }, { 7, HRF_REG }, @@ -489,11 +494,11 @@ static guest_reg_t guest_regs[] = { // ax, cx, dx are usually temporaries by convention static cache_reg_t cache_regs[] = { - { xBX, HRF_REG|HRF_TEMP }, + { xBX, HRF_REG|HRF_TEMP }, // params { xCX, HRF_REG|HRF_TEMP }, { xDX, HRF_REG|HRF_TEMP }, - { xAX, HRF_REG|HRF_TEMP }, - { xSI, HRF_LOCKED }, + { xAX, HRF_REG|HRF_TEMP }, // return value + { xSI, HRF_LOCKED }, // statics { xDI, HRF_LOCKED }, }; @@ -502,11 +507,7 @@ static cache_reg_t cache_regs[] = { static guest_reg_t guest_regs[] = { // SHR_R0 .. 
SHR_SP -#ifndef _WIN32 - { 0 } , { 0 } , { 0 } , { 0 } , -#else - {GRF_STATIC, xDI}, { 0 } , { 0 } , { 0 } , -#endif + {GRF_STATIC,xR12}, { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , @@ -516,18 +517,25 @@ static guest_reg_t guest_regs[] = { { 0 } , { 0 } , { 0 } , { 0 } , }; -// ax, cx, dx are usually temporaries by convention +// M$/SystemV ABI conventions: +// rbx,rbp,r12-r15 are preserved, rcx,rdx,rax,r8,r9,r10,r11 are temporaries +// rsi,rdi are preserved in M$ ABI, temporary in SystemV ABI +// parameters in rcx,rdx,r8,r9, SystemV ABI additionally uses rsi,rdi static cache_reg_t cache_regs[] = { - { xCX, HRF_REG|HRF_TEMP }, - { xDX, HRF_REG|HRF_TEMP }, - { xAX, HRF_REG|HRF_TEMP }, + { xR10,HRF_TEMP }, // temps + { xR11,HRF_TEMP }, + { xAX, HRF_TEMP }, // RET_REG + { xR8, HRF_TEMP }, // params + { xR9, HRF_TEMP }, + { xCX, HRF_TEMP }, + { xDX, HRF_TEMP }, { xSI, HRF_REG|HRF_TEMP }, -#ifndef _WIN32 { xDI, HRF_REG|HRF_TEMP }, -#else - { xDI, HRF_LOCKED }, -#endif - { xBX, HRF_LOCKED }, + { xBX, HRF_LOCKED }, // statics + { xR12,HRF_LOCKED }, + { xR13,HRF_REG }, // other regs + { xR14,HRF_REG }, + { xR15,HRF_REG }, }; #else From 9cb4ef190715dc77542a072d665b7d25c57788fa Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 25 Jun 2019 20:15:48 +0200 Subject: [PATCH 046/174] 32X: memory access and polling bug fixes --- pico/32x/memory.c | 36 ++++++++++++++++++++++-------------- pico/32x/memory_arm.S | 14 ++++++-------- pico/32x/sh2soc.c | 6 +++++- 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 70287a2cb..7148d41c9 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -74,7 +74,7 @@ static int m68k_poll_detect(u32 a, u32 cycles, u32 flags) if (match && cycles - m68k_poll.cycles <= 64 && !SekNotPolling) { // detect split 32bit access by same cycle count, and ignore those - if (cycles != m68k_poll.cycles && ++m68k_poll.cnt > POLL_THRESHOLD) 
{ + if (cycles != m68k_poll.cycles && ++m68k_poll.cnt >= POLL_THRESHOLD) { if (!(Pico32x.emu_flags & flags)) { elprintf(EL_32X, "m68k poll addr %08x, cyc %u", a, cycles - m68k_poll.cycles); @@ -118,7 +118,7 @@ static void NOINLINE sh2_poll_detect(u32 a, SH2 *sh2, u32 flags, int maxcnt) // by checking address (max 2 bytes away) and cycles (max 2 cycles later). // no polling if more than 20 cycles have passed since last detect call. if (a - sh2->poll_addr <= 2 && CYCLES_GE(sh2->poll_cycles+20, cycles_done)) { - if (CYCLES_GT(cycles_done,sh2->poll_cycles+2) && ++sh2->poll_cnt > maxcnt) { + if (CYCLES_GT(cycles_done,sh2->poll_cycles+2) && ++sh2->poll_cnt >= maxcnt) { if (!(sh2->state & flags)) elprintf_sh2(sh2, EL_32X, "state: %02x->%02x", sh2->state, sh2->state | flags); @@ -131,6 +131,8 @@ static void NOINLINE sh2_poll_detect(u32 a, SH2 *sh2, u32 flags, int maxcnt) if ((a & 0xc6000000) == 0x06000000) { unsigned char *p = sh2->p_drcblk_ram; p[(a & 0x3ffff) >> SH2_DRCBLK_RAM_SHIFT] |= 0x80; + // mark next word too to enable poll fifo for 32bit access + p[((a+2) & 0x3ffff) >> SH2_DRCBLK_RAM_SHIFT] |= 0x80; } #endif } @@ -148,7 +150,7 @@ void NOINLINE p32x_sh2_poll_event(SH2 *sh2, u32 flags, u32 m68k_cycles) elprintf_sh2(sh2, EL_32X, "state: %02x->%02x", sh2->state, sh2->state & ~flags); - if (sh2->m68krcycles_done < m68k_cycles) + if (sh2->m68krcycles_done < m68k_cycles && !(sh2->state & SH2_STATE_RUN)) sh2->m68krcycles_done = m68k_cycles; pevt_log_sh2_o(sh2, EVT_POLL_END); @@ -174,12 +176,12 @@ static void sh2s_sync_on_read(SH2 *sh2) // This is used to correctly deliver syncronisation data to the 3 cpus. The // fifo stores 16 bit values, 8/32 bit accesses must be adapted accordingly. 
#define PFIFO_SZ 4 -#define PFIFO_CNT 4 +#define PFIFO_CNT 8 struct sh2_poll_fifo { u32 cycles; u32 a; u16 d; - u16 cpu; + int cpu; } sh2_poll_fifo[PFIFO_CNT][PFIFO_SZ]; unsigned sh2_poll_rd[PFIFO_CNT], sh2_poll_wr[PFIFO_CNT]; // ringbuffer pointers @@ -191,6 +193,7 @@ static NOINLINE u32 sh2_poll_read(u32 a, u32 d, unsigned int cycles, SH2* sh2) int cpu = sh2 ? sh2->is_slave+1 : 0; unsigned idx; + a &= ~0x20000000; // ignore writethrough bit // fetch oldest write to address from fifo, but stop when reaching the present idx = sh2_poll_rd[hix]; while (idx != sh2_poll_wr[hix] && CYCLES_GE(cycles, fifo[idx].cycles)) { @@ -225,6 +228,7 @@ static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2) struct sh2_poll_fifo *q = &fifo[(sh2_poll_wr[hix]-1) % PFIFO_SZ]; int cpu = sh2 ? sh2->is_slave+1 : 0; + a &= ~0x20000000; // ignore writethrough bit // fold 2 consecutive writes to the same address to avoid reading of // intermediate values that may cause synchronisation problems. 
// NB this can take an eternity on m68k: mov.b , needs @@ -279,8 +283,8 @@ u32 REGPARM(3) p32x_sh2_poll_memory32(unsigned int a, u32 d, SH2 *sh2) sh2s_sync_on_read(sh2); cycles = sh2_cycles_done_m68k(sh2); // check poll fifo and sign-extend the result correctly - d = sh2_poll_read(a, d, cycles, sh2) | - (sh2_poll_read(a+2, d >> 16, cycles, sh2) << 16); + d = (sh2_poll_read(a, d >> 16, cycles, sh2) << 16) | + ((u16)sh2_poll_read(a+2, d, cycles, sh2)); } sh2_poll_detect(a, sh2, SH2_STATE_RPOLL, 5); @@ -1503,7 +1507,7 @@ static u32 REGPARM(2) sh2_read32_rom(u32 a, SH2 *sh2) // writes #ifdef DRC_SH2 -static void NOINLINE sh2_sdram_poll(u32 a, u16 d, SH2 *sh2) +static void NOINLINE sh2_sdram_poll(u32 a, u32 d, SH2 *sh2) { unsigned cycles; @@ -1525,8 +1529,8 @@ void NOINLINE sh2_sdram_checks(u32 a, u32 d, SH2 *sh2, int t) void NOINLINE sh2_sdram_checks_l(u32 a, u32 d, SH2 *sh2, int t) { - sh2_sdram_checks(a, d, sh2, t); - sh2_sdram_checks(a+2, d>>16, sh2, t>>16); + sh2_sdram_checks(a, d>>16, sh2, t); + sh2_sdram_checks(a+2, d, sh2, t>>16); } #ifndef _ASM_32X_MEMORY_C @@ -1568,6 +1572,7 @@ static void REGPARM(3) sh2_write8_cs0(u32 a, u32 d, SH2 *sh2) } if ((a & 0x3fe00) == 0x4200) { + sh2->poll_cnt = 0; ((u8 *)Pico32xMem->pal)[(a & 0x1ff) ^ 1] = d; Pico32x.dirty_pal = 1; goto out; @@ -1641,6 +1646,7 @@ static void REGPARM(3) sh2_write16_cs0(u32 a, u32 d, SH2 *sh2) } if ((a & 0x3fe00) == 0x4200) { + sh2->poll_cnt = 0; Pico32xMem->pal[(a & 0x1ff) / 2] = d; Pico32x.dirty_pal = 1; goto out; @@ -2175,11 +2181,7 @@ void Pico32xSwapDRAM(int b) ssh2_read32_map[0x04/2].addr = ssh2_read32_map[0x24/2].addr = MAP_MEMORY(Pico32xMem->dram[b]); // convenience ptrs - msh2.p_sdram = ssh2.p_sdram = Pico32xMem->sdram; msh2.p_dram = ssh2.p_dram = Pico32xMem->dram[b]; - msh2.p_rom = ssh2.p_rom = Pico.rom; - msh2.p_bios = Pico32xMem->sh2_rom_m.w; msh2.p_da = msh2.data_array; - ssh2.p_bios = Pico32xMem->sh2_rom_s.w; ssh2.p_da = ssh2.data_array; } static void bank_switch_rom_sh2(void) @@ -2359,6 
+2361,12 @@ void PicoMemSetup32x(void) ssh2.write16_tab = (const void **)(void *)ssh2_write16_map; ssh2.write32_tab = (const void **)(void *)ssh2_write32_map; + // convenience ptrs + msh2.p_sdram = ssh2.p_sdram = Pico32xMem->sdram; + msh2.p_rom = ssh2.p_rom = Pico.rom; + msh2.p_bios = Pico32xMem->sh2_rom_m.w; msh2.p_da = msh2.data_array; + ssh2.p_bios = Pico32xMem->sh2_rom_s.w; ssh2.p_da = ssh2.data_array; + sh2_drc_mem_setup(&msh2); sh2_drc_mem_setup(&ssh2); diff --git a/pico/32x/memory_arm.S b/pico/32x/memory_arm.S index 48143ba9f..43a019580 100644 --- a/pico/32x/memory_arm.S +++ b/pico/32x/memory_arm.S @@ -18,7 +18,7 @@ .text -@ u32 a +@ u32 a, SH2 *sh2 .global sh2_read8_rom .global sh2_read8_sdram .global sh2_read8_da @@ -32,7 +32,7 @@ .global sh2_read32_da .global sh2_read32_dram -@ u32 a, u32 d +@ u32 a, u32 d, SH2 *sh2 .global sh2_write8_sdram .global sh2_write8_da .global sh2_write8_dram @@ -270,16 +270,14 @@ sh2_write32_dram: streq r1, [ip, r3, lsr #SH2_DRAM_SHIFT] bxeq lr ldr r0, [ip, r3, lsr #SH2_DRAM_SHIFT] - mov r2, #0 tst r1, #0x00ff0000 - orrne r2, r2, #0x00ff0000 + bicne r0, r0, #0x00ff0000 tst r1, #0xff000000 - orrne r2, r2, #0xff000000 + bicne r0, r0, #0xff000000 tst r1, #0x000000ff - orrne r2, r2, #0x000000ff + bicne r0, r0, #0x000000ff tst r1, #0x0000ff00 - orrne r2, r2, #0x0000ff00 - bic r0, r0, r2 + bicne r0, r0, #0x0000ff00 orr r0, r0, r1 str r0, [ip, r3, lsr #SH2_DRAM_SHIFT] bx lr diff --git a/pico/32x/sh2soc.c b/pico/32x/sh2soc.c index 66bdc478d..1f19150e4 100644 --- a/pico/32x/sh2soc.c +++ b/pico/32x/sh2soc.c @@ -137,6 +137,11 @@ static void dmac_memcpy(struct dma_chan *chan, SH2 *sh2) if (!up || chan->tcr < 4) return; + // XXX Mars Check Program fills a 64K buffer, then copies 32K longwords from + // DRAM to SDRAM in 4-longword mode, which is 128K. This overwrites a comm + // area in SDRAM, which is why the check fails. + // Is this a buswidth mismatch problem? 
As a kludge, use 16-bit width xfers + if (size == 3 && (chan->sar & 0xdf000000) == 0x04000000) size = 1; if (size == 3) size = 2; // 4-word xfer mode still counts in words // XXX check TCR being a multiple of 4 in 4-word xfer mode? // XXX check alignment of sar/dar, generating a bus error if unaligned? @@ -500,7 +505,6 @@ static void dreq1_do(SH2 *sh2, struct dma_chan *chan) if ((chan->dar & ~0xf) != 0x20004030) elprintf(EL_32XP|EL_ANOMALY, "dreq1: bad dar?: %08x\n", chan->dar); - sh2->state |= SH2_STATE_SLEEP; dmac_transfer_one(sh2, chan); if (chan->tcr == 0) dmac_transfer_complete(sh2, chan); From 141566aa23351f4c454d4df6ab67d092f96b0b47 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 25 Jun 2019 20:23:45 +0200 Subject: [PATCH 047/174] SH2 drc: bug fixing and small speed improvements --- config.gp2x47 | 5 +- cpu/drc/cmn.h | 36 +++ cpu/drc/emit_arm.c | 38 ++- cpu/drc/emit_x86.c | 62 ++-- cpu/sh2/compiler.c | 299 +++++++++--------- cpu/sh2/compiler.h | 2 +- platform/common/common.mak | 2 +- .../common/{host_dasm_arm.c => host_dasm.c} | 13 +- 8 files changed, 254 insertions(+), 203 deletions(-) rename platform/common/{host_dasm_arm.c => host_dasm.c} (88%) diff --git a/config.gp2x47 b/config.gp2x47 index 21769ada5..632515ee7 100644 --- a/config.gp2x47 +++ b/config.gp2x47 @@ -4,9 +4,10 @@ CC = arm-linux-gnueabi-gcc CXX = arm-linux-gnueabi-g++ AS = arm-linux-gnueabi-as STRIP = arm-linux-gnueabi-strip -CFLAGS += -mabi=apcs-gnu -mno-thumb-interwork -mfloat-abi=soft -mfpu=fpa -mcpu=arm920t -mtune=arm920t -Wno-unused-result -fno-stack-protector -D__GP2X__ +CFLAGS += -mabi=apcs-gnu -mfloat-abi=soft -mfpu=fpa -mcpu=arm920t -mtune=arm920t +CFLAGS += -Wno-unused-result -D__GP2X__ -mno-thumb-interwork -fno-stack-protector -fno-common CFLAGS += -I${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I${HOME}/src/gp2x/armroot/include -CFLAGS += -fno-gcse -funswitch-loops -fweb -ftree-loop-im #-fpredictive-commoning -ftree-loop-distribution -frename-registers +CFLAGS +=
-finline-limit=42 -fipa-pta -fno-ipa-sra -fno-ipa-pure-const ASFLAGS += -mabi=apcs-gnu -mfloat-abi=soft -mfpu=fpa -mcpu=arm920t LDFLAGS += -mabi=apcs-gnu -mfpu=fpa -B${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L${HOME}/src/gp2x/armroot/lib -static LDLIBS += -lpng -lm -ldl diff --git a/cpu/drc/cmn.h b/cpu/drc/cmn.h index bad02a1b3..2eb52aada 100644 --- a/cpu/drc/cmn.h +++ b/cpu/drc/cmn.h @@ -6,3 +6,39 @@ extern u8 *tcache; void drc_cmn_init(void); void drc_cmn_cleanup(void); +#define BITMASK1(v0) (1 << (v0)) +#define BITMASK2(v0,v1) ((1 << (v0)) | (1 << (v1))) +#define BITMASK3(v0,v1,v2) (BITMASK2(v0,v1) | (1 << (v2))) +#define BITMASK4(v0,v1,v2,v3) (BITMASK3(v0,v1,v2) | (1 << (v3))) +#define BITMASK5(v0,v1,v2,v3,v4) (BITMASK4(v0,v1,v2,v3) | (1 << (v4))) +#define BITMASK6(v0,v1,v2,v3,v4,v5) (BITMASK5(v0,v1,v2,v3,v4) | (1 << (v5))) +#define BITRANGE(v0,v1) (BITMASK1(v1+1)-BITMASK1(v0)) // set with v0..v1 + +// binary search approach, since we don't have CLZ on ARM920T +#define FOR_ALL_BITS_SET_DO(mask, bit, code) { \ + u32 __mask = mask; \ + for (bit = 31; bit >= 0 && mask; bit--, __mask <<= 1) { \ + if (!(__mask & (0xffff << 16))) \ + bit -= 16, __mask <<= 16; \ + if (!(__mask & (0xff << 24))) \ + bit -= 8, __mask <<= 8; \ + if (!(__mask & (0xf << 28))) \ + bit -= 4, __mask <<= 4; \ + if (!(__mask & (0x3 << 30))) \ + bit -= 2, __mask <<= 2; \ + if (!(__mask & (0x1 << 31))) \ + bit -= 1, __mask <<= 1; \ + if (__mask & (0x1 << 31)) { \ + code; \ + } \ + } \ +} + +// inspired by https://graphics.stanford.edu/~seander/bithacks.html +static inline int count_bits(unsigned val) +{ + val = val - ((val >> 1) & 0x55555555); + val = (val & 0x33333333) + ((val >> 2) & 0x33333333); + return (((val + (val >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; +} + diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 
c85a3d713..0eb2d9724 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -382,13 +382,6 @@ static void emith_flush(void) #define EOP_MOVT(rd,imm) \ EMIT(0xe3400000 | ((rd)<<12) | (((imm)>>16)&0xfff) | (((imm)>>12)&0xf0000), M1(rd), NO) -static inline int count_bits(unsigned val) -{ - val = val - ((val >> 1) & 0x55555555); - val = (val & 0x33333333) + ((val >> 2) & 0x33333333); - return (((val + (val >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; -} - // host literal pool; must be significantly smaller than 1024 (max LDR offset = 4096) #define MAX_HOST_LITERALS 128 static u32 literal_pool[MAX_HOST_LITERALS]; @@ -429,18 +422,26 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int // count insns needed for mov/orr #imm for (v = imm, ror2 = 0; (v >> 24) && ror2 < 32/2; ror2++) v = (v << 2) | (v >> 30); +#ifdef HAVE_ARMV7 for (i = 2; i > 0; i--, v >>= 8) while (v > 0xff && !(v & 3)) v >>= 2; if (v) { // 3+ insns needed... if (op == A_OP_MVN) imm = ~imm; -#ifdef HAVE_ARMV7 // ...prefer movw/movt EOP_MOVW(rd, imm); if (imm & 0xffff0000) EOP_MOVT(rd, imm); + return; + } #else + for (i = 3; i > 0; i--, v >>= 8) + while (v > 0xff && !(v & 3)) + v >>= 2; + if (v) { // 4 insns needed... 
+ if (op == A_OP_MVN) + imm = ~imm; // ...emit literal load int idx, o; if (literal_iindex >= MAX_HOST_LITERALS) { @@ -455,9 +456,9 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int EOP_C_DOP_IMM(cond, A_OP_ADD, 0, rd, rd, 0, o); else if (o < 0) EOP_C_DOP_IMM(cond, A_OP_SUB, 0, rd, rd, 0, -o); -#endif return; } +#endif break; case A_OP_AND: @@ -544,7 +545,7 @@ static int emith_xbranch(int cond, void *target, int is_call) EMIT((u32)target,M1(PC),0); #else // should never happen - elprintf(EL_STATUS|EL_SVP|EL_ANOMALY, "indirect jmp %08x->%08x", target, tcache_ptr); + elprintf(EL_STATUS|EL_SVP|EL_ANOMALY, "indirect jmp %8p->%8p", target, tcache_ptr); exit(1); #endif } @@ -633,9 +634,6 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define EMITH_NOTHING1(cond) \ (void)(cond) -#define EMITH_SJMP_DECL_() -#define EMITH_SJMP_START_(cond) EMITH_NOTHING1(cond) -#define EMITH_SJMP_END_(cond) EMITH_NOTHING1(cond) #define EMITH_SJMP_START(cond) EMITH_NOTHING1(cond) #define EMITH_SJMP_END(cond) EMITH_NOTHING1(cond) #define EMITH_SJMP2_START(cond) EMITH_NOTHING1(cond) @@ -806,6 +804,9 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_eor_r_imm(r, imm) \ emith_op_imm(A_COND_AL, 0, A_OP_EOR, r, imm) +#define emith_eor_r_imm_ptr(r, imm) \ + emith_eor_r_imm(r, imm) + // note: only use 8bit imm for these #define emith_tst_r_imm(r, imm) \ emith_top_imm(A_COND_AL, A_OP_TST, r, imm) @@ -837,6 +838,9 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_eor_r_imm_c(cond, r, imm) \ emith_op_imm(cond, 0, A_OP_EOR, r, imm) +#define emith_eor_r_imm_ptr_c(cond, r, imm) \ + emith_eor_r_imm_c(cond, r, imm) + #define emith_bic_r_imm_c(cond, r, imm) \ emith_op_imm(cond, 0, A_OP_BIC, r, imm) @@ -1139,6 +1143,8 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) emith_jump(target); \ } while (0) +#define emith_call_cleanup() /**/ + #define emith_ret_c(cond) \ 
emith_jump_reg_c(cond, LR) @@ -1228,10 +1234,10 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) /* if (reg <= turns) turns = reg-1 */ \ t3 = rcache_get_reg(reg, RC_GR_RMW, NULL); \ emith_cmp_r_r(t3, t2); \ - emith_sub_r_r_imm_c(DCOND_LE, t2, t3, 1); \ + emith_sub_r_r_imm_c(DCOND_LS, t2, t3, 1); \ /* if (reg <= 1) turns = 0 */ \ emith_cmp_r_imm(t3, 1); \ - emith_move_r_imm_c(DCOND_LE, t2, 0); \ + emith_move_r_imm_c(DCOND_LS, t2, 0); \ /* reg -= turns */ \ emith_sub_r_r(t3, t2); \ } \ @@ -1361,7 +1367,7 @@ static int tcond = -1; #define emith_set_t(sr, val) \ tcond = ((val) ? A_COND_AL: A_COND_NV) -static void emith_sync_t(sr) +static void emith_sync_t(int sr) { if (tcond == A_COND_AL) emith_or_r_imm(sr, T); diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 652b49898..0a31d8949 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -396,6 +396,12 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define emith_cmp_r_imm(r, imm) \ emith_arith_r_imm(7, r, imm) +#define emith_eor_r_imm_ptr(r, imm) do { \ + EMIT_REX_IF(1, 0, r); \ + EMIT_OP_MODRM64(0x81, 3, 6, r); \ + EMIT(imm, u32); \ +} while (0) + #define emith_tst_r_imm(r, imm) do { \ EMIT_REX_IF(0, 0, r); \ EMIT_OP_MODRM64(0xf7, 3, 0, r); \ @@ -417,6 +423,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common emith_or_r_imm(r, imm) #define emith_eor_r_imm_c(cond, r, imm) \ emith_eor_r_imm(r, imm) +#define emith_eor_r_imm_ptr_c(cond, r, imm) \ + emith_eor_r_imm_ptr(r, imm) #define emith_bic_r_imm_c(cond, r, imm) \ emith_bic_r_imm(r, imm) #define emith_tst_r_imm_c(cond, r, imm) \ @@ -589,9 +597,9 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common // XXX: stupid mess #define emith_mul_(op, dlo, dhi, s1, s2) do { \ int rmr; \ - if (dlo != xAX && dhi != xAX) \ + if (dlo != xAX && dhi != xAX && rcache_is_hreg_used(xAX)) \ emith_push(xAX); \ - if (dlo != xDX && dhi != xDX) \ + if (dlo != xDX && dhi != xDX && 
rcache_is_hreg_used(xDX)) \ emith_push(xDX); \ if ((s1) == xAX) \ rmr = s2; \ @@ -609,9 +617,9 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common } \ if (dhi != xDX && dhi != -1 && !(dhi == xAX && dlo == xDX)) \ emith_move_r_r(dhi, (dlo == xDX ? xAX : xDX)); \ - if (dlo != xDX && dhi != xDX) \ + if (dlo != xDX && dhi != xDX && rcache_is_hreg_used(xDX)) \ emith_pop(xDX); \ - if (dlo != xAX && dhi != xAX) \ + if (dlo != xAX && dhi != xAX && rcache_is_hreg_used(xAX)) \ emith_pop(xAX); \ } while (0) @@ -898,6 +906,9 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common emith_jump(target); \ } while (0) +#define emith_call_cleanup() \ + emith_add_r_r_ptr_imm(xSP, xSP, sizeof(void *)); // remove return addr + #define emith_ret() \ EMIT_OP(0xc3) @@ -912,10 +923,12 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define emith_push_ret(r) do { \ int r_ = (r >= 0 ? r : xSI); \ emith_push(r_); /* always push to align */ \ + emith_add_r_r_ptr_imm(xSP, xSP, -8*4); /* args shadow space */ \ } while (0) #define emith_pop_and_ret(r) do { \ int r_ = (r >= 0 ? 
r : xSI); \ + emith_add_r_r_ptr_imm(xSP, xSP, 8*4); /* args shadow space */ \ emith_pop(r_); \ emith_ret(); \ } while (0) @@ -942,15 +955,6 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common // "simple" jump (no more then a few insns) // ARM will use conditional instructions here -#define EMITH_SJMP_DECL_() \ - u8 *cond_ptr - -#define EMITH_SJMP_START_(cond) \ - JMP8_POS(cond_ptr) - -#define EMITH_SJMP_END_(cond) \ - JMP8_EMIT(cond, cond_ptr) - #define EMITH_SJMP_START EMITH_JMP_START #define EMITH_SJMP_END EMITH_JMP_END @@ -1046,7 +1050,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common emith_push(xR15); \ emith_push(xSI); \ emith_push(xDI); \ - emith_add_r_r_ptr_imm(xSP, xSP, -8*5); /* align + ABI param area */ \ + emith_add_r_r_ptr_imm(xSP, xSP, -8*5); /* align + args shadow space */ \ } while (0) #define emith_sh2_drc_exit() do { \ @@ -1106,19 +1110,17 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #endif #define emith_save_caller_regs(mask) do { \ - if ((mask) & (1 << xAX)) emith_push(xAX); \ - if ((mask) & (1 << xCX)) emith_push(xCX); \ - if ((mask) & (1 << xDX)) emith_push(xDX); \ - if ((mask) & (1 << xSI)) emith_push(xSI); \ - if ((mask) & (1 << xDI)) emith_push(xDI); \ + int _c; u32 _m = mask & 0xfc7; /* AX, CX, DX, SI, DI, 8, 9, 10, 11 */ \ + if (__builtin_parity(_m) == 1) _m |= 0x8; /* BX for ABI align */ \ + for (_c = HOST_REGS; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + if (_m & (1 << _c)) emith_push(_c); \ } while (0) #define emith_restore_caller_regs(mask) do { \ - if ((mask) & (1 << xDI)) emith_pop(xDI); \ - if ((mask) & (1 << xSI)) emith_pop(xSI); \ - if ((mask) & (1 << xDX)) emith_pop(xDX); \ - if ((mask) & (1 << xCX)) emith_pop(xCX); \ - if ((mask) & (1 << xAX)) emith_pop(xAX); \ + int _c; u32 _m = mask & 0xfc7; \ + if (__builtin_parity(_m) == 1) _m |= 0x8; /* BX for ABI align */ \ + for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ + if (_m & (1 << _c)) 
emith_pop(_c); \ } while (0) #define emith_sh2_rcall(a, tab, func, mask) do { \ @@ -1192,14 +1194,14 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common /* if (reg <= turns) turns = reg-1 */ \ t3 = rcache_get_reg(reg, RC_GR_RMW, NULL); \ emith_cmp_r_r(t3, t2); \ - EMITH_SJMP_START(DCOND_GT); \ - emith_sub_r_r_imm_c(DCOND_LE, t2, t3, 1); \ - EMITH_SJMP_END(DCOND_GT); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_sub_r_r_imm_c(DCOND_LS, t2, t3, 1); \ + EMITH_SJMP_END(DCOND_HI); \ /* if (reg <= 1) turns = 0 */ \ emith_cmp_r_imm(t3, 1); \ - EMITH_SJMP_START(DCOND_GT); \ - emith_move_r_imm_c(DCOND_LE, t2, 0); \ - EMITH_SJMP_END(DCOND_GT); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_move_r_imm_c(DCOND_LS, t2, 0); \ + EMITH_SJMP_END(DCOND_HI); \ /* reg -= turns */ \ emith_sub_r_r(t3, t2); \ } \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 9932ce6ff..c1ba3f322 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -106,14 +106,6 @@ static int insns_compiled, hash_collisions, host_insn_count; #define GET_Rn() \ ((op >> 8) & 0x0f) -#define BITMASK1(v0) (1 << (v0)) -#define BITMASK2(v0,v1) ((1 << (v0)) | (1 << (v1))) -#define BITMASK3(v0,v1,v2) (BITMASK2(v0,v1) | (1 << (v2))) -#define BITMASK4(v0,v1,v2,v3) (BITMASK3(v0,v1,v2) | (1 << (v3))) -#define BITMASK5(v0,v1,v2,v3,v4) (BITMASK4(v0,v1,v2,v3) | (1 << (v4))) -#define BITMASK6(v0,v1,v2,v3,v4,v5) (BITMASK5(v0,v1,v2,v3,v4) | (1 << (v5))) -#define BITRANGE(v0,v1) (BITMASK1(v1+1)-BITMASK1(v0)) // set with v0..v1 - #define SHR_T SHR_SR // might make them separate someday #define SHR_MEM 31 #define SHR_TMP -1 @@ -174,6 +166,7 @@ enum op_types { static u8 *tcache_dsm_ptrs[3]; static char sh2dasm_buff[64]; #define do_host_disasm(tcid) \ + emith_flush(); \ host_dasm(tcache_dsm_ptrs[tcid], emith_insn_ptr() - tcache_dsm_ptrs[tcid]); \ tcache_dsm_ptrs[tcid] = emith_insn_ptr() #else @@ -212,7 +205,6 @@ static void REGPARM(3) *sh2_drc_log_entry(void *block, SH2 *sh2, u32 sr) static FILE *trace[2]; int 
idx = sh2->is_slave; if (!trace[0]) { - truncate("pico.trace", 0); trace[0] = fopen("pico.trace0", "wb"); trace[1] = fopen("pico.trace1", "wb"); } @@ -286,7 +278,7 @@ static u8 *tcache_limit[TCACHE_BUFFERS]; // ptr for code emiters static u8 *tcache_ptr; -#define MAX_BLOCK_ENTRIES (BLOCK_INSN_LIMIT / 8) +#define MAX_BLOCK_ENTRIES (BLOCK_INSN_LIMIT / 6) struct block_link { u32 target_pc; @@ -330,32 +322,20 @@ struct block_desc { struct block_entry entryp[MAX_BLOCK_ENTRIES]; }; -static const int block_max_counts[TCACHE_BUFFERS] = { - 4*1024, - 256, - 256, -}; +#define BLOCK_MAX_COUNT(tcid) ((tcid) ? 256 : 16*256) static struct block_desc *block_tables[TCACHE_BUFFERS]; static int block_counts[TCACHE_BUFFERS]; static int block_limit[TCACHE_BUFFERS]; // we have block_link_pool to avoid using mallocs -static const int block_link_pool_max_counts[TCACHE_BUFFERS] = { - 16*1024, - 4*256, - 4*256, -}; +#define BLOCK_LINK_MAX_COUNT(tcid) ((tcid) ? 1024 : 16*1024) static struct block_link *block_link_pool[TCACHE_BUFFERS]; static int block_link_pool_counts[TCACHE_BUFFERS]; static struct block_link **unresolved_links[TCACHE_BUFFERS]; static struct block_link *blink_free[TCACHE_BUFFERS]; // used for invalidation -static const int ram_sizes[TCACHE_BUFFERS] = { - 0x40000, - 0x1000, - 0x1000, -}; +#define RAM_SIZE(tcid) ((tcid) ? 0x1000 : 0x40000) #define INVAL_PAGE_SIZE 0x100 struct block_list { @@ -373,15 +353,11 @@ static struct block_list *inactive_blocks[TCACHE_BUFFERS]; // each array has len: sizeof(mem) / INVAL_PAGE_SIZE static struct block_list **inval_lookup[TCACHE_BUFFERS]; -static const int hash_table_sizes[TCACHE_BUFFERS] = { - 0x4000, - 0x100, - 0x100, -}; +#define HASH_TABLE_SIZE(tcid) ((tcid) ? 
256 : 64*256) static struct block_entry **hash_tables[TCACHE_BUFFERS]; #define HASH_FUNC(hash_tab, addr, mask) \ - (hash_tab)[(((addr) >> 20) ^ ((addr) >> 2)) & (mask)] + (hash_tab)[((addr) >> 1) & (mask)] #if (DRC_DEBUG & 128) #if BRANCH_CACHE @@ -431,6 +407,10 @@ typedef struct { } guest_reg_t; +// possibly needed in code emitter +static int rcache_get_tmp(void); +static void rcache_free_tmp(int hr); + // Note: cache_regs[] must have at least the amount of REG and TEMP registers // used by handlers in worst case (currently 4). // Register assignment goes by ABI convention. Caller save registers are TEMP, @@ -583,13 +563,12 @@ static int dr_ctx_get_mem_ptr(SH2 *sh2, u32 a, u32 *mask) // check if region is mapped memory memptr = p32x_sh2_get_mem_ptr(a, mask, sh2); - if (memptr == NULL /*|| (a & ((1 << SH2_READ_SHIFT)-1) & ~*mask) != 0*/) + if (memptr == NULL) return poffs; if (memptr == sh2->p_bios) // BIOS poffs = offsetof(SH2, p_bios); else if (memptr == sh2->p_da) // data array - // FIXME: access sh2->data_array instead poffs = offsetof(SH2, p_da); else if (memptr == sh2->p_sdram) // SDRAM poffs = offsetof(SH2, p_sdram); @@ -602,16 +581,16 @@ static int dr_ctx_get_mem_ptr(SH2 *sh2, u32 a, u32 *mask) static struct block_entry *dr_get_entry(u32 pc, int is_slave, int *tcache_id) { struct block_entry *be; - u32 tcid = 0, mask; - - // data arrays have their own caches - if ((pc & 0xe0000000) == 0xc0000000 || (pc & ~0xfff) == 0) - tcid = 1 + is_slave; - + u32 tcid = 0; + + if ((pc & 0xe0000000) == 0xc0000000) + tcid = 1 + is_slave; // data array + if ((pc & ~0xfff) == 0) + tcid = 1 + is_slave; // BIOS *tcache_id = tcid; - mask = hash_table_sizes[tcid] - 1; - be = HASH_FUNC(hash_tables[tcid], pc, mask); + be = HASH_FUNC(hash_tables[tcid], pc, HASH_TABLE_SIZE(tcid) - 1); + if (be != NULL) // don't ask... 
gcc code generation hint for (; be != NULL; be = be->next) if (be->pc == pc) return be; @@ -688,17 +667,17 @@ static void REGPARM(1) flush_tcache(int tcid) int tc_used, bl_used; tc_used = tcache_sizes[tcid] - (tcache_limit[tcid] - tcache_ptrs[tcid]); - bl_used = block_max_counts[tcid] - (block_limit[tcid] - block_counts[tcid]); + bl_used = BLOCK_MAX_COUNT(tcid) - (block_limit[tcid] - block_counts[tcid]); elprintf(EL_STATUS, "tcache #%d flush! (%d/%d, bds %d/%d)", tcid, tc_used, - tcache_sizes[tcid], bl_used, block_max_counts[tcid]); + tcache_sizes[tcid], bl_used, BLOCK_MAX_COUNT(tcid)); #endif block_counts[tcid] = 0; - block_limit[tcid] = block_max_counts[tcid] - 1; + block_limit[tcid] = BLOCK_MAX_COUNT(tcid) - 1; block_link_pool_counts[tcid] = 0; blink_free[tcid] = NULL; - memset(unresolved_links[tcid], 0, sizeof(*unresolved_links[0]) * hash_table_sizes[tcid]); - memset(hash_tables[tcid], 0, sizeof(*hash_tables[0]) * hash_table_sizes[tcid]); + memset(unresolved_links[tcid], 0, sizeof(*unresolved_links[0]) * HASH_TABLE_SIZE(tcid)); + memset(hash_tables[tcid], 0, sizeof(*hash_tables[0]) * HASH_TABLE_SIZE(tcid)); tcache_ptrs[tcid] = tcache_bases[tcid]; tcache_limit[tcid] = tcache_bases[tcid] + tcache_sizes[tcid]; if (Pico32xMem->sdram != NULL) { @@ -724,14 +703,14 @@ static void REGPARM(1) flush_tcache(int tcid) tcache_dsm_ptrs[tcid] = tcache_bases[tcid]; #endif - for (i = 0; i < ram_sizes[tcid] / INVAL_PAGE_SIZE; i++) + for (i = 0; i < RAM_SIZE(tcid) / INVAL_PAGE_SIZE; i++) rm_block_list(&inval_lookup[tcid][i]); rm_block_list(&inactive_blocks[tcid]); } static void add_to_hashlist(struct block_entry *be, int tcache_id) { - u32 tcmask = hash_table_sizes[tcache_id] - 1; + u32 tcmask = HASH_TABLE_SIZE(tcache_id) - 1; struct block_entry **head = &HASH_FUNC(hash_tables[tcache_id], be->pc, tcmask); be->prev = NULL; @@ -751,7 +730,7 @@ static void add_to_hashlist(struct block_entry *be, int tcache_id) static void rm_from_hashlist(struct block_entry *be, int tcache_id) { - 
u32 tcmask = hash_table_sizes[tcache_id] - 1; + u32 tcmask = HASH_TABLE_SIZE(tcache_id) - 1; struct block_entry **head = &HASH_FUNC(hash_tables[tcache_id], be->pc, tcmask); #if DRC_DEBUG & 1 @@ -773,7 +752,7 @@ static void rm_from_hashlist(struct block_entry *be, int tcache_id) static void add_to_hashlist_unresolved(struct block_link *bl, int tcache_id) { - u32 tcmask = hash_table_sizes[tcache_id] - 1; + u32 tcmask = HASH_TABLE_SIZE(tcache_id) - 1; struct block_link **head = &HASH_FUNC(unresolved_links[tcache_id], bl->target_pc, tcmask); #if DRC_DEBUG & 1 @@ -794,7 +773,7 @@ static void add_to_hashlist_unresolved(struct block_link *bl, int tcache_id) static void rm_from_hashlist_unresolved(struct block_link *bl, int tcache_id) { - u32 tcmask = hash_table_sizes[tcache_id] - 1; + u32 tcmask = HASH_TABLE_SIZE(tcache_id) - 1; struct block_link **head = &HASH_FUNC(unresolved_links[tcache_id], bl->target_pc, tcmask); #if DRC_DEBUG & 1 @@ -818,7 +797,7 @@ static void dr_free_oldest_block(int tcache_id) { struct block_desc *bd; - if (block_limit[tcache_id] >= block_max_counts[tcache_id]) { + if (block_limit[tcache_id] >= BLOCK_MAX_COUNT(tcache_id)) { // block desc wrap around block_limit[tcache_id] = 0; } @@ -833,7 +812,7 @@ static void dr_free_oldest_block(int tcache_id) sh2_smc_rm_block_entry(bd, tcache_id, 0, 1); block_limit[tcache_id]++; - if (block_limit[tcache_id] >= block_max_counts[tcache_id]) + if (block_limit[tcache_id] >= BLOCK_MAX_COUNT(tcache_id)) block_limit[tcache_id] = 0; bd = &block_tables[tcache_id][block_limit[tcache_id]]; if (bd->tcache_ptr >= tcache_ptrs[tcache_id]) @@ -898,7 +877,7 @@ static void dr_mark_memory(int mark, struct block_desc *block, int tcache_id, u3 lit_ram_blk = Pico32xMem->drclit_ram; shift = SH2_DRCBLK_RAM_SHIFT; } - mask = ram_sizes[tcache_id] - 1; + mask = RAM_SIZE(tcache_id) - 1; // mark recompiled insns addr = block->addr & ~((1 << shift) - 1); @@ -957,7 +936,7 @@ static u32 dr_check_nolit(u32 start, u32 end, int tcache_id) 
lit_ram_blk = Pico32xMem->drclit_ram; shift = SH2_DRCBLK_RAM_SHIFT; } - mask = ram_sizes[tcache_id] - 1; + mask = RAM_SIZE(tcache_id) - 1; addr = start & ~((1 << shift) - 1); for (idx = (addr & mask) >> shift; addr < end; addr += (1 << shift)) @@ -1028,18 +1007,18 @@ static struct block_desc *dr_add_block(u32 addr, int size, *blk_id = *bcount; (*bcount)++; - if (*bcount >= block_max_counts[tcache_id]) + if (*bcount >= BLOCK_MAX_COUNT(tcache_id)) *bcount = 0; return bd; } -static void REGPARM(3) *dr_lookup_block(u32 pc, int is_slave, int *tcache_id) +static void REGPARM(3) *dr_lookup_block(u32 pc, SH2 *sh2, int *tcache_id) { struct block_entry *be = NULL; void *block = NULL; - be = dr_get_entry(pc, is_slave, tcache_id); + be = dr_get_entry(pc, sh2->is_slave, tcache_id); if (be != NULL) block = be->tcache_ptr; @@ -1114,7 +1093,7 @@ static void *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_sla if (blink_free[tcache_id] != NULL) { bl = blink_free[tcache_id]; blink_free[tcache_id] = bl->next; - } else if (cnt >= block_link_pool_max_counts[tcache_id]) { + } else if (cnt >= BLOCK_LINK_MAX_COUNT(tcache_id)) { dbg(1, "bl overflow for tcache %d", tcache_id); return sh2_drc_dispatcher; } else { @@ -1145,7 +1124,7 @@ static void *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_sla static void dr_link_blocks(struct block_entry *be, int tcache_id) { #if LINK_BRANCHES - u32 tcmask = hash_table_sizes[tcache_id] - 1; + u32 tcmask = HASH_TABLE_SIZE(tcache_id) - 1; u32 pc = be->pc; struct block_link **head = &HASH_FUNC(unresolved_links[tcache_id], pc, tcmask); struct block_link *bl = *head, *next; @@ -1188,7 +1167,7 @@ static void dr_link_outgoing(struct block_entry *be, int tcache_id, int is_slave array[count++] = item; \ } -static int find_in_array(u32 *array, size_t size, u32 what) +static inline int find_in_array(u32 *array, size_t size, u32 what) { size_t i; for (i = 0; i < size; i++) @@ -1198,6 +1177,23 @@ static int find_in_array(u32 *array, 
size_t size, u32 what) return -1; } +static int find_in_sorted_array(u32 *array, size_t size, u32 what) +{ + // binary search in sorted array + int left = 0, right = size-1; + while (left <= right) + { + int middle = (left + right) / 2; + if (array[middle] == what) + return middle; + else if (array[middle] < what) + left = middle + 1; + else + right = middle - 1; + } + return -1; +} + // --------------------------------------------------------------- // NB rcache allocation dependencies: @@ -1242,26 +1238,6 @@ static void rcache_remove_vreg_alias(int x, sh2_reg_e r); } \ } -// binary search approach, since we don't have CLZ on ARM920T -#define FOR_ALL_BITS_SET_DO(mask, bit, code) { \ - u32 __mask = mask; \ - for (bit = 31; bit >= 0 && mask; bit--, __mask <<= 1) { \ - if (!(__mask & (0xffff << 16))) \ - bit -= 16, __mask <<= 16; \ - if (!(__mask & (0xff << 24))) \ - bit -= 8, __mask <<= 8; \ - if (!(__mask & (0xf << 28))) \ - bit -= 4, __mask <<= 4; \ - if (!(__mask & (0x3 << 30))) \ - bit -= 2, __mask <<= 2; \ - if (!(__mask & (0x1 << 31))) \ - bit -= 1, __mask <<= 1; \ - if (__mask & (0x1 << 31)) { \ - code; \ - } \ - } \ -} - #if PROPAGATE_CONSTANTS static inline int gconst_alloc(sh2_reg_e r) { @@ -1319,6 +1295,7 @@ static int gconst_get(sh2_reg_e r, u32 *val) *val = gconsts[guest_regs[r].cnst].val; return 1; } + *val = 0; return 0; } @@ -2043,13 +2020,22 @@ static inline int rcache_is_cached(sh2_reg_e r) return (guest_regs[r].vreg >= 0); } +static inline int rcache_is_hreg_used(int hr) +{ + int x = reg_map_host[hr]; + // is hr in use? 
+ return cache_regs[x].type != HR_FREE && + (cache_regs[x].type != HR_TEMP || (cache_regs[x].flags & HRF_LOCKED)); +} + static inline u32 rcache_used_hreg_mask(void) { u32 mask = 0; int i; for (i = 0; i < ARRAY_SIZE(cache_regs); i++) - if (cache_regs[i].type != HR_FREE) + if ((cache_regs[i].flags & HRF_TEMP) && cache_regs[i].type != HR_FREE && + (cache_regs[i].type != HR_TEMP || (cache_regs[i].flags & HRF_LOCKED))) mask |= 1 << cache_regs[i].hreg; return mask & ~rcache_static; @@ -2137,6 +2123,8 @@ static void rcache_invalidate(void) { int i; + gconst_invalidate(); + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { cache_regs[i].flags &= (HRF_TEMP|HRF_REG); if (cache_regs[i].type != HR_STATIC) @@ -2161,8 +2149,6 @@ static void rcache_invalidate(void) rcache_counter = 0; rcache_hint_soon = rcache_hint_late = rcache_hint_write = rcache_hint_clean = 0; - - gconst_invalidate(); } static void rcache_flush(void) @@ -2221,14 +2207,20 @@ static int emit_get_rbase_and_offs(SH2 *sh2, sh2_reg_e r, int rmod, u32 *offs) if (poffs == -1) return -1; - if (mask < 0x1000) { - // data array or BIOS, can't safely access directly since translated code - // may run on both SH2s + if (mask < 0x20000) { + // data array, BIOS, DRAM, can't safely access directly since host addr may + // change (BIOS,da code may run on either core, DRAM may be switched) hr = rcache_get_tmp(); - emith_ctx_read_ptr(hr, poffs); - a += *offs; - if (a & mask & ~omask) - emith_add_r_r_ptr_imm(hr, hr, a & mask & ~omask); + a = (a + *offs) & mask; + if (poffs == offsetof(SH2, p_da)) { + // access sh2->data_array directly + a += offsetof(SH2, data_array); + emith_add_r_r_ptr_imm(hr, CONTEXT_REG, a & ~omask); + } else { + emith_ctx_read_ptr(hr, poffs); + if (a & ~omask) + emith_add_r_r_ptr_imm(hr, hr, a & ~omask); + } *offs = a & omask; return hr; } @@ -2269,7 +2261,7 @@ static int emit_get_rom_data(SH2 *sh2, sh2_reg_e r, u32 offs, int size, u32 *val if (gconst_get(r, &a)) { a += offs; // check if rom is memory mapped 
(not bank switched), and address is in rom - if (dr_is_rom(a) && p32x_sh2_get_mem_ptr(a, &mask, sh2) != (void *)-1) { + if (dr_is_rom(a) && p32x_sh2_get_mem_ptr(a, &mask, sh2) == sh2->p_rom) { switch (size & MF_SIZEMASK) { case 0: *val = (s8)p32x_sh2_read8(a, sh2s); break; // 8 case 1: *val = (s16)p32x_sh2_read16(a, sh2s); break; // 16 @@ -2507,9 +2499,10 @@ static int emit_indirect_indexed_read(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rx, sh2_ #if PROPAGATE_CONSTANTS u32 offs; - if (gconst_get(ry, &offs)) + // if offs is larger than 0x01000000, it's most probably the base address part + if (gconst_get(ry, &offs) && offs < 0x01000000) return emit_memhandler_read_rr(sh2, rd, rx, offs, size); - if (gconst_get(rx, &offs)) + if (gconst_get(rx, &offs) && offs < 0x01000000) return emit_memhandler_read_rr(sh2, rd, ry, offs, size); #endif hr = rcache_get_reg_arg(0, rx, &tx); @@ -2541,9 +2534,10 @@ static void emit_indirect_indexed_write(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rx, sh #if PROPAGATE_CONSTANTS u32 offs; - if (gconst_get(ry, &offs)) + // if offs is larger than 0x01000000, it's most probably the base address part + if (gconst_get(ry, &offs) && offs < 0x01000000) return emit_memhandler_write_rr(sh2, rd, rx, offs, size); - if (gconst_get(rx, &offs)) + if (gconst_get(rx, &offs) && offs < 0x01000000) return emit_memhandler_write_rr(sh2, rd, ry, offs, size); #endif if (rd != SHR_TMP) @@ -2601,15 +2595,6 @@ static void emit_do_static_regs(int is_write, int tmpr) } } -/* just after lookup function, jump to address returned */ -static void emit_block_entry(void) -{ - emith_tst_r_r_ptr(RET_REG, RET_REG); - EMITH_SJMP_START(DCOND_EQ); - emith_jump_reg_c(DCOND_NE, RET_REG); - EMITH_SJMP_END(DCOND_EQ); -} - #define DELAY_SAVE_T(sr) { \ emith_bic_r_imm(sr, T_save); \ emith_tst_r_imm(sr, T); \ @@ -2861,7 +2846,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (!tcache_id) // can safely link from cpu-local to global memory dr_link_blocks(entry, sh2->is_slave?2:1); - v 
= find_in_array(branch_target_pc, branch_target_count, pc); + v = find_in_sorted_array(branch_target_pc, branch_target_count, pc); if (v >= 0) branch_target_ptr[v] = tcache_ptr; #if LOOP_DETECTION @@ -2870,14 +2855,15 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) drcf.polling = (drcf.loop_type == OF_POLL_LOOP ? MF_POLLING : 0); #endif -#if DRC_DEBUG +#if (DRC_DEBUG & ~7) // must update PC emit_move_r_imm32(SHR_PC, pc); #endif rcache_clean(); #if (DRC_DEBUG & 0x10) - rcache_get_reg_arg(0, SHR_PC, NULL); + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, pc); tmp = emit_memhandler_read(1); tmp2 = rcache_get_tmp(); tmp3 = rcache_get_tmp(); @@ -2896,7 +2882,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp = rcache_get_tmp_arg(0); sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); emith_cmp_r_imm(sr, 0); - emith_move_r_imm(tmp, pc); + emith_move_r_imm_c(DCOND_LE, tmp, pc); emith_jump_cond(DCOND_LE, sh2_drc_exit); rcache_free_tmp(tmp); @@ -3104,7 +3090,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_clear_msb(tmp, tmp2, 22); emit_memhandler_write_rr(sh2, SHR_TMP, SHR_SP, 0, 2 | MF_PREDECR); // push PC - if (op == OP_TRAPA) { + if (opd->op == OP_TRAPA) { tmp = rcache_get_tmp_arg(1); emith_move_r_imm(tmp, pc); } else if (drcf.pending_branch_indirect) { @@ -3113,7 +3099,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp = rcache_get_tmp_arg(1); emith_move_r_imm(tmp, pc - 2); } - emith_move_r_imm(tmp, pc); emit_memhandler_write_rr(sh2, SHR_TMP, SHR_SP, 0, 2 | MF_PREDECR); // obtain new PC emit_memhandler_read_rr(sh2, SHR_PC, SHR_VBR, opd->imm * 4, 2); @@ -3613,12 +3598,12 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) default: goto default_; } - tmp3 = rcache_get_reg_arg(1, tmp, &tmp4); if (tmp == SHR_SR) { + tmp3 = rcache_get_reg_arg(1, tmp, &tmp4); emith_sync_t(tmp4); emith_clear_msb(tmp3, tmp4, 22); // reserved bits defined by ISA as 0 - } else if (tmp3 != tmp4) - 
emith_move_r_r(tmp3, tmp4); + } else + tmp3 = rcache_get_reg_arg(1, tmp, NULL); emit_memhandler_write_rr(sh2, SHR_TMP, GET_Rn(), 0, 2 | MF_PREDECR); goto end_op; case 0x04: @@ -4050,7 +4035,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // no modification of host status/flags between here and branching! #if LINK_BRANCHES - v = find_in_array(branch_target_pc, branch_target_count, target_pc); + v = find_in_sorted_array(branch_target_pc, branch_target_count, target_pc); if (v >= 0) { // local branch @@ -4151,7 +4136,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { void *target; - s32 tmp = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + tmp = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(tmp); emith_sync_t(tmp); @@ -4172,7 +4157,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) for (i = 0; i < branch_patch_count; i++) { void *target; int t; - t = find_in_array(branch_target_pc, branch_target_count, branch_patch_pc[i]); + t = find_in_sorted_array(branch_target_pc, branch_target_count, branch_patch_pc[i]); target = branch_target_ptr[t]; if (target == NULL) { // flush pc and go back to dispatcher (this should no longer happen) @@ -4256,8 +4241,8 @@ static void sh2_generate_utils(void) emith_sh2_rcall(arg0, arg1, arg2, arg3); EMITH_SJMP_START(DCOND_CS); emith_and_r_r_c(DCOND_CC, arg0, arg3); - emith_eor_r_imm_c(DCOND_CC, arg0, 1); - emith_read8s_r_r_r_c(DCOND_CC, RET_REG, arg0, arg2); + emith_eor_r_imm_ptr_c(DCOND_CC, arg0, 1); + emith_read8s_r_r_r_c(DCOND_CC, RET_REG, arg2, arg0); emith_ret_c(DCOND_CC); EMITH_SJMP_END(DCOND_CS); emith_move_r_r_ptr(arg1, CONTEXT_REG); @@ -4270,7 +4255,7 @@ static void sh2_generate_utils(void) emith_sh2_rcall(arg0, arg1, arg2, arg3); EMITH_SJMP_START(DCOND_CS); emith_and_r_r_c(DCOND_CC, arg0, arg3); - emith_read16s_r_r_r_c(DCOND_CC, RET_REG, arg0, arg2); + emith_read16s_r_r_r_c(DCOND_CC, RET_REG, arg2, arg0); emith_ret_c(DCOND_CC); EMITH_SJMP_END(DCOND_CS); 
emith_move_r_r_ptr(arg1, CONTEXT_REG); @@ -4283,7 +4268,7 @@ static void sh2_generate_utils(void) emith_sh2_rcall(arg0, arg1, arg2, arg3); EMITH_SJMP_START(DCOND_CS); emith_and_r_r_c(DCOND_CC, arg0, arg3); - emith_read_r_r_r_c(DCOND_CC, RET_REG, arg0, arg2); + emith_read_r_r_r_c(DCOND_CC, RET_REG, arg2, arg0); emith_ror_c(DCOND_CC, RET_REG, RET_REG, 16); emith_ret_c(DCOND_CC); EMITH_SJMP_END(DCOND_CS); @@ -4300,8 +4285,8 @@ static void sh2_generate_utils(void) emith_jump_reg_c(DCOND_CS, arg2); EMITH_SJMP_END(DCOND_CC); emith_and_r_r_r(arg1, arg0, arg3); - emith_eor_r_imm(arg1, 1); - emith_read8s_r_r_r(arg1, arg1, arg2); + emith_eor_r_imm_ptr(arg1, 1); + emith_read8s_r_r_r(arg1, arg2, arg1); emith_push_ret(arg1); emith_move_r_r_ptr(arg2, CONTEXT_REG); emith_call(p32x_sh2_poll_memory8); @@ -4317,7 +4302,7 @@ static void sh2_generate_utils(void) emith_jump_reg_c(DCOND_CS, arg2); EMITH_SJMP_END(DCOND_CC); emith_and_r_r_r(arg1, arg0, arg3); - emith_read16s_r_r_r(arg1, arg1, arg2); + emith_read16s_r_r_r(arg1, arg2, arg1); emith_push_ret(arg1); emith_move_r_r_ptr(arg2, CONTEXT_REG); emith_call(p32x_sh2_poll_memory16); @@ -4333,7 +4318,7 @@ static void sh2_generate_utils(void) emith_jump_reg_c(DCOND_CS, arg2); EMITH_SJMP_END(DCOND_CC); emith_and_r_r_r(arg1, arg0, arg3); - emith_read_r_r_r(arg1, arg1, arg2); + emith_read_r_r_r(arg1, arg2, arg1); emith_ror(arg1, arg1, 16); emith_push_ret(arg1); emith_move_r_r_ptr(arg2, CONTEXT_REG); @@ -4382,13 +4367,13 @@ static void sh2_generate_utils(void) emith_jump_reg_c(DCOND_EQ, RET_REG); EMITH_SJMP_END(DCOND_NE); #endif - emith_ctx_read(arg1, offsetof(SH2, is_slave)); + emith_move_r_r_ptr(arg1, CONTEXT_REG); emith_add_r_r_ptr_imm(arg2, CONTEXT_REG, offsetof(SH2, drc_tmp)); emith_call(dr_lookup_block); -#if BRANCH_CACHE // store PC and block entry ptr (in arg0) in branch target cache emith_tst_r_r_ptr(RET_REG, RET_REG); EMITH_SJMP_START(DCOND_EQ); +#if BRANCH_CACHE #if (DRC_DEBUG & 128) emith_move_r_ptr_imm(arg2, (uptr)&bcmiss); 
emith_read_r_r_offs_c(DCOND_NE, arg3, arg2, 0); @@ -4400,14 +4385,18 @@ static void sh2_generate_utils(void) emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg1, sizeof(void *) == 8 ? 1 : 0); emith_write_r_r_offs_c(DCOND_NE, arg2, arg1, offsetof(SH2, branch_cache)); emith_write_r_r_offs_ptr_c(DCOND_NE, RET_REG, arg1, offsetof(SH2, branch_cache) + sizeof(void *)); - EMITH_SJMP_END(DCOND_EQ); #endif - emit_block_entry(); + emith_jump_reg_c(DCOND_NE, RET_REG); + EMITH_SJMP_END(DCOND_EQ); // lookup failed, call sh2_translate() emith_move_r_r_ptr(arg0, CONTEXT_REG); emith_ctx_read(arg1, offsetof(SH2, drc_tmp)); // tcache_id emith_call(sh2_translate); - emit_block_entry(); +/* just after lookup function, jump to address returned */ + emith_tst_r_r_ptr(RET_REG, RET_REG); + EMITH_SJMP_START(DCOND_EQ); + emith_jump_reg_c(DCOND_NE, RET_REG); + EMITH_SJMP_END(DCOND_EQ); // XXX: can't translate, fail emith_call(dr_failure); emith_flush(); @@ -4486,9 +4475,7 @@ static void sh2_generate_utils(void) emith_call(sh2_drc_read32); if (arg0 != RET_REG) emith_move_r_r(arg0, RET_REG); -#if defined(__i386__) || defined(__x86_64__) - emith_add_r_r_ptr_imm(xSP, xSP, sizeof(void *)); // fix stack -#endif + emith_call_cleanup(); emith_jump(sh2_drc_dispatcher); rcache_invalidate(); emith_flush(); @@ -4581,6 +4568,7 @@ static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nol return; } +#if LINK_BRANCHES // remove from hash table, make incoming links unresolved if (bd->active) { for (i = 0; i < bd->entry_count; i++) { @@ -4596,8 +4584,10 @@ static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nol add_to_block_list(&inactive_blocks[tcache_id], bd); } bd->active = 0; +#endif if (free) { +#if LINK_BRANCHES // revoke outgoing links for (bl = bd->entryp[0].o_links; bl != NULL; bl = bl->o_next) { if (bl->target) @@ -4609,6 +4599,7 @@ static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nol blink_free[bl->tcache_id] = bl; } 
bd->entryp[0].o_links = NULL; +#endif // invalidate block rm_from_block_lists(bd); bd->addr = bd->size = bd->addr_lit = bd->size_lit = 0; @@ -4619,7 +4610,7 @@ static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nol static void sh2_smc_rm_blocks(u32 a, int tcache_id, u32 shift) { struct block_list **blist, *entry, *next; - u32 mask = ram_sizes[tcache_id] - 1; + u32 mask = RAM_SIZE(tcache_id) - 1; u32 wtmask = ~0x20000000; // writethrough area mask u32 start_addr, end_addr; u32 start_lit, end_lit; @@ -4722,7 +4713,7 @@ static void block_stats(void) for (i = 0; i < block_counts[b]; i++) if (block_tables[b][i].addr != 0) total += block_tables[b][i].refcount; - for (i = block_limit[b]; i < block_max_counts[b]; i++) + for (i = block_limit[b]; i < BLOCK_MAX_COUNT(b); i++) if (block_tables[b][i].addr != 0) total += block_tables[b][i].refcount; } @@ -4739,7 +4730,7 @@ static void block_stats(void) maxb = blk; } } - for (i = block_limit[b]; i < block_max_counts[b]; i++) { + for (i = block_limit[b]; i < BLOCK_MAX_COUNT(b); i++) { blk = &block_tables[b][i]; if (blk->addr != 0 && blk->refcount > max) { max = blk->refcount; @@ -4757,7 +4748,7 @@ static void block_stats(void) for (b = 0; b < ARRAY_SIZE(block_tables); b++) { for (i = 0; i < block_counts[b]; i++) block_tables[b][i].refcount = 0; - for (i = block_limit[b]; i < block_max_counts[b]; i++) + for (i = block_limit[b]; i < BLOCK_MAX_COUNT(b); i++) block_tables[b][i].refcount = 0; } #endif @@ -4774,7 +4765,7 @@ void entry_stats(void) for (i = 0; i < block_counts[b]; i++) for (j = 0; j < block_tables[b][i].entry_count; j++) total += block_tables[b][i].entryp[j].entry_count; - for (i = block_limit[b]; i < block_max_counts[b]; i++) + for (i = block_limit[b]; i < BLOCK_MAX_COUNT(b); i++) for (j = 0; j < block_tables[b][i].entry_count; j++) total += block_tables[b][i].entryp[j].entry_count; } @@ -4793,7 +4784,7 @@ void entry_stats(void) maxb = &blk->entryp[j]; } } - for (i = block_limit[b]; i < 
block_max_counts[b]; i++) { + for (i = block_limit[b]; i < BLOCK_MAX_COUNT(b); i++) { blk = &block_tables[b][i]; for (j = 0; j < blk->entry_count; j++) if (blk->entryp[j].entry_count > max) { @@ -4813,7 +4804,7 @@ void entry_stats(void) for (i = 0; i < block_counts[b]; i++) for (j = 0; j < block_tables[b][i].entry_count; j++) block_tables[b][i].entryp[j].entry_count = 0; - for (i = block_limit[b]; i < block_max_counts[b]; i++) + for (i = block_limit[b]; i < BLOCK_MAX_COUNT(b); i++) for (j = 0; j < block_tables[b][i].entry_count; j++) block_tables[b][i].entryp[j].entry_count = 0; } @@ -4871,7 +4862,15 @@ static void bcache_stats(void) for (i = 1; i < ARRAY_SIZE(sh2s->rts_cache); i++) if (sh2s[0].rts_cache[i].pc == -1 && sh2s[1].rts_cache[i].pc == -1) break; - printf("return cache hits:%d misses:%d depth: %d\n", rchit, rcmiss, i); + printf("return cache hits:%d misses:%d depth: %d index: %d/%d\n", rchit, rcmiss, i,sh2s[0].rts_cache_idx,sh2s[1].rts_cache_idx); + for (i = 0; i < ARRAY_SIZE(sh2s[0].rts_cache); i++) { + printf("%08x ",sh2s[0].rts_cache[i].pc); + if ((i+1) % 8 == 0) printf("\n"); + } + for (i = 0; i < ARRAY_SIZE(sh2s[1].rts_cache); i++) { + printf("%08x ",sh2s[1].rts_cache[i].pc); + if ((i+1) % 8 == 0) printf("\n"); + } #endif #if BRANCH_CACHE printf("branch cache hits:%d misses:%d\n", bchit, bcmiss); @@ -4920,31 +4919,31 @@ int sh2_drc_init(SH2 *sh2) if (block_tables[0] == NULL) { for (i = 0; i < TCACHE_BUFFERS; i++) { - block_tables[i] = calloc(block_max_counts[i], sizeof(*block_tables[0])); + block_tables[i] = calloc(BLOCK_MAX_COUNT(i), sizeof(*block_tables[0])); if (block_tables[i] == NULL) goto fail; // max 2 block links (exits) per block - block_link_pool[i] = calloc(block_link_pool_max_counts[i], + block_link_pool[i] = calloc(BLOCK_LINK_MAX_COUNT(i), sizeof(*block_link_pool[0])); if (block_link_pool[i] == NULL) goto fail; - inval_lookup[i] = calloc(ram_sizes[i] / INVAL_PAGE_SIZE, + inval_lookup[i] = calloc(RAM_SIZE(i) / INVAL_PAGE_SIZE, 
sizeof(inval_lookup[0])); if (inval_lookup[i] == NULL) goto fail; - hash_tables[i] = calloc(hash_table_sizes[i], sizeof(*hash_tables[0])); + hash_tables[i] = calloc(HASH_TABLE_SIZE(i), sizeof(*hash_tables[0])); if (hash_tables[i] == NULL) goto fail; - unresolved_links[i] = calloc(hash_table_sizes[i], sizeof(*unresolved_links[0])); + unresolved_links[i] = calloc(HASH_TABLE_SIZE(i), sizeof(*unresolved_links[0])); if (unresolved_links[i] == NULL) goto fail; } memset(block_counts, 0, sizeof(block_counts)); for (i = 0; i < ARRAY_SIZE(block_counts); i++) { - block_limit[i] = block_max_counts[i] - 1; + block_limit[i] = BLOCK_MAX_COUNT(i) - 1; } memset(block_link_pool_counts, 0, sizeof(block_link_pool_counts)); for (i = 0; i < ARRAY_SIZE(blink_free); i++) { @@ -5044,12 +5043,12 @@ void sh2_drc_finish(SH2 *sh2) static void *dr_get_pc_base(u32 pc, SH2 *sh2) { - void *ret = NULL; + void *ret; u32 mask = 0; ret = p32x_sh2_get_mem_ptr(pc, &mask, sh2); - if (ret == NULL) - return (void *)-1; // NULL is valid value + if (ret == (void *)-1) + return ret; return (char *)ret - (pc & ~mask); } diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index b098f6c6b..38e47c0bc 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -43,6 +43,7 @@ unsigned short scan_block(unsigned int base_pc, int is_slave, #else #warning "direct DRC register access not available for this host" #endif +#endif #ifdef DRC_SR_REG #define __DRC_DECLARE_SR(SR) register int sh2_sr asm(#SR) @@ -59,4 +60,3 @@ unsigned short scan_block(unsigned int base_pc, int is_slave, #define DRC_SAVE_SR(sh2) #define DRC_RESTORE_SR(sh2) #endif -#endif diff --git a/platform/common/common.mak b/platform/common/common.mak index 0c7e349c3..f4e5b8c3f 100644 --- a/platform/common/common.mak +++ b/platform/common/common.mak @@ -171,7 +171,7 @@ DASM = $(R)platform/libpicofe/linux/host_dasm.c DASMLIBS = -lbfd -lopcodes -liberty ifeq "$(ARCH)" "arm" ifeq ($(filter_out $(shell $(CC) --print-file-name=libbfd.so),"/"),) -DASM = 
$(R)platform/common/host_dasm_arm.c +DASM = $(R)platform/common/host_dasm.c DASMLIBS = endif endif diff --git a/platform/common/host_dasm_arm.c b/platform/common/host_dasm.c similarity index 88% rename from platform/common/host_dasm_arm.c rename to platform/common/host_dasm.c index 7951b7d92..d0537ef63 100644 --- a/platform/common/host_dasm_arm.c +++ b/platform/common/host_dasm.c @@ -1,9 +1,15 @@ #include #include +#include #include +#ifdef __mips__ +#include "dismips.c" +#define disasm dismips +#else #include "disarm.c" - +#define disasm disarm +#endif /* symbols */ typedef struct { const char *name; void *value; } asymbol; @@ -40,7 +46,8 @@ void host_dasm(void *addr, int len) insn = *(long *)addr; printf(" %08lx %08lx ", (long)addr, insn); - if(disarm((unsigned)addr, insn, buf, sizeof(buf))) { + if(disasm((unsigned)addr, insn, buf, sizeof(buf))) + { symaddr = 0; if ((insn & 0xe000000) == 0xa000000) { symaddr = (long)addr + 8 + ((long)(insn << 8) >> 6); @@ -53,7 +60,7 @@ void host_dasm(void *addr, int len) else printf("%s\n", buf); } else - printf("unknown\n"); + printf("unknown (0x%08lx)\n", insn); addr = (char *)addr + sizeof(long); } } From a34b8bed7e8dfd0672aeec9c1ff991888fa2caf3 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 25 Jun 2019 20:24:11 +0200 Subject: [PATCH 048/174] SH2 drc: register cache overhaul (bugfixing, speed, readability) --- cpu/sh2/compiler.c | 1178 ++++++++++++++++++++++++-------------------- 1 file changed, 657 insertions(+), 521 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index c1ba3f322..3b03d0c22 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -60,6 +60,7 @@ // 08 - runtime block entry log // 10 - smc self-check // 20 - runtime block entry counter +// 40 - rcache checking // 80 - branch cache statistics // 100 - write trace // 200 - compare trace @@ -67,7 +68,7 @@ // 800 - state dump on exit // { #ifndef DRC_DEBUG -#define DRC_DEBUG 0x0 +#define DRC_DEBUG 0//x8e7 #endif #if DRC_DEBUG @@ -152,13 +153,17 
@@ enum op_types { OP_UNDEFINED, }; -#define OP_ISBRANCH(op) (BITRANGE(OP_BRANCH, OP_BRANCH_RF) & BITMASK1(op)) +// XXX consider trap insns: OP_TRAPA, OP_UNDEFINED? +#define OP_ISBRANCH(op) ((BITRANGE(OP_BRANCH, OP_BRANCH_RF)| BITMASK1(OP_RTE)) \ + & BITMASK1(op)) #define OP_ISBRAUC(op) (BITMASK4(OP_BRANCH, OP_BRANCH_R, OP_BRANCH_RF, OP_RTE) \ & BITMASK1(op)) -#define OP_ISBRACND(op) (BITMASK2(OP_BRANCH_CT, OP_BRANCH_CF) & BITMASK1(op)) +#define OP_ISBRACND(op) (BITMASK3(OP_BRANCH_CT, OP_BRANCH_CF, OP_BRANCH_N) \ + & BITMASK1(op)) #define OP_ISBRAIMM(op) (BITMASK3(OP_BRANCH, OP_BRANCH_CT, OP_BRANCH_CF) \ - & BITMASK1(op)) -#define OP_ISBRAIND(op) (BITMASK2(OP_BRANCH_R, OP_BRANCH_RF) & BITMASK1(op)) + & BITMASK1(op)) +#define OP_ISBRAIND(op) (BITMASK3(OP_BRANCH_R, OP_BRANCH_RF, OP_RTE) \ + & BITMASK1(op)) #ifdef DRC_SH2 @@ -192,7 +197,9 @@ static char sh2dasm_buff[64]; } #if (DRC_DEBUG & (8|256|512|1024)) || defined(PDB) +#if (DRC_DEBUG & (256|512|1024)) static SH2 csh2[2][8]; +#endif static void REGPARM(3) *sh2_drc_log_entry(void *block, SH2 *sh2, u32 sr) { if (block != NULL) { @@ -386,7 +393,8 @@ enum { typedef struct { u8 hreg; // "host" reg u8 flags:4; // TEMP or REG? - u8 type:4; + u8 type:2; // CACHED or TEMP? + u8 ref:2; // ref counter u16 stamp; // kind of a timestamp u32 gregs; // "guest" reg mask } cache_reg_t; @@ -415,8 +423,9 @@ static void rcache_free_tmp(int hr); // used by handlers in worst case (currently 4). // Register assignment goes by ABI convention. Caller save registers are TEMP, // the others are either static or REG. SR must be static, R0 very recommended. +// VBR, PC, PR must not be static (read from context in utils). // TEMP registers first, REG last. alloc/evict algorithm depends on this. -// The 1st TEMP must not be RET_REG on x86 (it uses temps for some insns). +// The 1st TEMP must not be RET_REG on platforms using temps in insns (eg. x86). // XXX shouldn't this be somehow defined in the code emitters? 
#ifdef __arm__ #include "../drc/emit_arm.c" @@ -438,7 +447,7 @@ static guest_reg_t guest_regs[] = { }; // OABI/EABI: params: r0-r3, return: r0-r1, temp: r12,r14, saved: r4-r8,r10,r11 -// SP,PC: r13,r15 must not be used. saved: r9 (for platform use, e.g. on OSx) +// SP,PC: r13,r15 must not be used. saved: r9 (for platform use, e.g. on ios) static cache_reg_t cache_regs[] = { { 12, HRF_TEMP }, // temps { 14, HRF_TEMP }, @@ -1216,43 +1225,93 @@ typedef struct { gconst_t gconsts[ARRAY_SIZE(guest_regs)]; static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr); +static inline int rcache_is_cached(sh2_reg_e r); static void rcache_add_vreg_alias(int x, sh2_reg_e r); static void rcache_remove_vreg_alias(int x, sh2_reg_e r); +static void rcache_evict_vreg(int x); +static void rcache_remap_vreg(int x); #define RCACHE_DUMP(msg) { \ cache_reg_t *cp; \ guest_reg_t *gp; \ int i; \ printf("cache dump %s:\n",msg); \ - printf("cache_regs:\n"); \ + printf(" cache_regs:\n"); \ for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { \ cp = &cache_regs[i]; \ - if (cp->type != HR_FREE || cp->gregs) \ - printf("%d: hr=%d t=%d f=%x m=%x\n", i, cp->hreg, cp->type, cp->flags, cp->gregs); \ + if (cp->type != HR_FREE || cp->gregs || (cp->flags & ~(HRF_REG|HRF_TEMP))) \ + printf(" %d: hr=%d t=%d f=%x c=%d m=%x\n", i, cp->hreg, cp->type, cp->flags, cp->ref, cp->gregs); \ } \ - printf("guest_regs:\n"); \ + printf(" guest_regs:\n"); \ for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { \ gp = &guest_regs[i]; \ - if (gp->vreg != -1 || gp->sreg >= 0) \ - printf("%d: v=%d f=%x s=%d\n", i, gp->vreg, gp->flags, gp->sreg); \ + if (gp->vreg != -1 || gp->sreg >= 0 || gp->flags) \ + printf(" %d: v=%d f=%x s=%d c=%d\n", i, gp->vreg, gp->flags, gp->sreg, gp->cnst); \ + } \ + printf(" gconsts:\n"); \ + for (i = 0; i < ARRAY_SIZE(gconsts); i++) { \ + if (gconsts[i].gregs) \ + printf(" %d: m=%x v=%x\n", i, gconsts[i].gregs, gconsts[i].val); \ } \ } +#define RCACHE_CHECK(msg) { \ + cache_reg_t *cp; \ 
+ guest_reg_t *gp; \ + int i, x, d = 0; \ + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { \ + cp = &cache_regs[i]; \ + if (cp->type == HR_FREE || cp->type == HR_TEMP) continue; \ + /* check connectivity greg->vreg */ \ + FOR_ALL_BITS_SET_DO(cp->gregs, x, \ + if (guest_regs[x].vreg != i) \ + { d = 1; printf("cache check v=%d r=%d not connected?\n",i,x); } \ + ) \ + } \ + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { \ + gp = &guest_regs[i]; \ + if (gp->vreg != -1 && !(cache_regs[gp->vreg].gregs & (1 << i))) \ + { d = 1; printf("cache check r=%d v=%d not connected?\n", i, gp->vreg); }\ + if (gp->vreg != -1 && cache_regs[gp->vreg].type != HR_STATIC && cache_regs[gp->vreg].type != HR_CACHED) \ + { d = 1; printf("cache check r=%d v=%d wrong type?\n", i, gp->vreg); }\ + if ((gp->flags & GRF_CONST) && !(gconsts[gp->cnst].gregs & (1 << i))) \ + { d = 1; printf("cache check r=%d c=%d not connected?\n", i, gp->cnst); }\ + if ((gp->flags & GRF_CDIRTY) && (gp->vreg != -1 || !(gp->flags & GRF_CONST)) )\ + { d = 1; printf("cache check r=%d CDIRTY?\n", i); } \ + } \ + for (i = 0; i < ARRAY_SIZE(gconsts); i++) { \ + FOR_ALL_BITS_SET_DO(gconsts[i].gregs, x, \ + if (guest_regs[x].cnst != i || !(guest_regs[x].flags & GRF_CONST)) \ + { d = 1; printf("cache check c=%d v=%d not connected?\n",i,x); } \ + ) \ + } \ + if (d) RCACHE_DUMP(msg) \ +/* else { \ + printf("locked regs %s:\n",msg); \ + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { \ + cp = &cache_regs[i]; \ + if (cp->flags & HRF_LOCKED) \ + printf(" %d: hr=%d t=%d f=%x c=%d m=%x\n", i, cp->hreg, cp->type, cp->flags, cp->ref, cp->gregs); \ + } \ + } */ \ +} + #if PROPAGATE_CONSTANTS static inline int gconst_alloc(sh2_reg_e r) { int i, n = -1; for (i = 0; i < ARRAY_SIZE(gconsts); i++) { - if (gconsts[i].gregs & (1 << r)) - gconsts[i].gregs &= ~(1 << r); + gconsts[i].gregs &= ~(1 << r); if (gconsts[i].gregs == 0 && n < 0) n = i; } if (n >= 0) gconsts[n].gregs = (1 << r); - else + else { + printf("all gconst buffers in use, 
aborting\n"); exit(1); // cannot happen - more constants than guest regs? + } return n; } @@ -1274,19 +1333,6 @@ static void gconst_new(sh2_reg_e r, u32 val) if (guest_regs[r].vreg >= 0) rcache_remove_vreg_alias(guest_regs[r].vreg, r); } - -static void gconst_copy(sh2_reg_e rd, sh2_reg_e rs) -{ - if (guest_regs[rd].flags & GRF_CONST) { - guest_regs[rd].flags &= ~(GRF_CONST|GRF_CDIRTY); - gconsts[guest_regs[rd].cnst].gregs &= ~(1 << rd); - } - if (guest_regs[rs].flags & GRF_CONST) { - guest_regs[rd].flags |= GRF_CONST; - guest_regs[rd].cnst = guest_regs[rs].cnst; - gconsts[guest_regs[rd].cnst].gregs |= (1 << rd); - } -} #endif static int gconst_get(sh2_reg_e r, u32 *val) @@ -1310,17 +1356,22 @@ static int gconst_check(sh2_reg_e r) static int gconst_try_read(int vreg, sh2_reg_e r) { int i, x; + if (guest_regs[r].flags & GRF_CDIRTY) { x = guest_regs[r].cnst; emith_move_r_imm(cache_regs[vreg].hreg, gconsts[x].val); FOR_ALL_BITS_SET_DO(gconsts[x].gregs, i, { - if (guest_regs[i].vreg >= 0 && i != r) + if (guest_regs[i].vreg >= 0 && guest_regs[i].vreg != vreg) rcache_remove_vreg_alias(guest_regs[i].vreg, i); - rcache_add_vreg_alias(vreg, i); + if (guest_regs[i].vreg < 0) + rcache_add_vreg_alias(vreg, i); guest_regs[i].flags &= ~GRF_CDIRTY; guest_regs[i].flags |= GRF_DIRTY; }); + if (cache_regs[vreg].type != HR_STATIC) + cache_regs[vreg].type = HR_CACHED; + cache_regs[vreg].flags |= HRF_DIRTY; return 1; } return 0; @@ -1339,11 +1390,23 @@ static u32 gconst_dirty_mask(void) static void gconst_kill(sh2_reg_e r) { - if (guest_regs[r].flags &= ~(GRF_CONST|GRF_CDIRTY)) + if (guest_regs[r].flags & (GRF_CONST|GRF_CDIRTY)) gconsts[guest_regs[r].cnst].gregs &= ~(1 << r); guest_regs[r].flags &= ~(GRF_CONST|GRF_CDIRTY); } +static void gconst_copy(sh2_reg_e rd, sh2_reg_e rs) +{ + gconst_kill(rd); + if (guest_regs[rs].flags & GRF_CONST) { + guest_regs[rd].flags |= GRF_CONST; + if (guest_regs[rd].vreg < 0) + guest_regs[rd].flags |= GRF_CDIRTY; + guest_regs[rd].cnst = 
guest_regs[rs].cnst; + gconsts[guest_regs[rd].cnst].gregs |= (1 << rd); + } +} + static void gconst_clean(void) { int i; @@ -1367,25 +1430,76 @@ static void gconst_invalidate(void) } } + static u16 rcache_counter; -static u32 rcache_static; -static u32 rcache_locked; -static u32 rcache_hint_soon; -static u32 rcache_hint_late; -static u32 rcache_hint_write; -static u32 rcache_hint_clean; -#define rcache_hint (rcache_hint_soon|rcache_hint_late) +// SH2 register usage bitmasks +static u32 rcache_regs_static; // statically allocated regs +static u32 rcache_regs_now; // regs used in current insn +static u32 rcache_regs_soon; // regs used in the next few insns +static u32 rcache_regs_late; // regs used in later insns +static u32 rcache_regs_discard; // regs overwritten without being used +static u32 rcache_regs_clean; // regs needing cleaning +// combination masks XXX this seems obscure +#define rcache_regs_used (rcache_regs_soon|rcache_regs_late|rcache_regs_clean) +#define rcache_regs_nowused (rcache_regs_now|rcache_regs_used) +#define rcache_regs_nowsoon (rcache_regs_now|rcache_regs_soon) +#define rcache_regs_soonclean (rcache_regs_soon|rcache_regs_clean) + +static void rcache_ref_vreg(int x) +{ + if (x >= 0) { + cache_regs[x].ref ++; + cache_regs[x].flags |= HRF_LOCKED; + } +} + +static void rcache_unref_vreg(int x) +{ + if (x >= 0 && -- cache_regs[x].ref == 0) { + cache_regs[x].flags &= ~HRF_LOCKED; + } +} + +static void rcache_free_vreg(int x) +{ + if (cache_regs[x].type != HR_STATIC) + cache_regs[x].type = HR_FREE; + cache_regs[x].flags &= (HRF_REG|HRF_TEMP); + cache_regs[x].gregs = 0; + cache_regs[x].ref = 0; +} static void rcache_unmap_vreg(int x) { int i; FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, i, + if (guest_regs[i].flags & GRF_DIRTY) { + // if a dirty reg is unmapped save its value to context + if (~rcache_regs_discard & (1 << i)) + emith_ctx_write(cache_regs[x].hreg, i * 4); + guest_regs[i].flags &= ~GRF_DIRTY; + } guest_regs[i].vreg = -1); - if 
(cache_regs[x].type != HR_STATIC) - cache_regs[x].type = HR_FREE; - cache_regs[x].gregs = 0; - cache_regs[x].flags &= (HRF_REG|HRF_TEMP); + rcache_free_vreg(x); +} + +static void rcache_move_vreg(int d, int x) +{ + int i; + + if (cache_regs[d].type != HR_STATIC) + cache_regs[d].type = HR_CACHED; + cache_regs[d].gregs = cache_regs[x].gregs; + cache_regs[d].flags &= (HRF_TEMP|HRF_REG); + cache_regs[d].flags |= cache_regs[x].flags & ~(HRF_TEMP|HRF_REG); + cache_regs[d].ref = 0; + cache_regs[d].stamp = cache_regs[x].stamp; + emith_move_r_r(cache_regs[d].hreg, cache_regs[x].hreg); + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) + if (guest_regs[i].vreg == x) + guest_regs[i].vreg = d; + rcache_free_vreg(x); } static void rcache_clean_vreg(int x) @@ -1394,99 +1508,112 @@ static void rcache_clean_vreg(int x) if (cache_regs[x].flags & HRF_DIRTY) { // writeback cache_regs[x].flags &= ~HRF_DIRTY; + rcache_ref_vreg(x); FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, r, if (guest_regs[r].flags & GRF_DIRTY) { if (guest_regs[r].flags & GRF_STATIC) { if (guest_regs[r].vreg != guest_regs[r].sreg) { if (!(cache_regs[guest_regs[r].sreg].flags & HRF_LOCKED)) { // statically mapped reg not in its sreg. 
move back to sreg - rcache_clean_vreg(guest_regs[r].sreg); - rcache_unmap_vreg(guest_regs[r].sreg); - emith_move_r_r(cache_regs[guest_regs[r].sreg].hreg, cache_regs[guest_regs[r].vreg].hreg); + rcache_evict_vreg(guest_regs[r].sreg); + emith_move_r_r(cache_regs[guest_regs[r].sreg].hreg, + cache_regs[guest_regs[r].vreg].hreg); rcache_remove_vreg_alias(x, r); rcache_add_vreg_alias(guest_regs[r].sreg, r); cache_regs[guest_regs[r].sreg].flags |= HRF_DIRTY; } else { // must evict since sreg is locked - emith_ctx_write(cache_regs[x].hreg, r * 4); + if (~rcache_regs_discard & (1 << r)) + emith_ctx_write(cache_regs[x].hreg, r * 4); guest_regs[r].flags &= ~GRF_DIRTY; - guest_regs[r].vreg = -1; + rcache_remove_vreg_alias(x, r); } - } - } else if (~rcache_hint_write & (1 << r)) { - emith_ctx_write(cache_regs[x].hreg, r * 4); + } else + cache_regs[x].flags |= HRF_DIRTY; + } else { + if (~rcache_regs_discard & (1 << r)) + emith_ctx_write(cache_regs[x].hreg, r * 4); guest_regs[r].flags &= ~GRF_DIRTY; } + rcache_regs_clean &= ~(1 << r); }) + rcache_unref_vreg(x); } +#if DRC_DEBUG & 64 + RCACHE_CHECK("after clean"); +#endif } static void rcache_add_vreg_alias(int x, sh2_reg_e r) { cache_regs[x].gregs |= (1 << r); guest_regs[r].vreg = x; + if (cache_regs[x].type != HR_STATIC) + cache_regs[x].type = HR_CACHED; } static void rcache_remove_vreg_alias(int x, sh2_reg_e r) { cache_regs[x].gregs &= ~(1 << r); - if (!cache_regs[x].gregs) { + if (!cache_regs[x].gregs) // no reg mapped -> free vreg - if (cache_regs[x].type != HR_STATIC) - cache_regs[x].type = HR_FREE; - cache_regs[x].flags &= (HRF_REG|HRF_TEMP); - } + rcache_free_vreg(x); guest_regs[r].vreg = -1; } static void rcache_evict_vreg(int x) { +#if REMAP_REGISTER + rcache_remap_vreg(x); +#else rcache_clean_vreg(x); +#endif rcache_unmap_vreg(x); } static void rcache_evict_vreg_aliases(int x, sh2_reg_e r) { - cache_regs[x].gregs &= ~(1 << r); + rcache_remove_vreg_alias(x, r); rcache_evict_vreg(x); - cache_regs[x].gregs = (1 << r); - 
if (cache_regs[x].type != HR_STATIC) - cache_regs[x].type = HR_CACHED; - if (guest_regs[r].flags & GRF_DIRTY) - cache_regs[x].flags |= HRF_DIRTY; + rcache_add_vreg_alias(x, r); } -static cache_reg_t *rcache_evict(void) +static int rcache_allocate(int what, int minprio) { // evict reg with oldest stamp (only for HRF_REG, no temps) int i, i_prio, oldest = -1, prio = 0; u16 min_stamp = (u16)-1; for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { - // consider only unlocked REG - if (!(cache_regs[i].flags & HRF_REG) || (cache_regs[i].flags & HRF_LOCKED)) + // consider only unlocked REG or non-TEMP + if (cache_regs[i].flags == 0 || (cache_regs[i].flags & HRF_LOCKED)) + continue; + if ((what > 0 && !(cache_regs[i].flags & HRF_REG)) || + (what == 0 && (cache_regs[i].flags & HRF_TEMP)) || + (what < 0 && !(cache_regs[i].flags & HRF_TEMP))) continue; - if (cache_regs[i].type == HR_FREE || (cache_regs[i].type == HR_TEMP)) { + if (cache_regs[i].type == HR_FREE || cache_regs[i].type == HR_TEMP) { + // REG is free + prio = 6; oldest = i; break; } if (cache_regs[i].type == HR_CACHED) { - if (rcache_locked & cache_regs[i].gregs) + if (rcache_regs_now & cache_regs[i].gregs) // REGs needed for the current insn i_prio = 1; - else if (rcache_hint_soon & cache_regs[i].gregs) - // REGs needed in some future insn + else if (rcache_regs_soon & cache_regs[i].gregs) + // REGs needed in the next insns i_prio = 2; - else if (rcache_hint_late & cache_regs[i].gregs) + else if (rcache_regs_late & cache_regs[i].gregs) // REGs needed in some future insn i_prio = 3; - else if ((rcache_hint_write & cache_regs[i].gregs) != cache_regs[i].gregs) - // REGs not needed soon + else if (!(~rcache_regs_discard & cache_regs[i].gregs)) + // REGs not needed in the foreseeable future i_prio = 4; else // REGs soon overwritten anyway i_prio = 5; - if (prio < i_prio || (prio == i_prio && cache_regs[i].stamp < min_stamp)) { min_stamp = cache_regs[i].stamp; oldest = i; @@ -1495,25 +1622,66 @@ static cache_reg_t 
*rcache_evict(void) } } - if (oldest == -1) { - printf("no registers to evict, aborting\n"); - exit(1); - } + + if (prio < minprio || oldest == -1) + return -1; if (cache_regs[oldest].type == HR_CACHED) rcache_evict_vreg(oldest); - cache_regs[oldest].type = HR_FREE; - cache_regs[oldest].flags &= (HRF_TEMP|HRF_REG); - cache_regs[oldest].gregs = 0; + else + rcache_free_vreg(oldest); + + return oldest; +} + +static int rcache_allocate_vreg(int needed) +{ + int x; + + // get a free reg, but use temps only if r is not needed soon + for (x = ARRAY_SIZE(cache_regs) - 1; x >= 0; x--) { + if (cache_regs[x].flags && (cache_regs[x].type == HR_FREE || + (cache_regs[x].type == HR_TEMP && !(cache_regs[x].flags & HRF_LOCKED))) && + (!needed || (cache_regs[x].flags & HRF_REG))) + break; + } + + if (x < 0) + x = rcache_allocate(1, 0); + return x; +} - return &cache_regs[oldest]; +static int rcache_allocate_nontemp(void) +{ + int x = rcache_allocate(0, 3); + return x; +} + +static int rcache_allocate_temp(void) +{ + int x; + + // use any free reg, but prefer TEMP regs + for (x = 0; x < ARRAY_SIZE(cache_regs); x++) { + if (cache_regs[x].flags && (cache_regs[x].type == HR_FREE || + (cache_regs[x].type == HR_TEMP && !(cache_regs[x].flags & HRF_LOCKED)))) + break; + } + + if (x >= ARRAY_SIZE(cache_regs)) + x = rcache_allocate(-1, 1); + if (x < 0) { + printf("no temp register available, aborting\n"); + exit(1); + } + return x; } #if REMAP_REGISTER // maps a host register to a REG static int rcache_map_reg(sh2_reg_e r, int hr, int mode) { - int i; + int x, i; gconst_kill(r); @@ -1527,11 +1695,13 @@ static int rcache_map_reg(sh2_reg_e r, int hr, int mode) // deal with statically mapped regs if (mode == RC_GR_RMW && (guest_regs[r].flags & GRF_STATIC)) { - if (guest_regs[r].vreg == guest_regs[r].sreg) { + x = guest_regs[r].sreg; + if (guest_regs[r].vreg == x) { // STATIC in its sreg with no aliases, and some processing pending - if (cache_regs[guest_regs[r].vreg].gregs == 1 << r) - return 
cache_regs[guest_regs[r].vreg].hreg; - } else if (!cache_regs[guest_regs[r].sreg].gregs) + if (cache_regs[x].gregs == 1 << r) + return cache_regs[x].hreg; + } else if (cache_regs[x].type == HR_FREE || + (cache_regs[x].type == HR_TEMP && !(cache_regs[x].flags & HRF_LOCKED))) // STATIC not in its sreg, with sreg available -> move it i = guest_regs[r].sreg; } @@ -1540,187 +1710,184 @@ static int rcache_map_reg(sh2_reg_e r, int hr, int mode) if (guest_regs[r].vreg >= 0) rcache_remove_vreg_alias(guest_regs[r].vreg, r); if (cache_regs[i].type == HR_CACHED) - rcache_unmap_vreg(i); + rcache_evict_vreg(i); // set new mappping if (cache_regs[i].type != HR_STATIC) cache_regs[i].type = HR_CACHED; cache_regs[i].gregs = 1 << r; cache_regs[i].flags &= (HRF_TEMP|HRF_REG); + cache_regs[i].ref = 0; cache_regs[i].stamp = ++rcache_counter; - cache_regs[i].flags |= HRF_DIRTY|HRF_LOCKED; + cache_regs[i].flags |= HRF_DIRTY; + rcache_ref_vreg(i); guest_regs[r].flags |= GRF_DIRTY; guest_regs[r].vreg = i; +#if DRC_DEBUG & 64 + RCACHE_CHECK("after map"); +#endif return cache_regs[i].hreg; } -// remap vreg from a TEMP to a REG if it is hinted (upcoming TEMP invalidation) -static void rcache_remap_vreg(int r) +// remap vreg from a TEMP to a REG if it will be used (upcoming TEMP invalidation) +static void rcache_remap_vreg(int x) { - int i, j, free = -1, cached = -1, hinted = -1; - u16 min_stamp_cached = (u16)-1, min_stamp_hinted = -1; + int d; - // r must be a vreg - if (cache_regs[r].type != HR_CACHED) + // x must be a cached vreg + if (cache_regs[x].type != HR_CACHED && cache_regs[x].type != HR_STATIC) return; - // if r is already a REG or isn't used, clean here to avoid data loss on inval - if ((cache_regs[r].flags & HRF_REG) || !(rcache_hint & cache_regs[r].gregs)) { - rcache_clean_vreg(r); + // don't do it if x is already a REG or isn't used or to be cleaned anyway + if ((cache_regs[x].flags & HRF_REG) || + !(rcache_regs_used & ~rcache_regs_clean & cache_regs[x].gregs)) { + // clean here 
to avoid data loss on invalidation + rcache_clean_vreg(x); return; } - // find REG, either free or unused temp or oldest cached - for (i = 0; i < ARRAY_SIZE(cache_regs) && free < 0; i++) { - if ((cache_regs[i].flags & HRF_TEMP) || (cache_regs[i].flags & HRF_LOCKED)) - continue; - if (cache_regs[i].type == HR_FREE || cache_regs[i].type == HR_TEMP) - free = i; - if (cache_regs[i].type == HR_CACHED && !(rcache_hint & cache_regs[i].gregs)) { - if (cache_regs[i].stamp < min_stamp_cached) { - min_stamp_cached = cache_regs[i].stamp; - cached = i; - } - } - if (cache_regs[i].type == HR_CACHED && !(rcache_hint_soon & cache_regs[i].gregs) - && (rcache_hint_soon & cache_regs[r].gregs)) - if (cache_regs[i].stamp < min_stamp_hinted) { - min_stamp_hinted = cache_regs[i].stamp; - hinted = i; - } + if (cache_regs[x].flags & HRF_LOCKED) { + printf("remap vreg %d is locked\n", x); + exit(1); } - if (free >= 0) { - i = free; - } else if (cached >= 0 && cached != r) { - i = cached; - rcache_evict_vreg(i); - } else if (hinted >= 0 && hinted != r) { - i = hinted; - rcache_evict_vreg(i); - } else { - rcache_clean_vreg(r); + // allocate a non-TEMP vreg + rcache_ref_vreg(x); // lock to avoid evicting x + d = rcache_allocate_nontemp(); + rcache_unref_vreg(x); + if (d < 0) { + rcache_clean_vreg(x); return; } - // set new mapping and remove old one - cache_regs[i].type = HR_CACHED; - cache_regs[i].gregs = cache_regs[r].gregs; - cache_regs[i].flags &= (HRF_TEMP|HRF_REG); - cache_regs[i].flags |= cache_regs[r].flags & ~(HRF_TEMP|HRF_REG); - cache_regs[i].stamp = cache_regs[r].stamp; - emith_move_r_r(cache_regs[i].hreg, cache_regs[r].hreg); - for (j = 0; j < ARRAY_SIZE(guest_regs); j++) - if (guest_regs[j].vreg == r) - guest_regs[j].vreg = i; - cache_regs[r].type = HR_FREE; - cache_regs[r].flags &= (HRF_TEMP|HRF_REG); - cache_regs[r].gregs = 0; + // move vreg to new location + rcache_move_vreg(d, x); +#if DRC_DEBUG & 64 + RCACHE_CHECK("after remap"); +#endif +} +#endif + +#if ALIAS_REGISTERS 
+static void rcache_alias_vreg(sh2_reg_e rd, sh2_reg_e rs) +{ + int x; + + // if s isn't constant, it must be in cache for aliasing + if (!gconst_check(rs)) + rcache_get_reg_(rs, RC_GR_READ, 0, NULL); + + // if d and s are not already aliased + x = guest_regs[rs].vreg; + if (guest_regs[rd].vreg != x) { + // remove possible old mapping of dst + if (guest_regs[rd].vreg >= 0) + rcache_remove_vreg_alias(guest_regs[rd].vreg, rd); + // make dst an alias of src + if (x >= 0) + rcache_add_vreg_alias(x, rd); + // if d is now in cache, it must be dirty + if (guest_regs[rd].vreg >= 0) { + x = guest_regs[rd].vreg; + cache_regs[x].flags |= HRF_DIRTY; + guest_regs[rd].flags |= GRF_DIRTY; + } + } + + gconst_copy(rd, rs); +#if DRC_DEBUG & 64 + RCACHE_CHECK("after alias"); +#endif } #endif // note: must not be called when doing conditional code static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr) { - cache_reg_t *tr = NULL; - int i, h, split = -1; + int src, dst, ali; + cache_reg_t *tr; - rcache_counter++; + dst = src = guest_regs[r].vreg; - // maybe already cached? - // if so, prefer against gconst (they must be in sync) - i = guest_regs[r].vreg; - if ((guest_regs[r].flags & GRF_STATIC) && i != guest_regs[r].sreg && + rcache_ref_vreg(src); // lock to avoid evicting src + // good opportunity to relocate a remapped STATIC? 
+ if ((guest_regs[r].flags & GRF_STATIC) && src != guest_regs[r].sreg && !(cache_regs[guest_regs[r].sreg].flags & HRF_LOCKED) && - (i < 0 || mode != RC_GR_READ) && - !((rcache_hint_soon|rcache_locked) & cache_regs[guest_regs[r].sreg].gregs)) { - // good opportunity to relocate a remapped STATIC - h = guest_regs[r].sreg; - rcache_evict_vreg(h); - tr = &cache_regs[h]; - tr->gregs = 1 << r; - if (i >= 0) { - if (mode != RC_GR_WRITE) { - if (hr) - *hr = cache_regs[i].hreg; - else - emith_move_r_r(cache_regs[h].hreg, cache_regs[i].hreg); - hr = NULL; - } - rcache_remove_vreg_alias(guest_regs[r].vreg, r); - } else if (mode != RC_GR_WRITE) { - if (gconst_try_read(h, r)) { - tr->flags |= HRF_DIRTY; - guest_regs[r].flags |= GRF_DIRTY; - } else - emith_ctx_read(tr->hreg, r * 4); - } - guest_regs[r].vreg = guest_regs[r].sreg; - goto end; - } else if (i >= 0) { - if (mode == RC_GR_READ || !(cache_regs[i].gregs & ~(1 << r))) { - // either only reading, or no multiple mapping - tr = &cache_regs[i]; - goto end; - } - // split if aliases needed rsn, or already locked, or r is STATIC in sreg - if (((rcache_hint|rcache_locked) & cache_regs[i].gregs & ~(1 << r)) || - (cache_regs[i].flags & HRF_LOCKED) || - (cache_regs[i].type == HR_STATIC && !(guest_regs[r].flags & GRF_STATIC))) { - // need to split up. take reg out here to avoid unnecessary writebacks - rcache_remove_vreg_alias(i, r); - split = i; - } else { - // aliases not needed anytime soon, remove them - // XXX split aliases away if writing and static and not locked and hinted? 
- rcache_evict_vreg_aliases(i, r); - tr = &cache_regs[i]; - goto end; + (src < 0 || mode != RC_GR_READ) && + !(rcache_regs_nowsoon & cache_regs[guest_regs[r].sreg].gregs)) { + dst = guest_regs[r].sreg; + rcache_evict_vreg(dst); + } else if (dst < 0) { + // allocate a cache register + if ((dst = rcache_allocate_vreg(rcache_regs_nowsoon & (1 << r))) < 0) { + printf("no registers to evict, aborting\n"); + exit(1); } } - - // get a free reg, but use temps only if r is not needed soon - for (i = ARRAY_SIZE(cache_regs) - 1; i >= 0; i--) { - if ((cache_regs[i].type == HR_FREE || - (cache_regs[i].type == HR_TEMP && !(cache_regs[i].flags & HRF_LOCKED))) && - (!(rcache_hint & (1 << r)) || (cache_regs[i].flags & HRF_REG))) { - tr = &cache_regs[i]; - break; + tr = &cache_regs[dst]; + tr->stamp = rcache_counter; + rcache_unref_vreg(src); + // remove r from src + if (src >= 0 && src != dst) + rcache_remove_vreg_alias(src, r); + + // if r has a constant it may have aliases + if (mode != RC_GR_WRITE && gconst_try_read(dst, r)) + src = dst; + + // if r will be modified, check for aliases being needed rsn + ali = tr->gregs & ~(1 << r); + if (mode != RC_GR_READ && src == dst && ali) { + int x = -1; + if (rcache_regs_nowsoon & ali) { + if (tr->type == HR_STATIC && guest_regs[r].sreg == dst && + !(tr->flags & HRF_LOCKED)) { + // split aliases if r is STATIC in sreg and dst isn't already locked + rcache_ref_vreg(dst); // lock to avoid evicting dst + if ((x = rcache_allocate_vreg(rcache_regs_nowsoon & ali)) >= 0) { + src = x; + rcache_move_vreg(src, dst); + } + rcache_unref_vreg(dst); + } else { + // split r + rcache_ref_vreg(src); // lock to avoid evicting src + if ((x = rcache_allocate_vreg(rcache_regs_nowsoon & (1 << r))) >= 0) { + dst = x; + tr = &cache_regs[dst]; + tr->stamp = rcache_counter; + } + rcache_unref_vreg(src); + } } + if (x < 0) + // aliases not needed or no vreg available, remove them + rcache_evict_vreg_aliases(dst, r); + else if (src != dst) + 
rcache_remove_vreg_alias(src, r); } - if (!tr) - tr = rcache_evict(); - - tr->type = HR_CACHED; - tr->gregs = 1 << r; - guest_regs[r].vreg = tr - cache_regs; - - if (mode != RC_GR_WRITE) { - if (gconst_try_read(guest_regs[r].vreg, r)) { - tr->flags |= HRF_DIRTY; - guest_regs[r].flags |= GRF_DIRTY; - } else if (split >= 0) { - if (hr) { - cache_regs[split].flags |= HRF_LOCKED; - *hr = cache_regs[split].hreg; - hr = NULL; - } else if (tr->hreg != cache_regs[split].hreg) - emith_move_r_r(tr->hreg, cache_regs[split].hreg); - } else - emith_ctx_read(tr->hreg, r * 4); - } + // assign r to dst + rcache_add_vreg_alias(dst, r); -end: - if (hr) - *hr = tr->hreg; + // handle dst register transfer + if (src < 0 && mode != RC_GR_WRITE) + emith_ctx_read(tr->hreg, r * 4); + if (hr) { + *hr = (src >= 0 ? cache_regs[src].hreg : tr->hreg); + rcache_ref_vreg(reg_map_host[*hr]); + } else if (src >= 0 && cache_regs[src].hreg != tr->hreg) + emith_move_r_r(tr->hreg, cache_regs[src].hreg); + + // housekeeping if (do_locking) - tr->flags |= HRF_LOCKED; - tr->stamp = rcache_counter; + rcache_ref_vreg(dst); if (mode != RC_GR_READ) { tr->flags |= HRF_DIRTY; guest_regs[r].flags |= GRF_DIRTY; gconst_kill(r); } - +#if DRC_DEBUG & 64 + RCACHE_CHECK("after getreg"); +#endif return tr->hreg; } @@ -1731,38 +1898,25 @@ static int rcache_get_reg(sh2_reg_e r, rc_gr_mode mode, int *hr) static int rcache_get_tmp(void) { - cache_reg_t *tr = NULL; int i; - // use any free reg, but prefer TEMP regs - for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { - if (cache_regs[i].type == HR_FREE || - (cache_regs[i].type == HR_TEMP && !(cache_regs[i].flags & HRF_LOCKED))) { - tr = &cache_regs[i]; - break; - } - } - - if (!tr) - tr = rcache_evict(); + i = rcache_allocate_temp(); + rcache_ref_vreg(i); - tr->type = HR_TEMP; - tr->flags |= HRF_LOCKED; - return tr->hreg; + cache_regs[i].type = HR_TEMP; + return cache_regs[i].hreg; } -static int rcache_get_hr_id(int hr) +static int rcache_get_vreg_hr(int hr) { int i; i = 
reg_map_host[hr]; - if (i < 0) // can't happen + if (i < 0 || (cache_regs[i].flags & HRF_LOCKED)) { + printf("host register %d is locked\n", hr); exit(1); + } -#if REMAP_REGISTER - if (cache_regs[i].type == HR_CACHED) - rcache_remap_vreg(i); -#endif if (cache_regs[i].type == HR_CACHED) rcache_evict_vreg(i); else if (cache_regs[i].type == HR_TEMP && (cache_regs[i].flags & HRF_LOCKED)) { @@ -1773,167 +1927,110 @@ static int rcache_get_hr_id(int hr) return i; } -static int rcache_get_arg_id(int arg) +static int rcache_get_vreg_arg(int arg) { int hr = 0; host_arg2reg(hr, arg); - return rcache_get_hr_id(hr); + return rcache_get_vreg_hr(hr); } // get a reg to be used as function arg static int rcache_get_tmp_arg(int arg) { - int id = rcache_get_arg_id(arg); - cache_regs[id].type = HR_TEMP; - cache_regs[id].flags |= HRF_LOCKED; + int x = rcache_get_vreg_arg(arg); + cache_regs[x].type = HR_TEMP; + rcache_ref_vreg(x); - return cache_regs[id].hreg; + return cache_regs[x].hreg; } // ... as return value after a call static int rcache_get_tmp_ret(void) { - int id = rcache_get_hr_id(RET_REG); - cache_regs[id].type = HR_TEMP; - cache_regs[id].flags |= HRF_LOCKED; + int x = rcache_get_vreg_hr(RET_REG); + cache_regs[x].type = HR_TEMP; + rcache_ref_vreg(x); - return cache_regs[id].hreg; + return cache_regs[x].hreg; } // same but caches a reg if access is readonly (announced by hr being NULL) static int rcache_get_reg_arg(int arg, sh2_reg_e r, int *hr) { - int i, srcr, dstr, dstid; - int dirty = 0, src_dirty = 0, is_const = 0, is_cached = 0; + int i, srcr, dstr, dstid, keep; u32 val; host_arg2reg(dstr, arg); i = guest_regs[r].vreg; if (i >= 0 && cache_regs[i].type == HR_CACHED && cache_regs[i].hreg == dstr) - // r is already in arg + // r is already in arg, avoid evicting dstid = i; else - dstid = rcache_get_arg_id(arg); + dstid = rcache_get_vreg_arg(arg); dstr = cache_regs[dstid].hreg; - if (rcache_hint & (1 << r)) { + if (rcache_is_cached(r)) { // r is needed later on anyway srcr = 
rcache_get_reg_(r, RC_GR_READ, 0, NULL); - is_cached = (cache_regs[reg_map_host[srcr]].type == HR_CACHED); - } else if (!(rcache_hint_clean & (1 << r)) && - (guest_regs[r].flags & GRF_CDIRTY) && gconst_get(r, &val)) { + keep = 1; + } else if ((guest_regs[r].flags & GRF_CDIRTY) && gconst_get(r, &val)) { // r has an uncomitted const - load into arg, but keep constant uncomitted srcr = dstr; - is_const = 1; - } else if ((i = guest_regs[r].vreg) >= 0) { - // maybe already cached? - srcr = cache_regs[i].hreg; - is_cached = (cache_regs[reg_map_host[srcr]].type == HR_CACHED); + emith_move_r_imm(srcr, val); + keep = 0; } else { - // must read either const or from ctx + // must read from ctx srcr = dstr; - if (rcache_static & (1 << r)) - srcr = rcache_get_reg_(r, RC_GR_READ, 0, NULL); - else if (gconst_try_read(dstid, r)) - dirty = 1; - else - emith_ctx_read(srcr, r * 4); + emith_ctx_read(srcr, r * 4); + keep = 1; } - if (is_cached) { - i = reg_map_host[srcr]; - if (srcr == dstr) { // evict aliases here since it is reallocated below - if (guest_regs[r].flags & GRF_STATIC) // move STATIC back to its sreg - rcache_clean_vreg(guest_regs[r].vreg); -#if REMAP_REGISTER - rcache_remap_vreg(i); -#endif - if (cache_regs[i].type == HR_CACHED) - rcache_evict_vreg(i); - } - else if (hr != NULL) // must lock srcr if not copied here - cache_regs[i].flags |= HRF_LOCKED; - if (guest_regs[r].flags & GRF_DIRTY) - src_dirty = 1; - } + if (cache_regs[dstid].type == HR_CACHED) + rcache_evict_vreg(dstid); cache_regs[dstid].type = HR_TEMP; - if (is_const) { - // uncomitted constant - emith_move_r_imm(srcr, val); - } else if (dstr != srcr) { - // arg is a copy of cached r - if (hr == NULL) + if (hr == NULL) { + if (dstr != srcr) + // arg is a copy of cached r emith_move_r_r(dstr, srcr); - } else if (hr != NULL) { - // caller will modify arg, so it will soon be out of sync with r - if (dirty || src_dirty) { - if (~rcache_hint_write & (1 << r)) { - emith_ctx_write(dstr, r * 4); // must clean since 
arg will be modified - guest_regs[r].flags &= ~GRF_DIRTY; - } - } + else if (keep && guest_regs[r].vreg < 0) + // keep arg as vreg for r + rcache_add_vreg_alias(dstid, r); } else { - // keep arg as vreg for r - cache_regs[dstid].type = HR_CACHED; - if (guest_regs[r].vreg < 0) { - cache_regs[dstid].gregs = 1 << r; - guest_regs[r].vreg = dstid; - } - if (dirty || src_dirty) { // mark as modifed for cleaning later on - cache_regs[dstid].flags |= HRF_DIRTY; - guest_regs[r].flags |= GRF_DIRTY; - } - } - - if (hr) *hr = srcr; + if (dstr != srcr) // must lock srcr if not copied here + rcache_ref_vreg(reg_map_host[srcr]); + } cache_regs[dstid].stamp = ++rcache_counter; - cache_regs[dstid].flags |= HRF_LOCKED; + rcache_ref_vreg(dstid); +#if DRC_DEBUG & 64 + RCACHE_CHECK("after getarg"); +#endif return dstr; } static void rcache_free_tmp(int hr) { int i = reg_map_host[hr]; + if (i < 0 || cache_regs[i].type != HR_TEMP) { printf("rcache_free_tmp fail: #%i hr %d, type %d\n", i, hr, cache_regs[i].type); - return; + exit(1); } - cache_regs[i].type = HR_FREE; - cache_regs[i].flags &= (HRF_REG|HRF_TEMP); + rcache_free_vreg(i); } // saves temporary result either in REG or in drctmp static int rcache_save_tmp(int hr) { - int i, free = -1, cached = -1; - u16 min_stamp = (u16)-1; + int i; // find REG, either free or unlocked temp or oldest non-hinted cached - for (i = 0; i < ARRAY_SIZE(cache_regs) && free < 0; i++) { - if ((cache_regs[i].flags & HRF_TEMP) || (cache_regs[i].flags & HRF_LOCKED)) - continue; - if (cache_regs[i].type == HR_FREE || cache_regs[i].type == HR_TEMP) - free = i; - if (cache_regs[i].type == HR_CACHED && - !((rcache_hint | rcache_locked) & cache_regs[i].gregs)) { - if (cache_regs[i].stamp < min_stamp) { - min_stamp = cache_regs[i].stamp; - cached = i; - } - } - } - - if (free >= 0) - i = free; - else if (cached >= 0) { - i = cached; - rcache_evict_vreg(i); - } else { + i = rcache_allocate_nontemp(); + if (i < 0) { // if none is available, store in drctmp 
emith_ctx_write(hr, offsetof(SH2, drc_tmp)); rcache_free_tmp(hr); @@ -1943,27 +2040,27 @@ static int rcache_save_tmp(int hr) cache_regs[i].type = HR_CACHED; cache_regs[i].gregs = 0; // not storing any guest register cache_regs[i].flags &= (HRF_TEMP|HRF_REG); - cache_regs[i].flags |= HRF_LOCKED; + cache_regs[i].ref = 0; cache_regs[i].stamp = ++rcache_counter; + rcache_ref_vreg(i); emith_move_r_r(cache_regs[i].hreg, hr); rcache_free_tmp(hr); return i; } -static int rcache_restore_tmp(int r) +static int rcache_restore_tmp(int x) { int hr; // find REG with tmp store: cached but with no gregs - if (r >= 0) { - if (cache_regs[r].type != HR_CACHED || cache_regs[r].gregs) { - printf("invalid tmp storage %d\n", r); + if (x >= 0) { + if (cache_regs[x].type != HR_CACHED || cache_regs[x].gregs) { + printf("invalid tmp storage %d\n", x); exit(1); } // found, transform to a TEMP - cache_regs[r].type = HR_TEMP; - cache_regs[r].flags |= HRF_LOCKED; - return cache_regs[r].hreg; + cache_regs[x].type = HR_TEMP; + return cache_regs[x].hreg; } // if not available, create a TEMP store and fetch from drctmp @@ -1973,51 +2070,57 @@ static int rcache_restore_tmp(int r) return hr; } -static void rcache_unlock(int hr) +static void rcache_free(int hr) { - if (hr >= 0) { - cache_regs[hr].flags &= ~HRF_LOCKED; - rcache_locked &= ~cache_regs[hr].gregs; + int x = reg_map_host[hr]; + if (cache_regs[x].type == HR_TEMP) + rcache_free_tmp(hr); + else + rcache_unref_vreg(x); +} + +static void rcache_unlock(int x) +{ + if (x >= 0) { + cache_regs[x].flags &= ~HRF_LOCKED; + cache_regs[x].ref = 0; +// rcache_regs_now &= ~cache_regs[x].gregs; } } static void rcache_unlock_all(void) { int i; - for (i = 0; i < ARRAY_SIZE(cache_regs); i++) + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { cache_regs[i].flags &= ~HRF_LOCKED; + cache_regs[i].ref = 0; + } } -static inline void rcache_set_locked(u32 mask) -{ - rcache_locked = mask & ~rcache_static; -} - -static inline void rcache_set_hint_soon(u32 mask) +static 
inline void rcache_set_usage_now(u32 mask) { - rcache_hint_soon = mask & ~rcache_static; + rcache_regs_now = mask; } -static inline void rcache_set_hint_late(u32 mask) +static inline void rcache_set_usage_soon(u32 mask) { - rcache_hint_late = mask & ~rcache_static; + rcache_regs_soon = mask; } -static inline void rcache_set_hint_write(u32 mask) +static inline void rcache_set_usage_late(u32 mask) { - rcache_hint_write = mask & ~rcache_static; + rcache_regs_late = mask; } -static inline int rcache_is_hinted(sh2_reg_e r) +static inline void rcache_set_usage_discard(u32 mask) { - // consider static REGs as always hinted, since they are always there - return ((rcache_hint | rcache_static) & (1 << r)); + rcache_regs_discard = mask; } static inline int rcache_is_cached(sh2_reg_e r) { - // consider static REGs as always hinted, since they are always there - return (guest_regs[r].vreg >= 0); + // is r in cache or needed RSN? + return (guest_regs[r].vreg >= 0 || (rcache_regs_soonclean & (1 << r))); } static inline int rcache_is_hreg_used(int hr) @@ -2028,7 +2131,7 @@ static inline int rcache_is_hreg_used(int hr) (cache_regs[x].type != HR_TEMP || (cache_regs[x].flags & HRF_LOCKED)); } -static inline u32 rcache_used_hreg_mask(void) +static inline u32 rcache_used_hregs_mask(void) { u32 mask = 0; int i; @@ -2038,7 +2141,7 @@ static inline u32 rcache_used_hreg_mask(void) (cache_regs[i].type != HR_TEMP || (cache_regs[i].flags & HRF_LOCKED))) mask |= 1 << cache_regs[i].hreg; - return mask & ~rcache_static; + return mask; } static inline u32 rcache_dirty_mask(void) @@ -2054,13 +2157,13 @@ static inline u32 rcache_dirty_mask(void) return mask; } -static inline u32 rcache_reg_mask(void) +static inline u32 rcache_cached_mask(void) { u32 mask = 0; int i; for (i = 0; i < ARRAY_SIZE(cache_regs); i++) - if (cache_regs[i].type == HR_CACHED) + if (cache_regs[i].type == HR_CACHED || cache_regs[i].type == HR_STATIC) mask |= cache_regs[i].gregs; return mask; @@ -2070,26 +2173,40 @@ static void 
rcache_clean_tmp(void) { int i; + rcache_regs_clean = (1 << ARRAY_SIZE(guest_regs)) - 1; for (i = 0; i < ARRAY_SIZE(cache_regs); i++) - if (cache_regs[i].type == HR_CACHED && (cache_regs[i].flags & HRF_TEMP)) + if (cache_regs[i].type == HR_CACHED && (cache_regs[i].flags & HRF_TEMP)) { + rcache_unlock(i); #if REMAP_REGISTER rcache_remap_vreg(i); #else rcache_clean_vreg(i); #endif + } + rcache_regs_clean = 0; } -static void rcache_clean_mask(u32 mask) +static void rcache_clean_masked(u32 mask) { - int i; + int i, r, hr; - if (!(mask &= ~rcache_static)) + if (!(mask &= ~rcache_regs_static)) return; - rcache_hint_clean |= mask; - - // clean only vregs where all aliases are covered by the mask + rcache_regs_clean |= mask; + + // clean constants where all aliases are covered by the mask + for (i = 0; i < ARRAY_SIZE(gconsts); i++) + if ((gconsts[i].gregs & mask) && !(gconsts[i].gregs & ~mask)) { + FOR_ALL_BITS_SET_DO(gconsts[i].gregs, r, + if (guest_regs[r].flags & GRF_CDIRTY) { + hr = rcache_get_reg_(r, RC_GR_READ, 0, NULL); + rcache_clean_vreg(reg_map_host[hr]); + break; + }); + } + // clean vregs where all aliases are covered by the mask for (i = 0; i < ARRAY_SIZE(cache_regs); i++) - if (cache_regs[i].type == HR_CACHED && + if ((cache_regs[i].type == HR_CACHED || cache_regs[i].type == HR_STATIC) && (cache_regs[i].gregs & mask) && !(cache_regs[i].gregs & ~mask)) rcache_clean_vreg(i); } @@ -2099,9 +2216,30 @@ static void rcache_clean(void) int i; gconst_clean(); + rcache_regs_clean = (1 << ARRAY_SIZE(guest_regs)) - 1; for (i = ARRAY_SIZE(cache_regs)-1; i >= 0; i--) if (cache_regs[i].type == HR_CACHED || cache_regs[i].type == HR_STATIC) rcache_clean_vreg(i); + + // relocate statics to their sregs (necessary before conditional jumps) + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { + if ((guest_regs[i].flags & GRF_STATIC) && + guest_regs[i].vreg != guest_regs[i].sreg) { + rcache_ref_vreg(guest_regs[i].vreg); + rcache_evict_vreg(guest_regs[i].sreg); + 
rcache_unref_vreg(guest_regs[i].vreg); + if (guest_regs[i].vreg < 0) + emith_ctx_read(cache_regs[guest_regs[i].sreg].hreg, i*4); + else + emith_move_r_r(cache_regs[guest_regs[i].sreg].hreg, + cache_regs[guest_regs[i].vreg].hreg); + cache_regs[guest_regs[i].sreg].gregs = 1 << i; + cache_regs[guest_regs[i].sreg].flags |= HRF_DIRTY; + guest_regs[i].flags |= GRF_DIRTY; + guest_regs[i].vreg = guest_regs[i].sreg; + } + } + rcache_regs_clean = 0; } static void rcache_invalidate_tmp(void) @@ -2110,11 +2248,11 @@ static void rcache_invalidate_tmp(void) for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { if (cache_regs[i].flags & HRF_TEMP) { + rcache_unlock(i); if (cache_regs[i].type == HR_CACHED) - rcache_unmap_vreg(i); - cache_regs[i].type = HR_FREE; - cache_regs[i].flags &= (HRF_TEMP|HRF_REG); - cache_regs[i].gregs = 0; + rcache_evict_vreg(i); + else + rcache_free_vreg(i); } } } @@ -2122,33 +2260,26 @@ static void rcache_invalidate_tmp(void) static void rcache_invalidate(void) { int i; - gconst_invalidate(); - for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { - cache_regs[i].flags &= (HRF_TEMP|HRF_REG); - if (cache_regs[i].type != HR_STATIC) - cache_regs[i].type = HR_FREE; - cache_regs[i].gregs = 0; - } + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) + rcache_free_vreg(i); for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { guest_regs[i].flags &= GRF_STATIC; if (!(guest_regs[i].flags & GRF_STATIC)) guest_regs[i].vreg = -1; else { - if (guest_regs[i].vreg < 0) - emith_ctx_read(cache_regs[guest_regs[i].sreg].hreg, i*4); - else if (guest_regs[i].vreg != guest_regs[i].sreg) - emith_move_r_r(cache_regs[guest_regs[i].sreg].hreg, - cache_regs[guest_regs[i].vreg].hreg); cache_regs[guest_regs[i].sreg].gregs = 1 << i; + cache_regs[guest_regs[i].sreg].flags |= HRF_DIRTY; + guest_regs[i].flags |= GRF_DIRTY; guest_regs[i].vreg = guest_regs[i].sreg; } } rcache_counter = 0; - rcache_hint_soon = rcache_hint_late = rcache_hint_write = rcache_hint_clean = 0; + rcache_regs_now = rcache_regs_soon = 
rcache_regs_late = 0; + rcache_regs_discard = rcache_regs_clean = 0; } static void rcache_flush(void) @@ -2171,7 +2302,7 @@ static void rcache_init(void) for (i = 0; i < ARRAY_SIZE(guest_regs); i++) if (guest_regs[i].flags & GRF_STATIC) { - rcache_static |= (1 << i); + rcache_regs_static |= (1 << i); guest_regs[i].sreg = reg_map_host[guest_regs[i].sreg]; cache_regs[guest_regs[i].sreg].type = HR_STATIC; } else @@ -2191,7 +2322,7 @@ static void rcache_init(void) // --------------------------------------------------------------- // NB may return either REG or TEMP -static int emit_get_rbase_and_offs(SH2 *sh2, sh2_reg_e r, int rmod, u32 *offs) +static int emit_get_rbase_and_offs(SH2 *sh2, sh2_reg_e r, int rmode, u32 *offs) { uptr omask = 0xff; // offset mask, XXX: ARM oriented.. u32 mask = 0; @@ -2225,21 +2356,19 @@ static int emit_get_rbase_and_offs(SH2 *sh2, sh2_reg_e r, int rmod, u32 *offs) return hr; } + // ROM, SDRAM. Host address should be mmapped to be equal to SH2 address. la = (uptr)*(void **)((char *)sh2 + poffs); - // accessing ROM or SDRAM, code location doesn't matter. The host address - // for these should be mmapped to be equal to the SH2 address. - // if r is in rcache or needed soon anyway, and offs is relative to region - // use rcached const to avoid loading a literal on ARM - if ((guest_regs[r].vreg >= 0 || ((guest_regs[r].flags & GRF_CDIRTY) && - ((rcache_hint_soon|rcache_hint_clean) & (1 << r)))) && !(*offs & ~mask)) { + + // if r is in rcache or needed soon anyway, and offs is relative to region, + // and address translation fits in add_ptr_imm (s32), then use rcached const + if (la == (s32)la && !(*offs & ~mask) && rcache_is_cached(r)) { u32 odd = a & 1; // need to fix odd address for correct byte addressing la -= (s32)((a & ~mask) - *offs - odd); // diff between reg and memory - // if reg is modified later on, allocate it RMW to remove aliases here - // else the aliases vreg stays locked and a vreg shortage may occur. 
- hr = hr2 = rcache_get_reg(r, rmod ? RC_GR_RMW : RC_GR_READ, NULL); + hr = hr2 = rcache_get_reg(r, rmode, NULL); if ((la & ~omask) - odd) { hr = rcache_get_tmp(); emith_add_r_r_ptr_imm(hr, hr2, (la & ~omask) - odd); + rcache_free(hr2); } *offs = (la & omask); } else { @@ -2285,39 +2414,55 @@ static void emit_move_r_imm32(sh2_reg_e dst, u32 imm) static void emit_move_r_r(sh2_reg_e dst, sh2_reg_e src) { - int hr_d, hr_s; - - if (guest_regs[src].vreg >= 0 || gconst_check(src) || rcache_is_hinted(src)) { - hr_s = rcache_get_reg(src, RC_GR_READ, NULL); + if (gconst_check(src) || rcache_is_cached(src)) { #if ALIAS_REGISTERS - // check for aliasing - int i = guest_regs[src].vreg; - if (guest_regs[dst].vreg != i) { - // remove possible old mapping of dst - if (guest_regs[dst].vreg >= 0) - rcache_remove_vreg_alias(guest_regs[dst].vreg, dst); - // make dst an alias of src - rcache_add_vreg_alias(i, dst); - cache_regs[i].flags |= HRF_DIRTY; - guest_regs[dst].flags |= GRF_DIRTY; - gconst_kill(dst); -#if PROPAGATE_CONSTANTS - gconst_copy(dst, src); -#endif - return; - } -#endif - hr_d = rcache_get_reg(dst, RC_GR_WRITE, NULL); + rcache_alias_vreg(dst, src); +#else + int hr_s = rcache_get_reg(src, RC_GR_READ, NULL); + int hr_d = rcache_get_reg(dst, RC_GR_WRITE, NULL); emith_move_r_r(hr_d, hr_s); -#if PROPAGATE_CONSTANTS gconst_copy(dst, src); #endif } else { - hr_d = rcache_get_reg(dst, RC_GR_WRITE, NULL); + int hr_d = rcache_get_reg(dst, RC_GR_WRITE, NULL); emith_ctx_read(hr_d, src * 4); } } +static void emit_add_r_imm(sh2_reg_e r, u32 imm) +{ + u32 val; + int isgc = gconst_get(r, &val); + int hr, hr2; + + if (!isgc || rcache_is_cached(r)) { + // not constant, or r is already in cache + hr = rcache_get_reg(r, RC_GR_RMW, &hr2); + emith_add_r_r_imm(hr, hr2, imm); + rcache_free(hr2); + if (isgc) + gconst_set(r, val + imm); + } else + gconst_new(r, val + imm); +} + +static void emit_sub_r_imm(sh2_reg_e r, u32 imm) +{ + u32 val; + int isgc = gconst_get(r, &val); + int hr, hr2; + + 
if (!isgc || rcache_is_cached(r)) { + // not constant, or r is already in cache + hr = rcache_get_reg(r, RC_GR_RMW, &hr2); + emith_sub_r_r_imm(hr, hr2, imm); + rcache_free(hr2); + if (isgc) + gconst_set(r, val - imm); + } else + gconst_new(r, val - imm); +} + static void emit_sync_t_to_sr(void) { // avoid reloading SR from context if there's nothing to do @@ -2335,8 +2480,9 @@ static int emit_memhandler_read(int size) #ifndef DRC_SR_REG // must writeback cycles for poll detection stuff if (guest_regs[SHR_SR].vreg != -1) - rcache_evict_vreg(guest_regs[SHR_SR].vreg); + rcache_unmap_vreg(guest_regs[SHR_SR].vreg); #endif + rcache_invalidate_tmp(); if (size & MF_POLLING) switch (size & MF_SIZEMASK) { @@ -2351,7 +2497,6 @@ static int emit_memhandler_read(int size) case 2: emith_call(sh2_drc_read32); break; // 32 } - rcache_invalidate_tmp(); return rcache_get_tmp_ret(); } @@ -2362,16 +2507,15 @@ static void emit_memhandler_write(int size) rcache_clean_tmp(); #ifndef DRC_SR_REG if (guest_regs[SHR_SR].vreg != -1) - rcache_evict_vreg(guest_regs[SHR_SR].vreg); + rcache_unmap_vreg(guest_regs[SHR_SR].vreg); #endif + rcache_invalidate_tmp(); switch (size & MF_SIZEMASK) { case 0: emith_call(sh2_drc_write8); break; // 8 case 1: emith_call(sh2_drc_write16); break; // 16 case 2: emith_call(sh2_drc_write32); break; // 32 } - - rcache_invalidate_tmp(); } // rd = @(Rs,#offs); rd < 0 -> return a temp @@ -2389,12 +2533,13 @@ static int emit_memhandler_read_rr(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rs, u32 off emit_move_r_imm32(rd, val); hr2 = rcache_get_reg(rd, RC_GR_RMW, NULL); } - if ((size & MF_POSTINCR) && gconst_get(rs, &val)) - gconst_new(rs, val + (1 << (size & MF_SIZEMASK))); + if (size & MF_POSTINCR) + emit_add_r_imm(rs, 1 << (size & MF_SIZEMASK)); return hr2; } - hr = emit_get_rbase_and_offs(sh2, rs, size & MF_POSTINCR, &offs); + val = size & MF_POSTINCR; + hr = emit_get_rbase_and_offs(sh2, rs, val ? 
RC_GR_RMW : RC_GR_READ, &offs); if (hr != -1) { if (rd == SHR_TMP) hr2 = rcache_get_tmp(); @@ -2405,24 +2550,14 @@ static int emit_memhandler_read_rr(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rs, u32 off case 1: emith_read16s_r_r_offs(hr2, hr, offs); break; // 16 case 2: emith_read_r_r_offs(hr2, hr, offs); emith_ror(hr2, hr2, 16); break; } - if (cache_regs[reg_map_host[hr]].type == HR_TEMP) // may also return REG - rcache_free_tmp(hr); - if (size & MF_POSTINCR) { - int isgc = gconst_get(rs, &val); - if (!isgc || guest_regs[rs].vreg >= 0) { - // already loaded - hr = rcache_get_reg(rs, RC_GR_RMW, NULL); - emith_add_r_r_imm(hr, hr, 1 << (size & MF_SIZEMASK)); - if (isgc) - gconst_set(rs, val + (1 << (size & MF_SIZEMASK))); - } else - gconst_new(rs, val + (1 << (size & MF_SIZEMASK))); - } + rcache_free(hr); + if (size & MF_POSTINCR) + emit_add_r_imm(rs, 1 << (size & MF_SIZEMASK)); return hr2; } #endif - if (gconst_get(rs, &val) && guest_regs[rs].vreg < 0 && !(rcache_hint_soon & (1 << rs))) { + if (gconst_get(rs, &val) && !rcache_is_cached(rs)) { hr = rcache_get_tmp_arg(0); emith_move_r_imm(hr, val + offs); if (size & MF_POSTINCR) @@ -2432,6 +2567,8 @@ static int emit_memhandler_read_rr(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rs, u32 off hr2 = rcache_get_reg(rs, RC_GR_RMW, NULL); emith_add_r_r_imm(hr, hr2, offs); emith_add_r_imm(hr2, 1 << (size & MF_SIZEMASK)); + if (gconst_get(rs, &val)) + gconst_set(rs, val + (1 << (size & MF_SIZEMASK))); } else { hr = rcache_get_reg_arg(0, rs, &hr2); if (offs || hr != hr2) @@ -2463,30 +2600,34 @@ static void emit_memhandler_write_rr(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rs, u32 o u32 val; if (rd == SHR_TMP) { - host_arg2reg(hr2, 1); + host_arg2reg(hr2, 1); // already locked and prepared by caller } else if ((size & MF_PREDECR) && rd == rs) { // must avoid caching rd in arg1 hr2 = rcache_get_reg_arg(1, rd, &hr); - if (hr != hr2) emith_move_r_r(hr2, hr); + if (hr != hr2) { + emith_move_r_r(hr2, hr); + rcache_free(hr2); + } } else hr2 = 
rcache_get_reg_arg(1, rd, NULL); + if (rd != SHR_TMP) + rcache_unlock(guest_regs[rd].vreg); // unlock in case rd is in arg0 - if (gconst_get(rs, &val) && guest_regs[rs].vreg < 0 && !(rcache_hint_soon & (1 << rs))) { + if (gconst_get(rs, &val) && !rcache_is_cached(rs)) { + hr = rcache_get_tmp_arg(0); if (size & MF_PREDECR) { val -= 1 << (size & MF_SIZEMASK); gconst_new(rs, val); } - hr = rcache_get_tmp_arg(0); emith_move_r_imm(hr, val + offs); } else if (offs || (size & MF_PREDECR)) { - if (size & MF_PREDECR) { - hr = rcache_get_reg(rs, RC_GR_RMW, &hr2); - emith_sub_r_r_imm(hr, hr2, 1 << (size & MF_SIZEMASK)); - } + if (size & MF_PREDECR) + emit_sub_r_imm(rs, 1 << (size & MF_SIZEMASK)); + rcache_unlock(guest_regs[rs].vreg); // unlock in case rs is in arg0 hr = rcache_get_reg_arg(0, rs, &hr2); if (offs || hr != hr2) emith_add_r_r_imm(hr, hr2, offs); } else - rcache_get_reg_arg(0, rs, NULL); + hr = rcache_get_reg_arg(0, rs, NULL); emit_memhandler_write(size); } @@ -2696,7 +2837,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (op_flags[i] & OF_BTARGET) ADD_TO_ARRAY(branch_target_pc, branch_target_count, pc, ); if (ops[i].op == OP_LDC && (ops[i].dest & BITMASK1(SHR_SR)) && pc+2 < end_pc) - op_flags[i+1] |= OF_BTARGET; // RTE entrypoint in case of SR(IMASK) change + op_flags[i+1] |= OF_BTARGET; // RTE entrypoint in case of SR.IMASK change #if LOOP_DETECTION // loop types detected: // 1. target: ... BRA target -> idle loop @@ -2855,10 +2996,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) drcf.polling = (drcf.loop_type == OF_POLL_LOOP ? 
MF_POLLING : 0); #endif -#if (DRC_DEBUG & ~7) - // must update PC - emit_move_r_imm32(SHR_PC, pc); -#endif rcache_clean(); #if (DRC_DEBUG & 0x10) @@ -2902,17 +3039,20 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_sync_t(sr); rcache_clean(); - tmp = rcache_used_hreg_mask(); + tmp = rcache_used_hregs_mask(); emith_save_caller_regs(tmp); emit_do_static_regs(1, 0); rcache_get_reg_arg(2, SHR_SR, NULL); tmp2 = rcache_get_tmp_arg(0); tmp3 = rcache_get_tmp_arg(1); + tmp4 = rcache_get_tmp_arg(3); emith_move_r_ptr_imm(tmp2, tcache_ptr); - emith_move_r_r_ptr(tmp3,CONTEXT_REG); + emith_move_r_r_ptr(tmp3, CONTEXT_REG); + emith_move_r_imm(tmp4, pc); + emith_ctx_write(tmp4, SHR_PC * 4); + rcache_invalidate_tmp(); emith_call(sh2_drc_log_entry); emith_restore_caller_regs(tmp); - rcache_invalidate_tmp(); #endif do_host_disasm(tcache_id); @@ -2924,9 +3064,10 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); emith_sync_t(sr); + emit_move_r_imm32(SHR_PC, pc); rcache_clean(); - tmp = rcache_used_hreg_mask(); + tmp = rcache_used_hregs_mask(); emith_save_caller_regs(tmp); emit_do_static_regs(1, 0); emith_pass_arg_r(0, CONTEXT_REG); @@ -2990,43 +3131,33 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // dbg(1, "unhandled delay_dep_fw: %x", delay_dep_fw & ~BITMASK1(SHR_T)); if (delay_dep_bk & ~BITMASK2(SHR_PC, SHR_PR)) dbg(1, "unhandled delay_dep_bk: %x", delay_dep_bk); - rcache_set_hint_soon(0); - rcache_set_hint_late(0); - rcache_set_hint_write(0); } - else - { - // inform cache about future register usage - u32 late = 0; // regs read by future ops - u32 write = 0; // regs written to (to detect write before read) - u32 soon = 0; // regs read soon - tmp = (OP_ISBRANCH(opd[0].op) || opd[0].op == OP_RTE || // branching insns - opd[0].op == OP_TRAPA || opd[0].op == OP_UNDEFINED); - for (v = 1; v <= 9; v++) { - // no sense in 
looking any further than the next rcache flush - if (pc + 2*v < end_pc && !(op_flags[i+v] & OF_BTARGET) && - (!tmp || (op_flags[i+v] & OF_DELAY_OP))) { - late |= opd[v].source & ~write; - // ignore source regs after they have been written to - write |= opd[v].dest; - } else { - // upcoming rcache_flush, start writing back unused dirty stuff - tmp2 = write|opd[0].source|opd[0].dest; // insn may change reg aliases - rcache_clean_mask(rcache_dirty_mask() & ~tmp2); - break; - } - tmp |= (OP_ISBRANCH(opd[v].op) || opd[v].op == OP_RTE || - opd[v].op == OP_TRAPA || opd[v].op == OP_UNDEFINED); + + // inform cache about future register usage + u32 late = 0; // regs read by future ops + u32 write = 0; // regs written to (to detect write before read) + u32 soon = 0; // regs read soon + for (v = 1; v <= 9; v++) { + // no sense in looking any further than the next rcache flush + tmp = ((op_flags[i+v] & OF_BTARGET) || (op_flags[i+v-1] & OF_DELAY_OP) || + (OP_ISBRACND(opd[v-1].op) && !(op_flags[i+v] & OF_DELAY_OP))); + if (pc + 2*v <= end_pc && !tmp) { // (pc already incremented above) + late |= opd[v].source & ~write; + // ignore source regs after they have been written to + write |= opd[v].dest; // regs needed in the next few instructions if (v <= 4) soon = late; + } else { + // upcoming rcache_flush, start writing back unused dirty stuff + rcache_clean_masked(rcache_dirty_mask() & ~(write|opd[0].dest)); + break; } - rcache_set_hint_soon(late); // insns 1-3 - rcache_set_hint_late(late & ~soon); // insns 4-9 - rcache_set_hint_write(write & ~(late|soon) & ~opd[0].source); - // overwritten without being used } - rcache_set_locked(opd[0].source); // try not to evict src regs for this op + rcache_set_usage_now(opd[0].source); // current insn + rcache_set_usage_soon(late); // insns 1-3 + rcache_set_usage_late(late & ~soon); // insns 4-9 + rcache_set_usage_discard(write & ~(late|soon) & ~opd[0].source); switch (opd->op) { @@ -3069,7 +3200,8 @@ static void REGPARM(2) *sh2_translate(SH2 
*sh2, int tcache_id) case OP_RTE: // RTE 0000000000101011 emith_invalidate_t(); // pop PC - emit_memhandler_read_rr(sh2, SHR_PC, SHR_SP, 0, 2 | MF_POSTINCR); + tmp = emit_memhandler_read_rr(sh2, SHR_PC, SHR_SP, 0, 2 | MF_POSTINCR); + rcache_free(tmp); // pop SR tmp = emit_memhandler_read_rr(sh2, SHR_TMP, SHR_SP, 0, 2 | MF_POSTINCR); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); @@ -3853,11 +3985,10 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) ///////////////////////////////////////////// case 0x07: // ADD #imm,Rn 0111nnnniiiiiiii - tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); - if (op & 0x80) { // adding negative - emith_sub_r_r_imm(tmp, tmp2, -op & 0xff); - } else - emith_add_r_r_imm(tmp, tmp2, op & 0xff); + if (op & 0x80) // adding negative + emit_sub_r_imm(GET_Rn(), (u8)-op); + else + emit_add_r_imm(GET_Rn(), (u8)op); goto end_op; ///////////////////////////////////////////// @@ -3968,6 +4099,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) end_op: rcache_unlock_all(); +#if DRC_DEBUG & 64 + RCACHE_CHECK("after insn"); +#endif cycles += opd->cycles; @@ -4007,6 +4141,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // idle or delay loop emit_sync_t_to_sr(); emith_sh2_delay_loop(cycles, drcf.delay_reg); + rcache_unlock_all(); // may lock delay_reg drcf.polling = drcf.loop_type = 0; } #endif @@ -4075,8 +4210,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_jump_cond_patchable(cond, target); } else if (target != NULL) { - emith_jump_patchable(target); rcache_invalidate(); + emith_jump_patchable(target); } // branch not taken, correct cycle count @@ -4099,6 +4234,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_sync_t(sr); rcache_clean(); tmp = rcache_get_reg_arg(0, SHR_PC, NULL); + rcache_invalidate(); #if CALL_STACK struct op_data *opd_b = (op_flags[i] & OF_DELAY_OP) ? 
opd-1 : opd; if (opd_b->rm == SHR_PR) { @@ -4108,6 +4244,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // JSR/BSRF tmp = rcache_get_tmp_arg(1); emith_call_link(tmp, sh2_drc_dispatcher_call); + rcache_free(tmp); } else #endif if (gconst_get(SHR_PC, &target_pc)) { @@ -4118,7 +4255,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // JMP emith_jump(sh2_drc_dispatcher); } - rcache_invalidate(); drcf.pending_branch_indirect = 0; drcf.polling = drcf.loop_type = 0; } @@ -4147,8 +4283,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) target = dr_prepare_ext_branch(block->entryp, pc, sh2->is_slave, tcache_id); if (target == NULL) return NULL; - emith_jump_patchable(target); rcache_invalidate(); + emith_jump_patchable(target); } else rcache_flush(); emith_flush(); @@ -4452,14 +4588,14 @@ static void sh2_generate_utils(void) tmp = rcache_get_reg_arg(1, SHR_SR, NULL); emith_clear_msb(tmp, tmp, 22); emith_move_r_r_ptr(arg2, CONTEXT_REG); - emith_call(p32x_sh2_write32); // XXX: use sh2_drc_write32? rcache_invalidate(); + emith_call(p32x_sh2_write32); // XXX: use sh2_drc_write32? 
// push PC rcache_get_reg_arg(0, SHR_SP, NULL); emith_ctx_read(arg1, SHR_PC * 4); emith_move_r_r_ptr(arg2, CONTEXT_REG); - emith_call(p32x_sh2_write32); rcache_invalidate(); + emith_call(p32x_sh2_write32); // update I, cycles, do callback emith_ctx_read(arg1, offsetof(SH2, pending_level)); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); @@ -4476,8 +4612,8 @@ static void sh2_generate_utils(void) if (arg0 != RET_REG) emith_move_r_r(arg0, RET_REG); emith_call_cleanup(); - emith_jump(sh2_drc_dispatcher); rcache_invalidate(); + emith_jump(sh2_drc_dispatcher); emith_flush(); // sh2_drc_entry(SH2 *sh2) From 57f65578f41eb312f6eea19ba64828bab02400a1 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 30 Jul 2019 20:55:48 +0200 Subject: [PATCH 049/174] sh2 drc: add mipsel backend for MIPS32 Release 1 (for JZ47xx) --- Makefile | 5 +- config.gcw0 | 16 + cpu/drc/emit_arm.c | 7 +- cpu/drc/emit_mips.c | 1464 ++++++++++++++++++++++++++++++++++++ cpu/drc/emit_x86.c | 7 +- cpu/sh2/compiler.c | 84 ++- cpu/sh2/compiler.h | 2 + platform/common/common.mak | 2 +- platform/common/disarm.c | 2 +- platform/common/disarm.h | 2 +- platform/common/dismips.c | 346 +++++++++ platform/common/dismips.h | 6 + platform/linux/emu.c | 2 +- 13 files changed, 1922 insertions(+), 23 deletions(-) create mode 100644 config.gcw0 create mode 100644 cpu/drc/emit_mips.c create mode 100644 platform/common/dismips.c create mode 100644 platform/common/dismips.h diff --git a/Makefile b/Makefile index 4bc48780d..7f02a1c96 100644 --- a/Makefile +++ b/Makefile @@ -61,6 +61,9 @@ use_cz80 ?= 1 ifneq (,$(findstring 86,$(ARCH))) use_sh2drc ?= 1 endif +ifneq (,$(findstring mips,$(ARCH))) +use_sh2drc ?= 1 +endif endif -include Makefile.local @@ -267,7 +270,7 @@ pico/carthw_cfg.c: pico/carthw.cfg # random deps pico/carthw/svp/compiler.o : cpu/drc/emit_arm.c cpu/sh2/compiler.o : cpu/drc/emit_arm.c -cpu/sh2/compiler.o : cpu/drc/emit_x86.c +cpu/sh2/compiler.o : cpu/drc/emit_x86.c cpu/drc/emit_mips.c cpu/sh2/mame/sh2pico.o : 
cpu/sh2/mame/sh2.c pico/pico.o pico/cd/mcd.o pico/32x/32x.o : pico/pico_cmn.c pico/pico_int.h pico/memory.o pico/cd/memory.o pico/32x/memory.o : pico/pico_int.h pico/memory.h diff --git a/config.gcw0 b/config.gcw0 new file mode 100644 index 000000000..1d2ccef0a --- /dev/null +++ b/config.gcw0 @@ -0,0 +1,16 @@ +# Automatically generated by configure +# Configured with: './configure' '--platform=generic' +CC = mipsel-gcw0-linux-uclibc-gcc +CXX = mipsel-gcw0-linux-uclibc-g++ +AS = mipsel-gcw0-linux-uclibc-as +STRIP = mipsel-gcw0-linux-uclibc-strip +CFLAGS += -I${HOME}/opt/gcw0-toolchain/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/ +CFLAGS += -I${HOME}/opt/gcw0-toolchain/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/SDL +CFLAGS += -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -fno-stack-protector +ASFLAGS += +LDFLAGS += +LDLIBS += -B${HOME}/opt/gcw0-toolchain/usr/lib -Wl,-rpath-link=${HOME}/opt/gcw0-toolchain/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/lib -Wl,-rpath-link=${HOME}/opt/gcw0-toolchain/usr/mipsel-gcw0-linux-uclibc/sysroot/lib -lSDL -lasound -lpng -lz -lm -lstdc++ -ldl + +ARCH = mipsel +PLATFORM = opendingux +SOUND_DRIVERS = sdl diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 0eb2d9724..72542a3fc 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -1098,11 +1098,14 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_jump_cond_patchable(cond, target) \ emith_jump_cond(cond, target) -#define emith_jump_patch(ptr, target) do { \ +#define emith_jump_patch(ptr, target) ({ \ u32 *ptr_ = ptr; \ u32 val_ = (u32 *)(target) - ptr_ - 2; \ *ptr_ = (*ptr_ & 0xff000000) | (val_ & 0x00ffffff); \ -} while (0) + (u8 *)ptr; \ +}) + +#define emith_jump_patch_size() 4 #define emith_jump_at(ptr, target) do { \ u32 val_ = (u32 *)(target) - (u32 *)(ptr) - 2; \ diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c new file mode 100644 index 000000000..f56b89a31 --- /dev/null +++ b/cpu/drc/emit_mips.c @@ -0,0 +1,1464 @@ 
+/* + * Basic macros to emit MIPS II/MIPS32 Release 1 instructions and some utils + * Copyright (C) 2019 kub + * + * This work is licensed under the terms of MAME license. + * See COPYING file in the top-level directory. + */ +#define HOST_REGS 32 +#define CONTEXT_REG 23 // s7 +#define RET_REG 2 // v0 + +// NB: the ubiquitous JZ47[46]0 uses MIPS32 Release 1, a slight MIPS II superset + +// registers usable for user code: r1-r25, others reserved or special +#define Z0 0 // zero register +#define GP 28 // global pointer +#define SP 29 // stack pointer +#define FP 30 // frame pointer +#define LR 31 // link register +// internally used by code emitter: +#define AT 1 // used to hold intermediate results +#define FNZ 15 // emulated processor flags: N (bit 31), Z (all bits) +#define FC 24 // emulated processor flags: C (bit 0), others 0 +#define FV 25 // emulated processor flags: Nt^Ns (bit 31). others ? + + +// unified conditions; virtual, not corresponding to anything real on MIPS +#define DCOND_EQ 0x0 +#define DCOND_NE 0x1 +#define DCOND_HS 0x2 +#define DCOND_LO 0x3 +#define DCOND_MI 0x4 +#define DCOND_PL 0x5 +#define DCOND_VS 0x6 +#define DCOND_VC 0x7 +#define DCOND_HI 0x8 +#define DCOND_LS 0x9 +#define DCOND_GE 0xa +#define DCOND_LT 0xb +#define DCOND_GT 0xc +#define DCOND_LE 0xd + +#define DCOND_CS DCOND_LO +#define DCOND_CC DCOND_HS + +// unified insn +#define MIPS_INSN(op, rs, rt, rd, sa, fn) \ + (((op)<<26)|((rs)<<21)|((rt)<<16)|((rd)<<11)|((sa)<<6)|((fn)<<0)) + +#define _ 0 // marker for "field unused" +#define __(n) o##n // enum marker for "undefined" + +// opcode field (encoded in op) +enum { OP__FN=000, OP__RT, OP_J, OP_JAL, OP_BEQ, OP_BNE, OP_BLEZ, OP_BGTZ }; +enum { OP_ADDI=010, OP_ADDIU, OP_SLTI, OP_SLTIU, OP_ANDI, OP_ORI, OP_XORI, OP_LUI }; +enum { OP_LB=040, OP_LH, OP_LWL, OP_LW, OP_LBU, OP_LHU, OP_LWR }; +enum { OP_SB=050, OP_SH, OP_SWL, OP_SW, __(54), __(55), OP_SWR }; +// function field (encoded in fn if opcode = OP__FN) +enum { FN_SLL=000, __(01), 
FN_SRL, FN_SRA, FN_SLLV, __(05), FN_SRLV, FN_SRAV }; +enum { FN_MFHI=020, FN_MTHI, FN_MFLO, FN_MTLO }; +enum { FN_MULT=030, FN_MULTU, FN_DIV, FN_DIVU }; +enum { FN_ADD=040, FN_ADDU, FN_SUB, FN_SUBU, FN_AND, FN_OR, FN_XOR, FN_NOR }; +enum { FN_JR=010, FN_JALR, FN_MOVZ, FN_MOVN, FN_SYNC=017, FN_SLT=052, FN_SLTU }; +// rt field (encoded in rt if opcode = OP__RT) +enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; + +#define MIPS_NOP 000 // null operation: SLL r0, r0, #0 + +// arithmetic/logical + +#define MIPS_OP_REG(op, rd, rs, rt) \ + MIPS_INSN(OP__FN, rs, rt, rd, _, op) // R-type, SPECIAL +#define MIPS_OP_IMM(op, rt, rs, imm) \ + MIPS_INSN(op, rs, rt, _, _, (u16)(imm)) // I-type + +// rd = rt OP rs +#define MIPS_ADD_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_ADDU, rd, rs, rt) +#define MIPS_SUB_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_SUBU, rd, rs, rt) + +#define MIPS_NEG_REG(rd, rt) \ + MIPS_SUB_REG(rd, Z0, rt) + +#define MIPS_XOR_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_XOR, rd, rs, rt) +#define MIPS_OR_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_OR, rd, rs, rt) +#define MIPS_AND_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_AND, rd, rs, rt) +#define MIPS_NOR_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_NOR, rd, rs, rt) + +#define MIPS_MOVE_REG(rd, rs) \ + MIPS_OR_REG(rd, rs, Z0) +#define MIPS_MVN_REG(rd, rs) \ + MIPS_NOR_REG(rd, rs, Z0) + +// rd = rt SHIFT rs +#define MIPS_LSL_REG(rd, rt, rs) \ + MIPS_OP_REG(FN_SLLV, rd, rs, rt) +#define MIPS_LSR_REG(rd, rt, rs) \ + MIPS_OP_REG(FN_SRLV, rd, rs, rt) +#define MIPS_ASR_REG(rd, rt, rs) \ + MIPS_OP_REG(FN_SRAV, rd, rs, rt) + +// rd = (rs < rt) +#define MIPS_SLT_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_SLT, rd, rs, rt) +#define MIPS_SLTU_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_SLTU, rd, rs, rt) + +// rt = rs OP imm16 +#define MIPS_ADD_IMM(rt, rs, imm16) \ + MIPS_OP_IMM(OP_ADDIU, rt, rs, imm16) + +#define MIPS_XOR_IMM(rt, rs, imm16) \ + MIPS_OP_IMM(OP_XORI, rt, rs, imm16) +#define MIPS_OR_IMM(rt, rs, imm16) \ + MIPS_OP_IMM(OP_ORI, rt, rs, imm16) +#define 
MIPS_AND_IMM(rt, rs, imm16) \ + MIPS_OP_IMM(OP_ANDI, rt, rs, imm16) + +// rt = (imm16 << (0|16)) +#define MIPS_MOV_IMM(rt, imm16) \ + MIPS_OP_IMM(OP_ORI, rt, Z0, imm16) +#define MIPS_MOVT_IMM(rt, imm16) \ + MIPS_OP_IMM(OP_LUI, rt, _, imm16) + +// rd = rt SHIFT imm5 +#define MIPS_LSL_IMM(rd, rt, bits) \ + MIPS_INSN(OP__FN, _, rt, rd, bits, FN_SLL) +#define MIPS_LSR_IMM(rd, rt, bits) \ + MIPS_INSN(OP__FN, _, rt, rd, bits, FN_SRL) +#define MIPS_ASR_IMM(rd, rt, bits) \ + MIPS_INSN(OP__FN, _, rt, rd, bits, FN_SRA) + +// rt = (rs < imm16) +#define MIPS_SLT_IMM(rt, rs, imm16) \ + MIPS_OP_IMM(OP_SLTI, rt, rs, imm16) +#define MIPS_SLTU_IMM(rt, rs, imm16) \ + MIPS_OP_IMM(OP_SLTIU, rt, rs, imm16) + +// multiplication + +#define MIPS_MULT(rt, rs) \ + MIPS_OP_REG(FN_MULT, _, rs, rt) +#define MIPS_MULTU(rt, rs) \ + MIPS_OP_REG(FN_MULTU, _, rs, rt) +#define MIPS_MFLO(rd) \ + MIPS_OP_REG(FN_MFLO, rd, _, _) +#define MIPS_MFHI(rd) \ + MIPS_OP_REG(FN_MFHI, rd, _, _) + +// branching + +#define MIPS_J(abs26) \ + MIPS_INSN(OP_J, _,_,_,_, (abs26) >> 2) // J-type +#define MIPS_JAL(abs26) \ + MIPS_INSN(OP_JAL, _,_,_,_, (abs26) >> 2) +#define MIPS_JR(rs) \ + MIPS_OP_REG(FN_JR,_,rs,_) +#define MIPS_JALR(rd, rs) \ + MIPS_OP_REG(FN_JALR,rd,rs,_) + +// conditional branches; no condition code, these compare rs against rt or Z0 +#define MIPS_BEQ (OP_BEQ << 5) +#define MIPS_BNE (OP_BNE << 5) +#define MIPS_BLE (OP_BLEZ << 5) +#define MIPS_BGT (OP_BGTZ << 5) +#define MIPS_BLT ((OP__RT << 5)|RT_BLTZ) +#define MIPS_BGE ((OP__RT << 5)|RT_BGEZ) +#define MIPS_BGTL ((OP__RT << 5)|RT_BLTZAL) +#define MIPS_BGEL ((OP__RT << 5)|RT_BGEZAL) + +#define MIPS_BCONDZ(cond, rs, offs16) \ + MIPS_OP_IMM((cond >> 5), (cond & 0x1f), rs, (offs16) >> 2) +#define MIPS_B(offs16) \ + MIPS_BCONDZ(MIPS_BEQ, Z0, offs16) +#define MIPS_BL(offs16) \ + MIPS_BCONDZ(MIPS_BGEL, Z0, offs16) + +// load/store indexed base + +#define MIPS_LW(rt, rs, offs16) \ + MIPS_INSN(OP_LW, rs, rt, _,_, (u16)(offs16)) +#define MIPS_LH(rt, rs, offs16) 
\ + MIPS_INSN(OP_LH, rs, rt, _,_, (u16)(offs16)) +#define MIPS_LB(rt, rs, offs16) \ + MIPS_INSN(OP_LB, rs, rt, _,_, (u16)(offs16)) +#define MIPS_LHU(rt, rs, offs16) \ + MIPS_INSN(OP_LHU, rs, rt, _,_, (u16)(offs16)) +#define MIPS_LBU(rt, rs, offs16) \ + MIPS_INSN(OP_LBU, rs, rt, _,_, (u16)(offs16)) + +#define MIPS_SW(rt, rs, offs16) \ + MIPS_INSN(OP_SW, rs, rt, _,_, (u16)(offs16)) +#define MIPS_SH(rt, rs, offs16) \ + MIPS_INSN(OP_SH, rs, rt, _,_, (u16)(offs16)) +#define MIPS_SB(rt, rs, offs16) \ + MIPS_INSN(OP_SB, rs, rt, _,_, (u16)(offs16)) + +// XXX: tcache_ptr type for SVP and SH2 compilers differs.. +#define EMIT_PTR(ptr, x) \ + do { \ + *(u32 *)(ptr) = x; \ + ptr = (void *)((u8 *)(ptr) + sizeof(u32)); \ + } while (0) + +// FIFO for 2 instructions, for delay slot handling +u32 emith_last_insns[2] = { -1,-1 }; +int emith_last_idx; + +#define EMIT_PUSHOP() \ + do { \ + emith_last_idx ^= 1; \ + if (emith_last_insns[emith_last_idx] != -1) \ + EMIT_PTR(tcache_ptr, emith_last_insns[emith_last_idx]);\ + emith_last_insns[emith_last_idx] = -1; \ + } while (0) + +#define EMIT(op) \ + do { \ + EMIT_PUSHOP(); \ + emith_last_insns[emith_last_idx] = op; \ + COUNT_OP; \ + } while (0) + +#define emith_flush() \ + do { \ + int i; for (i = 0; i < 2; i++) EMIT_PUSHOP(); \ + } while (0) + +#define emith_insn_ptr() (u8 *)((u32 *)tcache_ptr + \ + (emith_last_insns[0] != -1) + (emith_last_insns[1] != -1)) + +// delay slot stuff +static int emith_is_j(u32 op) // J, JAL + { return ((op>>26) & 076) == OP_J; } +static int emith_is_jr(u32 op) // JR, JALR + { return (op>>26) == OP__FN && (op & 076) == FN_JR; } +static int emith_is_b(u32 op) // B + { return ((op>>26) & 074) == OP_BEQ || + ((op>>26) == OP__RT && ((op>>16) & 036) == RT_BLTZ); } +// register usage for dependency evaluation XXX better do this as in emit_arm? 
+static uint64_t emith_has_rs[3] = // OP__FN, OP__RT, others + { 0x00fffffffffa0ff0ULL, 0x000fff0fUL, 0xffffffff0f007f30ULL }; +static uint64_t emith_has_rt[3] = // OP__FN, OP__RT, others + { 0xff00fffffff00cffULL, 0x00000000UL, 0x8000ff0000000030ULL }; +static uint64_t emith_has_rd[3] = // OP__FN, OP__RT, others (rt instead of rd) + { 0xff00fffffff50fffULL, 0x00000000UL, 0x119100ff0f00ff00ULL }; +#define emith_has_(rx,ix,op,sa,m) \ + (emith_has_##rx[ix] & (1ULL << (((op)>>(sa)) & (m)))) +static int emith_rs(u32 op) + { if ((op>>26) == OP__FN) + return emith_has_(rs,0,op, 0,0x3f) ? (op>>21)&0x1f : 0; + if ((op>>26) == OP__RT) + return emith_has_(rs,1,op,16,0x1f) ? (op>>21)&0x1f : 0; + return emith_has_(rs,2,op,26,0x3f) ? (op>>21)&0x1f : 0; + } +static int emith_rt(u32 op) + { if ((op>>26) == OP__FN) + return emith_has_(rt,0,op, 0,0x3f) ? (op>>16)&0x1f : 0; + if ((op>>26) == OP__RT) + return 0; + return emith_has_(rt,2,op,26,0x3f) ? (op>>16)&0x1f : 0; + } +static int emith_rd(u32 op) + { if ((op>>26) == OP__FN) + return emith_has_(rd,0,op, 0,0x3f) ? (op>>11)&0x1f :-1; + if ((op>>26) == OP__RT) + return -1; + return emith_has_(rd,2,op,26,0x3f) ? (op>>16)&0x1f :-1; + } + +static int emith_b_isswap(u32 bop, u32 lop) +{ + if (emith_is_j(bop)) + return bop; + else if (emith_is_jr(bop) && emith_rd(lop) != emith_rs(bop)) + return bop; + else if (emith_is_b(bop) && emith_rd(lop) != emith_rs(bop)) + if ((bop & 0xffff) != 0x7fff) // displacement overflow? 
+ return (bop & 0xffff0000) | ((bop & 0xffff)+1); + return 0; +} + +// emit branch, trying to fill the delay slot with one of the last insns +static void *emith_branch(u32 op) +{ + int idx = emith_last_idx; + u32 op1 = emith_last_insns[idx], op2 = emith_last_insns[idx^1]; + u32 bop = 0; + void *bp; + + // check last insn (op1) + if (op1 != -1 && op1) + bop = emith_b_isswap(op, op1); + // if not, check older insn (op2); mustn't interact with op1 to overtake + if (!bop && op2 != -1 && op2 && emith_rd(op1) != emith_rd(op2) && + emith_rs(op1) != emith_rd(op2) && emith_rt(op1) != emith_rd(op2) && + emith_rs(op2) != emith_rd(op1) && emith_rt(op2) != emith_rd(op1)) { + idx ^= 1; + bop = emith_b_isswap(op, op2); + } + + if (bop) { // can swap + if (emith_last_insns[idx^1] != -1) + EMIT_PTR(tcache_ptr, emith_last_insns[idx^1]); + bp = tcache_ptr; + EMIT_PTR(tcache_ptr, bop); COUNT_OP; + EMIT_PTR(tcache_ptr, emith_last_insns[idx]); + emith_last_insns[0] = emith_last_insns[1] = -1; + } else { // can't swap + emith_flush(); + bp = tcache_ptr; + EMIT_PTR(tcache_ptr, op); COUNT_OP; + EMIT_PTR(tcache_ptr, MIPS_NOP); COUNT_OP; + } + return bp; +} + +// if-then-else conditional execution helpers +#define JMP_POS(ptr) \ + ptr = emith_branch(MIPS_BCONDZ(cond_m, cond_r, 0)); + +#define JMP_EMIT(cond, ptr) { \ + u32 val_ = emith_insn_ptr() - (u8 *)(ptr) - 4; \ + EMIT_PTR(ptr, MIPS_BCONDZ(cond_m, cond_r, val_ & 0x0003ffff)); \ + emith_flush(); /* NO delay slot handling across jump targets */ \ +} + +#define JMP_EMIT_NC(ptr) { \ + u32 val_ = emith_insn_ptr() - (u8 *)(ptr) - 4; \ + EMIT_PTR(ptr, MIPS_B(val_ & 0x0003ffff)); \ + emith_flush(); \ +} + +#define EMITH_JMP_START(cond) { \ + int cond_r, cond_m = emith_cond_check(cond, &cond_r); \ + u8 *cond_ptr; \ + JMP_POS(cond_ptr) + +#define EMITH_JMP_END(cond) \ + JMP_EMIT(cond, cond_ptr); \ +} + +#define EMITH_JMP3_START(cond) { \ + int cond_r, cond_m = emith_cond_check(cond, &cond_r); \ + u8 *cond_ptr, *else_ptr; \ + JMP_POS(cond_ptr) + 
+#define EMITH_JMP3_MID(cond) \ + JMP_POS(else_ptr); \ + JMP_EMIT(cond, cond_ptr); + +#define EMITH_JMP3_END() \ + JMP_EMIT_NC(else_ptr); \ +} + +// "simple" jump (no more than a few insns) +// ARM32 will use conditional instructions here +#define EMITH_SJMP_START EMITH_JMP_START +#define EMITH_SJMP_END EMITH_JMP_END + +#define EMITH_SJMP3_START EMITH_JMP3_START +#define EMITH_SJMP3_MID EMITH_JMP3_MID +#define EMITH_SJMP3_END EMITH_JMP3_END + +#define EMITH_SJMP2_START(cond) \ + EMITH_SJMP3_START(cond) +#define EMITH_SJMP2_MID(cond) \ + EMITH_SJMP3_MID(cond) +#define EMITH_SJMP2_END(cond) \ + EMITH_SJMP3_END() + + +// flag register emulation. this is modelled after arm/x86. +// the FNZ register stores the result of the last flag setting operation for +// N and Z flag, used for EQ,NE,MI,PL branches. +// the FC register stores the C flag (used for HI,HS,LO,LS,CC,CS). +// the FV register stores information for V flag calculation (used for +// GT,GE,LT,LE,VC,VS). V flag is costly and only fully calculated when needed. +// the core registers may be temp registers, since the condition after calls +// is undefined anyway. + +// flag emulation creates 2 (ie cmp #0/beq) up to 9 (ie adcf/ble) extra insns. +// flag handling shortcuts may reduce this by 1-4 insns, see emith_cond_check() +int emith_flg_rs, emith_flg_rt; // registers used in FNZ=rs-rt (aka cmp_r_r) +int emith_flg_noV; // V flag known not to be set + +// store minimal cc information: rd, rt^rs, carry +// NB: the result *must* first go to FNZ, in case rd == rs or rd == rt. +// NB: for adcf and sbcf, carry-in must be dealt with separately (see there) +static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub) +{ + if (sub && rd == FNZ && rt && rs) // is this cmp_r_r? 
+ emith_flg_rs = rs, emith_flg_rt = rt; + else emith_flg_rs = emith_flg_rt = 0; + + if (sub) // C = sub:rt<rd, add:rd<rt + EMIT(MIPS_SLTU_REG(FC, rt, FNZ)); + else EMIT(MIPS_SLTU_REG(FC, FNZ, rt)); // C in FC, bit 0 + + emith_flg_noV = 0; + if (rs > 0) // Nt^Ns + EMIT(MIPS_XOR_REG(FV, rt, rs)); + else if (imm < 0) + EMIT(MIPS_NOR_REG(FV, rt, Z0)); + else if (imm > 0) + EMIT(MIPS_OR_REG(FV, rt, Z0)); // Nt^Ns in FV, bit 31 + else emith_flg_noV = 1; // imm #0, never overflows + // full V = Nd^Nt^Ns^C calculation is deferred until really needed + + if (rd != FNZ) + EMIT(MIPS_MOVE_REG(rd, FNZ)); // N,Z via result value in FNZ +} + +// data processing, register +#define emith_move_r_r_ptr(d, s) \ + EMIT(MIPS_MOVE_REG(d, s)) +#define emith_move_r_r_ptr_c(cond, d, s) \ + emith_move_r_r_ptr(d, s) + +#define emith_move_r_r(d, s) \ + emith_move_r_r_ptr(d, s) +#define emith_move_r_r_c(cond, d, s) \ + emith_move_r_r(d, s) + +#define emith_mvn_r_r(d, s) \ + EMIT(MIPS_MVN_REG(d, s)) + +#define emith_add_r_r_r_lsl_ptr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_ADD_REG(d, s1, AT)); \ + } else EMIT(MIPS_ADD_REG(d, s1, s2)); \ +} while (0) +#define emith_add_r_r_r_lsl(d, s1, s2, simm) \ + emith_add_r_r_r_lsl_ptr(d, s1, s2, simm) + +#define emith_add_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSR_IMM(AT, s2, simm)); \ + EMIT(MIPS_ADD_REG(d, s1, AT)); \ + } else EMIT(MIPS_ADD_REG(d, s1, s2)); \ +} while (0) + +#define emith_addf_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_ADD_REG(FNZ, s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 0); \ + } else { \ + EMIT(MIPS_ADD_REG(FNZ, s1, s2)); \ + emith_set_arith_flags(d, s1, s2, 0, 0); \ + } \ +} while (0) + +#define emith_addf_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSR_IMM(AT, s2, simm)); \ + EMIT(MIPS_ADD_REG(FNZ, s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 0); \ + } else { \ + EMIT(MIPS_ADD_REG(FNZ, s1, s2)); \ + emith_set_arith_flags(d, s1, s2, 0, 0); \ + } \ +} while (0) + +#define emith_sub_r_r_r_lsl(d, s1, s2, simm) do { \ + 
if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_SUB_REG(d, s1, AT)); \ + } else EMIT(MIPS_SUB_REG(d, s1, s2)); \ +} while (0) + +#define emith_subf_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_SUB_REG(FNZ, s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 1); \ + } else { \ + EMIT(MIPS_SUB_REG(FNZ, s1, s2)); \ + emith_set_arith_flags(d, s1, s2, 0, 1); \ + } \ +} while (0) + +#define emith_or_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_OR_REG(d, s1, AT)); \ + } else EMIT(MIPS_OR_REG(d, s1, s2)); \ +} while (0) + +#define emith_eor_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_XOR_REG(d, s1, AT)); \ + } else EMIT(MIPS_XOR_REG(d, s1, s2)); \ +} while (0) + +#define emith_eor_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSR_IMM(AT, s2, simm)); \ + EMIT(MIPS_XOR_REG(d, s1, AT)); \ + } else EMIT(MIPS_XOR_REG(d, s1, s2)); \ +} while (0) + +#define emith_and_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_AND_REG(d, s1, AT)); \ + } else EMIT(MIPS_AND_REG(d, s1, s2)); \ +} while (0) + +#define emith_or_r_r_lsl(d, s, lslimm) \ + emith_or_r_r_r_lsl(d, d, s, lslimm) + +#define emith_eor_r_r_lsr(d, s, lsrimm) \ + emith_eor_r_r_r_lsr(d, d, s, lsrimm) + +#define emith_add_r_r_r(d, s1, s2) \ + emith_add_r_r_r_lsl(d, s1, s2, 0) + +#define emith_addf_r_r_r(d, s1, s2) \ + emith_addf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_sub_r_r_r(d, s1, s2) \ + emith_sub_r_r_r_lsl(d, s1, s2, 0) + +#define emith_subf_r_r_r(d, s1, s2) \ + emith_subf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_or_r_r_r(d, s1, s2) \ + emith_or_r_r_r_lsl(d, s1, s2, 0) + +#define emith_eor_r_r_r(d, s1, s2) \ + emith_eor_r_r_r_lsl(d, s1, s2, 0) + +#define emith_and_r_r_r(d, s1, s2) \ + emith_and_r_r_r_lsl(d, s1, s2, 0) + +#define emith_add_r_r_ptr(d, s) \ + 
emith_add_r_r_r_lsl_ptr(d, d, s, 0) +#define emith_add_r_r(d, s) \ + emith_add_r_r_r(d, d, s) + +#define emith_sub_r_r(d, s) \ + emith_sub_r_r_r(d, d, s) + +#define emith_neg_r_r(d, s) \ + EMIT(MIPS_NEG_REG(d, s)) + +#define emith_adc_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(AT, s1, FC); \ + emith_add_r_r_r(d, AT, s2); \ +} while (0) + +#define emith_adc_r_r(d, s) \ + emith_adc_r_r_r(d, d, s) + +// NB: the incoming C can cause its own outgoing C if s2+C=0 (or s1+C=0 FWIW) +// moreover, s2 is 0 if there is C, so no other C can be generated. +#define emith_adcf_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(FNZ, s2, FC); \ + EMIT(MIPS_SLTU_REG(AT, FNZ, FC)); \ + emith_add_r_r_r(FNZ, s1, FNZ); \ + emith_set_arith_flags(d, s1, s2, 0, 0); \ + emith_or_r_r(FC, AT); \ +} while (0) + +#define emith_sbcf_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(FNZ, s2, FC); \ + EMIT(MIPS_SLTU_REG(AT, FNZ, FC)); \ + emith_sub_r_r_r(FNZ, s1, FNZ); \ + emith_set_arith_flags(d, s1, s2, 0, 1); \ + emith_or_r_r(FC, AT); \ +} while (0) + +#define emith_and_r_r(d, s) \ + emith_and_r_r_r(d, d, s) +#define emith_and_r_r_c(cond, d, s) \ + emith_and_r_r(d, s) + +#define emith_or_r_r(d, s) \ + emith_or_r_r_r(d, d, s) + +#define emith_eor_r_r(d, s) \ + emith_eor_r_r_r(d, d, s) + +#define emith_tst_r_r_ptr(d, s) \ + emith_and_r_r_r(FNZ, d, s) +#define emith_tst_r_r(d, s) \ + emith_tst_r_r_ptr(d, s) + +#define emith_teq_r_r(d, s) \ + emith_eor_r_r_r(FNZ, d, s) + +#define emith_cmp_r_r(d, s) \ + emith_subf_r_r_r(FNZ, d, s) + +#define emith_addf_r_r(d, s) \ + emith_addf_r_r_r(d, d, s) + +#define emith_subf_r_r(d, s) \ + emith_subf_r_r_r(d, d, s) + +#define emith_adcf_r_r(d, s) \ + emith_adcf_r_r_r(d, d, s) + +#define emith_sbcf_r_r(d, s) \ + emith_sbcf_r_r_r(d, d, s) + +#define emith_negcf_r_r(d, s) \ + emith_sbcf_r_r_r(d, Z0, s) + + +// move immediate +static void emith_move_imm(int r, uintptr_t imm) +{ + if ((s16)imm != imm) { + int s = Z0; + if (imm >> 16) { + EMIT(MIPS_MOVT_IMM(r, imm >> 16)); + s = r; + } 
+ if ((u16)imm) + EMIT(MIPS_OR_IMM(r, s, (u16)imm)); + } else + EMIT(MIPS_ADD_IMM(r, Z0, imm)); +} + +#define emith_move_r_ptr_imm(r, imm) \ + emith_move_imm(r, (uintptr_t)(imm)) + +#define emith_move_r_imm(r, imm) \ + emith_move_imm(r, (u32)(imm)) +#define emith_move_r_imm_c(cond, r, imm) \ + emith_move_r_imm(r, imm) + + +// arithmetic, immediate +static void emith_arith_imm(int op, int rd, int rs, u32 imm) +{ + if ((s16)imm != imm) { + emith_move_r_imm(AT, imm); + EMIT(MIPS_OP_REG(FN_ADD + (op-OP_ADDI), rd, rs, AT)); + } else if (imm || rd != rs) + EMIT(MIPS_OP_IMM(op, rd, rs, imm)); +} + +#define emith_add_r_imm(r, imm) \ + emith_add_r_r_imm(r, r, imm) +#define emith_add_r_imm_c(cond, r, imm) \ + emith_add_r_imm(r, imm) + +#define emith_addf_r_imm(r, imm) \ + emith_addf_r_r_imm(r, r, imm) + +#define emith_sub_r_imm(r, imm) \ + emith_sub_r_r_imm(r, r, imm) +#define emith_sub_r_imm_c(cond, r, imm) \ + emith_sub_r_imm(r, imm) + +#define emith_subf_r_imm(r, imm) \ + emith_subf_r_r_imm(r, r, imm) + +#define emith_adc_r_imm(r, imm) \ + emith_adc_r_r_imm(r, r, imm) + +#define emith_adcf_r_imm(r, imm) \ + emith_adcf_r_r_imm(r, r, imm) + +#define emith_cmp_r_imm(r, imm) \ + emith_subf_r_r_imm(FNZ, r, (s16)(imm)) + + +#define emith_add_r_r_ptr_imm(d, s, imm) \ + emith_arith_imm(OP_ADDIU, d, s, imm) + +#define emith_add_r_r_imm(d, s, imm) \ + emith_add_r_r_ptr_imm(d, s, imm) + +#define emith_addf_r_r_imm(d, s, imm) do { \ + emith_add_r_r_imm(FNZ, s, imm); \ + emith_set_arith_flags(d, s, 0, imm, 0); \ +} while (0) + +#define emith_adc_r_r_imm(d, s, imm) do { \ + emith_add_r_r_r(AT, s, FC); \ + emith_add_r_r_imm(d, AT, imm); \ +} while (0) + +#define emith_adcf_r_r_imm(d, s, imm) do { \ + emith_add_r_r_r(FNZ, s, FC); \ + EMIT(MIPS_SLTU_REG(AT, FNZ, FC)); \ + emith_add_r_r_imm(FNZ, FNZ, imm); \ + emith_set_arith_flags(d, s, 0, imm, 0); \ + emith_or_r_r(FC, AT); \ +} while (0) + +// NB: no SUBI in MIPS II, since ADDI takes a signed imm +#define emith_sub_r_r_imm(d, s, imm) \ + 
emith_add_r_r_imm(d, s, -(imm)) +#define emith_sub_r_r_imm_c(cond, d, s, imm) \ + emith_sub_r_r_imm(d, s, imm) + +#define emith_subf_r_r_imm(d, s, imm) do { \ + emith_sub_r_r_imm(FNZ, s, imm); \ + emith_set_arith_flags(d, s, 0, imm, 1); \ +} while (0) + +// logical, immediate +static void emith_log_imm(int op, int rd, int rs, u32 imm) +{ + if (imm >> 16) { + emith_move_r_imm(AT, imm); + EMIT(MIPS_OP_REG(FN_AND + (op-OP_ANDI), rd, rs, AT)); + } else if (op == OP_ANDI || imm || rd != rs) + EMIT(MIPS_OP_IMM(op, rd, rs, imm)); +} + +#define emith_and_r_imm(r, imm) \ + emith_log_imm(OP_ANDI, r, r, imm) + +#define emith_or_r_imm(r, imm) \ + emith_log_imm(OP_ORI, r, r, imm) +#define emith_or_r_imm_c(cond, r, imm) \ + emith_or_r_imm(r, imm) + +#define emith_eor_r_imm_ptr(r, imm) \ + emith_log_imm(OP_XORI, r, r, imm) +#define emith_eor_r_imm_ptr_c(cond, r, imm) \ + emith_eor_r_imm_ptr(r, imm) + +#define emith_eor_r_imm(r, imm) \ + emith_eor_r_imm_ptr(r, imm) +#define emith_eor_r_imm_c(cond, r, imm) \ + emith_eor_r_imm(r, imm) + +/* NB: BIC #imm not available in MIPS; use AND #~imm instead */ +#define emith_bic_r_imm(r, imm) \ + emith_log_imm(OP_ANDI, r, r, ~(imm)) +#define emith_bic_r_imm_c(cond, r, imm) \ + emith_bic_r_imm(r, imm) + +#define emith_tst_r_imm(r, imm) \ + emith_log_imm(OP_ANDI, FNZ, r, imm) +#define emith_tst_r_imm_c(cond, r, imm) \ + emith_tst_r_imm(r, imm) + +#define emith_and_r_r_imm(d, s, imm) \ + emith_log_imm(OP_ANDI, d, s, imm) + +#define emith_or_r_r_imm(d, s, imm) \ + emith_log_imm(OP_ORI, d, s, imm) + +#define emith_eor_r_r_imm(d, s, imm) \ + emith_log_imm(OP_XORI, d, s, imm) + +// shift +#define emith_lsl(d, s, cnt) \ + EMIT(MIPS_LSL_IMM(d, s, cnt)) + +#define emith_lsr(d, s, cnt) \ + EMIT(MIPS_LSR_IMM(d, s, cnt)) + +#define emith_asr(d, s, cnt) \ + EMIT(MIPS_ASR_IMM(d, s, cnt)) + +// NB: mips32r2 has ROT (SLR with R bit set) +#define emith_ror(d, s, cnt) do { \ + EMIT(MIPS_LSL_IMM(AT, s, 32-(cnt))); \ + EMIT(MIPS_LSR_IMM(d, s, cnt)); \ + 
EMIT(MIPS_OR_REG(d, d, AT)); \ +} while (0) +#define emith_ror_c(cond, d, s, cnt) \ + emith_ror(d, s, cnt) + +#define emith_rol(d, s, cnt) do { \ + EMIT(MIPS_LSR_IMM(AT, s, 32-(cnt))); \ + EMIT(MIPS_LSL_IMM(d, s, cnt)); \ + EMIT(MIPS_OR_REG(d, d, AT)); \ +} while (0) + +// NB: all flag setting shifts make V undefined +// NB: mips32r2 has EXT (useful for extracting C) +#define emith_lslf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_lsl(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_lsr(FC, _s, 31); \ + emith_lsl(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ +} while (0) + +#define emith_lsrf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_lsr(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_and_r_r_imm(FC, _s, 1); \ + emith_lsr(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ +} while (0) + +#define emith_asrf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_asr(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_and_r_r_imm(FC, _s, 1); \ + emith_asr(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ +} while (0) + +#define emith_rolf(d, s, cnt) do { \ + emith_rol(d, s, cnt); \ + emith_and_r_r_imm(FC, d, 1); \ + emith_move_r_r(FNZ, d); \ +} while (0) + +#define emith_rorf(d, s, cnt) do { \ + emith_ror(d, s, cnt); \ + emith_lsr(FC, d, 31); \ + emith_move_r_r(FNZ, d); \ +} while (0) + +#define emith_rolcf(d) do { \ + emith_lsr(AT, d, 31); \ + emith_lsl(d, d, 1); \ + emith_or_r_r(d, FC); \ + emith_move_r_r(FC, AT); \ + emith_move_r_r(FNZ, d); \ +} while (0) + +#define emith_rorcf(d) do { \ + emith_and_r_r_imm(AT, d, 1); \ + emith_lsr(d, d, 1); \ + emith_lsl(FC, FC, 31); \ + emith_or_r_r(d, FC); \ + emith_move_r_r(FC, AT); \ + emith_move_r_r(FNZ, d); \ +} while (0) + +// signed/unsigned extend +// NB: mips32r2 has EXT and INS +#define emith_clear_msb(d, s, count) /* bits to clear */ do { \ + u32 t; \ + if ((count) > 16) { \ + t = (count) - 16; \ + t = 0xffff >> t; \ + emith_and_r_r_imm(d, 
s, t); \ + } else { \ + emith_lsl(d, s, count); \ + emith_lsr(d, d, count); \ + } \ +} while (0) +#define emith_clear_msb_c(cond, d, s, count) \ + emith_clear_msb(d, s, count) + +// NB: mips32r2 has SE[BH]H +#define emith_sext(d, s, count) /* bits to keep */ do { \ + emith_lsl(d, s, 32-(count)); \ + emith_asr(d, d, 32-(count)); \ +} while (0) + +// multiply Rd = Rn*Rm (+ Ra); NB: next 2 insns after MFLO/MFHI mustn't be MULT +static u8 *last_lohi; +static void emith_lohi_nops(void) +{ + u32 d; + while ((d = emith_insn_ptr() - last_lohi) < 8 && d >= 0) EMIT(MIPS_NOP); +} + +#define emith_mul(d, s1, s2) do { \ + emith_lohi_nops(); \ + EMIT(MIPS_MULTU(s1, s2)); \ + EMIT(MIPS_MFLO(d)); \ + last_lohi = emith_insn_ptr(); \ +} while (0) + +#define emith_mul_u64(dlo, dhi, s1, s2) do { \ + emith_lohi_nops(); \ + EMIT(MIPS_MULTU(s1, s2)); \ + EMIT(MIPS_MFLO(dlo)); \ + EMIT(MIPS_MFHI(dhi)); \ + last_lohi = emith_insn_ptr(); \ +} while (0) + +#define emith_mul_s64(dlo, dhi, s1, s2) do { \ + emith_lohi_nops(); \ + EMIT(MIPS_MULT(s1, s2)); \ + EMIT(MIPS_MFLO(dlo)); \ + EMIT(MIPS_MFHI(dhi)); \ + last_lohi = emith_insn_ptr(); \ +} while (0) + +#define emith_mula_s64(dlo, dhi, s1, s2) do { \ + int t_ = rcache_get_tmp(); \ + emith_lohi_nops(); \ + EMIT(MIPS_MULT(s1, s2)); \ + EMIT(MIPS_MFLO(AT)); \ + emith_add_r_r(dlo, AT); \ + EMIT(MIPS_SLTU_REG(t_, dlo, AT)); \ + EMIT(MIPS_MFHI(AT)); \ + last_lohi = emith_insn_ptr(); \ + emith_add_r_r(dhi, AT); \ + emith_add_r_r(dhi, t_); \ + rcache_free_tmp(t_); \ +} while (0) +#define emith_mula_s64_c(cond, dlo, dhi, s1, s2) \ + emith_mula_s64(dlo, dhi, s1, s2) + +// load/store. 
offs has 16 bits signed, which is currently sufficient +#define emith_read_r_r_offs_ptr(r, rs, offs) \ + EMIT(MIPS_LW(r, rs, offs)) +#define emith_read_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_read_r_r_offs_ptr(r, rs, offs) + +#define emith_read_r_r_offs(r, rs, offs) \ + emith_read_r_r_offs_ptr(r, rs, offs) +#define emith_read_r_r_offs_c(cond, r, rs, offs) \ + emith_read_r_r_offs(r, rs, offs) + +#define emith_read_r_r_r_ptr(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + EMIT(MIPS_LW(r, AT, 0)); \ +} while (0) + +#define emith_read_r_r_r(r, rs, rm) \ + emith_read_r_r_r_ptr(r, rs, rm) +#define emith_read_r_r_r_c(cond, r, rs, rm) \ + emith_read_r_r_r(r, rs, rm) + +#define emith_read_r_r_r_ptr_wb(r, rs, rm) do { \ + emith_add_r_r_r(rs, rs, rm); \ + EMIT(MIPS_LW(r, rs, 0)); \ +} while (0) +#define emith_read_r_r_r_wb(r, rs, rm) \ + emith_read_r_r_r_ptr_wb(r, rs, rm) + +#define emith_read8_r_r_offs(r, rs, offs) \ + EMIT(MIPS_LBU(r, rs, offs)) +#define emith_read8_r_r_offs_c(cond, r, rs, offs) \ + emith_read8_r_r_offs(r, rs, offs) + +#define emith_read8_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + EMIT(MIPS_LBU(r, AT, 0)); \ +} while (0) +#define emith_read8_r_r_r_c(cond, r, rs, rm) \ + emith_read8_r_r_r(r, rs, rm) + +#define emith_read16_r_r_offs(r, rs, offs) \ + EMIT(MIPS_LHU(r, rs, offs)) +#define emith_read16_r_r_offs_c(cond, r, rs, offs) \ + emith_read16_r_r_offs(r, rs, offs) + +#define emith_read16_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + EMIT(MIPS_LHU(r, AT, 0)); \ +} while (0) +#define emith_read16_r_r_r_c(cond, r, rs, rm) \ + emith_read16_r_r_r(r, rs, rm) + +#define emith_read8s_r_r_offs(r, rs, offs) \ + EMIT(MIPS_LB(r, rs, offs)) +#define emith_read8s_r_r_offs_c(cond, r, rs, offs) \ + emith_read8s_r_r_offs(r, rs, offs) + +#define emith_read8s_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + EMIT(MIPS_LB(r, AT, 0)); \ +} while (0) +#define emith_read8s_r_r_r_c(cond, r, rs, rm) \ + emith_read8s_r_r_r(r, rs, rm) + 
+#define emith_read16s_r_r_offs(r, rs, offs) \ + EMIT(MIPS_LH(r, rs, offs)) +#define emith_read16s_r_r_offs_c(cond, r, rs, offs) \ + emith_read16s_r_r_offs(r, rs, offs) + +#define emith_read16s_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + EMIT(MIPS_LH(r, AT, 0)); \ +} while (0) +#define emith_read16s_r_r_r_c(cond, r, rs, rm) \ + emith_read16s_r_r_r(r, rs, rm) + + +#define emith_write_r_r_offs_ptr(r, rs, offs) \ + EMIT(MIPS_SW(r, rs, offs)) +#define emith_write_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_write_r_r_offs_ptr(r, rs, offs) + +#define emith_write_r_r_r_ptr(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + EMIT(MIPS_SW(r, AT, 0)); \ +} while (0) +#define emith_write_r_r_r_ptr_c(cond, r, rs, rm) \ + emith_write_r_r_r_ptr(r, rs, rm) + +#define emith_write_r_r_offs(r, rs, offs) \ + emith_write_r_r_offs_ptr(r, rs, offs) +#define emith_write_r_r_offs_c(cond, r, rs, offs) \ + emith_write_r_r_offs(r, rs, offs) + +#define emith_write_r_r_r(r, rs, rm) \ + emith_write_r_r_r_ptr(r, rs, rm) +#define emith_write_r_r_r_c(cond, r, rs, rm) \ + emith_write_r_r_r(r, rs, rm) + +#define emith_write_r_r_r_ptr_wb(r, rs, rm) do { \ + emith_add_r_r_r(rs, rs, rm); \ + EMIT(MIPS_SW(r, rs, 0)); \ +} while (0) +#define emith_write_r_r_r_wb(r, rs, rm) \ + emith_write_r_r_r_ptr_wb(r, rs, rm) + +#define emith_ctx_read_ptr(r, offs) \ + emith_read_r_r_offs_ptr(r, CONTEXT_REG, offs) + +#define emith_ctx_read(r, offs) \ + emith_read_r_r_offs(r, CONTEXT_REG, offs) +#define emith_ctx_read_c(cond, r, offs) \ + emith_ctx_read(r, offs) + +#define emith_ctx_write_ptr(r, offs) \ + emith_write_r_r_offs_ptr(r, CONTEXT_REG, offs) + +#define emith_ctx_write(r, offs) \ + emith_write_r_r_offs(r, CONTEXT_REG, offs) + +#define emith_ctx_read_multiple(r, offs, cnt, tmpr) do { \ + int r_ = r, offs_ = offs, cnt_ = cnt; \ + for (; cnt_ > 0; r_++, offs_ += 4, cnt_--) \ + emith_ctx_read(r_, offs_); \ +} while (0) + +#define emith_ctx_write_multiple(r, offs, cnt, tmpr) do { \ + int r_ = r, 
offs_ = offs, cnt_ = cnt; \ + for (; cnt_ > 0; r_++, offs_ += 4, cnt_--) \ + emith_ctx_write(r_, offs_); \ +} while (0) + +// function call handling +#define emith_save_caller_regs(mask) do { \ + int _c; u32 _m = mask & 0x300fffc; /* r2-r15,r24-r25 */ \ + if (__builtin_parity(_m) == 1) _m |= 0x1; /* ABI align */ \ + int _s = count_bits(_m) * 4, _o = _s; \ + if (_s) emith_sub_r_imm(SP, _s); \ + for (_c = HOST_REGS; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + if (_m & (1 << _c)) \ + { _o -= 4; if (_c) emith_write_r_r_offs(_c, SP, _o); } \ +} while (0) + +#define emith_restore_caller_regs(mask) do { \ + int _c; u32 _m = mask & 0x300fffc; \ + if (__builtin_parity(_m) == 1) _m |= 0x1; \ + int _s = count_bits(_m) * 4, _o = 0; \ + for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ + if (_m & (1 << _c)) \ + { if (_c) emith_read_r_r_offs(_c, SP, _o); _o += 4; } \ + if (_s) emith_add_r_imm(SP, _s); \ +} while (0) + +#define host_arg2reg(rd, arg) \ + rd = (arg+4) + +#define emith_pass_arg_r(arg, reg) \ + emith_move_r_r(arg, reg) + +#define emith_pass_arg_imm(arg, imm) \ + emith_move_r_imm(arg, imm) + +// branching +#define emith_invert_branch(cond) /* inverted conditional branch */ \ + (((cond) >> 5) == OP__RT ? 
(cond) ^ 0x01 : (cond) ^ 0x20) + +// evaluate the emulated condition, returns a register/branch type pair +static int emith_cond_check(int cond, int *r) +{ + int b = 0; + + // shortcut for comparing 2 registers + if (emith_flg_rs || emith_flg_rt) switch (cond) { + case DCOND_LS: EMIT(MIPS_SLTU_REG(AT, emith_flg_rs, emith_flg_rt)); + *r = AT, b = MIPS_BEQ; break; // s <= t unsigned + case DCOND_HI: EMIT(MIPS_SLTU_REG(AT, emith_flg_rs, emith_flg_rt)); + *r = AT, b = MIPS_BNE; break; // s > t unsigned + case DCOND_LT: EMIT(MIPS_SLT_REG(AT, emith_flg_rt, emith_flg_rs)); + *r = AT, b = MIPS_BNE; break; // s < t + case DCOND_GE: EMIT(MIPS_SLT_REG(AT, emith_flg_rt, emith_flg_rs)); + *r = AT, b = MIPS_BEQ; break; // s >= t + case DCOND_LE: EMIT(MIPS_SLT_REG(AT, emith_flg_rs, emith_flg_rt)); + *r = AT, b = MIPS_BEQ; break; // s <= t + case DCOND_GT: EMIT(MIPS_SLT_REG(AT, emith_flg_rs, emith_flg_rt)); + *r = AT, b = MIPS_BNE; break; // s > t + } + + // shortcut for V known to be 0 + if (!b && emith_flg_noV) switch (cond) { + case DCOND_VS: *r = Z0; b = MIPS_BNE; break; // never + case DCOND_VC: *r = Z0; b = MIPS_BEQ; break; // always + case DCOND_LT: *r = FNZ, b = MIPS_BLT; break; // N + case DCOND_GE: *r = FNZ, b = MIPS_BGE; break; // !N + case DCOND_LE: *r = FNZ, b = MIPS_BLE; break; // N || Z + case DCOND_GT: *r = FNZ, b = MIPS_BGT; break; // !N && !Z + } + + // the full monty if no shortcut + if (!b) switch (cond) { + // conditions using NZ + case DCOND_EQ: *r = FNZ; b = MIPS_BEQ; break; // Z + case DCOND_NE: *r = FNZ; b = MIPS_BNE; break; // !Z + case DCOND_MI: *r = FNZ; b = MIPS_BLT; break; // N + case DCOND_PL: *r = FNZ; b = MIPS_BGE; break; // !N + // conditions using C + case DCOND_LO: *r = FC; b = MIPS_BNE; break; // C + case DCOND_HS: *r = FC; b = MIPS_BEQ; break; // !C + // conditions using CZ + case DCOND_LS: // C || Z + case DCOND_HI: // !C && !Z + EMIT(MIPS_ADD_IMM(AT, FC, (u16)-1)); // !C && !Z + EMIT(MIPS_AND_REG(AT, FNZ, AT)); + *r = AT, b = (cond == 
DCOND_HI ? MIPS_BNE : MIPS_BEQ); + break; + + // conditions using V + case DCOND_VS: // V + case DCOND_VC: // !V + EMIT(MIPS_XOR_REG(AT, FV, FNZ)); // V = Nt^Ns^Nd^C + EMIT(MIPS_LSR_IMM(AT, AT, 31)); + EMIT(MIPS_XOR_REG(AT, AT, FC)); + *r = AT, b = (cond == DCOND_VS ? MIPS_BNE : MIPS_BEQ); + break; + // conditions using VNZ + case DCOND_LT: // N^V + case DCOND_GE: // !(N^V) + EMIT(MIPS_LSR_IMM(AT, FV, 31)); // Nd^V = Nt^Ns^C + EMIT(MIPS_XOR_REG(AT, FC, AT)); + *r = AT, b = (cond == DCOND_LT ? MIPS_BNE : MIPS_BEQ); + break; + case DCOND_LE: // (N^V) || Z + case DCOND_GT: // !(N^V) && !Z + EMIT(MIPS_LSR_IMM(AT, FV, 31)); // Nd^V = Nt^Ns^C + EMIT(MIPS_XOR_REG(AT, FC, AT)); + EMIT(MIPS_ADD_IMM(AT, AT, (u16)-1)); // !(Nd^V) && !Z + EMIT(MIPS_AND_REG(AT, FNZ, AT)); + *r = AT, b = (cond == DCOND_GT ? MIPS_BNE : MIPS_BEQ); + break; + } + return b; +} + +// NB: assumes all targets are in the same 256MB segment +#define emith_jump(target) \ + emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)) +#define emith_jump_patchable(target) \ + emith_jump(target) + +// NB: MIPS conditional branches have only +/- 128KB range +#define emith_jump_cond(cond, target) do { \ + int r_, mcond_ = emith_cond_check(cond, &r_); \ + u32 disp_ = (u8 *)target - emith_insn_ptr() - 4; \ + if (disp_ >= 0xfffe0000 || disp_ <= 0x0001ffff) { /* can use near B */ \ + emith_branch(MIPS_BCONDZ(mcond_,r_,disp_ & 0x0003ffff)); \ + } else { /* far branch if near branch isn't possible */ \ + mcond_ = emith_invert_branch(mcond_); \ + u8 *bp = emith_branch(MIPS_BCONDZ(mcond_, r_, 0)); \ + emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)); \ + EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, emith_insn_ptr()-bp-4)); \ + } \ +} while (0) + +#define emith_jump_cond_patchable(cond, target) do { \ + int r_, mcond_ = emith_cond_check(cond, &r_); \ + mcond_ = emith_invert_branch(mcond_); \ + u8 *bp = emith_branch(MIPS_BCONDZ(mcond_, r_, 0));\ + emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)); \ + EMIT_PTR(bp, 
MIPS_BCONDZ(mcond_, r_, emith_insn_ptr()-bp-4)); \ +} while (0) + +// NB: returns position of patch for cache maintenance +#define emith_jump_patch(ptr, target) ({ \ + u32 *ptr_ = (u32 *)ptr-1; /* must skip condition check code */ \ + while ((ptr_[0] & 0xf8000000) != OP_J << 26) ptr_ ++; \ + EMIT_PTR(ptr_, MIPS_J((uintptr_t)target & 0x0fffffff)); \ + (u8 *)(ptr_-1); \ +}) + +#define emith_jump_reg(r) \ + emith_branch(MIPS_JR(r)) +#define emith_jump_reg_c(cond, r) \ + emith_jump_reg(r) + +#define emith_jump_ctx(offs) do { \ + emith_ctx_read_ptr(AT, offs); \ + emith_jump_reg(AT); \ +} while (0) +#define emith_jump_ctx_c(cond, offs) \ + emith_jump_ctx(offs) + +#define emith_call(target) \ + emith_branch(MIPS_JAL((uintptr_t)target & 0x0fffffff)) +#define emith_call_cond(cond, target) \ + emith_call(target) + +#define emith_call_reg(r) \ + emith_branch(MIPS_JALR(LR, r)) + +#define emith_call_ctx(offs) do { \ + emith_ctx_read_ptr(AT, offs); \ + emith_call_reg(AT); \ +} while (0) + +#define emith_call_link(r, target) do { \ + EMIT(MIPS_BL(4)); EMIT(MIPS_ADD_IMM(r, LR, 8)); emith_flush(); \ + emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)); \ +} while (0) + +#define emith_call_cleanup() /**/ + +#define emith_ret() \ + emith_branch(MIPS_JR(LR)) +#define emith_ret_c(cond) \ + emith_ret() + +#define emith_ret_to_ctx(offs) \ + emith_ctx_write_ptr(LR, offs) + +// NB: ABI SP alignment is 8 for compatibility with MIPS IV +#define emith_push_ret(r) do { \ + emith_sub_r_imm(SP, 8+16); /* reserve new arg save area (16) */ \ + emith_write_r_r_offs(LR, SP, 4+16); \ + if ((r) >= 0) emith_write_r_r_offs(r, SP, 0+16); \ +} while (0) + +#define emith_pop_and_ret(r) do { \ + if ((r) >= 0) emith_read_r_r_offs(r, SP, 0+16); \ + emith_read_r_r_offs(LR, SP, 4+16); \ + emith_add_r_imm(SP, 8+16); \ + emith_ret(); \ +} while (0) + + +// emitter ABI stuff +#define emith_pool_check() /**/ +#define emith_pool_commit(j) /**/ +// NB: mips32r2 has SYNCI +#define host_instructions_updated(base, 
end) __builtin___clear_cache(base, end) +#define emith_jump_patch_size() 4 + +// SH2 drc specific +#define emith_sh2_drc_entry() do { \ + int _c; u32 _m = 0xd0ff0000; \ + if (__builtin_parity(_m) == 1) _m |= 0x1; /* ABI align for SP is 8 */ \ + int _s = count_bits(_m) * 4 + 16, _o = _s; /* 16 byte arg save area */ \ + if (_s) emith_sub_r_imm(SP, _s); \ + for (_c = HOST_REGS; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + if (_m & (1 << _c)) \ + { _o -= 4; if (_c) emith_write_r_r_offs(_c, SP, _o); } \ +} while (0) +#define emith_sh2_drc_exit() do { \ + int _c; u32 _m = 0xd0ff0000; \ + if (__builtin_parity(_m) == 1) _m |= 0x1; \ + int _s = count_bits(_m) * 4 + 16, _o = 16; \ + for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ + if (_m & (1 << _c)) \ + { if (_c) emith_read_r_r_offs(_c, SP, _o); _o += 4; } \ + if (_s) emith_add_r_imm(SP, _s); \ + emith_ret(); \ +} while (0) + +// NB: assumes a is in arg0, tab, func and mask are temp +#define emith_sh2_rcall(a, tab, func, mask) do { \ + emith_lsr(mask, a, SH2_READ_SHIFT); \ + emith_add_r_r_r_lsl_ptr(tab, tab, mask, 3); \ + emith_read_r_r_offs_ptr(func, tab, 0); \ + emith_read_r_r_offs(mask, tab, 4); \ + emith_addf_r_r_r/*_ptr*/(func, func, func); \ +} while (0) + +// NB: assumes a, val are in arg0 and arg1, tab and func are temp +#define emith_sh2_wcall(a, val, tab, func) do { \ + emith_lsr(func, a, SH2_WRITE_SHIFT); \ + emith_lsl(func, func, 2); \ + emith_read_r_r_r_ptr(func, tab, func); \ + emith_move_r_r_ptr(6, CONTEXT_REG); /* arg2 */ \ + emith_jump_reg(func); \ +} while (0) + +#define emith_sh2_delay_loop(cycles, reg) do { \ + int sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); \ + int t1 = rcache_get_tmp(); \ + int t2 = rcache_get_tmp(); \ + int t3 = rcache_get_tmp(); \ + /* if (sr < 0) return */ \ + emith_cmp_r_imm(sr, 0); \ + EMITH_JMP_START(DCOND_LE); \ + /* turns = sr.cycles / cycles */ \ + emith_asr(t2, sr, 12); \ + emith_move_r_imm(t3, (u32)((1ULL<<32) / (cycles)) + 1); \ + emith_mul_u64(t1, t2, t2, 
t3); /* multiply by 1/x */ \ + rcache_free_tmp(t3); \ + if (reg >= 0) { \ + /* if (reg <= turns) turns = reg-1 */ \ + t3 = rcache_get_reg(reg, RC_GR_RMW, NULL); \ + emith_cmp_r_r(t3, t2); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_sub_r_r_imm_c(DCOND_LS, t2, t3, 1); \ + EMITH_SJMP_END(DCOND_HI); \ + /* if (reg <= 1) turns = 0 */ \ + emith_cmp_r_imm(t3, 1); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_move_r_imm_c(DCOND_LS, t2, 0); \ + EMITH_SJMP_END(DCOND_HI); \ + /* reg -= turns */ \ + emith_sub_r_r(t3, t2); \ + } \ + /* sr.cycles -= turns * cycles; */ \ + emith_move_r_imm(t1, cycles); \ + emith_mul(t1, t2, t1); \ + emith_sub_r_r_r_lsl(sr, sr, t1, 12); \ + EMITH_JMP_END(DCOND_LE); \ + rcache_free_tmp(t1); \ + rcache_free_tmp(t2); \ +} while (0) + +/* + * if Q + * t = carry(Rn += Rm) + * else + * t = carry(Rn -= Rm) + * T ^= t + */ +#define emith_sh2_div1_step(rn, rm, sr) do { \ + emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ + EMITH_JMP3_START(DCOND_EQ); \ + emith_addf_r_r(rn, rm); \ + EMITH_JMP3_MID(DCOND_EQ); \ + emith_subf_r_r(rn, rm); \ + EMITH_JMP3_END(); \ + emith_eor_r_r(sr, FC); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macl(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* MACH top 16 bits unused if saturated. sign ext for overfl detect */ \ + emith_sext(mh, mh, 16); \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 17 bits of MACH aren't all 1 or 0 */ \ + /* to check: add MACH[15] to MACH[31:16]. 
this is 0 if no overflow */ \ + emith_asrf(rn, mh, 16); /* sum = (MACH>>16) + ((MACH>>15)&1) */ \ + emith_adcf_r_imm(rn, 0); /* (MACH>>15) is in carry after shift */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> ov */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x0000); /* -overflow */ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x8000); \ + EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> +ovl */ \ + emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0xffffffff */ \ + emith_sub_r_imm_c(DCOND_GT, mh, 1); /* 0x00007fff */ \ + EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macw(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* XXX: MACH should be untouched when S is set? */ \ + emith_asr(mh, ml, 31); /* sign ext MACL to MACH for ovrfl check */ \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 33 bits of MACH:MACL aren't all 1 or 0 */ \ + /* to check: add MACL[31] to MACH. this is 0 if no overflow */ \ + emith_lsr(rn, ml, 31); \ + emith_addf_r_r(rn, mh); /* sum = MACH + ((MACL>>31)&1) */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> overflow */ \ + /* XXX: LSB signalling only in SH1, or in SH2 too? 
*/ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x00000001); /* LSB of MACH */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x80000000); /* negative ovrfl */ \ + EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> positive ovrfl */ \ + emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0x7fffffff */ \ + EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +#define emith_write_sr(sr, srcr) do { \ + emith_lsr(sr, sr, 10); \ + emith_or_r_r_r_lsl(sr, sr, srcr, 22); \ + emith_ror(sr, sr, 22); \ +} while (0) + +#define emith_carry_to_t(srr, is_sub) do { \ + emith_lsr(srr, srr, 1); \ + emith_adc_r_r(srr, srr); \ +} while (0) + +#define emith_tpop_carry(sr, is_sub) do { \ + emith_and_r_r_imm(FC, sr, 1); \ + emith_lsr(sr, sr, 1); \ +} while (0) + +#define emith_tpush_carry(sr, is_sub) \ + emith_adc_r_r(sr, sr) + +#ifdef T +// T bit handling +#define emith_invert_cond(cond) \ + ((cond) ^ 1) + +static void emith_clr_t_cond(int sr) +{ + emith_bic_r_imm(sr, T); +} + +static void emith_set_t_cond(int sr, int cond) +{ + EMITH_SJMP_START(emith_invert_cond(cond)); + emith_or_r_imm_c(cond, sr, T); + EMITH_SJMP_END(emith_invert_cond(cond)); +} + +#define emith_get_t_cond() -1 + +#define emith_sync_t(sr) ((void)sr) + +#define emith_invalidate_t() + +static void emith_set_t(int sr, int val) +{ + if (val) + emith_or_r_imm(sr, T); + else + emith_bic_r_imm(sr, T); +} + +static int emith_tst_t(int sr, int tf) +{ + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; +} +#endif diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 0a31d8949..a40c0f8ca 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -869,11 +869,14 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define emith_jump_cond_patchable(cond, target) \ emith_jump_cond(cond, target) -#define emith_jump_patch(ptr, target) do { \ +#define emith_jump_patch(ptr, target) ({ \ u32 disp_ = (u8 *)(target) - ((u8 *)(ptr) + 4); \ u32 offs_ = (*(u8 *)(ptr) == 0x0f) ?
2 : 1; \ EMIT_PTR((u8 *)(ptr) + offs_, disp_ - offs_, u32); \ -} while (0) + ptr; \ +}) + +#define emith_jump_patch_size() 6 #define emith_jump_at(ptr, target) do { \ u32 disp_ = (u8 *)(target) - ((u8 *)(ptr) + 5); \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 3b03d0c22..01fc6ae1f 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -466,6 +466,47 @@ static cache_reg_t cache_regs[] = { { 7, HRF_REG }, }; +#elif defined(__mips__) +#include "../drc/emit_mips.c" + +static guest_reg_t guest_regs[] = { + // SHR_R0 .. SHR_SP + {GRF_STATIC, 20} , {GRF_STATIC, 21} , { 0 } , { 0 } , + { 0 } , { 0 } , { 0 } , { 0 } , + { 0 } , { 0 } , { 0 } , { 0 } , + { 0 } , { 0 } , { 0 } , { 0 } , + // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, + // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, + { 0 } , { 0 } , { 0 } , {GRF_STATIC, 22} , + { 0 } , { 0 } , { 0 } , { 0 } , +}; + +// MIPS ABI: params: r4-r7, return: r2-r3, temp: r1(at),r8-r15,r24-r25,r31(ra), +// saved: r16-r23,r30, reserved: r0(zero), r26-r27(irq), r28(gp), r29(sp) +// r1,r15,r24,r25 are used internally by the code emitter +static cache_reg_t cache_regs[] = { + { 14, HRF_TEMP }, // temps + { 13, HRF_TEMP }, + { 12, HRF_TEMP }, + { 11, HRF_TEMP }, + { 10, HRF_TEMP }, + { 9, HRF_TEMP }, + { 8, HRF_TEMP }, + { 7, HRF_TEMP }, // params + { 6, HRF_TEMP }, + { 5, HRF_TEMP }, + { 4, HRF_TEMP }, + { 3, HRF_TEMP }, // RET_REG + { 2, HRF_TEMP }, + { 22, HRF_LOCKED }, // statics + { 21, HRF_LOCKED }, + { 20, HRF_LOCKED }, + { 19, HRF_REG }, // other regs + { 18, HRF_REG }, + { 17, HRF_REG }, + { 16, HRF_REG }, +}; + #elif defined(__i386__) #include "../drc/emit_x86.c" @@ -1050,9 +1091,12 @@ static void dr_block_link(struct block_entry *be, struct block_link *bl, int emi dbg(2, "- %slink from %p to pc %08x entry %p", emit_jump ? 
"":"early ", bl->jump, bl->target_pc, be->tcache_ptr); - if (emit_jump) - emith_jump_patch(bl->jump, be->tcache_ptr); - // could sync arm caches here, but that's unnecessary + if (emit_jump) { + u8 *jump = emith_jump_patch(bl->jump, be->tcache_ptr); + // only needs sync if patch is possibly crossing cacheline (assume 16 byte) + if ((uintptr_t)jump >>4 != ((uintptr_t)jump+emith_jump_patch_size()-1) >>4) + host_instructions_updated(jump, jump+emith_jump_patch_size()); + } // move bl to block_entry bl->target = be; @@ -1069,9 +1113,9 @@ static void dr_block_unlink(struct block_link *bl, int emit_jump) if (bl->target) { if (emit_jump) { - emith_jump_patch(bl->jump, sh2_drc_dispatcher); + u8 *jump = emith_jump_patch(bl->jump, sh2_drc_dispatcher); // update cpu caches since the previous jump target doesn't exist anymore - host_instructions_updated(bl->jump, bl->jump+4); + host_instructions_updated(jump, jump+emith_jump_patch_size()); } if (bl->prev) @@ -4128,8 +4172,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) struct op_data *opd_b = (op_flags[i] & OF_DELAY_OP) ? opd-1 : opd; u32 target_pc = opd_b->imm; int cond = -1; - void *target = NULL; int ctaken = 0; + void *target = NULL; + int patchable = 0; if (OP_ISBRACND(opd_b->op)) ctaken = (op_flags[i] & OF_DELAY_OP) ? 
1 : 2; @@ -4182,11 +4227,12 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) branch_patch_pc[branch_patch_count] = target_pc; branch_patch_ptr[branch_patch_count] = target; branch_patch_count++; - } - else + patchable = 1; + } else dbg(1, "warning: too many local branches"); } #endif + if (target == NULL) { // can't resolve branch locally, make a block exit @@ -4204,14 +4250,24 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } else #endif target = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id); + patchable = 1; } - if (cond != -1) { - emith_jump_cond_patchable(cond, target); - } - else if (target != NULL) { - rcache_invalidate(); - emith_jump_patchable(target); + // create branch + if (patchable) { + if (cond != -1) + emith_jump_cond_patchable(cond, target); + else if (target != NULL) { + rcache_invalidate(); + emith_jump_patchable(target); + } + } else { + if (cond != -1) + emith_jump_cond(cond, target); + else if (target != NULL) { + rcache_invalidate(); + emith_jump(target); + } } // branch not taken, correct cycle count diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 38e47c0bc..09f4ae979 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -36,6 +36,8 @@ unsigned short scan_block(unsigned int base_pc, int is_slave, // XXX MUST match definitions in cpu/sh2/compiler.c #if defined(__arm__) #define DRC_SR_REG r10 +#elif defined(__mips__) +#define DRC_SR_REG s6 #elif defined(__i386__) #define DRC_SR_REG edi #elif defined(__x86_64__) diff --git a/platform/common/common.mak b/platform/common/common.mak index f4e5b8c3f..35f6ac9ef 100644 --- a/platform/common/common.mak +++ b/platform/common/common.mak @@ -169,7 +169,7 @@ DEFINES += DRC_DEBUG=$(drc_debug) SRCS_COMMON += $(R)cpu/sh2/mame/sh2dasm.c DASM = $(R)platform/libpicofe/linux/host_dasm.c DASMLIBS = -lbfd -lopcodes -liberty -ifeq "$(ARCH)" "arm" +ifeq ("$(ARCH)",$(filter "$(ARCH)","arm" "mipsel")) ifeq ($(filter_out $(shell $(CC) 
--print-file-name=libbfd.so),"/"),) DASM = $(R)platform/common/host_dasm.c DASMLIBS = diff --git a/platform/common/disarm.c b/platform/common/disarm.c index 2e7c04e70..80655877a 100644 --- a/platform/common/disarm.c +++ b/platform/common/disarm.c @@ -435,7 +435,7 @@ static int software_interrupt(unsigned int pc, unsigned int insn, char *buf, siz return 1; } -int disarm(unsigned int pc, unsigned int insn, char *buf, size_t buf_len) +int disarm(uintptr_t pc, uint32_t insn, char *buf, size_t buf_len) { if ((insn & 0x0fffffd0) == 0x012fff10) return branch_and_exchange(pc, insn, buf, buf_len); diff --git a/platform/common/disarm.h b/platform/common/disarm.h index 2ea4ccc3b..b8634f682 100644 --- a/platform/common/disarm.h +++ b/platform/common/disarm.h @@ -23,6 +23,6 @@ #ifndef DISARM_H #define DISARM_H -int disarm(unsigned int pc, unsigned int insn, char *buf, unsigned int buf_len); +int disarm(uintptr_t pc, uint32_t insn, char *buf, unsigned int buf_len); #endif /* DISARM_H */ diff --git a/platform/common/dismips.c b/platform/common/dismips.c new file mode 100644 index 000000000..af71b0954 --- /dev/null +++ b/platform/common/dismips.c @@ -0,0 +1,346 @@ +/* + * very basic mips disassembler for MIPS32/MIPS64 Release 1, only for picodrive + * Copyright (C) 2019 kub + * + * This work is licensed under the terms of MAME license. + * See COPYING file in the top-level directory. + */ + +// XXX unimplemented: SYSCALL, BREAK, SYNC, SDBBP, T*, CACHE, PREF, +// MOVF/MOVT, LWC*/LDC*, SWC*/SDC*, COP*. +// however, it's certainly good enough for anything picodrive DRC throws at it.
+ +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <string.h> + +#include "dismips.h" + + +static char *const register_names[32] = { + "$zero", + "$at", + "$v0", + "$v1", + "$a0", + "$a1", + "$a2", + "$a3", + "$t0", + "$t1", + "$t2", + "$t3", + "$t4", + "$t5", + "$t6", + "$t7", + "$s0", + "$s1", + "$s2", + "$s3", + "$s4", + "$s5", + "$s6", + "$s7", + "$t8", + "$t9", + "$k0", + "$k1", + "$gp", + "$sp", + "$fp", + "$ra" +}; + + +enum insn_type { + REG_DTS, REG_TS, // 3, 2, or 1 regs + REG_DS, REG_D, REG_S, + S_IMM_DT, // 2 regs with shift amount + B_IMM_S, B_IMM_TS, // pc-relative branches with 1 or 2 regs + J_IMM, // region-relative jump + A_IMM_TS, // arithmetic immediate with 1 or 2 regs + L_IMM_T, L_IMM_TS, // logical immediate with 2 regs + M_IMM_TS, // memory indexed with 2 regs +}; + +struct insn { + unsigned char op; + enum insn_type type; + char *name; +}; + +// ATTN: these arrays MUST be sorted by op (decode relies on it) + +// instructions with opcode SPECIAL (R-type) +#define OP_SPECIAL 0x00 +static const struct insn special_insns[] = { + {0x00, S_IMM_DT, "sll"}, + {0x02, S_IMM_DT, "srl"}, + {0x03, S_IMM_DT, "sra"}, + {0x04, REG_DTS, "sllv"}, + {0x06, REG_DTS, "srlv"}, + {0x07, REG_DTS, "srav"}, + {0x08, REG_S, "jr"}, + {0x09, REG_DS, "jalr"}, + {0x0a, REG_DTS, "movz"}, + {0x0b, REG_DTS, "movn"}, +// {0x0c, , "syscall"}, +// {0x0d, , "break"}, +// {0x0f, , "sync"}, + {0x10, REG_D, "mfhi"}, + {0x11, REG_S, "mthi"}, + {0x12, REG_D, "mflo"}, + {0x13, REG_S, "mtlo"}, + {0x14, REG_DTS, "dsllv"}, + {0x16, REG_DTS, "dsrlv"}, + {0x17, REG_DTS, "dsrav"}, + {0x18, REG_TS, "mult"}, + {0x19, REG_TS, "multu"}, + {0x1A, REG_TS, "div"}, + {0x1B, REG_TS, "divu"}, + {0x1C, REG_TS, "dmult"}, + {0x1D, REG_TS, "dmultu"}, + {0x1E, REG_TS, "ddiv"}, + {0x1F, REG_TS, "ddivu"}, + {0x20, REG_DTS, "add"}, + {0x21, REG_DTS, "addu"}, + {0x22, REG_DTS, "sub"}, + {0x23, REG_DTS, "subu"}, + {0x24, REG_DTS, "and"}, + {0x25, REG_DTS, "or"}, + {0x26, REG_DTS, "xor"}, + {0x27, REG_DTS, "nor"}, + {0x2A, REG_DTS,
"slt"}, + {0x2B, REG_DTS, "sltu"}, + {0x2C, REG_DTS, "dadd"}, + {0x2D, REG_DTS, "daddu"}, + {0x2E, REG_DTS, "dsub"}, + {0x2F, REG_DTS, "dsubu"}, +// {0x30, REG_TS, "tge" }, +// {0x31, REG_TS, "tgeu" }, +// {0x32, REG_TS, "tlt" }, +// {0x33, REG_TS, "tltu" }, +// {0x34, REG_TS, "teq" }, +// {0x36, REG_TS, "tne" }, + {0x38, S_IMM_DT, "dsll"}, + {0x3A, S_IMM_DT, "dsrl"}, + {0x3B, S_IMM_DT, "dsra"}, + {0x3C, S_IMM_DT, "dsll32"}, + {0x3E, S_IMM_DT, "dsrl32"}, + {0x3F, S_IMM_DT, "dsra32"}, +}; + +// instructions with opcode SPECIAL2 (R-type) +#define OP_SPECIAL2 0x1C +static const struct insn special2_insns[] = { + {0x00, REG_TS, "madd" }, + {0x01, REG_TS, "maddu" }, + {0x02, REG_DTS, "mul" }, + {0x04, REG_TS, "msub" }, + {0x05, REG_TS, "msubu" }, + {0x20, REG_DS, "clz" }, + {0x21, REG_DS, "clo" }, + {0x24, REG_DS, "dclz" }, + {0x25, REG_DS, "dclo" }, +}; + +// instructions with opcode REGIMM (I-type) +#define OP_REGIMM 0x01 +static const struct insn regimm_insns[] = { + {0x00, B_IMM_S, "bltz"}, + {0x01, B_IMM_S, "bgez"}, + {0x02, B_IMM_S, "bltzl"}, + {0x03, B_IMM_S, "bgezl"}, +// {0x08, , "tgei"}, +// {0x09, , "tgeiu"}, +// {0x0a, , "tlti"}, +// {0x0b, , "tltiu"}, +// {0x0c, , "teqi"}, +// {0x0e, , "tnei"}, + {0x10, B_IMM_S, "bltzal"}, + {0x11, B_IMM_S, "bgezal"}, + {0x12, B_IMM_S, "bltzall"}, + {0x13, B_IMM_S, "bgezall"}, + {0x13, B_IMM_S, "bgezall"}, +}; + +// instructions with other opcodes (I-type) +static const struct insn immediate_insns[] = { + {0x02, J_IMM, "j"}, + {0x03, J_IMM, "jal"}, + {0x04, B_IMM_TS, "beq"}, + {0x05, B_IMM_TS, "bne"}, + {0x06, B_IMM_S, "blez"}, + {0x07, B_IMM_S, "bgtz"}, + {0x08, A_IMM_TS, "addi"}, + {0x09, A_IMM_TS, "addiu"}, + {0x0A, A_IMM_TS, "slti"}, + {0x0B, A_IMM_TS, "sltiu"}, + {0x0C, L_IMM_TS, "andi"}, + {0x0D, L_IMM_TS, "ori"}, + {0x0E, L_IMM_TS, "xori"}, + {0x0F, L_IMM_T, "lui"}, + {0x14, B_IMM_TS, "beql"}, + {0x15, B_IMM_TS, "bnel"}, + {0x16, B_IMM_S, "blezl"}, + {0x17, B_IMM_S, "bgtzl"}, + {0x18, A_IMM_TS, "daddi"}, + {0x19,
A_IMM_TS, "daddiu"}, + {0x1A, M_IMM_TS, "ldl"}, + {0x1B, M_IMM_TS, "ldr"}, + {0x20, M_IMM_TS, "lb"}, + {0x21, M_IMM_TS, "lh"}, + {0x22, M_IMM_TS, "lwl"}, + {0x23, M_IMM_TS, "lw"}, + {0x24, M_IMM_TS, "lbu"}, + {0x25, M_IMM_TS, "lhu"}, + {0x26, M_IMM_TS, "lwr"}, + {0x27, M_IMM_TS, "lwu"}, + {0x28, M_IMM_TS, "sb"}, + {0x29, M_IMM_TS, "sh"}, + {0x2A, M_IMM_TS, "swl"}, + {0x2B, M_IMM_TS, "sw"}, + {0x2C, M_IMM_TS, "sdl"}, + {0x2D, M_IMM_TS, "sdr"}, + {0x2E, M_IMM_TS, "swr"}, +// {0x2F, , "cache"}, + {0x30, M_IMM_TS, "ll"}, +// {0x31, , "lwc1"}, +// {0x32, , "lwc2"}, +// {0x33, , "pref"}, + {0x34, M_IMM_TS, "lld"}, +// {0x35, , "ldc1"}, +// {0x36, , "ldc2"}, + {0x37, M_IMM_TS, "ld"}, + {0x38, M_IMM_TS, "sc"}, +// {0x39, , "swc1"}, +// {0x3A, , "swc2"}, + {0x3C, M_IMM_TS, "scd"}, +// {0x3D, , "sdc1"}, +// {0x3E, , "sdc2"}, + {0x3F, M_IMM_TS, "sd"}, +}; + +#define ARRAY_SIZE(a) (sizeof(a)/sizeof(*a)) + +// find instruction description for insn +static const struct insn *decode_insn(uint32_t insn) +{ + uint32_t op = insn >> 26; + const struct insn *pi; + int l = 0, r = 0; + + if (op == OP_SPECIAL) { + op = insn & 0x3f; + pi = special_insns; + r = ARRAY_SIZE(special_insns)-1; + } else if (op == OP_SPECIAL2) { + op = insn & 0x3f; + pi = special2_insns; + r = ARRAY_SIZE(special2_insns)-1; + } else if (op == OP_REGIMM) { + op = (insn>>16) & 0x1f; + pi = regimm_insns; + r = ARRAY_SIZE(regimm_insns)-1; + } else { + pi = immediate_insns; + r = ARRAY_SIZE(immediate_insns)-1; + } + + while (l <= r) { + int m = (l+r) / 2; + if (pi[m].op == op) + return pi+m; + else if (pi[m].op < op) + l = m+1; + else + r = m-1; + } + return NULL; +} + +// calculate target for pc-relative branches +static unsigned long b_target(unsigned long pc, uint32_t insn) +{ + return pc + 4 + (int16_t)insn * 4; +} + +// calculate target for region-relative branches +static unsigned long j_target(unsigned long pc, uint32_t insn) +{ + return (pc & ~0x0fffffffL) | ((insn & 0x03ffffff) << 2); +} + +// main 
disassembler function +int dismips(uintptr_t pc, uint32_t insn, char *buf, unsigned int buflen) +{ + const struct insn *pi = decode_insn(insn); + char *rs = register_names[(insn >> 21) & 0x1f]; + char *rt = register_names[(insn >> 16) & 0x1f]; + char *rd = register_names[(insn >> 11) & 0x1f]; + int sa = (insn >> 6) & 0x1f; + int imm = (int16_t) insn; + + if (pi == NULL) { + snprintf(buf, buflen, "0x%x", insn); + return 0; + } + + switch (pi->type) { + case REG_DTS: + if ((insn & 0x3f) == 0x25 /*OR*/ && (insn & 0x1f0000) == 0 /*zero*/) + snprintf(buf, buflen, "move %s, %s", rd, rs); + else + snprintf(buf, buflen, "%s %s, %s, %s", pi->name, rd, rs, rt); + break; + case REG_TS: + snprintf(buf, buflen, "%s %s, %s", pi->name, rs, rt); + break; + case REG_DS: + snprintf(buf, buflen, "%s %s, %s", pi->name, rd, rs); + break; + case REG_D: + snprintf(buf, buflen, "%s %s", pi->name, rd); + break; + case REG_S: + snprintf(buf, buflen, "%s %s", pi->name, rs); + break; + case S_IMM_DT: + if (insn == 0x00000000) + snprintf(buf, buflen, "nop"); + else + snprintf(buf, buflen, "%s %s, %s, %d", pi->name, rd, rt, sa); + break; + case B_IMM_S: + snprintf(buf, buflen, "%s %s, 0x%lx", pi->name, rs, b_target(pc, insn)); + break; + case B_IMM_TS: + snprintf(buf, buflen, "%s %s, %s, 0x%lx", pi->name, rs, rt, b_target(pc, insn)); + break; + case J_IMM: + snprintf(buf, buflen, "%s 0x%lx", pi->name, j_target(pc, insn)); + break; + case A_IMM_TS: + if (abs(imm) < 1000) + snprintf(buf, buflen, "%s %s, %s, %d", pi->name, rt, rs, imm); + else + snprintf(buf, buflen, "%s %s, %s, 0x%x", pi->name, rt, rs, imm); + break; + case L_IMM_T: + snprintf(buf, buflen, "%s %s, 0x%x", pi->name, rt, (uint16_t)imm); + break; + case L_IMM_TS: + if ((insn >> 26) == 0x34 /*ORI*/ && (insn & 0x03e00000) == 0 /*zero*/) + snprintf(buf, buflen, "li %s, 0x%x", rt, (uint16_t)imm); + else + snprintf(buf, buflen, "%s %s, %s, 0x%x", pi->name, rt, rs, (uint16_t)imm); + break; + case M_IMM_TS: + snprintf(buf, buflen, "%s %s, 
%d(%s)", pi->name, rt, imm, rs); + break; + } + return 1; +} + diff --git a/platform/common/dismips.h b/platform/common/dismips.h new file mode 100644 index 000000000..e6338defa --- /dev/null +++ b/platform/common/dismips.h @@ -0,0 +1,6 @@ +#ifndef DISMIPS_H +#define DISMIPS_H + +int dismips(uintptr_t pc, uint32_t insn, char *buf, unsigned int buf_len); + +#endif /* DISMIPS_H */ diff --git a/platform/linux/emu.c b/platform/linux/emu.c index 8af5afa80..887d78360 100644 --- a/platform/linux/emu.c +++ b/platform/linux/emu.c @@ -29,7 +29,7 @@ void pemu_prep_defconfig(void) void pemu_validate_config(void) { -#if !defined(__arm__) && !defined(__i386__) && !defined(__x86_64__) +#if !defined(__arm__) && !defined(__aarch64__) && !defined(__mips__) && !defined(__i386__) && !defined(__x86_64__) PicoIn.opt &= ~POPT_EN_DRC; #endif } From 57f76d2cb78f13fffde7f20485fbc4e42e147da2 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 30 Jul 2019 21:04:16 +0200 Subject: [PATCH 050/174] sh2 drc: add aarch64 backend for A64 --- Makefile | 15 +- config.aarch64 | 15 + cpu/drc/emit_arm64.c | 1328 ++++++++++++++++++++++++++++++++++++++++++ cpu/sh2/compiler.c | 50 ++ cpu/sh2/compiler.h | 2 + 5 files changed, 1404 insertions(+), 6 deletions(-) create mode 100644 config.aarch64 create mode 100644 cpu/drc/emit_arm64.c diff --git a/Makefile b/Makefile index 7f02a1c96..78de3c562 100644 --- a/Makefile +++ b/Makefile @@ -55,15 +55,18 @@ asm_cdmemory ?= 1 asm_mix ?= 1 asm_32xdraw ?= 1 asm_32xmemory ?= 1 -else # if not arm +else ifneq (,$(findstring 86,$(ARCH))) use_fame ?= 1 use_cz80 ?= 1 -ifneq (,$(findstring 86,$(ARCH))) use_sh2drc ?= 1 -endif -ifneq (,$(findstring mips,$(ARCH))) +else ifneq (,$(findstring mips,$(ARCH))) +use_fame ?= 1 +use_cz80 ?= 1 +use_sh2drc ?= 1 +else ifneq (,$(findstring aarch64,$(ARCH))) +use_fame ?= 1 +use_cz80 ?= 1 use_sh2drc ?= 1 -endif endif -include Makefile.local @@ -269,7 +272,7 @@ pico/carthw_cfg.c: pico/carthw.cfg # random deps pico/carthw/svp/compiler.o : 
cpu/drc/emit_arm.c -cpu/sh2/compiler.o : cpu/drc/emit_arm.c +cpu/sh2/compiler.o : cpu/drc/emit_arm.c cpu/drc/emit_arm64.c cpu/sh2/compiler.o : cpu/drc/emit_x86.c cpu/drc/emit_mips.c cpu/sh2/mame/sh2pico.o : cpu/sh2/mame/sh2.c pico/pico.o pico/cd/mcd.o pico/32x/32x.o : pico/pico_cmn.c pico/pico_int.h diff --git a/config.aarch64 b/config.aarch64 new file mode 100644 index 000000000..70a6fe300 --- /dev/null +++ b/config.aarch64 @@ -0,0 +1,15 @@ +# Automatically generated by configure +# Configured with: './configure' '--platform=generic' +CC = aarch64-linux-gnu-gcc +CXX = aarch64-linux-gnu-g++ +AS = aarch64-linux-gnu-as +STRIP = aarch64-linux-gnu-strip +CFLAGS += -I/usr/include/SDL +CFLAGS += -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -fno-stack-protector +ASFLAGS += +LDFLAGS += +LDLIBS += -lSDL -lasound -lpng -lz -lm -lstdc++ -ldl + +ARCH = aarch64 +PLATFORM = generic +SOUND_DRIVERS = alsa diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c new file mode 100644 index 000000000..90010d803 --- /dev/null +++ b/cpu/drc/emit_arm64.c @@ -0,0 +1,1328 @@ +/* + * Basic macros to emit ARM A64 instructions and some utils + * Copyright (C) 2019 kub + * + * This work is licensed under the terms of MAME license. + * See COPYING file in the top-level directory. + */ +#define HOST_REGS 32 +#define CONTEXT_REG 19 +#define RET_REG 0 + +// R31 doesn't exist, it aliases either with zero or SP +#define SP 31 // stack pointer +#define Z0 31 // zero register +#define LR 30 // link register +#define FP 29 // frame pointer +#define PR 18 // platform register + +// All operations but ptr ops are using the lower 32 bits of the A64 registers. +// The upper 32 bits are only used in ptr ops. 
+ + +#define A64_COND_EQ 0x0 +#define A64_COND_NE 0x1 +#define A64_COND_HS 0x2 +#define A64_COND_LO 0x3 +#define A64_COND_MI 0x4 +#define A64_COND_PL 0x5 +#define A64_COND_VS 0x6 +#define A64_COND_VC 0x7 +#define A64_COND_HI 0x8 +#define A64_COND_LS 0x9 +#define A64_COND_GE 0xa +#define A64_COND_LT 0xb +#define A64_COND_GT 0xc +#define A64_COND_LE 0xd +#define A64_COND_CS A64_COND_HS +#define A64_COND_CC A64_COND_LO +#define A64_COND_AL 0xe +#define A64_COND_NV 0xf + +/* unified conditions */ +#define DCOND_EQ A64_COND_EQ +#define DCOND_NE A64_COND_NE +#define DCOND_MI A64_COND_MI +#define DCOND_PL A64_COND_PL +#define DCOND_HI A64_COND_HI +#define DCOND_HS A64_COND_HS +#define DCOND_LO A64_COND_LO +#define DCOND_GE A64_COND_GE +#define DCOND_GT A64_COND_GT +#define DCOND_LT A64_COND_LT +#define DCOND_LS A64_COND_LS +#define DCOND_LE A64_COND_LE +#define DCOND_VS A64_COND_VS +#define DCOND_VC A64_COND_VC + +#define DCOND_CS A64_COND_HS +#define DCOND_CC A64_COND_LO + + +// unified insn +#define A64_INSN(op, b29, b22, b21, b16, b12, b10, b5, b0) \ + (((op)<<25)|((b29)<<29)|((b22)<<22)|((b21)<<21)|((b16)<<16)|((b12)<<12)|((b10)<<10)|((b5)<<5)|((b0)<<0)) + +#define _ 0 // marker for "field unused" + +#define A64_NOP \ + A64_INSN(0xa,0x6,0x4,_,0x3,0x2,_,0,0x1f) // 0xd503201f + +// arithmetic/logical + +enum { OP_AND, OP_OR, OP_EOR, OP_ANDS, OP_ADD, OP_ADDS, OP_SUB, OP_SUBS }; +enum { ST_LSL, ST_LSR, ST_ASR, ST_ROR }; +enum { XT_UXTW=0x4, XT_UXTX=0x6, XT_LSL=0x7, XT_SXTW=0xc, XT_SXTX=0xe }; +#define OP_SZ64 (1 << 31) // bit for 64 bit op selection +#define OP_N64 (1 << 22) // N-bit for 64 bit logical immediate ops + +#define A64_OP_REG(op, n, rd, rn, rm, stype, simm) /* arith+logical, ST_ */ \ + A64_INSN(0x5,(op)&3,((op)&4)|stype,n,rm,_,simm,rn,rd) +#define A64_OP_XREG(op, rd, rn, rm, xtopt, simm) /* arith, XT_ */ \ + A64_INSN(0x5,(op)&3,0x4,1,rm,xtopt,simm,rn,rd) +#define A64_OP_IMM12(op, rd, rn, imm, lsl12) /* arith */ \ + 
A64_INSN(0x8,(op)&3,((op)&4)|lsl12,_,_,_,(imm)&0xfff,rn,rd) +#define A64_OP_IMMBM(op, rd, rn, immr, imms) /* logical */ \ + A64_INSN(0x9,(op)&3,0x0,_,immr,_,(imms)&0x3f,rn,rd) + +// rd = rn OP (rm SHIFT simm) +#define A64_ADD_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_ADD,0,rd,rn,rm,stype,simm) +#define A64_ADDS_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_ADDS,0,rd,rn,rm,stype,simm) +#define A64_SUB_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_SUB,0,rd,rn,rm,stype,simm) +#define A64_SUBS_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_SUBS,0,rd,rn,rm,stype,simm) + +#define A64_NEG_REG(rd, rm, stype, simm) \ + A64_SUB_REG(rd,Z0,rm,stype,simm) +#define A64_NEGS_REG(rd, rm, stype, simm) \ + A64_SUBS_REG(rd,Z0,rm,stype,simm) +#define A64_NEGC_REG(rd, rm) \ + A64_SBC_REG(rd,Z0,rm,stype,simm) +#define A64_NEGCS_REG(rd, rm) \ + A64_SBCS_REG(rd,Z0,rm,stype,simm) +#define A64_CMP_REG(rn, rm, stype, simm) \ + A64_SUBS_REG(Z0, rn, rm, stype, simm) +#define A64_CMN_REG(rn, rm, stype, simm) \ + A64_ADDS_REG(Z0, rn, rm, stype, simm) + +#define A64_EOR_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_EOR,0,rd,rn,rm,stype,simm) +#define A64_OR_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_OR,0,rd,rn,rm,stype,simm) +#define A64_ORN_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_OR,1,rd,rn,rm,stype,simm) +#define A64_AND_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_AND,0,rd,rn,rm,stype,simm) +#define A64_ANDS_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_ANDS,0,rd,rn,rm,stype,simm) +#define A64_BIC_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_AND,1,rd,rn,rm,stype,simm) +#define A64_BICS_REG(rd, rn, rm, stype, simm) \ + A64_OP_REG(OP_ANDS,1,rd,rn,rm,stype,simm) + +#define A64_TST_REG(rn, rm, stype, simm) \ + A64_ANDS_REG(Z0, rn, rm, stype, simm) +#define A64_MOV_REG(rd, rm, stype, simm) \ + A64_OR_REG(rd, Z0, rm, stype, simm); +#define A64_MVN_REG(rd, rm, stype, simm) \ + A64_ORN_REG(rd, Z0, rm, stype, simm); + +// rd = rn OP (rm EXTEND simm) +#define A64_ADD_XREG(rd, rn, 
rm, xtopt, simm) \ + A64_OP_XREG(OP_ADD,rd,rn,rm,xtopt,simm) +#define A64_ADDS_XREG(rd, rn, rm, xtopt, simm) \ + A64_OP_XREG(OP_ADDS,rd,rn,rm,xtopt,simm) +#define A64_SUB_XREG(rd, rn, rm, stype, simm) \ + A64_OP_XREG(OP_SUB,rd,rn,rm,xtopt,simm) +#define A64_SUBS_XREG(rd, rn, rm, stype, simm) \ + A64_OP_XREG(OP_SUBS,rd,rn,rm,xtopt,simm) + +// rd = rn OP rm OP carry +#define A64_ADC_REG(rd, rn, rm) \ + A64_INSN(0xd,OP_ADD &3,0x0,_,rm,_,_,rn,rd) +#define A64_ADCS_REG(rd, rn, rm) \ + A64_INSN(0xd,OP_ADDS&3,0x0,_,rm,_,_,rn,rd) +#define A64_SBC_REG(rd, rn, rm, s) \ + A64_INSN(0xd,OP_SUB &3,0x0,_,rm,_,_,rn,rd) +#define A64_SBCS_REG(rd, rn, rm) \ + A64_INSN(0xd,OP_SUBS&3,0x0,_,rm,_,_,rn,rd) + +// rd = rn SHIFT rm +#define A64_LSL_REG(rd, rn, rm) \ + A64_INSN(0xd,0x0,0x3,_,rm,_,0x8,rn,rd) +#define A64_LSR_REG(rd, rn, rm) \ + A64_INSN(0xd,0x0,0x3,_,rm,_,0xa,rn,rd) +#define A64_ASR_REG(rd, rn, rm) \ + A64_INSN(0xd,0x0,0x3,_,rm,_,0x9,rn,rd) +#define A64_ROR_REG(rd, rn, rm) \ + A64_INSN(0xd,0x0,0x3,_,rm,_,0xb,rn,rd) + +// rd = REVERSE(n) rn +#define A64_RBIT_REG(rd, rn) \ + A64_INSN(0xd,0x2,0x3,_,_,_,_,rn,rd) + +// rd = rn OP (imm12 << (0|12)) +#define A64_ADD_IMM(rd, rn, imm12, lsl12) \ + A64_OP_IMM12(OP_ADD, rd, rn, imm12, lsl12) +#define A64_ADDS_IMM(rd, rn, imm12, lsl12) \ + A64_OP_IMM12(OP_ADDS, rd, rn, imm12, lsl12) +#define A64_SUB_IMM(rd, rn, imm12, lsl12) \ + A64_OP_IMM12(OP_SUB, rd, rn, imm12, lsl12) +#define A64_SUBS_IMM(rd, rn, imm12, lsl12) \ + A64_OP_IMM12(OP_SUBS, rd, rn, imm12, lsl12) + +#define A64_CMP_IMM(rn, imm12, lsl12) \ + A64_SUBS_IMM(Z0,rn,imm12,lsl12) +#define A64_CMN_IMM(rn, imm12, lsl12) \ + A64_ADDS_IMM(Z0,rn,imm12,lsl12) + +// rd = rn OP immbm; immbm is a repeated special pattern of 2^n bits length +#define A64_EOR_IMM(rd, rn, immr, imms) \ + A64_OP_IMMBM(OP_EOR,rd,rn,immr,imms) +#define A64_OR_IMM(rd, rn, immr, imms) \ + A64_OP_IMMBM(OP_OR,rd,rn,immr,imms) +#define A64_AND_IMM(rd, rn, immr, imms) \ + A64_OP_IMMBM(OP_AND,rd,rn,immr,imms) +#define 
A64_ANDS_IMM(rd, rn, immr, imms) \ + A64_OP_IMMBM(OP_ANDS,rd,rn,immr,imms) +#define A64_TST_IMM(rn, immr, imms) \ + A64_OP_IMMBM(OP_ANDS,Z0,rn,immr,imms) +#define A64_MOV_IMM(rd, rn, immr, imms) \ + A64_OP_IMMBM(OP_OR,rd,Z0,immr,imms) + +// rd = (imm16 << (0|16|32|48)) +#define A64_MOVN_IMM(rd, imm16, lsl16) \ + A64_INSN(0x9,0x0,0x2,lsl16,_,_,_,(imm16)&0xffff,rd) +#define A64_MOVZ_IMM(rd, imm16, lsl16) \ + A64_INSN(0x9,0x2,0x2,lsl16,_,_,_,(imm16)&0xffff,rd) +#define A64_MOVK_IMM(rd, imm16, lsl16) \ + A64_INSN(0x9,0x3,0x2,lsl16,_,_,_,(imm16)&0xffff,rd) +#define A64_MOVT_IMM(rd, imm16, lsl16) \ + A64_INSN(0x9,0x3,0x2,lsl16,_,_,_,(imm16)&0xffff,rd) + +// rd = rn SHIFT imm6 +#define A64_LSL_IMM(rd, rn, bits) /* UBFM */ \ + A64_INSN(0x9,0x2,0x4,_,32-(bits),_,31-(bits),rn,rd) +#define A64_LSR_IMM(rd, rn, bits) /* UBFM */ \ + A64_INSN(0x9,0x2,0x4,_,bits,_,31,rn,rd) +#define A64_ASR_IMM(rd, rn, bits) /* SBFM */ \ + A64_INSN(0x9,0x0,0x4,_,bits,_,31,rn,rd) +#define A64_ROR_IMM(rd, rn, bits) /* EXTR */ \ + A64_INSN(0x9,0x0,0x6,_,rn,_,bits,rn,rd) + +#define A64_SXT_IMM(rd, rn, bits) \ + A64_INSN(0x9,0x0,0x4,0,0,_,bits-1,rn,rd) +#define A64_UXT_IMM(rd, rn, bits) \ + A64_INSN(0x9,0x2,0x4,0,0,_,bits-1,rn,rd) + +// multiplication + +#define A64_SMULL(rd, rn, rm) /* Xd = Wn*Wm (+ Xa) */ \ + A64_INSN(0xd,0x4,0x4,1,rm,_,Z0,rn,rd) +#define A64_SMADDL(rd, rn, rm, ra) \ + A64_INSN(0xd,0x4,0x4,1,rm,_,ra,rn,rd) +#define A64_UMULL(rd, rn, rm) \ + A64_INSN(0xd,0x4,0x6,1,rm,_,Z0,rn,rd) +#define A64_UMADDL(rd, rn, rm, ra) \ + A64_INSN(0xd,0x4,0x6,1,rm,_,ra,rn,rd) +#define A64_MUL(rd, rn, rm) /* Wd = Wn*Wm (+ Wa) */ \ + A64_INSN(0xd,0x0,0x4,0,rm,_,Z0,rn,rd) +#define A64_MADD(rd, rn, rm, ra) \ + A64_INSN(0xd,0x0,0x4,0,rm,_,ra,rn,rd) + +// branching + +#define A64_B(offs26) \ + A64_INSN(0xa,0x0,_,_,_,_,_,_,(offs26) >> 2) +#define A64_BL(offs26) \ + A64_INSN(0xa,0x4,_,_,_,_,_,_,(offs26) >> 2) +#define A64_BR(rn) \ + A64_INSN(0xb,0x6,_,_,0x1f,_,_,rn,_) +#define A64_BLR(rn) \ + 
A64_INSN(0xb,0x6,_,_,0x3f,_,_,rn,_) +#define A64_RET(rn) /* same as BR, but hint for cpu */ \ + A64_INSN(0xb,0x6,_,_,0x5f,_,_,rn,_) +#define A64_BCOND(cond, offs19) \ + A64_INSN(0xa,0x2,_,_,_,_,_,(offs19) >> 2,(cond)) + +// load pc-relative + +#define A64_LDRLIT_IMM(rd, offs19) \ + A64_INSN(0xc,0x0,0x0,_,_,_,_,(offs19) >> 2,rd) +#define A64_LDRXLIT_IMM(rd, offs19) \ + A64_INSN(0xc,0x2,0x0,_,_,_,_,(offs19) >> 2,rd) +#define A64_ADRXLIT_IMM(rd, offs21) \ + A64_INSN(0x8,(offs21)&3,0x0,_,_,_,_,(offs21) >> 2,rd) + +// load/store indexed base. Only the signed unscaled variant is used here. + +enum { LT_ST, LT_LD, LT_LDSX, LT_LDS }; +enum { AM_B=0x1, AM_H=0x3, AM_W=0x5, AM_X=0x7 }; +enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; +#define A64_LDST_AM(ir,rm,optimm) (((ir)<<9)|((rm)<<4)|((optimm)&0x1ff)) +#define A64_OP_LDST(sz, op, am, mode, rm, rd) \ + A64_INSN(0xc,sz,op,_,_,am,mode,rm,rd) + +#define A64_LDSTX_IMM(rd, rn, offs9, ld, mode) \ + A64_OP_LDST(AM_X,ld,A64_LDST_AM(0,_,offs9),mode,rn,rd) +#define A64_LDST_IMM(rd, rn, offs9, ld, mode) \ + A64_OP_LDST(AM_W,ld,A64_LDST_AM(0,_,offs9),mode,rn,rd) +#define A64_LDSTH_IMM(rd, rn, offs9, ld, mode) \ + A64_OP_LDST(AM_H,ld,A64_LDST_AM(0,_,offs9),mode,rn,rd) +#define A64_LDSTB_IMM(rd, rn, offs9, ld, mode) \ + A64_OP_LDST(AM_B,ld,A64_LDST_AM(0,_,offs9),mode,rn,rd) + +// NB: pre/postindex isn't available with register offset +#define A64_LDSTX_REG(rd, rn, rm, ld, opt) \ + A64_OP_LDST(AM_X,ld,A64_LDST_AM(1,rm,opt),AM_IDXREG,rn,rd) +#define A64_LDST_REG(rd, rn, rm, ld, opt) \ + A64_OP_LDST(AM_W,ld,A64_LDST_AM(1,rm,opt),AM_IDXREG,rn,rd) +#define A64_LDSTH_REG(rd, rn, rm, ld, opt) \ + A64_OP_LDST(AM_H,ld,A64_LDST_AM(1,rm,opt),AM_IDXREG,rn,rd) +#define A64_LDSTB_REG(rd, rn, rm, ld, opt) \ + A64_OP_LDST(AM_B,ld,A64_LDST_AM(1,rm,opt),AM_IDXREG,rn,rd) + +#define A64_LDSTPX_IMM(rn, r1, r2, offs7, ld, mode) \ + A64_INSN(0x4,0x5,(mode<<1)|ld,_,_,(offs7)&0x3f8,r2,rn,r1) + +// 64 bit stuff for pointer handling + +#define 
A64_ADDX_XREG(rd, rn, rm, xtopt, simm) \ + OP_SZ64|A64_OP_XREG(OP_ADD,rd,rn,rm,xtopt,simm) +#define A64_ADDX_REG(rd, rn, rm, stype, simm) \ + OP_SZ64|A64_ADD_REG(rd, rn, rm, stype, simm) +#define A64_ADDXS_REG(rd, rn, rm, stype, simm) \ + OP_SZ64|A64_ADDS_REG(rd, rn, rm, stype, simm) +#define A64_ORX_REG(rd, rn, rm, stype, simm) \ + OP_SZ64|A64_OR_REG(rd, rn, rm, stype, simm) +#define A64_TSTX_REG(rn, rm, stype, simm) \ + OP_SZ64|A64_TST_REG(rn, rm, stype, simm) +#define A64_MOVX_REG(rd, rm, stype, simm) \ + OP_SZ64|A64_MOV_REG(rd, rm, stype, simm) +#define A64_ADDX_IMM(rd, rn, imm12) \ + OP_SZ64|A64_ADD_IMM(rd, rn, imm12, 0) +#define A64_EORX_IMM(rd, rn, immr, imms) \ + OP_SZ64|OP_N64|A64_EOR_IMM(rd, rn, immr, imms) +#define A64_UXTX_IMM(rd, rn, bits) \ + OP_SZ64|OP_N64|A64_UXT_IMM(rd, rn, bits) +#define A64_LSRX_IMM(rd, rn, bits) \ + OP_SZ64|OP_N64|A64_LSR_IMM(rd, rn, bits)|(63<<10) + + +// XXX: tcache_ptr type for SVP and SH2 compilers differs.. +#define EMIT_PTR(ptr, x) \ + do { \ + *(u32 *)(ptr) = x; \ + ptr = (void *)((u8 *)(ptr) + sizeof(u32)); \ + } while (0) + +#define EMIT(op) \ + do { \ + EMIT_PTR(tcache_ptr, op); \ + COUNT_OP; \ + } while (0) + + +// if-then-else conditional execution helpers +#define JMP_POS(ptr) \ + ptr = tcache_ptr; \ + EMIT(A64_B(0)); + +#define JMP_EMIT(cond, ptr) { \ + u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr); \ + EMIT_PTR(ptr, A64_BCOND(cond, val_ & 0x001fffff)); \ +} + +#define JMP_EMIT_NC(ptr) { \ + u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr); \ + EMIT_PTR(ptr, A64_B(val_ & 0x0fffffff)); \ +} + +#define EMITH_JMP_START(cond) { \ + u8 *cond_ptr; \ + JMP_POS(cond_ptr) + +#define EMITH_JMP_END(cond) \ + JMP_EMIT(cond, cond_ptr); \ +} + +#define EMITH_JMP3_START(cond) { \ + u8 *cond_ptr, *else_ptr; \ + JMP_POS(cond_ptr) + +#define EMITH_JMP3_MID(cond) \ + JMP_POS(else_ptr); \ + JMP_EMIT(cond, cond_ptr); + +#define EMITH_JMP3_END() \ + JMP_EMIT_NC(else_ptr); \ +} + +// "simple" jump (no more then a few insns) +// ARM32 will use 
conditional instructions here +#define EMITH_SJMP_START EMITH_JMP_START +#define EMITH_SJMP_END EMITH_JMP_END + +#define EMITH_SJMP3_START EMITH_JMP3_START +#define EMITH_SJMP3_MID EMITH_JMP3_MID +#define EMITH_SJMP3_END EMITH_JMP3_END + +#define EMITH_SJMP2_START(cond) \ + EMITH_SJMP3_START(cond) +#define EMITH_SJMP2_MID(cond) \ + EMITH_SJMP3_MID(cond) +#define EMITH_SJMP2_END(cond) \ + EMITH_SJMP3_END() + + +// data processing, register +#define emith_move_r_r_ptr(d, s) \ + EMIT(A64_MOVX_REG(d, s, ST_LSL, 0)) +#define emith_move_r_r_ptr_c(cond, d, s) \ + emith_move_r_r_ptr(d, s) + +#define emith_move_r_r(d, s) \ + EMIT(A64_MOV_REG(d, s, ST_LSL, 0)) +#define emith_move_r_r_c(cond, d, s) \ + emith_move_r_r(d, s) + +#define emith_mvn_r_r(d, s) \ + EMIT(A64_MVN_REG(d, s, ST_LSL, 0)) + +#define emith_add_r_r_r_lsl_ptr(d, s1, s2, simm) do { \ + if (simm < 4) EMIT(A64_ADDX_XREG(d, s1, s2, XT_SXTW, simm)); \ + else EMIT(A64_ADDX_REG(d, s1, s2, ST_LSL, simm)); \ +} while (0) +#define emith_add_r_r_r_lsl(d, s1, s2, simm) \ + EMIT(A64_ADD_REG(d, s1, s2, ST_LSL, simm)) + +#define emith_addf_r_r_r_lsl(d, s1, s2, simm) \ + EMIT(A64_ADDS_REG(d, s1, s2, ST_LSL, simm)) + +#define emith_addf_r_r_r_lsr(d, s1, s2, simm) \ + EMIT(A64_ADDS_REG(d, s1, s2, ST_LSR, simm)) + +#define emith_sub_r_r_r_lsl(d, s1, s2, simm) \ + EMIT(A64_SUB_REG(d, s1, s2, ST_LSL, simm)) + +#define emith_subf_r_r_r_lsl(d, s1, s2, simm) \ + EMIT(A64_SUBS_REG(d, s1, s2, ST_LSL, simm)) + +#define emith_or_r_r_r_lsl(d, s1, s2, simm) \ + EMIT(A64_OR_REG(d, s1, s2, ST_LSL, simm)) + +#define emith_eor_r_r_r_lsl(d, s1, s2, simm) \ + EMIT(A64_EOR_REG(d, s1, s2, ST_LSL, simm)) + +#define emith_eor_r_r_r_lsr(d, s1, s2, simm) \ + EMIT(A64_EOR_REG(d, s1, s2, ST_LSR, simm)) + +#define emith_and_r_r_r_lsl(d, s1, s2, simm) \ + EMIT(A64_AND_REG(d, s1, s2, ST_LSL, simm)) + +#define emith_or_r_r_lsl(d, s, lslimm) \ + emith_or_r_r_r_lsl(d, d, s, lslimm) + +#define emith_eor_r_r_lsr(d, s, lsrimm) \ + emith_eor_r_r_r_lsr(d, d, s, 
lsrimm) + +#define emith_add_r_r_r(d, s1, s2) \ + emith_add_r_r_r_lsl(d, s1, s2, 0) + +#define emith_addf_r_r_r(d, s1, s2) \ + emith_addf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_sub_r_r_r(d, s1, s2) \ + emith_sub_r_r_r_lsl(d, s1, s2, 0) + +#define emith_subf_r_r_r(d, s1, s2) \ + emith_subf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_or_r_r_r(d, s1, s2) \ + emith_or_r_r_r_lsl(d, s1, s2, 0) + +#define emith_eor_r_r_r(d, s1, s2) \ + emith_eor_r_r_r_lsl(d, s1, s2, 0) + +#define emith_and_r_r_r(d, s1, s2) \ + emith_and_r_r_r_lsl(d, s1, s2, 0) + +#define emith_add_r_r_ptr(d, s) \ + emith_add_r_r_r_lsl_ptr(d, d, s, 0) +#define emith_add_r_r(d, s) \ + emith_add_r_r_r(d, d, s) + +#define emith_sub_r_r(d, s) \ + emith_sub_r_r_r(d, d, s) + +#define emith_neg_r_r(d, s) \ + EMIT(A64_NEG_REG(d, s, ST_LSL, 0)) + +#define emith_adc_r_r_r(d, s1, s2) \ + EMIT(A64_ADC_REG(d, s1, s2)) + +#define emith_adc_r_r(d, s) \ + EMIT(A64_ADC_REG(d, d, s)) + +#define emith_adcf_r_r_r(d, s1, s2) \ + EMIT(A64_ADCS_REG(d, s1, s2)) + +#define emith_sbcf_r_r_r(d, s1, s2) \ + EMIT(A64_SBCS_REG(d, s1, s2)) + +#define emith_and_r_r(d, s) \ + emith_and_r_r_r(d, d, s) +#define emith_and_r_r_c(cond, d, s) \ + emith_and_r_r(d, s) + +#define emith_or_r_r(d, s) \ + emith_or_r_r_r(d, d, s) + +#define emith_eor_r_r(d, s) \ + emith_eor_r_r_r(d, d, s) + +#define emith_tst_r_r_ptr(d, s) \ + EMIT(A64_TSTX_REG(d, s, ST_LSL, 0)) +#define emith_tst_r_r(d, s) \ + EMIT(A64_TST_REG(d, s, ST_LSL, 0)) + +#define emith_teq_r_r(d, s) do { \ + int _t = rcache_get_tmp(); \ + emith_eor_r_r_r(_t, d, s); \ + emith_cmp_r_imm(_t, 0); \ + rcache_free_tmp(_t); \ +} while (0) + +#define emith_cmp_r_r(d, s) \ + EMIT(A64_CMP_REG(d, s, ST_LSL, 0)) + +#define emith_addf_r_r(d, s) \ + emith_addf_r_r_r(d, d, s) + +#define emith_subf_r_r(d, s) \ + emith_subf_r_r_r(d, d, s) + +#define emith_adcf_r_r(d, s) \ + emith_adcf_r_r_r(d, d, s) + +#define emith_sbcf_r_r(d, s) \ + emith_sbcf_r_r_r(d, d, s) + +#define emith_negcf_r_r(d, s) \ + 
emith_sbcf_r_r_r(d, Z0, s) + + +// move immediate + +static void emith_move_imm64(int r, int wx, int64_t imm) +{ + int sz64 = wx ? OP_SZ64:0; + int c, s; + + if (!imm) { + EMIT(sz64|A64_MOVZ_IMM(r, imm, 0)); + return; + } + if (imm && -imm == (u16)-imm) { + EMIT(sz64|A64_MOVN_IMM(r, ~imm, 0)); + return; + } + + for (c = s = 0; s < (wx ? 4:2) && imm; s++, imm >>= 16) + if ((u16)(imm)) { + if (c++) EMIT(sz64|A64_MOVK_IMM(r, imm, s)); + else EMIT(sz64|A64_MOVZ_IMM(r, imm, s)); + } +} + +#define emith_move_r_ptr_imm(r, imm) \ + emith_move_imm64(r, 1, (intptr_t)(imm)) + +#define emith_move_r_imm(r, imm) \ + emith_move_imm64(r, 0, (s32)(imm)) +#define emith_move_r_imm_c(cond, r, imm) \ + emith_move_r_imm(r, imm) + + +// arithmetic, immediate +static void emith_arith_imm(int op, int wx, int rd, int rn, s32 imm) +{ + u32 sz64 = wx ? OP_SZ64:0; + + if (imm < 0) { + op ^= (OP_ADD ^ OP_SUB); + imm = -imm; + } + if (imm == 0) { + // value 0, must emit if op is *S or source isn't dest + if ((op & 1) || rd != rn) + EMIT(sz64|A64_OP_IMM12(op, rd, rn, 0, 0)); + } else if (imm >> 24) { + // value too large + int _t = rcache_get_tmp(); + emith_move_r_imm(_t, imm); + EMIT(sz64|A64_OP_REG(op, 0, rd, rn, _t, ST_LSL, 0)); + rcache_free_tmp(_t); + } else { + int rs = rn; + if ((imm) & 0x000fff) { + EMIT(sz64|A64_OP_IMM12(op, rd, rs, imm, 0)); rs = rd; + } + if ((imm) & 0xfff000) { + EMIT(sz64|A64_OP_IMM12(op, rd, rs, imm >>12, 1)); + } + } +} + +#define emith_add_r_imm(r, imm) \ + emith_arith_imm(OP_ADD, 0, r, r, imm) +#define emith_add_r_imm_c(cond, r, imm) \ + emith_add_r_imm(r, imm) + +#define emith_addf_r_imm(r, imm) \ + emith_arith_imm(OP_ADDS, 0, r, r, imm) + +#define emith_sub_r_imm(r, imm) \ + emith_arith_imm(OP_SUB, 0, r, r, imm) +#define emith_sub_r_imm_c(cond, r, imm) \ + emith_sub_r_imm(r, imm) + +#define emith_subf_r_imm(r, imm) \ + emith_arith_imm(OP_SUBS, 0, r, r, imm) + + +#define emith_adc_r_imm(r, imm) do { \ + int _t = rcache_get_tmp(); \ + emith_move_r_imm(_t, imm); \ 
+ emith_adc_r_r(r, _t); \ + rcache_free_tmp(_t); \ +} while (0) + +#define emith_adcf_r_imm(r, imm) do { \ + int _t = rcache_get_tmp(); \ + emith_move_r_imm(_t, imm); \ + emith_adcf_r_r(r, _t); \ + rcache_free_tmp(_t); \ +} while (0) + +#define emith_cmp_r_imm(r, imm) do { \ + u32 op_ = OP_SUBS, imm_ = (u8)imm; \ + if ((s8)imm_ < 0) { \ + imm_ = (u8)-imm_; \ + op_ = OP_ADDS; \ + } \ + EMIT(A64_OP_IMM12(op_, Z0, r, imm_, 0)); \ +} while (0) + + +#define emith_add_r_r_ptr_imm(d, s, imm) \ + emith_arith_imm(OP_ADD, 1, d, s, imm) + +#define emith_add_r_r_imm(d, s, imm) \ + emith_arith_imm(OP_ADD, 0, d, s, imm) + +#define emith_sub_r_r_imm(d, s, imm) \ + emith_arith_imm(OP_SUB, 0, d, s, imm) +#define emith_sub_r_r_imm_c(cond, d, s, imm) \ + emith_sub_r_r_imm(d, s, imm) + +#define emith_subf_r_r_imm(d, s, imm) \ + emith_arith_imm(OP_SUBS, 0, d, s, imm) + + +// logical, immediate; the value describes a bitmask, see ARMv8 ArchRefMan +// NB: deal only with simple masks 0{n}1{m}0{o} or 1{n}0{m}1{o}, 0 16) { + emith_move_r_imm(_t, ~imm); + EMIT(sz64|A64_OP_REG(op, 1, rd, rn, _t, ST_LSL, 0)); + } else { + emith_move_r_imm(_t, imm); + EMIT(sz64|A64_OP_REG(op, 0, rd, rn, _t, ST_LSL, 0)); + } + rcache_free_tmp(_t); + } +} + +#define emith_and_r_imm(r, imm) \ + emith_log_imm(OP_AND, 0, r, r, imm) + +#define emith_or_r_imm(r, imm) \ + emith_log_imm(OP_OR, 0, r, r, imm) +#define emith_or_r_imm_c(cond, r, imm) \ + emith_or_r_imm(r, imm) + +#define emith_eor_r_imm_ptr(r, imm) \ + emith_log_imm(OP_EOR, 1, r, r, imm) +#define emith_eor_r_imm_ptr_c(cond, r, imm) \ + emith_eor_r_imm_ptr(r, imm) + +#define emith_eor_r_imm(r, imm) \ + emith_log_imm(OP_EOR, 0, r, r, imm) +#define emith_eor_r_imm_c(cond, r, imm) \ + emith_eor_r_imm(r, imm) + +/* NB: BIC #imm not available in A64; use AND #~imm instead */ +#define emith_bic_r_imm(r, imm) \ + emith_log_imm(OP_AND, 0, r, r, ~(imm)) +#define emith_bic_r_imm_c(cond, r, imm) \ + emith_bic_r_imm(r, imm) + +#define emith_tst_r_imm(r, imm) \ + 
emith_log_imm(OP_ANDS, 0, Z0, r, imm) +#define emith_tst_r_imm_c(cond, r, imm) \ + emith_tst_r_imm(r, imm) + +#define emith_and_r_r_imm(d, s, imm) \ + emith_log_imm(OP_AND, 0, d, s, imm) + +#define emith_or_r_r_imm(d, s, imm) \ + emith_log_imm(OP_OR, 0, d, s, imm) + +#define emith_eor_r_r_imm(d, s, imm) \ + emith_log_imm(OP_EOR, 0, d, s, imm) + + +// shift +#define emith_lsl(d, s, cnt) \ + EMIT(A64_LSL_IMM(d, s, cnt)) + +#define emith_lsr(d, s, cnt) \ + EMIT(A64_LSR_IMM(d, s, cnt)) + +#define emith_asr(d, s, cnt) \ + EMIT(A64_ASR_IMM(d, s, cnt)) + +#define emith_ror(d, s, cnt) \ + EMIT(A64_ROR_IMM(d, s, cnt)) +#define emith_ror_c(cond, d, s, cnt) \ + emith_ror(d, s, cnt) + +#define emith_rol(d, s, cnt) \ + EMIT(A64_ROR_IMM(d, s, 32-(cnt))) + +// NB: shift with carry not directly supported in A64 :-|. +#define emith_lslf(d, s, cnt) do { \ + if ((cnt) > 1) { \ + emith_lsl(d, s, cnt-1); \ + emith_addf_r_r_r(d, d, d); \ + } else if ((cnt) > 0) \ + emith_addf_r_r_r(d, s, s); \ +} while (0) + +#define emith_lsrf(d, s, cnt) do { \ + EMIT(A64_RBIT_REG(d, s)); \ + emith_lslf(d, d, cnt); \ + EMIT(A64_RBIT_REG(d, d)); \ +} while (0) + +#define emith_asrf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_asr(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_addf_r_r_r(Z0, _s, _s); \ + EMIT(A64_RBIT_REG(d, _s)); \ + emith_adcf_r_r_r(d, d, d); \ + EMIT(A64_RBIT_REG(d, d)); \ + } \ +} while (0) + +#define emith_rolf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_rol(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_addf_r_r_r(d, _s, _s); \ + emith_adc_r_r_r(d, d, Z0); \ + } \ +} while (0) + +#define emith_rorf(d, s, cnt) do { \ + if ((cnt) > 0) { \ + emith_ror(d, s, cnt); \ + emith_addf_r_r_r(Z0, d, d); \ + } \ +} while (0) + +#define emith_rolcf(d) \ + emith_adcf_r_r(d, d) + +#define emith_rorcf(d) do { \ + EMIT(A64_RBIT_REG(d, d)); \ + emith_adcf_r_r(d, d); \ + EMIT(A64_RBIT_REG(d, d)); \ +} while (0) + +// signed/unsigned 
extend +#define emith_clear_msb(d, s, count) /* bits to clear */ \ + EMIT(A64_UXT_IMM(d, s, 32-(count))) +#define emith_clear_msb_c(cond, d, s, count) \ + emith_clear_msb(d, s, count) + +#define emith_sext(d, s, count) /* bits to keep */ \ + EMIT(A64_SXT_IMM(d, s, count)) + +// multiply Rd = Rn*Rm (+ Ra) +#define emith_mul(d, s1, s2) \ + EMIT(A64_MUL(d, s1, s2)) + +// NB: must combine/split Xd from/into 2 Wd's; play safe and clear upper bits +#define emith_combine64(dlo, dhi) \ + EMIT(A64_UXTX_IMM(dlo, dlo, 32)); \ + EMIT(A64_ORX_REG(dlo, dlo, dhi, ST_LSL, 32)); + +#define emith_split64(dlo, dhi) \ + EMIT(A64_LSRX_IMM(dhi, dlo, 32)); \ + EMIT(A64_UXTX_IMM(dlo, dlo, 32)); + +#define emith_mul_u64(dlo, dhi, s1, s2) do { \ + EMIT(A64_UMULL(dlo, s1, s2)); \ + emith_split64(dlo, dhi); \ +} while (0) + +#define emith_mul_s64(dlo, dhi, s1, s2) do { \ + EMIT(A64_SMULL(dlo, s1, s2)); \ + emith_split64(dlo, dhi); \ +} while (0) + +#define emith_mula_s64(dlo, dhi, s1, s2) do { \ + emith_combine64(dlo, dhi); \ + EMIT(A64_SMADDL(dlo, s1, s2, dlo)); \ + emith_split64(dlo, dhi); \ +} while (0) +#define emith_mula_s64_c(cond, dlo, dhi, s1, s2) \ + emith_mula_s64(dlo, dhi, s1, s2) + +// load/store. 
offs has 9 bits signed, hence larger offs may use a temp +static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) +{ + if (o9 >= -256 && o9 < 256) { + EMIT(A64_OP_LDST(sz, ld, A64_LDST_AM(0,_,o9), mode, rn, rd)); + } else if (mode == AM_IDXPRE) { + emith_add_r_r_ptr_imm(rn, rn, o9); + EMIT(A64_OP_LDST(sz, ld, A64_LDST_AM(0,_,0), AM_IDX, rn, rd)); + } else if (mode == AM_IDXPOST) { + EMIT(A64_OP_LDST(sz, ld, A64_LDST_AM(0,_,0), AM_IDX, rn, rd)); + emith_add_r_r_ptr_imm(rn, rn, o9); + } else { + int _t = rcache_get_tmp(); + emith_add_r_r_ptr_imm(_t, rn, o9); + EMIT(A64_OP_LDST(sz, ld, A64_LDST_AM(0,_,0), AM_IDX, _t, rd)); + rcache_free_tmp(_t); + } +} + +#define emith_read_r_r_offs_ptr(r, rs, offs) \ + emith_ldst_offs(AM_X, r, rs, offs, LT_LD, AM_IDX) +#define emith_read_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_read_r_r_offs_ptr(r, rs, offs) + +#define emith_read_r_r_offs(r, rs, offs) \ + emith_ldst_offs(AM_W, r, rs, offs, LT_LD, AM_IDX) +#define emith_read_r_r_offs_c(cond, r, rs, offs) \ + emith_read_r_r_offs(r, rs, offs) + +#define emith_read_r_r_r_ptr(r, rs, rm) \ + EMIT(A64_LDSTX_REG(r, rs, rm, LT_LD, XT_SXTW)) + +#define emith_read_r_r_r(r, rs, rm) \ + EMIT(A64_LDST_REG(r, rs, rm, LT_LD, XT_SXTW)) +#define emith_read_r_r_r_c(cond, r, rs, rm) \ + emith_read_r_r_r(r, rs, rm) + +#define emith_read_r_r_r_ptr_wb(r, rs, rm) do { \ + emith_read_r_r_r_ptr(r, rs, rm); \ + emith_add_r_r_ptr(rs, rm); \ +} while (0) +#define emith_read_r_r_r_wb(r, rs, rm) do { \ + emith_read_r_r_r(r, rs, rm); \ + emith_add_r_r_ptr(rs, rm); \ +} while (0) + +#define emith_read8_r_r_offs(r, rs, offs) \ + emith_ldst_offs(AM_B, r, rs, offs, LT_LD, AM_IDX) +#define emith_read8_r_r_offs_c(cond, r, rs, offs) \ + emith_read8_r_r_offs(r, rs, offs) + +#define emith_read8_r_r_r(r, rs, rm) \ + EMIT(A64_LDSTB_REG(r, rs, rm, LT_LD, XT_SXTW)) +#define emith_read8_r_r_r_c(cond, r, rs, rm) \ + emith_read8_r_r_r(r, rs, rm) + +#define emith_read16_r_r_offs(r, rs, offs) \ + 
emith_ldst_offs(AM_H, r, rs, offs, LT_LD, AM_IDX) +#define emith_read16_r_r_offs_c(cond, r, rs, offs) \ + emith_read16_r_r_offs(r, rs, offs) + +#define emith_read16_r_r_r(r, rs, rm) \ + EMIT(A64_LDSTH_REG(r, rs, rm, LT_LD, XT_SXTW)) +#define emith_read16_r_r_r_c(cond, r, rs, rm) \ + emith_read16_r_r_r(r, rs, rm) + +#define emith_read8s_r_r_offs(r, rs, offs) \ + emith_ldst_offs(AM_B, r, rs, offs, LT_LDS, AM_IDX) +#define emith_read8s_r_r_offs_c(cond, r, rs, offs) \ + emith_read8s_r_r_offs(r, rs, offs) + +#define emith_read8s_r_r_r(r, rs, rm) \ + EMIT(A64_LDSTB_REG(r, rs, rm, LT_LDS, XT_SXTW)) +#define emith_read8s_r_r_r_c(cond, r, rs, rm) \ + emith_read8s_r_r_r(r, rs, rm) + +#define emith_read16s_r_r_offs(r, rs, offs) \ + emith_ldst_offs(AM_H, r, rs, offs, LT_LDS, AM_IDX) +#define emith_read16s_r_r_offs_c(cond, r, rs, offs) \ + emith_read16s_r_r_offs(r, rs, offs) + +#define emith_read16s_r_r_r(r, rs, rm) \ + EMIT(A64_LDSTH_REG(r, rs, rm, LT_LDS, XT_SXTW)) +#define emith_read16s_r_r_r_c(cond, r, rs, rm) \ + emith_read16s_r_r_r(r, rs, rm) + + +#define emith_write_r_r_offs_ptr(r, rs, offs) \ + emith_ldst_offs(AM_X, r, rs, offs, LT_ST, AM_IDX) +#define emith_write_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_write_r_r_offs_ptr(r, rs, offs) + +#define emith_write_r_r_r_ptr(r, rs, rm) \ + EMIT(A64_LDSTX_REG(r, rs, rm, LT_ST, XT_SXTW)) +#define emith_write_r_r_r_ptr_c(cond, r, rs, rm) \ + emith_write_r_r_r_ptr(r, rs, rm) + +#define emith_write_r_r_offs(r, rs, offs) \ + emith_ldst_offs(AM_W, r, rs, offs, LT_ST, AM_IDX) +#define emith_write_r_r_offs_c(cond, r, rs, offs) \ + emith_write_r_r_offs(r, rs, offs) + +#define emith_write_r_r_r(r, rs, rm) \ + EMIT(A64_LDST_REG(r, rs, rm, LT_ST, XT_SXTW)) +#define emith_write_r_r_r_c(cond, r, rs, rm) \ + emith_write_r_r_r(r, rs, rm) + +#define emith_write_r_r_r_ptr_wb(r, rs, rm) do { \ + emith_write_r_r_r_ptr(r, rs, rm); \ + emith_add_r_r_ptr(rs, rm); \ +} while (0) +#define emith_write_r_r_r_wb(r, rs, rm) do { \ + emith_write_r_r_r(r, 
rs, rm); \ + emith_add_r_r_ptr(rs, rm); \ +} while (0) + +#define emith_ctx_read_ptr(r, offs) \ + emith_read_r_r_offs_ptr(r, CONTEXT_REG, offs) + +#define emith_ctx_read(r, offs) \ + emith_read_r_r_offs(r, CONTEXT_REG, offs) +#define emith_ctx_read_c(cond, r, offs) \ + emith_ctx_read(r, offs) + +#define emith_ctx_write_ptr(r, offs) \ + emith_write_r_r_offs_ptr(r, CONTEXT_REG, offs) + +#define emith_ctx_write(r, offs) \ + emith_write_r_r_offs(r, CONTEXT_REG, offs) + +#define emith_ctx_read_multiple(r, offs, cnt, tmpr) do { \ + int r_ = r, offs_ = offs, cnt_ = cnt; \ + for (; cnt_ > 0; r_++, offs_ += 4, cnt_--) \ + emith_ctx_read(r_, offs_); \ +} while (0) + +#define emith_ctx_write_multiple(r, offs, cnt, tmpr) do { \ + int r_ = r, offs_ = offs, cnt_ = cnt; \ + for (; cnt_ > 0; r_++, offs_ += 4, cnt_--) \ + emith_ctx_write(r_, offs_); \ +} while (0) + +// push pairs; NB: SP must be 16 byte aligned (HW requirement!) +#define emith_push2(r1, r2) \ + EMIT(A64_LDSTPX_IMM(SP, r1, r2, -2*8, LT_ST, AM_IDXPRE)) +#define emith_pop2(r1, r2) \ + EMIT(A64_LDSTPX_IMM(SP, r1, r2, 2*8, LT_LD, AM_IDXPOST)) + +// function call handling +#define emith_save_caller_regs(mask) do { \ + int _c, _r1, _r2; u32 _m = mask & 0x3ffff; \ + if (__builtin_parity(_m) == 1) _m |= 0x40000; /* hardware align */ \ + for (_c = HOST_REGS, _r1 = -1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + if (_m & (1 << _c)) { \ + _r2 = _r1, _r1 = _c; \ + if (_r2 != -1) { \ + emith_push2(_r1, _r2); \ + _r1 = -1; \ + } \ + } \ +} while (0) + +#define emith_restore_caller_regs(mask) do { \ + int _c, _r1, _r2; u32 _m = mask & 0x3ffff; \ + if (__builtin_parity(_m) == 1) _m |= 0x40000; /* hardware align */ \ + for (_c = 0, _r1 = -1; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ + if (_m & (1 << _c)) { \ + _r2 = _r1, _r1 = _c; \ + if (_r2 != -1) { \ + emith_pop2(_r2, _r1); \ + _r1 = -1; \ + } \ + } \ +} while (0) + +#define host_arg2reg(rd, arg) \ + rd = arg + +#define emith_pass_arg_r(arg, reg) \ + emith_move_r_r(arg, 
reg) + +#define emith_pass_arg_imm(arg, imm) \ + emith_move_r_imm(arg, imm) + +// branching; NB: A64 B.cond has only +/- 1MB range +#define emith_bcond(ptr, patch, cond, target) do { \ + u32 disp_ = (u8 *)target - (u8 *)ptr; \ + if (disp_ >= 0xfff00000 || disp_ <= 0x000fffff) { /* can use near B.c */ \ + EMIT_PTR(ptr, A64_BCOND(cond, disp_ & 0x001fffff)); \ + if (patch) EMIT_PTR(ptr, A64_NOP); /* reserve space for far B */ \ + } else { /* far branch if near branch isn't possible */ \ + EMIT_PTR(ptr, A64_BCOND(emith_invert_cond(cond), 8)); \ + EMIT_PTR(ptr, A64_B((disp_ - 4) & 0x0fffffff)); \ + } \ +} while (0) + +#define emith_jump(target) do {\ + u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \ + EMIT(A64_B(disp_ & 0x0fffffff)); \ +} while (0) + +#define emith_jump_patchable(target) \ + emith_jump(target) + +#define emith_jump_cond(cond, target) \ + emith_bcond(tcache_ptr, 0, cond, target) + +#define emith_jump_cond_patchable(cond, target) \ + emith_bcond(tcache_ptr, 1, cond, target) + +#define emith_jump_patch(ptr, target) ({ \ + u32 *ptr_ = (u32 *)ptr; \ + u32 disp_ = (u8 *)(target) - (u8 *)(ptr_); \ + int cond_ = ptr_[0] & 0xf; \ + if ((ptr_[0] & 0xff000000) == 0x54000000) { /* B.cond */ \ + if (ptr_[1] != A64_NOP) cond_ = emith_invert_cond(cond_); \ + emith_bcond(ptr_, 1, cond_, target); \ + } else if (ptr_[0] & 0x80000000) \ + EMIT_PTR(ptr_, A64_BL((disp_) & 0x0fffffff)); \ + else EMIT_PTR(ptr_, A64_B((disp_) & 0x0fffffff)); \ + (u8 *)ptr; \ +}) + +#define emith_jump_reg(r) \ + EMIT(A64_BR(r)) +#define emith_jump_reg_c(cond, r) \ + emith_jump_reg(r) + +#define emith_jump_ctx(offs) do { \ + int _t = rcache_get_tmp(); \ + emith_ctx_read_ptr(_t, offs); \ + emith_jump_reg(_t); \ + rcache_free_tmp(_t); \ +} while (0) +#define emith_jump_ctx_c(cond, offs) \ + emith_jump_ctx(offs) + +#define emith_call(target) do { \ + u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \ + EMIT(A64_BL(disp_ & 0x0fffffff)); \ +} while (0) +#define emith_call_cond(cond, target) \ + 
emith_call(target) + +#define emith_call_reg(r) \ + EMIT(A64_BLR(r)) + +#define emith_call_ctx(offs) do { \ + int _t = rcache_get_tmp(); \ + emith_ctx_read_ptr(_t, offs); \ + emith_call_reg(_t); \ + rcache_free_tmp(_t); \ +} while (0) + +#define emith_call_link(r, target) do { \ + EMIT(A64_ADRXLIT_IMM(r, 8)); \ + emith_jump(target); \ +} while (0) + +#define emith_call_cleanup() /**/ + +#define emith_ret() \ + EMIT(A64_RET(LR)) +#define emith_ret_c(cond) \ + emith_ret() + +#define emith_ret_to_ctx(offs) \ + emith_ctx_write_ptr(LR, offs) + +// NB: pushes r or r18 for SP hardware alignment +#define emith_push_ret(r) do { \ + int r_ = (r >= 0 ? r : 18); \ + emith_push2(r_, LR); \ +} while (0) + +#define emith_pop_and_ret(r) do { \ + int r_ = (r >= 0 ? r : 18); \ + emith_pop2(r_, LR); \ + emith_ret(); \ +} while (0) + + +// emitter ABI stuff +#define emith_pool_check() /**/ +#define emith_pool_commit(j) /**/ +#define emith_insn_ptr() ((u8 *)tcache_ptr) +#define emith_flush() /**/ +#define host_instructions_updated(base, end) __builtin___clear_cache(base, end) +#define emith_jump_patch_size() 8 + + +// SH2 drc specific +#define emith_sh2_drc_entry() do { \ + emith_push2(LR, FP); \ + emith_push2(28, 27); \ + emith_push2(26, 25); \ + emith_push2(24, 23); \ + emith_push2(22, 21); \ + emith_push2(20, 19); \ +} while (0) +#define emith_sh2_drc_exit() do { \ + emith_pop2(20, 19); \ + emith_pop2(22, 21); \ + emith_pop2(24, 23); \ + emith_pop2(26, 25); \ + emith_pop2(28, 27); \ + emith_pop2(LR, FP); \ + emith_ret(); \ +} while (0) + +// NB: assumes a is in arg0, tab, func and mask are temp +#define emith_sh2_rcall(a, tab, func, mask) do { \ + emith_lsr(mask, a, SH2_READ_SHIFT); \ + EMIT(A64_ADDX_REG(tab, tab, mask, ST_LSL, 4)); \ + emith_read_r_r_offs_ptr(func, tab, 0); \ + emith_read_r_r_offs(mask, tab, 8); \ + EMIT(A64_ADDXS_REG(func, func, func, ST_LSL, 0)); \ +} while (0) + +// NB: assumes a, val are in arg0 and arg1, tab and func are temp +#define emith_sh2_wcall(a, val, 
tab, func) do { \ + emith_lsr(func, a, SH2_WRITE_SHIFT); \ + emith_lsl(func, func, 3); \ + emith_read_r_r_r_ptr(func, tab, func); \ + emith_move_r_r_ptr(2, CONTEXT_REG); /* arg2 */ \ + emith_jump_reg(func); \ +} while (0) + +#define emith_sh2_delay_loop(cycles, reg) do { \ + int sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); \ + int t1 = rcache_get_tmp(); \ + int t2 = rcache_get_tmp(); \ + int t3 = rcache_get_tmp(); \ + /* if (sr < 0) return */ \ + emith_asrf(t2, sr, 12); \ + EMITH_JMP_START(DCOND_LE); \ + /* turns = sr.cycles / cycles */ \ + emith_move_r_imm(t3, (u32)((1ULL<<32) / (cycles)) + 1); \ + emith_mul_u64(t1, t2, t2, t3); /* multiply by 1/x */ \ + rcache_free_tmp(t3); \ + if (reg >= 0) { \ + /* if (reg <= turns) turns = reg-1 */ \ + t3 = rcache_get_reg(reg, RC_GR_RMW, NULL); \ + emith_cmp_r_r(t3, t2); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_sub_r_r_imm_c(DCOND_LS, t2, t3, 1); \ + EMITH_SJMP_END(DCOND_HI); \ + /* if (reg <= 1) turns = 0 */ \ + emith_cmp_r_imm(t3, 1); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_move_r_imm_c(DCOND_LS, t2, 0); \ + EMITH_SJMP_END(DCOND_HI); \ + /* reg -= turns */ \ + emith_sub_r_r(t3, t2); \ + } \ + /* sr.cycles -= turns * cycles; */ \ + emith_move_r_imm(t1, cycles); \ + emith_mul(t1, t2, t1); \ + emith_sub_r_r_r_lsl(sr, sr, t1, 12); \ + EMITH_JMP_END(DCOND_LE); \ + rcache_free_tmp(t1); \ + rcache_free_tmp(t2); \ +} while (0) + +/* + * if Q + * t = carry(Rn += Rm) + * else + * t = carry(Rn -= Rm) + * T ^= t + */ +#define emith_sh2_div1_step(rn, rm, sr) do { \ + int tmp_ = rcache_get_tmp(); \ + emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ + EMITH_SJMP3_START(DCOND_EQ); \ + emith_addf_r_r(rn, rm); \ + emith_adc_r_r_r(tmp_, Z0, Z0); \ + EMITH_SJMP3_MID(DCOND_EQ); \ + emith_subf_r_r(rn, rm); \ + emith_adc_r_r_r(tmp_, Z0, Z0); \ + emith_eor_r_imm(tmp_, 1); \ + EMITH_SJMP3_END(); \ + emith_eor_r_r(sr, tmp_); \ + rcache_free_tmp(tmp_); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. 
rn, rm must be TEMP */ +#define emith_sh2_macl(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* MACH top 16 bits unused if saturated. sign ext for overfl detect */ \ + emith_sext(mh, mh, 16); \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 17 bits of MACH aren't all 1 or 0 */ \ + /* to check: add MACH[15] to MACH[31:16]. this is 0 if no overflow */ \ + emith_asrf(rn, mh, 16); /* sum = (MACH>>16) + ((MACH>>15)&1) */ \ + emith_adcf_r_imm(rn, 0); /* (MACH>>15) is in carry after shift */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> ov */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x0000); /* -overflow */ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x8000); \ + EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> +ovl */ \ + emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0xffffffff */ \ + emith_sub_r_imm_c(DCOND_GT, mh, 1); /* 0x00007fff */ \ + EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macw(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* XXX: MACH should be untouched when S is set? */ \ + emith_asr(mh, ml, 31); /* sign ext MACL to MACH for ovrfl check */ \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 33 bits of MACH:MACL aren't all 1 or 0 */ \ + /* to check: add MACL[31] to MACH. this is 0 if no overflow */ \ + emith_lsr(rn, ml, 31); \ + emith_addf_r_r(rn, mh); /* sum = MACH + ((MACL>>31)&1) */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> overflow */ \ + /* XXX: LSB signalling only in SH1, or in SH2 too? 
*/ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x00000001); /* LSB of MACH */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x80000000); /* negative ovrfl */ \ + EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> positive ovrfl */ \ + emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0x7fffffff */ \ + EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +#define emith_write_sr(sr, srcr) do { \ + emith_lsr(sr, sr, 10); \ + emith_or_r_r_r_lsl(sr, sr, srcr, 22); \ + emith_ror(sr, sr, 22); \ +} while (0) + +#define emith_carry_to_t(srr, is_sub) do { \ + emith_lsr(sr, sr, 1); \ + emith_adc_r_r(sr, sr); \ + if (is_sub) /* SUB has inverted C on ARM */ \ + emith_eor_r_imm(sr, 1); \ +} while (0) + +#define emith_tpop_carry(sr, is_sub) do { \ + if (is_sub) \ + emith_eor_r_imm(sr, 1); \ + emith_lsrf(sr, sr, 1); \ +} while (0) + +#define emith_tpush_carry(sr, is_sub) do { \ + emith_adc_r_r(sr, sr); \ + if (is_sub) \ + emith_eor_r_imm(sr, 1); \ +} while (0) + +#ifdef T +// T bit handling +#define emith_invert_cond(cond) \ + ((cond) ^ 1) + +static void emith_clr_t_cond(int sr) +{ + emith_bic_r_imm(sr, T); +} + +static void emith_set_t_cond(int sr, int cond) +{ + EMITH_SJMP_START(emith_invert_cond(cond)); + emith_or_r_imm_c(cond, sr, T); + EMITH_SJMP_END(emith_invert_cond(cond)); +} + +#define emith_get_t_cond() -1 + +#define emith_sync_t(sr) ((void)sr) + +#define emith_invalidate_t() + +static void emith_set_t(int sr, int val) +{ + if (val) + emith_or_r_imm(sr, T); + else + emith_bic_r_imm(sr, T); +} + +static int emith_tst_t(int sr, int tf) +{ + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; +} +#endif diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 01fc6ae1f..0083dc427 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -466,6 +466,56 @@ static cache_reg_t cache_regs[] = { { 7, HRF_REG }, }; +#elif defined(__aarch64__) +#include "../drc/emit_arm64.c" + +static guest_reg_t guest_regs[] = { + // SHR_R0 .. 
SHR_SP + { GRF_STATIC,20 }, { GRF_STATIC,21 }, { 0 } , { 0 } , + { 0 } , { 0 } , { 0 } , { 0 } , + { 0 } , { 0 } , { 0 } , { 0 } , + { 0 } , { 0 } , { 0 } , { 0 } , + // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, + // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, + { 0 } , { 0 } , { 0 } , { GRF_STATIC, 22 }, + { 0 } , { 0 } , { 0 } , { 0 } , +}; + +// AAPCS64: params: r0-r7, return: r0-r1, temp: r8-r17, saved: r19-r29 +// saved: r18 (for platform use) +// since drc never needs more than 4 parameters, r4-r7 are treated as temp. +static cache_reg_t cache_regs[] = { + { 17, HRF_TEMP }, // temps + { 16, HRF_TEMP }, + { 15, HRF_TEMP }, + { 14, HRF_TEMP }, + { 13, HRF_TEMP }, + { 12, HRF_TEMP }, + { 11, HRF_TEMP }, + { 10, HRF_TEMP }, + { 9, HRF_TEMP }, + { 8, HRF_TEMP }, + { 7, HRF_TEMP }, + { 6, HRF_TEMP }, + { 5, HRF_TEMP }, + { 4, HRF_TEMP }, + { 3, HRF_TEMP }, // params + { 2, HRF_TEMP }, + { 1, HRF_TEMP }, + { 0, HRF_TEMP }, // RET_REG + { 22, HRF_LOCKED }, // statics + { 21, HRF_LOCKED }, + { 20, HRF_LOCKED }, + { 29, HRF_REG }, // other regs + { 28, HRF_REG }, + { 27, HRF_REG }, + { 26, HRF_REG }, + { 25, HRF_REG }, + { 24, HRF_REG }, + { 23, HRF_REG }, + { 22, HRF_REG }, +}; + #elif defined(__mips__) #include "../drc/emit_mips.c" diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 09f4ae979..1ad922b79 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -36,6 +36,8 @@ unsigned short scan_block(unsigned int base_pc, int is_slave, // XXX MUST match definitions in cpu/sh2/compiler.c #if defined(__arm__) #define DRC_SR_REG r10 +#elif defined(__aarch64__) +#define DRC_SR_REG r22 #elif defined(__mips__) #define DRC_SR_REG s6 #elif defined(__i386__) From e666ac97c433589cbc1860a45977a1146eb94bd2 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 16 Aug 2019 15:14:41 +0200 Subject: [PATCH 051/174] various small fixes and optimsations --- Makefile | 8 ++++++++ Makefile.libretro | 3 ++- cpu/drc/emit_arm.c | 2 ++ cpu/drc/emit_arm64.c | 1 + cpu/drc/emit_mips.c | 5 +++-- 
cpu/drc/emit_x86.c | 2 ++ cpu/sh2/compiler.c | 16 ++++++++-------- pico/32x/memory.c | 27 ++++++++++++--------------- tools/mkoffsets.sh | 2 +- 9 files changed, 39 insertions(+), 27 deletions(-) diff --git a/Makefile b/Makefile index 78de3c562..837ad8f27 100644 --- a/Makefile +++ b/Makefile @@ -254,6 +254,14 @@ pico/cd/cd_file.o: CFLAGS += -fno-strict-aliasing pico/cd/pcm.o: CFLAGS += -fno-strict-aliasing pico/cd/LC89510.o: CFLAGS += -fno-strict-aliasing pico/cd/gfx_cd.o: CFLAGS += -fno-strict-aliasing +ifeq (1,$(use_sh2drc)) +ifneq (,$(findstring -flto,$(CFLAGS))) +# if using the DRC, memory and sh2soc use a global register variable to avoid +# saving and reloading the SH2 SR. However, this collides with the use of LTO. +pico/32x/memory.o: CFLAGS += -fno-lto +pico/32x/sh2soc.o: CFLAGS += -fno-lto +endif +endif # fame needs ~2GB of RAM to compile on gcc 4.8 # on x86, this is reduced by ~300MB when debug info is off (but not on ARM) diff --git a/Makefile.libretro b/Makefile.libretro index 89cfea1c4..0cc97695d 100644 --- a/Makefile.libretro +++ b/Makefile.libretro @@ -41,7 +41,7 @@ CFLAGS += -I platform/libretro/libretro-common/include/vfs STATIC_LINKING:= 0 TARGET_NAME := picodrive LIBM := -lm -GIT_VERSION ?= " $(shell git rev-parse --short HEAD || echo unknown)" +GIT_VERSION ?= $(shell git rev-parse --short HEAD || echo unknown) ifneq ($(GIT_VERSION)," unknown") CFLAGS += -DGIT_VERSION=\"$(GIT_VERSION)\" endif @@ -616,6 +616,7 @@ else ifeq ($(platform), gcw0) use_fame = 1 use_drz80 = 0 use_cz80 = 1 + use_sh2drc = 1 # Windows MSVC 2017 all architectures else ifneq (,$(findstring windows_msvc2017,$(platform))) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 72542a3fc..a4aa2ec62 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -1174,6 +1174,8 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define host_arg2reg(rd, arg) \ rd = arg +#define emith_rw_offs_max() 0xff + /* SH2 drc specific */ /* pushes r12 for eabi alignment */ 
#define emith_sh2_drc_entry() \ diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 90010d803..a67f6819c 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -1117,6 +1117,7 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #define emith_flush() /**/ #define host_instructions_updated(base, end) __builtin___clear_cache(base, end) #define emith_jump_patch_size() 8 +#define emith_rw_offs_max() 0xff // SH2 drc specific diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index f56b89a31..91d493b59 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -394,7 +394,7 @@ int emith_flg_noV; // V flag known not to be set // NB: for adcf and sbcf, carry-in must be dealt with separately (see there) static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub) { - if (sub && rd == FNZ && rt && rs) // is this cmp_r_r? + if (sub && rd == FNZ && rt > AT && rs > AT) // is this cmp_r_r? emith_flg_rs = rs, emith_flg_rt = rt; else emith_flg_rs = emith_flg_rt = 0; @@ -858,7 +858,7 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) // NB: mips32r2 has EXT and INS #define emith_clear_msb(d, s, count) /* bits to clear */ do { \ u32 t; \ - if ((count) > 16) { \ + if ((count) >= 16) { \ t = (count) - 16; \ t = 0xffff >> t; \ emith_and_r_r_imm(d, s, t); \ @@ -1262,6 +1262,7 @@ static int emith_cond_check(int cond, int *r) // NB: mips32r2 has SYNCI #define host_instructions_updated(base, end) __builtin___clear_cache(base, end) #define emith_jump_patch_size() 4 +#define emith_rw_offs_max() 0x7fff // SH2 drc specific #define emith_sh2_drc_entry() do { \ diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index a40c0f8ca..2177541cd 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -986,6 +986,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define host_instructions_updated(base, end) +#define emith_rw_offs_max() 0xffffffff + #ifdef __x86_64__ #define HOST_REGS 16 diff 
--git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 0083dc427..677c8adf9 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -419,8 +419,8 @@ typedef struct { static int rcache_get_tmp(void); static void rcache_free_tmp(int hr); -// Note: cache_regs[] must have at least the amount of REG and TEMP registers -// used by handlers in worst case (currently 4). +// Note: cache_regs[] must have at least the amount of HRF_REG registers used +// by handlers in worst case (currently 4). // Register assignment goes by ABI convention. Caller save registers are TEMP, // the others are either static or REG. SR must be static, R0 very recommended. // VBR, PC, PR must not be static (read from context in utils). @@ -2418,7 +2418,7 @@ static void rcache_init(void) // NB may return either REG or TEMP static int emit_get_rbase_and_offs(SH2 *sh2, sh2_reg_e r, int rmode, u32 *offs) { - uptr omask = 0xff; // offset mask, XXX: ARM oriented.. + uptr omask = emith_rw_offs_max(); // offset mask u32 mask = 0; u32 a; int poffs; @@ -4447,7 +4447,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) static void sh2_generate_utils(void) { - int arg0, arg1, arg2, arg3, sr, tmp; + int arg0, arg1, arg2, arg3, sr, tmp, tmp2; host_arg2reg(arg0, 0); host_arg2reg(arg1, 1); @@ -4689,18 +4689,18 @@ static void sh2_generate_utils(void) emith_sub_r_imm(tmp, 4*2); rcache_clean(); // push SR - tmp = rcache_get_reg_arg(0, SHR_SP, NULL); - emith_add_r_imm(tmp, 4); + tmp = rcache_get_reg_arg(0, SHR_SP,&tmp2); + emith_add_r_r_imm(tmp, tmp2, 4); tmp = rcache_get_reg_arg(1, SHR_SR, NULL); emith_clear_msb(tmp, tmp, 22); emith_move_r_r_ptr(arg2, CONTEXT_REG); - rcache_invalidate(); + rcache_invalidate_tmp(); emith_call(p32x_sh2_write32); // XXX: use sh2_drc_write32? 
// push PC rcache_get_reg_arg(0, SHR_SP, NULL); emith_ctx_read(arg1, SHR_PC * 4); emith_move_r_r_ptr(arg2, CONTEXT_REG); - rcache_invalidate(); + rcache_invalidate_tmp(); emith_call(p32x_sh2_write32); // update I, cycles, do callback emith_ctx_read(arg1, offsetof(SH2, pending_level)); diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 7148d41c9..8d5ca7258 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -197,24 +197,19 @@ static NOINLINE u32 sh2_poll_read(u32 a, u32 d, unsigned int cycles, SH2* sh2) // fetch oldest write to address from fifo, but stop when reaching the present idx = sh2_poll_rd[hix]; while (idx != sh2_poll_wr[hix] && CYCLES_GE(cycles, fifo[idx].cycles)) { -// int oidx = idx; p = &fifo[idx]; idx = (idx+1) % PFIFO_SZ; - if (CYCLES_GT(cycles, p->cycles+80)) { - // drop older fifo stores that may cause synchronisation problems. - // NB unfortunately this cycle diff is quite sensitive: - // observed in Brutal Unleashed: min 80, observed in Afterburner: max 110 - sh2_poll_rd[hix] = idx; - } else if (p->a == a) { - // replace current data with fifo value and discard fifo entry - if (cpu != p->cpu) { + if (cpu != p->cpu) { + if (CYCLES_GT(cycles, p->cycles+80)) { + // drop older fifo stores that may cause synchronisation problems. + sh2_poll_rd[hix] = idx; + } else if (p->a == a) { + // replace current data with fifo value and discard fifo entry d = p->d; p->a = -1; -// if (oidx == sh2_poll_rd[hix]) -// sh2_poll_rd[hix] = idx; + break; } - break; } } return d; @@ -224,7 +219,6 @@ static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2) { int hix = (a >> 1) % PFIFO_CNT; struct sh2_poll_fifo *fifo = sh2_poll_fifo[hix]; - struct sh2_poll_fifo *p = &fifo[sh2_poll_wr[hix]]; struct sh2_poll_fifo *q = &fifo[(sh2_poll_wr[hix]-1) % PFIFO_SZ]; int cpu = sh2 ? 
sh2->is_slave+1 : 0; @@ -233,15 +227,16 @@ static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2) // intermediate values that may cause synchronisation problems. // NB this can take an eternity on m68k: mov.b , needs // 28 m68k-cycles (~80 sh2-cycles) to complete (observed in Metal Head) - if (q->a == a && !CYCLES_GT(cycles,q->cycles+30)) { + if (q->a == a && sh2_poll_wr[hix] != sh2_poll_rd[hix] && !CYCLES_GT(cycles,q->cycles+30)) { q->d = d; } else { // store write to poll address in fifo + fifo[sh2_poll_wr[hix]] = + (struct sh2_poll_fifo){ .cycles = cycles, .a = a, .d = d, .cpu = cpu }; sh2_poll_wr[hix] = (sh2_poll_wr[hix]+1) % PFIFO_SZ; if (sh2_poll_wr[hix] == sh2_poll_rd[hix]) // fifo overflow, discard oldest value sh2_poll_rd[hix] = (sh2_poll_rd[hix]+1) % PFIFO_SZ; - *p = (struct sh2_poll_fifo){ .cycles = cycles, .a = a, .d = d, .cpu = cpu }; } } @@ -2369,6 +2364,8 @@ void PicoMemSetup32x(void) sh2_drc_mem_setup(&msh2); sh2_drc_mem_setup(&ssh2); + memset(sh2_poll_rd, 0, sizeof(sh2_poll_rd)); + memset(sh2_poll_wr, 0, sizeof(sh2_poll_wr)); // z80 hack z80_map_set(z80_write_map, 0x8000, 0xffff, z80_md_bank_write_32x, 1); diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index a573f7a43..e76325936 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ -11,7 +11,7 @@ ENDIAN= # compile with target C compiler and extract value from .rodata section compile_rodata () { - $CC $CFLAGS -I .. -c /tmp/getoffs.c -o /tmp/getoffs.o || exit 1 + $CC $CFLAGS -I .. 
-shared /tmp/getoffs.c -o /tmp/getoffs.o || exit 1 # find the name of the .rodata section (in case -fdata-sections is used) rosect=$(readelf -S /tmp/getoffs.o | grep '\.rodata' | sed 's/^[^.]*././;s/ .*//') From 6a38d505d626fb794cec4cf16ab7ecb06df446c9 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 16 Aug 2019 17:25:23 +0200 Subject: [PATCH 052/174] fix for mkoffsets without multiarch binutils --- tools/mkoffsets.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index e76325936..3b4c076d0 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ -16,8 +16,8 @@ compile_rodata () rosect=$(readelf -S /tmp/getoffs.o | grep '\.rodata' | sed 's/^[^.]*././;s/ .*//') # read out .rodata section as hex string (should be only 4 or 8 bytes) - objcopy --dump-section $rosect=/tmp/getoffs.ro /tmp/getoffs.o || exit 1 - ro=$(xxd -ps /tmp/getoffs.ro) + ro=$(readelf -x $rosect /tmp/getoffs.o | grep '0x' | cut -c14-48 | + tr -d ' \n') if [ "$ENDIAN" = "le" ]; then # swap needed for le target hex="" From 7abc11c714364658b648647e970f72e1e3a20fd5 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 20 Aug 2019 22:26:39 +0200 Subject: [PATCH 053/174] cleanup config files, copyright stuff --- config.aarch64 | 9 ++++----- config.caanoo | 8 ++++---- config.caanoo47 | 8 ++++---- config.dingux | 7 +++---- config.dingux54 | 7 +++---- config.gcw0 | 7 +++---- config.gp2x | 6 +++--- config.gp2x47 | 6 +++--- config.i386 | 8 ++++---- config.x86 | 8 ++++---- pico/32x/draw_arm.S | 1 + pico/32x/memory.c | 1 + pico/memory_arm.S | 1 + tools/mkoffsets.sh | 9 ++++++--- 14 files changed, 44 insertions(+), 42 deletions(-) diff --git a/config.aarch64 b/config.aarch64 index 70a6fe300..9631d64ec 100644 --- a/config.aarch64 +++ b/config.aarch64 @@ -4,12 +4,11 @@ CC = aarch64-linux-gnu-gcc CXX = aarch64-linux-gnu-g++ AS = aarch64-linux-gnu-as STRIP = aarch64-linux-gnu-strip -CFLAGS += -I/usr/include/SDL -CFLAGS += -D_GNU_SOURCE=1 -D_REENTRANT 
-Wno-unused-result -fno-stack-protector +CFLAGS += -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result ASFLAGS += -LDFLAGS += -LDLIBS += -lSDL -lasound -lpng -lz -lm -lstdc++ -ldl +LDFLAGS += # --sysroot ${HOME}/opt/aarch64/debian-arm64 +LDLIBS += -lSDL -lasound -lpng -lz -lm -ldl ARCH = aarch64 PLATFORM = generic -SOUND_DRIVERS = alsa +SOUND_DRIVERS = oss alsa sdl diff --git a/config.caanoo b/config.caanoo index dd053bc5f..1ffc54da8 100644 --- a/config.caanoo +++ b/config.caanoo @@ -4,11 +4,11 @@ CC = arm-gph-linux-gnueabi-gcc CXX = arm-gph-linux-gnueabi-g++ AS = arm-gph-linux-gnueabi-as STRIP = arm-gph-linux-gnueabi-strip -CFLAGS += -mfloat-abi=soft -mcpu=arm920t -mtune=arm920t -fno-stack-protector -D__GP2X__ -CFLAGS += -fno-gcse -funswitch-loops -fweb -ftree-loop-im #-fpredictive-commoning -ftree-loop-distribution -frename-registers -CFLAGS += -I${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/include -I${HOME}/src/gp2x/armroot-eabi/include +CFLAGS += -I${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/include -I${HOME}/src/gp2x/armroot-eabi/include -D__GP2X__ -Wno-unused-result +CFLAGS += -mfloat-abi=soft -mcpu=arm920t -mtune=arm920t -mno-thumb-interwork -fno-stack-protector -fno-common +CFLAGS += -finline-limit=42 -fipa-pta -fno-ipa-pure-const ASFLAGS += -mfloat-abi=soft -mcpu=arm920t -LDFLAGS += -B${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/lib/gcc/arm-gph-linux-gnueabi/4.2.4 -B${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L${HOME}/src/gp2x/armroot-eabi/lib -static +LDFLAGS += --sysroot ${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root -L${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L${HOME}/src/gp2x/armroot-eabi/lib -static LDLIBS += -lpng -lm -ldl ARCH = 
arm diff --git a/config.caanoo47 b/config.caanoo47 index 2c0ee5aff..5bcf86084 100644 --- a/config.caanoo47 +++ b/config.caanoo47 @@ -4,11 +4,11 @@ CC = arm-linux-gnueabi-gcc CXX = arm-linux-gnueabi-g++ AS = arm-linux-gnueabi-as STRIP = arm-linux-gnueabi-strip -CFLAGS += -mfloat-abi=soft -mcpu=arm920t -mtune=arm920t -Wno-unused-result -fno-stack-protector -D__GP2X__ -CFLAGS += -fno-gcse -funswitch-loops -fweb -ftree-loop-im #-fpredictive-commoning -ftree-loop-distribution -frename-registers -CFLAGS += -I${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/include -I${HOME}/src/gp2x/armroot-eabi/include +CFLAGS += -I${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/include -I${HOME}/src/gp2x/armroot-eabi/include -D__GP2X__ -Wno-unused-result +CFLAGS += -mfloat-abi=soft -mcpu=arm920t -mtune=arm920t -mno-thumb-interwork -fno-stack-protector -fno-common +CFLAGS += -finline-limit=42 -fipa-pta -fno-ipa-sra -fno-ipa-pure-const ASFLAGS += -mfloat-abi=soft -mcpu=arm920t -LDFLAGS += -B${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/lib/gcc/arm-gph-linux-gnueabi/4.2.4 -B${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -static +LDFLAGS += -B${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/lib/gcc/arm-gph-linux-gnueabi/4.2.4 -B${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L${HOME}/src/gp2x/armroot-eabi/lib -static LDLIBS += -lpng -lm -ldl ARCH = arm diff --git a/config.dingux b/config.dingux index 8aca06a63..d1ec7fe57 100644 --- a/config.dingux +++ b/config.dingux @@ -4,12 +4,11 @@ CC = mipsel-linux-gcc CXX = mipsel-linux-g++ AS = mipsel-linux-as STRIP = mipsel-linux-strip -CFLAGS += -I${HOME}/opt/opendingux-toolchain/usr/include/ -CFLAGS += 
-I${HOME}/opt/opendingux-toolchain/usr/include/SDL +CFLAGS += -I${HOME}/opt/opendingux-toolchain/usr/include/ -I${HOME}/opt/opendingux-toolchain/usr/include/SDL CFLAGS += -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -fno-stack-protector ASFLAGS += -LDFLAGS += -LDLIBS += -B${HOME}/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=${HOME}/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=${HOME}/opt/opendingux-toolchain/lib -lSDL -lasound -lpng -lm -lstdc++ -ldl +LDFLAGS += --sysroot ${HOME}/opt/opendingux-toolchain -L${HOME}/opt/opendingux-toolchain/lib +LDLIBS += -lSDL -lasound -lpng -lz -lm -ldl ARCH = mipsel PLATFORM = opendingux diff --git a/config.dingux54 b/config.dingux54 index 5f292652b..423cbd17b 100644 --- a/config.dingux54 +++ b/config.dingux54 @@ -4,12 +4,11 @@ CC = mipsel-linux-gnu-gcc CXX = mipsel-linux-gnu-g++ AS = mipsel-linux-gnu-as STRIP = mipsel-linux-gnu-strip -CFLAGS += -I${HOME}/opt/opendingux-toolchain/usr/include/ -CFLAGS += -I${HOME}/opt/opendingux-toolchain/usr/include/SDL +CFLAGS += -I${HOME}/opt/opendingux-toolchain/usr/include/ -I${HOME}/opt/opendingux-toolchain/usr/include/SDL CFLAGS += -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -fno-stack-protector ASFLAGS += -LDFLAGS += -LDLIBS += -B${HOME}/opt/opendingux-toolchain/usr/lib -B${HOME}/opt/opendingux-toolchain/lib -Wl,-rpath-link=${HOME}/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=${HOME}/opt/opendingux-toolchain/lib -lSDL -lasound -lpng -lz -lm -lstdc++ -ldl +LDFLAGS += -B${HOME}/opt/opendingux-toolchain/usr/lib -B${HOME}/opt/opendingux-toolchain/lib -Wl,-rpath-link=${HOME}/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=${HOME}/opt/opendingux-toolchain/lib +LDLIBS += -lSDL -lasound -lpng -lz -lm -ldl ARCH = mipsel PLATFORM = opendingux diff --git a/config.gcw0 b/config.gcw0 index 1d2ccef0a..78f7c3a47 100644 --- a/config.gcw0 +++ b/config.gcw0 @@ -4,12 +4,11 @@ CC = mipsel-gcw0-linux-uclibc-gcc CXX = mipsel-gcw0-linux-uclibc-g++ AS = mipsel-gcw0-linux-uclibc-as STRIP = 
mipsel-gcw0-linux-uclibc-strip -CFLAGS += -I${HOME}/opt/gcw0-toolchain/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/ -CFLAGS += -I${HOME}/opt/gcw0-toolchain/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/SDL +CFLAGS += -I${HOME}/opt/gcw0-toolchain/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/ -I${HOME}/opt/gcw0-toolchain/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/SDL CFLAGS += -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -fno-stack-protector ASFLAGS += -LDFLAGS += -LDLIBS += -B${HOME}/opt/gcw0-toolchain/usr/lib -Wl,-rpath-link=${HOME}/opt/gcw0-toolchain/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/lib -Wl,-rpath-link=${HOME}/opt/gcw0-toolchain/usr/mipsel-gcw0-linux-uclibc/sysroot/lib -lSDL -lasound -lpng -lz -lm -lstdc++ -ldl +LDFLAGS += --sysroot ${HOME}/opt/gcw0-toolchain/usr/mipsel-gcw0-linux-uclibc/sysroot +LDLIBS += -lSDL -lasound -lpng -lz -lm -ldl ARCH = mipsel PLATFORM = opendingux diff --git a/config.gp2x b/config.gp2x index 248d73aa1..84d2f93d0 100644 --- a/config.gp2x +++ b/config.gp2x @@ -4,9 +4,9 @@ CC = arm-open2x-linux-gcc CXX = arm-open2x-linux-g++ AS = arm-open2x-linux-as STRIP = arm-open2x-linux-strip -CFLAGS += -msoft-float -mcpu=arm920t -mtune=arm920t -D__GP2X__ -CFLAGS += -I${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I${HOME}/src/gp2x/armroot/include -CFLAGS += -fno-gcse -funswitch-loops -fweb -ftree-loop-im #-fpredictive-commoning -ftree-loop-distribution -frename-registers +CFLAGS += -I${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I${HOME}/src/gp2x/armroot/include -D__GP2X__ -Wno-unused-result +CFLAGS += -msoft-float -mcpu=arm920t -mtune=arm920t +CFLAGS += -finline-limit=42 -fipa-cp -fno-ipa-pure-const ASFLAGS += -mcpu=arm920t -mfloat-abi=soft LDFLAGS += --sysroot ${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L${HOME}/src/gp2x/armroot/lib -static LDLIBS += -lpng -lm -ldl diff --git a/config.gp2x47 
b/config.gp2x47 index 632515ee7..7ce3d9a93 100644 --- a/config.gp2x47 +++ b/config.gp2x47 @@ -4,12 +4,12 @@ CC = arm-linux-gnueabi-gcc CXX = arm-linux-gnueabi-g++ AS = arm-linux-gnueabi-as STRIP = arm-linux-gnueabi-strip +CFLAGS += -I${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I${HOME}/src/gp2x/armroot/include -D__GP2X__ -Wno-unused-result CFLAGS += -mabi=apcs-gnu -mfloat-abi=soft -mfpu=fpa -mcpu=arm920t -mtune=arm920t -CFLAGS += -Wno-unused-result -D__GP2X__ -mno-thumb-interwork -fno-stack-protector -fno-common -CFLAGS += -I${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I${HOME}/src/gp2x/armroot/include +CFLAGS += -mno-thumb-interwork -fno-stack-protector -fno-common CFLAGS += -finline-limit=42 -fipa-pta -fno-ipa-sra -fno-ipa-pure-const ASFLAGS += -mabi=apcs-gnu -mfloat-abi=soft -mfpu=fpa -mcpu=arm920t -LDFLAGS += -mabi=apcs-gnu -mfpu=fpa -B${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L${HOME}/src/gp2x/armroot/lib -static +LDFLAGS += -mabi=apcs-gnu -mfpu=fpa -B${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/usr/lib -L${HOME}/src/gp2x/armroot/lib -static LDLIBS += -lpng -lm -ldl ARCH = arm diff --git a/config.i386 b/config.i386 index ce07b103e..9c8c2e652 100644 --- a/config.i386 +++ b/config.i386 @@ -4,11 +4,11 @@ CC = gcc CXX = g++ AS = as STRIP = strip -CFLAGS += -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -m32 # -pg +CFLAGS += -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -m32 ASFLAGS += -LDFLAGS += -m32 #-pg -LDLIBS += -L/usr/lib/i386-linux-gnu -L${HOME}/opt/lib32 -lSDL-1.2 -lasound -lpng -lz -lm -ldl +LDFLAGS += -m32 -L/usr/lib/i386-linux-gnu -L${HOME}/opt/lib32 +LDLIBS += 
-lSDL-1.2 -lasound -lpng -lz -lm -ldl ARCH = i386 PLATFORM = generic -SOUND_DRIVERS = oss alsa sdl +SOUND_DRIVERS = oss alsa sdl diff --git a/config.x86 b/config.x86 index 287b82d32..454400110 100644 --- a/config.x86 +++ b/config.x86 @@ -4,11 +4,11 @@ CC = gcc CXX = g++ AS = as STRIP = strip -CFLAGS += -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result # -pg +CFLAGS += -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result ASFLAGS += -LDFLAGS += #-pg -LDLIBS += -L/usr/lib/x86_64-linux-gnu -lSDL-1.2 -lasound -lpng -lz -lm -ldl +LDFLAGS += -L/usr/lib/x86_64-linux-gnu +LDLIBS += -lSDL-1.2 -lasound -lpng -lz -lm -ldl ARCH = x86_64 PLATFORM = generic -SOUND_DRIVERS = oss alsa sdl +SOUND_DRIVERS = oss alsa sdl diff --git a/pico/32x/draw_arm.S b/pico/32x/draw_arm.S index e0cdcbe50..f351d8e00 100644 --- a/pico/32x/draw_arm.S +++ b/pico/32x/draw_arm.S @@ -1,6 +1,7 @@ @* @* PicoDrive @* (C) notaz, 2010 +@* (C) kub, 2019 @* @* This work is licensed under the terms of MAME license. @* See COPYING file in the top-level directory. diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 8d5ca7258..3e11cbcba 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -1,6 +1,7 @@ /* * PicoDrive * (C) notaz, 2009,2010,2013 + * (C) kub, 2019 * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. diff --git a/pico/memory_arm.S b/pico/memory_arm.S index 07d6a128c..333780c10 100644 --- a/pico/memory_arm.S +++ b/pico/memory_arm.S @@ -1,6 +1,7 @@ /* * PicoDrive * (C) notaz, 2006-2009 + * (C) kub, 2019 * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index 3b4c076d0..8f2d888c2 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ -11,7 +11,10 @@ ENDIAN= # compile with target C compiler and extract value from .rodata section compile_rodata () { - $CC $CFLAGS -I .. 
-shared /tmp/getoffs.c -o /tmp/getoffs.o || exit 1 + # $CC $CFLAGS -I .. -shared /tmp/getoffs.c -o /tmp/getoffs.o || exit 1 + echo 'void dummy(void) { asm(""::"r" (&val)); }' >> /tmp/getoffs.c + $CC $CFLAGS -I .. -nostdlib -Wl,-edummy /tmp/getoffs.c \ + -o /tmp/getoffs.o || exit 1 # find the name of the .rodata section (in case -fdata-sections is used) rosect=$(readelf -S /tmp/getoffs.o | grep '\.rodata' | sed 's/^[^.]*././;s/ .*//') @@ -40,13 +43,13 @@ get_define () # prefix struct member member... name=$(echo $* | sed 's/ /_/g') echo '#include "pico/pico_int.h"' > /tmp/getoffs.c echo "static const struct $struct p;" >> /tmp/getoffs.c - echo "const int offs = (char *)&p.$field - (char*)&p;" >>/tmp/getoffs.c + echo "const int val = (char *)&p.$field - (char*)&p;" >>/tmp/getoffs.c compile_rodata line=$(printf "#define %-20s 0x%04x" $prefix$name $rodata) } # determine endianess -echo "const int one = 1;" >/tmp/getoffs.c +echo "const int val = 1;" >/tmp/getoffs.c compile_rodata ENDIAN=$(if [ "$rodata" -eq 1 ]; then echo be; else echo le; fi) # output header From 6afb2662bdd130b1f108a88ff7397e6fdda9fd13 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 21 Aug 2019 18:27:26 +0200 Subject: [PATCH 054/174] configuration changes and README --- Makefile | 4 +- README.md | 112 +++++++++++++++++++++++++++++++++ config.dingux | 2 +- config.dingux54 | 2 +- config.gcw0 | 2 +- config.gp2x | 4 +- config.gp2x47 | 3 +- configure | 104 ++++++++++++++++++------------ cpu/drc/emit_arm.c | 1 + cpu/drc/emit_x86.c | 1 + platform/common/helix/Makefile | 3 +- 11 files changed, 189 insertions(+), 49 deletions(-) create mode 100644 README.md diff --git a/Makefile b/Makefile index 837ad8f27..a5adaf090 100644 --- a/Makefile +++ b/Makefile @@ -256,8 +256,8 @@ pico/cd/LC89510.o: CFLAGS += -fno-strict-aliasing pico/cd/gfx_cd.o: CFLAGS += -fno-strict-aliasing ifeq (1,$(use_sh2drc)) ifneq (,$(findstring -flto,$(CFLAGS))) -# if using the DRC, memory and sh2soc use a global register variable to avoid -# 
saving and reloading the SH2 SR. However, this collides with the use of LTO. +# if using the DRC, memory and sh2soc directly use the DRC register for SH2 SR +# to avoid saving and reloading it. However, this collides with the use of LTO. pico/32x/memory.o: CFLAGS += -fno-lto pico/32x/sh2soc.o: CFLAGS += -fno-lto endif diff --git a/README.md b/README.md new file mode 100644 index 000000000..d0d7259f8 --- /dev/null +++ b/README.md @@ -0,0 +1,112 @@ +This is my foray into dynamic recompilation using PicoDrive, a +Megadrive / Genesis / Sega CD / Mega CD / 32X / SMS emulator. + +I added support for MIPS (mips32r1) and ARM64 (aarch64) to the recompiler, as +well as spent much effort to optimize the code generated by the DRC. +I also optimized SH2 memory access inside the emulator, and did some work on +M68K/SH2 CPU synchronization to fix some problems and speed up the emulator. + +It got a bit out of hand. I ended up doing fixes and optimzations all over the +place, mainly for 32X and CD, 32X graphics handling, and probably some more, +see the commit history. + +### compiling + +I mainly worked with standalone PicoDrive versions as created by configure/make. +A list of platforms for which this is possible can be obtained with + +> configure --help + +If you want to build an executable for a unixoid platform not listed in the +platform list, just use + +> configure --platform=generic + +If DRC is available for the platform, it should be enabled automatically. 
+ +For other platforms using a cross-compiling toolchain I used this, +assuming $TC points to the appropriate cross compile toolchain directory: + +platform|toolchain|configure command +--------|---------|----------------- +gp2x,wiz,caanoo|open2x|CROSS_COMPILE=arm-open2x-linux- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -fno-stack-protector -fno-common -finline-limit=42" LDFLAGS="--sysroot $TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x +gp2x,wiz,caanoo|open2x with ubuntu arm gcc 4.7|CROSS_COMPILE=arm-linux-gnueabi- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -fno-stack-protector -fno-common -finline-limit=42 -fipa-pta" LDFLAGS="-B$TC/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x +opendingux|opendingux|CROSS_COMPILE=mipsel-linux- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL -fno-stack-protector -fno-common -finline-limit=42 -fipa-pta" LDFLAGS="--sysroot $TC -L$TC/lib" ./configure --platform=opendingux +opendingux|opendingux with ubuntu mips gcc 5.4|CROSS_COMPILE=mipsel-linux-gnu- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL -fno-stack-protector -fno-common -finline-limit=42 -fipa-pta" LDFLAGS="-B$TC/usr/lib -B$TC/lib -Wl,-rpath-link=$TC/usr/lib -Wl,-rpath-link=$TC/lib" ./configure --platform=opendingux +gcw0|gcw0|CROSS_COMPILE=mipsel-gcw0-linux-uclibc- CFLAGS="-I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include -I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/SDL -fno-stack-protector -fno-common -finline-limit=42 -fipa-pta" LDFLAGS="--sysroot $TC/usr/mipsel-gcw0-linux-uclibc/sysroot" ./configure --platform=gcw0 + +For gp2x, wiz, and caanoo you may need to compile libpng first, and additionally +this patch may need to be applied to the cpu/cyclone submodule: +> diff --git a/OpArith.cpp b/OpArith.cpp +> index 
96c7e0d..09517b8 100644 +> --- a/OpArith.cpp +> +++ b/OpArith.cpp +> @@ -425,7 +425,7 @@ int OpAbcd(int op) +> ot(" add r1,r1,r0\n"); +> ot(" add r1,r1,r6\n"); +> ot(" mov r12,r1\n"); +> - ot(" addhi r12,#6 ;@ Decimal adjust units\n"); +> + ot(" addhi r12,r12,#6 ;@ Decimal adjust units\n"); +> ot(" tst r1,#0x80\n"); +> ot(" orreq r10,r10,#0x10000000 ;@ Undefined V behavior\n"); +> ot(" cmp r12,#0x9f\n"); +> @@ -452,7 +452,7 @@ int OpAbcd(int op) +> ot(" cmp r1,r12\n"); +> ot(" orrlt r10,r10,#0x20000000 ;@ C\n"); +> ot(" cmp r1,#0xff\n"); +> - ot(" addhi r1,#0xa0\n"); +> + ot(" addhi r1,r1,#0xa0\n"); +> ot(" sub r12,r1,r12\n"); +> ot(" movs r0,r12,lsl #24\n"); +> ot(" bicmi r10,r10,#0x10000000 ;@ Undefined V behavior part II\n"); +> diff --git a/OpLogic.cpp b/OpLogic.cpp +> index 012e35a..d40d814 100644 +> --- a/OpLogic.cpp +> +++ b/OpLogic.cpp +> @@ -74,12 +74,12 @@ const char *TestCond(int m68k_cc, int invert) +> break; +> case 0x0e: // gt +> ot(" eor r0,r10,r10,lsl #3 ;@ gt: !Z && N == V\n"); +> - ot(" orrs r0,r10,lsl #1\n"); +> + ot(" orrs r0,r0,r10,lsl #1\n"); +> cond="pl", icond="mi"; +> break; +> case 0x0f: // le +> ot(" eor r0,r10,r10,lsl #3 ;@ le: Z || N != V\n"); +> - ot(" orrs r0,r10,lsl #1\n"); +> + ot(" orrs r0,r0,r10,lsl #1\n"); +> cond="mi", icond="pl"; +> break; +> default: + +After configure, compile with + +> make opk # for opendingux and gcw0 +> +> make # for anything else + +### helix MP3 decoder + +For 32 bit ARM platforms, there is the possibility to compile the helix MP3 +decoder into a shared library to be able to use MP3 audio files with CD games. +The helix source files aren't supplied because of licensing issues. However, if +you have obtained the sources, put them into the platform/common/helix +directory, set CROSS to your cross compiler prefix (e.g. arm-linux-gnueabi-) +and LIBGCC to your cross compiler's libgcc.a +(e.g. 
/usr/lib/gcc-cross/arm-linux-gnueabi/4.7/libgcc.a), and compile with + +> make -C platform/common/helix CROSS=$CROSS LIBGCC=$LIBGCC + +Copy the resulting ${CROSS}helix_mp3.so as libhelix.so to the directory where +the PicoDrive binary is. + +### installing + +You need to install the resulting binary onto your device manually. +For opendingux and gcw0, copy the opk to your SD card. +For gp2x, wiz and caanoo, the easiest way is to unpack +[PicoDrive_191.zip](http://notaz.gp2x.de/releases/PicoDrive/PicoDrive_191.zip) +on you SD card and replace the PicoDrive binary. + +Send bug reports, fixes etc to +Kai-Uwe Bloem diff --git a/config.dingux b/config.dingux index d1ec7fe57..b981bd3f9 100644 --- a/config.dingux +++ b/config.dingux @@ -1,5 +1,5 @@ # Automatically generated by configure -# Configured with: './configure' '--platform=generic' +# Configured with: './configure' '--platform=opendingux' CC = mipsel-linux-gcc CXX = mipsel-linux-g++ AS = mipsel-linux-as diff --git a/config.dingux54 b/config.dingux54 index 423cbd17b..a232d952b 100644 --- a/config.dingux54 +++ b/config.dingux54 @@ -1,5 +1,5 @@ # Automatically generated by configure -# Configured with: './configure' '--platform=generic' +# Configured with: './configure' '--platform=opendingux' CC = mipsel-linux-gnu-gcc CXX = mipsel-linux-gnu-g++ AS = mipsel-linux-gnu-as diff --git a/config.gcw0 b/config.gcw0 index 78f7c3a47..cebe79a10 100644 --- a/config.gcw0 +++ b/config.gcw0 @@ -1,5 +1,5 @@ # Automatically generated by configure -# Configured with: './configure' '--platform=generic' +# Configured with: './configure' '--platform=gcw0' CC = mipsel-gcw0-linux-uclibc-gcc CXX = mipsel-gcw0-linux-uclibc-g++ AS = mipsel-gcw0-linux-uclibc-as diff --git a/config.gp2x b/config.gp2x index 84d2f93d0..cf99bd774 100644 --- a/config.gp2x +++ b/config.gp2x @@ -4,8 +4,8 @@ CC = arm-open2x-linux-gcc CXX = arm-open2x-linux-g++ AS = arm-open2x-linux-as STRIP = arm-open2x-linux-strip -CFLAGS += 
-I${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I${HOME}/src/gp2x/armroot/include -D__GP2X__ -Wno-unused-result -CFLAGS += -msoft-float -mcpu=arm920t -mtune=arm920t +CFLAGS += -I${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I${HOME}/src/gp2x/armroot/include -D__GP2X__ +CFLAGS += -mfloat-abi=soft -mcpu=arm920t -mtune=arm920t -mno-thumb-interwork -fno-stack-protector -fno-common CFLAGS += -finline-limit=42 -fipa-cp -fno-ipa-pure-const ASFLAGS += -mcpu=arm920t -mfloat-abi=soft LDFLAGS += --sysroot ${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L${HOME}/src/gp2x/armroot/lib -static diff --git a/config.gp2x47 b/config.gp2x47 index 7ce3d9a93..8a86e850c 100644 --- a/config.gp2x47 +++ b/config.gp2x47 @@ -5,8 +5,7 @@ CXX = arm-linux-gnueabi-g++ AS = arm-linux-gnueabi-as STRIP = arm-linux-gnueabi-strip CFLAGS += -I${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I${HOME}/src/gp2x/armroot/include -D__GP2X__ -Wno-unused-result -CFLAGS += -mabi=apcs-gnu -mfloat-abi=soft -mfpu=fpa -mcpu=arm920t -mtune=arm920t -CFLAGS += -mno-thumb-interwork -fno-stack-protector -fno-common +CFLAGS += -mabi=apcs-gnu -mfloat-abi=soft -mfpu=fpa -mcpu=arm920t -mtune=arm920t -mno-thumb-interwork -fno-stack-protector -fno-common CFLAGS += -finline-limit=42 -fipa-pta -fno-ipa-sra -fno-ipa-pure-const ASFLAGS += -mabi=apcs-gnu -mfloat-abi=soft -mfpu=fpa -mcpu=arm920t LDFLAGS += -mabi=apcs-gnu -mfpu=fpa -B${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/usr/lib -L${HOME}/src/gp2x/armroot/lib -static diff --git a/configure b/configure index 1310ab2c9..c82fe2053 100755 --- a/configure +++ b/configure @@ -22,6 +22,13 @@ compile_binary() $c >> config.log 2>&1 } +check_option() +{ + echo 'void test(void) { }' >$TMPC + compile_object 
$1 || return 1 + return 0 +} + check_define() { $CC -E -dD $CFLAGS pico/arm_features.h | grep -q $1 || return 1 @@ -31,17 +38,18 @@ check_define() # setting options to "yes" or "no" will make that choice default, # "" means "autodetect". -platform_list="generic pandora gp2x opendingux rpi1 rpi2" +platform_list="generic pandora gp2x wiz caanoo opendingux gcw0 rpi1 rpi2" platform="generic" sound_driver_list="oss alsa sdl" sound_drivers="" have_armv5="" have_armv6="" have_armv7="" +have_arm_oabi="" have_arm_neon="" have_libavcodec="" need_sdl="no" -need_xlib="no" +need_zlib="no" # these are for known platforms optimize_cortexa8="no" optimize_cortexa7="no" @@ -54,7 +62,7 @@ CC="${CC-${CROSS_COMPILE}gcc}" CXX="${CXX-${CROSS_COMPILE}g++}" AS="${AS-${CROSS_COMPILE}as}" STRIP="${STRIP-${CROSS_COMPILE}strip}" -test -n "$SDL_CONFIG" || SDL_CONFIG="`$CC --print-sysroot 2> /dev/null || true`/usr/bin/sdl-config" +test -n "$SDL_CONFIG" || SDL_CONFIG="`$CC $CFLAGS $LDFLAGS --print-sysroot 2> /dev/null || true`/usr/bin/sdl-config" MAIN_LDLIBS="$LDLIBS -lm" config_mak="config.mak" @@ -78,23 +86,27 @@ set_platform() ;; generic) ;; - opendingux) + opendingux | gcw0) sound_drivers="sdl" + # both are really an opendingux + platform="opendingux" ;; pandora) sound_drivers="oss alsa" optimize_cortexa8="yes" have_arm_neon="yes" ;; - gp2x) + gp2x | wiz | caanoo) sound_drivers="oss" optimize_arm920="yes" + # compile for OABI if toolchain provides it (faster code on caanoo) + have_arm_oabi="yes" + # always use static linking, since caanoo doesn't have OABI libs. 
Moreover, + # dynamic linking slows Wiz 1-10%, and libm on F100 isn't compatible + LDFLAGS="$LDFLAGS -static" + # unified binary for all of them CFLAGS="$CFLAGS -D__GP2X__" - if [ "$CROSS_COMPILE" = "arm-linux-" ]; then - # still using static, dynamic linking slows Wiz 1-10% - # also libm on F100 is not compatible - MAIN_LDLIBS="$MAIN_LDLIBS -static" - fi + platform="gp2x" ;; *) fail "unsupported platform: $platform" @@ -147,18 +159,11 @@ fi # fi #fi -# basic compiler test -cat > $TMPC < $TMPC < $TMPC <> $config_mak if [ "$have_libavcodec" = "yes" ]; then echo "HAVE_LIBAVCODEC = 1" >> $config_mak fi +if [ "$need_zlib" = "yes" ]; then + echo "PLATFORM_ZLIB = 1" >> $config_mak +fi # GP2X toolchains are too old for UAL asm, # so add this here to not litter main Makefile -if [ "$platform" = "g1p2x" ]; then - echo >> $config_mak - echo "%.o: %.S" >> $config_mak - echo " $(CC) $(CFLAGS) -E -c $^ -o /tmp/$(notdir $@).s" >> $config_mak - echo " $(AS) $(ASFLAGS) /tmp/$(notdir $@).s -o $@" >> $config_mak -fi +#if [ "$platform" = "gp2x" ]; then +# echo >> $config_mak +# echo '%.o: %.S' >> $config_mak +# echo ' $(CC) $(CFLAGS) -E -c $^ -o /tmp/$(notdir $@).s' >> $config_mak +# echo ' $(AS) $(ASFLAGS) /tmp/$(notdir $@).s -o $@' >> $config_mak +#fi # use pandora's skin (for now) test -e skin || ln -s platform/pandora/skin skin diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index a4aa2ec62..1d70866cb 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -1,6 +1,7 @@ /* * Basic macros to emit ARM instructions and some utils * Copyright (C) 2008,2009,2010 notaz + * Copyright (C) 2019 kub * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. 
diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 2177541cd..62288ff5f 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -1,6 +1,7 @@ /* * Basic macros to emit x86 instructions and some utils * Copyright (C) 2008,2009,2010 notaz + * Copyright (C) 2019 kuv * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. diff --git a/platform/common/helix/Makefile b/platform/common/helix/Makefile index 0021ea8e8..9fa4c1cc6 100644 --- a/platform/common/helix/Makefile +++ b/platform/common/helix/Makefile @@ -4,6 +4,7 @@ CC = $(CROSS)gcc AS = $(CROSS)as AR = $(CROSS)ar TOOLCHAIN = $(notdir $(CROSS)) +LIBGCC ?= ${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1/libgcc.a CFLAGS += -Ipub -O2 -Wall -fstrict-aliasing -ffast-math ifneq ($(findstring arm-,$(TOOLCHAIN)),) @@ -34,7 +35,7 @@ real/arm/asmpoly_gcc.o: real/arm/asmpoly_gcc.s $(LIB) : $(OBJS) $(AR) r $@ $^ -$(SHLIB) : $(OBJS) /home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1/libgcc.a +$(SHLIB) : $(OBJS) $(LIBGCC) $(CC) -o $@ -nostdlib -shared $(CFLAGS) $^ clean: From 173fc3f6de4b7cc74774e487f8d035f5a3af2e6c Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 21 Aug 2019 18:43:28 +0200 Subject: [PATCH 055/174] pff... 
README, 2nd try --- README.md | 45 +++------------------------------------------ cyclone_gp2x.patch | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 42 deletions(-) create mode 100644 cyclone_gp2x.patch diff --git a/README.md b/README.md index d0d7259f8..13ff1598c 100644 --- a/README.md +++ b/README.md @@ -36,48 +36,9 @@ opendingux|opendingux with ubuntu mips gcc 5.4|CROSS_COMPILE=mipsel-linux-gnu- C gcw0|gcw0|CROSS_COMPILE=mipsel-gcw0-linux-uclibc- CFLAGS="-I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include -I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/SDL -fno-stack-protector -fno-common -finline-limit=42 -fipa-pta" LDFLAGS="--sysroot $TC/usr/mipsel-gcw0-linux-uclibc/sysroot" ./configure --platform=gcw0 For gp2x, wiz, and caanoo you may need to compile libpng first, and additionally -this patch may need to be applied to the cpu/cyclone submodule: -> diff --git a/OpArith.cpp b/OpArith.cpp -> index 96c7e0d..09517b8 100644 -> --- a/OpArith.cpp -> +++ b/OpArith.cpp -> @@ -425,7 +425,7 @@ int OpAbcd(int op) -> ot(" add r1,r1,r0\n"); -> ot(" add r1,r1,r6\n"); -> ot(" mov r12,r1\n"); -> - ot(" addhi r12,#6 ;@ Decimal adjust units\n"); -> + ot(" addhi r12,r12,#6 ;@ Decimal adjust units\n"); -> ot(" tst r1,#0x80\n"); -> ot(" orreq r10,r10,#0x10000000 ;@ Undefined V behavior\n"); -> ot(" cmp r12,#0x9f\n"); -> @@ -452,7 +452,7 @@ int OpAbcd(int op) -> ot(" cmp r1,r12\n"); -> ot(" orrlt r10,r10,#0x20000000 ;@ C\n"); -> ot(" cmp r1,#0xff\n"); -> - ot(" addhi r1,#0xa0\n"); -> + ot(" addhi r1,r1,#0xa0\n"); -> ot(" sub r12,r1,r12\n"); -> ot(" movs r0,r12,lsl #24\n"); -> ot(" bicmi r10,r10,#0x10000000 ;@ Undefined V behavior part II\n"); -> diff --git a/OpLogic.cpp b/OpLogic.cpp -> index 012e35a..d40d814 100644 -> --- a/OpLogic.cpp -> +++ b/OpLogic.cpp -> @@ -74,12 +74,12 @@ const char *TestCond(int m68k_cc, int invert) -> break; -> case 0x0e: // gt -> ot(" eor r0,r10,r10,lsl #3 ;@ gt: !Z && N == V\n"); -> - ot(" orrs r0,r10,lsl 
#1\n"); -> + ot(" orrs r0,r0,r10,lsl #1\n"); -> cond="pl", icond="mi"; -> break; -> case 0x0f: // le -> ot(" eor r0,r10,r10,lsl #3 ;@ le: Z || N != V\n"); -> - ot(" orrs r0,r10,lsl #1\n"); -> + ot(" orrs r0,r0,r10,lsl #1\n"); -> cond="mi", icond="pl"; -> break; -> default: +cyclone_gp2x.patch may need to be applied to the cpu/cyclone submodule: + +> patch -d cpu/cyclone -p1 Date: Thu, 22 Aug 2019 22:57:42 +0200 Subject: [PATCH 056/174] bug fix in comm poll fifo, and back to -O3 --- README.md | 12 ++++++------ pico/32x/memory.c | 30 ++++++++++++++++++++++-------- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 13ff1598c..aa0466d19 100644 --- a/README.md +++ b/README.md @@ -29,11 +29,11 @@ assuming $TC points to the appropriate cross compile toolchain directory: platform|toolchain|configure command --------|---------|----------------- -gp2x,wiz,caanoo|open2x|CROSS_COMPILE=arm-open2x-linux- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -fno-stack-protector -fno-common -finline-limit=42" LDFLAGS="--sysroot $TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x -gp2x,wiz,caanoo|open2x with ubuntu arm gcc 4.7|CROSS_COMPILE=arm-linux-gnueabi- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -fno-stack-protector -fno-common -finline-limit=42 -fipa-pta" LDFLAGS="-B$TC/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x -opendingux|opendingux|CROSS_COMPILE=mipsel-linux- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL -fno-stack-protector -fno-common -finline-limit=42 -fipa-pta" LDFLAGS="--sysroot $TC -L$TC/lib" ./configure --platform=opendingux -opendingux|opendingux with ubuntu mips gcc 5.4|CROSS_COMPILE=mipsel-linux-gnu- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL -fno-stack-protector -fno-common 
-finline-limit=42 -fipa-pta" LDFLAGS="-B$TC/usr/lib -B$TC/lib -Wl,-rpath-link=$TC/usr/lib -Wl,-rpath-link=$TC/lib" ./configure --platform=opendingux -gcw0|gcw0|CROSS_COMPILE=mipsel-gcw0-linux-uclibc- CFLAGS="-I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include -I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/SDL -fno-stack-protector -fno-common -finline-limit=42 -fipa-pta" LDFLAGS="--sysroot $TC/usr/mipsel-gcw0-linux-uclibc/sysroot" ./configure --platform=gcw0 +gp2x,wiz,caanoo|open2x|CROSS_COMPILE=arm-open2x-linux- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -finline-limit=42 -fno-unroll-loops -fno-stack-protector -fno-common" LDFLAGS="--sysroot $TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x +gp2x,wiz,caanoo|open2x with ubuntu arm gcc 4.7|CROSS_COMPILE=arm-linux-gnueabi- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -finline-limit=42 -fno-unroll-loops -fno-stack-protector -fno-common" LDFLAGS="-B$TC/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x +opendingux|opendingux|CROSS_COMPILE=mipsel-linux- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL" LDFLAGS="--sysroot $TC -L$TC/lib" ./configure --platform=opendingux +opendingux|opendingux with ubuntu mips gcc 5.4|CROSS_COMPILE=mipsel-linux-gnu- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL" LDFLAGS="-B$TC/usr/lib -B$TC/lib -Wl,-rpath-link=$TC/usr/lib -Wl,-rpath-link=$TC/lib" ./configure --platform=opendingux +gcw0|gcw0|CROSS_COMPILE=mipsel-gcw0-linux-uclibc- CFLAGS="-I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include -I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/SDL" LDFLAGS="--sysroot $TC/usr/mipsel-gcw0-linux-uclibc/sysroot" ./configure --platform=gcw0 For gp2x, wiz, and caanoo you may need to compile libpng first, and additionally cyclone_gp2x.patch may need to 
be applied to the cpu/cyclone submodule: @@ -67,7 +67,7 @@ You need to install the resulting binary onto your device manually. For opendingux and gcw0, copy the opk to your SD card. For gp2x, wiz and caanoo, the easiest way is to unpack [PicoDrive_191.zip](http://notaz.gp2x.de/releases/PicoDrive/PicoDrive_191.zip) -on you SD card and replace the PicoDrive binary. +on your SD card and replace the PicoDrive binary. Send bug reports, fixes etc to Kai-Uwe Bloem diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 3e11cbcba..7f494e7ad 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -191,7 +191,7 @@ static NOINLINE u32 sh2_poll_read(u32 a, u32 d, unsigned int cycles, SH2* sh2) int hix = (a >> 1) % PFIFO_CNT; struct sh2_poll_fifo *fifo = sh2_poll_fifo[hix]; struct sh2_poll_fifo *p; - int cpu = sh2 ? sh2->is_slave+1 : 0; + int cpu = sh2 ? sh2->is_slave : -1; unsigned idx; a &= ~0x20000000; // ignore writethrough bit @@ -204,7 +204,7 @@ static NOINLINE u32 sh2_poll_read(u32 a, u32 d, unsigned int cycles, SH2* sh2) if (cpu != p->cpu) { if (CYCLES_GT(cycles, p->cycles+80)) { // drop older fifo stores that may cause synchronisation problems. - sh2_poll_rd[hix] = idx; + p->a = -1; } else if (p->a == a) { // replace current data with fifo value and discard fifo entry d = p->d; @@ -221,24 +221,37 @@ static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2) int hix = (a >> 1) % PFIFO_CNT; struct sh2_poll_fifo *fifo = sh2_poll_fifo[hix]; struct sh2_poll_fifo *q = &fifo[(sh2_poll_wr[hix]-1) % PFIFO_SZ]; - int cpu = sh2 ? sh2->is_slave+1 : 0; + int cpu = sh2 ? 
sh2->is_slave : -1; + unsigned rd = sh2_poll_rd[hix], wr = sh2_poll_wr[hix]; + unsigned idx, nrd; a &= ~0x20000000; // ignore writethrough bit + + // throw out any values written by other cpus, plus heading cancelled stuff + for (idx = nrd = wr; idx != rd; ) { + idx = (idx-1) % PFIFO_SZ; + if (fifo[idx].a == a && fifo[idx].cpu != cpu) { fifo[idx].a = -1; } + if (fifo[idx].a != -1) { nrd = idx; } + } + rd = nrd; + // fold 2 consecutive writes to the same address to avoid reading of // intermediate values that may cause synchronisation problems. // NB this can take an eternity on m68k: mov.b , needs // 28 m68k-cycles (~80 sh2-cycles) to complete (observed in Metal Head) - if (q->a == a && sh2_poll_wr[hix] != sh2_poll_rd[hix] && !CYCLES_GT(cycles,q->cycles+30)) { + if (q->a == a && rd != wr && !CYCLES_GT(cycles,q->cycles+30)) { q->d = d; } else { // store write to poll address in fifo - fifo[sh2_poll_wr[hix]] = + fifo[wr] = (struct sh2_poll_fifo){ .cycles = cycles, .a = a, .d = d, .cpu = cpu }; - sh2_poll_wr[hix] = (sh2_poll_wr[hix]+1) % PFIFO_SZ; - if (sh2_poll_wr[hix] == sh2_poll_rd[hix]) + wr = (wr+1) % PFIFO_SZ; + if (wr == rd) // fifo overflow, discard oldest value - sh2_poll_rd[hix] = (sh2_poll_rd[hix]+1) % PFIFO_SZ; + rd = (rd+1) % PFIFO_SZ; } + + sh2_poll_rd[hix] = rd; sh2_poll_wr[hix] = wr; } u32 REGPARM(3) p32x_sh2_poll_memory8(unsigned int a, u32 d, SH2 *sh2) @@ -2367,6 +2380,7 @@ void PicoMemSetup32x(void) sh2_drc_mem_setup(&ssh2); memset(sh2_poll_rd, 0, sizeof(sh2_poll_rd)); memset(sh2_poll_wr, 0, sizeof(sh2_poll_wr)); + memset(sh2_poll_fifo, -1, sizeof(sh2_poll_fifo)); // z80 hack z80_map_set(z80_write_map, 0x8000, 0xffff, z80_md_bank_write_32x, 1); From 69c6012a8f4dde5a5f5d4aea970420f5f49bbde9 Mon Sep 17 00:00:00 2001 From: kub Date: Sun, 25 Aug 2019 17:33:13 +0200 Subject: [PATCH 057/174] some drawing code C optimisations --- Makefile | 4 ++++ README.md | 4 ++-- pico/32x/draw.c | 41 ++++++++++++++++++++++++-------------- pico/draw.c | 10 ++++++++-- 
platform/common/plat_sdl.c | 35 +++++++++++++------------------- 5 files changed, 54 insertions(+), 40 deletions(-) diff --git a/Makefile b/Makefile index a5adaf090..69f61b0d2 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,10 @@ ifneq ("$(PLATFORM)", "libretro") CFLAGS += -O3 -DNDEBUG endif endif +ifeq ("$(PLATFORM)",$(filter "$(PLATFORM)","gp2x" "opendingux" "rpi1")) +# very small caches, avoid optimization options making the binary much bigger +CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp-clone # -fno-ipa-cp +endif # This is actually needed, bevieve me. # If you really have to disable this, set NO_ALIGN_FUNCTIONS elsewhere. diff --git a/README.md b/README.md index aa0466d19..d77982310 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,8 @@ assuming $TC points to the appropriate cross compile toolchain directory: platform|toolchain|configure command --------|---------|----------------- -gp2x,wiz,caanoo|open2x|CROSS_COMPILE=arm-open2x-linux- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -finline-limit=42 -fno-unroll-loops -fno-stack-protector -fno-common" LDFLAGS="--sysroot $TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x -gp2x,wiz,caanoo|open2x with ubuntu arm gcc 4.7|CROSS_COMPILE=arm-linux-gnueabi- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -finline-limit=42 -fno-unroll-loops -fno-stack-protector -fno-common" LDFLAGS="-B$TC/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x +gp2x,wiz,caanoo|open2x|CROSS_COMPILE=arm-open2x-linux- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -fno-stack-protector -fno-common" LDFLAGS="--sysroot $TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x +gp2x,wiz,caanoo|open2x with ubuntu arm gcc 
4.7|CROSS_COMPILE=arm-linux-gnueabi- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -fno-stack-protector -fno-common" LDFLAGS="-B$TC/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x opendingux|opendingux|CROSS_COMPILE=mipsel-linux- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL" LDFLAGS="--sysroot $TC -L$TC/lib" ./configure --platform=opendingux opendingux|opendingux with ubuntu mips gcc 5.4|CROSS_COMPILE=mipsel-linux-gnu- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL" LDFLAGS="-B$TC/usr/lib -B$TC/lib -Wl,-rpath-link=$TC/usr/lib -Wl,-rpath-link=$TC/lib" ./configure --platform=opendingux gcw0|gcw0|CROSS_COMPILE=mipsel-gcw0-linux-uclibc- CFLAGS="-I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include -I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/SDL" LDFLAGS="--sysroot $TC/usr/mipsel-gcw0-linux-uclibc/sysroot" ./configure --platform=gcw0 diff --git a/pico/32x/draw.c b/pico/32x/draw.c index 229ed914d..4da70650d 100644 --- a/pico/32x/draw.c +++ b/pico/32x/draw.c @@ -47,16 +47,21 @@ static void convert_pal555(int invert_prio) const unsigned int m1 = 0x001f; \ const unsigned int m2 = 0x03e0; \ const unsigned int m3 = 0x7c00; \ - int i; \ + unsigned short t; \ + int i = 320; \ \ - for (i = 320; i > 0; i--, pd++, p32x++, pmd++) { \ - unsigned short t = *p32x; \ - if ((*pmd & 0x3f) != mdbg && !((t ^ inv) & 0x8000)) { \ - pmd_draw_code; \ - continue; \ + while (i > 0) { \ + for (; i > 0 && (*pmd & 0x3f) == mdbg; pd++, pmd++, i--) { \ + t = *p32x++; \ + *pd = ((t&m1) << 11) | ((t&m2) << 1) | ((t&m3) >> 10); \ + } \ + for (; i > 0 && (*pmd & 0x3f) != mdbg; pd++, pmd++, i--) { \ + t = *p32x++; \ + if ((t ^ inv) & 0x8000) \ + *pd = ((t&m1) << 11) | ((t&m2) << 1) | ((t&m3) >> 10); \ + else \ + pmd_draw_code; \ } \ - \ - *pd = ((t & m1) << 11) | ((t & m2) << 1) | ((t & m3) >> 10); \ } \ } @@ -64,15 +69,21 @@ static void 
convert_pal555(int invert_prio) #define do_line_pp(pd, p32x, pmd, pmd_draw_code) \ { \ unsigned short t; \ - int i; \ - for (i = 320; i > 0; i--, pd++, p32x++, pmd++) { \ - t = pal[*(unsigned char *)((uintptr_t)p32x ^ 1)]; \ - if ((t & 0x20) || (*pmd & 0x3f) == mdbg) \ + int i = 320; \ + while (i > 0) { \ + for (; i > 0 && (*pmd & 0x3f) == mdbg; pd++, pmd++, i--) { \ + t = pal[*(unsigned char *)((uintptr_t)(p32x++) ^ 1)]; \ *pd = t; \ - else \ - pmd_draw_code; \ + } \ + for (; i > 0 && (*pmd & 0x3f) != mdbg; pd++, pmd++, i--) { \ + t = pal[*(unsigned char *)((uintptr_t)(p32x++) ^ 1)]; \ + if (t & 0x20) \ + *pd = t; \ + else \ + pmd_draw_code; \ + } \ } \ -} +} // run length mode #define do_line_rl(pd, p32x, pmd, pmd_draw_code) \ diff --git a/pico/draw.c b/pico/draw.c index 7326aec56..984f5cd7d 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -1341,8 +1341,14 @@ void FinalizeLine555(int sh, int line, struct PicoEState *est) #if 1 int i; - for (i = 0; i < len; i++) - pd[i] = pal[ps[i]]; + for (i = len; i > 0; i-=4) { + *pd++ = pal[*ps++]; + *pd++ = pal[*ps++]; + *pd++ = pal[*ps++]; + *pd++ = pal[*ps++]; + } +// for (i = 0; i < len; i++) +// pd[i] = pal[ps[i]]; #else extern void amips_clut(unsigned short *dst, unsigned char *src, unsigned short *pal, int count); extern void amips_clut_6bit(unsigned short *dst, unsigned char *src, unsigned short *pal, int count); diff --git a/platform/common/plat_sdl.c b/platform/common/plat_sdl.c index ef99af2a0..bce4b0841 100644 --- a/platform/common/plat_sdl.c +++ b/platform/common/plat_sdl.c @@ -89,7 +89,8 @@ static const struct in_pdata in_sdl_platform_data = { /* YUV stuff */ static int yuv_ry[32], yuv_gy[32], yuv_by[32]; static unsigned char yuv_u[32 * 2], yuv_v[32 * 2]; -static int yuv_y[256]; +static unsigned char yuv_y[256]; +static struct uyvy { unsigned int y:8; unsigned int vyu:24; } yuv_uyvy[65536]; void bgr_to_uyvy_init(void) { @@ -124,34 +125,26 @@ void bgr_to_uyvy_init(void) for (i = 0; i < 256; i++) { yuv_y[i] = 16 + 219 
* i / 32; } + // everything combined into one large array for speed + for (i = 0; i < 65536; i++) { + int r = (i >> 11) & 0x1f, g = (i >> 6) & 0x1f, b = (i >> 0) & 0x1f; + int y = (yuv_ry[r] + yuv_gy[g] + yuv_by[b]) >> 16; + yuv_uyvy[i].y = yuv_y[y]; + yuv_uyvy[i].vyu = (yuv_v[r-y + 32] << 16) | (yuv_y[y] << 8) | yuv_u[b-y + 32]; + } } void rgb565_to_uyvy(void *d, const void *s, int pixels) { unsigned int *dst = d; const unsigned short *src = s; - const unsigned char *yu = yuv_u + 32; - const unsigned char *yv = yuv_v + 32; - int r0, g0, b0, r1, g1, b1; - int y0, y1, u, v; - for (; pixels > 0; src += 2, dst++, pixels -= 2) + for (; pixels > 0; src += 4, dst += 2, pixels -= 4) { - r0 = (src[0] >> 11) & 0x1f; - g0 = (src[0] >> 6) & 0x1f; - b0 = src[0] & 0x1f; - r1 = (src[1] >> 11) & 0x1f; - g1 = (src[1] >> 6) & 0x1f; - b1 = src[1] & 0x1f; - y0 = (yuv_ry[r0] + yuv_gy[g0] + yuv_by[b0]) >> 16; - y1 = (yuv_ry[r1] + yuv_gy[g1] + yuv_by[b1]) >> 16; - u = yu[b0 - y0]; - v = yv[r0 - y0]; - // valid Y range seems to be 16..235 - y0 = yuv_y[y0]; - y1 = yuv_y[y1]; - - *dst = (y1 << 24) | (v << 16) | (y0 << 8) | u; + struct uyvy *uyvy0 = yuv_uyvy + src[0], *uyvy1 = yuv_uyvy + src[1]; + struct uyvy *uyvy2 = yuv_uyvy + src[2], *uyvy3 = yuv_uyvy + src[3]; + dst[0] = (uyvy1->y << 24) | uyvy0->vyu; + dst[1] = (uyvy3->y << 24) | uyvy2->vyu; } } From f98ab2655d82fd203871ecc73a0de8bc581a7c6c Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 31 Aug 2019 17:37:18 +0200 Subject: [PATCH 058/174] cleanup and microoptimizations in SH2 hw handling --- Makefile | 9 +- README.md | 4 +- cpu/drc/emit_arm64.c | 3 +- cpu/drc/emit_mips.c | 4 +- cpu/drc/emit_x86.c | 2 +- pico/32x/32x.c | 2 +- pico/32x/memory.c | 309 +++++++++++++++++++++++++----------------- pico/32x/memory_arm.S | 2 + pico/32x/pwm.c | 184 +++++++++++++------------ pico/32x/sh2soc.c | 34 ++--- pico/draw.c | 2 - pico/pico_int.h | 2 +- 12 files changed, 309 insertions(+), 248 deletions(-) diff --git a/Makefile b/Makefile index 
69f61b0d2..7c78b19de 100644 --- a/Makefile +++ b/Makefile @@ -11,10 +11,6 @@ ifneq ("$(PLATFORM)", "libretro") CFLAGS += -O3 -DNDEBUG endif endif -ifeq ("$(PLATFORM)",$(filter "$(PLATFORM)","gp2x" "opendingux" "rpi1")) -# very small caches, avoid optimization options making the binary much bigger -CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp-clone # -fno-ipa-cp -endif # This is actually needed, bevieve me. # If you really have to disable this, set NO_ALIGN_FUNCTIONS elsewhere. @@ -44,6 +40,11 @@ else # NO_CONFIG_MAK config.mak: endif +ifeq ("$(PLATFORM)",$(filter "$(PLATFORM)","gp2x" "opendingux" "rpi1")) +# very small caches, avoid optimization options making the binary much bigger +CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp -fno-common -fno-stack-protector -ffast-math +endif + # default settings ifeq "$(ARCH)" "arm" use_cyclone ?= 1 diff --git a/README.md b/README.md index d77982310..8154f7dc0 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,8 @@ assuming $TC points to the appropriate cross compile toolchain directory: platform|toolchain|configure command --------|---------|----------------- -gp2x,wiz,caanoo|open2x|CROSS_COMPILE=arm-open2x-linux- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -fno-stack-protector -fno-common" LDFLAGS="--sysroot $TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x -gp2x,wiz,caanoo|open2x with ubuntu arm gcc 4.7|CROSS_COMPILE=arm-linux-gnueabi- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -fno-stack-protector -fno-common" LDFLAGS="-B$TC/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x +gp2x,wiz,caanoo|open2x|CROSS_COMPILE=arm-open2x-linux- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include" LDFLAGS="--sysroot $TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux 
-L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x +gp2x,wiz,caanoo|open2x with ubuntu arm gcc 4.7|CROSS_COMPILE=arm-linux-gnueabi- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include" LDFLAGS="-B$TC/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x opendingux|opendingux|CROSS_COMPILE=mipsel-linux- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL" LDFLAGS="--sysroot $TC -L$TC/lib" ./configure --platform=opendingux opendingux|opendingux with ubuntu mips gcc 5.4|CROSS_COMPILE=mipsel-linux-gnu- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL" LDFLAGS="-B$TC/usr/lib -B$TC/lib -Wl,-rpath-link=$TC/usr/lib -Wl,-rpath-link=$TC/lib" ./configure --platform=opendingux gcw0|gcw0|CROSS_COMPILE=mipsel-gcw0-linux-uclibc- CFLAGS="-I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include -I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/SDL" LDFLAGS="--sysroot $TC/usr/mipsel-gcw0-linux-uclibc/sysroot" ./configure --platform=gcw0 diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index a67f6819c..de5876193 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -1163,9 +1163,10 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) int t2 = rcache_get_tmp(); \ int t3 = rcache_get_tmp(); \ /* if (sr < 0) return */ \ - emith_asrf(t2, sr, 12); \ + emith_cmp_r_imm(sr, 0); \ EMITH_JMP_START(DCOND_LE); \ /* turns = sr.cycles / cycles */ \ + emith_asr(t2, sr, 12); \ emith_move_r_imm(t3, (u32)((1ULL<<32) / (cycles)) + 1); \ emith_mul_u64(t1, t2, t2, t3); /* multiply by 1/x */ \ rcache_free_tmp(t3); \ diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index 91d493b59..e200db0a4 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -560,8 +560,8 @@ static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub) #define emith_adc_r_r(d, s) \ emith_adc_r_r_r(d, d, s) 
-// NB: the incoming C can cause its own outgoing C if s2+C=0 (or s1+C=0 FWIW) -// moreover, s2 is 0 if there is C, so no other C can be generated. +// NB: the incoming carry Cin can cause Cout if s2+Cin=0 (or s1+Cin=0 FWIW) +// moreover, if s2+Cin=0 caused Cout, s1+s2+Cin=s1+0 can't cause another Cout #define emith_adcf_r_r_r(d, s1, s2) do { \ emith_add_r_r_r(FNZ, s2, FC); \ EMIT(MIPS_SLTU_REG(AT, FNZ, FC)); \ diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 62288ff5f..d515cd238 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -1,7 +1,7 @@ /* * Basic macros to emit x86 instructions and some utils * Copyright (C) 2008,2009,2010 notaz - * Copyright (C) 2019 kuv + * Copyright (C) 2019 kub * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. diff --git a/pico/32x/32x.c b/pico/32x/32x.c index 1511f3f7c..e9d8ff6d2 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -426,7 +426,7 @@ void p32x_sync_other_sh2(SH2 *sh2, unsigned int m68k_target) } #define STEP_LS 24 -#define STEP_N 440 +#define STEP_N 488 // one line #define sync_sh2s_normal p32x_sync_sh2s //#define sync_sh2s_lockstep p32x_sync_sh2s diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 7f494e7ad..e139910a4 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -220,7 +220,7 @@ static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2) { int hix = (a >> 1) % PFIFO_CNT; struct sh2_poll_fifo *fifo = sh2_poll_fifo[hix]; - struct sh2_poll_fifo *q = &fifo[(sh2_poll_wr[hix]-1) % PFIFO_SZ]; + struct sh2_poll_fifo *q; int cpu = sh2 ? 
sh2->is_slave : -1; unsigned rd = sh2_poll_rd[hix], wr = sh2_poll_wr[hix]; unsigned idx, nrd; @@ -230,8 +230,9 @@ static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2) // throw out any values written by other cpus, plus heading cancelled stuff for (idx = nrd = wr; idx != rd; ) { idx = (idx-1) % PFIFO_SZ; - if (fifo[idx].a == a && fifo[idx].cpu != cpu) { fifo[idx].a = -1; } - if (fifo[idx].a != -1) { nrd = idx; } + q = &fifo[idx]; + if (q->cpu != cpu && q->a == a) { q->a = -1; } + if (q->a != -1) { nrd = idx; } } rd = nrd; @@ -239,7 +240,8 @@ static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2) // intermediate values that may cause synchronisation problems. // NB this can take an eternity on m68k: mov.b , needs // 28 m68k-cycles (~80 sh2-cycles) to complete (observed in Metal Head) - if (q->a == a && rd != wr && !CYCLES_GT(cycles,q->cycles+30)) { + q = &fifo[(sh2_poll_wr[hix]-1) % PFIFO_SZ]; + if (rd != wr && q->a == a && !CYCLES_GT(cycles,q->cycles+30)) { q->d = d; } else { // store write to poll address in fifo @@ -493,6 +495,35 @@ static void p32x_reg_write8(u32 a, u32 d) case 0x1d: case 0x1e: case 0x1f: + return; + case 0x20: // comm port + case 0x21: + case 0x22: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2a: + case 0x2b: + case 0x2c: + case 0x2d: + case 0x2e: + case 0x2f: + if (REG8IN16(r, a) != d) { + int cycles = SekCyclesDone(); + + if (cycles - (int)msh2.m68krcycles_done > 30) + p32x_sync_sh2s(cycles); + + REG8IN16(r, a) = d; + p32x_sh2_poll_event(&sh2s[0], SH2_STATE_CPOLL, cycles); + p32x_sh2_poll_event(&sh2s[1], SH2_STATE_CPOLL, cycles); + sh2_poll_write(a & ~1, r[a / 2], cycles, NULL); + } + return; case 0x30: return; case 0x31: // PWM control @@ -532,22 +563,6 @@ static void p32x_reg_write8(u32 a, u32 d) p32x_pwm_write16(a & ~1, d, NULL, SekCyclesDone()); return; } - - if ((a & 0x30) == 0x20) { - int cycles = SekCyclesDone(); - - if 
(REG8IN16(r, a) == d) - return; - - if (cycles - (int)msh2.m68krcycles_done > 30) - p32x_sync_sh2s(cycles); - - REG8IN16(r, a) = d; - p32x_sh2_poll_event(&sh2s[0], SH2_STATE_CPOLL, cycles); - p32x_sh2_poll_event(&sh2s[1], SH2_STATE_CPOLL, cycles); - sh2_poll_write(a & ~1, r[a / 2], cycles, NULL); - return; - } } static void p32x_reg_write16(u32 a, u32 d) @@ -558,61 +573,68 @@ static void p32x_reg_write16(u32 a, u32 d) // for things like bset on comm port m68k_poll.cnt = 0; - switch (a) { - case 0x00: // adapter ctl + switch (a/2) { + case 0x00/2: // adapter ctl if ((d ^ r[0]) & d & P32XS_nRES) p32x_reset_sh2s(); r[0] &= ~(P32XS_FM|P32XS_nRES|P32XS_ADEN); r[0] |= d & (P32XS_FM|P32XS_nRES|P32XS_ADEN); return; - case 0x08: // DREQ src + case 0x08/2: // DREQ src r[a / 2] = d & 0xff; return; - case 0x0a: + case 0x0a/2: r[a / 2] = d & ~1; return; - case 0x0c: // DREQ dest + case 0x0c/2: // DREQ dest r[a / 2] = d & 0xff; return; - case 0x0e: + case 0x0e/2: r[a / 2] = d; return; - case 0x10: // DREQ len + case 0x10/2: // DREQ len r[a / 2] = d & ~3; return; - case 0x12: // FIFO reg + case 0x12/2: // FIFO reg dreq0_write(r, d); return; - case 0x1a: // TV + mystery bit + case 0x1a/2: // TV + mystery bit r[a / 2] = d & 0x0101; return; - case 0x30: // PWM control + case 0x20/2: // comm port + case 0x22/2: + case 0x24/2: + case 0x26/2: + case 0x28/2: + case 0x2a/2: + case 0x2c/2: + case 0x2e/2: + if (r[a / 2] != d) { + int cycles = SekCyclesDone(); + + if (cycles - (int)msh2.m68krcycles_done > 30) + p32x_sync_sh2s(cycles); + + r[a / 2] = d; + p32x_sh2_poll_event(&sh2s[0], SH2_STATE_CPOLL, cycles); + p32x_sh2_poll_event(&sh2s[1], SH2_STATE_CPOLL, cycles); + sh2_poll_write(a, (u16)d, cycles, NULL); + } + return; + case 0x30/2: // PWM control d = (r[a / 2] & ~0x0f) | (d & 0x0f); r[a / 2] = d; p32x_pwm_write16(a, d, NULL, SekCyclesDone()); return; - } - - // comm port - if ((a & 0x30) == 0x20) { - int cycles = SekCyclesDone(); - - if (r[a / 2] == d) - return; - - if (cycles - 
(int)msh2.m68krcycles_done > 30) - p32x_sync_sh2s(cycles); - - r[a / 2] = d; - p32x_sh2_poll_event(&sh2s[0], SH2_STATE_CPOLL, cycles); - p32x_sh2_poll_event(&sh2s[1], SH2_STATE_CPOLL, cycles); - sh2_poll_write(a, (u16)d, cycles, NULL); - return; - } - // PWM - else if ((a & 0x30) == 0x30) { - p32x_pwm_write16(a, d, NULL, SekCyclesDone()); - return; + case 0x32/2: + case 0x34/2: + case 0x36/2: + case 0x38/2: + case 0x3a/2: + case 0x3c/2: + case 0x3e/2: + p32x_pwm_write16(a, d, NULL, SekCyclesDone()); + return; } p32x_reg_write8(a + 1, d); @@ -709,23 +731,23 @@ static u32 p32x_sh2reg_read16(u32 a, SH2 *sh2) u16 *r = Pico32x.regs; a &= 0x3e; - switch (a) { - case 0x00: // adapter/irq ctl + switch (a/2) { + case 0x00/2: // adapter/irq ctl return (r[0] & P32XS_FM) | Pico32x.sh2_regs[0] | Pico32x.sh2irq_mask[sh2->is_slave]; - case 0x04: // H count (often as comm too) + case 0x04/2: // H count (often as comm too) sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 9); sh2s_sync_on_read(sh2); return sh2_poll_read(a, Pico32x.sh2_regs[4 / 2], sh2_cycles_done_m68k(sh2), sh2); - case 0x06: + case 0x06/2: return (r[a / 2] & ~P32XS_FULL) | 0x4000; - case 0x08: // DREQ src - case 0x0a: - case 0x0c: // DREQ dst - case 0x0e: - case 0x10: // DREQ len + case 0x08/2: // DREQ src + case 0x0a/2: + case 0x0c/2: // DREQ dst + case 0x0e/2: + case 0x10/2: // DREQ len return r[a / 2]; - case 0x12: // DREQ FIFO - does this work on hw? + case 0x12/2: // DREQ FIFO - does this work on hw? if (Pico32x.dmac0_fifo_ptr > 0) { Pico32x.dmac0_fifo_ptr--; r[a / 2] = Pico32x.dmac_fifo[0]; @@ -733,23 +755,34 @@ static u32 p32x_sh2reg_read16(u32 a, SH2 *sh2) Pico32x.dmac0_fifo_ptr * 2); } return r[a / 2]; - case 0x14: - case 0x16: - case 0x18: - case 0x1a: - case 0x1c: + case 0x14/2: + case 0x16/2: + case 0x18/2: + case 0x1a/2: + case 0x1c/2: return 0; // ? 
+ case 0x20/2: // comm port + case 0x22/2: + case 0x24/2: + case 0x26/2: + case 0x28/2: + case 0x2a/2: + case 0x2c/2: + case 0x2e/2: + sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 9); + sh2s_sync_on_read(sh2); + return sh2_poll_read(a, r[a / 2], sh2_cycles_done_m68k(sh2), sh2); + case 0x30/2: // PWM + case 0x32/2: + case 0x34/2: + case 0x36/2: + case 0x38/2: + case 0x3a/2: + case 0x3c/2: + case 0x3e/2: + return p32x_pwm_read16(a, sh2, sh2_cycles_done_m68k(sh2)); } - // comm port - if ((a & 0x30) == 0x20) { - sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 9); - sh2s_sync_on_read(sh2); - return sh2_poll_read(a, r[a / 2], sh2_cycles_done_m68k(sh2), sh2); - } - if ((a & 0x30) == 0x30) - return p32x_pwm_read16(a, sh2, sh2_cycles_done_m68k(sh2)); - elprintf_sh2(sh2, EL_32X|EL_ANOMALY, "unhandled sysreg r16 [%02x] @%08x", a, sh2_pc(sh2)); return 0; @@ -796,6 +829,32 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) sh2_poll_write(a & ~1, d, cycles, sh2); } return; + case 0x20: // comm port + case 0x21: + case 0x22: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2a: + case 0x2b: + case 0x2c: + case 0x2d: + case 0x2e: + case 0x2f: + if (REG8IN16(r, a) != d) { + unsigned int cycles = sh2_cycles_done_m68k(sh2); + + REG8IN16(r, a) = d; + sh2_end_run(sh2, 1); + p32x_m68k_poll_event(P32XF_68KCPOLL); + p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); + sh2_poll_write(a & ~1, r[a / 2], cycles, sh2); + } + return; case 0x30: REG8IN16(r, a) = d & 0x0f; d = r[0x30 / 2]; @@ -837,20 +896,6 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) return; } - if ((a & 0x30) == 0x20) { - unsigned int cycles; - if (REG8IN16(r, a) == d) - return; - - REG8IN16(r, a) = d; - cycles = sh2_cycles_done_m68k(sh2); - sh2_end_run(sh2, 1); - p32x_m68k_poll_event(P32XF_68KCPOLL); - p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); - sh2_poll_write(a & ~1, r[a / 2], cycles, sh2); - return; - } - elprintf(EL_32X|EL_ANOMALY, 
"unhandled sysreg w8 [%02x] %02x @%08x", a, d, sh2_pc(sh2)); } @@ -861,49 +906,57 @@ static void p32x_sh2reg_write16(u32 a, u32 d, SH2 *sh2) sh2->poll_cnt = 0; - // comm - if ((a & 0x30) == 0x20) { - unsigned int cycles; - if (Pico32x.regs[a / 2] == d) - return; - - Pico32x.regs[a / 2] = d; - cycles = sh2_cycles_done_m68k(sh2); - sh2_end_run(sh2, 1); - p32x_m68k_poll_event(P32XF_68KCPOLL); - p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); - sh2_poll_write(a, d, cycles, sh2); - return; - } - // PWM - else if ((a & 0x30) == 0x30) { - p32x_pwm_write16(a, d, sh2, sh2_cycles_done_m68k(sh2)); - return; - } - - switch (a) { - case 0: // FM + switch (a/2) { + case 0x00/2: // FM Pico32x.regs[0] &= ~P32XS_FM; Pico32x.regs[0] |= d & P32XS_FM; break; - case 0x14: + case 0x14/2: Pico32x.sh2irqs &= ~P32XI_VRES; goto irls; - case 0x16: + case 0x16/2: Pico32x.sh2irqi[sh2->is_slave] &= ~P32XI_VINT; goto irls; - case 0x18: + case 0x18/2: Pico32x.sh2irqi[sh2->is_slave] &= ~P32XI_HINT; goto irls; - case 0x1a: + case 0x1a/2: Pico32x.regs[2 / 2] &= ~(1 << sh2->is_slave); p32x_update_cmd_irq(sh2, 0); return; - case 0x1c: + case 0x1c/2: p32x_pwm_sync_to_sh2(sh2); Pico32x.sh2irqi[sh2->is_slave] &= ~P32XI_PWM; p32x_pwm_schedule_sh2(sh2); goto irls; + case 0x20/2: // comm port + case 0x22/2: + case 0x24/2: + case 0x26/2: + case 0x28/2: + case 0x2a/2: + case 0x2c/2: + case 0x2e/2: + if (Pico32x.regs[a / 2] != d) { + unsigned int cycles = sh2_cycles_done_m68k(sh2); + + Pico32x.regs[a / 2] = d; + sh2_end_run(sh2, 1); + p32x_m68k_poll_event(P32XF_68KCPOLL); + p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); + sh2_poll_write(a, d, cycles, sh2); + } + return; + case 0x30/2: // PWM + case 0x32/2: + case 0x34/2: + case 0x36/2: + case 0x38/2: + case 0x3a/2: + case 0x3c/2: + case 0x3e/2: + p32x_pwm_write16(a, d, sh2, sh2_cycles_done_m68k(sh2)); + return; } p32x_sh2reg_write8(a | 1, d, sh2); @@ -1391,7 +1444,7 @@ static u32 REGPARM(2) sh2_read8_cs0(u32 a, SH2 *sh2) 
sh2_burn_cycles(sh2, 1*2); - // 0x3ffc0 is veridied + // 0x3ffc0 is verified if ((a & 0x3ffc0) == 0x4000) { d = p32x_sh2reg_read16(a, sh2); goto out_16to8; @@ -1573,6 +1626,11 @@ static void REGPARM(3) sh2_write8_cs0(u32 a, u32 d, SH2 *sh2) elprintf_sh2(sh2, EL_32X, "w8 [%08x] %02x @%06x", a, d & 0xff, sh2_pc(sh2)); + if ((a & 0x3ffc0) == 0x4000) { + p32x_sh2reg_write8(a, d, sh2); + goto out; + } + if (Pico32x.regs[0] & P32XS_FM) { if ((a & 0x3fff0) == 0x4100) { sh2->poll_cnt = 0; @@ -1588,11 +1646,6 @@ static void REGPARM(3) sh2_write8_cs0(u32 a, u32 d, SH2 *sh2) } } - if ((a & 0x3ffc0) == 0x4000) { - p32x_sh2reg_write8(a, d, sh2); - goto out; - } - sh2_write8_unmapped(a, d, sh2); out: DRC_RESTORE_SR(sh2); @@ -1647,6 +1700,11 @@ static void REGPARM(3) sh2_write16_cs0(u32 a, u32 d, SH2 *sh2) elprintf_sh2(sh2, EL_32X, "w16 [%08x] %04x @%06x", a, d & 0xffff, sh2_pc(sh2)); + if ((a & 0x3ffc0) == 0x4000) { + p32x_sh2reg_write16(a, d, sh2); + goto out; + } + if (Pico32x.regs[0] & P32XS_FM) { if ((a & 0x3fff0) == 0x4100) { sh2->poll_cnt = 0; @@ -1662,11 +1720,6 @@ static void REGPARM(3) sh2_write16_cs0(u32 a, u32 d, SH2 *sh2) } } - if ((a & 0x3ffc0) == 0x4000) { - p32x_sh2reg_write16(a, d, sh2); - goto out; - } - sh2_write16_unmapped(a, d, sh2); out: DRC_RESTORE_SR(sh2); diff --git a/pico/32x/memory_arm.S b/pico/32x/memory_arm.S index 43a019580..ba83a6bf4 100644 --- a/pico/32x/memory_arm.S +++ b/pico/32x/memory_arm.S @@ -18,6 +18,7 @@ .text +#if 0 @ u32 a, SH2 *sh2 .global sh2_read8_rom .global sh2_read8_sdram @@ -31,6 +32,7 @@ .global sh2_read32_sdram .global sh2_read32_da .global sh2_read32_dram +#endif @ u32 a, u32 d, SH2 *sh2 .global sh2_write8_sdram diff --git a/pico/32x/pwm.c b/pico/32x/pwm.c index 507356420..1c1ec4289 100644 --- a/pico/32x/pwm.c +++ b/pico/32x/pwm.c @@ -7,12 +7,15 @@ */ #include "../pico_int.h" -static int pwm_cycles; -static int pwm_mult; -static int pwm_ptr; -static int pwm_irq_reload; -static int pwm_doing_fifo; -static int pwm_silent; +static 
struct { + int cycles; + int mult; + int ptr; + int irq_reload; + int doing_fifo; + int silent; + short current[2]; +} pwm; void p32x_pwm_ctl_changed(void) { @@ -20,19 +23,19 @@ void p32x_pwm_ctl_changed(void) int cycles = Pico32x.regs[0x32 / 2]; cycles = (cycles - 1) & 0x0fff; - pwm_cycles = cycles; + pwm.cycles = cycles; // supposedly we should stop FIFO when xMd is 0, // but mars test disagrees - pwm_mult = 0; + pwm.mult = 0; if ((control & 0x0f) != 0) - pwm_mult = 0x10000 / cycles; + pwm.mult = 0x10000 / cycles; - pwm_irq_reload = (control & 0x0f00) >> 8; - pwm_irq_reload = ((pwm_irq_reload - 1) & 0x0f) + 1; + pwm.irq_reload = (control & 0x0f00) >> 8; + pwm.irq_reload = ((pwm.irq_reload - 1) & 0x0f) + 1; if (Pico32x.pwm_irq_cnt == 0) - Pico32x.pwm_irq_cnt = pwm_irq_reload; + Pico32x.pwm_irq_cnt = pwm.irq_reload; } static void do_pwm_irq(SH2 *sh2, unsigned int m68k_cycles) @@ -40,7 +43,7 @@ static void do_pwm_irq(SH2 *sh2, unsigned int m68k_cycles) p32x_trigger_irq(sh2, m68k_cycles, P32XI_PWM); if (Pico32x.regs[0x30 / 2] & P32XP_RTP) { - p32x_event_schedule(m68k_cycles, P32X_EVENT_PWM, pwm_cycles / 3 + 1); + p32x_event_schedule(m68k_cycles, P32X_EVENT_PWM, pwm.cycles / 3 + 1); // note: might recurse p32x_dreq1_trigger(); } @@ -50,14 +53,14 @@ static int convert_sample(unsigned int v) { if (v == 0) return 0; - if (v > pwm_cycles) - v = pwm_cycles; - return ((int)v - pwm_cycles / 2) * pwm_mult; + if (v > pwm.cycles) + v = pwm.cycles; + return (v * 2 - pwm.cycles) / 2 * pwm.mult; } #define consume_fifo(sh2, m68k_cycles) { \ int cycles_diff = ((m68k_cycles) * 3) - Pico32x.pwm_cycle_p; \ - if (cycles_diff >= pwm_cycles) \ + if (cycles_diff >= pwm.cycles) \ consume_fifo_do(sh2, m68k_cycles, cycles_diff); \ } @@ -69,67 +72,63 @@ static void consume_fifo_do(SH2 *sh2, unsigned int m68k_cycles, unsigned short *fifo_r = mem->pwm_fifo[1]; int sum = 0; - if (pwm_cycles == 0 || pwm_doing_fifo) + if (pwm.cycles == 0 || pwm.doing_fifo) return; elprintf(EL_PWM, "pwm: %u: consume 
%d/%d, %d,%d ptr %d", - m68k_cycles, sh2_cycles_diff, sh2_cycles_diff / pwm_cycles, - Pico32x.pwm_p[0], Pico32x.pwm_p[1], pwm_ptr); + m68k_cycles, sh2_cycles_diff, sh2_cycles_diff / pwm.cycles, + Pico32x.pwm_p[0], Pico32x.pwm_p[1], pwm.ptr); // this is for recursion from dreq1 writes - pwm_doing_fifo = 1; + pwm.doing_fifo = 1; - for (; sh2_cycles_diff >= pwm_cycles; sh2_cycles_diff -= pwm_cycles) + for (; sh2_cycles_diff >= pwm.cycles; sh2_cycles_diff -= pwm.cycles) { if (Pico32x.pwm_p[0] > 0) { - fifo_l[0] = fifo_l[1]; - fifo_l[1] = fifo_l[2]; - fifo_l[2] = fifo_l[3]; + mem->pwm_index[0] = (mem->pwm_index[0]+1) % 4; Pico32x.pwm_p[0]--; - mem->pwm_current[0] = convert_sample(fifo_l[0]); - sum += mem->pwm_current[0]; + pwm.current[0] = convert_sample(fifo_l[mem->pwm_index[0]]); + sum |=pwm.current[0]; } if (Pico32x.pwm_p[1] > 0) { - fifo_r[0] = fifo_r[1]; - fifo_r[1] = fifo_r[2]; - fifo_r[2] = fifo_r[3]; + mem->pwm_index[1] = (mem->pwm_index[1]+1) % 4; Pico32x.pwm_p[1]--; - mem->pwm_current[1] = convert_sample(fifo_r[0]); - sum += mem->pwm_current[1]; + pwm.current[1] = convert_sample(fifo_r[mem->pwm_index[1]]); + sum |= pwm.current[1]; } - mem->pwm[pwm_ptr * 2 ] = mem->pwm_current[0]; - mem->pwm[pwm_ptr * 2 + 1] = mem->pwm_current[1]; - pwm_ptr = (pwm_ptr + 1) & (PWM_BUFF_LEN - 1); + mem->pwm[pwm.ptr * 2 ] = pwm.current[0]; + mem->pwm[pwm.ptr * 2 + 1] = pwm.current[1]; + pwm.ptr = (pwm.ptr + 1) & (PWM_BUFF_LEN - 1); if (--Pico32x.pwm_irq_cnt == 0) { - Pico32x.pwm_irq_cnt = pwm_irq_reload; + Pico32x.pwm_irq_cnt = pwm.irq_reload; do_pwm_irq(sh2, m68k_cycles); } } Pico32x.pwm_cycle_p = m68k_cycles * 3 - sh2_cycles_diff; - pwm_doing_fifo = 0; + pwm.doing_fifo = 0; if (sum != 0) - pwm_silent = 0; + pwm.silent = 0; } static int p32x_pwm_schedule_(SH2 *sh2, unsigned int m68k_now) { - unsigned int sh2_now = m68k_now * 3; + unsigned int pwm_now = m68k_now * 3; int cycles_diff_sh2; - if (pwm_cycles == 0) + if (pwm.cycles == 0) return 0; - cycles_diff_sh2 = sh2_now - 
Pico32x.pwm_cycle_p; - if (cycles_diff_sh2 >= pwm_cycles) + cycles_diff_sh2 = pwm_now - Pico32x.pwm_cycle_p; + if (cycles_diff_sh2 >= pwm.cycles) consume_fifo_do(sh2, m68k_now, cycles_diff_sh2); if (!((Pico32x.sh2irq_mask[0] | Pico32x.sh2irq_mask[1]) & 1)) return 0; // masked by everyone - cycles_diff_sh2 = sh2_now - Pico32x.pwm_cycle_p; - return (Pico32x.pwm_irq_cnt * pwm_cycles + cycles_diff_sh2 = pwm_now - Pico32x.pwm_cycle_p; + return (Pico32x.pwm_irq_cnt * pwm.cycles - cycles_diff_sh2) / 3 + 1; } @@ -166,21 +165,21 @@ unsigned int p32x_pwm_read16(unsigned int a, SH2 *sh2, consume_fifo(sh2, m68k_cycles); a &= 0x0e; - switch (a) { - case 0: // control - case 2: // cycle + switch (a/2) { + case 0/2: // control + case 2/2: // cycle d = Pico32x.regs[(0x30 + a) / 2]; break; - case 4: // L ch + case 4/2: // L ch if (Pico32x.pwm_p[0] == 3) d |= P32XP_FULL; else if (Pico32x.pwm_p[0] == 0) d |= P32XP_EMPTY; break; - case 6: // R ch - case 8: // MONO + case 6/2: // R ch + case 8/2: // MONO if (Pico32x.pwm_p[1] == 3) d |= P32XP_FULL; else if (Pico32x.pwm_p[1] == 0) @@ -196,47 +195,53 @@ unsigned int p32x_pwm_read16(unsigned int a, SH2 *sh2, void p32x_pwm_write16(unsigned int a, unsigned int d, SH2 *sh2, unsigned int m68k_cycles) { + unsigned short *fifo; + int idx; + elprintf(EL_PWM, "pwm: %u: w16 %02x %04x (p %d %d)", m68k_cycles, a & 0x0e, d, Pico32x.pwm_p[0], Pico32x.pwm_p[1]); consume_fifo(sh2, m68k_cycles); a &= 0x0e; - if (a == 0) { // control - // avoiding pops.. - if ((Pico32x.regs[0x30 / 2] & 0x0f) == 0) - Pico32xMem->pwm_fifo[0][0] = Pico32xMem->pwm_fifo[1][0] = 0; - Pico32x.regs[0x30 / 2] = d; - p32x_pwm_ctl_changed(); - Pico32x.pwm_irq_cnt = pwm_irq_reload; // ? 
- } - else if (a == 2) { // cycle - Pico32x.regs[0x32 / 2] = d & 0x0fff; - p32x_pwm_ctl_changed(); - } - else if (a <= 8) { - d = (d - 1) & 0x0fff; - - if (a == 4 || a == 8) { // L ch or MONO - unsigned short *fifo = Pico32xMem->pwm_fifo[0]; - if (Pico32x.pwm_p[0] < 3) - Pico32x.pwm_p[0]++; - else { - fifo[1] = fifo[2]; - fifo[2] = fifo[3]; - } - fifo[Pico32x.pwm_p[0]] = d; - } - if (a == 6 || a == 8) { // R ch or MONO - unsigned short *fifo = Pico32xMem->pwm_fifo[1]; + switch (a/2) { + case 0/2: // control + // avoiding pops.. + if ((Pico32x.regs[0x30 / 2] & 0x0f) == 0) + Pico32xMem->pwm_fifo[0][0] = Pico32xMem->pwm_fifo[1][0] = 0; + Pico32x.regs[0x30 / 2] = d; + p32x_pwm_ctl_changed(); + Pico32x.pwm_irq_cnt = pwm.irq_reload; // ? + break; + case 2/2: // cycle + Pico32x.regs[0x32 / 2] = d & 0x0fff; + p32x_pwm_ctl_changed(); + break; + case 8/2: // MONO + case 6/2: // R ch + fifo = Pico32xMem->pwm_fifo[1]; + idx = Pico32xMem->pwm_index[1]; if (Pico32x.pwm_p[1] < 3) Pico32x.pwm_p[1]++; else { - fifo[1] = fifo[2]; - fifo[2] = fifo[3]; +// fifo[(idx+1) % 4] = fifo[idx]; + idx = (idx+1) % 4; + Pico32xMem->pwm_index[0] = idx; } - fifo[Pico32x.pwm_p[1]] = d; - } + fifo[(idx+Pico32x.pwm_p[1]) % 4] = (d - 1) & 0x0fff; + if (a != 8) break; // fallthrough if MONO + case 4/2: // L ch + fifo = Pico32xMem->pwm_fifo[0]; + idx = Pico32xMem->pwm_index[0]; + if (Pico32x.pwm_p[0] < 3) + Pico32x.pwm_p[0]++; + else { +// fifo[(idx+1) % 4] = fifo[idx]; + idx = (idx+1) % 4; + Pico32xMem->pwm_index[0] = idx; + } + fifo[(idx+Pico32x.pwm_p[0]) % 4] = (d - 1) & 0x0fff; + break; } } @@ -252,10 +257,10 @@ void p32x_pwm_update(int *buf32, int length, int stereo) xmd = Pico32x.regs[0x30 / 2] & 0x0f; if (xmd == 0 || xmd == 0x06 || xmd == 0x09 || xmd == 0x0f) goto out; // invalid? 
- if (pwm_silent) + if (pwm.silent) return; - step = (pwm_ptr << 16) / length; + step = (pwm.ptr << 16) / length; pwmb = Pico32xMem->pwm; if (stereo) @@ -310,13 +315,12 @@ void p32x_pwm_update(int *buf32, int length, int stereo) } } - elprintf(EL_PWM, "pwm_update: pwm_ptr %d, len %d, step %04x, done %d", - pwm_ptr, length, step, (pwmb - Pico32xMem->pwm) / 2); + elprintf(EL_PWM, "pwm_update: pwm.ptr %d, len %d, step %04x, done %d", + pwm.ptr, length, step, (pwmb - Pico32xMem->pwm) / 2); out: - pwm_ptr = 0; - pwm_silent = Pico32xMem->pwm_current[0] == 0 - && Pico32xMem->pwm_current[1] == 0; + pwm.ptr = 0; + pwm.silent = pwm.current[0] == 0 && pwm.current[1] == 0; } void p32x_pwm_state_loaded(void) @@ -327,8 +331,8 @@ void p32x_pwm_state_loaded(void) // for old savestates cycles_diff_sh2 = Pico.t.m68c_cnt * 3 - Pico32x.pwm_cycle_p; - if (cycles_diff_sh2 >= pwm_cycles || cycles_diff_sh2 < 0) { - Pico32x.pwm_irq_cnt = pwm_irq_reload; + if (cycles_diff_sh2 >= pwm.cycles || cycles_diff_sh2 < 0) { + Pico32x.pwm_irq_cnt = pwm.irq_reload; Pico32x.pwm_cycle_p = Pico.t.m68c_cnt * 3; p32x_pwm_schedule(Pico.t.m68c_cnt); } diff --git a/pico/32x/sh2soc.c b/pico/32x/sh2soc.c index 1f19150e4..2b5a126c9 100644 --- a/pico/32x/sh2soc.c +++ b/pico/32x/sh2soc.c @@ -399,6 +399,7 @@ void REGPARM(3) sh2_peripheral_write32(u32 a, u32 d, SH2 *sh2) { u32 *r = sh2->peri_regs; u32 old; + struct dmac *dmac; elprintf_sh2(sh2, EL_32XP, "peri w32 [%08x] %08x @%06x", a, d, sh2_pc(sh2)); @@ -439,22 +440,23 @@ void REGPARM(3) sh2_peripheral_write32(u32 a, u32 d, SH2 *sh2) else r[0x110 / 4] = r[0x114 / 4] = r[0x118 / 4] = r[0x11c / 4] = 0; // ? break; - } - - // perhaps starting a DMA? 
- if (a == 0x1b0 || a == 0x18c || a == 0x19c) { - struct dmac *dmac = (void *)&sh2->peri_regs[0x180 / 4]; - if (a == 0x1b0 && !((old ^ d) & d & DMA_DME)) - return; - if (!(dmac->dmaor & DMA_DME)) - return; - - DRC_SAVE_SR(sh2); - if ((dmac->chan[0].chcr & (DMA_TE|DMA_DE)) == DMA_DE) - dmac_trigger(sh2, &dmac->chan[0]); - if ((dmac->chan[1].chcr & (DMA_TE|DMA_DE)) == DMA_DE) - dmac_trigger(sh2, &dmac->chan[1]); - DRC_RESTORE_SR(sh2); + // perhaps starting a DMA? + case 0x18c: + case 0x19c: + case 0x1b0: + dmac = (void *)&sh2->peri_regs[0x180 / 4]; + if (a == 0x1b0 && !((old ^ d) & d & DMA_DME)) + return; + if (!(dmac->dmaor & DMA_DME)) + return; + + DRC_SAVE_SR(sh2); + if ((dmac->chan[0].chcr & (DMA_TE|DMA_DE)) == DMA_DE) + dmac_trigger(sh2, &dmac->chan[0]); + if ((dmac->chan[1].chcr & (DMA_TE|DMA_DE)) == DMA_DE) + dmac_trigger(sh2, &dmac->chan[1]); + DRC_RESTORE_SR(sh2); + break; } } diff --git a/pico/draw.c b/pico/draw.c index 984f5cd7d..06c54807b 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -1347,8 +1347,6 @@ void FinalizeLine555(int sh, int line, struct PicoEState *est) *pd++ = pal[*ps++]; *pd++ = pal[*ps++]; } -// for (i = 0; i < len; i++) -// pd[i] = pal[ps[i]]; #else extern void amips_clut(unsigned short *dst, unsigned char *src, unsigned short *pal, int count); extern void amips_clut_6bit(unsigned short *dst, unsigned char *src, unsigned short *pal, int count); diff --git a/pico/pico_int.h b/pico/pico_int.h index 36b36144d..89acc4fbb 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -630,8 +630,8 @@ struct Pico32xMem unsigned short pal[0x100]; unsigned short pal_native[0x100]; // converted to native (for renderer) signed short pwm[2*PWM_BUFF_LEN]; // PWM buffer for current frame - signed short pwm_current[2]; // current converted samples unsigned short pwm_fifo[2][4]; // [0] - current raw, others - fifo entries + unsigned pwm_index[2]; // ringbuffer index for pwm_fifo }; // area.c From 97706c3ee068a0577c0da4a316b5a15de58180c0 Mon Sep 17 00:00:00 2001 
From: kub Date: Tue, 17 Sep 2019 22:48:32 +0200 Subject: [PATCH 059/174] various smallish optimizations, cleanups, and bug fixes --- Makefile | 4 +++- cpu/drc/emit_arm.c | 48 +++++++++++++++++++++++++++++++++++++++++--- cpu/drc/emit_arm64.c | 4 ++++ cpu/drc/emit_mips.c | 40 ++++++++++++++++++++++-------------- cpu/drc/emit_x86.c | 4 ++-- cpu/sh2/compiler.c | 24 +++++++++++++++++++--- cpu/sh2/compiler.h | 16 +++++++-------- pico/32x/32x.c | 4 ++-- 8 files changed, 109 insertions(+), 35 deletions(-) diff --git a/Makefile b/Makefile index 7c78b19de..e0ce4fd09 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,9 @@ endif ifeq ("$(PLATFORM)",$(filter "$(PLATFORM)","gp2x" "opendingux" "rpi1")) # very small caches, avoid optimization options making the binary much bigger -CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp -fno-common -fno-stack-protector -ffast-math +CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp +# this gets you about 20% better execution speed on 32bit arm/mips +CFLAGS += -fno-common -fno-stack-protector -fno-guess-branch-probability -fno-caller-saves -fno-tree-loop-if-convert -ffast-math endif # default settings diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 1d70866cb..66a5b065b 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -36,6 +36,47 @@ #define M5(x,y,z,a,b) (M4(x,y,z,a)|M1(b)) #define M10(a,b,c,d,e,f,g,h,i,j) (M5(a,b,c,d,e)|M5(f,g,h,i,j)) +// sys_cacheflush always flushes whole pages, and it's rather expensive on ARMs +// hold a list of pending cache updates and merge requests to reduce cacheflush +static struct { void *base, *end; } pageflush[4]; +static unsigned pagesize = 4096; + +static void emith_update_cache(void) +{ + int i; + + for (i = 0; i < 4 && pageflush[i].base; i++) { + cache_flush_d_inval_i(pageflush[i].base, pageflush[i].end + pagesize-1); + pageflush[i].base = NULL; + } +} + +static inline void emith_update_add(void *base, void *end) +{ + void *p_base = (void *)((uintptr_t)(base) & 
~(pagesize-1)); + void *p_end = (void *)((uintptr_t)(end ) & ~(pagesize-1)); + int i; + + for (i = 0; i < 4 && pageflush[i].base; i++) { + if (p_base <= pageflush[i].end+pagesize && p_end >= pageflush[i].end) { + if (p_base < pageflush[i].base) pageflush[i].base = p_base; + pageflush[i].end = p_end; + return; + } + if (p_base <= pageflush[i].base && p_end >= pageflush[i].base-pagesize) { + if (p_end > pageflush[i].end) pageflush[i].end = p_end; + pageflush[i].base = p_base; + return; + } + } + if (i == 4) { + /* list full and not mergeable -> flush list */ + emith_update_cache(); + i = 0; + } + pageflush[i].base = p_base, pageflush[i].end = p_end; +} + // peephole optimizer. ATM only tries to reduce interlock #define EMIT_CACHE_SIZE 3 struct emit_op { @@ -48,8 +89,8 @@ static struct emit_op emit_cache[EMIT_CACHE_SIZE+3]; static int emit_index; #define emith_insn_ptr() (u8 *)((u32 *)tcache_ptr-emit_index) -static int emith_pool_index(int tcache_offs); -static void emith_pool_adjust(int pool_index, int move_offs); +static inline int emith_pool_index(int tcache_offs); +static inline void emith_pool_adjust(int pool_index, int move_offs); static NOINLINE void EMIT(u32 op, u32 dst, u32 src) { @@ -1106,6 +1147,7 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) (u8 *)ptr; \ }) +#define emith_jump_cond_inrange(target) !0 #define emith_jump_patch_size() 4 #define emith_jump_at(ptr, target) do { \ @@ -1170,7 +1212,7 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) } while (0) #define host_instructions_updated(base, end) \ - cache_flush_d_inval_i(base, end) + emith_update_add(base, end) #define host_arg2reg(rd, arg) \ rd = arg diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index de5876193..8ce2ef382 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -1038,6 +1038,9 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #define emith_jump_cond_patchable(cond, target) \ 
emith_bcond(tcache_ptr, 1, cond, target) +#define emith_jump_cond_inrange(target) \ + !(((u8 *)target - (u8 *)tcache_ptr + 0x100000) >> 21) /* B.cond reach is +/-1MB */ + #define emith_jump_patch(ptr, target) ({ \ u32 *ptr_ = (u32 *)ptr; \ u32 disp_ = (u8 *)(target) - (u8 *)(ptr_); \ @@ -1116,6 +1119,7 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #define emith_insn_ptr() ((u8 *)tcache_ptr) #define emith_flush() /**/ #define host_instructions_updated(base, end) __builtin___clear_cache(base, end) +#define emith_update_cache() /**/ #define emith_jump_patch_size() 8 #define emith_rw_offs_max() 0xff diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index e200db0a4..0e85f92a7 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -209,20 +209,25 @@ enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; // FIFO for 2 instructions, for delay slot handling u32 emith_last_insns[2] = { -1,-1 }; -int emith_last_idx; +int emith_last_idx, emith_last_cnt; #define EMIT_PUSHOP() \ do { \ emith_last_idx ^= 1; \ - if (emith_last_insns[emith_last_idx] != -1) \ - EMIT_PTR(tcache_ptr, emith_last_insns[emith_last_idx]);\ + if (emith_last_insns[emith_last_idx] != -1) { \ + u32 *p = (u32 *)tcache_ptr - emith_last_cnt; \ + EMIT_PTR(p, emith_last_insns[emith_last_idx]);\ + emith_last_cnt --; \ + } \ emith_last_insns[emith_last_idx] = -1; \ } while (0) #define EMIT(op) \ do { \ EMIT_PUSHOP(); \ + tcache_ptr = (void *)((u32 *)tcache_ptr + 1); \ emith_last_insns[emith_last_idx] = op; \ + emith_last_cnt ++; \ COUNT_OP; \ } while (0) @@ -231,8 +236,7 @@ int emith_last_idx; int i; for (i = 0; i < 2; i++) EMIT_PUSHOP(); \ } while (0) -#define emith_insn_ptr() (u8 *)((u32 *)tcache_ptr + \ - (emith_last_insns[0] != -1) + (emith_last_insns[1] != -1)) +#define emith_insn_ptr() (u8 *)((u32 *)tcache_ptr - emith_last_cnt) // delay slot stuff static int emith_is_j(u32 op) // J, JAL @@ -305,12 +309,14 @@ static void *emith_branch(u32 op) } if (bop) { // can swap + 
tcache_ptr = (void *)((u32 *)tcache_ptr - emith_last_cnt); if (emith_last_insns[idx^1] != -1) EMIT_PTR(tcache_ptr, emith_last_insns[idx^1]); bp = tcache_ptr; EMIT_PTR(tcache_ptr, bop); COUNT_OP; EMIT_PTR(tcache_ptr, emith_last_insns[idx]); emith_last_insns[0] = emith_last_insns[1] = -1; + emith_last_cnt = 0; } else { // can't swap emith_flush(); bp = tcache_ptr; @@ -325,13 +331,13 @@ static void *emith_branch(u32 op) ptr = emith_branch(MIPS_BCONDZ(cond_m, cond_r, 0)); #define JMP_EMIT(cond, ptr) { \ - u32 val_ = emith_insn_ptr() - (u8 *)(ptr) - 4; \ + u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr) - 4; \ EMIT_PTR(ptr, MIPS_BCONDZ(cond_m, cond_r, val_ & 0x0003ffff)); \ emith_flush(); /* NO delay slot handling across jump targets */ \ } #define JMP_EMIT_NC(ptr) { \ - u32 val_ = emith_insn_ptr() - (u8 *)(ptr) - 4; \ + u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr) - 4; \ EMIT_PTR(ptr, MIPS_B(val_ & 0x0003ffff)); \ emith_flush(); \ } @@ -881,14 +887,14 @@ static u8 *last_lohi; static void emith_lohi_nops(void) { u32 d; - while ((d = emith_insn_ptr() - last_lohi) < 8 && d >= 0) EMIT(MIPS_NOP); + while ((d = (u8 *)tcache_ptr - last_lohi) < 8 && d >= 0) EMIT(MIPS_NOP); } #define emith_mul(d, s1, s2) do { \ emith_lohi_nops(); \ EMIT(MIPS_MULTU(s1, s2)); \ EMIT(MIPS_MFLO(d)); \ - last_lohi = emith_insn_ptr(); \ + last_lohi = (u8 *)tcache_ptr; \ } while (0) #define emith_mul_u64(dlo, dhi, s1, s2) do { \ @@ -896,7 +902,7 @@ static void emith_lohi_nops(void) EMIT(MIPS_MULTU(s1, s2)); \ EMIT(MIPS_MFLO(dlo)); \ EMIT(MIPS_MFHI(dhi)); \ - last_lohi = emith_insn_ptr(); \ + last_lohi = (u8 *)tcache_ptr; \ } while (0) #define emith_mul_s64(dlo, dhi, s1, s2) do { \ @@ -904,7 +910,7 @@ static void emith_lohi_nops(void) EMIT(MIPS_MULT(s1, s2)); \ EMIT(MIPS_MFLO(dlo)); \ EMIT(MIPS_MFHI(dhi)); \ - last_lohi = emith_insn_ptr(); \ + last_lohi = (u8 *)tcache_ptr; \ } while (0) #define emith_mula_s64(dlo, dhi, s1, s2) do { \ @@ -915,7 +921,7 @@ static void emith_lohi_nops(void) emith_add_r_r(dlo, AT); 
\ EMIT(MIPS_SLTU_REG(t_, dlo, AT)); \ EMIT(MIPS_MFHI(AT)); \ - last_lohi = emith_insn_ptr(); \ + last_lohi = (u8 *)tcache_ptr; \ emith_add_r_r(dhi, AT); \ emith_add_r_r(dhi, t_); \ rcache_free_tmp(t_); \ @@ -1174,14 +1180,14 @@ static int emith_cond_check(int cond, int *r) // NB: MIPS conditional branches have only +/- 128KB range #define emith_jump_cond(cond, target) do { \ int r_, mcond_ = emith_cond_check(cond, &r_); \ - u32 disp_ = (u8 *)target - emith_insn_ptr() - 4; \ + u32 disp_ = (u8 *)target - (u8 *)tcache_ptr - 4; \ if (disp_ >= 0xfffe0000 || disp_ <= 0x0001ffff) { /* can use near B */ \ emith_branch(MIPS_BCONDZ(mcond_,r_,disp_ & 0x0003ffff)); \ } else { /* far branch if near branch isn't possible */ \ mcond_ = emith_invert_branch(mcond_); \ u8 *bp = emith_branch(MIPS_BCONDZ(mcond_, r_, 0)); \ emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)); \ - EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, emith_insn_ptr()-bp-4)); \ + EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, (u8 *)tcache_ptr-bp-4)); \ } \ } while (0) @@ -1190,9 +1196,12 @@ static int emith_cond_check(int cond, int *r) mcond_ = emith_invert_branch(mcond_); \ u8 *bp = emith_branch(MIPS_BCONDZ(mcond_, r_, 0));\ emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)); \ - EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, emith_insn_ptr()-bp-4)); \ + EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, (u8 *)tcache_ptr-bp-4)); \ } while (0) +#define emith_jump_cond_inrange(target) \ + !(((u8 *)target - (u8 *)tcache_ptr + 0x10000) >> 18) + // NB: returns position of patch for cache maintenance #define emith_jump_patch(ptr, target) ({ \ u32 *ptr_ = (u32 *)ptr-1; /* must skip condition check code */ \ @@ -1261,6 +1270,7 @@ static int emith_cond_check(int cond, int *r) #define emith_pool_commit(j) /**/ // NB: mips32r2 has SYNCI #define host_instructions_updated(base, end) __builtin___clear_cache(base, end) +#define emith_update_cache() /**/ #define emith_jump_patch_size() 4 #define emith_rw_offs_max() 0x7fff diff --git a/cpu/drc/emit_x86.c 
b/cpu/drc/emit_x86.c index d515cd238..caade3a67 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -877,6 +877,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common ptr; \ }) +#define emith_jump_cond_inrange(ptr) !0 #define emith_jump_patch_size() 6 #define emith_jump_at(ptr, target) do { \ @@ -986,6 +987,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common } while (0) #define host_instructions_updated(base, end) +#define emith_update_cache() /**/ #define emith_rw_offs_max() 0xffffffff @@ -993,7 +995,6 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define HOST_REGS 16 #define PTR_SCALE 3 -#define NA_TMP_REG xAX // non-arg tmp from reg_temp[] #define EMIT_XREX_IF(w, r, rm, rs) do { \ int xr_ = (r) > 7 ? 1 : 0; \ @@ -1078,7 +1079,6 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define HOST_REGS 8 #define PTR_SCALE 2 -#define NA_TMP_REG xBX // non-arg tmp from reg_temp[] #define EMIT_REX_IF(w, r, rm) do { \ assert((u32)(r) < 8u); \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 677c8adf9..6eaf71232 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -2920,6 +2920,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // mark memory for overwrite detection dr_mark_memory(1, block, tcache_id, 0); block->active = 1; + emith_update_cache(); return block->entryp[0].tcache_ptr; } @@ -3113,8 +3114,15 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp = rcache_get_tmp_arg(0); sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); emith_cmp_r_imm(sr, 0); - emith_move_r_imm_c(DCOND_LE, tmp, pc); - emith_jump_cond(DCOND_LE, sh2_drc_exit); + if (emith_jump_cond_inrange(sh2_drc_exit)) { + emith_move_r_imm_c(DCOND_LE, tmp, pc); + emith_jump_cond(DCOND_LE, sh2_drc_exit); + } else { + EMITH_JMP_START(DCOND_GT); + emith_move_r_imm(tmp, pc); + emith_jump(sh2_drc_exit); + EMITH_JMP_END(DCOND_GT); + } rcache_free_tmp(tmp); #if 
(DRC_DEBUG & 32) @@ -3249,7 +3257,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } } rcache_set_usage_now(opd[0].source); // current insn - rcache_set_usage_soon(late); // insns 1-3 + rcache_set_usage_soon(soon); // insns 1-3 rcache_set_usage_late(late & ~soon); // insns 4-9 rcache_set_usage_discard(write & ~(late|soon) & ~opd[0].source); @@ -4442,12 +4450,16 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) fflush(stdout); #endif + emith_update_cache(); return block_entry_ptr; } static void sh2_generate_utils(void) { int arg0, arg1, arg2, arg3, sr, tmp, tmp2; +#if DRC_DEBUG + int hic = host_insn_count; // don't count utils for insn statistics +#endif host_arg2reg(arg0, 0); host_arg2reg(arg1, 1); @@ -4794,6 +4806,10 @@ static void sh2_generate_utils(void) host_dasm_new_symbol(sh2_drc_read16_poll); host_dasm_new_symbol(sh2_drc_read32_poll); #endif + +#if DRC_DEBUG + host_insn_count = hic; +#endif } static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nolit, int free) @@ -4847,6 +4863,7 @@ static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nol bd->addr = bd->size = bd->addr_lit = bd->size_lit = 0; bd->entry_count = 0; } + emith_update_cache(); } static void sh2_smc_rm_blocks(u32 a, int tcache_id, u32 shift) @@ -5197,6 +5214,7 @@ int sh2_drc_init(SH2 *sh2) tcache_ptr = tcache; sh2_generate_utils(); host_instructions_updated(tcache, tcache_ptr); + emith_update_cache(); tcache_bases[0] = tcache_ptrs[0] = tcache_ptr; tcache_limit[0] = tcache_bases[0] + tcache_sizes[0] - (tcache_ptr-tcache); diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 1ad922b79..187ad716f 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -33,26 +33,24 @@ unsigned short scan_block(unsigned int base_pc, int is_slave, #if defined(DRC_SH2) // direct access to some host CPU registers used by the DRC -// XXX MUST match definitions in cpu/sh2/compiler.c +// XXX MUST match definitions for SHR_SR in 
cpu/sh2/compiler.c #if defined(__arm__) -#define DRC_SR_REG r10 +#define DRC_SR_REG "r10" #elif defined(__aarch64__) -#define DRC_SR_REG r22 +#define DRC_SR_REG "r22" #elif defined(__mips__) -#define DRC_SR_REG s6 +#define DRC_SR_REG "s6" #elif defined(__i386__) -#define DRC_SR_REG edi +#define DRC_SR_REG "edi" #elif defined(__x86_64__) -#define DRC_SR_REG ebx +#define DRC_SR_REG "ebx" #else #warning "direct DRC register access not available for this host" #endif #endif #ifdef DRC_SR_REG -#define __DRC_DECLARE_SR(SR) register int sh2_sr asm(#SR) -#define _DRC_DECLARE_SR(SR) __DRC_DECLARE_SR(SR) -#define DRC_DECLARE_SR _DRC_DECLARE_SR(DRC_SR_REG) +#define DRC_DECLARE_SR register int sh2_sr asm(DRC_SR_REG) #define DRC_SAVE_SR(sh2) \ if ((sh2->state & (SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN) \ sh2->sr = sh2_sr; diff --git a/pico/32x/32x.c b/pico/32x/32x.c index e9d8ff6d2..f6d1a153f 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -471,7 +471,7 @@ void sync_sh2s_normal(unsigned int m68k_target) if (!(ssh2.state & SH2_IDLE_STATES)) { cycles = target - ssh2.m68krcycles_done; if (cycles > 0) { - run_sh2(&ssh2, cycles > 20 ? cycles : 20); + run_sh2(&ssh2, cycles > 20U ? cycles : 20U); if (event_time_next && CYCLES_GT(target, event_time_next)) target = event_time_next; @@ -483,7 +483,7 @@ void sync_sh2s_normal(unsigned int m68k_target) if (!(msh2.state & SH2_IDLE_STATES)) { cycles = target - msh2.m68krcycles_done; if (cycles > 0) { - run_sh2(&msh2, cycles > 20 ? cycles : 20); + run_sh2(&msh2, cycles > 20U ? 
cycles : 20U); if (event_time_next && CYCLES_GT(target, event_time_next)) target = event_time_next; From 5ae77d6e7367f80c473c08786b08569396c18a27 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 17 Sep 2019 23:02:05 +0200 Subject: [PATCH 060/174] sh2 drc: rework of register cache to implement basic loop optmization --- cpu/sh2/compiler.c | 608 +++++++++++++++++++++++++++------------------ cpu/sh2/compiler.h | 3 +- 2 files changed, 370 insertions(+), 241 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 6eaf71232..f6fbadaf4 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -44,6 +44,7 @@ #define ALIAS_REGISTERS 1 #define REMAP_REGISTER 1 #define LOOP_DETECTION 1 +#define LOOP_OPTIMIZER 1 // limits (per block) #define MAX_BLOCK_SIZE (BLOCK_INSN_LIMIT * 6 * 6) @@ -376,36 +377,41 @@ int rchit, rcmiss; #endif // host register tracking -enum { +enum cache_reg_htype { + HRT_TEMP = 1, // is for temps and args + HRT_REG = 2, // is for sh2 regs + HRT_STATIC = 2, // is for static mappings (same as HRT_REG) +}; + +enum cache_reg_flags { + HRF_DIRTY = 1 << 0, // has "dirty" value to be written to ctx + HRF_PINNED = 1 << 1, // has a pinned mapping +}; + +enum cache_reg_type { HR_FREE, - HR_STATIC, // vreg has a static mapping HR_CACHED, // vreg has sh2_reg_e HR_TEMP, // reg used for temp storage -} cache_reg_type; - -enum { - HRF_DIRTY = 1 << 0, // has "dirty" value to be written to ctx - HRF_LOCKED = 1 << 1, // can't be evicted - HRF_TEMP = 1 << 2, // is for temps and args - HRF_REG = 1 << 3, // is for sh2 regs -} cache_reg_flags; +}; typedef struct { u8 hreg; // "host" reg - u8 flags:4; // TEMP or REG? + u8 htype:2; // TEMP or REG? + u8 flags:2; // DIRTY, PINNED? u8 type:2; // CACHED or TEMP? 
- u8 ref:2; // ref counter + u8 locked:2; // LOCKED reference counter u16 stamp; // kind of a timestamp u32 gregs; // "guest" reg mask } cache_reg_t; // guest register tracking -enum { +enum guest_reg_flags { GRF_DIRTY = 1 << 0, // reg has "dirty" value to be written to ctx GRF_CONST = 1 << 1, // reg has a constant GRF_CDIRTY = 1 << 2, // constant not yet written to ctx GRF_STATIC = 1 << 3, // reg has static mapping to vreg -} guest_reg_flags; + GRF_PINNED = 1 << 4, // reg has pinned mapping to vreg +}; typedef struct { u8 flags; // guest flags: is constant, is dirty? @@ -419,13 +425,14 @@ typedef struct { static int rcache_get_tmp(void); static void rcache_free_tmp(int hr); -// Note: cache_regs[] must have at least the amount of HRF_REG registers used +// Note: cache_regs[] must have at least the amount of REG/TEMP registers used // by handlers in worst case (currently 4). // Register assignment goes by ABI convention. Caller save registers are TEMP, // the others are either static or REG. SR must be static, R0 very recommended. +// XXX the static definition of SR MUST match that in compiler.h // VBR, PC, PR must not be static (read from context in utils). -// TEMP registers first, REG last. alloc/evict algorithm depends on this. -// The 1st TEMP must not be RET_REG on platforms using temps in insns (eg. x86). +// RET_REG/params should be first TEMPs to avoid allocation conflicts in calls. +// There MUST be at least 3 params and one non-RET_REG/param TEMP. // XXX shouldn't this be somehow defined in the code emitters? #ifdef __arm__ #include "../drc/emit_arm.c" @@ -449,21 +456,21 @@ static guest_reg_t guest_regs[] = { // OABI/EABI: params: r0-r3, return: r0-r1, temp: r12,r14, saved: r4-r8,r10,r11 // SP,PC: r13,r15 must not be used. saved: r9 (for platform use, e.g. 
on ios) static cache_reg_t cache_regs[] = { - { 12, HRF_TEMP }, // temps - { 14, HRF_TEMP }, - { 3, HRF_TEMP }, // params - { 2, HRF_TEMP }, - { 1, HRF_TEMP }, - { 0, HRF_TEMP }, // RET_REG - { 8, HRF_LOCKED }, // statics + { 0, HRT_TEMP }, // RET_REG, params + { 1, HRT_TEMP }, + { 2, HRT_TEMP }, // params + { 3, HRT_TEMP }, + { 12, HRT_TEMP }, // temps + { 14, HRT_TEMP }, + { 8, HRT_STATIC }, // statics #ifndef __MACH__ // no r9.. - { 9, HRF_LOCKED }, + { 9, HRT_STATIC }, #endif - { 10, HRF_LOCKED }, - { 4, HRF_REG }, // other regs - { 5, HRF_REG }, - { 6, HRF_REG }, - { 7, HRF_REG }, + { 10, HRT_STATIC }, + { 4, HRT_REG }, // other regs + { 5, HRT_REG }, + { 6, HRT_REG }, + { 7, HRT_REG }, }; #elif defined(__aarch64__) @@ -485,35 +492,34 @@ static guest_reg_t guest_regs[] = { // saved: r18 (for platform use) // since drc never needs more than 4 parameters, r4-r7 are treated as temp. static cache_reg_t cache_regs[] = { - { 17, HRF_TEMP }, // temps - { 16, HRF_TEMP }, - { 15, HRF_TEMP }, - { 14, HRF_TEMP }, - { 13, HRF_TEMP }, - { 12, HRF_TEMP }, - { 11, HRF_TEMP }, - { 10, HRF_TEMP }, - { 9, HRF_TEMP }, - { 8, HRF_TEMP }, - { 7, HRF_TEMP }, - { 6, HRF_TEMP }, - { 5, HRF_TEMP }, - { 4, HRF_TEMP }, - { 3, HRF_TEMP }, // params - { 2, HRF_TEMP }, - { 1, HRF_TEMP }, - { 0, HRF_TEMP }, // RET_REG - { 22, HRF_LOCKED }, // statics - { 21, HRF_LOCKED }, - { 20, HRF_LOCKED }, - { 29, HRF_REG }, // other regs - { 28, HRF_REG }, - { 27, HRF_REG }, - { 26, HRF_REG }, - { 25, HRF_REG }, - { 24, HRF_REG }, - { 23, HRF_REG }, - { 22, HRF_REG }, + { 0, HRT_TEMP }, // RET_REG, params + { 1, HRT_TEMP }, + { 2, HRT_TEMP }, // params + { 3, HRT_TEMP }, + { 4, HRT_TEMP }, // temps + { 5, HRT_TEMP }, + { 6, HRT_TEMP }, + { 7, HRT_TEMP }, + { 8, HRT_TEMP }, + { 9, HRT_TEMP }, + { 10, HRT_TEMP }, + { 11, HRT_TEMP }, + { 12, HRT_TEMP }, + { 13, HRT_TEMP }, + { 14, HRT_TEMP }, + { 15, HRT_TEMP }, + { 16, HRT_TEMP }, + { 17, HRT_TEMP }, + { 20, HRT_STATIC }, // statics + { 21, HRT_STATIC }, 
+ { 22, HRT_STATIC }, + { 23, HRT_REG }, // other regs + { 24, HRT_REG }, + { 25, HRT_REG }, + { 26, HRT_REG }, + { 27, HRT_REG }, + { 28, HRT_REG }, + { 29, HRT_REG }, }; #elif defined(__mips__) @@ -521,13 +527,13 @@ static cache_reg_t cache_regs[] = { static guest_reg_t guest_regs[] = { // SHR_R0 .. SHR_SP - {GRF_STATIC, 20} , {GRF_STATIC, 21} , { 0 } , { 0 } , + {GRF_STATIC, 16} , {GRF_STATIC, 17} , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , { 0 } , // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, - { 0 } , { 0 } , { 0 } , {GRF_STATIC, 22} , + { 0 } , { 0 } , { 0 } , {GRF_STATIC, 18} , { 0 } , { 0 } , { 0 } , { 0 } , }; @@ -535,26 +541,26 @@ static guest_reg_t guest_regs[] = { // saved: r16-r23,r30, reserved: r0(zero), r26-r27(irq), r28(gp), r29(sp) // r1,r15,r24,r25 are used internally by the code emitter static cache_reg_t cache_regs[] = { - { 14, HRF_TEMP }, // temps - { 13, HRF_TEMP }, - { 12, HRF_TEMP }, - { 11, HRF_TEMP }, - { 10, HRF_TEMP }, - { 9, HRF_TEMP }, - { 8, HRF_TEMP }, - { 7, HRF_TEMP }, // params - { 6, HRF_TEMP }, - { 5, HRF_TEMP }, - { 4, HRF_TEMP }, - { 3, HRF_TEMP }, // RET_REG - { 2, HRF_TEMP }, - { 22, HRF_LOCKED }, // statics - { 21, HRF_LOCKED }, - { 20, HRF_LOCKED }, - { 19, HRF_REG }, // other regs - { 18, HRF_REG }, - { 17, HRF_REG }, - { 16, HRF_REG }, + { 2, HRT_TEMP }, // RET_REG (v0-v1) + { 3, HRT_TEMP }, + { 4, HRT_TEMP }, // params (a0-a3) + { 5, HRT_TEMP }, + { 6, HRT_TEMP }, + { 7, HRT_TEMP }, + { 8, HRT_TEMP }, // temps (t0-t6) + { 9, HRT_TEMP }, + { 10, HRT_TEMP }, + { 11, HRT_TEMP }, + { 12, HRT_TEMP }, + { 13, HRT_TEMP }, + { 14, HRT_TEMP }, + { 16, HRT_STATIC }, // statics (s0-s2) + { 17, HRT_STATIC }, + { 18, HRT_STATIC }, + { 19, HRT_REG }, // other regs (s3-s6) + { 20, HRT_REG }, + { 21, HRT_REG }, + { 22, HRT_REG }, }; #elif defined(__i386__) @@ -572,14 +578,16 @@ static guest_reg_t guest_regs[] = { { 0 } , { 0 } , { 0 } , { 0 } 
, }; -// ax, cx, dx are usually temporaries by convention +// MS/SystemV ABI: ebx,esi,edi,ebp are preserved, eax,ecx,edx are temporaries +// DRC uses REGPARM to pass upto 3 parameters in registers eax,ecx,edx. +// To avoid conflicts with param passing ebx must be declared temp here. static cache_reg_t cache_regs[] = { - { xBX, HRF_REG|HRF_TEMP }, // params - { xCX, HRF_REG|HRF_TEMP }, - { xDX, HRF_REG|HRF_TEMP }, - { xAX, HRF_REG|HRF_TEMP }, // return value - { xSI, HRF_LOCKED }, // statics - { xDI, HRF_LOCKED }, + { xAX, HRT_TEMP }, // RET_REG, param + { xDX, HRT_TEMP }, // params + { xCX, HRT_TEMP }, + { xBX, HRT_TEMP }, // temp + { xSI, HRT_STATIC }, // statics + { xDI, HRT_STATIC }, }; #elif defined(__x86_64__) @@ -602,20 +610,20 @@ static guest_reg_t guest_regs[] = { // rsi,rdi are preserved in M$ ABI, temporary in SystemV ABI // parameters in rcx,rdx,r8,r9, SystemV ABI additionally uses rsi,rdi static cache_reg_t cache_regs[] = { - { xR10,HRF_TEMP }, // temps - { xR11,HRF_TEMP }, - { xAX, HRF_TEMP }, // RET_REG - { xR8, HRF_TEMP }, // params - { xR9, HRF_TEMP }, - { xCX, HRF_TEMP }, - { xDX, HRF_TEMP }, - { xSI, HRF_REG|HRF_TEMP }, - { xDI, HRF_REG|HRF_TEMP }, - { xBX, HRF_LOCKED }, // statics - { xR12,HRF_LOCKED }, - { xR13,HRF_REG }, // other regs - { xR14,HRF_REG }, - { xR15,HRF_REG }, + { xAX, HRT_TEMP }, // RET_REG + { xDX, HRT_TEMP }, // params + { xCX, HRT_TEMP }, + { xDI, HRT_TEMP }, + { xSI, HRT_TEMP }, + { xR8, HRT_TEMP }, + { xR9, HRT_TEMP }, + { xR10,HRT_TEMP }, // temps + { xR11,HRT_TEMP }, + { xBX, HRT_STATIC }, // statics + { xR12,HRT_STATIC }, + { xR13,HRT_REG }, // other regs + { xR14,HRT_REG }, + { xR15,HRT_REG }, }; #else @@ -1333,8 +1341,8 @@ static void rcache_remap_vreg(int x); printf(" cache_regs:\n"); \ for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { \ cp = &cache_regs[i]; \ - if (cp->type != HR_FREE || cp->gregs || (cp->flags & ~(HRF_REG|HRF_TEMP))) \ - printf(" %d: hr=%d t=%d f=%x c=%d m=%x\n", i, cp->hreg, cp->type, cp->flags, 
cp->ref, cp->gregs); \ + if (cp->type != HR_FREE || cp->gregs || cp->locked || cp->flags) \ + printf(" %d: hr=%d t=%d f=%x c=%d m=%x\n", i, cp->hreg, cp->type, cp->flags, cp->locked, cp->gregs); \ } \ printf(" guest_regs:\n"); \ for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { \ @@ -1352,9 +1360,10 @@ static void rcache_remap_vreg(int x); #define RCACHE_CHECK(msg) { \ cache_reg_t *cp; \ guest_reg_t *gp; \ - int i, x, d = 0; \ + int i, x, m = 0, d = 0; \ for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { \ cp = &cache_regs[i]; \ + if (cp->flags & HRF_PINNED) m |= (1 << i); \ if (cp->type == HR_FREE || cp->type == HR_TEMP) continue; \ /* check connectivity greg->vreg */ \ FOR_ALL_BITS_SET_DO(cp->gregs, x, \ @@ -1366,12 +1375,17 @@ static void rcache_remap_vreg(int x); gp = &guest_regs[i]; \ if (gp->vreg != -1 && !(cache_regs[gp->vreg].gregs & (1 << i))) \ { d = 1; printf("cache check r=%d v=%d not connected?\n", i, gp->vreg); }\ - if (gp->vreg != -1 && cache_regs[gp->vreg].type != HR_STATIC && cache_regs[gp->vreg].type != HR_CACHED) \ + if (gp->vreg != -1 && cache_regs[gp->vreg].type != HR_CACHED) \ { d = 1; printf("cache check r=%d v=%d wrong type?\n", i, gp->vreg); }\ if ((gp->flags & GRF_CONST) && !(gconsts[gp->cnst].gregs & (1 << i))) \ { d = 1; printf("cache check r=%d c=%d not connected?\n", i, gp->cnst); }\ - if ((gp->flags & GRF_CDIRTY) && (gp->vreg != -1 || !(gp->flags & GRF_CONST)) )\ + if ((gp->flags & GRF_CDIRTY) && (gp->vreg != -1 || !(gp->flags & GRF_CONST)))\ { d = 1; printf("cache check r=%d CDIRTY?\n", i); } \ + if (gp->flags & GRF_PINNED) { \ + if (gp->sreg == -1 || !(cache_regs[gp->sreg].flags & HRF_PINNED))\ + { d = 1; printf("cache check r=%d v=%d not pinned?\n", i, gp->vreg); } \ + else m &= ~(1 << gp->sreg); \ + } \ } \ for (i = 0; i < ARRAY_SIZE(gconsts); i++) { \ FOR_ALL_BITS_SET_DO(gconsts[i].gregs, x, \ @@ -1379,13 +1393,15 @@ static void rcache_remap_vreg(int x); { d = 1; printf("cache check c=%d v=%d not connected?\n",i,x); } \ ) \ } \ + if (m) 
\ + { d = 1; printf("cache check m=%x pinning wrong?\n",m); } \ if (d) RCACHE_DUMP(msg) \ /* else { \ printf("locked regs %s:\n",msg); \ for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { \ cp = &cache_regs[i]; \ - if (cp->flags & HRF_LOCKED) \ - printf(" %d: hr=%d t=%d f=%x c=%d m=%x\n", i, cp->hreg, cp->type, cp->flags, cp->ref, cp->gregs); \ + if (cp->locked) \ + printf(" %d: hr=%d t=%d f=%x c=%d m=%x\n", i, cp->hreg, cp->type, cp->flags, cp->locked, cp->gregs); \ } \ } */ \ } @@ -1463,8 +1479,7 @@ static int gconst_try_read(int vreg, sh2_reg_e r) guest_regs[i].flags &= ~GRF_CDIRTY; guest_regs[i].flags |= GRF_DIRTY; }); - if (cache_regs[vreg].type != HR_STATIC) - cache_regs[vreg].type = HR_CACHED; + cache_regs[vreg].type = HR_CACHED; cache_regs[vreg].flags |= HRF_DIRTY; return 1; } @@ -1527,6 +1542,7 @@ static void gconst_invalidate(void) static u16 rcache_counter; // SH2 register usage bitmasks +static u32 rcache_hregs_reg; // regs of type HRT_REG (for pinning) static u32 rcache_regs_static; // statically allocated regs static u32 rcache_regs_now; // regs used in current insn static u32 rcache_regs_soon; // regs used in the next few insns @@ -1539,28 +1555,33 @@ static u32 rcache_regs_clean; // regs needing cleaning #define rcache_regs_nowsoon (rcache_regs_now|rcache_regs_soon) #define rcache_regs_soonclean (rcache_regs_soon|rcache_regs_clean) -static void rcache_ref_vreg(int x) +static void rcache_lock_vreg(int x) { if (x >= 0) { - cache_regs[x].ref ++; - cache_regs[x].flags |= HRF_LOCKED; + if (cache_regs[x].type == HR_FREE) { + printf("locking free vreg %x, aborting\n", x); + exit(1); + } + cache_regs[x].locked ++; } } -static void rcache_unref_vreg(int x) +static void rcache_unlock_vreg(int x) { - if (x >= 0 && -- cache_regs[x].ref == 0) { - cache_regs[x].flags &= ~HRF_LOCKED; + if (x >= 0) { + if (cache_regs[x].type == HR_FREE) { + printf("unlocking free vreg %x, aborting\n", x); + exit(1); + } + cache_regs[x].locked --; } } static void rcache_free_vreg(int x) 
{ - if (cache_regs[x].type != HR_STATIC) - cache_regs[x].type = HR_FREE; - cache_regs[x].flags &= (HRF_REG|HRF_TEMP); + cache_regs[x].type = cache_regs[x].locked ? HR_TEMP : HR_FREE; + cache_regs[x].flags &= HRF_PINNED; cache_regs[x].gregs = 0; - cache_regs[x].ref = 0; } static void rcache_unmap_vreg(int x) @@ -1582,12 +1603,11 @@ static void rcache_move_vreg(int d, int x) { int i; - if (cache_regs[d].type != HR_STATIC) - cache_regs[d].type = HR_CACHED; + cache_regs[d].type = HR_CACHED; cache_regs[d].gregs = cache_regs[x].gregs; - cache_regs[d].flags &= (HRF_TEMP|HRF_REG); - cache_regs[d].flags |= cache_regs[x].flags & ~(HRF_TEMP|HRF_REG); - cache_regs[d].ref = 0; + cache_regs[d].flags &= HRF_PINNED; + cache_regs[d].flags |= cache_regs[x].flags & ~HRF_PINNED; + cache_regs[d].locked = 0; cache_regs[d].stamp = cache_regs[x].stamp; emith_move_r_r(cache_regs[d].hreg, cache_regs[x].hreg); for (i = 0; i < ARRAY_SIZE(guest_regs); i++) @@ -1602,12 +1622,12 @@ static void rcache_clean_vreg(int x) if (cache_regs[x].flags & HRF_DIRTY) { // writeback cache_regs[x].flags &= ~HRF_DIRTY; - rcache_ref_vreg(x); + rcache_lock_vreg(x); FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, r, if (guest_regs[r].flags & GRF_DIRTY) { - if (guest_regs[r].flags & GRF_STATIC) { + if (guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) { if (guest_regs[r].vreg != guest_regs[r].sreg) { - if (!(cache_regs[guest_regs[r].sreg].flags & HRF_LOCKED)) { + if (!(cache_regs[guest_regs[r].sreg].locked)) { // statically mapped reg not in its sreg. 
move back to sreg rcache_evict_vreg(guest_regs[r].sreg); emith_move_r_r(cache_regs[guest_regs[r].sreg].hreg, @@ -1623,7 +1643,7 @@ static void rcache_clean_vreg(int x) rcache_remove_vreg_alias(x, r); } } else - cache_regs[x].flags |= HRF_DIRTY; + cache_regs[x].flags |= HRF_DIRTY; } else { if (~rcache_regs_discard & (1 << r)) emith_ctx_write(cache_regs[x].hreg, r * 4); @@ -1631,8 +1651,9 @@ static void rcache_clean_vreg(int x) } rcache_regs_clean &= ~(1 << r); }) - rcache_unref_vreg(x); + rcache_unlock_vreg(x); } + #if DRC_DEBUG & 64 RCACHE_CHECK("after clean"); #endif @@ -1642,16 +1663,19 @@ static void rcache_add_vreg_alias(int x, sh2_reg_e r) { cache_regs[x].gregs |= (1 << r); guest_regs[r].vreg = x; - if (cache_regs[x].type != HR_STATIC) - cache_regs[x].type = HR_CACHED; + cache_regs[x].type = HR_CACHED; } static void rcache_remove_vreg_alias(int x, sh2_reg_e r) { cache_regs[x].gregs &= ~(1 << r); - if (!cache_regs[x].gregs) + if (!cache_regs[x].gregs) { // no reg mapped -> free vreg - rcache_free_vreg(x); + if (cache_regs[x].locked) + cache_regs[x].type = HR_TEMP; + else + rcache_free_vreg(x); + } guest_regs[r].vreg = -1; } @@ -1674,17 +1698,17 @@ static void rcache_evict_vreg_aliases(int x, sh2_reg_e r) static int rcache_allocate(int what, int minprio) { - // evict reg with oldest stamp (only for HRF_REG, no temps) + // evict reg with oldest stamp (only for HRT_REG, no temps) int i, i_prio, oldest = -1, prio = 0; u16 min_stamp = (u16)-1; - for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { - // consider only unlocked REG or non-TEMP - if (cache_regs[i].flags == 0 || (cache_regs[i].flags & HRF_LOCKED)) + for (i = ARRAY_SIZE(cache_regs)-1; i >= 0; i--) { + // consider only non-static, unpinned, unlocked REG or TEMP + if ((cache_regs[i].flags & HRF_PINNED) || cache_regs[i].locked) continue; - if ((what > 0 && !(cache_regs[i].flags & HRF_REG)) || - (what == 0 && (cache_regs[i].flags & HRF_TEMP)) || - (what < 0 && !(cache_regs[i].flags & HRF_TEMP))) + if ((what > 0 && 
!(cache_regs[i].htype & HRT_REG)) || // get a REG + (what == 0 && (cache_regs[i].htype & HRT_TEMP)) || // get a non-TEMP + (what < 0 && !(cache_regs[i].htype & HRT_TEMP))) // get a TEMP continue; if (cache_regs[i].type == HR_FREE || cache_regs[i].type == HR_TEMP) { // REG is free @@ -1731,17 +1755,18 @@ static int rcache_allocate(int what, int minprio) static int rcache_allocate_vreg(int needed) { int x; - - // get a free reg, but use temps only if r is not needed soon - for (x = ARRAY_SIZE(cache_regs) - 1; x >= 0; x--) { - if (cache_regs[x].flags && (cache_regs[x].type == HR_FREE || - (cache_regs[x].type == HR_TEMP && !(cache_regs[x].flags & HRF_LOCKED))) && - (!needed || (cache_regs[x].flags & HRF_REG))) - break; - } - - if (x < 0) + + if (needed) { + // needed soon, try getting a REG 1st, use a TEMP only if none is available x = rcache_allocate(1, 0); + if (x < 0) + x = rcache_allocate(-1, 1); + } else { + // not needed, try getting a TEMP 1st, use a REG only if none is available + x = rcache_allocate(-1, 1); + if (x < 0) + x = rcache_allocate(1, 0); + } return x; } @@ -1753,17 +1778,7 @@ static int rcache_allocate_nontemp(void) static int rcache_allocate_temp(void) { - int x; - - // use any free reg, but prefer TEMP regs - for (x = 0; x < ARRAY_SIZE(cache_regs); x++) { - if (cache_regs[x].flags && (cache_regs[x].type == HR_FREE || - (cache_regs[x].type == HR_TEMP && !(cache_regs[x].flags & HRF_LOCKED)))) - break; - } - - if (x >= ARRAY_SIZE(cache_regs)) - x = rcache_allocate(-1, 1); + int x = rcache_allocate(-1, 1); if (x < 0) { printf("no temp register available, aborting\n"); exit(1); @@ -1788,14 +1803,14 @@ static int rcache_map_reg(sh2_reg_e r, int hr, int mode) } // deal with statically mapped regs - if (mode == RC_GR_RMW && (guest_regs[r].flags & GRF_STATIC)) { + if (mode == RC_GR_RMW && (guest_regs[r].flags & (GRF_STATIC|GRF_PINNED))) { x = guest_regs[r].sreg; if (guest_regs[r].vreg == x) { // STATIC in its sreg with no aliases, and some processing 
pending if (cache_regs[x].gregs == 1 << r) return cache_regs[x].hreg; } else if (cache_regs[x].type == HR_FREE || - (cache_regs[x].type == HR_TEMP && !(cache_regs[x].flags & HRF_LOCKED))) + (cache_regs[x].type == HR_TEMP && !cache_regs[x].locked)) // STATIC not in its sreg, with sreg available -> move it i = guest_regs[r].sreg; } @@ -1806,14 +1821,13 @@ static int rcache_map_reg(sh2_reg_e r, int hr, int mode) if (cache_regs[i].type == HR_CACHED) rcache_evict_vreg(i); // set new mappping - if (cache_regs[i].type != HR_STATIC) - cache_regs[i].type = HR_CACHED; + cache_regs[i].type = HR_CACHED; cache_regs[i].gregs = 1 << r; - cache_regs[i].flags &= (HRF_TEMP|HRF_REG); - cache_regs[i].ref = 0; + cache_regs[i].flags &= HRF_PINNED; + cache_regs[i].locked = 0; cache_regs[i].stamp = ++rcache_counter; cache_regs[i].flags |= HRF_DIRTY; - rcache_ref_vreg(i); + rcache_lock_vreg(i); guest_regs[r].flags |= GRF_DIRTY; guest_regs[r].vreg = i; #if DRC_DEBUG & 64 @@ -1828,25 +1842,25 @@ static void rcache_remap_vreg(int x) int d; // x must be a cached vreg - if (cache_regs[x].type != HR_CACHED && cache_regs[x].type != HR_STATIC) + if (cache_regs[x].type != HR_CACHED) return; // don't do it if x is already a REG or isn't used or to be cleaned anyway - if ((cache_regs[x].flags & HRF_REG) || + if ((cache_regs[x].htype & HRT_REG) || !(rcache_regs_used & ~rcache_regs_clean & cache_regs[x].gregs)) { // clean here to avoid data loss on invalidation rcache_clean_vreg(x); return; } - if (cache_regs[x].flags & HRF_LOCKED) { + if (cache_regs[x].locked) { printf("remap vreg %d is locked\n", x); exit(1); } // allocate a non-TEMP vreg - rcache_ref_vreg(x); // lock to avoid evicting x + rcache_lock_vreg(x); // lock to avoid evicting x d = rcache_allocate_nontemp(); - rcache_unref_vreg(x); + rcache_unlock_vreg(x); if (d < 0) { rcache_clean_vreg(x); return; @@ -1901,10 +1915,10 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr dst = src = guest_regs[r].vreg; - 
rcache_ref_vreg(src); // lock to avoid evicting src + rcache_lock_vreg(src); // lock to avoid evicting src // good opportunity to relocate a remapped STATIC? - if ((guest_regs[r].flags & GRF_STATIC) && src != guest_regs[r].sreg && - !(cache_regs[guest_regs[r].sreg].flags & HRF_LOCKED) && + if ((guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) && src != guest_regs[r].sreg && + !cache_regs[guest_regs[r].sreg].locked && (src < 0 || mode != RC_GR_READ) && !(rcache_regs_nowsoon & cache_regs[guest_regs[r].sreg].gregs)) { dst = guest_regs[r].sreg; @@ -1918,10 +1932,10 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr } tr = &cache_regs[dst]; tr->stamp = rcache_counter; - rcache_unref_vreg(src); // remove r from src if (src >= 0 && src != dst) rcache_remove_vreg_alias(src, r); + rcache_unlock_vreg(src); // if r has a constant it may have aliases if (mode != RC_GR_WRITE && gconst_try_read(dst, r)) @@ -1932,24 +1946,26 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr if (mode != RC_GR_READ && src == dst && ali) { int x = -1; if (rcache_regs_nowsoon & ali) { - if (tr->type == HR_STATIC && guest_regs[r].sreg == dst && - !(tr->flags & HRF_LOCKED)) { + if ((guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) && + guest_regs[r].sreg == dst && !tr->locked) { // split aliases if r is STATIC in sreg and dst isn't already locked - rcache_ref_vreg(dst); // lock to avoid evicting dst - if ((x = rcache_allocate_vreg(rcache_regs_nowsoon & ali)) >= 0) { + rcache_lock_vreg(dst); // lock to avoid evicting dst + x = rcache_allocate_vreg(rcache_regs_nowsoon & ali); + rcache_unlock_vreg(dst); + if (x >= 0) { src = x; rcache_move_vreg(src, dst); } - rcache_unref_vreg(dst); } else { // split r - rcache_ref_vreg(src); // lock to avoid evicting src - if ((x = rcache_allocate_vreg(rcache_regs_nowsoon & (1 << r))) >= 0) { + rcache_lock_vreg(src); // lock to avoid evicting src + x = rcache_allocate_vreg(rcache_regs_nowsoon & (1 << r)); + 
rcache_unlock_vreg(src); + if (x >= 0) { dst = x; tr = &cache_regs[dst]; tr->stamp = rcache_counter; } - rcache_unref_vreg(src); } } if (x < 0) @@ -1967,13 +1983,13 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr emith_ctx_read(tr->hreg, r * 4); if (hr) { *hr = (src >= 0 ? cache_regs[src].hreg : tr->hreg); - rcache_ref_vreg(reg_map_host[*hr]); - } else if (src >= 0 && cache_regs[src].hreg != tr->hreg) + rcache_lock_vreg(src >= 0 ? src : dst); + } else if (src >= 0 && mode != RC_GR_WRITE && cache_regs[src].hreg != tr->hreg) emith_move_r_r(tr->hreg, cache_regs[src].hreg); // housekeeping if (do_locking) - rcache_ref_vreg(dst); + rcache_lock_vreg(dst); if (mode != RC_GR_READ) { tr->flags |= HRF_DIRTY; guest_regs[r].flags |= GRF_DIRTY; @@ -1990,14 +2006,42 @@ static int rcache_get_reg(sh2_reg_e r, rc_gr_mode mode, int *hr) return rcache_get_reg_(r, mode, 1, hr); } +static void rcache_pin_reg(sh2_reg_e r) +{ + int hr, x; + + // don't pin if static or already pinned + if (guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) + return; + + rcache_regs_soon |= (1 << r); // kludge to prevent allocation of a temp + hr = rcache_get_reg_(r, RC_GR_RMW, 0, NULL); + x = reg_map_host[hr]; + + // can only pin non-TEMPs + if (!(cache_regs[x].htype & HRT_TEMP)) { + guest_regs[r].flags |= GRF_PINNED; + cache_regs[x].flags |= HRF_PINNED; + guest_regs[r].sreg = x; + } +#if DRC_DEBUG & 64 + RCACHE_CHECK("after pin"); +#endif +} + static int rcache_get_tmp(void) { int i; i = rcache_allocate_temp(); - rcache_ref_vreg(i); + if (i < 0) { + printf("cannot allocate temp\n"); + exit(1); + } cache_regs[i].type = HR_TEMP; + rcache_lock_vreg(i); + return cache_regs[i].hreg; } @@ -2006,14 +2050,14 @@ static int rcache_get_vreg_hr(int hr) int i; i = reg_map_host[hr]; - if (i < 0 || (cache_regs[i].flags & HRF_LOCKED)) { + if (i < 0 || cache_regs[i].locked) { printf("host register %d is locked\n", hr); exit(1); } if (cache_regs[i].type == HR_CACHED) rcache_evict_vreg(i); - 
else if (cache_regs[i].type == HR_TEMP && (cache_regs[i].flags & HRF_LOCKED)) { + else if (cache_regs[i].type == HR_TEMP && cache_regs[i].locked) { printf("host reg %d already used, aborting\n", hr); exit(1); } @@ -2034,7 +2078,7 @@ static int rcache_get_tmp_arg(int arg) { int x = rcache_get_vreg_arg(arg); cache_regs[x].type = HR_TEMP; - rcache_ref_vreg(x); + rcache_lock_vreg(x); return cache_regs[x].hreg; } @@ -2044,7 +2088,7 @@ static int rcache_get_tmp_ret(void) { int x = rcache_get_vreg_hr(RET_REG); cache_regs[x].type = HR_TEMP; - rcache_ref_vreg(x); + rcache_lock_vreg(x); return cache_regs[x].hreg; } @@ -2094,11 +2138,11 @@ static int rcache_get_reg_arg(int arg, sh2_reg_e r, int *hr) } else { *hr = srcr; if (dstr != srcr) // must lock srcr if not copied here - rcache_ref_vreg(reg_map_host[srcr]); + rcache_lock_vreg(reg_map_host[srcr]); } cache_regs[dstid].stamp = ++rcache_counter; - rcache_ref_vreg(dstid); + rcache_lock_vreg(dstid); #if DRC_DEBUG & 64 RCACHE_CHECK("after getarg"); #endif @@ -2114,7 +2158,7 @@ static void rcache_free_tmp(int hr) exit(1); } - rcache_free_vreg(i); + rcache_unlock_vreg(i); } // saves temporary result either in REG or in drctmp @@ -2133,10 +2177,10 @@ static int rcache_save_tmp(int hr) cache_regs[i].type = HR_CACHED; cache_regs[i].gregs = 0; // not storing any guest register - cache_regs[i].flags &= (HRF_TEMP|HRF_REG); - cache_regs[i].ref = 0; + cache_regs[i].flags &= HRF_PINNED; + cache_regs[i].locked = 0; cache_regs[i].stamp = ++rcache_counter; - rcache_ref_vreg(i); + rcache_lock_vreg(i); emith_move_r_r(cache_regs[i].hreg, hr); rcache_free_tmp(hr); return i; @@ -2167,17 +2211,13 @@ static int rcache_restore_tmp(int x) static void rcache_free(int hr) { int x = reg_map_host[hr]; - if (cache_regs[x].type == HR_TEMP) - rcache_free_tmp(hr); - else - rcache_unref_vreg(x); + rcache_unlock_vreg(x); } static void rcache_unlock(int x) { if (x >= 0) { - cache_regs[x].flags &= ~HRF_LOCKED; - cache_regs[x].ref = 0; + cache_regs[x].locked = 0; 
// rcache_regs_now &= ~cache_regs[x].gregs; } } @@ -2185,10 +2225,34 @@ static void rcache_unlock(int x) static void rcache_unlock_all(void) { int i; - for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { - cache_regs[i].flags &= ~HRF_LOCKED; - cache_regs[i].ref = 0; + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) + cache_regs[i].locked = 0; +} + +static void rcache_unpin_all(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { + if (guest_regs[i].flags & GRF_PINNED) { + guest_regs[i].flags &= ~GRF_PINNED; + cache_regs[guest_regs[i].sreg].flags &= ~HRF_PINNED; + guest_regs[i].sreg = -1; + } } +#if DRC_DEBUG & 64 + RCACHE_CHECK("after unpin"); +#endif +} + +static void rcache_save_pinned(void) +{ + int i; + + // save pinned regs to context + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) + if ((guest_regs[i].flags & GRF_PINNED) && guest_regs[i].vreg >= 0) + emith_ctx_write(cache_regs[guest_regs[i].vreg].hreg, i * 4); } static inline void rcache_set_usage_now(u32 mask) @@ -2222,7 +2286,7 @@ static inline int rcache_is_hreg_used(int hr) int x = reg_map_host[hr]; // is hr in use? 
return cache_regs[x].type != HR_FREE && - (cache_regs[x].type != HR_TEMP || (cache_regs[x].flags & HRF_LOCKED)); + (cache_regs[x].type != HR_TEMP || cache_regs[x].locked); } static inline u32 rcache_used_hregs_mask(void) @@ -2231,8 +2295,8 @@ static inline u32 rcache_used_hregs_mask(void) int i; for (i = 0; i < ARRAY_SIZE(cache_regs); i++) - if ((cache_regs[i].flags & HRF_TEMP) && cache_regs[i].type != HR_FREE && - (cache_regs[i].type != HR_TEMP || (cache_regs[i].flags & HRF_LOCKED))) + if ((cache_regs[i].htype & HRT_TEMP) && cache_regs[i].type != HR_FREE && + (cache_regs[i].type != HR_TEMP || cache_regs[i].locked)) mask |= 1 << cache_regs[i].hreg; return mask; @@ -2257,7 +2321,7 @@ static inline u32 rcache_cached_mask(void) int i; for (i = 0; i < ARRAY_SIZE(cache_regs); i++) - if (cache_regs[i].type == HR_CACHED || cache_regs[i].type == HR_STATIC) + if (cache_regs[i].type == HR_CACHED) mask |= cache_regs[i].gregs; return mask; @@ -2269,7 +2333,7 @@ static void rcache_clean_tmp(void) rcache_regs_clean = (1 << ARRAY_SIZE(guest_regs)) - 1; for (i = 0; i < ARRAY_SIZE(cache_regs); i++) - if (cache_regs[i].type == HR_CACHED && (cache_regs[i].flags & HRF_TEMP)) { + if (cache_regs[i].type == HR_CACHED && (cache_regs[i].htype & HRT_TEMP)) { rcache_unlock(i); #if REMAP_REGISTER rcache_remap_vreg(i); @@ -2300,7 +2364,7 @@ static void rcache_clean_masked(u32 mask) } // clean vregs where all aliases are covered by the mask for (i = 0; i < ARRAY_SIZE(cache_regs); i++) - if ((cache_regs[i].type == HR_CACHED || cache_regs[i].type == HR_STATIC) && + if (cache_regs[i].type == HR_CACHED && (cache_regs[i].gregs & mask) && !(cache_regs[i].gregs & ~mask)) rcache_clean_vreg(i); } @@ -2312,23 +2376,24 @@ static void rcache_clean(void) rcache_regs_clean = (1 << ARRAY_SIZE(guest_regs)) - 1; for (i = ARRAY_SIZE(cache_regs)-1; i >= 0; i--) - if (cache_regs[i].type == HR_CACHED || cache_regs[i].type == HR_STATIC) + if (cache_regs[i].type == HR_CACHED) rcache_clean_vreg(i); // relocate statics 
to their sregs (necessary before conditional jumps) for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { - if ((guest_regs[i].flags & GRF_STATIC) && + if ((guest_regs[i].flags & (GRF_STATIC|GRF_PINNED)) && guest_regs[i].vreg != guest_regs[i].sreg) { - rcache_ref_vreg(guest_regs[i].vreg); + rcache_lock_vreg(guest_regs[i].vreg); rcache_evict_vreg(guest_regs[i].sreg); - rcache_unref_vreg(guest_regs[i].vreg); + rcache_unlock_vreg(guest_regs[i].vreg); if (guest_regs[i].vreg < 0) emith_ctx_read(cache_regs[guest_regs[i].sreg].hreg, i*4); else emith_move_r_r(cache_regs[guest_regs[i].sreg].hreg, cache_regs[guest_regs[i].vreg].hreg); cache_regs[guest_regs[i].sreg].gregs = 1 << i; - cache_regs[guest_regs[i].sreg].flags |= HRF_DIRTY; + cache_regs[guest_regs[i].sreg].type = HR_CACHED; + cache_regs[guest_regs[i].sreg].flags |= HRF_DIRTY|HRF_PINNED; guest_regs[i].flags |= GRF_DIRTY; guest_regs[i].vreg = guest_regs[i].sreg; } @@ -2341,7 +2406,7 @@ static void rcache_invalidate_tmp(void) int i; for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { - if (cache_regs[i].flags & HRF_TEMP) { + if (cache_regs[i].htype & HRT_TEMP) { rcache_unlock(i); if (cache_regs[i].type == HR_CACHED) rcache_evict_vreg(i); @@ -2365,7 +2430,8 @@ static void rcache_invalidate(void) guest_regs[i].vreg = -1; else { cache_regs[guest_regs[i].sreg].gregs = 1 << i; - cache_regs[guest_regs[i].sreg].flags |= HRF_DIRTY; + cache_regs[guest_regs[i].sreg].type = HR_CACHED; + cache_regs[guest_regs[i].sreg].flags |= HRF_DIRTY|HRF_PINNED; guest_regs[i].flags |= GRF_DIRTY; guest_regs[i].vreg = guest_regs[i].sreg; } @@ -2391,26 +2457,26 @@ static void rcache_init(void) // init is executed on every rom load, but this must only be executed once... 
if (once) { memset(reg_map_host, -1, sizeof(reg_map_host)); - for (i = 0; i < ARRAY_SIZE(cache_regs); i++) + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { reg_map_host[cache_regs[i].hreg] = i; + if (cache_regs[i].htype == HRT_REG) + rcache_hregs_reg |= (1 << i); + } for (i = 0; i < ARRAY_SIZE(guest_regs); i++) if (guest_regs[i].flags & GRF_STATIC) { rcache_regs_static |= (1 << i); guest_regs[i].sreg = reg_map_host[guest_regs[i].sreg]; - cache_regs[guest_regs[i].sreg].type = HR_STATIC; + rcache_hregs_reg &= ~(1 << guest_regs[i].sreg); } else guest_regs[i].sreg = -1; once = 0; } - for (i = 0; i < ARRAY_SIZE(guest_regs); i++) - if (guest_regs[i].flags & GRF_STATIC) { - guest_regs[i].vreg = guest_regs[i].sreg; - cache_regs[guest_regs[i].sreg].gregs = (1 << i); - } - rcache_invalidate(); +#if DRC_DEBUG & 64 + RCACHE_CHECK("after init"); +#endif } // --------------------------------------------------------------- @@ -2802,13 +2868,13 @@ static void emit_do_static_regs(int is_write, int tmpr) int i, r, count; for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { - if (guest_regs[i].flags & GRF_STATIC) + if (guest_regs[i].flags & (GRF_STATIC|GRF_PINNED)) r = cache_regs[guest_regs[i].vreg].hreg; else continue; for (count = 1; i < ARRAY_SIZE(guest_regs) - 1; i++, r++) { - if ((guest_regs[i + 1].flags & GRF_STATIC) && + if ((guest_regs[i + 1].flags & (GRF_STATIC|GRF_PINNED)) && cache_regs[guest_regs[i + 1].vreg].hreg == r + 1) count++; else @@ -2863,6 +2929,12 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) u32 pending_branch_direct:1; u32 pending_branch_indirect:1; } drcf = { 0, }; +#if LOOP_OPTIMIZER + void *pinned_loop_ptr[MAX_LOCAL_BRANCHES/16]; + u32 pinned_loop_pc[MAX_LOCAL_BRANCHES/16]; + u32 pinned_loop_mask[MAX_LOCAL_BRANCHES/16]; + int pinned_loop_count = 0; +#endif // PC of current, first, last SH2 insn u32 pc, base_pc, end_pc; @@ -2877,7 +2949,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) int tmp, tmp2; int cycles; int i, v; - u32 
u, m1, m2; + u32 u, m1, m2, m3, m4; int op; u16 crc; @@ -2925,7 +2997,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } // collect branch_targets that don't land on delay slots - m1 = m2 = v = op = 0; + m1 = m2 = m3 = m4 = v = op = 0; for (pc = base_pc, i = 0; pc < end_pc; i++, pc += 2) { if (op_flags[i] & OF_DELAY_OP) op_flags[i] &= ~OF_BTARGET; @@ -2955,9 +3027,14 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) drcf.pending_branch_direct = drcf.pending_branch_indirect = 0; op = OF_IDLE_LOOP; // loop type v = i; - m1 = m2 = 0; + m1 = m2 = m3 = m4 = 0; + if (!drcf.loop_type) // reset basic loop it it isn't recognized as loop + op_flags[i] &= ~OF_BASIC_LOOP; } if (drcf.loop_type) { + // calculate reg masks for loop pinning + m4 |= ops[i].source & ~m3; + m3 |= ops[i].dest; // detect loop type, and store poll/delay register if (op_flags[i] & OF_POLL_INSN) { op = OF_POLL_LOOP; @@ -2971,8 +3048,12 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) m2 |= ops[i].dest; // regs modified by other insns } // branch detector - if (OP_ISBRAIMM(ops[i].op) && ops[i].imm == base_pc + 2*v) - drcf.pending_branch_direct = 1; // backward branch detected + if (OP_ISBRAIMM(ops[i].op)) { + if (ops[i].imm == base_pc + 2*v) + drcf.pending_branch_direct = 1; // backward branch detected + else + op_flags[v] &= ~OF_BASIC_LOOP; // no basic loop + } if (OP_ISBRACND(ops[i].op)) drcf.pending_branch_indirect = 1; // conditions g,h - cond.branch // poll/idle loops terminate with their backwards branch to the loop start @@ -2982,6 +3063,17 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) op = 0; // conditions not met op_flags[v] = (op_flags[v] & ~OF_LOOP) | op; // set loop type drcf.loop_type = 0; +#if LOOP_OPTIMIZER + if (op_flags[v] & OF_BASIC_LOOP) { + m3 &= ~rcache_regs_static & ~BITMASK4(SHR_PC, SHR_PR, SHR_SR, SHR_MEM); + if (m3 && count_bits(m3) < count_bits(rcache_hregs_reg) && + pinned_loop_count < 
ARRAY_SIZE(pinned_loop_pc)) { + pinned_loop_mask[pinned_loop_count] = m3; + pinned_loop_pc[pinned_loop_count++] = base_pc + 2*v; + } else + op_flags[v] &= ~OF_BASIC_LOOP; + } +#endif } } #endif @@ -3007,9 +3099,13 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // clear stale state after compile errors + rcache_unlock_all(); rcache_invalidate(); emith_invalidate_t(); drcf = (struct drcf) { 0 }; +#if LOOP_OPTIMIZER + pinned_loop_count = 0; +#endif // ------------------------------------------------- // 3rd pass: actual compilation @@ -3110,10 +3206,31 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_free_tmp(tmp3); #endif +#if LOOP_OPTIMIZER + if (op_flags[i] & OF_BASIC_LOOP) { + if (pinned_loop_pc[pinned_loop_count] == pc) { + // pin needed regs on loop entry + FOR_ALL_BITS_SET_DO(pinned_loop_mask[pinned_loop_count], v, rcache_pin_reg(v)); + pinned_loop_ptr[pinned_loop_count] = tcache_ptr; + } else + op_flags[i] &= ~OF_BASIC_LOOP; + } +#endif + // check cycles tmp = rcache_get_tmp_arg(0); sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); emith_cmp_r_imm(sr, 0); +#if LOOP_OPTIMIZER + // on drc exit pinned registers must be saved + if (op_flags[i] & OF_BASIC_LOOP) { + EMITH_JMP_START(DCOND_GT); + rcache_save_pinned(); + emith_move_r_imm(tmp, pc); + emith_jump(sh2_drc_exit); + EMITH_JMP_END(DCOND_GT); + } else +#endif if (emith_jump_cond_inrange(sh2_drc_exit)) { emith_move_r_imm_c(DCOND_LE, tmp, pc); emith_jump_cond(DCOND_LE, sh2_drc_exit); @@ -4237,14 +4354,13 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (OP_ISBRACND(opd_b->op)) ctaken = (op_flags[i] & OF_DELAY_OP) ? 
1 : 2; cycles += ctaken; // assume branch taken -#if LOOP_DETECTION +#if LOOP_OPTIMIZER if ((drcf.loop_type == OF_IDLE_LOOP || (drcf.loop_type == OF_DELAY_LOOP && drcf.delay_reg >= 0))) { // idle or delay loop emit_sync_t_to_sr(); emith_sh2_delay_loop(cycles, drcf.delay_reg); - rcache_unlock_all(); // may lock delay_reg drcf.polling = drcf.loop_type = 0; } #endif @@ -4291,6 +4407,15 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } #endif + rcache_unlock_all(); // may lock delay_reg +#if LOOP_OPTIMIZER + if (target && pinned_loop_pc[pinned_loop_count] == target_pc) { + rcache_unpin_all(); + target = pinned_loop_ptr[pinned_loop_count]; + pinned_loop_count ++; + } +#endif + if (target == NULL) { // can't resolve branch locally, make a block exit @@ -4372,6 +4497,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) drcf.pending_branch_indirect = 0; drcf.polling = drcf.loop_type = 0; } + rcache_unlock_all(); do_host_disasm(tcache_id); } @@ -6198,6 +6324,8 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, if (OP_ISBRAIMM(opd->op)) { // BSR, BRA, BT, BF with immediate target int i_tmp = (opd->imm - base_pc) / 2; // branch target, index in ops + if (i_tmp == last_btarget) // candidate for basic loop optimizer + op_flags[i_tmp] |= OF_BASIC_LOOP; if (i_tmp == last_btarget && op <= 1) { op_flags[i_tmp] |= OF_LOOP; // conditions met -> mark loop last_btarget = i+1; // condition 4 diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 187ad716f..3565940da 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -22,6 +22,7 @@ void sh2_drc_frame(void); #define OF_B_IN_DS (1 << 4) #define OF_DELAY_INSN (1 << 5) // DT, (TODO ADD+CMP?) #define OF_POLL_INSN (1 << 6) // MOV @(...),Rn (no post increment), TST @(...) 
+#define OF_BASIC_LOOP (1 << 7) // pinnable loop without any branches in it #define OF_IDLE_LOOP (1 << 2) #define OF_DELAY_LOOP (2 << 2) @@ -39,7 +40,7 @@ unsigned short scan_block(unsigned int base_pc, int is_slave, #elif defined(__aarch64__) #define DRC_SR_REG "r22" #elif defined(__mips__) -#define DRC_SR_REG "s6" +#define DRC_SR_REG "s2" #elif defined(__i386__) #define DRC_SR_REG "edi" #elif defined(__x86_64__) From 072737b2fed4129ec655cf9dda28d0e5a0b5c139 Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 19 Sep 2019 22:14:28 +0200 Subject: [PATCH 061/174] sh2 drc: improved RTS call stack cache --- cpu/drc/emit_arm.c | 26 +++------- cpu/drc/emit_arm64.c | 37 ++++---------- cpu/drc/emit_mips.c | 36 ++++---------- cpu/drc/emit_x86.c | 116 +++++++++++++++++++------------------------ cpu/sh2/compiler.c | 102 +++++++++++++++++++------------------ pico/32x/pwm.c | 2 +- 6 files changed, 130 insertions(+), 189 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 66a5b065b..71a109222 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -1000,10 +1000,6 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) emith_read_r_r_offs_c(A_COND_AL, r, rs, offs) #define emith_read_r_r_r(r, rs, rm) \ EOP_LDR_REG_LSL(A_COND_AL, r, rs, rm, 0) -#define emith_read_r_r_r_wb(r, rs, rm) \ - EOP_LDR_REG_LSL_WB(A_COND_AL, r, rs, rm, 0) -#define emith_read_r_r_r_ptr_wb(r, rs, rm) \ - emith_read_r_r_r_wb(r, rs, rm) #define emith_read8_r_r_offs_c(cond, r, rs, offs) \ EOP_LDRB_IMM2(cond, r, rs, offs) @@ -1049,10 +1045,6 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) emith_write_r_r_offs_c(A_COND_AL, r, rs, offs) #define emith_write_r_r_offs_ptr(r, rs, offs) \ emith_write_r_r_offs_c(A_COND_AL, r, rs, offs) -#define emith_write_r_r_r_wb(r, rs, rm) \ - EOP_STR_REG_LSL_WB(A_COND_AL, r, rs, rm, 0) -#define emith_write_r_r_r_ptr_wb(r, rs, rm) \ - emith_write_r_r_r_wb(r, rs, rm) #define emith_ctx_read_c(cond, r, offs) \ 
emith_read_r_r_offs_c(cond, r, CONTEXT_REG, offs) @@ -1133,21 +1125,21 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_jump_patchable(target) \ emith_jump(target) +#define emith_jump_patchable_size() 4 #define emith_jump_cond(cond, target) \ emith_xbranch(cond, target, 0) +#define emith_jump_cond_inrange(target) !0 #define emith_jump_cond_patchable(cond, target) \ emith_jump_cond(cond, target) -#define emith_jump_patch(ptr, target) ({ \ +#define emith_jump_patch(ptr, target, pos) do { \ u32 *ptr_ = ptr; \ u32 val_ = (u32 *)(target) - ptr_ - 2; \ *ptr_ = (*ptr_ & 0xff000000) | (val_ & 0x00ffffff); \ - (u8 *)ptr; \ -}) - -#define emith_jump_cond_inrange(target) !0 + if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)ptr; \ +} while (0) #define emith_jump_patch_size() 4 #define emith_jump_at(ptr, target) do { \ @@ -1184,11 +1176,6 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) emith_jump_ctx(offs); \ } while (0) -#define emith_call_link(r, target) do { \ - emith_move_r_r(r, PC); \ - emith_jump(target); \ -} while (0) - #define emith_call_cleanup() /**/ #define emith_ret_c(cond) \ @@ -1200,6 +1187,9 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_ret_to_ctx(offs) \ emith_ctx_write(LR, offs) +#define emith_add_r_ret_imm(r, imm) \ + emith_add_r_r_ptr_imm(r, LR, imm) + /* pushes r12 for eabi alignment */ #define emith_push_ret(r) do { \ int r_ = (r >= 0 ? 
r : 12); \ diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 8ce2ef382..72f53dd56 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -865,15 +865,6 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #define emith_read_r_r_r_c(cond, r, rs, rm) \ emith_read_r_r_r(r, rs, rm) -#define emith_read_r_r_r_ptr_wb(r, rs, rm) do { \ - emith_read_r_r_r_ptr(r, rs, rm); \ - emith_add_r_r_ptr(rs, rm); \ -} while (0) -#define emith_read_r_r_r_wb(r, rs, rm) do { \ - emith_read_r_r_r(r, rs, rm); \ - emith_add_r_r_ptr(rs, rm); \ -} while (0) - #define emith_read8_r_r_offs(r, rs, offs) \ emith_ldst_offs(AM_B, r, rs, offs, LT_LD, AM_IDX) #define emith_read8_r_r_offs_c(cond, r, rs, offs) \ @@ -935,15 +926,6 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #define emith_write_r_r_r_c(cond, r, rs, rm) \ emith_write_r_r_r(r, rs, rm) -#define emith_write_r_r_r_ptr_wb(r, rs, rm) do { \ - emith_write_r_r_r_ptr(r, rs, rm); \ - emith_add_r_r_ptr(rs, rm); \ -} while (0) -#define emith_write_r_r_r_wb(r, rs, rm) do { \ - emith_write_r_r_r(r, rs, rm); \ - emith_add_r_r_ptr(rs, rm); \ -} while (0) - #define emith_ctx_read_ptr(r, offs) \ emith_read_r_r_offs_ptr(r, CONTEXT_REG, offs) @@ -1031,6 +1013,7 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #define emith_jump_patchable(target) \ emith_jump(target) +#define emith_jump_patchable_size() 4 #define emith_jump_cond(cond, target) \ emith_bcond(tcache_ptr, 0, cond, target) @@ -1039,9 +1022,9 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) emith_bcond(tcache_ptr, 1, cond, target) #define emith_jump_cond_inrange(target) \ - !(((u8 *)target - (u8 *)tcache_ptr + 0x100000) >> 22) + !(((u8 *)target - (u8 *)tcache_ptr + 0x100000) >> 21) -#define emith_jump_patch(ptr, target) ({ \ +#define emith_jump_patch(ptr, target, pos) do { \ u32 *ptr_ = (u32 *)ptr; \ u32 disp_ = (u8 *)(target) - (u8 *)(ptr_); \ int 
cond_ = ptr_[0] & 0xf; \ @@ -1051,8 +1034,9 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) } else if (ptr_[0] & 0x80000000) \ EMIT_PTR(ptr_, A64_BL((disp_) & 0x0fffffff)); \ else EMIT_PTR(ptr_, A64_B((disp_) & 0x0fffffff)); \ - (u8 *)ptr; \ -}) + if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)ptr; \ +} while (0) +#define emith_jump_patch_size() 8 #define emith_jump_reg(r) \ EMIT(A64_BR(r)) @@ -1085,11 +1069,6 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) rcache_free_tmp(_t); \ } while (0) -#define emith_call_link(r, target) do { \ - EMIT(A64_ADRXLIT_IMM(r, 8)); \ - emith_jump(target); \ -} while (0) - #define emith_call_cleanup() /**/ #define emith_ret() \ @@ -1100,6 +1079,9 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #define emith_ret_to_ctx(offs) \ emith_ctx_write_ptr(LR, offs) +#define emith_add_r_ret_imm(r, imm) \ + emith_add_r_r_ptr_imm(r, LR, imm) + // NB: pushes r or r18 for SP hardware alignment #define emith_push_ret(r) do { \ int r_ = (r >= 0 ? 
r : 18); \ @@ -1120,7 +1102,6 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #define emith_flush() /**/ #define host_instructions_updated(base, end) __builtin___clear_cache(base, end) #define emith_update_cache() /**/ -#define emith_jump_patch_size() 8 #define emith_rw_offs_max() 0xff diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index 0e85f92a7..6ff134d97 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -950,13 +950,6 @@ static void emith_lohi_nops(void) #define emith_read_r_r_r_c(cond, r, rs, rm) \ emith_read_r_r_r(r, rs, rm) -#define emith_read_r_r_r_ptr_wb(r, rs, rm) do { \ - emith_add_r_r_r(rs, rs, rm); \ - EMIT(MIPS_LW(r, rs, 0)); \ -} while (0) -#define emith_read_r_r_r_wb(r, rs, rm) \ - emith_read_r_r_r_ptr_wb(r, rs, rm) - #define emith_read8_r_r_offs(r, rs, offs) \ EMIT(MIPS_LBU(r, rs, offs)) #define emith_read8_r_r_offs_c(cond, r, rs, offs) \ @@ -1028,13 +1021,6 @@ static void emith_lohi_nops(void) #define emith_write_r_r_r_c(cond, r, rs, rm) \ emith_write_r_r_r(r, rs, rm) -#define emith_write_r_r_r_ptr_wb(r, rs, rm) do { \ - emith_add_r_r_r(rs, rs, rm); \ - EMIT(MIPS_SW(r, rs, 0)); \ -} while (0) -#define emith_write_r_r_r_wb(r, rs, rm) \ - emith_write_r_r_r_ptr_wb(r, rs, rm) - #define emith_ctx_read_ptr(r, offs) \ emith_read_r_r_offs_ptr(r, CONTEXT_REG, offs) @@ -1176,6 +1162,7 @@ static int emith_cond_check(int cond, int *r) emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)) #define emith_jump_patchable(target) \ emith_jump(target) +#define emith_jump_patchable_size() 8 /* J+delayslot */ // NB: MIPS conditional branches have only +/- 128KB range #define emith_jump_cond(cond, target) do { \ @@ -1190,6 +1177,8 @@ static int emith_cond_check(int cond, int *r) EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, (u8 *)tcache_ptr-bp-4)); \ } \ } while (0) +#define emith_jump_cond_inrange(target) \ + !(((u8 *)target - (u8 *)tcache_ptr + 0x20000) >> 18) #define emith_jump_cond_patchable(cond, target) do { \ int r_, 
mcond_ = emith_cond_check(cond, &r_); \ @@ -1199,16 +1188,14 @@ static int emith_cond_check(int cond, int *r) EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, (u8 *)tcache_ptr-bp-4)); \ } while (0) -#define emith_jump_cond_inrange(target) \ - !(((u8 *)target - (u8 *)tcache_ptr + 0x10000) >> 18) - // NB: returns position of patch for cache maintenance -#define emith_jump_patch(ptr, target) ({ \ +#define emith_jump_patch(ptr, target, pos) do { \ u32 *ptr_ = (u32 *)ptr-1; /* must skip condition check code */ \ while ((ptr_[0] & 0xf8000000) != OP_J << 26) ptr_ ++; \ EMIT_PTR(ptr_, MIPS_J((uintptr_t)target & 0x0fffffff)); \ - (u8 *)(ptr_-1); \ -}) + if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)(ptr_-1); \ +} while (0) +#define emith_jump_patch_size() 4 #define emith_jump_reg(r) \ emith_branch(MIPS_JR(r)) @@ -1235,11 +1222,6 @@ static int emith_cond_check(int cond, int *r) emith_call_reg(AT); \ } while (0) -#define emith_call_link(r, target) do { \ - EMIT(MIPS_BL(4)); EMIT(MIPS_ADD_IMM(r, LR, 8)); emith_flush(); \ - emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)); \ -} while (0) - #define emith_call_cleanup() /**/ #define emith_ret() \ @@ -1250,6 +1232,9 @@ static int emith_cond_check(int cond, int *r) #define emith_ret_to_ctx(offs) \ emith_ctx_write_ptr(LR, offs) +#define emith_add_r_ret_imm(r, imm) \ + emith_add_r_r_ptr_imm(r, LR, imm) + // NB: ABI SP alignment is 8 for compatibility with MIPS IV #define emith_push_ret(r) do { \ emith_sub_r_imm(SP, 8+16); /* reserve new arg save area (16) */ \ @@ -1271,7 +1256,6 @@ static int emith_cond_check(int cond, int *r) // NB: mips32r2 has SYNCI #define host_instructions_updated(base, end) __builtin___clear_cache(base, end) #define emith_update_cache() /**/ -#define emith_jump_patch_size() 4 #define emith_rw_offs_max() 0x7fff // SH2 drc specific diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index caade3a67..d8b3a2dd0 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -297,54 +297,61 @@ enum { xAX = 0, xCX, xDX, xBX, 
xSP, xBP, xSI, xDI, // x86-64,i386 common // _r_r_r_shift #define emith_add_r_r_r_lsl(d, s1, s2, lslimm) do { \ - int tmp_ = rcache_get_tmp(); \ - emith_lsl(tmp_, s2, lslimm); \ - emith_add_r_r_r(d, s1, tmp_); \ - rcache_free_tmp(tmp_); \ + if (lslimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsl(tmp_, s2, lslimm); \ + emith_add_r_r_r(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_add_r_r_r(d, s1, s2); \ } while (0) #define emith_add_r_r_r_lsl_ptr(d, s1, s2, lslimm) do { \ - int tmp_ = rcache_get_tmp(); \ - emith_lsl(tmp_, s2, lslimm); \ - emith_add_r_r_r_ptr(d, s1, tmp_); \ - rcache_free_tmp(tmp_); \ + if (lslimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsl(tmp_, s2, lslimm); \ + emith_add_r_r_r_ptr(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_add_r_r_r_ptr(d, s1, s2); \ } while (0) #define emith_add_r_r_r_lsr(d, s1, s2, lsrimm) do { \ - int tmp_ = rcache_get_tmp(); \ - emith_lsr(tmp_, s2, lsrimm); \ - emith_add_r_r_r(d, s1, tmp_); \ - rcache_free_tmp(tmp_); \ + if (lsrimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsr(tmp_, s2, lsrimm); \ + emith_add_r_r_r(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_add_r_r_r(d, s1, s2); \ } while (0) #define emith_sub_r_r_r_lsl(d, s1, s2, lslimm) do { \ - int tmp_ = rcache_get_tmp(); \ - emith_lsl(tmp_, s2, lslimm); \ - emith_sub_r_r_r(d, s1, tmp_); \ - rcache_free_tmp(tmp_); \ + if (lslimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsl(tmp_, s2, lslimm); \ + emith_sub_r_r_r(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_sub_r_r_r(d, s1, s2); \ } while (0) #define emith_or_r_r_r_lsl(d, s1, s2, lslimm) do { \ - int tmp_ = rcache_get_tmp(); \ - emith_lsl(tmp_, s2, lslimm); \ - emith_or_r_r_r(d, s1, tmp_); \ - rcache_free_tmp(tmp_); \ + if (lslimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsl(tmp_, s2, lslimm); \ + emith_or_r_r_r(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_or_r_r_r(d, s1, s2); \ } while (0) // _r_r_shift -#define emith_or_r_r_lsl(d, 
s, lslimm) do { \ - int tmp_ = rcache_get_tmp(); \ - emith_lsl(tmp_, s, lslimm); \ - emith_or_r_r(d, tmp_); \ - rcache_free_tmp(tmp_); \ -} while (0) +#define emith_or_r_r_lsl(d, s, lslimm) \ + emith_or_r_r_r_lsl(d, d, s, lslimm) -// d != s #define emith_eor_r_r_lsr(d, s, lsrimm) do { \ - emith_push(s); \ - emith_lsr(s, s, lsrimm); \ - emith_eor_r_r(d, s); \ - emith_pop(s); \ + if (lsrimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsr(tmp_, s, lsrimm); \ + emith_eor_r_r(d, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_eor_r_r(d, s); \ } while (0) // _r_imm @@ -792,14 +799,6 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common EMIT_OP_MODRM64(0x8b, 0, r, 4); \ EMIT_SIB64(0, rs, rm); /* mov r, [rm + rs * 1] */ \ } while (0) -#define emith_read_r_r_r_wb(r, rs, rm) do { \ - emith_read_r_r_r(r, rs, rm); \ - emith_add_r_r_ptr(rs, rm); \ -} while (0) -#define emith_read_r_r_r_ptr_wb(r, rs, rm) do { \ - emith_read_r_r_r_ptr(r, rs, rm); \ - emith_add_r_r_ptr(rs, rm); \ -} while (0) #define emith_write_r_r_r(r, rs, rm) do { \ EMIT_XREX_IF(0, r, rm, rs); \ @@ -811,15 +810,6 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common EMIT_OP_MODRM64(0x89, 0, r, 4); \ EMIT_SIB64(0, rs, rm); /* mov [rm + rs * 1], r */ \ } while (0) -#define emith_write_r_r_r_wb(r, rs, rm) do { \ - emith_write_r_r_r(r, rs, rm); \ - emith_add_r_r_ptr(rs, rm); \ -} while (0) -#define emith_write_r_r_r_ptr_wb(r, rs, rm) do { \ - emith_write_r_r_r_ptr(r, rs, rm); \ - emith_add_r_r_ptr(rs, rm); \ -} while (0) - #define emith_ctx_read(r, offs) \ emith_read_r_r_offs(r, CONTEXT_REG, offs) @@ -846,10 +836,11 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common emith_ctx_write(r_, offs_); \ } while (0) -// assumes EBX is free #define emith_ret_to_ctx(offs) do { \ - emith_pop(xBX); \ - emith_ctx_write(xBX, offs); \ + int tmp_ = rcache_get_tmp(); \ + emith_pop(tmp_); \ + emith_ctx_write(tmp_, offs); \ + rcache_free_tmp(tmp_); \ } while (0) 
#define emith_jump(ptr) do { \ @@ -860,24 +851,24 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define emith_jump_patchable(target) \ emith_jump(target) +#define emith_jump_patchable_size() 5 /* JMP rel32 */ #define emith_jump_cond(cond, ptr) do { \ u32 disp = (u8 *)(ptr) - ((u8 *)tcache_ptr + 6); \ EMIT_OP(0x0f80 | (cond)); \ EMIT(disp, u32); \ } while (0) +#define emith_jump_cond_inrange(ptr) !0 #define emith_jump_cond_patchable(cond, target) \ emith_jump_cond(cond, target) -#define emith_jump_patch(ptr, target) ({ \ +#define emith_jump_patch(ptr, target, pos) do { \ u32 disp_ = (u8 *)(target) - ((u8 *)(ptr) + 4); \ u32 offs_ = (*(u8 *)(ptr) == 0x0f) ? 2 : 1; \ EMIT_PTR((u8 *)(ptr) + offs_, disp_ - offs_, u32); \ - ptr; \ -}) - -#define emith_jump_cond_inrange(ptr) !0 + if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)ptr; \ +} while (0) #define emith_jump_patch_size() 6 #define emith_jump_at(ptr, target) do { \ @@ -903,20 +894,17 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common EMIT(offs, u32); \ } while (0) -#define emith_call_link(r, target) do { \ - EMIT_OP(0xe8); \ - EMIT(0, u32); /* call pc+0 */ \ - emith_pop(r); \ - emith_add_r_r_ptr_imm(r, r, 13); \ - emith_jump(target); \ -} while (0) - #define emith_call_cleanup() \ emith_add_r_r_ptr_imm(xSP, xSP, sizeof(void *)); // remove return addr #define emith_ret() \ EMIT_OP(0xc3) +#define emith_add_r_ret_imm(r, imm) do { \ + emith_read_r_r_offs_ptr(r, xSP, 0); \ + emith_add_r_r_ptr_imm(r, r, imm); \ +} while (0) + #define emith_jump_reg(r) \ EMIT_OP_MODRM(0xff, 3, 4, r) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index f6fbadaf4..ec8554cc6 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -40,7 +40,7 @@ #define PROPAGATE_CONSTANTS 1 #define LINK_BRANCHES 1 #define BRANCH_CACHE 1 -#define CALL_STACK 0 +#define CALL_STACK 1 #define ALIAS_REGISTERS 1 #define REMAP_REGISTER 1 #define LOOP_DETECTION 1 @@ -635,7 +635,7 @@ static signed char 
reg_map_host[HOST_REGS]; static void REGPARM(1) (*sh2_drc_entry)(SH2 *sh2); static void REGPARM(1) (*sh2_drc_dispatcher)(u32 pc); #if CALL_STACK -static void REGPARM(2) (*sh2_drc_dispatcher_call)(u32 pc, uptr host_pr); +static u32 REGPARM(2) (*sh2_drc_dispatcher_call)(u32 pc); static void REGPARM(1) (*sh2_drc_dispatcher_return)(u32 pc); #endif static void REGPARM(1) (*sh2_drc_exit)(u32 pc); @@ -1150,7 +1150,8 @@ static void dr_block_link(struct block_entry *be, struct block_link *bl, int emi bl->jump, bl->target_pc, be->tcache_ptr); if (emit_jump) { - u8 *jump = emith_jump_patch(bl->jump, be->tcache_ptr); + u8 *jump; + emith_jump_patch(bl->jump, be->tcache_ptr, &jump); // only needs sync if patch is possibly crossing cacheline (assume 16 byte) if ((uintptr_t)jump >>4 != ((uintptr_t)jump+emith_jump_patch_size()-1) >>4) host_instructions_updated(jump, jump+emith_jump_patch_size()); @@ -1171,7 +1172,8 @@ static void dr_block_unlink(struct block_link *bl, int emit_jump) if (bl->target) { if (emit_jump) { - u8 *jump = emith_jump_patch(bl->jump, sh2_drc_dispatcher); + u8 *jump; + emith_jump_patch(bl->jump, sh2_drc_dispatcher, &jump); // update cpu caches since the previous jump target doesn't exist anymore host_instructions_updated(jump, jump+emith_jump_patch_size()); } @@ -1381,7 +1383,7 @@ static void rcache_remap_vreg(int x); { d = 1; printf("cache check r=%d c=%d not connected?\n", i, gp->cnst); }\ if ((gp->flags & GRF_CDIRTY) && (gp->vreg != -1 || !(gp->flags & GRF_CONST)))\ { d = 1; printf("cache check r=%d CDIRTY?\n", i); } \ - if (gp->flags & GRF_PINNED) { \ + if (gp->flags & (GRF_STATIC|GRF_PINNED)) { \ if (gp->sreg == -1 || !(cache_regs[gp->sreg].flags & HRF_PINNED))\ { d = 1; printf("cache check r=%d v=%d not pinned?\n", i, gp->vreg); } \ else m &= ~(1 << gp->sreg); \ @@ -4407,7 +4409,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } #endif - rcache_unlock_all(); // may lock delay_reg + rcache_unlock_all(); #if LOOP_OPTIMIZER if (target && 
pinned_loop_pc[pinned_loop_count] == target_pc) { rcache_unpin_all(); @@ -4427,30 +4429,26 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #if CALL_STACK if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) { // BSR - tmp = rcache_get_tmp_arg(1); - emith_call_link(tmp, sh2_drc_dispatcher_call); - rcache_free_tmp(tmp); - } else + emith_call(sh2_drc_dispatcher_call); + } #endif - target = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id); + + target = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id); patchable = 1; } // create branch - if (patchable) { - if (cond != -1) + if (cond != -1) { + if (patchable) emith_jump_cond_patchable(cond, target); - else if (target != NULL) { - rcache_invalidate(); - emith_jump_patchable(target); - } - } else { - if (cond != -1) + else emith_jump_cond(cond, target); - else if (target != NULL) { - rcache_invalidate(); + } else { + rcache_invalidate(); + if (patchable) + emith_jump_patchable(target); + else emith_jump(target); - } } // branch not taken, correct cycle count @@ -4476,14 +4474,14 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_invalidate(); #if CALL_STACK struct op_data *opd_b = (op_flags[i] & OF_DELAY_OP) ? 
opd-1 : opd; + if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) { + // JSR/BSRF + emith_call(sh2_drc_dispatcher_call); + } + if (opd_b->rm == SHR_PR) { // RTS emith_jump(sh2_drc_dispatcher_return); - } else if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) { - // JSR/BSRF - tmp = rcache_get_tmp_arg(1); - emith_call_link(tmp, sh2_drc_dispatcher_call); - rcache_free(tmp); } else #endif if (gconst_get(SHR_PC, &target_pc)) { @@ -4544,7 +4542,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_flush(); emith_jump(sh2_drc_dispatcher); } - emith_jump_patch(branch_patch_ptr[i], target); + emith_jump_patch(branch_patch_ptr[i], target, NULL); } emith_pool_commit(0); @@ -4713,20 +4711,6 @@ static void sh2_generate_utils(void) emith_sh2_drc_exit(); emith_flush(); -#if CALL_STACK - // sh2_drc_dispatcher_call(u32 pc, uptr host_pr) - sh2_drc_dispatcher_call = (void *)tcache_ptr; - emith_ctx_read(arg2, offsetof(SH2, rts_cache_idx)); - emith_add_r_imm(arg2, 2*sizeof(void *)); - emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *)); - emith_ctx_write(arg2, offsetof(SH2, rts_cache_idx)); - emith_add_r_r_ptr_imm(arg3, CONTEXT_REG, offsetof(SH2, rts_cache) + sizeof(void *)); - emith_write_r_r_r_ptr_wb(arg1, arg2, arg3); - emith_ctx_read(arg3, SHR_PR * 4); - emith_write_r_r_offs(arg3, arg2, (s8)-sizeof(void *)); - emith_flush(); - // FALLTHROUGH -#endif // sh2_drc_dispatcher(u32 pc) sh2_drc_dispatcher = (void *)tcache_ptr; emith_ctx_write(arg0, SHR_PC * 4); @@ -4782,35 +4766,49 @@ static void sh2_generate_utils(void) emith_flush(); #if CALL_STACK + // pc = sh2_drc_dispatcher_call(u32 pc) + sh2_drc_dispatcher_call = (void *)tcache_ptr; + emith_ctx_read(arg2, offsetof(SH2, rts_cache_idx)); + emith_ctx_read(arg1, SHR_PR * 4); + emith_add_r_imm(arg2, 2*sizeof(void *)); + emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *)); + emith_ctx_write(arg2, offsetof(SH2, rts_cache_idx)); + emith_add_r_r_r_lsl_ptr(arg2, 
CONTEXT_REG, arg2, 0); + emith_write_r_r_offs(arg1, arg2, offsetof(SH2, rts_cache)); + emith_add_r_ret_imm(arg1, emith_jump_patchable_size()); // skip jump_patchable for rts host address + emith_write_r_r_offs_ptr(arg1, arg2, offsetof(SH2, rts_cache) + sizeof(void *)); + emith_ret(); + emith_flush(); + // sh2_drc_dispatcher_return(u32 pc) sh2_drc_dispatcher_return = (void *)tcache_ptr; emith_ctx_read(arg2, offsetof(SH2, rts_cache_idx)); - emith_add_r_r_ptr_imm(arg1, CONTEXT_REG, offsetof(SH2, rts_cache)); - emith_read_r_r_r_wb(arg3, arg1, arg2); + emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg2, 0); + emith_read_r_r_offs(arg3, arg1, offsetof(SH2, rts_cache)); emith_cmp_r_r(arg0, arg3); #if (DRC_DEBUG & 128) EMITH_SJMP_START(DCOND_EQ); - emith_move_r_ptr_imm(arg2, (uptr)&rcmiss); - emith_read_r_r_offs_c(DCOND_NE, arg1, arg2, 0); + emith_move_r_ptr_imm(arg3, (uptr)&rcmiss); + emith_read_r_r_offs_c(DCOND_NE, arg1, arg3, 0); emith_add_r_imm_c(DCOND_NE, arg1, 1); - emith_write_r_r_offs_c(DCOND_NE, arg1, arg2, 0); + emith_write_r_r_offs_c(DCOND_NE, arg1, arg3, 0); EMITH_SJMP_END(DCOND_EQ); #endif emith_jump_cond(DCOND_NE, sh2_drc_dispatcher); - emith_read_r_r_offs_ptr(arg0, arg1, sizeof(void *)); + emith_read_r_r_offs_ptr(arg0, arg1, offsetof(SH2, rts_cache) + sizeof(void *)); emith_sub_r_imm(arg2, 2*sizeof(void *)); emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *)); emith_ctx_write(arg2, offsetof(SH2, rts_cache_idx)); #if (DRC_DEBUG & 128) - emith_move_r_ptr_imm(arg2, (uptr)&rchit); - emith_read_r_r_offs(arg1, arg2, 0); + emith_move_r_ptr_imm(arg3, (uptr)&rchit); + emith_read_r_r_offs(arg1, arg3, 0); emith_add_r_imm(arg1, 1); - emith_write_r_r_offs(arg1, arg2, 0); + emith_write_r_r_offs(arg1, arg3, 0); #endif emith_jump_reg(arg0); emith_flush(); #endif - + // sh2_drc_test_irq(void) // assumes it's called from main function (may jump to dispatcher) sh2_drc_test_irq = (void *)tcache_ptr; diff --git a/pico/32x/pwm.c b/pico/32x/pwm.c index 
1c1ec4289..0aa2f586b 100644 --- a/pico/32x/pwm.c +++ b/pico/32x/pwm.c @@ -88,7 +88,7 @@ static void consume_fifo_do(SH2 *sh2, unsigned int m68k_cycles, mem->pwm_index[0] = (mem->pwm_index[0]+1) % 4; Pico32x.pwm_p[0]--; pwm.current[0] = convert_sample(fifo_l[mem->pwm_index[0]]); - sum |=pwm.current[0]; + sum |= pwm.current[0]; } if (Pico32x.pwm_p[1] > 0) { mem->pwm_index[1] = (mem->pwm_index[1]+1) % 4; From 675cad8ce76ea5240cf7fecf09813f2924a75441 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 28 Sep 2019 16:39:26 +0200 Subject: [PATCH 062/174] sh2 drc: drc exit, block linking and branch handling revised --- cpu/drc/emit_arm.c | 28 ++- cpu/drc/emit_arm64.c | 65 +++--- cpu/drc/emit_mips.c | 60 ++--- cpu/drc/emit_x86.c | 27 ++- cpu/sh2/compiler.c | 508 +++++++++++++++++++++++++++++-------------- 5 files changed, 452 insertions(+), 236 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 71a109222..b8c6419cc 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -631,8 +631,8 @@ static void emith_pool_commit(int jumpover) static inline void emith_pool_check(void) { // check if pool must be committed - if (literal_iindex > MAX_HOST_LITERALS-4 || - (u8 *)tcache_ptr - (u8 *)literal_insn[0] > 0xe00) + if (literal_iindex > MAX_HOST_LITERALS-4 || (literal_pindex && + (u8 *)tcache_ptr - (u8 *)literal_insn[0] > 0xe00)) // pool full, or displacement is approaching the limit emith_pool_commit(1); } @@ -889,11 +889,19 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_tst_r_imm_c(cond, r, imm) \ emith_top_imm(cond, A_OP_TST, r, imm) -#define emith_move_r_imm_s8(r, imm) do { \ +#define emith_move_r_imm_s8_patchable(r, imm) do { \ + emith_flush(); \ if ((s8)(imm) < 0) \ - EOP_MVN_IMM(r, 0, ((u8)(imm) ^ 0xff)); \ + EOP_MVN_IMM(r, 0, (u8)~(imm)); \ else \ - EOP_MOV_IMM(r, 0, (u8)imm); \ + EOP_MOV_IMM(r, 0, (u8)(imm)); \ +} while (0) +#define emith_move_r_imm_s8_patch(ptr, imm) do { \ + u32 *ptr_ = (u32 *)ptr; u32 op_ = *ptr_ & 
0xfe1ff000; \ + if ((s8)(imm) < 0) \ + EMIT_PTR(ptr_, op_ | (A_OP_MVN<<21) | (u8)~(imm));\ + else \ + EMIT_PTR(ptr_, op_ | (A_OP_MOV<<21) | (u8)(imm));\ } while (0) #define emith_and_r_r_imm(d, s, imm) \ @@ -1125,7 +1133,6 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_jump_patchable(target) \ emith_jump(target) -#define emith_jump_patchable_size() 4 #define emith_jump_cond(cond, target) \ emith_xbranch(cond, target, 0) @@ -1135,18 +1142,19 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) emith_jump_cond(cond, target) #define emith_jump_patch(ptr, target, pos) do { \ - u32 *ptr_ = ptr; \ + u32 *ptr_ = (u32 *)ptr; \ u32 val_ = (u32 *)(target) - ptr_ - 2; \ *ptr_ = (*ptr_ & 0xff000000) | (val_ & 0x00ffffff); \ if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)ptr; \ } while (0) +#define emith_jump_patch_inrange(ptr, target) !0 #define emith_jump_patch_size() 4 #define emith_jump_at(ptr, target) do { \ u32 val_ = (u32 *)(target) - (u32 *)(ptr) - 2; \ - emith_flush(); \ EOP_C_B_PTR(ptr, A_COND_AL, 0, val_ & 0xffffff); \ } while (0) +#define emith_jump_at_size() 4 #define emith_jump_reg_c(cond, r) \ EOP_C_BX(cond, r) @@ -1187,8 +1195,8 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_ret_to_ctx(offs) \ emith_ctx_write(LR, offs) -#define emith_add_r_ret_imm(r, imm) \ - emith_add_r_r_ptr_imm(r, LR, imm) +#define emith_add_r_ret(r) \ + emith_add_r_r_ptr(r, LR) /* pushes r12 for eabi alignment */ #define emith_push_ret(r) do { \ diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 72f53dd56..688649b5b 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -447,6 +447,8 @@ enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; #define emith_eor_r_r_r(d, s1, s2) \ emith_eor_r_r_r_lsl(d, s1, s2, 0) +#define emith_add_r_r_r_ptr(d, s1, s2) \ + emith_add_r_r_r_lsl_ptr(d, s1, s2, 0) #define emith_and_r_r_r(d, s1, s2) \ emith_and_r_r_r_lsl(d, s1, s2, 0) @@ -546,6 +548,20 @@ 
static void emith_move_imm64(int r, int wx, int64_t imm) #define emith_move_r_imm_c(cond, r, imm) \ emith_move_r_imm(r, imm) +#define emith_move_r_imm_s8_patchable(r, imm) do { \ + if ((s8)(imm) < 0) \ + EMIT(A64_MOVN_IMM(r, ~(s8)(imm), 0)); \ + else \ + EMIT(A64_MOVZ_IMM(r, (s8)(imm), 0)); \ +} while (0) +#define emith_move_r_imm_s8_patch(ptr, imm) do { \ + u32 *ptr_ = (u32 *)ptr; \ + int r_ = *ptr_ & 0x1f; \ + if ((s8)(imm) < 0) \ + EMIT_PTR(ptr_, A64_MOVN_IMM(r_, ~(s8)(imm), 0)); \ + else \ + EMIT_PTR(ptr_, A64_MOVZ_IMM(r_, (s8)(imm), 0)); \ +} while (0) // arithmetic, immediate static void emith_arith_imm(int op, int wx, int rd, int rn, s32 imm) @@ -995,16 +1011,6 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) emith_move_r_imm(arg, imm) // branching; NB: A64 B.cond has only +/- 1MB range -#define emith_bcond(ptr, patch, cond, target) do { \ - u32 disp_ = (u8 *)target - (u8 *)ptr; \ - if (disp_ >= 0xfff00000 || disp_ <= 0x000fffff) { /* can use near B.c */ \ - EMIT_PTR(ptr, A64_BCOND(cond, disp_ & 0x001fffff)); \ - if (patch) EMIT_PTR(ptr, A64_NOP); /* reserve space for far B */ \ - } else { /* far branch if near branch isn't possible */ \ - EMIT_PTR(ptr, A64_BCOND(emith_invert_cond(cond), 8)); \ - EMIT_PTR(ptr, A64_B((disp_ - 4) & 0x0fffffff)); \ - } \ -} while (0) #define emith_jump(target) do {\ u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \ @@ -1013,30 +1019,37 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #define emith_jump_patchable(target) \ emith_jump(target) -#define emith_jump_patchable_size() 4 -#define emith_jump_cond(cond, target) \ - emith_bcond(tcache_ptr, 0, cond, target) +#define emith_jump_cond(cond, target) do { \ + u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \ + EMIT(A64_BCOND(cond, disp_ & 0x001fffff)); \ +} while (0) #define emith_jump_cond_patchable(cond, target) \ - emith_bcond(tcache_ptr, 1, cond, target) + emith_jump_cond(cond, target) #define 
emith_jump_cond_inrange(target) \ !(((u8 *)target - (u8 *)tcache_ptr + 0x100000) >> 21) #define emith_jump_patch(ptr, target, pos) do { \ u32 *ptr_ = (u32 *)ptr; \ - u32 disp_ = (u8 *)(target) - (u8 *)(ptr_); \ - int cond_ = ptr_[0] & 0xf; \ - if ((ptr_[0] & 0xff000000) == 0x54000000) { /* B.cond */ \ - if (ptr_[1] != A64_NOP) cond_ = emith_invert_cond(cond_); \ - emith_bcond(ptr_, 1, cond_, target); \ - } else if (ptr_[0] & 0x80000000) \ - EMIT_PTR(ptr_, A64_BL((disp_) & 0x0fffffff)); \ - else EMIT_PTR(ptr_, A64_B((disp_) & 0x0fffffff)); \ - if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)ptr; \ + u32 disp_ = (u8 *)target - (u8 *)ptr, mask_; \ + if ((*ptr_ & 0xff000000) == 0x54000000) \ + mask_ = 0xff00001f, disp_ <<= 5; /* B.cond, range 21 bit */ \ + else mask_ = 0xfc000000; /* B[L], range 28 bit */ \ + EMIT_PTR(ptr_, (*ptr_ & mask_) | ((disp_ >> 2) & ~mask_)); \ + if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)(ptr_-1); \ +} while (0) + +#define emith_jump_patch_inrange(ptr, target) \ + !(((u8 *)target - (u8 *)ptr + 0x100000) >> 21) +#define emith_jump_patch_size() 4 + +#define emith_jump_at(ptr, target) do { \ + u32 disp_ = (u8 *)target - (u8 *)ptr; \ + EMIT_PTR(ptr, A64_B(disp_ & 0x0fffffff)); \ } while (0) -#define emith_jump_patch_size() 8 +#define emith_jump_at_size() 4 #define emith_jump_reg(r) \ EMIT(A64_BR(r)) @@ -1079,8 +1092,8 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #define emith_ret_to_ctx(offs) \ emith_ctx_write_ptr(LR, offs) -#define emith_add_r_ret_imm(r, imm) \ - emith_add_r_r_ptr_imm(r, LR, imm) +#define emith_add_r_ret(r) \ + emith_add_r_r_r_ptr(r, LR, r) // NB: pushes r or r18 for SP hardware alignment #define emith_push_ret(r) do { \ diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index 6ff134d97..ad02ff245 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -285,7 +285,7 @@ static int emith_b_isswap(u32 bop, u32 lop) return bop; else if (emith_is_b(bop) && emith_rd(lop) != 
emith_rs(bop)) if ((bop & 0xffff) != 0x7fff) // displacement overflow? - return (bop & 0xffff0000) | ((bop & 0xffff)+1); + return (bop & 0xffff0000) | ((bop+1) & 0x0000ffff); return 0; } @@ -332,14 +332,14 @@ static void *emith_branch(u32 op) #define JMP_EMIT(cond, ptr) { \ u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr) - 4; \ - EMIT_PTR(ptr, MIPS_BCONDZ(cond_m, cond_r, val_ & 0x0003ffff)); \ emith_flush(); /* NO delay slot handling across jump targets */ \ + EMIT_PTR(ptr, MIPS_BCONDZ(cond_m, cond_r, val_ & 0x0003ffff)); \ } #define JMP_EMIT_NC(ptr) { \ u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr) - 4; \ - EMIT_PTR(ptr, MIPS_B(val_ & 0x0003ffff)); \ emith_flush(); \ + EMIT_PTR(ptr, MIPS_B(val_ & 0x0003ffff)); \ } #define EMITH_JMP_START(cond) { \ @@ -645,6 +645,13 @@ static void emith_move_imm(int r, uintptr_t imm) #define emith_move_r_imm_c(cond, r, imm) \ emith_move_r_imm(r, imm) +#define emith_move_r_imm_s8_patchable(r, imm) \ + EMIT(MIPS_ADD_IMM(r, Z0, (s8)(imm))) +#define emith_move_r_imm_s8_patch(ptr, imm) do { \ + u32 *ptr_ = (u32 *)ptr; \ + while (*ptr_ >> 26 != OP_ADDIU) ptr_++; \ + EMIT_PTR(ptr_, (*ptr_ & 0xffff0000) | (u16)(s8)(imm)); \ +} while (0) // arithmetic, immediate static void emith_arith_imm(int op, int rd, int rs, u32 imm) @@ -1162,41 +1169,44 @@ static int emith_cond_check(int cond, int *r) emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)) #define emith_jump_patchable(target) \ emith_jump(target) -#define emith_jump_patchable_size() 8 /* J+delayslot */ // NB: MIPS conditional branches have only +/- 128KB range #define emith_jump_cond(cond, target) do { \ int r_, mcond_ = emith_cond_check(cond, &r_); \ u32 disp_ = (u8 *)target - (u8 *)tcache_ptr - 4; \ - if (disp_ >= 0xfffe0000 || disp_ <= 0x0001ffff) { /* can use near B */ \ - emith_branch(MIPS_BCONDZ(mcond_,r_,disp_ & 0x0003ffff)); \ - } else { /* far branch if near branch isn't possible */ \ - mcond_ = emith_invert_branch(mcond_); \ - u8 *bp = emith_branch(MIPS_BCONDZ(mcond_, r_, 0)); \ - 
emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)); \ - EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, (u8 *)tcache_ptr-bp-4)); \ - } \ + emith_branch(MIPS_BCONDZ(mcond_,r_,disp_ & 0x0003ffff)); \ } while (0) -#define emith_jump_cond_inrange(target) \ - !(((u8 *)target - (u8 *)tcache_ptr + 0x20000) >> 18) +#define emith_jump_cond_patchable(cond, target) \ + emith_jump_cond(cond, target) -#define emith_jump_cond_patchable(cond, target) do { \ - int r_, mcond_ = emith_cond_check(cond, &r_); \ - mcond_ = emith_invert_branch(mcond_); \ - u8 *bp = emith_branch(MIPS_BCONDZ(mcond_, r_, 0));\ - emith_branch(MIPS_J((uintptr_t)target & 0x0fffffff)); \ - EMIT_PTR(bp, MIPS_BCONDZ(mcond_, r_, (u8 *)tcache_ptr-bp-4)); \ -} while (0) +#define emith_jump_cond_inrange(target) \ + ((u8 *)target - (u8 *)tcache_ptr - 4 < 0x00020000U || \ + (u8 *)target - (u8 *)tcache_ptr - 4 >= 0xfffe0010U) // mind cond_check // NB: returns position of patch for cache maintenance #define emith_jump_patch(ptr, target, pos) do { \ u32 *ptr_ = (u32 *)ptr-1; /* must skip condition check code */ \ - while ((ptr_[0] & 0xf8000000) != OP_J << 26) ptr_ ++; \ - EMIT_PTR(ptr_, MIPS_J((uintptr_t)target & 0x0fffffff)); \ + u32 disp_, mask_; \ + while (!emith_is_j(*ptr_) && !emith_is_b(*ptr_)) ptr_ ++; \ + if (emith_is_b(*ptr_)) \ + mask_ = 0xffff0000, disp_ = (u8 *)target - (u8 *)ptr_ - 4; \ + else mask_ = 0xfc000000, disp_ = (uintptr_t)target; \ + EMIT_PTR(ptr_, (*ptr_ & mask_) | ((disp_ >> 2) & ~mask_)); \ if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)(ptr_-1); \ } while (0) + +#define emith_jump_patch_inrange(ptr, target) \ + ((u8 *)target - (u8 *)ptr - 4 < 0x00020000U || \ + (u8 *)target - (u8 *)ptr - 4 >= 0xfffe0010U) // mind cond_check #define emith_jump_patch_size() 4 +#define emith_jump_at(ptr, target) do { \ + u32 *ptr_ = (u32 *)ptr; \ + EMIT_PTR(ptr_, MIPS_J((uintptr_t)target & 0x0fffffff)); \ + EMIT_PTR(ptr_, MIPS_NOP); \ +} while (0) +#define emith_jump_at_size() 8 + #define emith_jump_reg(r) \ 
emith_branch(MIPS_JR(r)) #define emith_jump_reg_c(cond, r) \ @@ -1232,8 +1242,8 @@ static int emith_cond_check(int cond, int *r) #define emith_ret_to_ctx(offs) \ emith_ctx_write_ptr(LR, offs) -#define emith_add_r_ret_imm(r, imm) \ - emith_add_r_r_ptr_imm(r, LR, imm) +#define emith_add_r_ret(r) \ + emith_add_r_r_ptr(r, LR) // NB: ABI SP alignment is 8 for compatibility with MIPS IV #define emith_push_ret(r) do { \ diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index d8b3a2dd0..451fa8d0a 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -371,8 +371,16 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common } \ } while (0) -#define emith_move_r_imm_s8(r, imm) \ - emith_move_r_imm(r, (u32)(signed int)(signed char)(imm)) +#define emith_move_r_imm_s8_patchable(r, imm) do { \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP(0xb8 + ((r)&7)); \ + EMIT((s8)(imm), u32); \ +} while (0) +#define emith_move_r_imm_s8_patch(ptr, imm) do { \ + u8 *ptr_ = ptr; \ + while ((*ptr_ & 0xf8) != 0xb8) ptr_++; \ + EMIT_PTR(ptr_ + 1, (s8)(imm), u32); \ +} while (0) #define emith_arith_r_imm(op, r, imm) do { \ EMIT_REX_IF(0, 0, r); \ @@ -851,7 +859,6 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define emith_jump_patchable(target) \ emith_jump(target) -#define emith_jump_patchable_size() 5 /* JMP rel32 */ #define emith_jump_cond(cond, ptr) do { \ u32 disp = (u8 *)(ptr) - ((u8 *)tcache_ptr + 6); \ @@ -867,15 +874,17 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common u32 disp_ = (u8 *)(target) - ((u8 *)(ptr) + 4); \ u32 offs_ = (*(u8 *)(ptr) == 0x0f) ? 
2 : 1; \ EMIT_PTR((u8 *)(ptr) + offs_, disp_ - offs_, u32); \ - if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)ptr; \ + if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)ptr + offs_; \ } while (0) -#define emith_jump_patch_size() 6 +#define emith_jump_patch_size() 4 +#define emith_jump_patch_inrange(ptr, target) !0 #define emith_jump_at(ptr, target) do { \ u32 disp_ = (u8 *)(target) - ((u8 *)(ptr) + 5); \ EMIT_PTR(ptr, 0xe9, u8); \ EMIT_PTR((u8 *)(ptr) + 1, disp_, u32); \ } while (0) +#define emith_jump_at_size() 5 #define emith_call(ptr) do { \ u32 disp = (u8 *)(ptr) - ((u8 *)tcache_ptr + 5); \ @@ -900,9 +909,9 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define emith_ret() \ EMIT_OP(0xc3) -#define emith_add_r_ret_imm(r, imm) do { \ - emith_read_r_r_offs_ptr(r, xSP, 0); \ - emith_add_r_r_ptr_imm(r, r, imm); \ +#define emith_add_r_ret(r) do { \ + EMIT_REX_IF(1, r, xSP); \ + emith_deref_modrm(0x03, 0, r, xSP); /* add r, [xsp] */ \ } while (0) #define emith_jump_reg(r) \ @@ -974,7 +983,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common emith_move_r_imm(rd, imm); \ } while (0) -#define host_instructions_updated(base, end) +#define host_instructions_updated(base, end) (void)(base),(void)(end) #define emith_update_cache() /**/ #define emith_rw_offs_max() 0xffffffff diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index ec8554cc6..932f21cfb 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -69,7 +69,7 @@ // 800 - state dump on exit // { #ifndef DRC_DEBUG -#define DRC_DEBUG 0//x8e7 +#define DRC_DEBUG 0//x8c7 #endif #if DRC_DEBUG @@ -288,15 +288,19 @@ static u8 *tcache_ptr; #define MAX_BLOCK_ENTRIES (BLOCK_INSN_LIMIT / 6) +enum { BL_JMP=1, BL_LDJMP, BL_JCCBLX }; struct block_link { + short tcache_id; + short type; // BL_JMP et al u32 target_pc; void *jump; // insn address + void *blx; // block link/exit area if any + u8 jdisp[8]; // jump backup buffer struct block_link *next; // either in 
block_entry->links or unresolved struct block_link *o_next; // ...in block_entry->o_links struct block_link *prev; struct block_link *o_prev; struct block_entry *target;// target block this is linked in (be->links) - int tcache_id; }; struct block_entry { @@ -686,18 +690,24 @@ static int dr_ctx_get_mem_ptr(SH2 *sh2, u32 a, u32 *mask) return poffs; } -static struct block_entry *dr_get_entry(u32 pc, int is_slave, int *tcache_id) +static int dr_get_tcache_id(u32 pc, int is_slave) { - struct block_entry *be; u32 tcid = 0; if ((pc & 0xe0000000) == 0xc0000000) tcid = 1 + is_slave; // data array if ((pc & ~0xfff) == 0) tcid = 1 + is_slave; // BIOS - *tcache_id = tcid; + return tcid; +} - be = HASH_FUNC(hash_tables[tcid], pc, HASH_TABLE_SIZE(tcid) - 1); +static struct block_entry *dr_get_entry(u32 pc, int is_slave, int *tcache_id) +{ + struct block_entry *be; + + *tcache_id = dr_get_tcache_id(pc, is_slave); + + be = HASH_FUNC(hash_tables[*tcache_id], pc, HASH_TABLE_SIZE(*tcache_id) - 1); if (be != NULL) // don't ask... 
gcc code generation hint for (; be != NULL; be = be->next) if (be->pc == pc) @@ -1101,17 +1111,11 @@ static struct block_desc *dr_add_block(u32 addr, int size, bd->size_lit = size_lit; bd->tcache_ptr = tcache_ptr; bd->crc = crc; - bd->active = 1; - - bd->entry_count = 1; - bd->entryp[0].pc = addr; - bd->entryp[0].tcache_ptr = tcache_ptr; - bd->entryp[0].links = bd->entryp[0].o_links = NULL; + bd->active = 0; + bd->entry_count = 0; #if (DRC_DEBUG & 2) - bd->entryp[0].block = bd; bd->refcount = 0; #endif - add_to_hashlist(&bd->entryp[0], tcache_id); *blk_id = *bcount; (*bcount)++; @@ -1150,11 +1154,33 @@ static void dr_block_link(struct block_entry *be, struct block_link *bl, int emi bl->jump, bl->target_pc, be->tcache_ptr); if (emit_jump) { - u8 *jump; - emith_jump_patch(bl->jump, be->tcache_ptr, &jump); + u8 *jump = bl->jump; + int jsz = emith_jump_patch_size(); + if (bl->type == BL_JMP) { // patch: jump @entry + // inlined: @jump far jump to target + emith_jump_patch(jump, be->tcache_ptr, &jump); + } else if (bl->type == BL_LDJMP) { // write: jump @entry + // inlined: @jump far jump to target + emith_jump_at(jump, be->tcache_ptr); + jsz = emith_jump_at_size(); + } else if (bl->type == BL_JCCBLX) { // patch: jump cond -> jump @entry + if (emith_jump_patch_inrange(bl->jump, be->tcache_ptr)) { + // inlined: @jump near jumpcc to target + emith_jump_patch(jump, be->tcache_ptr, &jump); + } else { // dispatcher cond immediate + // via blx: @jump near jumpcc to blx; @blx far jump + emith_jump_patch(jump, bl->blx, &jump); + emith_jump_at(bl->blx, be->tcache_ptr); + if ((((uintptr_t)bl->blx & 0xf) + emith_jump_at_size()-1) > 0xf) + host_instructions_updated(bl->blx, bl->blx + emith_jump_at_size()-1); + } + } else { + printf("unknown BL type %d\n", bl->type); + exit(1); + } // only needs sync if patch is possibly crossing cacheline (assume 16 byte) - if ((uintptr_t)jump >>4 != ((uintptr_t)jump+emith_jump_patch_size()-1) >>4) - host_instructions_updated(jump, 
jump+emith_jump_patch_size()); + if ((((uintptr_t)jump & 0xf) + jsz-1) > 0xf) + host_instructions_updated(jump, jump + jsz-1); } // move bl to block_entry @@ -1172,10 +1198,26 @@ static void dr_block_unlink(struct block_link *bl, int emit_jump) if (bl->target) { if (emit_jump) { - u8 *jump; - emith_jump_patch(bl->jump, sh2_drc_dispatcher, &jump); + u8 *jump = bl->jump; + int jsz = emith_jump_patch_size(); + if (bl->type == BL_JMP) { // jump_patch @dispatcher + // inlined: @jump far jump to dispatcher + emith_jump_patch(jump, sh2_drc_dispatcher, &jump); + } else if (bl->type == BL_LDJMP) { // restore: load pc, jump @dispatcher + // inlined: @jump load target_pc, far jump to dispatcher + memcpy(jump, bl->jdisp, emith_jump_at_size()); + jsz = emith_jump_at_size(); + } else if (bl->type == BL_JCCBLX) { // jump cond @blx; @blx: load pc, jump + // via blx: @jump near jumpcc to blx; @blx load target_pc, far jump + emith_jump_patch(bl->jump, bl->blx, &jump); + memcpy(bl->blx, bl->jdisp, emith_jump_at_size()); + host_instructions_updated(bl->blx, bl->blx + emith_jump_at_size()-1); + } else { + printf("unknown BL type %d\n", bl->type); + exit(1); + } // update cpu caches since the previous jump target doesn't exist anymore - host_instructions_updated(jump, jump+emith_jump_patch_size()); + host_instructions_updated(jump, jump + jsz-1); } if (bl->prev) @@ -1189,18 +1231,17 @@ static void dr_block_unlink(struct block_link *bl, int emit_jump) } #endif -static void *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_slave, int tcache_id) +static struct block_link *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_slave, int tcache_id) { #if LINK_BRANCHES struct block_link *bl = block_link_pool[tcache_id]; int cnt = block_link_pool_counts[tcache_id]; - struct block_entry *be = NULL; int target_tcache_id; // get the target block entry - be = dr_get_entry(pc, is_slave, &target_tcache_id); + target_tcache_id = dr_get_tcache_id(pc, is_slave); if 
(target_tcache_id && target_tcache_id != tcache_id) - return sh2_drc_dispatcher; + return NULL; // get a block link if (blink_free[tcache_id] != NULL) { @@ -1208,29 +1249,24 @@ static void *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_sla blink_free[tcache_id] = bl->next; } else if (cnt >= BLOCK_LINK_MAX_COUNT(tcache_id)) { dbg(1, "bl overflow for tcache %d", tcache_id); - return sh2_drc_dispatcher; + return NULL; } else { bl += cnt; block_link_pool_counts[tcache_id] = cnt+1; } - // prepare link and add to ougoing list of owner + // prepare link and add to outgoing list of owner bl->tcache_id = tcache_id; bl->target_pc = pc; bl->jump = tcache_ptr; + bl->blx = NULL; bl->o_next = owner->o_links; owner->o_links = bl; - if (be != NULL) { - dr_block_link(be, bl, 0); // jump not yet emitted by translate() - return be->tcache_ptr; - } - else { - add_to_hashlist_unresolved(bl, tcache_id); - return sh2_drc_dispatcher; - } + add_to_hashlist_unresolved(bl, tcache_id); + return bl; #else - return sh2_drc_dispatcher; + return NULL; #endif } @@ -1272,6 +1308,27 @@ static void dr_link_outgoing(struct block_entry *be, int tcache_id, int is_slave #endif } +static void dr_activate_block(struct block_desc *bd, int tcache_id, int is_slave) +{ + int i; + + // connect branches + for (i = 0; i < bd->entry_count; i++) { + struct block_entry *entry = &bd->entryp[i]; + add_to_hashlist(entry, tcache_id); + // incoming branches + dr_link_blocks(entry, tcache_id); + if (!tcache_id) + dr_link_blocks(entry, is_slave?2:1); + // outgoing branches + dr_link_outgoing(entry, tcache_id, is_slave); + } + + // mark memory for overwrite detection + dr_mark_memory(1, bd, tcache_id, 0); + bd->active = 1; +} + #define ADD_TO_ARRAY(array, count, item, failcode) { \ if (count >= ARRAY_SIZE(array)) { \ dbg(1, "warning: " #array " overflow"); \ @@ -2422,6 +2479,7 @@ static void rcache_invalidate(void) { int i; gconst_invalidate(); + rcache_unlock_all(); for (i = 0; i < ARRAY_SIZE(cache_regs); 
i++) rcache_free_vreg(i); @@ -2446,7 +2504,6 @@ static void rcache_invalidate(void) static void rcache_flush(void) { - rcache_unlock_all(); rcache_clean(); rcache_invalidate(); } @@ -2916,13 +2973,22 @@ static void *dr_get_pc_base(u32 pc, SH2 *sh2); static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { + // branch targets in current block u32 branch_target_pc[MAX_LOCAL_BRANCHES]; void *branch_target_ptr[MAX_LOCAL_BRANCHES]; int branch_target_count = 0; - void *branch_patch_ptr[MAX_LOCAL_BRANCHES]; + // unresolved local forward branches, for fixup at block end u32 branch_patch_pc[MAX_LOCAL_BRANCHES]; + void *branch_patch_ptr[MAX_LOCAL_BRANCHES]; int branch_patch_count = 0; + // external branch targets with a block link/exit area + u32 blx_target_pc[MAX_LOCAL_BRANCHES]; + void *blx_target_ptr[MAX_LOCAL_BRANCHES]; + struct block_link *blx_target_bl[MAX_LOCAL_BRANCHES]; + int blx_target_count = 0; + u8 op_flags[BLOCK_INSN_LIMIT]; + struct drcf { int delay_reg:8; u32 loop_type:8; @@ -2931,9 +2997,12 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) u32 pending_branch_direct:1; u32 pending_branch_indirect:1; } drcf = { 0, }; + #if LOOP_OPTIMIZER - void *pinned_loop_ptr[MAX_LOCAL_BRANCHES/16]; + // loops with pinned registers for optimzation + // pinned regs are like statics and don't need saving/restoring inside a loop u32 pinned_loop_pc[MAX_LOCAL_BRANCHES/16]; + void *pinned_loop_ptr[MAX_LOCAL_BRANCHES/16]; u32 pinned_loop_mask[MAX_LOCAL_BRANCHES/16]; int pinned_loop_count = 0; #endif @@ -2976,24 +3045,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) base_literals, end_literals - base_literals); if (block) { - // connect branches dbg(2, "== %csh2 reuse block %08x-%08x,%08x-%08x -> %p", sh2->is_slave ? 
's' : 'm', base_pc, end_pc, base_literals, end_literals, block->entryp->tcache_ptr); - for (i = 0; i < block->entry_count; i++) { - entry = &block->entryp[i]; - add_to_hashlist(entry, tcache_id); -#if LINK_BRANCHES - // incoming branches - dr_link_blocks(entry, tcache_id); - if (!tcache_id) - dr_link_blocks(entry, sh2->is_slave?2:1); - // outgoing branches - dr_link_outgoing(entry, tcache_id, sh2->is_slave); -#endif - } - // mark memory for overwrite detection - dr_mark_memory(1, block, tcache_id, 0); - block->active = 1; + dr_activate_block(block, tcache_id, sh2->is_slave); emith_update_cache(); return block->entryp[0].tcache_ptr; } @@ -3069,7 +3123,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (op_flags[v] & OF_BASIC_LOOP) { m3 &= ~rcache_regs_static & ~BITMASK4(SHR_PC, SHR_PR, SHR_SR, SHR_MEM); if (m3 && count_bits(m3) < count_bits(rcache_hregs_reg) && - pinned_loop_count < ARRAY_SIZE(pinned_loop_pc)) { + pinned_loop_count < ARRAY_SIZE(pinned_loop_pc)-1) { pinned_loop_mask[pinned_loop_count] = m3; pinned_loop_pc[pinned_loop_count++] = base_pc + 2*v; } else @@ -3080,6 +3134,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } #endif } + pinned_loop_pc[pinned_loop_count] = -1; if (branch_target_count > 0) { memset(branch_target_ptr, 0, sizeof(branch_target_ptr[0]) * branch_target_count); @@ -3101,7 +3156,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // clear stale state after compile errors - rcache_unlock_all(); rcache_invalidate(); emith_invalidate_t(); drcf = (struct drcf) { 0 }; @@ -3146,39 +3200,31 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_sync_t(sr); rcache_flush(); emith_flush(); + } - // make block entry - v = block->entry_count; + // make block entry + v = block->entry_count; + entry = &block->entryp[v]; + if (v < ARRAY_SIZE(block->entryp)) + { entry = &block->entryp[v]; - if (v < ARRAY_SIZE(block->entryp)) - { - entry = &block->entryp[v]; - entry->pc = pc; - 
entry->tcache_ptr = tcache_ptr; - entry->links = entry->o_links = NULL; + entry->pc = pc; + entry->tcache_ptr = tcache_ptr; + entry->links = entry->o_links = NULL; #if (DRC_DEBUG & 2) - entry->block = block; + entry->block = block; #endif - add_to_hashlist(entry, tcache_id); - block->entry_count++; + block->entry_count++; - dbg(2, "-- %csh2 block #%d,%d entry %08x -> %p", - sh2->is_slave ? 's' : 'm', tcache_id, blkid_main, - pc, tcache_ptr); - } - else { - dbg(1, "too many entryp for block #%d,%d pc=%08x", - tcache_id, blkid_main, pc); - break; - } - } else { - entry = block->entryp; + dbg(2, "-- %csh2 block #%d,%d entry %08x -> %p", + sh2->is_slave ? 's' : 'm', tcache_id, blkid_main, + pc, tcache_ptr); + } + else { + dbg(1, "too many entryp for block #%d,%d pc=%08x", + tcache_id, blkid_main, pc); + break; } - - // since we made a block entry, link any other blocks that jump to it - dr_link_blocks(entry, tcache_id); - if (!tcache_id) // can safely link from cpu-local to global memory - dr_link_blocks(entry, sh2->is_slave?2:1); v = find_in_sorted_array(branch_target_pc, branch_target_count, pc); if (v >= 0) @@ -3220,29 +3266,35 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #endif // check cycles - tmp = rcache_get_tmp_arg(0); sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); emith_cmp_r_imm(sr, 0); + #if LOOP_OPTIMIZER - // on drc exit pinned registers must be saved + u8 *jp = NULL; if (op_flags[i] & OF_BASIC_LOOP) { - EMITH_JMP_START(DCOND_GT); + // if exiting a pinned loop pinned regs must be written back to ctx + // since they are reloaded in the loop entry code + jp = tcache_ptr; + emith_jump_cond_patchable(DCOND_GT, jp); // XXX need API for JMP_POS rcache_save_pinned(); - emith_move_r_imm(tmp, pc); - emith_jump(sh2_drc_exit); - EMITH_JMP_END(DCOND_GT); - } else + } #endif - if (emith_jump_cond_inrange(sh2_drc_exit)) { - emith_move_r_imm_c(DCOND_LE, tmp, pc); - emith_jump_cond(DCOND_LE, sh2_drc_exit); + if (blx_target_count < 
ARRAY_SIZE(blx_target_pc)) { + // exit via stub in blx table (saves some 1-3 insns in the main flow) + blx_target_pc[blx_target_count] = pc|1; + blx_target_bl[blx_target_count] = NULL; + blx_target_ptr[blx_target_count++] = tcache_ptr; } else { - EMITH_JMP_START(DCOND_GT); - emith_move_r_imm(tmp, pc); - emith_jump(sh2_drc_exit); - EMITH_JMP_END(DCOND_GT); + // blx table full, must inline exit code + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm_c(DCOND_LE, tmp, pc); + rcache_free_tmp(tmp); } - rcache_free_tmp(tmp); + emith_jump_cond_patchable(DCOND_LE, tcache_ptr); +#if LOOP_OPTIMIZER + if (op_flags[i] & OF_BASIC_LOOP) + emith_jump_patch(jp, tcache_ptr, NULL); +#endif #if (DRC_DEBUG & 32) // block hit counter @@ -3880,7 +3932,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 2: // SHAL Rn 0100nnnn00100000 tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_sync_t(sr); + emith_invalidate_t(); emith_tpop_carry(sr, 0); // dummy emith_lslf(tmp, tmp2, 1); emith_tpush_carry(sr, 0); @@ -3909,7 +3961,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 2: // SHAR Rn 0100nnnn00100001 tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_sync_t(sr); + emith_invalidate_t(); emith_tpop_carry(sr, 0); // dummy if (op & 0x20) { emith_asrf(tmp, tmp2, 1); @@ -3967,7 +4019,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x05: // ROTR Rn 0100nnnn00000101 tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_sync_t(sr); + emith_invalidate_t(); emith_tpop_carry(sr, 0); // dummy if (op & 1) { emith_rorf(tmp, tmp2, 1); @@ -4351,11 +4403,12 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) int cond = -1; int ctaken = 0; void *target = NULL; - int patchable = 0; + struct block_link *bl = NULL; if (OP_ISBRACND(opd_b->op)) ctaken = (op_flags[i] & 
OF_DELAY_OP) ? 1 : 2; cycles += ctaken; // assume branch taken + #if LOOP_OPTIMIZER if ((drcf.loop_type == OF_IDLE_LOOP || (drcf.loop_type == OF_DELAY_LOOP && drcf.delay_reg >= 0))) @@ -4365,14 +4418,35 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_sh2_delay_loop(cycles, drcf.delay_reg); drcf.polling = drcf.loop_type = 0; } + + if (target_pc < pc && pinned_loop_pc[pinned_loop_count] == target_pc) { + // backward jump at end of optimized loop + rcache_unpin_all(); + target = pinned_loop_ptr[pinned_loop_count]; + pinned_loop_count ++; + } #endif sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); + rcache_unlock_all(); rcache_clean(); - // emit condition test for conditional branch +#if CALL_STACK + void *rtsadd = NULL, *rtsret = NULL; + if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) { + // BSR - save rts data + tmp = rcache_get_tmp_arg(1); + rtsadd = tcache_ptr; + emith_move_r_imm_s8_patchable(tmp, 0); + rcache_invalidate_tmp(); + emith_call(sh2_drc_dispatcher_call); + rtsret = tcache_ptr; + } +#endif + if (OP_ISBRACND(opd_b->op)) { + // BT[S], BF[S] - emit condition test cond = (opd_b->op == OP_BRANCH_CF) ? 
DCOND_EQ : DCOND_NE; if (delay_dep_fw & BITMASK1(SHR_T)) { emith_sync_t(sr); @@ -4396,61 +4470,118 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { // local branch if (branch_target_ptr[v]) { - // jumps back can be linked here since host PC is already known + // local backward jump, link here now since host PC is already known target = branch_target_ptr[v]; + if (cond != -1) + emith_jump_cond(cond, target); + else { + emith_jump(target); + rcache_invalidate(); + } } else if (branch_patch_count < MAX_LOCAL_BRANCHES) { + // local forward jump target = tcache_ptr; branch_patch_pc[branch_patch_count] = target_pc; branch_patch_ptr[branch_patch_count] = target; branch_patch_count++; - patchable = 1; + if (cond != -1) + emith_jump_cond_patchable(cond, target); + else { + emith_jump_patchable(target); + rcache_invalidate(); + } } else dbg(1, "warning: too many local branches"); } #endif - rcache_unlock_all(); -#if LOOP_OPTIMIZER - if (target && pinned_loop_pc[pinned_loop_count] == target_pc) { - rcache_unpin_all(); - target = pinned_loop_ptr[pinned_loop_count]; - pinned_loop_count ++; - } -#endif - if (target == NULL) { // can't resolve branch locally, make a block exit - rcache_clean(); - tmp = rcache_get_tmp_arg(0); - emith_move_r_imm(tmp, target_pc); - rcache_free_tmp(tmp); + bl = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id); + if (cond != -1) { +#if 1 + if (bl) { + if (blx_target_count < ARRAY_SIZE(blx_target_pc)) { + // conditional jumps get a blx stub for the far jump + blx_target_pc[blx_target_count] = target_pc; + blx_target_bl[blx_target_count] = bl; + blx_target_ptr[blx_target_count++] = tcache_ptr; + bl->type = BL_JCCBLX; + target = tcache_ptr; + } else { + // blx table full, patch jump only + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, target_pc); + rcache_free_tmp(tmp); + bl->jump = tcache_ptr; + bl->type = BL_JMP; + target = sh2_drc_dispatcher; + } + emith_jump_cond_patchable(cond, target); + } else { + 
// cannot link, inline jump @dispatcher + EMITH_JMP_START(emith_invert_cond(cond)); + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, target_pc); + rcache_free_tmp(tmp); + target = sh2_drc_dispatcher; + + emith_jump(target); + EMITH_JMP_END(emith_invert_cond(cond)); + } +#elif 1 + // jump @dispatcher - ARM 32bit version with conditional execution + EMITH_SJMP_START(emith_invert_cond(cond)); + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm_c(cond, tmp, target_pc); + rcache_free_tmp(tmp); + target = sh2_drc_dispatcher; -#if CALL_STACK - if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) { - // BSR - emith_call(sh2_drc_dispatcher_call); - } + if (bl) { + bl->jump = tcache_ptr; + bl->type = BL_JMP; + } + emith_jump_cond_patchable(cond, target); + EMITH_SJMP_END(emith_invert_cond(cond)); +#else + // jump @dispatcher - generic version (jump !cond @over, jump @trgt) + EMITH_JMP_START(emith_invert_cond(cond)); + if (bl) { + bl->jump = tcache_ptr; + bl->type = BL_LDJMP; + } + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, target_pc); + rcache_free_tmp(tmp); + target = sh2_drc_dispatcher; + + emith_jump_patchable(target); + EMITH_JMP_END(emith_invert_cond(cond)); #endif + } else { + // unconditional, has the far jump inlined + if (bl) + bl->type = BL_LDJMP; - target = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id); - patchable = 1; - } + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, target_pc); + rcache_free_tmp(tmp); + target = sh2_drc_dispatcher; - // create branch - if (cond != -1) { - if (patchable) - emith_jump_cond_patchable(cond, target); - else - emith_jump_cond(cond, target); - } else { - rcache_invalidate(); - if (patchable) emith_jump_patchable(target); - else - emith_jump(target); + rcache_invalidate(); + } } + emith_flush(); + if (bl) + memcpy(bl->jdisp, bl->jump, emith_jump_at_size()); +#if CALL_STACK + if (rtsadd) + emith_move_r_imm_s8_patch(rtsadd, tcache_ptr - (u8 *)rtsret); +#endif + // branch not taken, 
correct cycle count if (ctaken) emith_add_r_imm(sr, ctaken << 12); @@ -4463,35 +4594,57 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) drcf.polling = drcf.loop_type = 0; } else if (drcf.pending_branch_indirect) { - void *target; u32 target_pc; + struct block_link *bl = NULL; sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); emith_sync_t(sr); rcache_clean(); + tmp = rcache_get_reg_arg(0, SHR_PC, NULL); - rcache_invalidate(); + #if CALL_STACK struct op_data *opd_b = (op_flags[i] & OF_DELAY_OP) ? opd-1 : opd; + void *rtsadd = NULL, *rtsret = NULL; + if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) { - // JSR/BSRF + // JSR, BSRF - save rts data + tmp = rcache_get_tmp_arg(1); + rtsadd = tcache_ptr; + emith_move_r_imm_s8_patchable(tmp, 0); + rcache_invalidate_tmp(); emith_call(sh2_drc_dispatcher_call); + rtsret = tcache_ptr; } +#endif +#if CALL_STACK if (opd_b->rm == SHR_PR) { - // RTS + // RTS - restore rts data, else jump to dispatcher emith_jump(sh2_drc_dispatcher_return); } else #endif if (gconst_get(SHR_PC, &target_pc)) { - // JMP const, treat like unconditional direct branch - target = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id); - emith_jump_patchable(target); + // JMP, JSR, BRAF, BSRF const - treat like unconditional direct branch + bl = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id); + if (bl) { // pc already loaded somewhere else, can patch jump only + bl->type = BL_JMP; + bl->jump = tcache_ptr; + } + emith_jump_patchable(sh2_drc_dispatcher); } else { - // JMP + // JMP, JSR, BRAF, BSRF not const emith_jump(sh2_drc_dispatcher); } + rcache_invalidate(); + + emith_flush(); +#if CALL_STACK + if (rtsadd) + emith_move_r_imm_s8_patch(rtsadd, tcache_ptr - (u8 *)rtsret); +#endif + drcf.pending_branch_indirect = 0; drcf.polling = drcf.loop_type = 0; } @@ -4508,24 +4661,48 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (! 
OP_ISBRAUC(opd->op)) { - void *target; + struct block_link *bl; tmp = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(tmp); emith_sync_t(tmp); rcache_clean(); + bl = dr_prepare_ext_branch(block->entryp, pc, sh2->is_slave, tcache_id); + if (bl) + bl->type = BL_LDJMP; tmp = rcache_get_tmp_arg(0); emith_move_r_imm(tmp, pc); - - target = dr_prepare_ext_branch(block->entryp, pc, sh2->is_slave, tcache_id); - if (target == NULL) - return NULL; + emith_jump_patchable(sh2_drc_dispatcher); rcache_invalidate(); - emith_jump_patchable(target); + emith_flush(); + if (bl) + memcpy(bl->jdisp, bl->jump, emith_jump_at_size()); } else rcache_flush(); + + // emit blx area + for (i = 0; i < blx_target_count; i++) { + void *target = (blx_target_pc[i] & 1 ? sh2_drc_exit : sh2_drc_dispatcher); + struct block_link *bl = blx_target_bl[i]; + + emith_pool_check(); + if (bl) + bl->blx = tcache_ptr; + emith_jump_patch(blx_target_ptr[i], tcache_ptr, NULL); + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, blx_target_pc[i] & ~1); + emith_jump(target); + rcache_invalidate(); + emith_flush(); + if (bl) + memcpy(bl->jdisp, bl->blx, emith_jump_at_size()); + } + emith_flush(); + do_host_disasm(tcache_id); + + emith_pool_commit(0); // link local branches for (i = 0; i < branch_patch_count; i++) { @@ -4539,20 +4716,18 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) target = tcache_ptr; tmp = rcache_get_tmp_arg(0); emith_move_r_imm(tmp, branch_patch_pc[i]); - rcache_flush(); emith_jump(sh2_drc_dispatcher); + rcache_flush(); } emith_jump_patch(branch_patch_ptr[i], target, NULL); } - emith_pool_commit(0); - - dr_mark_memory(1, block, tcache_id, 0); - tcache_ptrs[tcache_id] = tcache_ptr; - host_instructions_updated(block_entry_ptr, tcache_ptr); + dr_activate_block(block, tcache_id, sh2->is_slave); + emith_update_cache(); + do_host_disasm(tcache_id); dbg(2, " block #%d,%d -> %p tcache %d/%d, insns %d -> %d %.3f", @@ -4574,7 +4749,6 @@ static void REGPARM(2) *sh2_translate(SH2 
*sh2, int tcache_id) fflush(stdout); #endif - emith_update_cache(); return block_entry_ptr; } @@ -4769,14 +4943,14 @@ static void sh2_generate_utils(void) // pc = sh2_drc_dispatcher_call(u32 pc) sh2_drc_dispatcher_call = (void *)tcache_ptr; emith_ctx_read(arg2, offsetof(SH2, rts_cache_idx)); - emith_ctx_read(arg1, SHR_PR * 4); emith_add_r_imm(arg2, 2*sizeof(void *)); emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *)); emith_ctx_write(arg2, offsetof(SH2, rts_cache_idx)); emith_add_r_r_r_lsl_ptr(arg2, CONTEXT_REG, arg2, 0); - emith_write_r_r_offs(arg1, arg2, offsetof(SH2, rts_cache)); - emith_add_r_ret_imm(arg1, emith_jump_patchable_size()); // skip jump_patchable for rts host address - emith_write_r_r_offs_ptr(arg1, arg2, offsetof(SH2, rts_cache) + sizeof(void *)); + emith_ctx_read(arg3, SHR_PR * 4); + emith_add_r_ret(arg1); + emith_write_r_r_offs_ptr(arg1, arg2, offsetof(SH2, rts_cache)+sizeof(void *)); + emith_write_r_r_offs(arg3, arg2, offsetof(SH2, rts_cache)); emith_ret(); emith_flush(); @@ -5378,10 +5552,8 @@ void sh2_drc_finish(SH2 *sh2) if (block_tables[0] == NULL) return; - sh2_drc_flush_all(); - - for (i = 0; i < TCACHE_BUFFERS; i++) { #if (DRC_DEBUG & 4) + for (i = 0; i < TCACHE_BUFFERS; i++) { printf("~~~ tcache %d\n", i); #if 0 tcache_dsm_ptrs[i] = tcache_bases[i]; @@ -5394,8 +5566,12 @@ void sh2_drc_finish(SH2 *sh2) } #endif printf("max links: %d\n", block_link_pool_counts[i]); + } #endif + sh2_drc_flush_all(); + + for (i = 0; i < TCACHE_BUFFERS; i++) { if (block_tables[i] != NULL) free(block_tables[i]); block_tables[i] = NULL; From 45bc81f28608cb73ddede43c4df802a7a442c7c6 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 28 Sep 2019 17:12:56 +0200 Subject: [PATCH 063/174] sh2 drc: drc exit, block linking and branch handling revised (overlooked commit) --- cpu/sh2/compiler.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 932f21cfb..2c9e5b7a3 100644 --- 
a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -3270,7 +3270,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_cmp_r_imm(sr, 0); #if LOOP_OPTIMIZER - u8 *jp = NULL; + void *jp = NULL; if (op_flags[i] & OF_BASIC_LOOP) { // if exiting a pinned loop pinned regs must be written back to ctx // since they are reloaded in the loop entry code @@ -3292,8 +3292,10 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } emith_jump_cond_patchable(DCOND_LE, tcache_ptr); #if LOOP_OPTIMIZER - if (op_flags[i] & OF_BASIC_LOOP) + if (op_flags[i] & OF_BASIC_LOOP) { + emith_flush(); emith_jump_patch(jp, tcache_ptr, NULL); + } #endif #if (DRC_DEBUG & 32) From ea96d35b8987fbdb446a621e0c77ffb8280ab25e Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 4 Oct 2019 17:11:18 +0200 Subject: [PATCH 064/174] sh2 drc: bug fixing and optimization in register cache and branch handling --- cpu/drc/emit_arm.c | 15 ++-- cpu/drc/emit_mips.c | 4 +- cpu/sh2/compiler.c | 214 +++++++++++++++++++------------------------- 3 files changed, 104 insertions(+), 129 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index b8c6419cc..ec2958b12 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -160,7 +160,12 @@ static NOINLINE void EMIT(u32 op, u32 dst, u32 src) } } } - if (emit_index <= EMIT_CACHE_SIZE) { + if (dst & M1(PC)) { + // commit everything if a branch insn is emitted + for (i = 1; i <= emit_index+1; i++) + EMIT_PTR(emit_ptr, emit_cache[i].op); + emit_index = 0; + } else if (emit_index <= EMIT_CACHE_SIZE) { // queue not yet full emit_index++; } else { @@ -654,13 +659,14 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) literal_insn[pool_index] += move_offs; } -#define JMP_POS(ptr) \ +#define JMP_POS(ptr) { \ ptr = tcache_ptr; \ - EMIT(0,M1(PC),0); + EMIT(0,M1(PC),0); \ +} #define JMP_EMIT(cond, ptr) { \ u32 val_ = (u32 *)tcache_ptr - (u32 *)(ptr) - 2; \ - emith_flush(); \ + emith_flush(); /* NO insn swapping across jump 
targets */ \ EOP_C_B_PTR(ptr, cond, 0, val_ & 0xffffff); \ } @@ -890,7 +896,6 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) emith_top_imm(cond, A_OP_TST, r, imm) #define emith_move_r_imm_s8_patchable(r, imm) do { \ - emith_flush(); \ if ((s8)(imm) < 0) \ EOP_MVN_IMM(r, 0, (u8)~(imm)); \ else \ diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index ad02ff245..fadf57445 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -1249,11 +1249,11 @@ static int emith_cond_check(int cond, int *r) #define emith_push_ret(r) do { \ emith_sub_r_imm(SP, 8+16); /* reserve new arg save area (16) */ \ emith_write_r_r_offs(LR, SP, 4+16); \ - if ((r) >= 0) emith_write_r_r_offs(r, SP, 0+16); \ + if ((r) > 0) emith_write_r_r_offs(r, SP, 0+16); \ } while (0) #define emith_pop_and_ret(r) do { \ - if ((r) >= 0) emith_read_r_r_offs(r, SP, 0+16); \ + if ((r) > 0) emith_read_r_r_offs(r, SP, 0+16); \ emith_read_r_r_offs(LR, SP, 4+16); \ emith_add_r_imm(SP, 8+16); \ emith_ret(); \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 2c9e5b7a3..449ae0e1a 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -172,7 +172,6 @@ enum op_types { static u8 *tcache_dsm_ptrs[3]; static char sh2dasm_buff[64]; #define do_host_disasm(tcid) \ - emith_flush(); \ host_dasm(tcache_dsm_ptrs[tcid], emith_insn_ptr() - tcache_dsm_ptrs[tcid]); \ tcache_dsm_ptrs[tcid] = emith_insn_ptr() #else @@ -200,6 +199,7 @@ static char sh2dasm_buff[64]; #if (DRC_DEBUG & (8|256|512|1024)) || defined(PDB) #if (DRC_DEBUG & (256|512|1024)) static SH2 csh2[2][8]; +static FILE *trace[2]; #endif static void REGPARM(3) *sh2_drc_log_entry(void *block, SH2 *sh2, u32 sr) { @@ -210,7 +210,6 @@ static void REGPARM(3) *sh2_drc_log_entry(void *block, SH2 *sh2, u32 sr) pdb_step(sh2, sh2->pc); #elif (DRC_DEBUG & 256) { - static FILE *trace[2]; int idx = sh2->is_slave; if (!trace[0]) { trace[0] = fopen("pico.trace0", "wb"); @@ -225,7 +224,6 @@ static void REGPARM(3) *sh2_drc_log_entry(void 
*block, SH2 *sh2, u32 sr) } #elif (DRC_DEBUG & 512) { - static FILE *trace[2]; static SH2 fsh2; int idx = sh2->is_slave; if (!trace[0]) { @@ -1603,16 +1601,12 @@ static u16 rcache_counter; // SH2 register usage bitmasks static u32 rcache_hregs_reg; // regs of type HRT_REG (for pinning) static u32 rcache_regs_static; // statically allocated regs +static u32 rcache_regs_pinned; // pinned regs static u32 rcache_regs_now; // regs used in current insn static u32 rcache_regs_soon; // regs used in the next few insns static u32 rcache_regs_late; // regs used in later insns static u32 rcache_regs_discard; // regs overwritten without being used static u32 rcache_regs_clean; // regs needing cleaning -// combination masks XXX this seems obscure -#define rcache_regs_used (rcache_regs_soon|rcache_regs_late|rcache_regs_clean) -#define rcache_regs_nowused (rcache_regs_now|rcache_regs_used) -#define rcache_regs_nowsoon (rcache_regs_now|rcache_regs_soon) -#define rcache_regs_soonclean (rcache_regs_soon|rcache_regs_clean) static void rcache_lock_vreg(int x) { @@ -1677,6 +1671,7 @@ static void rcache_move_vreg(int d, int x) static void rcache_clean_vreg(int x) { + u32 rns = rcache_regs_now | rcache_regs_soon; int r; if (cache_regs[x].flags & HRF_DIRTY) { // writeback @@ -1685,23 +1680,18 @@ static void rcache_clean_vreg(int x) FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, r, if (guest_regs[r].flags & GRF_DIRTY) { if (guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) { - if (guest_regs[r].vreg != guest_regs[r].sreg) { - if (!(cache_regs[guest_regs[r].sreg].locked)) { - // statically mapped reg not in its sreg. 
move back to sreg - rcache_evict_vreg(guest_regs[r].sreg); - emith_move_r_r(cache_regs[guest_regs[r].sreg].hreg, - cache_regs[guest_regs[r].vreg].hreg); - rcache_remove_vreg_alias(x, r); - rcache_add_vreg_alias(guest_regs[r].sreg, r); - cache_regs[guest_regs[r].sreg].flags |= HRF_DIRTY; - } else { - // must evict since sreg is locked - if (~rcache_regs_discard & (1 << r)) - emith_ctx_write(cache_regs[x].hreg, r * 4); - guest_regs[r].flags &= ~GRF_DIRTY; - rcache_remove_vreg_alias(x, r); - } + if (guest_regs[r].vreg != guest_regs[r].sreg && + !cache_regs[guest_regs[r].sreg].locked && + !(rns & cache_regs[guest_regs[r].sreg].gregs)) { + // statically mapped reg not in its sreg. move back to sreg + rcache_evict_vreg(guest_regs[r].sreg); + emith_move_r_r(cache_regs[guest_regs[r].sreg].hreg, + cache_regs[guest_regs[r].vreg].hreg); + rcache_remove_vreg_alias(x, r); + rcache_add_vreg_alias(guest_regs[r].sreg, r); + cache_regs[guest_regs[r].sreg].flags |= HRF_DIRTY; } else + // cannot remap. keep dirty for writeback in unmap cache_regs[x].flags |= HRF_DIRTY; } else { if (~rcache_regs_discard & (1 << r)) @@ -1815,17 +1805,9 @@ static int rcache_allocate_vreg(int needed) { int x; - if (needed) { - // needed soon, try getting a REG 1st, use a TEMP only if none is available - x = rcache_allocate(1, 0); - if (x < 0) - x = rcache_allocate(-1, 1); - } else { - // not needed, try getting a TEMP 1st, use a REG only if none is available + x = rcache_allocate(1, needed ? 
0 : 3); + if (x < 0) x = rcache_allocate(-1, 1); - if (x < 0) - x = rcache_allocate(1, 0); - } return x; } @@ -1838,10 +1820,6 @@ static int rcache_allocate_nontemp(void) static int rcache_allocate_temp(void) { int x = rcache_allocate(-1, 1); - if (x < 0) { - printf("no temp register available, aborting\n"); - exit(1); - } return x; } @@ -1898,6 +1876,7 @@ static int rcache_map_reg(sh2_reg_e r, int hr, int mode) // remap vreg from a TEMP to a REG if it will be used (upcoming TEMP invalidation) static void rcache_remap_vreg(int x) { + u32 rsl_d = rcache_regs_soon | rcache_regs_late; int d; // x must be a cached vreg @@ -1905,7 +1884,7 @@ static void rcache_remap_vreg(int x) return; // don't do it if x is already a REG or isn't used or to be cleaned anyway if ((cache_regs[x].htype & HRT_REG) || - !(rcache_regs_used & ~rcache_regs_clean & cache_regs[x].gregs)) { + !(rsl_d & cache_regs[x].gregs)) { // clean here to avoid data loss on invalidation rcache_clean_vreg(x); return; @@ -1971,20 +1950,22 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr { int src, dst, ali; cache_reg_t *tr; + u32 rsp_d = (rcache_regs_now | rcache_regs_soon | + rcache_regs_static | rcache_regs_pinned) & ~rcache_regs_discard; dst = src = guest_regs[r].vreg; rcache_lock_vreg(src); // lock to avoid evicting src // good opportunity to relocate a remapped STATIC? 
- if ((guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) && src != guest_regs[r].sreg && + if ((guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) && + src != guest_regs[r].sreg && (src < 0 || mode != RC_GR_READ) && !cache_regs[guest_regs[r].sreg].locked && - (src < 0 || mode != RC_GR_READ) && - !(rcache_regs_nowsoon & cache_regs[guest_regs[r].sreg].gregs)) { + !(rsp_d & cache_regs[guest_regs[r].sreg].gregs)) { dst = guest_regs[r].sreg; rcache_evict_vreg(dst); } else if (dst < 0) { // allocate a cache register - if ((dst = rcache_allocate_vreg(rcache_regs_nowsoon & (1 << r))) < 0) { + if ((dst = rcache_allocate_vreg(rsp_d & (1 << r))) < 0) { printf("no registers to evict, aborting\n"); exit(1); } @@ -2004,12 +1985,12 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr ali = tr->gregs & ~(1 << r); if (mode != RC_GR_READ && src == dst && ali) { int x = -1; - if (rcache_regs_nowsoon & ali) { + if (rsp_d & ali) { if ((guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) && guest_regs[r].sreg == dst && !tr->locked) { // split aliases if r is STATIC in sreg and dst isn't already locked rcache_lock_vreg(dst); // lock to avoid evicting dst - x = rcache_allocate_vreg(rcache_regs_nowsoon & ali); + x = rcache_allocate_vreg(rsp_d & ali); rcache_unlock_vreg(dst); if (x >= 0) { src = x; @@ -2018,7 +1999,7 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr } else { // split r rcache_lock_vreg(src); // lock to avoid evicting src - x = rcache_allocate_vreg(rcache_regs_nowsoon & (1 << r)); + x = rcache_allocate_vreg(rsp_d & (1 << r)); rcache_unlock_vreg(src); if (x >= 0) { dst = x; @@ -2082,6 +2063,7 @@ static void rcache_pin_reg(sh2_reg_e r) guest_regs[r].flags |= GRF_PINNED; cache_regs[x].flags |= HRF_PINNED; guest_regs[r].sreg = x; + rcache_regs_pinned |= (1 << r); } #if DRC_DEBUG & 64 RCACHE_CHECK("after pin"); @@ -2275,10 +2257,8 @@ static void rcache_free(int hr) static void rcache_unlock(int x) { - if (x >= 0) { + if (x >= 
0) cache_regs[x].locked = 0; -// rcache_regs_now &= ~cache_regs[x].gregs; - } } static void rcache_unlock_all(void) @@ -2297,6 +2277,7 @@ static void rcache_unpin_all(void) guest_regs[i].flags &= ~GRF_PINNED; cache_regs[guest_regs[i].sreg].flags &= ~HRF_PINNED; guest_regs[i].sreg = -1; + rcache_regs_pinned &= ~(1 << i); } } #if DRC_DEBUG & 64 @@ -2337,7 +2318,8 @@ static inline void rcache_set_usage_discard(u32 mask) static inline int rcache_is_cached(sh2_reg_e r) { // is r in cache or needed RSN? - return (guest_regs[r].vreg >= 0 || (rcache_regs_soonclean & (1 << r))); + u32 rsc = rcache_regs_soon | rcache_regs_clean; + return (guest_regs[r].vreg >= 0 || (rsc & (1 << r))); } static inline int rcache_is_hreg_used(int hr) @@ -2407,9 +2389,8 @@ static void rcache_clean_masked(u32 mask) { int i, r, hr; - if (!(mask &= ~rcache_regs_static)) - return; rcache_regs_clean |= mask; + mask = rcache_regs_clean; // clean constants where all aliases are covered by the mask for (i = 0; i < ARRAY_SIZE(gconsts); i++) @@ -2447,9 +2428,11 @@ static void rcache_clean(void) rcache_unlock_vreg(guest_regs[i].vreg); if (guest_regs[i].vreg < 0) emith_ctx_read(cache_regs[guest_regs[i].sreg].hreg, i*4); - else + else { emith_move_r_r(cache_regs[guest_regs[i].sreg].hreg, cache_regs[guest_regs[i].vreg].hreg); + rcache_remove_vreg_alias(guest_regs[i].vreg, i); + } cache_regs[guest_regs[i].sreg].gregs = 1 << i; cache_regs[guest_regs[i].sreg].type = HR_CACHED; cache_regs[guest_regs[i].sreg].flags |= HRF_DIRTY|HRF_PINNED; @@ -3134,7 +3117,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } #endif } - pinned_loop_pc[pinned_loop_count] = -1; if (branch_target_count > 0) { memset(branch_target_ptr, 0, sizeof(branch_target_ptr[0]) * branch_target_count); @@ -3160,6 +3142,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_invalidate_t(); drcf = (struct drcf) { 0 }; #if LOOP_OPTIMIZER + pinned_loop_pc[pinned_loop_count] = -1; pinned_loop_count = 0; #endif @@ 
-3292,10 +3275,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } emith_jump_cond_patchable(DCOND_LE, tcache_ptr); #if LOOP_OPTIMIZER - if (op_flags[i] & OF_BASIC_LOOP) { - emith_flush(); + if (op_flags[i] & OF_BASIC_LOOP) emith_jump_patch(jp, tcache_ptr, NULL); - } #endif #if (DRC_DEBUG & 32) @@ -3425,14 +3406,14 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) soon = late; } else { // upcoming rcache_flush, start writing back unused dirty stuff + rcache_set_usage_discard(write & ~(late|soon|opd[0].source)); rcache_clean_masked(rcache_dirty_mask() & ~(write|opd[0].dest)); break; } } rcache_set_usage_now(opd[0].source); // current insn - rcache_set_usage_soon(soon); // insns 1-3 - rcache_set_usage_late(late & ~soon); // insns 4-9 - rcache_set_usage_discard(write & ~(late|soon) & ~opd[0].source); + rcache_set_usage_soon(soon); // insns 1-4 + rcache_set_usage_late(late & ~soon); // insns 5-9 switch (opd->op) { @@ -4374,6 +4355,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) end_op: rcache_unlock_all(); + rcache_set_usage_now(0); #if DRC_DEBUG & 64 RCACHE_CHECK("after insn"); #endif @@ -4418,22 +4400,11 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // idle or delay loop emit_sync_t_to_sr(); emith_sh2_delay_loop(cycles, drcf.delay_reg); + rcache_unlock_all(); // may lock delay_reg drcf.polling = drcf.loop_type = 0; } - - if (target_pc < pc && pinned_loop_pc[pinned_loop_count] == target_pc) { - // backward jump at end of optimized loop - rcache_unpin_all(); - target = pinned_loop_ptr[pinned_loop_count]; - pinned_loop_count ++; - } #endif - sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - FLUSH_CYCLES(sr); - rcache_unlock_all(); - rcache_clean(); - #if CALL_STACK void *rtsadd = NULL, *rtsret = NULL; if ((opd_b->dest & BITMASK1(SHR_PR)) && pc+2 < end_pc) { @@ -4441,12 +4412,18 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp = rcache_get_tmp_arg(1); rtsadd = tcache_ptr; 
emith_move_r_imm_s8_patchable(tmp, 0); + rcache_clean_tmp(); rcache_invalidate_tmp(); emith_call(sh2_drc_dispatcher_call); rtsret = tcache_ptr; } #endif + // XXX move below cond test if not changing host cond (MIPS delay slot)? + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + FLUSH_CYCLES(sr); + rcache_clean(); + if (OP_ISBRACND(opd_b->op)) { // BT[S], BF[S] - emit condition test cond = (opd_b->op == OP_BRANCH_CF) ? DCOND_EQ : DCOND_NE; @@ -4466,7 +4443,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_sync_t(sr); // no modification of host status/flags between here and branching! -#if LINK_BRANCHES v = find_in_sorted_array(branch_target_pc, branch_target_count, target_pc); if (v >= 0) { @@ -4474,6 +4450,14 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (branch_target_ptr[v]) { // local backward jump, link here now since host PC is already known target = branch_target_ptr[v]; +#if LOOP_OPTIMIZER + if (pinned_loop_pc[pinned_loop_count] == target_pc) { + // backward jump at end of optimized loop + rcache_unpin_all(); + target = pinned_loop_ptr[pinned_loop_count]; + pinned_loop_count ++; + } +#endif if (cond != -1) emith_jump_cond(cond, target); else { @@ -4495,7 +4479,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } else dbg(1, "warning: too many local branches"); } -#endif if (target == NULL) { @@ -4503,36 +4486,30 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) bl = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id); if (cond != -1) { #if 1 - if (bl) { - if (blx_target_count < ARRAY_SIZE(blx_target_pc)) { - // conditional jumps get a blx stub for the far jump - blx_target_pc[blx_target_count] = target_pc; - blx_target_bl[blx_target_count] = bl; - blx_target_ptr[blx_target_count++] = tcache_ptr; - bl->type = BL_JCCBLX; - target = tcache_ptr; - } else { - // blx table full, patch jump only - tmp = rcache_get_tmp_arg(0); - emith_move_r_imm(tmp, target_pc); - 
rcache_free_tmp(tmp); - bl->jump = tcache_ptr; - bl->type = BL_JMP; - target = sh2_drc_dispatcher; - } + if (bl && blx_target_count < ARRAY_SIZE(blx_target_pc)) { + // conditional jumps get a blx stub for the far jump + blx_target_pc[blx_target_count] = target_pc; + blx_target_bl[blx_target_count] = bl; + blx_target_ptr[blx_target_count++] = tcache_ptr; + bl->type = BL_JCCBLX; + target = tcache_ptr; emith_jump_cond_patchable(cond, target); } else { - // cannot link, inline jump @dispatcher + // not linkable, or blx table full; inline jump @dispatcher EMITH_JMP_START(emith_invert_cond(cond)); + if (bl) { + bl->jump = tcache_ptr; + bl->type = BL_LDJMP; + } tmp = rcache_get_tmp_arg(0); emith_move_r_imm(tmp, target_pc); rcache_free_tmp(tmp); target = sh2_drc_dispatcher; - emith_jump(target); + emith_jump_patchable(target); EMITH_JMP_END(emith_invert_cond(cond)); } -#elif 1 +#else // jump @dispatcher - ARM 32bit version with conditional execution EMITH_SJMP_START(emith_invert_cond(cond)); tmp = rcache_get_tmp_arg(0); @@ -4546,25 +4523,13 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } emith_jump_cond_patchable(cond, target); EMITH_SJMP_END(emith_invert_cond(cond)); -#else - // jump @dispatcher - generic version (jump !cond @over, jump @trgt) - EMITH_JMP_START(emith_invert_cond(cond)); - if (bl) { - bl->jump = tcache_ptr; - bl->type = BL_LDJMP; - } - tmp = rcache_get_tmp_arg(0); - emith_move_r_imm(tmp, target_pc); - rcache_free_tmp(tmp); - target = sh2_drc_dispatcher; - - emith_jump_patchable(target); - EMITH_JMP_END(emith_invert_cond(cond)); #endif } else { // unconditional, has the far jump inlined - if (bl) + if (bl) { + emith_flush(); // flush to inhibit insn swapping bl->type = BL_LDJMP; + } tmp = rcache_get_tmp_arg(0); emith_move_r_imm(tmp, target_pc); @@ -4576,7 +4541,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } } - emith_flush(); if (bl) memcpy(bl->jdisp, bl->jump, emith_jump_at_size()); #if CALL_STACK @@ -4599,11 
+4563,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) u32 target_pc; struct block_link *bl = NULL; - sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - FLUSH_CYCLES(sr); - emith_sync_t(sr); - rcache_clean(); - tmp = rcache_get_reg_arg(0, SHR_PC, NULL); #if CALL_STACK @@ -4615,12 +4574,18 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp = rcache_get_tmp_arg(1); rtsadd = tcache_ptr; emith_move_r_imm_s8_patchable(tmp, 0); + rcache_clean_tmp(); rcache_invalidate_tmp(); emith_call(sh2_drc_dispatcher_call); rtsret = tcache_ptr; } #endif + sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); + FLUSH_CYCLES(sr); + emith_sync_t(sr); + rcache_clean(); + #if CALL_STACK if (opd_b->rm == SHR_PR) { // RTS - restore rts data, else jump to dispatcher @@ -4630,10 +4595,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (gconst_get(SHR_PC, &target_pc)) { // JMP, JSR, BRAF, BSRF const - treat like unconditional direct branch bl = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id); - if (bl) { // pc already loaded somewhere else, can patch jump only + if (bl) // pc already loaded somewhere else, can patch jump only bl->type = BL_JMP; - bl->jump = tcache_ptr; - } emith_jump_patchable(sh2_drc_dispatcher); } else { // JMP, JSR, BRAF, BSRF not const @@ -4641,7 +4604,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } rcache_invalidate(); - emith_flush(); #if CALL_STACK if (rtsadd) emith_move_r_imm_s8_patch(rtsadd, tcache_ptr - (u8 *)rtsret); @@ -4671,13 +4633,15 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_clean(); bl = dr_prepare_ext_branch(block->entryp, pc, sh2->is_slave, tcache_id); - if (bl) + if (bl) { + emith_flush(); // flush to inhibit insn swapping bl->type = BL_LDJMP; + } tmp = rcache_get_tmp_arg(0); emith_move_r_imm(tmp, pc); emith_jump_patchable(sh2_drc_dispatcher); rcache_invalidate(); - emith_flush(); + if (bl) memcpy(bl->jdisp, bl->jump, 
emith_jump_at_size()); } else @@ -4696,7 +4660,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_move_r_imm(tmp, blx_target_pc[i] & ~1); emith_jump(target); rcache_invalidate(); - emith_flush(); + if (bl) memcpy(bl->jdisp, bl->blx, emith_jump_at_size()); } @@ -5554,6 +5518,12 @@ void sh2_drc_finish(SH2 *sh2) if (block_tables[0] == NULL) return; +#if (DRC_DEBUG & (256|512)) + if (trace[0]) fclose(trace[0]); + if (trace[1]) fclose(trace[1]); + trace[0] = trace[1] = NULL; +#endif + #if (DRC_DEBUG & 4) for (i = 0; i < TCACHE_BUFFERS; i++) { printf("~~~ tcache %d\n", i); From 242e81bacae3b739665507ba9187382e5362f2c2 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 5 Oct 2019 11:17:49 +0200 Subject: [PATCH 065/174] sh2 drc: fix i386 regression --- cpu/sh2/compiler.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 449ae0e1a..09546634d 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -1682,6 +1682,7 @@ static void rcache_clean_vreg(int x) if (guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) { if (guest_regs[r].vreg != guest_regs[r].sreg && !cache_regs[guest_regs[r].sreg].locked && + (~rcache_regs_discard & (1 << r)) && !(rns & cache_regs[guest_regs[r].sreg].gregs)) { // statically mapped reg not in its sreg. 
move back to sreg rcache_evict_vreg(guest_regs[r].sreg); @@ -1820,6 +1821,8 @@ static int rcache_allocate_nontemp(void) static int rcache_allocate_temp(void) { int x = rcache_allocate(-1, 1); + if (x < 0) + x = rcache_allocate(0, 0); return x; } @@ -3404,16 +3407,16 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // regs needed in the next few instructions if (v <= 4) soon = late; - } else { - // upcoming rcache_flush, start writing back unused dirty stuff - rcache_set_usage_discard(write & ~(late|soon|opd[0].source)); - rcache_clean_masked(rcache_dirty_mask() & ~(write|opd[0].dest)); + } else break; - } } rcache_set_usage_now(opd[0].source); // current insn rcache_set_usage_soon(soon); // insns 1-4 rcache_set_usage_late(late & ~soon); // insns 5-9 + rcache_set_usage_discard(write & ~(late|soon|opd[0].source)); + if (v <= 9) + // upcoming rcache_flush, start writing back unused dirty stuff + rcache_clean_masked(rcache_dirty_mask() & ~(write|opd[0].dest)); switch (opd->op) { @@ -3826,6 +3829,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_tpop_carry(sr, 0); emith_adcf_r_r_r(tmp2, tmp, tmp); emith_tpush_carry(sr, 0); // keep Q1 in T for now + rcache_free(tmp); tmp4 = rcache_get_tmp(); emith_and_r_r_imm(tmp4, sr, M); emith_eor_r_r_lsr(sr, tmp4, M_SHIFT - Q_SHIFT); // Q ^= M From 2e88630a6a82b3295e8cc573629d540fbd9e4682 Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 10 Oct 2019 23:52:39 +0200 Subject: [PATCH 066/174] sh2 drc: speed optimization and bugfixing --- Makefile | 2 +- cpu/drc/emit_arm64.c | 18 ++++++++------ cpu/drc/emit_mips.c | 59 +++++++++++++++++++++++++------------------- cpu/drc/emit_x86.c | 5 ++++ cpu/sh2/compiler.c | 32 +++++++++--------------- 5 files changed, 61 insertions(+), 55 deletions(-) diff --git a/Makefile b/Makefile index e0ce4fd09..52bdea0f5 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ ifneq ("$(PLATFORM)", "libretro") endif endif -# This is actually needed, bevieve me. 
+# This is actually needed, believe me. # If you really have to disable this, set NO_ALIGN_FUNCTIONS elsewhere. ifndef NO_ALIGN_FUNCTIONS CFLAGS += -falign-functions=2 diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 688649b5b..3ef402b40 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -160,7 +160,7 @@ enum { XT_UXTW=0x4, XT_UXTX=0x6, XT_LSL=0x7, XT_SXTW=0xc, XT_SXTX=0xe }; #define A64_ROR_REG(rd, rn, rm) \ A64_INSN(0xd,0x0,0x3,_,rm,_,0xb,rn,rd) -// rd = REVERSE(n) rn +// rd = REVERSE(rn) #define A64_RBIT_REG(rd, rn) \ A64_INSN(0xd,0x2,0x3,_,_,_,_,rn,rd) @@ -327,9 +327,10 @@ enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; // if-then-else conditional execution helpers -#define JMP_POS(ptr) \ +#define JMP_POS(ptr) { \ ptr = tcache_ptr; \ - EMIT(A64_B(0)); + EMIT(A64_B(0)); \ +} #define JMP_EMIT(cond, ptr) { \ u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr); \ @@ -1225,9 +1226,9 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) emith_tst_r_imm(sr, S); \ EMITH_SJMP_START(DCOND_EQ); \ /* overflow if top 17 bits of MACH aren't all 1 or 0 */ \ - /* to check: add MACH[15] to MACH[31:16]. this is 0 if no overflow */ \ - emith_asrf(rn, mh, 16); /* sum = (MACH>>16) + ((MACH>>15)&1) */ \ - emith_adcf_r_imm(rn, 0); /* (MACH>>15) is in carry after shift */ \ + /* to check: add MACH >> 31 to MACH >> 15. 
this is 0 if no overflow */ \ + emith_asr(rn, mh, 15); \ + emith_addf_r_r_r_lsr(rn, rn, mh, 31); \ EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> ov */ \ emith_move_r_imm_c(DCOND_NE, ml, 0x0000); /* -overflow */ \ emith_move_r_imm_c(DCOND_NE, mh, 0x8000); \ @@ -1280,11 +1281,12 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #define emith_tpop_carry(sr, is_sub) do { \ if (is_sub) \ emith_eor_r_imm(sr, 1); \ - emith_lsrf(sr, sr, 1); \ + emith_ror(sr, sr, 1); \ + emith_addf_r_r(sr, sr); \ } while (0) #define emith_tpush_carry(sr, is_sub) do { \ - emith_adc_r_r(sr, sr); \ + emith_adc_r_r(sr, Z0); \ if (is_sub) \ emith_eor_r_imm(sr, 1); \ } while (0) diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index fadf57445..4a452a685 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -21,7 +21,7 @@ #define AT 1 // used to hold intermediate results #define FNZ 15 // emulated processor flags: N (bit 31) ,Z (all bits) #define FC 24 // emulated processor flags: C (bit 0), others 0 -#define FV 25 // emulated processor flags: Nt^Ns (bit 31). others ? +#define FV 25 // emulated processor flags: Nt^Ns (bit 31). others x // unified conditions; virtual, not corresponding to anything real on MIPS @@ -208,8 +208,8 @@ enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; } while (0) // FIFO for 2 instructions, for delay slot handling -u32 emith_last_insns[2] = { -1,-1 }; -int emith_last_idx, emith_last_cnt; +static u32 emith_last_insns[2] = { -1,-1 }; +static int emith_last_idx, emith_last_cnt; #define EMIT_PUSHOP() \ do { \ @@ -248,7 +248,7 @@ static int emith_is_b(u32 op) // B ((op>>26) == OP__RT && ((op>>16) & 036) == RT_BLTZ); } // register usage for dependency evaluation XXX better do this as in emit_arm? 
static uint64_t emith_has_rs[3] = // OP__FN, OP__RT, others - { 0x00fffffffffa0ff0ULL, 0x000fff0fUL, 0xffffffff0f007f30ULL }; + { 0x00fffffffffa0ff0ULL, 0x000fff0fUL, 0xffffffff0f007ff0ULL }; static uint64_t emith_has_rt[3] = // OP__FN, OP__RT, others { 0xff00fffffff00cffULL, 0x00000000UL, 0x8000ff0000000030ULL }; static uint64_t emith_has_rd[3] = // OP__FN, OP__RT, others (rt instead of rd) @@ -308,21 +308,23 @@ static void *emith_branch(u32 op) bop = emith_b_isswap(op, op2); } + // flush FIFO and branch + tcache_ptr = (void *)((u32 *)tcache_ptr - emith_last_cnt); + if (emith_last_insns[idx^1] != -1) + EMIT_PTR(tcache_ptr, emith_last_insns[idx^1]); if (bop) { // can swap - tcache_ptr = (void *)((u32 *)tcache_ptr - emith_last_cnt); - if (emith_last_insns[idx^1] != -1) - EMIT_PTR(tcache_ptr, emith_last_insns[idx^1]); bp = tcache_ptr; EMIT_PTR(tcache_ptr, bop); COUNT_OP; EMIT_PTR(tcache_ptr, emith_last_insns[idx]); - emith_last_insns[0] = emith_last_insns[1] = -1; - emith_last_cnt = 0; } else { // can't swap - emith_flush(); + if (emith_last_insns[idx] != -1) + EMIT_PTR(tcache_ptr, emith_last_insns[idx]); bp = tcache_ptr; EMIT_PTR(tcache_ptr, op); COUNT_OP; EMIT_PTR(tcache_ptr, MIPS_NOP); COUNT_OP; } + emith_last_insns[0] = emith_last_insns[1] = -1; + emith_last_cnt = 0; return bp; } @@ -392,8 +394,8 @@ static void *emith_branch(u32 op) // flag emulation creates 2 (ie cmp #0/beq) up to 9 (ie adcf/ble) extra insns. // flag handling shortcuts may reduce this by 1-4 insns, see emith_cond_check() -int emith_flg_rs, emith_flg_rt; // registers used in FNZ=rs-rt (aka cmp_r_r) -int emith_flg_noV; // V flag known not to be set +static int emith_flg_rs, emith_flg_rt; // registers used in FNZ=rs-rt (cmp_r_r) +static int emith_flg_noV; // V flag known not to be set // store minimal cc information: rd, rt^rs, carry // NB: the result *must* first go to FNZ, in case rd == rs or rd == rt. 
@@ -625,7 +627,11 @@ static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub) // move immediate static void emith_move_imm(int r, uintptr_t imm) { - if ((s16)imm != imm) { + if ((s16)imm == imm) { + EMIT(MIPS_ADD_IMM(r, Z0, imm)); + } else if (!(imm >> 16)) { + EMIT(MIPS_OR_IMM(r, Z0, imm)); + } else { int s = Z0; if (imm >> 16) { EMIT(MIPS_MOVT_IMM(r, imm >> 16)); @@ -633,8 +639,7 @@ static void emith_move_imm(int r, uintptr_t imm) } if ((u16)imm) EMIT(MIPS_OR_IMM(r, s, (u16)imm)); - } else - EMIT(MIPS_ADD_IMM(r, Z0, imm)); + } } #define emith_move_r_ptr_imm(r, imm) \ @@ -1372,16 +1377,17 @@ static int emith_cond_check(int cond, int *r) emith_tst_r_imm(sr, S); \ EMITH_SJMP_START(DCOND_EQ); \ /* overflow if top 17 bits of MACH aren't all 1 or 0 */ \ - /* to check: add MACH[15] to MACH[31:16]. this is 0 if no overflow */ \ - emith_asrf(rn, mh, 16); /* sum = (MACH>>16) + ((MACH>>15)&1) */ \ - emith_adcf_r_imm(rn, 0); /* (MACH>>15) is in carry after shift */ \ + /* to check: add MACH >> 31 to MACH >> 15. 
this is 0 if no overflow */ \ + emith_asr(rn, mh, 15); \ + emith_add_r_r_r_lsr(rn, rn, mh, 31); /* sum = (MACH>>31)+(MACH>>15) */ \ + emith_teq_r_r(rn, Z0); /* (need only N and Z flags) */ \ EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> ov */ \ emith_move_r_imm_c(DCOND_NE, ml, 0x0000); /* -overflow */ \ emith_move_r_imm_c(DCOND_NE, mh, 0x8000); \ - EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> +ovl */ \ - emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0xffffffff */ \ - emith_sub_r_imm_c(DCOND_GT, mh, 1); /* 0x00007fff */ \ - EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_START(DCOND_PL); /* sum > 0 -> +ovl */ \ + emith_sub_r_imm_c(DCOND_MI, ml, 1); /* 0xffffffff */ \ + emith_sub_r_imm_c(DCOND_MI, mh, 1); /* 0x00007fff */ \ + EMITH_SJMP_END(DCOND_PL); \ EMITH_SJMP_END(DCOND_EQ); \ EMITH_SJMP_END(DCOND_EQ); \ } while (0) @@ -1399,14 +1405,15 @@ static int emith_cond_check(int cond, int *r) /* overflow if top 33 bits of MACH:MACL aren't all 1 or 0 */ \ /* to check: add MACL[31] to MACH. this is 0 if no overflow */ \ emith_lsr(rn, ml, 31); \ - emith_addf_r_r(rn, mh); /* sum = MACH + ((MACL>>31)&1) */ \ + emith_add_r_r(rn, mh); /* sum = MACH + ((MACL>>31)&1) */ \ + emith_teq_r_r(rn, Z0); /* (need only N and Z flags) */ \ EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> overflow */ \ /* XXX: LSB signalling only in SH1, or in SH2 too? 
*/ \ emith_move_r_imm_c(DCOND_NE, mh, 0x00000001); /* LSB of MACH */ \ emith_move_r_imm_c(DCOND_NE, ml, 0x80000000); /* negative ovrfl */ \ - EMITH_SJMP_START(DCOND_LE); /* sum > 0 -> positive ovrfl */ \ - emith_sub_r_imm_c(DCOND_GT, ml, 1); /* 0x7fffffff */ \ - EMITH_SJMP_END(DCOND_LE); \ + EMITH_SJMP_START(DCOND_PL); /* sum > 0 -> positive ovrfl */ \ + emith_sub_r_imm_c(DCOND_MI, ml, 1); /* 0x7fffffff */ \ + EMITH_SJMP_END(DCOND_PL); \ EMITH_SJMP_END(DCOND_EQ); \ EMITH_SJMP_END(DCOND_EQ); \ } while (0) diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 451fa8d0a..44e10ecfc 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -1225,6 +1225,11 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common rcache_free_tmp(tmp_); \ } while (0) +#define emith_carry_to_t(sr, is_sub) do { \ + emith_rorc(sr); \ + emith_rol(sr, sr, 1); \ +} while (0) + #define emith_tpop_carry(sr, is_sub) \ emith_lsr(sr, sr, 1) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 09546634d..2c1e8cffe 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -69,7 +69,7 @@ // 800 - state dump on exit // { #ifndef DRC_DEBUG -#define DRC_DEBUG 0//x8c7 +#define DRC_DEBUG 0//x847 #endif #if DRC_DEBUG @@ -2999,6 +2999,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) void *block_entry_ptr; struct block_desc *block; struct block_entry *entry; + struct block_link *bl; u16 *dr_pc_base; struct op_data *opd; int blkid_main = 0; @@ -3245,6 +3246,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (pinned_loop_pc[pinned_loop_count] == pc) { // pin needed regs on loop entry FOR_ALL_BITS_SET_DO(pinned_loop_mask[pinned_loop_count], v, rcache_pin_reg(v)); + emith_flush(); pinned_loop_ptr[pinned_loop_count] = tcache_ptr; } else op_flags[i] &= ~OF_BASIC_LOOP; @@ -3920,9 +3922,8 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, 
NULL); emith_invalidate_t(); - emith_tpop_carry(sr, 0); // dummy emith_lslf(tmp, tmp2, 1); - emith_tpush_carry(sr, 0); + emith_carry_to_t(sr, 0); goto end_op; case 1: // DT Rn 0100nnnn00010000 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); @@ -3949,12 +3950,11 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_invalidate_t(); - emith_tpop_carry(sr, 0); // dummy if (op & 0x20) { emith_asrf(tmp, tmp2, 1); } else emith_lsrf(tmp, tmp2, 1); - emith_tpush_carry(sr, 0); + emith_carry_to_t(sr, 0); goto end_op; case 1: // CMP/PZ Rn 0100nnnn00010001 tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); @@ -4007,12 +4007,11 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_invalidate_t(); - emith_tpop_carry(sr, 0); // dummy if (op & 1) { emith_rorf(tmp, tmp2, 1); } else emith_rolf(tmp, tmp2, 1); - emith_tpush_carry(sr, 0); + emith_carry_to_t(sr, 0); goto end_op; case 0x24: // ROTCL Rn 0100nnnn00100100 case 0x25: // ROTCR Rn 0100nnnn00100101 @@ -4391,7 +4390,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) int cond = -1; int ctaken = 0; void *target = NULL; - struct block_link *bl = NULL; if (OP_ISBRACND(opd_b->op)) ctaken = (op_flags[i] & OF_DELAY_OP) ? 1 : 2; @@ -4545,8 +4543,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } } - if (bl) - memcpy(bl->jdisp, bl->jump, emith_jump_at_size()); #if CALL_STACK if (rtsadd) emith_move_r_imm_s8_patch(rtsadd, tcache_ptr - (u8 *)rtsret); @@ -4565,7 +4561,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } else if (drcf.pending_branch_indirect) { u32 target_pc; - struct block_link *bl = NULL; tmp = rcache_get_reg_arg(0, SHR_PC, NULL); @@ -4629,8 +4624,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (! 
OP_ISBRAUC(opd->op)) { - struct block_link *bl; - tmp = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(tmp); emith_sync_t(tmp); @@ -4645,18 +4638,15 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_move_r_imm(tmp, pc); emith_jump_patchable(sh2_drc_dispatcher); rcache_invalidate(); - - if (bl) - memcpy(bl->jdisp, bl->jump, emith_jump_at_size()); } else rcache_flush(); // emit blx area for (i = 0; i < blx_target_count; i++) { void *target = (blx_target_pc[i] & 1 ? sh2_drc_exit : sh2_drc_dispatcher); - struct block_link *bl = blx_target_bl[i]; emith_pool_check(); + bl = blx_target_bl[i]; if (bl) bl->blx = tcache_ptr; emith_jump_patch(blx_target_ptr[i], tcache_ptr, NULL); @@ -4664,9 +4654,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_move_r_imm(tmp, blx_target_pc[i] & ~1); emith_jump(target); rcache_invalidate(); - - if (bl) - memcpy(bl->jdisp, bl->blx, emith_jump_at_size()); } emith_flush(); @@ -4692,6 +4679,11 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_jump_patch(branch_patch_ptr[i], target, NULL); } + // fill blx backup; do this last to backup final patched code + for (i = 0; i < block->entry_count; i++) + for (bl = block->entryp[i].o_links; bl; bl = bl->o_next) + memcpy(bl->jdisp, bl->blx ?: bl->jump, emith_jump_at_size()); + tcache_ptrs[tcache_id] = tcache_ptr; host_instructions_updated(block_entry_ptr, tcache_ptr); From e8a462c058931f46bcef3dd90f02aec418ca343e Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 11 Oct 2019 00:56:26 +0200 Subject: [PATCH 067/174] 32x, speed improvement --- pico/32x/32x.c | 28 ++++++++++++++++++---------- pico/32x/sh2soc.c | 17 ++++++++++------- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/pico/32x/32x.c b/pico/32x/32x.c index f6d1a153f..9993bfa8a 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -426,7 +426,7 @@ void p32x_sync_other_sh2(SH2 *sh2, unsigned int m68k_target) } #define STEP_LS 24 -#define STEP_N 488 // one line 
+#define STEP_N 528 // at least one line (488) #define sync_sh2s_normal p32x_sync_sh2s //#define sync_sh2s_lockstep p32x_sync_sh2s @@ -434,7 +434,7 @@ void p32x_sync_other_sh2(SH2 *sh2, unsigned int m68k_target) /* most timing is in 68k clock */ void sync_sh2s_normal(unsigned int m68k_target) { - unsigned int now, target, timer_cycles; + unsigned int now, target, next, timer_cycles; int cycles; elprintf(EL_32X, "sh2 sync to %u", m68k_target); @@ -458,40 +458,44 @@ void sync_sh2s_normal(unsigned int m68k_target) target = m68k_target; if (event_time_next && CYCLES_GT(target, event_time_next)) target = event_time_next; - if (CYCLES_GT(target, now + STEP_N)) - target = now + STEP_N; - while (CYCLES_GT(target, now)) { - elprintf(EL_32X, "sh2 exec to %u %d,%d/%d, flags %x", target, - target - msh2.m68krcycles_done, target - ssh2.m68krcycles_done, + next = target; + if (CYCLES_GT(target, now + STEP_N)) + next = now + STEP_N; + elprintf(EL_32X, "sh2 exec to %u %d,%d/%d, flags %x", next, + next - msh2.m68krcycles_done, next - ssh2.m68krcycles_done, m68k_target - now, Pico32x.emu_flags); pprof_start(ssh2); if (!(ssh2.state & SH2_IDLE_STATES)) { - cycles = target - ssh2.m68krcycles_done; + cycles = next - ssh2.m68krcycles_done; if (cycles > 0) { run_sh2(&ssh2, cycles > 20U ? cycles : 20U); if (event_time_next && CYCLES_GT(target, event_time_next)) target = event_time_next; + if (CYCLES_GT(next, target)) + next = target; } } pprof_end(ssh2); pprof_start(msh2); if (!(msh2.state & SH2_IDLE_STATES)) { - cycles = target - msh2.m68krcycles_done; + cycles = next - msh2.m68krcycles_done; if (cycles > 0) { run_sh2(&msh2, cycles > 20U ? 
cycles : 20U); if (event_time_next && CYCLES_GT(target, event_time_next)) target = event_time_next; + if (CYCLES_GT(next, target)) + next = target; } } pprof_end(msh2); - now = target; + now = next; if (!(msh2.state & SH2_IDLE_STATES)) { if (CYCLES_GT(now, msh2.m68krcycles_done)) now = msh2.m68krcycles_done; @@ -500,6 +504,10 @@ void sync_sh2s_normal(unsigned int m68k_target) if (CYCLES_GT(now, ssh2.m68krcycles_done)) now = ssh2.m68krcycles_done; } + if (now - timer_cycles >= STEP_N) { + p32x_timers_do(now - timer_cycles); + timer_cycles = now; + } } p32x_timers_do(now - timer_cycles); diff --git a/pico/32x/sh2soc.c b/pico/32x/sh2soc.c index 2b5a126c9..dd834bfbe 100644 --- a/pico/32x/sh2soc.c +++ b/pico/32x/sh2soc.c @@ -193,8 +193,9 @@ static void dmac_trigger(SH2 *sh2, struct dma_chan *chan) } // timer state - FIXME -static int timer_cycles[2]; -static int timer_tick_cycles[2]; +static u32 timer_cycles[2]; +static u32 timer_tick_cycles[2]; +static u32 timer_tick_factor[2]; // timers void p32x_timers_recalc(void) @@ -211,6 +212,7 @@ void p32x_timers_recalc(void) else cycles = 2; timer_tick_cycles[i] = cycles; + timer_tick_factor[i] = (1ULL << 32) / cycles; timer_cycles[i] = 0; elprintf(EL_32XP, "WDT cycles[%d] = %d", i, cycles); } @@ -226,11 +228,12 @@ void p32x_timers_do(unsigned int m68k_slice) void *pregs = sh2s[i].peri_regs; if (PREG8(pregs, 0x80) & 0x20) { // TME timer_cycles[i] += cycles; - cnt = PREG8(pregs, 0x81); - while (timer_cycles[i] >= timer_tick_cycles[i]) { - timer_cycles[i] -= timer_tick_cycles[i]; - cnt++; - } + // cnt = timer_cycles[i] / timer_tick_cycles[i]; + cnt = (1ULL * timer_cycles[i] * timer_tick_factor[i]) >> 32; + timer_cycles[i] -= timer_tick_cycles[i] * cnt; + if (timer_cycles[i] > timer_tick_cycles[i]) + timer_cycles[i] -= timer_tick_cycles[i], cnt++; + cnt += PREG8(pregs, 0x81); if (cnt >= 0x100) { int level = PREG8(pregs, 0xe3) >> 4; int vector = PREG8(pregs, 0xe4) & 0x7f; From 32f3a8f7b55c6c2e8537dd0a52d93fef3b2c2cd1 Mon Sep 17 
00:00:00 2001 From: kub Date: Fri, 11 Oct 2019 00:02:23 +0200 Subject: [PATCH 068/174] 32x, configurable pwm irq optimization to reduce pwm irq load --- pico/32x/pwm.c | 35 +++++++++++++++++++++++++++++------ pico/pico.h | 1 + platform/common/menu_pico.c | 1 + platform/common/menu_pico.h | 1 + 4 files changed, 32 insertions(+), 6 deletions(-) diff --git a/pico/32x/pwm.c b/pico/32x/pwm.c index 0aa2f586b..3e5ce0ae0 100644 --- a/pico/32x/pwm.c +++ b/pico/32x/pwm.c @@ -14,13 +14,18 @@ static struct { int irq_reload; int doing_fifo; int silent; + int irq_timer; + int irq_state; short current[2]; } pwm; +enum { PWM_IRQ_LOCKED, PWM_IRQ_STOPPED, PWM_IRQ_LOW, PWM_IRQ_HIGH }; + void p32x_pwm_ctl_changed(void) { int control = Pico32x.regs[0x30 / 2]; int cycles = Pico32x.regs[0x32 / 2]; + int pwm_irq_opt = PicoIn.opt & POPT_PWM_IRQ_OPT; cycles = (cycles - 1) & 0x0fff; pwm.cycles = cycles; @@ -31,8 +36,10 @@ void p32x_pwm_ctl_changed(void) if ((control & 0x0f) != 0) pwm.mult = 0x10000 / cycles; - pwm.irq_reload = (control & 0x0f00) >> 8; - pwm.irq_reload = ((pwm.irq_reload - 1) & 0x0f) + 1; + pwm.irq_timer = (control & 0x0f00) >> 8; + pwm.irq_timer = ((pwm.irq_timer - 1) & 0x0f) + 1; + pwm.irq_reload = pwm.irq_timer; + pwm.irq_state = pwm_irq_opt ? PWM_IRQ_STOPPED: PWM_IRQ_LOCKED; if (Pico32x.pwm_irq_cnt == 0) Pico32x.pwm_irq_cnt = pwm.irq_reload; @@ -104,6 +111,11 @@ static void consume_fifo_do(SH2 *sh2, unsigned int m68k_cycles, if (--Pico32x.pwm_irq_cnt == 0) { Pico32x.pwm_irq_cnt = pwm.irq_reload; do_pwm_irq(sh2, m68k_cycles); + } else if (Pico32x.pwm_p[1] == 0 && pwm.irq_state >= PWM_IRQ_LOW) { + // buffer underrun. Reduce reload rate if above programmed setting. 
+ if (pwm.irq_reload > pwm.irq_timer) + pwm.irq_reload--; + pwm.irq_state = PWM_IRQ_LOW; } } Pico32x.pwm_cycle_p = m68k_cycles * 3 - sh2_cycles_diff; @@ -221,10 +233,22 @@ void p32x_pwm_write16(unsigned int a, unsigned int d, case 6/2: // R ch fifo = Pico32xMem->pwm_fifo[1]; idx = Pico32xMem->pwm_index[1]; - if (Pico32x.pwm_p[1] < 3) + if (Pico32x.pwm_p[1] < 3) { + if (pwm.irq_state == PWM_IRQ_STOPPED) + pwm.irq_state = PWM_IRQ_LOW; + if (Pico32x.pwm_p[1] == 2 && pwm.irq_state >= PWM_IRQ_LOW) { + // buffer full. If there was no buffer underrun after last fill, + // try increasing reload rate to reduce IRQs + if (pwm.irq_reload < 3 && pwm.irq_state == PWM_IRQ_HIGH) + pwm.irq_reload ++; + pwm.irq_state = PWM_IRQ_HIGH; + } Pico32x.pwm_p[1]++; - else { -// fifo[(idx+1) % 4] = fifo[idx]; + } else { + // buffer overflow. Some roms always fill the complete buffer even if + // reload rate is set below max. Lock reload rate to programmed setting. + pwm.irq_reload = pwm.irq_timer; + pwm.irq_state = PWM_IRQ_LOCKED; idx = (idx+1) % 4; Pico32xMem->pwm_index[0] = idx; } @@ -236,7 +260,6 @@ void p32x_pwm_write16(unsigned int a, unsigned int d, if (Pico32x.pwm_p[0] < 3) Pico32x.pwm_p[0]++; else { -// fifo[(idx+1) % 4] = fifo[idx]; idx = (idx+1) % 4; Pico32xMem->pwm_index[0] = idx; } diff --git a/pico/pico.h b/pico/pico.h index a669215dc..fda1c43a8 100644 --- a/pico/pico.h +++ b/pico/pico.h @@ -72,6 +72,7 @@ extern void *p32x_bios_g, *p32x_bios_m, *p32x_bios_s; #define POPT_DIS_IDLE_DET (1<<19) #define POPT_EN_32X (1<<20) #define POPT_EN_PWM (1<<21) +#define POPT_PWM_IRQ_OPT (1<<22) #define PAHW_MCD (1<<0) #define PAHW_32X (1<<1) diff --git a/platform/common/menu_pico.c b/platform/common/menu_pico.c index 7b0cd78c8..9fb314268 100644 --- a/platform/common/menu_pico.c +++ b/platform/common/menu_pico.c @@ -506,6 +506,7 @@ static menu_entry e_menu_adv_options[] = mee_onoff ("Disable frame limiter", MA_OPT2_NO_FRAME_LIMIT,currentConfig.EmuOpt, EOPT_NO_FRMLIMIT), mee_onoff ("Enable 
dynarecs", MA_OPT2_DYNARECS, PicoIn.opt, POPT_EN_DRC), mee_onoff ("Status line in main menu", MA_OPT2_STATUS_LINE, currentConfig.EmuOpt, EOPT_SHOW_RTC), + mee_onoff ("PWM IRQ optimization", MA_OPT2_PWM_IRQ_OPT, PicoIn.opt, POPT_PWM_IRQ_OPT), MENU_OPTIONS_ADV mee_end, }; diff --git a/platform/common/menu_pico.h b/platform/common/menu_pico.h index 595989e84..c626c7726 100644 --- a/platform/common/menu_pico.h +++ b/platform/common/menu_pico.h @@ -58,6 +58,7 @@ typedef enum MA_OPT2_NO_SPRITE_LIM, MA_OPT2_NO_IDLE_LOOPS, MA_OPT2_OVERCLOCK_M68K, + MA_OPT2_PWM_IRQ_OPT, MA_OPT2_DONE, MA_OPT3_SCALE, /* psp (all OPT3) */ MA_OPT3_HSCALE32, From 47fa253acfc0fa7af32a3e8cd518166abfe344c9 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 11 Oct 2019 00:06:50 +0200 Subject: [PATCH 069/174] 32x, improved auto frame skip, plus new config option for max auto skip --- platform/common/config_file.c | 4 ++++ platform/common/emu.c | 13 ++++++++++--- platform/common/emu.h | 1 + platform/common/menu_pico.c | 1 + platform/common/menu_pico.h | 1 + 5 files changed, 17 insertions(+), 3 deletions(-) diff --git a/platform/common/config_file.c b/platform/common/config_file.c index 0cd27260a..0284cfd6f 100644 --- a/platform/common/config_file.c +++ b/platform/common/config_file.c @@ -326,6 +326,10 @@ static int custom_read(menu_entry *me, const char *var, const char *val) currentConfig.gamma = atoi(val); return 1; + case MA_OPT2_MAX_FRAMESKIP: + currentConfig.max_skip = atoi(val); + return 1; + /* PSP */ case MA_OPT3_SCALE: if (strcasecmp(var, "Scale factor") != 0) return 0; diff --git a/platform/common/emu.c b/platform/common/emu.c index fdde3dd70..1c2bfa2cc 100644 --- a/platform/common/emu.c +++ b/platform/common/emu.c @@ -600,6 +600,7 @@ void emu_prep_defconfig(void) defaultConfig.turbo_rate = 15; defaultConfig.msh2_khz = PICO_MSH2_HZ / 1000; defaultConfig.ssh2_khz = PICO_SSH2_HZ / 1000; + defaultConfig.max_skip = 4; // platform specific overrides pemu_prep_defconfig(); @@ -1467,10 +1468,16 @@ void 
emu_loop(void) else if (diff < -target_frametime_x3) { /* no time left for this frame - skip */ - /* limit auto frameskip to 8 */ - if (frames_done / 8 <= frames_shown) + /* limit auto frameskip to max_skip */ + if (fskip_cnt < currentConfig.max_skip) { + fskip_cnt++; skip = 1; - } + } + else { + fskip_cnt = 0; + } + } else + fskip_cnt = 0; // don't go in debt too much while (diff < -target_frametime_x3 * 3) { diff --git a/platform/common/emu.h b/platform/common/emu.h index 1e751f891..26e2159b4 100644 --- a/platform/common/emu.h +++ b/platform/common/emu.h @@ -76,6 +76,7 @@ typedef struct _currentConfig_t { int msh2_khz; int ssh2_khz; int overclock_68k; + int max_skip; } currentConfig_t; extern currentConfig_t currentConfig, defaultConfig; diff --git a/platform/common/menu_pico.c b/platform/common/menu_pico.c index 9fb314268..dc7ceda44 100644 --- a/platform/common/menu_pico.c +++ b/platform/common/menu_pico.c @@ -506,6 +506,7 @@ static menu_entry e_menu_adv_options[] = mee_onoff ("Disable frame limiter", MA_OPT2_NO_FRAME_LIMIT,currentConfig.EmuOpt, EOPT_NO_FRMLIMIT), mee_onoff ("Enable dynarecs", MA_OPT2_DYNARECS, PicoIn.opt, POPT_EN_DRC), mee_onoff ("Status line in main menu", MA_OPT2_STATUS_LINE, currentConfig.EmuOpt, EOPT_SHOW_RTC), + mee_range ("Max auto frameskip", MA_OPT2_MAX_FRAMESKIP, currentConfig.max_skip, 1, 10), mee_onoff ("PWM IRQ optimization", MA_OPT2_PWM_IRQ_OPT, PicoIn.opt, POPT_PWM_IRQ_OPT), MENU_OPTIONS_ADV mee_end, diff --git a/platform/common/menu_pico.h b/platform/common/menu_pico.h index c626c7726..4c0bbdd1d 100644 --- a/platform/common/menu_pico.h +++ b/platform/common/menu_pico.h @@ -58,6 +58,7 @@ typedef enum MA_OPT2_NO_SPRITE_LIM, MA_OPT2_NO_IDLE_LOOPS, MA_OPT2_OVERCLOCK_M68K, + MA_OPT2_MAX_FRAMESKIP, MA_OPT2_PWM_IRQ_OPT, MA_OPT2_DONE, MA_OPT3_SCALE, /* psp (all OPT3) */ From 8d3536852f956721e7d0b398c2bcb58b5ae0d737 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 12 Oct 2019 00:26:11 +0200 Subject: [PATCH 070/174] sh2 drc bugfix for 
aarch64/mips --- Makefile | 2 +- cpu/drc/emit_arm64.c | 2 +- cpu/drc/emit_mips.c | 4 ++-- cpu/drc/emit_x86.c | 2 +- pico/32x/memory.c | 20 ++++++++++---------- tools/mkoffsets.sh | 9 +++++---- 6 files changed, 20 insertions(+), 19 deletions(-) diff --git a/Makefile b/Makefile index 52bdea0f5..5b7e5a2ca 100644 --- a/Makefile +++ b/Makefile @@ -240,7 +240,7 @@ endif pprof: platform/linux/pprof.c $(CC) $(CFLAGS) -O2 -ggdb -DPPROF -DPPROF_TOOL -I../../ -I. $^ -o $@ $(LDFLAGS) $(LDLIBS) -pico/pico_int_offs.h:: tools/mkoffsets.sh +pico/pico_int_offs.h: tools/mkoffsets.sh make -C tools/ XCC="$(CC)" XCFLAGS="$(CFLAGS)" %.o: %.c diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 3ef402b40..4bad64690 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -979,7 +979,7 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #define emith_save_caller_regs(mask) do { \ int _c, _r1, _r2; u32 _m = mask & 0x3ffff; \ if (__builtin_parity(_m) == 1) _m |= 0x40000; /* hardware align */ \ - for (_c = HOST_REGS, _r1 = -1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + for (_c = HOST_REGS-1, _r1 = -1; _m && _c >= 0; _m &= ~(1 << _c), _c--)\ if (_m & (1 << _c)) { \ _r2 = _r1, _r1 = _c; \ if (_r2 != -1) { \ diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index 4a452a685..38d68f40e 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -1065,7 +1065,7 @@ static void emith_lohi_nops(void) if (__builtin_parity(_m) == 1) _m |= 0x1; /* ABI align */ \ int _s = count_bits(_m) * 4, _o = _s; \ if (_s) emith_sub_r_imm(SP, _s); \ - for (_c = HOST_REGS; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ if (_m & (1 << _c)) \ { _o -= 4; if (_c) emith_write_r_r_offs(_c, SP, _o); } \ } while (0) @@ -1279,7 +1279,7 @@ static int emith_cond_check(int cond, int *r) if (__builtin_parity(_m) == 1) _m |= 0x1; /* ABI align for SP is 8 */ \ int _s = count_bits(_m) * 4 + 16, _o = _s; /* 16 byte arg save 
area */ \ if (_s) emith_sub_r_imm(SP, _s); \ - for (_c = HOST_REGS; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ if (_m & (1 << _c)) \ { _o -= 4; if (_c) emith_write_r_r_offs(_c, SP, _o); } \ } while (0) diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 44e10ecfc..212a12c55 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -1115,7 +1115,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define emith_save_caller_regs(mask) do { \ int _c; u32 _m = mask & 0xfc7; /* AX, CX, DX, SI, DI, 8, 9, 10, 11 */ \ if (__builtin_parity(_m) == 1) _m |= 0x8; /* BX for ABI align */ \ - for (_c = HOST_REGS; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ if (_m & (1 << _c)) emith_push(_c); \ } while (0) diff --git a/pico/32x/memory.c b/pico/32x/memory.c index e139910a4..60820e1ae 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -347,7 +347,7 @@ static u32 p32x_reg_read16(u32 a) if ((a & 0x30) == 0x20) { unsigned int cycles = SekCyclesDone(); - if (cycles - msh2.m68krcycles_done > 244) + if (CYCLES_GT(cycles - msh2.m68krcycles_done, 244)) p32x_sync_sh2s(cycles); if (m68k_poll_detect(a, cycles, P32XF_68KCPOLL)) { @@ -360,7 +360,7 @@ static u32 p32x_reg_read16(u32 a) if (a == 2) { // INTM, INTS unsigned int cycles = SekCyclesDone(); - if (cycles - msh2.m68krcycles_done > 64) + if (CYCLES_GT(cycles - msh2.m68krcycles_done, 64)) p32x_sync_sh2s(cycles); goto out; } @@ -420,7 +420,7 @@ static void p32x_reg_write8(u32 a, u32 d) return; case 0x03: // irq ctl if ((d ^ r[0x02 / 2]) & 3) { - int cycles = SekCyclesDone(); + unsigned int cycles = SekCyclesDone(); p32x_sync_sh2s(cycles); r[0x02 / 2] = d & 3; p32x_update_cmd_irq(NULL, cycles); @@ -610,9 +610,9 @@ static void p32x_reg_write16(u32 a, u32 d) case 0x2c/2: case 0x2e/2: if (r[a / 2] != d) { - int cycles = SekCyclesDone(); + unsigned int cycles = SekCyclesDone(); - 
if (cycles - (int)msh2.m68krcycles_done > 30) + if (CYCLES_GT(cycles - msh2.m68krcycles_done, 64)) p32x_sync_sh2s(cycles); r[a / 2] = d; @@ -712,7 +712,7 @@ static void p32x_vdp_write16(u32 a, u32 d, SH2 *sh2) } Pico32x.vdp_regs[0x06 / 2] = a; Pico32x.vdp_regs[0x08 / 2] = d; - if (sh2 != NULL && len > 4) { + if (sh2 != NULL && len > 8) { Pico32x.vdp_regs[0x0a / 2] |= P32XV_nFEN; // supposedly takes 3 bus/6 sh2 cycles? or 3 sh2 cycles? p32x_event_schedule_sh2(sh2, P32X_EVENT_FILLEND, 3 + len); @@ -824,8 +824,8 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) if (Pico32x.sh2_regs[4 / 2] != d) { unsigned int cycles = sh2_cycles_done_m68k(sh2); Pico32x.sh2_regs[4 / 2] = d; - sh2_end_run(sh2, 4); p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); + sh2_end_run(sh2, 4); sh2_poll_write(a & ~1, d, cycles, sh2); } return; @@ -849,9 +849,9 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) unsigned int cycles = sh2_cycles_done_m68k(sh2); REG8IN16(r, a) = d; - sh2_end_run(sh2, 1); p32x_m68k_poll_event(P32XF_68KCPOLL); p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); + sh2_end_run(sh2, 1); sh2_poll_write(a & ~1, r[a / 2], cycles, sh2); } return; @@ -941,9 +941,9 @@ static void p32x_sh2reg_write16(u32 a, u32 d, SH2 *sh2) unsigned int cycles = sh2_cycles_done_m68k(sh2); Pico32x.regs[a / 2] = d; - sh2_end_run(sh2, 1); p32x_m68k_poll_event(P32XF_68KCPOLL); p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); + sh2_end_run(sh2, 1); sh2_poll_write(a, d, cycles, sh2); } return; @@ -1574,10 +1574,10 @@ static void NOINLINE sh2_sdram_poll(u32 a, u32 d, SH2 *sh2) unsigned cycles; DRC_SAVE_SR(sh2); - sh2_end_run(sh2, 1); cycles = sh2_cycles_done_m68k(sh2); sh2_poll_write(a, d, cycles, sh2); p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_RPOLL, cycles); + sh2_end_run(sh2, 1); DRC_RESTORE_SR(sh2); } diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index 8f2d888c2..2223b8046 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ 
-11,10 +11,10 @@ ENDIAN= # compile with target C compiler and extract value from .rodata section compile_rodata () { - # $CC $CFLAGS -I .. -shared /tmp/getoffs.c -o /tmp/getoffs.o || exit 1 - echo 'void dummy(void) { asm(""::"r" (&val)); }' >> /tmp/getoffs.c - $CC $CFLAGS -I .. -nostdlib -Wl,-edummy /tmp/getoffs.c \ - -o /tmp/getoffs.o || exit 1 + $CC $CFLAGS -I .. -c /tmp/getoffs.c -o /tmp/getoffs.o || exit 1 + # echo 'void dummy(void) { asm(""::"r" (&val)); }' >> /tmp/getoffs.c + # $CC $CFLAGS -I .. -nostdlib -Wl,-edummy /tmp/getoffs.c \ + # -o /tmp/getoffs.o || exit 1 # find the name of the .rodata section (in case -fdata-sections is used) rosect=$(readelf -S /tmp/getoffs.o | grep '\.rodata' | sed 's/^[^.]*././;s/ .*//') @@ -48,6 +48,7 @@ get_define () # prefix struct member member... line=$(printf "#define %-20s 0x%04x" $prefix$name $rodata) } +CFLAGS="$CFLAGS -fno-lto" # determine endianess echo "const int val = 1;" >/tmp/getoffs.c compile_rodata From f147cb1438c737cb356e2286e30fa656df76f484 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 12 Oct 2019 11:10:28 +0200 Subject: [PATCH 071/174] sh2 drc: bugfix in block management --- cpu/sh2/compiler.c | 4 +--- pico/32x/memory.c | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 2c1e8cffe..b7c57b3dd 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -769,8 +769,7 @@ static void rm_block_list(struct block_list **blist) struct block_list *next, *current = *blist; while (current != NULL) { next = current->next; - current->next = blist_free; - blist_free = current; + rm_from_block_lists(current->block); current = next; } *blist = NULL; @@ -5441,7 +5440,6 @@ int sh2_drc_init(SH2 *sh2) block_tables[i] = calloc(BLOCK_MAX_COUNT(i), sizeof(*block_tables[0])); if (block_tables[i] == NULL) goto fail; - // max 2 block links (exits) per block block_link_pool[i] = calloc(BLOCK_LINK_MAX_COUNT(i), sizeof(*block_link_pool[0])); if (block_link_pool[i] == NULL) 
diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 60820e1ae..06215a7ce 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -513,9 +513,9 @@ static void p32x_reg_write8(u32 a, u32 d) case 0x2e: case 0x2f: if (REG8IN16(r, a) != d) { - int cycles = SekCyclesDone(); + unsigned int cycles = SekCyclesDone(); - if (cycles - (int)msh2.m68krcycles_done > 30) + if (CYCLES_GT(cycles - msh2.m68krcycles_done, 64)) p32x_sync_sh2s(cycles); REG8IN16(r, a) = d; From 85970fdc2fe5655c17d5a74d5a8ab2861518b106 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 12 Oct 2019 11:19:55 +0200 Subject: [PATCH 072/174] sh2 drc: bugfix in block management --- cpu/sh2/compiler.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index b7c57b3dd..86d4b85a7 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -766,13 +766,8 @@ static void rm_from_block_lists(struct block_desc *block) static void rm_block_list(struct block_list **blist) { - struct block_list *next, *current = *blist; - while (current != NULL) { - next = current->next; - rm_from_block_lists(current->block); - current = next; - } - *blist = NULL; + while (*blist != NULL) + rm_from_block_lists((*blist)->block); } static void REGPARM(1) flush_tcache(int tcid) From 508e5ab618eed009920f570d87f454e3d9b77bbf Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 17 Oct 2019 21:54:37 +0200 Subject: [PATCH 073/174] sh2 drc: reorganised block mgmt code, plus some small scale optimisations --- cpu/sh2/compiler.c | 691 ++++++++++++++++++------------------ cpu/sh2/compiler.h | 4 +- cpu/sh2/sh2.h | 2 +- pico/32x/memory.c | 56 +-- pico/32x/memory_arm.S | 15 +- pico/pico_int.h | 4 + platform/gp2x/PicoDrive.gpe | 2 + 7 files changed, 395 insertions(+), 379 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 86d4b85a7..1acc7215c 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -764,58 +764,16 @@ static void rm_from_block_lists(struct block_desc 
*block) block->list = NULL; } -static void rm_block_list(struct block_list **blist) +static void discard_block_list(struct block_list **blist) { - while (*blist != NULL) - rm_from_block_lists((*blist)->block); -} - -static void REGPARM(1) flush_tcache(int tcid) -{ - int i; -#if (DRC_DEBUG & 1) - int tc_used, bl_used; - - tc_used = tcache_sizes[tcid] - (tcache_limit[tcid] - tcache_ptrs[tcid]); - bl_used = BLOCK_MAX_COUNT(tcid) - (block_limit[tcid] - block_counts[tcid]); - elprintf(EL_STATUS, "tcache #%d flush! (%d/%d, bds %d/%d)", tcid, tc_used, - tcache_sizes[tcid], bl_used, BLOCK_MAX_COUNT(tcid)); -#endif - - block_counts[tcid] = 0; - block_limit[tcid] = BLOCK_MAX_COUNT(tcid) - 1; - block_link_pool_counts[tcid] = 0; - blink_free[tcid] = NULL; - memset(unresolved_links[tcid], 0, sizeof(*unresolved_links[0]) * HASH_TABLE_SIZE(tcid)); - memset(hash_tables[tcid], 0, sizeof(*hash_tables[0]) * HASH_TABLE_SIZE(tcid)); - tcache_ptrs[tcid] = tcache_bases[tcid]; - tcache_limit[tcid] = tcache_bases[tcid] + tcache_sizes[tcid]; - if (Pico32xMem->sdram != NULL) { - if (tcid == 0) { // ROM, RAM - memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram)); - memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); - memset(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)); - memset(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)); - memset(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)); - memset(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)); - sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0; - } else { - memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram)); - memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); - memset(Pico32xMem->drcblk_da[tcid - 1], 0, sizeof(Pico32xMem->drcblk_da[tcid - 1])); - memset(Pico32xMem->drclit_da[tcid - 1], 0, sizeof(Pico32xMem->drclit_da[tcid - 1])); - memset(sh2s[tcid - 1].branch_cache, -1, sizeof(sh2s[0].branch_cache)); - memset(sh2s[tcid - 1].rts_cache, -1, 
sizeof(sh2s[0].rts_cache)); - sh2s[tcid - 1].rts_cache_idx = 0; - } + struct block_list *next, *current = *blist; + while (current != NULL) { + next = current->next; + current->next = blist_free; + blist_free = current; + current = next; } -#if (DRC_DEBUG & 4) - tcache_dsm_ptrs[tcid] = tcache_bases[tcid]; -#endif - - for (i = 0; i < RAM_SIZE(tcid) / INVAL_PAGE_SIZE; i++) - rm_block_list(&inval_lookup[tcid][i]); - rm_block_list(&inactive_blocks[tcid]); + *blist = NULL; } static void add_to_hashlist(struct block_entry *be, int tcache_id) @@ -902,68 +860,127 @@ static void rm_from_hashlist_unresolved(struct block_link *bl, int tcache_id) bl->next->prev = bl->prev; } -static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nolit, int free); -static void dr_free_oldest_block(int tcache_id) +#if LINK_BRANCHES +static void dr_block_link(struct block_entry *be, struct block_link *bl, int emit_jump) { - struct block_desc *bd; + dbg(2, "- %slink from %p to pc %08x entry %p", emit_jump ? 
"":"early ", + bl->jump, bl->target_pc, be->tcache_ptr); - if (block_limit[tcache_id] >= BLOCK_MAX_COUNT(tcache_id)) { - // block desc wrap around - block_limit[tcache_id] = 0; + if (emit_jump) { + u8 *jump = bl->jump; + int jsz = emith_jump_patch_size(); + if (bl->type == BL_JMP) { // patch: jump @entry + // inlined: @jump far jump to target + emith_jump_patch(jump, be->tcache_ptr, &jump); + } else if (bl->type == BL_LDJMP) { // write: jump @entry + // inlined: @jump far jump to target + emith_jump_at(jump, be->tcache_ptr); + jsz = emith_jump_at_size(); + } else if (bl->type == BL_JCCBLX) { // patch: jump cond -> jump @entry + if (emith_jump_patch_inrange(bl->jump, be->tcache_ptr)) { + // inlined: @jump near jumpcc to target + emith_jump_patch(jump, be->tcache_ptr, &jump); + } else { // dispatcher cond immediate + // via blx: @jump near jumpcc to blx; @blx far jump + emith_jump_patch(jump, bl->blx, &jump); + emith_jump_at(bl->blx, be->tcache_ptr); + if ((((uintptr_t)bl->blx & 0xf) + emith_jump_at_size()-1) > 0xf) + host_instructions_updated(bl->blx, bl->blx + emith_jump_at_size()-1); + } + } else { + printf("unknown BL type %d\n", bl->type); + exit(1); + } + // only needs sync if patch is possibly crossing cacheline (assume 16 byte) + if ((((uintptr_t)jump & 0xf) + jsz-1) > 0xf) + host_instructions_updated(jump, jump + jsz-1); } - bd = &block_tables[tcache_id][block_limit[tcache_id]]; - if (bd->tcache_ptr && bd->tcache_ptr < tcache_ptrs[tcache_id]) { - // cache wrap around - tcache_ptrs[tcache_id] = bd->tcache_ptr; - } + // move bl to block_entry + bl->target = be; + bl->prev = NULL; + if (be->links) + be->links->prev = bl; + bl->next = be->links; + be->links = bl; +} - if (bd->addr && bd->entry_count) - sh2_smc_rm_block_entry(bd, tcache_id, 0, 1); +static void dr_block_unlink(struct block_link *bl, int emit_jump) +{ + dbg(2,"- unlink from %p to pc %08x", bl->jump, bl->target_pc); - block_limit[tcache_id]++; - if (block_limit[tcache_id] >= 
BLOCK_MAX_COUNT(tcache_id)) - block_limit[tcache_id] = 0; - bd = &block_tables[tcache_id][block_limit[tcache_id]]; - if (bd->tcache_ptr >= tcache_ptrs[tcache_id]) - tcache_limit[tcache_id] = bd->tcache_ptr; - else - tcache_limit[tcache_id] = tcache_bases[tcache_id] + tcache_sizes[tcache_id]; + if (bl->target) { + if (emit_jump) { + u8 *jump = bl->jump; + int jsz = emith_jump_patch_size(); + if (bl->type == BL_JMP) { // jump_patch @dispatcher + // inlined: @jump far jump to dispatcher + emith_jump_patch(jump, sh2_drc_dispatcher, &jump); + } else if (bl->type == BL_LDJMP) { // restore: load pc, jump @dispatcher + // inlined: @jump load target_pc, far jump to dispatcher + memcpy(jump, bl->jdisp, emith_jump_at_size()); + jsz = emith_jump_at_size(); + } else if (bl->type == BL_JCCBLX) { // jump cond @blx; @blx: load pc, jump + // via blx: @jump near jumpcc to blx; @blx load target_pc, far jump + emith_jump_patch(bl->jump, bl->blx, &jump); + memcpy(bl->blx, bl->jdisp, emith_jump_at_size()); + host_instructions_updated(bl->blx, bl->blx + emith_jump_at_size()-1); + } else { + printf("unknown BL type %d\n", bl->type); + exit(1); + } + // update cpu caches since the previous jump target doesn't exist anymore + host_instructions_updated(jump, jump + jsz-1); + } + + if (bl->prev) + bl->prev->next = bl->next; + else + bl->target->links = bl->next; + if (bl->next) + bl->next->prev = bl->prev; + bl->target = NULL; + } } +#endif -static u8 *dr_prepare_cache(int tcache_id, int insn_count) +static struct block_link *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_slave, int tcache_id) { - u8 *limit = tcache_limit[tcache_id]; +#if LINK_BRANCHES + struct block_link *bl = block_link_pool[tcache_id]; + int cnt = block_link_pool_counts[tcache_id]; + int target_tcache_id; - // if no block desc available - if (block_counts[tcache_id] == block_limit[tcache_id]) - dr_free_oldest_block(tcache_id); + // get the target block entry + target_tcache_id = dr_get_tcache_id(pc, 
is_slave); + if (target_tcache_id && target_tcache_id != tcache_id) + return NULL; - // while not enough cache space left (limit - tcache_ptr < max space needed) - while (tcache_limit[tcache_id] - tcache_ptrs[tcache_id] < insn_count * 128) - dr_free_oldest_block(tcache_id); + // get a block link + if (blink_free[tcache_id] != NULL) { + bl = blink_free[tcache_id]; + blink_free[tcache_id] = bl->next; + } else if (cnt >= BLOCK_LINK_MAX_COUNT(tcache_id)) { + dbg(1, "bl overflow for tcache %d", tcache_id); + return NULL; + } else { + bl += cnt; + block_link_pool_counts[tcache_id] = cnt+1; + } - if (limit != tcache_limit[tcache_id]) { -#if BRANCH_CACHE - if (tcache_id) - memset32(sh2s[tcache_id-1].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); - else { - memset32(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); - memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4); - } -#endif -#if CALL_STACK - if (tcache_id) { - memset32(sh2s[tcache_id-1].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4); - sh2s[tcache_id-1].rts_cache_idx = 0; - } else { - memset32(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4); - memset32(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)/4); - sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0; - } + // prepare link and add to outgoing list of owner + bl->tcache_id = tcache_id; + bl->target_pc = pc; + bl->jump = tcache_ptr; + bl->blx = NULL; + bl->o_next = owner->o_links; + owner->o_links = bl; + + add_to_hashlist_unresolved(bl, tcache_id); + return bl; +#else + return NULL; #endif - } - return (u8 *)tcache_ptrs[tcache_id]; } static void dr_mark_memory(int mark, struct block_desc *block, int tcache_id, u32 nolit) @@ -1059,207 +1076,117 @@ static u32 dr_check_nolit(u32 start, u32 end, int tcache_id) return end; } -static struct block_desc *dr_find_inactive_block(int tcache_id, u16 crc, - u32 addr, int size, u32 addr_lit, int size_lit) +static void dr_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nolit, int free) { - 
struct block_list **head = &inactive_blocks[tcache_id]; - struct block_list *current; + struct block_link *bl; + u32 i; - for (current = *head; current != NULL; current = current->next) { - struct block_desc *block = current->block; - if (block->crc == crc && block->addr == addr && block->size == size && - block->addr_lit == addr_lit && block->size_lit == size_lit) - { - rm_from_block_lists(block); - return block; - } + free = free || nolit; // block is invalid if literals are overwritten + dbg(2," %sing block %08x-%08x,%08x-%08x, blkid %d,%d", free?"delet":"disabl", + bd->addr, bd->addr + bd->size, bd->addr_lit, bd->addr_lit + bd->size_lit, + tcache_id, bd - block_tables[tcache_id]); + if (bd->addr == 0 || bd->entry_count == 0) { + dbg(1, " killing dead block!? %08x", bd->addr); + return; } - return NULL; -} -static struct block_desc *dr_add_block(u32 addr, int size, - u32 addr_lit, int size_lit, u16 crc, int is_slave, int *blk_id) -{ - struct block_entry *be; - struct block_desc *bd; - int tcache_id; - int *bcount; +#if LINK_BRANCHES + // remove from hash table, make incoming links unresolved + if (bd->active) { + for (i = 0; i < bd->entry_count; i++) { + rm_from_hashlist(&bd->entryp[i], tcache_id); - // do a lookup to get tcache_id and override check - be = dr_get_entry(addr, is_slave, &tcache_id); - if (be != NULL) - dbg(1, "block override for %08x", addr); + while ((bl = bd->entryp[i].links) != NULL) { + dr_block_unlink(bl, 1); + add_to_hashlist_unresolved(bl, tcache_id); + } + } - bcount = &block_counts[tcache_id]; - if (*bcount == block_limit[tcache_id]) { - dbg(1, "bd overflow for tcache %d", tcache_id); - return NULL; + dr_mark_memory(-1, bd, tcache_id, nolit); + add_to_block_list(&inactive_blocks[tcache_id], bd); } - - bd = &block_tables[tcache_id][*bcount]; - bd->addr = addr; - bd->size = size; - bd->addr_lit = addr_lit; - bd->size_lit = size_lit; - bd->tcache_ptr = tcache_ptr; - bd->crc = crc; bd->active = 0; - bd->entry_count = 0; -#if (DRC_DEBUG & 2) 
- bd->refcount = 0; #endif - *blk_id = *bcount; - (*bcount)++; - if (*bcount >= BLOCK_MAX_COUNT(tcache_id)) - *bcount = 0; - - return bd; -} - -static void REGPARM(3) *dr_lookup_block(u32 pc, SH2 *sh2, int *tcache_id) -{ - struct block_entry *be = NULL; - void *block = NULL; - - be = dr_get_entry(pc, sh2->is_slave, tcache_id); - if (be != NULL) - block = be->tcache_ptr; - -#if (DRC_DEBUG & 2) - if (be != NULL) - be->block->refcount++; -#endif - return block; -} - -static void *dr_failure(void) -{ - lprintf("recompilation failed\n"); - exit(1); -} - + if (free) { #if LINK_BRANCHES -static void dr_block_link(struct block_entry *be, struct block_link *bl, int emit_jump) -{ - dbg(2, "- %slink from %p to pc %08x entry %p", emit_jump ? "":"early ", - bl->jump, bl->target_pc, be->tcache_ptr); - - if (emit_jump) { - u8 *jump = bl->jump; - int jsz = emith_jump_patch_size(); - if (bl->type == BL_JMP) { // patch: jump @entry - // inlined: @jump far jump to target - emith_jump_patch(jump, be->tcache_ptr, &jump); - } else if (bl->type == BL_LDJMP) { // write: jump @entry - // inlined: @jump far jump to target - emith_jump_at(jump, be->tcache_ptr); - jsz = emith_jump_at_size(); - } else if (bl->type == BL_JCCBLX) { // patch: jump cond -> jump @entry - if (emith_jump_patch_inrange(bl->jump, be->tcache_ptr)) { - // inlined: @jump near jumpcc to target - emith_jump_patch(jump, be->tcache_ptr, &jump); - } else { // dispatcher cond immediate - // via blx: @jump near jumpcc to blx; @blx far jump - emith_jump_patch(jump, bl->blx, &jump); - emith_jump_at(bl->blx, be->tcache_ptr); - if ((((uintptr_t)bl->blx & 0xf) + emith_jump_at_size()-1) > 0xf) - host_instructions_updated(bl->blx, bl->blx + emith_jump_at_size()-1); - } - } else { - printf("unknown BL type %d\n", bl->type); - exit(1); + // revoke outgoing links + for (bl = bd->entryp[0].o_links; bl != NULL; bl = bl->o_next) { + if (bl->target) + dr_block_unlink(bl, 0); + else + rm_from_hashlist_unresolved(bl, tcache_id); + bl->jump = 
NULL; + bl->next = blink_free[bl->tcache_id]; + blink_free[bl->tcache_id] = bl; } - // only needs sync if patch is possibly crossing cacheline (assume 16 byte) - if ((((uintptr_t)jump & 0xf) + jsz-1) > 0xf) - host_instructions_updated(jump, jump + jsz-1); + bd->entryp[0].o_links = NULL; +#endif + // invalidate block + rm_from_block_lists(bd); + bd->addr = bd->size = bd->addr_lit = bd->size_lit = 0; + bd->entry_count = 0; } - - // move bl to block_entry - bl->target = be; - bl->prev = NULL; - if (be->links) - be->links->prev = bl; - bl->next = be->links; - be->links = bl; + emith_update_cache(); } -static void dr_block_unlink(struct block_link *bl, int emit_jump) +static struct block_desc *dr_find_inactive_block(int tcache_id, u16 crc, + u32 addr, int size, u32 addr_lit, int size_lit) { - dbg(2,"- unlink from %p to pc %08x", bl->jump, bl->target_pc); + struct block_list **head = &inactive_blocks[tcache_id]; + struct block_list *current; - if (bl->target) { - if (emit_jump) { - u8 *jump = bl->jump; - int jsz = emith_jump_patch_size(); - if (bl->type == BL_JMP) { // jump_patch @dispatcher - // inlined: @jump far jump to dispatcher - emith_jump_patch(jump, sh2_drc_dispatcher, &jump); - } else if (bl->type == BL_LDJMP) { // restore: load pc, jump @dispatcher - // inlined: @jump load target_pc, far jump to dispatcher - memcpy(jump, bl->jdisp, emith_jump_at_size()); - jsz = emith_jump_at_size(); - } else if (bl->type == BL_JCCBLX) { // jump cond @blx; @blx: load pc, jump - // via blx: @jump near jumpcc to blx; @blx load target_pc, far jump - emith_jump_patch(bl->jump, bl->blx, &jump); - memcpy(bl->blx, bl->jdisp, emith_jump_at_size()); - host_instructions_updated(bl->blx, bl->blx + emith_jump_at_size()-1); - } else { - printf("unknown BL type %d\n", bl->type); - exit(1); - } - // update cpu caches since the previous jump target doesn't exist anymore - host_instructions_updated(jump, jump + jsz-1); + for (current = *head; current != NULL; current = current->next) { + 
struct block_desc *block = current->block; + if (block->crc == crc && block->addr == addr && block->size == size && + block->addr_lit == addr_lit && block->size_lit == size_lit) + { + rm_from_block_lists(block); + return block; } - - if (bl->prev) - bl->prev->next = bl->next; - else - bl->target->links = bl->next; - if (bl->next) - bl->next->prev = bl->prev; - bl->target = NULL; } + return NULL; } -#endif -static struct block_link *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_slave, int tcache_id) +static struct block_desc *dr_add_block(u32 addr, int size, + u32 addr_lit, int size_lit, u16 crc, int is_slave, int *blk_id) { -#if LINK_BRANCHES - struct block_link *bl = block_link_pool[tcache_id]; - int cnt = block_link_pool_counts[tcache_id]; - int target_tcache_id; + struct block_entry *be; + struct block_desc *bd; + int tcache_id; + int *bcount; - // get the target block entry - target_tcache_id = dr_get_tcache_id(pc, is_slave); - if (target_tcache_id && target_tcache_id != tcache_id) - return NULL; + // do a lookup to get tcache_id and override check + be = dr_get_entry(addr, is_slave, &tcache_id); + if (be != NULL) + dbg(1, "block override for %08x", addr); - // get a block link - if (blink_free[tcache_id] != NULL) { - bl = blink_free[tcache_id]; - blink_free[tcache_id] = bl->next; - } else if (cnt >= BLOCK_LINK_MAX_COUNT(tcache_id)) { - dbg(1, "bl overflow for tcache %d", tcache_id); + bcount = &block_counts[tcache_id]; + if (*bcount == block_limit[tcache_id]) { + dbg(1, "bd overflow for tcache %d", tcache_id); return NULL; - } else { - bl += cnt; - block_link_pool_counts[tcache_id] = cnt+1; } - // prepare link and add to outgoing list of owner - bl->tcache_id = tcache_id; - bl->target_pc = pc; - bl->jump = tcache_ptr; - bl->blx = NULL; - bl->o_next = owner->o_links; - owner->o_links = bl; - - add_to_hashlist_unresolved(bl, tcache_id); - return bl; -#else - return NULL; + bd = &block_tables[tcache_id][*bcount]; + bd->addr = addr; + bd->size = 
size; + bd->addr_lit = addr_lit; + bd->size_lit = size_lit; + bd->tcache_ptr = tcache_ptr; + bd->crc = crc; + bd->active = 0; + bd->list = NULL; + bd->entry_count = 0; +#if (DRC_DEBUG & 2) + bd->refcount = 0; #endif + + *blk_id = *bcount; + (*bcount)++; + if (*bcount >= BLOCK_MAX_COUNT(tcache_id)) + *bcount = 0; + + return bd; } static void dr_link_blocks(struct block_entry *be, int tcache_id) @@ -1321,6 +1248,139 @@ static void dr_activate_block(struct block_desc *bd, int tcache_id, int is_slave bd->active = 1; } +static void REGPARM(3) ALIGNED(32) *dr_lookup_block(u32 pc, SH2 *sh2, int *tcache_id) +{ + struct block_entry *be = NULL; + void *block = NULL; + + be = dr_get_entry(pc, sh2->is_slave, tcache_id); + if (be != NULL) + block = be->tcache_ptr; + +#if (DRC_DEBUG & 2) + if (be != NULL) + be->block->refcount++; +#endif + return block; +} + +static void dr_free_oldest_block(int tcache_id) +{ + struct block_desc *bd; + + if (block_limit[tcache_id] >= BLOCK_MAX_COUNT(tcache_id)) { + // block desc wrap around + block_limit[tcache_id] = 0; + } + bd = &block_tables[tcache_id][block_limit[tcache_id]]; + + if (bd->tcache_ptr && bd->tcache_ptr < tcache_ptrs[tcache_id]) { + // cache wrap around + tcache_ptrs[tcache_id] = bd->tcache_ptr; + } + + if (bd->addr && bd->entry_count) + dr_rm_block_entry(bd, tcache_id, 0, 1); + + block_limit[tcache_id]++; + if (block_limit[tcache_id] >= BLOCK_MAX_COUNT(tcache_id)) + block_limit[tcache_id] = 0; + bd = &block_tables[tcache_id][block_limit[tcache_id]]; + if (bd->tcache_ptr >= tcache_ptrs[tcache_id]) + tcache_limit[tcache_id] = bd->tcache_ptr; + else + tcache_limit[tcache_id] = tcache_bases[tcache_id] + tcache_sizes[tcache_id]; +} + +static u8 *dr_prepare_cache(int tcache_id, int insn_count) +{ + u8 *limit = tcache_limit[tcache_id]; + + // if no block desc available + if (block_counts[tcache_id] == block_limit[tcache_id]) + dr_free_oldest_block(tcache_id); + + // while not enough cache space left (limit - tcache_ptr < max space 
needed) + while (tcache_limit[tcache_id] - tcache_ptrs[tcache_id] < insn_count * 128) + dr_free_oldest_block(tcache_id); + + if (limit != tcache_limit[tcache_id]) { +#if BRANCH_CACHE + if (tcache_id) + memset32(sh2s[tcache_id-1].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + else { + memset32(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); + memset32(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)/4); + } +#endif +#if CALL_STACK + if (tcache_id) { + memset32(sh2s[tcache_id-1].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4); + sh2s[tcache_id-1].rts_cache_idx = 0; + } else { + memset32(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)/4); + memset32(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)/4); + sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0; + } +#endif + } + return (u8 *)tcache_ptrs[tcache_id]; +} + +static void dr_flush_tcache(int tcid) +{ + int i; +#if (DRC_DEBUG & 1) + int tc_used, bl_used; + + tc_used = tcache_sizes[tcid] - (tcache_limit[tcid] - tcache_ptrs[tcid]); + bl_used = BLOCK_MAX_COUNT(tcid) - (block_limit[tcid] - block_counts[tcid]); + elprintf(EL_STATUS, "tcache #%d flush! 
(%d/%d, bds %d/%d)", tcid, tc_used, + tcache_sizes[tcid], bl_used, BLOCK_MAX_COUNT(tcid)); +#endif + + block_counts[tcid] = 0; + block_limit[tcid] = BLOCK_MAX_COUNT(tcid) - 1; + block_link_pool_counts[tcid] = 0; + blink_free[tcid] = NULL; + memset(unresolved_links[tcid], 0, sizeof(*unresolved_links[0]) * HASH_TABLE_SIZE(tcid)); + memset(hash_tables[tcid], 0, sizeof(*hash_tables[0]) * HASH_TABLE_SIZE(tcid)); + tcache_ptrs[tcid] = tcache_bases[tcid]; + tcache_limit[tcid] = tcache_bases[tcid] + tcache_sizes[tcid]; + if (Pico32xMem->sdram != NULL) { + if (tcid == 0) { // ROM, RAM + memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram)); + memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); + memset(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)); + memset(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)); + memset(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)); + memset(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)); + sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0; + } else { + memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram)); + memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); + memset(Pico32xMem->drcblk_da[tcid - 1], 0, sizeof(Pico32xMem->drcblk_da[tcid - 1])); + memset(Pico32xMem->drclit_da[tcid - 1], 0, sizeof(Pico32xMem->drclit_da[tcid - 1])); + memset(sh2s[tcid - 1].branch_cache, -1, sizeof(sh2s[0].branch_cache)); + memset(sh2s[tcid - 1].rts_cache, -1, sizeof(sh2s[0].rts_cache)); + sh2s[tcid - 1].rts_cache_idx = 0; + } + } +#if (DRC_DEBUG & 4) + tcache_dsm_ptrs[tcid] = tcache_bases[tcid]; +#endif + + for (i = 0; i < RAM_SIZE(tcid) / INVAL_PAGE_SIZE; i++) + discard_block_list(&inval_lookup[tcid][i]); + discard_block_list(&inactive_blocks[tcid]); +} + +static void *dr_failure(void) +{ + lprintf("recompilation failed\n"); + exit(1); +} + #define ADD_TO_ARRAY(array, count, item, failcode) { \ if (count >= ARRAY_SIZE(array)) { \ dbg(1, "warning: " #array " overflow"); \ @@ -5066,61 
+5126,7 @@ static void sh2_generate_utils(void) #endif } -static void sh2_smc_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nolit, int free) -{ - struct block_link *bl; - u32 i; - - free = free || nolit; // block is invalid if literals are overwritten - dbg(2," %sing block %08x-%08x,%08x-%08x, blkid %d,%d", free?"delet":"disabl", - bd->addr, bd->addr + bd->size, bd->addr_lit, bd->addr_lit + bd->size_lit, - tcache_id, bd - block_tables[tcache_id]); - if (bd->addr == 0 || bd->entry_count == 0) { - dbg(1, " killing dead block!? %08x", bd->addr); - return; - } - -#if LINK_BRANCHES - // remove from hash table, make incoming links unresolved - if (bd->active) { - for (i = 0; i < bd->entry_count; i++) { - rm_from_hashlist(&bd->entryp[i], tcache_id); - - while ((bl = bd->entryp[i].links) != NULL) { - dr_block_unlink(bl, 1); - add_to_hashlist_unresolved(bl, tcache_id); - } - } - - dr_mark_memory(-1, bd, tcache_id, nolit); - add_to_block_list(&inactive_blocks[tcache_id], bd); - } - bd->active = 0; -#endif - - if (free) { -#if LINK_BRANCHES - // revoke outgoing links - for (bl = bd->entryp[0].o_links; bl != NULL; bl = bl->o_next) { - if (bl->target) - dr_block_unlink(bl, 0); - else - rm_from_hashlist_unresolved(bl, tcache_id); - bl->jump = NULL; - bl->next = blink_free[bl->tcache_id]; - blink_free[bl->tcache_id] = bl; - } - bd->entryp[0].o_links = NULL; -#endif - // invalidate block - rm_from_block_lists(bd); - bd->addr = bd->size = bd->addr_lit = bd->size_lit = 0; - bd->entry_count = 0; - } - emith_update_cache(); -} - -static void sh2_smc_rm_blocks(u32 a, int tcache_id, u32 shift) +static void sh2_smc_rm_blocks(u32 a, int len, int tcache_id, u32 shift) { struct block_list **blist, *entry, *next; u32 mask = RAM_SIZE(tcache_id) - 1; @@ -5146,12 +5152,12 @@ static void sh2_smc_rm_blocks(u32 a, int tcache_id, u32 shift) start_lit = block->addr_lit & wtmask; end_lit = start_lit + block->size_lit; // disable/delete block if it covers the modified address - if 
((start_addr <= a && a < end_addr) || - (start_lit <= a && a < end_lit)) + if ((start_addr <= a+len && a < end_addr) || + (start_lit <= a+len && a < end_lit)) { dbg(2, "smc remove @%08x", a); - end_addr = (start_lit <= a && block->size_lit ? a : 0); - sh2_smc_rm_block_entry(block, tcache_id, end_addr, 0); + end_addr = (start_lit <= a+len && block->size_lit ? a : 0); + dr_rm_block_entry(block, tcache_id, end_addr, 0); #if (DRC_DEBUG & 2) removed = 1; #endif @@ -5182,17 +5188,20 @@ static void sh2_smc_rm_blocks(u32 a, int tcache_id, u32 shift) #endif } -void sh2_drc_wcheck_ram(unsigned int a, int val, SH2 *sh2) +void sh2_drc_wcheck_ram(unsigned int a, unsigned t, SH2 *sh2) { - dbg(2, "%csh2 smc check @%08x v=%d", sh2->is_slave ? 's' : 'm', a, val); - sh2_smc_rm_blocks(a, 0, SH2_DRCBLK_RAM_SHIFT); + int off = ((u16) t ? 0 : 2); + int len = ((u16) t ? 2 : 0) + (t >> 16 ? 2 : 0); + + sh2_smc_rm_blocks(a + off, len, 0, SH2_DRCBLK_RAM_SHIFT); } -void sh2_drc_wcheck_da(unsigned int a, int val, SH2 *sh2) +void sh2_drc_wcheck_da(unsigned int a, unsigned t, SH2 *sh2) { - int cpuid = sh2->is_slave; - dbg(2, "%csh2 smc check @%08x v=%d", cpuid ? 's' : 'm', a, val); - sh2_smc_rm_blocks(a, 1 + cpuid, SH2_DRCBLK_DA_SHIFT); + int off = ((u16) t ? 0 : 2); + int len = ((u16) t ? 2 : 0) + (t >> 16 ? 
2 : 0); + + sh2_smc_rm_blocks(a + off, len, 1 + sh2->is_slave, SH2_DRCBLK_DA_SHIFT); } int sh2_execute_drc(SH2 *sh2c, int cycles) @@ -5408,9 +5417,9 @@ void sh2_drc_flush_all(void) block_stats(); entry_stats(); bcache_stats(); - flush_tcache(0); - flush_tcache(1); - flush_tcache(2); + dr_flush_tcache(0); + dr_flush_tcache(1); + dr_flush_tcache(2); Pico32x.emu_flags &= ~P32XF_DRC_ROM_C; } diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 3565940da..94dff8c51 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -1,7 +1,7 @@ int sh2_drc_init(SH2 *sh2); void sh2_drc_finish(SH2 *sh2); -void sh2_drc_wcheck_ram(unsigned int a, int val, SH2 *sh2); -void sh2_drc_wcheck_da(unsigned int a, int val, SH2 *sh2); +void sh2_drc_wcheck_ram(unsigned int a, unsigned val, SH2 *sh2); +void sh2_drc_wcheck_da(unsigned int a, unsigned val, SH2 *sh2); #ifdef DRC_SH2 void sh2_drc_mem_setup(SH2 *sh2); diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index cf830dfca..57693ac1b 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -80,7 +80,7 @@ typedef struct SH2_ unsigned char data_array[0x1000]; // cache (can be used as RAM) unsigned int peri_regs[0x200/4]; // periphereal regs -} SH2; +} SH2 ALIGNED(32); #define CYCLE_MULT_SHIFT 10 #define C_M68K_TO_SH2(xsh2, c) \ diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 06215a7ce..39504416f 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -231,7 +231,7 @@ static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2) for (idx = nrd = wr; idx != rd; ) { idx = (idx-1) % PFIFO_SZ; q = &fifo[idx]; - if (q->cpu != cpu && q->a == a) { q->a = -1; } + if (q->a == a && q->cpu != cpu) { q->a = -1; } if (q->a != -1) { nrd = idx; } } rd = nrd; @@ -825,7 +825,8 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) unsigned int cycles = sh2_cycles_done_m68k(sh2); Pico32x.sh2_regs[4 / 2] = d; p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); - sh2_end_run(sh2, 4); + if (p32x_sh2_ready(sh2->other_sh2, 
cycles+16)) + sh2_end_run(sh2, 4); sh2_poll_write(a & ~1, d, cycles, sh2); } return; @@ -851,7 +852,8 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) REG8IN16(r, a) = d; p32x_m68k_poll_event(P32XF_68KCPOLL); p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); - sh2_end_run(sh2, 1); + if (p32x_sh2_ready(sh2->other_sh2, cycles+16)) + sh2_end_run(sh2, 1); sh2_poll_write(a & ~1, r[a / 2], cycles, sh2); } return; @@ -943,7 +945,8 @@ static void p32x_sh2reg_write16(u32 a, u32 d, SH2 *sh2) Pico32x.regs[a / 2] = d; p32x_m68k_poll_event(P32XF_68KCPOLL); p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); - sh2_end_run(sh2, 1); + if (p32x_sh2_ready(sh2->other_sh2, cycles+16)) + sh2_end_run(sh2, 1); sh2_poll_write(a, d, cycles, sh2); } return; @@ -1569,7 +1572,7 @@ static u32 REGPARM(2) sh2_read32_rom(u32 a, SH2 *sh2) // writes #ifdef DRC_SH2 -static void NOINLINE sh2_sdram_poll(u32 a, u32 d, SH2 *sh2) +static void sh2_sdram_poll(u32 a, u32 d, SH2 *sh2) { unsigned cycles; @@ -1577,34 +1580,35 @@ static void NOINLINE sh2_sdram_poll(u32 a, u32 d, SH2 *sh2) cycles = sh2_cycles_done_m68k(sh2); sh2_poll_write(a, d, cycles, sh2); p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_RPOLL, cycles); - sh2_end_run(sh2, 1); + if (p32x_sh2_ready(sh2->other_sh2, cycles+16)) + sh2_end_run(sh2, 1); DRC_RESTORE_SR(sh2); } -void NOINLINE sh2_sdram_checks(u32 a, u32 d, SH2 *sh2, int t) +void sh2_sdram_checks(u32 a, u32 d, SH2 *sh2, u32 t) { - if (t & 0x80) - sh2_sdram_poll(a, d, sh2); - if (t & 0x7f) - sh2_drc_wcheck_ram(a, t & 0x7f, sh2); + if (t & 0x80) sh2_sdram_poll(a, d, sh2); + if (t & 0x7f) sh2_drc_wcheck_ram(a, t & 0x7f, sh2); } -void NOINLINE sh2_sdram_checks_l(u32 a, u32 d, SH2 *sh2, int t) +void sh2_sdram_checks_l(u32 a, u32 d, SH2 *sh2, u32 t) { - sh2_sdram_checks(a, d>>16, sh2, t); - sh2_sdram_checks(a+2, d, sh2, t>>16); + u32 m = 0x80 | 0x800000; + + if (t & 0x000080) sh2_sdram_poll(a, d>>16, sh2); + if (t & 0x800000) sh2_sdram_poll(a+2, d, sh2); + if (t & 
~m) sh2_drc_wcheck_ram(a, t & ~m, sh2); } #ifndef _ASM_32X_MEMORY_C -static void sh2_da_checks(u32 a, int t, SH2 *sh2) +static void sh2_da_checks(u32 a, u32 t, SH2 *sh2) { sh2_drc_wcheck_da(a, t, sh2); } -static void NOINLINE sh2_da_checks_l(u32 a, int t, SH2 *sh2) +static void sh2_da_checks_l(u32 a, u32 t, SH2 *sh2) { - sh2_da_checks(a, t, sh2); - sh2_da_checks(a+2, t>>16, sh2); + sh2_drc_wcheck_da(a, t, sh2); } #endif #endif @@ -1667,7 +1671,7 @@ static void REGPARM(3) sh2_write8_sdram(u32 a, u32 d, SH2 *sh2) ((u8 *)sh2->p_sdram)[a1] = d; #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_ram; - int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; + u32 t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; if (t) sh2_sdram_checks(a & ~1, ((u16 *)sh2->p_sdram)[a1 / 2], sh2, t); #endif @@ -1679,7 +1683,7 @@ static void REGPARM(3) sh2_write8_da(u32 a, u32 d, SH2 *sh2) sh2->data_array[a1] = d; #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_da; - int t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; + u32 t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; if (t) sh2_da_checks(a, t, sh2); #endif @@ -1741,7 +1745,7 @@ static void REGPARM(3) sh2_write16_sdram(u32 a, u32 d, SH2 *sh2) ((u16 *)sh2->p_sdram)[a1 / 2] = d; #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_ram; - int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; + u32 t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; if (t) sh2_sdram_checks(a, d, sh2, t); #endif @@ -1753,7 +1757,7 @@ static void REGPARM(3) sh2_write16_da(u32 a, u32 d, SH2 *sh2) ((u16 *)sh2->data_array)[a1 / 2] = d; #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_da; - int t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; + u32 t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; if (t) sh2_da_checks(a, t, sh2); #endif @@ -1816,8 +1820,8 @@ static void REGPARM(3) sh2_write32_sdram(u32 a, u32 d, SH2 *sh2) *(u32 *)(sh2->p_sdram + a1) = (d << 16) | (d >> 16); #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_ram; - int t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; - int u = p[(a1+2) >> SH2_DRCBLK_RAM_SHIFT]; + u32 t = p[a1 >> SH2_DRCBLK_RAM_SHIFT]; + u32 u = p[(a1+2) >> SH2_DRCBLK_RAM_SHIFT]; if (t|(u<<16)) sh2_sdram_checks_l(a, d, sh2, 
t|(u<<16)); #endif @@ -1829,8 +1833,8 @@ static void REGPARM(3) sh2_write32_da(u32 a, u32 d, SH2 *sh2) *((u32 *)sh2->data_array + a1/4) = (d << 16) | (d >> 16); #ifdef DRC_SH2 u8 *p = sh2->p_drcblk_da; - int t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; - int u = p[(a1+2) >> SH2_DRCBLK_DA_SHIFT]; + u32 t = p[a1 >> SH2_DRCBLK_DA_SHIFT]; + u32 u = p[(a1+2) >> SH2_DRCBLK_DA_SHIFT]; if (t|(u<<16)) sh2_da_checks_l(a, t|(u<<16), sh2); #endif diff --git a/pico/32x/memory_arm.S b/pico/32x/memory_arm.S index ba83a6bf4..b3a94b62a 100644 --- a/pico/32x/memory_arm.S +++ b/pico/32x/memory_arm.S @@ -17,6 +17,7 @@ .equ SH2_DRAM_OW, 1<<(32-SH2_DRAM_SHIFT) @ DRAM overwrite mode bit .text +.align 5 #if 0 @ u32 a, SH2 *sh2 @@ -142,11 +143,12 @@ sh2_write8_sdram: ldrb r3, [ip, r3, lsr #SH2_RAM_SHIFT+1] cmp r3, #0 bxeq lr + @ need to load aligned 16 bit data for check ldr ip, [r2, #OFS_SH2_p_sdram] bic r0, r0, #1 - mov r3, r0, lsl #SH2_RAM_SHIFT - mov r3, r3, lsr #SH2_RAM_SHIFT - ldrh r1, [ip, r3] + mov r1, r0, lsl #SH2_RAM_SHIFT + mov r1, r1, lsr #SH2_RAM_SHIFT + ldrh r1, [ip, r1] b sh2_sdram_checks #else bx lr @@ -252,13 +254,8 @@ sh2_write32_da: ldr ip, [r2, #OFS_SH2_p_drcblk_da] ldrb r1, [ip, r3, lsr #SH2_DA_SHIFT+1]! 
ldrb ip, [ip, #1] - orrs r3, r1, ip, lsl #16 + orrs r1, r1, ip, lsl #16 bxeq lr - stmfd sp!, {r0, r2, ip, lr} - bl sh2_drc_wcheck_da - ldmfd sp!, {r0, r2, ip, lr} - add r0, r0, #2 - mov r1, ip b sh2_drc_wcheck_da #else bx lr diff --git a/pico/pico_int.h b/pico/pico_int.h index 89acc4fbb..0fc458efb 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -921,6 +921,10 @@ void p32x_event_schedule(unsigned int now, enum p32x_event event, int after); void p32x_event_schedule_sh2(SH2 *sh2, enum p32x_event event, int after); void p32x_schedule_hint(SH2 *sh2, unsigned int m68k_cycles); +#define p32x_sh2_ready(sh2, cycles) \ + (CYCLES_GT(cycles,sh2->m68krcycles_done) && \ + !(sh2->state&(SH2_STATE_CPOLL|SH2_STATE_VPOLL|SH2_STATE_RPOLL))) + // 32x/memory.c extern struct Pico32xMem *Pico32xMem; unsigned int PicoRead8_32x(unsigned int a); diff --git a/platform/gp2x/PicoDrive.gpe b/platform/gp2x/PicoDrive.gpe index 1c0651856..59416d938 100644 --- a/platform/gp2x/PicoDrive.gpe +++ b/platform/gp2x/PicoDrive.gpe @@ -7,6 +7,8 @@ if ! 
[ -e /dev/accel ]; then export POLLUX_RAM_TIMINGS='ram_timings=2,9,4,1,1,1,1' export POLLUX_LCD_TIMINGS_NTSC='lcd_timings=397,1,37,277,341,0,17,337;clkdiv0=9' export POLLUX_LCD_TIMINGS_PAL='lcd_timings=428,1,37,277,341,0,17,337;clkdiv0=10' +else + export POLLUX_RAM_TIMINGS='ram_timings=3,9,4,1,1,1,1' fi ./PicoDrive "$@" From 8d931b641f23f39cf17b5b97f0b83394a3c914c7 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 18 Oct 2019 00:16:54 +0200 Subject: [PATCH 074/174] fix gp2x regression --- cpu/sh2/compiler.c | 2 +- cpu/sh2/sh2.h | 4 ++-- tools/mkoffsets.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 1acc7215c..b2306cf2b 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -1248,7 +1248,7 @@ static void dr_activate_block(struct block_desc *bd, int tcache_id, int is_slave bd->active = 1; } -static void REGPARM(3) ALIGNED(32) *dr_lookup_block(u32 pc, SH2 *sh2, int *tcache_id) +static void REGPARM(3) *dr_lookup_block(u32 pc, SH2 *sh2, int *tcache_id) { struct block_entry *be = NULL; void *block = NULL; diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index 57693ac1b..05ae70524 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -13,7 +13,7 @@ typedef enum { typedef struct SH2_ { // registers. this MUST correlate with enum sh2_reg_e. - unsigned int r[16]; // 00 + unsigned int r[16] ALIGNED(32); unsigned int pc; // 40 unsigned int ppc; unsigned int pr; @@ -80,7 +80,7 @@ typedef struct SH2_ unsigned char data_array[0x1000]; // cache (can be used as RAM) unsigned int peri_regs[0x200/4]; // periphereal regs -} SH2 ALIGNED(32); +} SH2; #define CYCLE_MULT_SHIFT 10 #define C_M68K_TO_SH2(xsh2, c) \ diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index 2223b8046..8a0557c7d 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ -48,7 +48,7 @@ get_define () # prefix struct member member... 
line=$(printf "#define %-20s 0x%04x" $prefix$name $rodata) } -CFLAGS="$CFLAGS -fno-lto" +if echo $CFLAGS | grep -qe -flto; then CFLAGS="$CFLAGS -fno-lto"; fi # determine endianess echo "const int val = 1;" >/tmp/getoffs.c compile_rodata From 855c2acc531de78746ed80706b9874a55c2c9581 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 19 Oct 2019 08:53:28 +0200 Subject: [PATCH 075/174] 32x, finetuning --- cpu/sh2/compiler.c | 31 ++++++++++++++----------------- cpu/sh2/compiler.h | 4 ++-- pico/32x/memory.c | 41 ++++++++++++++++++++--------------------- pico/32x/memory_arm.S | 8 +++++--- 4 files changed, 41 insertions(+), 43 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index b2306cf2b..e9173c4c3 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -272,9 +272,9 @@ static void REGPARM(3) *sh2_drc_log_entry(void *block, SH2 *sh2, u32 sr) // and can be discarded early // XXX: need to tune sizes static const int tcache_sizes[TCACHE_BUFFERS] = { - DRC_TCACHE_SIZE * 14 / 16, // ROM (rarely used), DRAM - DRC_TCACHE_SIZE / 16, // BIOS, data array in master sh2 - DRC_TCACHE_SIZE / 16, // ... slave + DRC_TCACHE_SIZE * 30 / 32, // ROM (rarely used), DRAM + DRC_TCACHE_SIZE / 32, // BIOS, data array in master sh2 + DRC_TCACHE_SIZE / 32, // ... slave }; static u8 *tcache_bases[TCACHE_BUFFERS]; @@ -332,13 +332,13 @@ struct block_desc { struct block_entry entryp[MAX_BLOCK_ENTRIES]; }; -#define BLOCK_MAX_COUNT(tcid) ((tcid) ? 256 : 16*256) +#define BLOCK_MAX_COUNT(tcid) ((tcid) ? 256 : 32*256) static struct block_desc *block_tables[TCACHE_BUFFERS]; static int block_counts[TCACHE_BUFFERS]; static int block_limit[TCACHE_BUFFERS]; // we have block_link_pool to avoid using mallocs -#define BLOCK_LINK_MAX_COUNT(tcid) ((tcid) ? 1024 : 16*1024) +#define BLOCK_LINK_MAX_COUNT(tcid) ((tcid) ? 
512 : 32*512) static struct block_link *block_link_pool[TCACHE_BUFFERS]; static int block_link_pool_counts[TCACHE_BUFFERS]; static struct block_link **unresolved_links[TCACHE_BUFFERS]; @@ -363,7 +363,7 @@ static struct block_list *inactive_blocks[TCACHE_BUFFERS]; // each array has len: sizeof(mem) / INVAL_PAGE_SIZE static struct block_list **inval_lookup[TCACHE_BUFFERS]; -#define HASH_TABLE_SIZE(tcid) ((tcid) ? 256 : 64*256) +#define HASH_TABLE_SIZE(tcid) ((tcid) ? 512 : 64*512) static struct block_entry **hash_tables[TCACHE_BUFFERS]; #define HASH_FUNC(hash_tab, addr, mask) \ @@ -5188,20 +5188,14 @@ static void sh2_smc_rm_blocks(u32 a, int len, int tcache_id, u32 shift) #endif } -void sh2_drc_wcheck_ram(unsigned int a, unsigned t, SH2 *sh2) +void sh2_drc_wcheck_ram(unsigned int a, unsigned len, SH2 *sh2) { - int off = ((u16) t ? 0 : 2); - int len = ((u16) t ? 2 : 0) + (t >> 16 ? 2 : 0); - - sh2_smc_rm_blocks(a + off, len, 0, SH2_DRCBLK_RAM_SHIFT); + sh2_smc_rm_blocks(a, len, 0, SH2_DRCBLK_RAM_SHIFT); } -void sh2_drc_wcheck_da(unsigned int a, unsigned t, SH2 *sh2) +void sh2_drc_wcheck_da(unsigned int a, unsigned len, SH2 *sh2) { - int off = ((u16) t ? 0 : 2); - int len = ((u16) t ? 2 : 0) + (t >> 16 ? 
2 : 0); - - sh2_smc_rm_blocks(a + off, len, 1 + sh2->is_slave, SH2_DRCBLK_DA_SHIFT); + sh2_smc_rm_blocks(a, len, 1 + sh2->is_slave, SH2_DRCBLK_DA_SHIFT); } int sh2_execute_drc(SH2 *sh2c, int cycles) @@ -6403,6 +6397,9 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, last_btarget = 0; op = 0; // delay/poll insns counter for (i = 0, pc = base_pc; i < i_end; i++, pc += 2) { + int null; + if ((op_flags[i] & OF_BTARGET) && dr_get_entry(pc, is_slave, &null)) + break; // branch target already compiled opd = &ops[i]; crc += FETCH_OP(pc); @@ -6483,7 +6480,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, op ++; // condition 2 #endif } - end_pc = base_pc + i_end * 2; + end_pc = pc; // end_literals is used to decide to inline a literal or not // XXX: need better detection if this actually is used in write diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 94dff8c51..5f374c8cb 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -1,7 +1,7 @@ int sh2_drc_init(SH2 *sh2); void sh2_drc_finish(SH2 *sh2); -void sh2_drc_wcheck_ram(unsigned int a, unsigned val, SH2 *sh2); -void sh2_drc_wcheck_da(unsigned int a, unsigned val, SH2 *sh2); +void sh2_drc_wcheck_ram(unsigned int a, unsigned len, SH2 *sh2); +void sh2_drc_wcheck_da(unsigned int a, unsigned len, SH2 *sh2); #ifdef DRC_SH2 void sh2_drc_mem_setup(SH2 *sh2); diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 39504416f..44bc72d7d 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -162,15 +162,13 @@ void NOINLINE p32x_sh2_poll_event(SH2 *sh2, u32 flags, u32 m68k_cycles) sh2->poll_addr = sh2->poll_cycles = sh2->poll_cnt = 0; } -static void sh2s_sync_on_read(SH2 *sh2) +static void sh2s_sync_on_read(SH2 *sh2, unsigned cycles) { - int cycles; if (sh2->poll_cnt != 0) return; - cycles = sh2_cycles_done(sh2); - if (cycles > 600) - p32x_sync_other_sh2(sh2, sh2->m68krcycles_done + C_SH2_TO_M68K(sh2, cycles)); + if (p32x_sh2_ready(sh2->other_sh2, cycles-250)) + 
p32x_sync_other_sh2(sh2, cycles); } // poll fifo, stores writes to potential addresses used for polling. @@ -271,8 +269,8 @@ u32 REGPARM(3) p32x_sh2_poll_memory16(unsigned int a, u32 d, SH2 *sh2) DRC_SAVE_SR(sh2); // is this a synchronisation address? if(p[(a & 0x3ffff) >> SH2_DRCBLK_RAM_SHIFT] & 0x80) { - sh2s_sync_on_read(sh2); cycles = sh2_cycles_done_m68k(sh2); + sh2s_sync_on_read(sh2, cycles); // check poll fifo and sign-extend the result correctly d = (s16)sh2_poll_read(a, d, cycles, sh2); } @@ -291,8 +289,8 @@ u32 REGPARM(3) p32x_sh2_poll_memory32(unsigned int a, u32 d, SH2 *sh2) DRC_SAVE_SR(sh2); // is this a synchronisation address? if(p[(a & 0x3ffff) >> SH2_DRCBLK_RAM_SHIFT] & 0x80) { - sh2s_sync_on_read(sh2); cycles = sh2_cycles_done_m68k(sh2); + sh2s_sync_on_read(sh2, cycles); // check poll fifo and sign-extend the result correctly d = (sh2_poll_read(a, d >> 16, cycles, sh2) << 16) | ((u16)sh2_poll_read(a+2, d, cycles, sh2)); @@ -729,6 +727,7 @@ static void p32x_vdp_write16(u32 a, u32 d, SH2 *sh2) static u32 p32x_sh2reg_read16(u32 a, SH2 *sh2) { u16 *r = Pico32x.regs; + unsigned cycles; a &= 0x3e; switch (a/2) { @@ -737,8 +736,9 @@ static u32 p32x_sh2reg_read16(u32 a, SH2 *sh2) | Pico32x.sh2irq_mask[sh2->is_slave]; case 0x04/2: // H count (often as comm too) sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 9); - sh2s_sync_on_read(sh2); - return sh2_poll_read(a, Pico32x.sh2_regs[4 / 2], sh2_cycles_done_m68k(sh2), sh2); + cycles = sh2_cycles_done_m68k(sh2); + sh2s_sync_on_read(sh2, cycles); + return sh2_poll_read(a, Pico32x.sh2_regs[4 / 2], cycles, sh2); case 0x06/2: return (r[a / 2] & ~P32XS_FULL) | 0x4000; case 0x08/2: // DREQ src @@ -770,8 +770,9 @@ static u32 p32x_sh2reg_read16(u32 a, SH2 *sh2) case 0x2c/2: case 0x2e/2: sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 9); - sh2s_sync_on_read(sh2); - return sh2_poll_read(a, r[a / 2], sh2_cycles_done_m68k(sh2), sh2); + cycles = sh2_cycles_done_m68k(sh2); + sh2s_sync_on_read(sh2, cycles); + return sh2_poll_read(a, r[a / 
2], cycles, sh2); case 0x30/2: // PWM case 0x32/2: case 0x34/2: @@ -825,7 +826,7 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) unsigned int cycles = sh2_cycles_done_m68k(sh2); Pico32x.sh2_regs[4 / 2] = d; p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); - if (p32x_sh2_ready(sh2->other_sh2, cycles+16)) + if (p32x_sh2_ready(sh2->other_sh2, cycles+8)) sh2_end_run(sh2, 4); sh2_poll_write(a & ~1, d, cycles, sh2); } @@ -852,7 +853,7 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) REG8IN16(r, a) = d; p32x_m68k_poll_event(P32XF_68KCPOLL); p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); - if (p32x_sh2_ready(sh2->other_sh2, cycles+16)) + if (p32x_sh2_ready(sh2->other_sh2, cycles+8)) sh2_end_run(sh2, 1); sh2_poll_write(a & ~1, r[a / 2], cycles, sh2); } @@ -945,7 +946,7 @@ static void p32x_sh2reg_write16(u32 a, u32 d, SH2 *sh2) Pico32x.regs[a / 2] = d; p32x_m68k_poll_event(P32XF_68KCPOLL); p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_CPOLL, cycles); - if (p32x_sh2_ready(sh2->other_sh2, cycles+16)) + if (p32x_sh2_ready(sh2->other_sh2, cycles+8)) sh2_end_run(sh2, 1); sh2_poll_write(a, d, cycles, sh2); } @@ -1580,7 +1581,7 @@ static void sh2_sdram_poll(u32 a, u32 d, SH2 *sh2) cycles = sh2_cycles_done_m68k(sh2); sh2_poll_write(a, d, cycles, sh2); p32x_sh2_poll_event(sh2->other_sh2, SH2_STATE_RPOLL, cycles); - if (p32x_sh2_ready(sh2->other_sh2, cycles+16)) + if (p32x_sh2_ready(sh2->other_sh2, cycles+8)) sh2_end_run(sh2, 1); DRC_RESTORE_SR(sh2); } @@ -1588,27 +1589,25 @@ static void sh2_sdram_poll(u32 a, u32 d, SH2 *sh2) void sh2_sdram_checks(u32 a, u32 d, SH2 *sh2, u32 t) { if (t & 0x80) sh2_sdram_poll(a, d, sh2); - if (t & 0x7f) sh2_drc_wcheck_ram(a, t & 0x7f, sh2); + if (t & 0x7f) sh2_drc_wcheck_ram(a, 2, sh2); } void sh2_sdram_checks_l(u32 a, u32 d, SH2 *sh2, u32 t) { - u32 m = 0x80 | 0x800000; - if (t & 0x000080) sh2_sdram_poll(a, d>>16, sh2); if (t & 0x800000) sh2_sdram_poll(a+2, d, sh2); - if (t & ~m) sh2_drc_wcheck_ram(a, t & 
~m, sh2); + if (t & ~0x800080) sh2_drc_wcheck_ram(a, 4, sh2); } #ifndef _ASM_32X_MEMORY_C static void sh2_da_checks(u32 a, u32 t, SH2 *sh2) { - sh2_drc_wcheck_da(a, t, sh2); + sh2_drc_wcheck_da(a, 2, sh2); } static void sh2_da_checks_l(u32 a, u32 t, SH2 *sh2) { - sh2_drc_wcheck_da(a, t, sh2); + sh2_drc_wcheck_da(a, 4, sh2); } #endif #endif diff --git a/pico/32x/memory_arm.S b/pico/32x/memory_arm.S index b3a94b62a..40707fe7b 100644 --- a/pico/32x/memory_arm.S +++ b/pico/32x/memory_arm.S @@ -139,12 +139,11 @@ sh2_write8_sdram: mov r3, r3, lsl #SH2_RAM_SHIFT strb r1, [ip, r3, lsr #SH2_RAM_SHIFT] #ifdef DRC_SH2 - ldr ip, [r2, #OFS_SH2_p_drcblk_ram] - ldrb r3, [ip, r3, lsr #SH2_RAM_SHIFT+1] + ldr r1, [r2, #OFS_SH2_p_drcblk_ram] + ldrb r3, [r1, r3, lsr #SH2_RAM_SHIFT+1] cmp r3, #0 bxeq lr @ need to load aligned 16 bit data for check - ldr ip, [r2, #OFS_SH2_p_sdram] bic r0, r0, #1 mov r1, r0, lsl #SH2_RAM_SHIFT mov r1, r1, lsr #SH2_RAM_SHIFT @@ -166,6 +165,7 @@ sh2_write8_da: bic r0, r0, #1 cmp r1, #0 bxeq lr + mov r1, #2 b sh2_drc_wcheck_da #else bx lr @@ -206,6 +206,7 @@ sh2_write16_da: ldrb r1, [ip, r3, lsr #1] cmp r1, #0 bxeq lr + mov r1, #2 b sh2_drc_wcheck_da #else bx lr @@ -256,6 +257,7 @@ sh2_write32_da: ldrb ip, [ip, #1] orrs r1, r1, ip, lsl #16 bxeq lr + mov r1, #4 b sh2_drc_wcheck_da #else bx lr From c3fa864a718d3b2ed06aea5dbc8219eb97805f57 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 9 Nov 2019 10:24:52 +0100 Subject: [PATCH 076/174] sh2 drc: moved host register assignment to code emitters, minor bugfixing --- cpu/drc/emit_arm.c | 18 ++- cpu/drc/emit_arm64.c | 16 +- cpu/drc/emit_mips.c | 32 ++-- cpu/drc/emit_x86.c | 29 +++- cpu/sh2/compiler.c | 287 +++++++++--------------------------- cpu/sh2/compiler.h | 4 +- cpu/sh2/sh2.h | 1 + platform/common/disarm.c | 8 +- platform/common/disarm.h | 2 +- platform/common/dismips.c | 12 +- platform/common/dismips.h | 2 +- platform/common/host_dasm.c | 7 +- 12 files changed, 171 insertions(+), 247 deletions(-) diff --git 
a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index ec2958b12..e35d3471f 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -6,9 +6,21 @@ * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. */ -#define HOST_REGS 16 -#define CONTEXT_REG 11 -#define RET_REG 0 +#define HOST_REGS 16 + +// OABI/EABI: params: r0-r3, return: r0-r1, temp: r12,r14, saved: r4-r8,r10,r11 +// SP,PC: r13,r15 must not be used. saved: r9 (for platform use, e.g. on ios) +#define RET_REG 0 +#define PARAM_REGS { 0, 1, 2, 3 } +#ifndef __MACH__ +#define PRESERVED_REGS { 4, 5, 6, 7, 8, 9, 10, 11 } +#else +#define PRESERVED_REGS { 4, 5, 6, 7, 8, 10, 11 } // no r9.. +#endif +#define TEMPORARY_REGS { 12, 14 } + +#define CONTEXT_REG 11 +#define STATIC_SH2_REGS { SHR_SR,10 , SHR_R0,8 , SHR_R0+1,9 } // XXX: tcache_ptr type for SVP and SH2 compilers differs.. #define EMIT_PTR(ptr, x) \ diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 4bad64690..0c36b2bc1 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -6,8 +6,16 @@ * See COPYING file in the top-level directory. 
*/ #define HOST_REGS 32 -#define CONTEXT_REG 19 + +// AAPCS64: params: r0-r7, return: r0-r1, temp: r8-r17, saved: r19-r29 +// reserved: r18 (for platform use) #define RET_REG 0 +#define PARAM_REGS { 0, 1, 2, 3, 4, 5, 6, 7 } +#define PRESERVED_REGS { 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 } +#define TEMPORARY_REGS { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 } + +#define CONTEXT_REG 29 +#define STATIC_SH2_REGS { SHR_SR,28 , SHR_R0,27 , SHR_R0+1,26 } // R31 doesn't exist, it aliases either with zero or SP #define SP 31 // stack pointer @@ -100,9 +108,9 @@ enum { XT_UXTW=0x4, XT_UXTX=0x6, XT_LSL=0x7, XT_SXTW=0xc, XT_SXTX=0xe }; #define A64_NEGS_REG(rd, rm, stype, simm) \ A64_SUBS_REG(rd,Z0,rm,stype,simm) #define A64_NEGC_REG(rd, rm) \ - A64_SBC_REG(rd,Z0,rm,stype,simm) + A64_SBC_REG(rd,Z0,rm) #define A64_NEGCS_REG(rd, rm) \ - A64_SBCS_REG(rd,Z0,rm,stype,simm) + A64_SBCS_REG(rd,Z0,rm) #define A64_CMP_REG(rn, rm, stype, simm) \ A64_SUBS_REG(Z0, rn, rm, stype, simm) #define A64_CMN_REG(rn, rm, stype, simm) \ @@ -145,7 +153,7 @@ enum { XT_UXTW=0x4, XT_UXTX=0x6, XT_LSL=0x7, XT_SXTW=0xc, XT_SXTX=0xe }; A64_INSN(0xd,OP_ADD &3,0x0,_,rm,_,_,rn,rd) #define A64_ADCS_REG(rd, rn, rm) \ A64_INSN(0xd,OP_ADDS&3,0x0,_,rm,_,_,rn,rd) -#define A64_SBC_REG(rd, rn, rm, s) \ +#define A64_SBC_REG(rd, rn, rm) \ A64_INSN(0xd,OP_SUB &3,0x0,_,rm,_,_,rn,rd) #define A64_SBCS_REG(rd, rn, rm) \ A64_INSN(0xd,OP_SUBS&3,0x0,_,rm,_,_,rn,rd) diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index 38d68f40e..832364e9f 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -6,8 +6,17 @@ * See COPYING file in the top-level directory. 
*/ #define HOST_REGS 32 + +// MIPS ABI: params: r4-r7, return: r2-r3, temp: r1(at),r8-r15,r24-r25,r31(ra), +// saved: r16-r23,r30, reserved: r0(zero), r26-r27(irq), r28(gp), r29(sp) +// r1,r15,r24,r25(at,t7-t9) are used internally by the code emitter +#define RET_REG 2 // v0 +#define PARAM_REGS { 4, 5, 6, 7 } // a0-a3 +#define PRESERVED_REGS { 16, 17, 18, 19, 20, 21, 22, 23 } // s0-s7 +#define TEMPORARY_REGS { 2, 3, 8, 9, 10, 11, 12, 13, 14 } // v0-v1,t0-t6 + #define CONTEXT_REG 23 // s7 -#define RET_REG 2 // v0 +#define STATIC_SH2_REGS { SHR_SR,22 , SHR_R0,21 , SHR_R0+1,20 } // NB: the ubiquitous JZ74[46]0 uses MIPS32 Release 1, a slight MIPS II superset @@ -73,7 +82,7 @@ enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; #define MIPS_OP_IMM(op, rt, rs, imm) \ MIPS_INSN(op, rs, rt, _, _, (u16)(imm)) // I-type -// rd = rt OP rs +// rd = rs OP rt #define MIPS_ADD_REG(rd, rs, rt) \ MIPS_OP_REG(FN_ADDU, rd, rs, rt) #define MIPS_SUB_REG(rd, rs, rt) \ @@ -334,7 +343,7 @@ static void *emith_branch(u32 op) #define JMP_EMIT(cond, ptr) { \ u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr) - 4; \ - emith_flush(); /* NO delay slot handling across jump targets */ \ + emith_flush(); /* prohibit delay slot switching across jump targets */ \ EMIT_PTR(ptr, MIPS_BCONDZ(cond_m, cond_r, val_ & 0x0003ffff)); \ } @@ -658,14 +667,19 @@ static void emith_move_imm(int r, uintptr_t imm) EMIT_PTR(ptr_, (*ptr_ & 0xffff0000) | (u16)(s8)(imm)); \ } while (0) -// arithmetic, immediate +// arithmetic, immediate - can only be ADDI[U], since SUBI[U] doesn't exist static void emith_arith_imm(int op, int rd, int rs, u32 imm) { - if ((s16)imm != imm) { + if ((s16)imm == imm) { + if (imm || rd != rs) + EMIT(MIPS_OP_IMM(op, rd, rs, imm)); + } else if ((s32)imm < 0) { + emith_move_r_imm(AT, -imm); + EMIT(MIPS_OP_REG(FN_SUB + (op-OP_ADDI), rd, rs, AT)); + } else { emith_move_r_imm(AT, imm); EMIT(MIPS_OP_REG(FN_ADD + (op-OP_ADDI), rd, rs, AT)); - } else if (imm || rd != rs) - 
EMIT(MIPS_OP_IMM(op, rd, rs, imm)); + } } #define emith_add_r_imm(r, imm) \ @@ -1137,7 +1151,7 @@ static int emith_cond_check(int cond, int *r) // conditions using CZ case DCOND_LS: // C || Z case DCOND_HI: // !C && !Z - EMIT(MIPS_ADD_IMM(AT, FC, (u16)-1)); // !C && !Z + EMIT(MIPS_ADD_IMM(AT, FC, -1)); // !C && !Z EMIT(MIPS_AND_REG(AT, FNZ, AT)); *r = AT, b = (cond == DCOND_HI ? MIPS_BNE : MIPS_BEQ); break; @@ -1161,7 +1175,7 @@ static int emith_cond_check(int cond, int *r) case DCOND_GT: // !(N^V) && !Z EMIT(MIPS_LSR_IMM(AT, FV, 31)); // Nd^V = Nt^Ns^C EMIT(MIPS_XOR_REG(AT, FC, AT)); - EMIT(MIPS_ADD_IMM(AT, AT, (u16)-1)); // !(Nd^V) && !Z + EMIT(MIPS_ADD_IMM(AT, AT, -1)); // !(Nd^V) && !Z EMIT(MIPS_AND_REG(AT, FNZ, AT)); *r = AT, b = (cond == DCOND_GT ? MIPS_BNE : MIPS_BEQ); break; diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 212a12c55..39f3a1d76 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -17,8 +17,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common xR8, xR9, xR10, xR11, xR12, xR13, xR14, xR15 }; // x86-64 only -#define CONTEXT_REG xBP -#define RET_REG xAX +#define CONTEXT_REG xBP +#define RET_REG xAX #define ICOND_JO 0x00 #define ICOND_JNO 0x01 @@ -935,6 +935,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common emith_ret(); \ } while (0) + #define EMITH_JMP_START(cond) { \ u8 *cond_ptr; \ JMP8_POS(cond_ptr) @@ -1006,6 +1007,14 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #ifndef _WIN32 +// SystemV ABI conventions: +// rbx,rbp,r12-r15 are preserved, rax,rcx,rdx,rsi,rdi,r8-r11 are temporaries +// parameters in rdi,rsi,rdx,rcx,r8,r9, return values in rax,rdx +#define PARAM_REGS { xDI, xSI, xDX, xCX, xR8, xR9 } +#define PRESERVED_REGS { xR12, xR13, xR14, xR15, xBX, xBP } +#define TEMPORARY_REGS { xAX, xR10, xR11 } +#define STATIC_SH2_REGS { SHR_SR,xBX , SHR_R0,xR15 } + #define host_arg2reg(rd, arg) \ switch (arg) { \ case 0: rd = xDI; break; \ @@ -1037,6 
+1046,14 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #else // _WIN32 +// M$ ABI conventions: +// rbx,rbp,rsi,rdi,r12-r15 are preserved, rcx,rdx,rax,r8,r9,r10,r11 temporaries +// parameters in rcx,rdx,r8,r9, return values in rax,rdx +#define PARAM_REGS { xCX, xDX, xR8, xR9 } +#define PRESERVED_REGS { xSI, xDI, xR12, xR13, xR14, xR15, xBX, xBP } +#define TEMPORARY_REGS { xAX, xR10, xR11 } +#define STATIC_SH2_REGS { SHR_SR,xBX , SHR_R0,xR15 , SH2_R0+1,xR14 } + #define host_arg2reg(rd, arg) \ switch (arg) { \ case 0: rd = xCX; break; \ @@ -1087,6 +1104,14 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common assert((u32)(rm) < 8u); \ } while (0) +// MS/SystemV ABI: ebx,esi,edi,ebp are preserved, eax,ecx,edx are temporaries +// DRC uses REGPARM to pass upto 3 parameters in registers eax,ecx,edx. +// To avoid conflicts with param passing ebx must be declared temp here. +#define PARAM_REGS { xAX, xDX, xCX } +#define PRESERVED_REGS { xSI, xDI, xBP } +#define TEMPORARY_REGS { xBX } +#define STATIC_SH2_REGS { SHR_SR,xDI , SHR_R0,xSI } + #define host_arg2reg(rd, arg) \ switch (arg) { \ case 0: rd = xAX; break; \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index e9173c4c3..3cf7a0d91 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -427,213 +427,42 @@ typedef struct { static int rcache_get_tmp(void); static void rcache_free_tmp(int hr); -// Note: cache_regs[] must have at least the amount of REG/TEMP registers used -// by handlers in worst case (currently 4). -// Register assignment goes by ABI convention. Caller save registers are TEMP, -// the others are either static or REG. SR must be static, R0 very recommended. +// Note: Register assignment goes by ABI convention. Caller save registers are +// TEMPORARY, the others are PRESERVED. Unusable regs are omitted. 
+// there must be at least the free (not context or statically mapped) amount of +// PRESERVED/TEMPORARY registers used by handlers in worst case (currently 4). +// there must be at least 3 PARAM, and PARAM+TEMPORARY must be at least 4. +// SR and R0 should by all means be statically mapped. // XXX the static definition of SR MUST match that in compiler.h -// VBR, PC, PR must not be static (read from context in utils). -// RET_REG/params should be first TEMPs to avoid allocation conflicts in calls. -// There MUST be at least 3 params and one non-RET_REG/param TEMP. -// XXX shouldn't this be somehow defined in the code emitters? +// PC and PR must not be statically mapped (accessed in context by utils). + #ifdef __arm__ #include "../drc/emit_arm.c" - -static guest_reg_t guest_regs[] = { - // SHR_R0 .. SHR_SP -#ifndef __MACH__ // no r9.. - { GRF_STATIC, 8 }, { GRF_STATIC, 9 }, { 0 } , { 0 } , -#else - { GRF_STATIC, 8 }, { 0 } , { 0 } , { 0 } , -#endif - { 0 } , { 0 } , { 0 } , { 0 } , - { 0 } , { 0 } , { 0 } , { 0 } , - { 0 } , { 0 } , { 0 } , { 0 } , - // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, - // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, - { 0 } , { 0 } , { 0 } , { GRF_STATIC, 10 }, - { 0 } , { 0 } , { 0 } , { 0 } , -}; - -// OABI/EABI: params: r0-r3, return: r0-r1, temp: r12,r14, saved: r4-r8,r10,r11 -// SP,PC: r13,r15 must not be used. saved: r9 (for platform use, e.g. on ios) -static cache_reg_t cache_regs[] = { - { 0, HRT_TEMP }, // RET_REG, params - { 1, HRT_TEMP }, - { 2, HRT_TEMP }, // params - { 3, HRT_TEMP }, - { 12, HRT_TEMP }, // temps - { 14, HRT_TEMP }, - { 8, HRT_STATIC }, // statics -#ifndef __MACH__ // no r9.. - { 9, HRT_STATIC }, -#endif - { 10, HRT_STATIC }, - { 4, HRT_REG }, // other regs - { 5, HRT_REG }, - { 6, HRT_REG }, - { 7, HRT_REG }, -}; - #elif defined(__aarch64__) #include "../drc/emit_arm64.c" - -static guest_reg_t guest_regs[] = { - // SHR_R0 .. 
SHR_SP - { GRF_STATIC,20 }, { GRF_STATIC,21 }, { 0 } , { 0 } , - { 0 } , { 0 } , { 0 } , { 0 } , - { 0 } , { 0 } , { 0 } , { 0 } , - { 0 } , { 0 } , { 0 } , { 0 } , - // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, - // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, - { 0 } , { 0 } , { 0 } , { GRF_STATIC, 22 }, - { 0 } , { 0 } , { 0 } , { 0 } , -}; - -// AAPCS64: params: r0-r7, return: r0-r1, temp: r8-r17, saved: r19-r29 -// saved: r18 (for platform use) -// since drc never needs more than 4 parameters, r4-r7 are treated as temp. -static cache_reg_t cache_regs[] = { - { 0, HRT_TEMP }, // RET_REG, params - { 1, HRT_TEMP }, - { 2, HRT_TEMP }, // params - { 3, HRT_TEMP }, - { 4, HRT_TEMP }, // temps - { 5, HRT_TEMP }, - { 6, HRT_TEMP }, - { 7, HRT_TEMP }, - { 8, HRT_TEMP }, - { 9, HRT_TEMP }, - { 10, HRT_TEMP }, - { 11, HRT_TEMP }, - { 12, HRT_TEMP }, - { 13, HRT_TEMP }, - { 14, HRT_TEMP }, - { 15, HRT_TEMP }, - { 16, HRT_TEMP }, - { 17, HRT_TEMP }, - { 20, HRT_STATIC }, // statics - { 21, HRT_STATIC }, - { 22, HRT_STATIC }, - { 23, HRT_REG }, // other regs - { 24, HRT_REG }, - { 25, HRT_REG }, - { 26, HRT_REG }, - { 27, HRT_REG }, - { 28, HRT_REG }, - { 29, HRT_REG }, -}; - #elif defined(__mips__) #include "../drc/emit_mips.c" - -static guest_reg_t guest_regs[] = { - // SHR_R0 .. 
SHR_SP - {GRF_STATIC, 16} , {GRF_STATIC, 17} , { 0 } , { 0 } , - { 0 } , { 0 } , { 0 } , { 0 } , - { 0 } , { 0 } , { 0 } , { 0 } , - { 0 } , { 0 } , { 0 } , { 0 } , - // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, - // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, - { 0 } , { 0 } , { 0 } , {GRF_STATIC, 18} , - { 0 } , { 0 } , { 0 } , { 0 } , -}; - -// MIPS ABI: params: r4-r7, return: r2-r3, temp: r1(at),r8-r15,r24-r25,r31(ra), -// saved: r16-r23,r30, reserved: r0(zero), r26-r27(irq), r28(gp), r29(sp) -// r1,r15,r24,r25 are used internally by the code emitter -static cache_reg_t cache_regs[] = { - { 2, HRT_TEMP }, // RET_REG (v0-v1) - { 3, HRT_TEMP }, - { 4, HRT_TEMP }, // params (a0-a3) - { 5, HRT_TEMP }, - { 6, HRT_TEMP }, - { 7, HRT_TEMP }, - { 8, HRT_TEMP }, // temps (t0-t6) - { 9, HRT_TEMP }, - { 10, HRT_TEMP }, - { 11, HRT_TEMP }, - { 12, HRT_TEMP }, - { 13, HRT_TEMP }, - { 14, HRT_TEMP }, - { 16, HRT_STATIC }, // statics (s0-s2) - { 17, HRT_STATIC }, - { 18, HRT_STATIC }, - { 19, HRT_REG }, // other regs (s3-s6) - { 20, HRT_REG }, - { 21, HRT_REG }, - { 22, HRT_REG }, -}; - #elif defined(__i386__) #include "../drc/emit_x86.c" - -static guest_reg_t guest_regs[] = { - // SHR_R0 .. SHR_SP - {GRF_STATIC, xSI}, { 0 } , { 0 } , { 0 } , - { 0 } , { 0 } , { 0 } , { 0 } , - { 0 } , { 0 } , { 0 } , { 0 } , - { 0 } , { 0 } , { 0 } , { 0 } , - // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, - // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, - { 0 } , { 0 } , { 0 } , {GRF_STATIC, xDI}, - { 0 } , { 0 } , { 0 } , { 0 } , -}; - -// MS/SystemV ABI: ebx,esi,edi,ebp are preserved, eax,ecx,edx are temporaries -// DRC uses REGPARM to pass upto 3 parameters in registers eax,ecx,edx. -// To avoid conflicts with param passing ebx must be declared temp here. 
-static cache_reg_t cache_regs[] = { - { xAX, HRT_TEMP }, // RET_REG, param - { xDX, HRT_TEMP }, // params - { xCX, HRT_TEMP }, - { xBX, HRT_TEMP }, // temp - { xSI, HRT_STATIC }, // statics - { xDI, HRT_STATIC }, -}; - #elif defined(__x86_64__) #include "../drc/emit_x86.c" - -static guest_reg_t guest_regs[] = { - // SHR_R0 .. SHR_SP - {GRF_STATIC,xR12}, { 0 } , { 0 } , { 0 } , - { 0 } , { 0 } , { 0 } , { 0 } , - { 0 } , { 0 } , { 0 } , { 0 } , - { 0 } , { 0 } , { 0 } , { 0 } , - // SHR_PC, SHR_PPC, SHR_PR, SHR_SR, - // SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, - { 0 } , { 0 } , { 0 } , {GRF_STATIC, xBX}, - { 0 } , { 0 } , { 0 } , { 0 } , -}; - -// M$/SystemV ABI conventions: -// rbx,rbp,r12-r15 are preserved, rcx,rdx,rax,r8,r9,r10,r11 are temporaries -// rsi,rdi are preserved in M$ ABI, temporary in SystemV ABI -// parameters in rcx,rdx,r8,r9, SystemV ABI additionally uses rsi,rdi -static cache_reg_t cache_regs[] = { - { xAX, HRT_TEMP }, // RET_REG - { xDX, HRT_TEMP }, // params - { xCX, HRT_TEMP }, - { xDI, HRT_TEMP }, - { xSI, HRT_TEMP }, - { xR8, HRT_TEMP }, - { xR9, HRT_TEMP }, - { xR10,HRT_TEMP }, // temps - { xR11,HRT_TEMP }, - { xBX, HRT_STATIC }, // statics - { xR12,HRT_STATIC }, - { xR13,HRT_REG }, // other regs - { xR14,HRT_REG }, - { xR15,HRT_REG }, -}; - #else #error unsupported arch #endif +static const signed char hregs_param[] = PARAM_REGS; +static const signed char hregs_temp [] = TEMPORARY_REGS; +static const signed char hregs_saved[] = PRESERVED_REGS; +static const signed char regs_static[] = STATIC_SH2_REGS; + +#define CACHE_REGS \ + (ARRAY_SIZE(hregs_param)+ARRAY_SIZE(hregs_temp)+ARRAY_SIZE(hregs_saved)-1) +static cache_reg_t cache_regs[CACHE_REGS]; + static signed char reg_map_host[HOST_REGS]; +static guest_reg_t guest_regs[SH2_REGS]; + static void REGPARM(1) (*sh2_drc_entry)(SH2 *sh2); static void REGPARM(1) (*sh2_drc_dispatcher)(u32 pc); #if CALL_STACK @@ -884,15 +713,15 @@ static void dr_block_link(struct block_entry *be, struct block_link *bl, 
int emi // via blx: @jump near jumpcc to blx; @blx far jump emith_jump_patch(jump, bl->blx, &jump); emith_jump_at(bl->blx, be->tcache_ptr); - if ((((uintptr_t)bl->blx & 0xf) + emith_jump_at_size()-1) > 0xf) + if ((((uintptr_t)bl->blx & 0x1f) + emith_jump_at_size()-1) > 0x1f) host_instructions_updated(bl->blx, bl->blx + emith_jump_at_size()-1); } } else { printf("unknown BL type %d\n", bl->type); exit(1); } - // only needs sync if patch is possibly crossing cacheline (assume 16 byte) - if ((((uintptr_t)jump & 0xf) + jsz-1) > 0xf) + // only needs sync if patch is possibly crossing cacheline (assume 32 byte) + if ((((uintptr_t)jump & 0x1f) + jsz-1) > 0x1f) host_instructions_updated(jump, jump + jsz-1); } @@ -1653,7 +1482,7 @@ static void gconst_invalidate(void) static u16 rcache_counter; // SH2 register usage bitmasks -static u32 rcache_hregs_reg; // regs of type HRT_REG (for pinning) +static u32 rcache_vregs_reg; // regs of type HRT_REG (for pinning) static u32 rcache_regs_static; // statically allocated regs static u32 rcache_regs_pinned; // pinned regs static u32 rcache_regs_now; // regs used in current insn @@ -2548,30 +2377,60 @@ static void rcache_flush(void) rcache_invalidate(); } -static void rcache_init(void) +static void rcache_create(void) { - static int once = 1; - int i; + int x = 0, i; + + // create cache_regs as host register representation + // RET_REG/params should be first TEMPs to avoid allocation conflicts in calls + cache_regs[x++] = (cache_reg_t) {.hreg = RET_REG, .htype = HRT_TEMP}; + for (i = 0; i < ARRAY_SIZE(hregs_param); i++) + if (hregs_param[i] != RET_REG) + cache_regs[x++] = (cache_reg_t){.hreg = hregs_param[i],.htype = HRT_TEMP}; + + for (i = 0; i < ARRAY_SIZE(hregs_temp); i++) + if (hregs_temp[i] != RET_REG) + cache_regs[x++] = (cache_reg_t){.hreg = hregs_temp[i], .htype = HRT_TEMP}; + + for (i = ARRAY_SIZE(hregs_saved)-1; i >= 0; i--) + if (hregs_saved[i] != CONTEXT_REG) + cache_regs[x++] = (cache_reg_t){.hreg = hregs_saved[i], .htype 
= HRT_REG}; - // init is executed on every rom load, but this must only be executed once... - if (once) { - memset(reg_map_host, -1, sizeof(reg_map_host)); - for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { + if (x != ARRAY_SIZE(cache_regs)) { + printf("rcache_create failed (conflicting register count)\n"); + exit(1); + } + + // mapping from host_register to cache regs index + memset(reg_map_host, -1, sizeof(reg_map_host)); + for (i = 0; i < ARRAY_SIZE(cache_regs); i++) { + if (cache_regs[i].htype) reg_map_host[cache_regs[i].hreg] = i; - if (cache_regs[i].htype == HRT_REG) - rcache_hregs_reg |= (1 << i); - } + if (cache_regs[i].htype == HRT_REG) + rcache_vregs_reg |= (1 << i); + } - for (i = 0; i < ARRAY_SIZE(guest_regs); i++) - if (guest_regs[i].flags & GRF_STATIC) { - rcache_regs_static |= (1 << i); - guest_regs[i].sreg = reg_map_host[guest_regs[i].sreg]; - rcache_hregs_reg &= ~(1 << guest_regs[i].sreg); - } else - guest_regs[i].sreg = -1; - once = 0; + // create static host register mapping for SH2 regs + for (i = 0; i < ARRAY_SIZE(regs_static); i += 2) { + for (x = ARRAY_SIZE(cache_regs)-1; x >= 0; x--) + if (cache_regs[x].hreg == regs_static[i+1]) break; + if (x >= 0) { + guest_regs[regs_static[i]] = (guest_reg_t){.flags = GRF_STATIC,.sreg = x}; + rcache_regs_static |= (1 << regs_static[i]); + rcache_vregs_reg &= ~(1 << x); + } else + guest_regs[regs_static[i]] = (guest_reg_t){.sreg = -1}; } + printf("DRC registers created, %ld host regs (%d REG, %d STATIC, 1 CTX)\n", + CACHE_REGS+1L, count_bits(rcache_vregs_reg),count_bits(rcache_regs_static)); +} + +static void rcache_init(void) +{ + // create DRC data structures + rcache_create(); + rcache_invalidate(); #if DRC_DEBUG & 64 RCACHE_CHECK("after init"); @@ -5038,8 +4897,8 @@ static void sh2_generate_utils(void) emith_move_r_r_ptr(arg0, CONTEXT_REG); emith_call_ctx(offsetof(SH2, irq_callback)); // vector = sh2->irq_callback(sh2, level); // obtain new PC - emith_ctx_read(arg1, SHR_VBR * 4); - 
emith_add_r_r_r_lsl(arg0, arg1, RET_REG, 2); + tmp = rcache_get_reg_arg(1, SHR_VBR, &tmp2); + emith_add_r_r_r_lsl(arg0, tmp2, RET_REG, 2); emith_call(sh2_drc_read32); if (arg0 != RET_REG) emith_move_r_r(arg0, RET_REG); diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 5f374c8cb..415f01ba2 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -38,9 +38,9 @@ unsigned short scan_block(unsigned int base_pc, int is_slave, #if defined(__arm__) #define DRC_SR_REG "r10" #elif defined(__aarch64__) -#define DRC_SR_REG "r22" +#define DRC_SR_REG "r28" #elif defined(__mips__) -#define DRC_SR_REG "s2" +#define DRC_SR_REG "s6" #elif defined(__i386__) #define DRC_SR_REG "edi" #elif defined(__x86_64__) diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index 05ae70524..5f1a88411 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -8,6 +8,7 @@ typedef enum { SHR_R0 = 0, SHR_SP = 15, SHR_PC, SHR_PPC, SHR_PR, SHR_SR, SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, + SH2_REGS // register set size } sh2_reg_e; typedef struct SH2_ diff --git a/platform/common/disarm.c b/platform/common/disarm.c index 80655877a..37fd810e6 100644 --- a/platform/common/disarm.c +++ b/platform/common/disarm.c @@ -435,8 +435,10 @@ static int software_interrupt(unsigned int pc, unsigned int insn, char *buf, siz return 1; } -int disarm(uintptr_t pc, uint32_t insn, char *buf, size_t buf_len) +int disarm(uintptr_t pc, uint32_t insn, char *buf, size_t buf_len, uintptr_t *addr) { + *addr = 0; + if ((insn & 0x0fffffd0) == 0x012fff10) return branch_and_exchange(pc, insn, buf, buf_len); @@ -464,8 +466,10 @@ int disarm(uintptr_t pc, uint32_t insn, char *buf, size_t buf_len) if ((insn & 0x0e000000) == 0x08000000) return block_data_transfer(pc, insn, buf, buf_len); - if ((insn & 0x0e000000) == 0x0a000000) + if ((insn & 0x0e000000) == 0x0a000000) { + *addr = (long)pc + 8 + ((long)(insn << 8) >> 6); return branch(pc, insn, buf, buf_len); + } if ((insn & 0x0e000000) == 0x0c000000) return coprocessor_data_transfer(pc, insn, buf, 
buf_len); diff --git a/platform/common/disarm.h b/platform/common/disarm.h index b8634f682..f11708949 100644 --- a/platform/common/disarm.h +++ b/platform/common/disarm.h @@ -23,6 +23,6 @@ #ifndef DISARM_H #define DISARM_H -int disarm(uintptr_t long pc, uint32_t, char *buf, unsigned int buf_len); +int disarm(uintptr_t pc, uint32_t insn, char *buf, size_t buf_len, uintptr_t *sym); #endif /* DISARM_H */ diff --git a/platform/common/dismips.c b/platform/common/dismips.c index af71b0954..41c0f7a55 100644 --- a/platform/common/dismips.c +++ b/platform/common/dismips.c @@ -274,7 +274,7 @@ static unsigned long j_target(unsigned long pc, uint32_t insn) } // main disassembler function -int dismips(uintptr_t pc, uint32_t insn, char *buf, unsigned int buflen) +int dismips(uintptr_t pc, uint32_t insn, char *buf, size_t buflen, uintptr_t *sym) { const struct insn *pi = decode_insn(insn); char *rs = register_names[(insn >> 21) & 0x1f]; @@ -283,6 +283,7 @@ int dismips(uintptr_t pc, uint32_t insn, char *buf, unsigned int buflen) int sa = (insn >> 6) & 0x1f; int imm = (int16_t) insn; + *sym = 0; if (pi == NULL) { snprintf(buf, buflen, "0x%x", insn); return 0; @@ -314,13 +315,16 @@ int dismips(uintptr_t pc, uint32_t insn, char *buf, unsigned int buflen) snprintf(buf, buflen, "%s %s, %s, %d", pi->name, rd, rt, sa); break; case B_IMM_S: - snprintf(buf, buflen, "%s %s, 0x%lx", pi->name, rs, b_target(pc, insn)); + *sym = b_target(pc, insn); + snprintf(buf, buflen, "%s %s, 0x%lx", pi->name, rs, *sym); break; case B_IMM_TS: - snprintf(buf, buflen, "%s %s, %s, 0x%lx", pi->name, rs, rt, b_target(pc, insn)); + *sym = b_target(pc, insn); + snprintf(buf, buflen, "%s %s, %s, 0x%lx", pi->name, rs, rt, *sym); break; case J_IMM: - snprintf(buf, buflen, "%s 0x%lx", pi->name, j_target(pc, insn)); + *sym = j_target(pc, insn); + snprintf(buf, buflen, "%s 0x%lx", pi->name, *sym); break; case A_IMM_TS: if (abs(imm) < 1000) diff --git a/platform/common/dismips.h b/platform/common/dismips.h index 
e6338defa..b547003b9 100644 --- a/platform/common/dismips.h +++ b/platform/common/dismips.h @@ -1,6 +1,6 @@ #ifndef DISMIPS_H #define DISMIPS_H -int dismips(uintptr_t pc, uint32_t insn, char *buf, unsigned int buf_len); +int dismips(uintptr_t pc, uint32_t insn, char *buf, size_t buf_len, uintptr_t *sym); #endif /* DISMIPS_H */ diff --git a/platform/common/host_dasm.c b/platform/common/host_dasm.c index d0537ef63..fc3cbe677 100644 --- a/platform/common/host_dasm.c +++ b/platform/common/host_dasm.c @@ -46,13 +46,10 @@ void host_dasm(void *addr, int len) insn = *(long *)addr; printf(" %08lx %08lx ", (long)addr, insn); - if(disasm((unsigned)addr, insn, buf, sizeof(buf))) + if(disasm((unsigned)addr, insn, buf, sizeof(buf), &symaddr)) { - symaddr = 0; - if ((insn & 0xe000000) == 0xa000000) { - symaddr = (long)addr + 8 + ((long)(insn << 8) >> 6); + if (symaddr) name = lookup_name((void *)symaddr); - } if (symaddr && name) printf("%s <%s>\n", buf, name); else if (symaddr && !name) From 5be12548d2b47dd787940715d41e2c3134da52df Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 9 Nov 2019 10:30:57 +0100 Subject: [PATCH 077/174] sh2 drc: optimizations for MIPS code emitting --- cpu/drc/emit_arm.c | 48 ++++- cpu/drc/emit_arm64.c | 52 +++++- cpu/drc/emit_mips.c | 408 +++++++++++++++++++++++++++++++----------- cpu/drc/emit_x86.c | 25 +++ cpu/sh2/compiler.c | 409 +++++++++++++++++++++++++++---------------- pico/32x/32x.c | 14 +- 6 files changed, 693 insertions(+), 263 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index e35d3471f..25a2c72fb 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -671,6 +671,8 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) literal_insn[pool_index] += move_offs; } +#define EMITH_HINT_COND(cond) /**/ + #define JMP_POS(ptr) { \ ptr = tcache_ptr; \ EMIT(0,M1(PC),0); \ @@ -721,9 +723,11 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_add_r_r_r_lsl_ptr(d, s1, s2, lslimm) \ 
emith_add_r_r_r_lsl(d, s1, s2, lslimm) +#define emith_adc_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_ADC_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) + #define emith_addf_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_ADD_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSL,lslimm) - #define emith_addf_r_r_r_lsr(d, s1, s2, lslimm) \ EOP_ADD_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSR,lslimm) @@ -733,6 +737,9 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_sub_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_SUB_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) +#define emith_sbc_r_r_r_lsl(d, s1, s2, lslimm) \ + EOP_SBC_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) + #define emith_subf_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_SUB_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSL,lslimm) @@ -741,10 +748,11 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_or_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_ORR_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) +#define emith_or_r_r_r_lsr(d, s1, s2, lsrimm) \ + EOP_ORR_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSR,lsrimm) #define emith_eor_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_EOR_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) - #define emith_eor_r_r_r_lsr(d, s1, s2, lsrimm) \ EOP_EOR_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSR,lsrimm) @@ -753,13 +761,20 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_or_r_r_lsl(d, s, lslimm) \ emith_or_r_r_r_lsl(d, d, s, lslimm) +#define emith_or_r_r_lsr(d, s, lsrimm) \ + emith_or_r_r_r_lsr(d, d, s, lsrimm) +#define emith_eor_r_r_lsl(d, s, lslimm) \ + emith_eor_r_r_r_lsl(d, d, s, lslimm) #define emith_eor_r_r_lsr(d, s, lsrimm) \ emith_eor_r_r_r_lsr(d, d, s, lsrimm) #define emith_add_r_r_r(d, s1, s2) \ emith_add_r_r_r_lsl(d, s1, s2, 0) +#define emith_adc_r_r_r(d, s1, s2) \ + emith_adc_r_r_r_lsl(d, s1, s2, 0) + #define emith_addf_r_r_r(d, s1, s2) \ emith_addf_r_r_r_lsl(d, s1, s2, 0) @@ -769,6 +784,9 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_sub_r_r_r(d, s1, s2) \ 
emith_sub_r_r_r_lsl(d, s1, s2, 0) +#define emith_sbc_r_r_r(d, s1, s2) \ + emith_sbc_r_r_r_lsl(d, s1, s2, 0) + #define emith_subf_r_r_r(d, s1, s2) \ emith_subf_r_r_r_lsl(d, s1, s2, 0) @@ -790,11 +808,17 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_add_r_r_ptr(d, s) \ emith_add_r_r_r(d, d, s) +#define emith_adc_r_r(d, s) \ + emith_adc_r_r_r(d, d, s) + #define emith_sub_r_r(d, s) \ emith_sub_r_r_r(d, d, s) -#define emith_adc_r_r(d, s) \ - EOP_ADC_REG(A_COND_AL,0,d,d,s,A_AM1_LSL,0) +#define emith_sbc_r_r(d, s) \ + emith_sbc_r_r_r(d, d, s) + +#define emith_negc_r_r(d, s) \ + EOP_C_DOP_IMM(A_COND_AL,A_OP_RSC,0,s,d,0,0) #define emith_and_r_r_c(cond, d, s) \ EOP_AND_REG(cond,0,d,d,s,A_AM1_LSL,0) @@ -987,9 +1011,13 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) #define emith_rolcf(d) \ emith_adcf_r_r(d, d) +#define emith_rolc(d) \ + emith_adc_r_r(d, d) #define emith_rorcf(d) \ EOP_MOV_REG(A_COND_AL,1,d,d,A_AM1_ROR,0) /* ROR #0 -> RRX */ +#define emith_rorc(d) \ + EOP_MOV_REG(A_COND_AL,0,d,d,A_AM1_ROR,0) /* ROR #0 -> RRX */ #define emith_negcf_r_r(d, s) \ EOP_C_DOP_IMM(A_COND_AL,A_OP_RSC,1,s,d,0,0) @@ -1329,6 +1357,18 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) } \ } while (0) +#define emith_t_to_carry(srr, is_sub) do { \ + if (is_sub) { \ + int t_ = rcache_get_tmp(); \ + emith_eor_r_r_imm(t_, srr, 1); \ + emith_rorf(t_, t_, 1); \ + rcache_free_tmp(t_); \ + } else { \ + emith_rorf(srr, srr, 1); \ + emith_rol(srr, srr, 1); \ + } \ +} while (0) + #define emith_tpop_carry(sr, is_sub) do { \ if (is_sub) \ emith_eor_r_imm(sr, 1); \ diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 0c36b2bc1..dc0cf5594 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -370,6 +370,8 @@ enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; JMP_EMIT_NC(else_ptr); \ } +#define EMITH_HINT_COND(cond) /**/ + // "simple" jump (no more then a few insns) // ARM32 will use conditional 
instructions here #define EMITH_SJMP_START EMITH_JMP_START @@ -414,6 +416,24 @@ enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; #define emith_addf_r_r_r_lsr(d, s1, s2, simm) \ EMIT(A64_ADDS_REG(d, s1, s2, ST_LSR, simm)) +#define emith_adc_r_r_r_lsl(d, s1, s2, simm) \ + if (simm) { int _t = rcache_get_tmp(); \ + emith_lsl(_t, s2, simm); \ + emith_adc_r_r_r(d, s1, _t); \ + rcache_free_tmp(_t); \ + } else \ + emith_adc_r_r_r(d, s1, s2); \ +} while (0) + +#define emith_sbc_r_r_r_lsl(d, s1, s2, simm) \ + if (simm) { int _t = rcache_get_tmp(); \ + emith_lsl(_t, s2, simm); \ + emith_sbc_r_r_r(d, s1, _t); \ + rcache_free_tmp(_t); \ + } else \ + emith_sbc_r_r_r(d, s1, s2); \ +} while (0) + #define emith_sub_r_r_r_lsl(d, s1, s2, simm) \ EMIT(A64_SUB_REG(d, s1, s2, ST_LSL, simm)) @@ -422,10 +442,11 @@ enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; #define emith_or_r_r_r_lsl(d, s1, s2, simm) \ EMIT(A64_OR_REG(d, s1, s2, ST_LSL, simm)) +#define emith_or_r_r_r_lsr(d, s1, s2, simm) \ + EMIT(A64_OR_REG(d, s1, s2, ST_LSR, simm)) #define emith_eor_r_r_r_lsl(d, s1, s2, simm) \ EMIT(A64_EOR_REG(d, s1, s2, ST_LSL, simm)) - #define emith_eor_r_r_r_lsr(d, s1, s2, simm) \ EMIT(A64_EOR_REG(d, s1, s2, ST_LSR, simm)) @@ -434,7 +455,11 @@ enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; #define emith_or_r_r_lsl(d, s, lslimm) \ emith_or_r_r_r_lsl(d, d, s, lslimm) +#define emith_or_r_r_lsr(d, s, lsrimm) \ + emith_or_r_r_r_lsr(d, d, s, lsrimm) +#define emith_eor_r_r_lsl(d, s, lslimm) \ + emith_eor_r_r_r_lsl(d, d, s, lslimm) #define emith_eor_r_r_lsr(d, s, lsrimm) \ emith_eor_r_r_r_lsr(d, d, s, lsrimm) @@ -472,6 +497,9 @@ enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; #define emith_neg_r_r(d, s) \ EMIT(A64_NEG_REG(d, s, ST_LSL, 0)) +#define emith_negc_r_r(d, s) \ + EMIT(A64_NEGC_REG(d, s)) + #define emith_adc_r_r_r(d, s1, s2) \ EMIT(A64_ADC_REG(d, s1, s2)) @@ -481,6 +509,9 @@ enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; #define emith_adcf_r_r_r(d, s1, s2) \ EMIT(A64_ADCS_REG(d, 
s1, s2)) +#define emith_sbc_r_r_r(d, s1, s2) \ + EMIT(A64_SBC_REG(d, s1, s2)) + #define emith_sbcf_r_r_r(d, s1, s2) \ EMIT(A64_SBCS_REG(d, s1, s2)) @@ -806,12 +837,19 @@ static void emith_log_imm(int op, int wx, int rd, int rn, u32 imm) #define emith_rolcf(d) \ emith_adcf_r_r(d, d) +#define emith_rolc(d) \ + emith_adc_r_r(d, d) #define emith_rorcf(d) do { \ EMIT(A64_RBIT_REG(d, d)); \ emith_adcf_r_r(d, d); \ EMIT(A64_RBIT_REG(d, d)); \ } while (0) +#define emith_rorc(d) do { \ + EMIT(A64_RBIT_REG(d, d)); \ + emith_adc_r_r(d, d); \ + EMIT(A64_RBIT_REG(d, d)); \ +} while (0) // signed/unsigned extend #define emith_clear_msb(d, s, count) /* bits to clear */ \ @@ -1286,6 +1324,18 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) emith_eor_r_imm(sr, 1); \ } while (0) +#define emith_t_to_carry(srr, is_sub) do { \ + if (is_sub) { \ + int t_ = rcache_get_tmp(); \ + emith_eor_r_r_imm(t_, srr, 1); \ + emith_rorf(t_, t_, 1); \ + rcache_free_tmp(t_); \ + } else { \ + emith_rorf(srr, srr, 1); \ + emith_rol(srr, srr, 1); \ + } \ +} while (0) + #define emith_tpop_carry(sr, is_sub) do { \ if (is_sub) \ emith_eor_r_imm(sr, 1); \ diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index 832364e9f..825274742 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -173,15 +173,17 @@ enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; MIPS_OP_REG(FN_JALR,rd,rs,_) // conditional branches; no condition code, these compare rs against rt or Z0 -#define MIPS_BEQ (OP_BEQ << 5) -#define MIPS_BNE (OP_BNE << 5) -#define MIPS_BLE (OP_BLEZ << 5) -#define MIPS_BGT (OP_BGTZ << 5) -#define MIPS_BLT ((OP__RT << 5)|RT_BLTZ) -#define MIPS_BGE ((OP__RT << 5)|RT_BGEZ) -#define MIPS_BGTL ((OP__RT << 5)|RT_BLTZAL) -#define MIPS_BGEL ((OP__RT << 5)|RT_BGEZAL) - +#define MIPS_BEQ (OP_BEQ << 5) // rs == rt (rt in lower 5 bits) +#define MIPS_BNE (OP_BNE << 5) // rs != rt (ditto) +#define MIPS_BLE (OP_BLEZ << 5) // rs <= 0 +#define MIPS_BGT (OP_BGTZ << 
5) // rs > 0 +#define MIPS_BLT ((OP__RT << 5)|RT_BLTZ) // rs < 0 +#define MIPS_BGE ((OP__RT << 5)|RT_BGEZ) // rs >= 0 +#define MIPS_BGTL ((OP__RT << 5)|RT_BLTZAL) // rs > 0, link $ra if jumping +#define MIPS_BGEL ((OP__RT << 5)|RT_BGEZAL) // rs >= 0, link $ra if jumping + +#define MIPS_BCOND(cond, rs, rt, offs16) \ + MIPS_OP_IMM((cond >> 5), rt, rs, (offs16) >> 2) #define MIPS_BCONDZ(cond, rs, offs16) \ MIPS_OP_IMM((cond >> 5), (cond & 0x1f), rs, (offs16) >> 2) #define MIPS_B(offs16) \ @@ -216,25 +218,26 @@ enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; ptr = (void *)((u8 *)(ptr) + sizeof(u32)); \ } while (0) -// FIFO for 2 instructions, for delay slot handling -static u32 emith_last_insns[2] = { -1,-1 }; -static int emith_last_idx, emith_last_cnt; +// FIFO for some instructions, for delay slot handling +#define FSZ 4 +static u32 emith_last_insns[FSZ]; +static unsigned emith_last_idx, emith_last_cnt; #define EMIT_PUSHOP() \ do { \ - emith_last_idx ^= 1; \ - if (emith_last_insns[emith_last_idx] != -1) { \ + if (emith_last_cnt > 0) { \ u32 *p = (u32 *)tcache_ptr - emith_last_cnt; \ - EMIT_PTR(p, emith_last_insns[emith_last_idx]);\ + int idx = (emith_last_idx - emith_last_cnt+1) %FSZ; \ + EMIT_PTR(p, emith_last_insns[idx]);\ emith_last_cnt --; \ } \ - emith_last_insns[emith_last_idx] = -1; \ } while (0) #define EMIT(op) \ do { \ - EMIT_PUSHOP(); \ + if (emith_last_cnt >= FSZ) EMIT_PUSHOP(); \ tcache_ptr = (void *)((u32 *)tcache_ptr + 1); \ + emith_last_idx = (emith_last_idx+1) %FSZ; \ emith_last_insns[emith_last_idx] = op; \ emith_last_cnt ++; \ COUNT_OP; \ @@ -242,7 +245,8 @@ static int emith_last_idx, emith_last_cnt; #define emith_flush() \ do { \ - int i; for (i = 0; i < 2; i++) EMIT_PUSHOP(); \ + while (emith_last_cnt) EMIT_PUSHOP(); \ + emith_flg_hint = _FHV|_FHC; \ } while (0) #define emith_insn_ptr() (u8 *)((u32 *)tcache_ptr - emith_last_cnt) @@ -279,11 +283,12 @@ static int emith_rt(u32 op) return emith_has_(rt,2,op,26,0x3f) ? 
(op>>16)&0x1f : 0; } static int emith_rd(u32 op) - { if ((op>>26) == OP__FN) - return emith_has_(rd,0,op, 0,0x3f) ? (op>>11)&0x1f :-1; + { int ret = emith_has_(rd,2,op,26,0x3f) ? (op>>16)&0x1f :-1; + if ((op>>26) == OP__FN) + ret = emith_has_(rd,0,op, 0,0x3f) ? (op>>11)&0x1f :-1; if ((op>>26) == OP__RT) - return -1; - return emith_has_(rd,2,op,26,0x3f) ? (op>>16)&0x1f :-1; + ret = -1; + return (ret ?: -1); // Z0 doesn't have dependencies } static int emith_b_isswap(u32 bop, u32 lop) @@ -292,48 +297,56 @@ static int emith_b_isswap(u32 bop, u32 lop) return bop; else if (emith_is_jr(bop) && emith_rd(lop) != emith_rs(bop)) return bop; - else if (emith_is_b(bop) && emith_rd(lop) != emith_rs(bop)) + else if (emith_is_b(bop) && emith_rd(lop) != emith_rs(bop) && + emith_rd(lop) != emith_rt(bop)) if ((bop & 0xffff) != 0x7fff) // displacement overflow? return (bop & 0xffff0000) | ((bop+1) & 0x0000ffff); return 0; } +static int emith_insn_swappable(u32 op1, u32 op2) +{ + if (emith_rd(op1) != emith_rd(op2) && + emith_rs(op1) != emith_rd(op2) && emith_rt(op1) != emith_rd(op2) && + emith_rs(op2) != emith_rd(op1) && emith_rt(op2) != emith_rd(op1)) + return 1; + return 0; +} + // emit branch, trying to fill the delay slot with one of the last insns static void *emith_branch(u32 op) { - int idx = emith_last_idx; - u32 op1 = emith_last_insns[idx], op2 = emith_last_insns[idx^1]; - u32 bop = 0; + unsigned idx = emith_last_idx, ds = idx; + u32 bop = 0, sop; void *bp; - - // check last insn (op1) - if (op1 != -1 && op1) - bop = emith_b_isswap(op, op1); - // if not, check older insn (op2); mustn't interact with op1 to overtake - if (!bop && op2 != -1 && op2 && emith_rd(op1) != emith_rd(op2) && - emith_rs(op1) != emith_rd(op2) && emith_rt(op1) != emith_rd(op2) && - emith_rs(op2) != emith_rd(op1) && emith_rt(op2) != emith_rd(op1)) { - idx ^= 1; - bop = emith_b_isswap(op, op2); + int i, j, s; + + // check for ds insn; older mustn't interact with newer ones to overtake + for (i = 0; i < 
emith_last_cnt && !bop; i++) { + ds = (idx-i)%FSZ; + sop = emith_last_insns[ds]; + for (j = i, s = 1; j > 0 && s; j--) + s = emith_insn_swappable(emith_last_insns[(ds+j)%FSZ], sop); + if (s) + bop = emith_b_isswap(op, sop); } - // flush FIFO and branch + // flush FIFO, but omit delay slot insn tcache_ptr = (void *)((u32 *)tcache_ptr - emith_last_cnt); - if (emith_last_insns[idx^1] != -1) - EMIT_PTR(tcache_ptr, emith_last_insns[idx^1]); + idx = (idx-emith_last_cnt+1)%FSZ; + for (i = emith_last_cnt; i > 0; i--, idx = (idx+1)%FSZ) + if (!bop || idx != ds) + EMIT_PTR(tcache_ptr, emith_last_insns[idx]); + emith_last_cnt = 0; + // emit branch and delay slot + bp = tcache_ptr; if (bop) { // can swap - bp = tcache_ptr; EMIT_PTR(tcache_ptr, bop); COUNT_OP; - EMIT_PTR(tcache_ptr, emith_last_insns[idx]); + EMIT_PTR(tcache_ptr, emith_last_insns[ds]); } else { // can't swap - if (emith_last_insns[idx] != -1) - EMIT_PTR(tcache_ptr, emith_last_insns[idx]); - bp = tcache_ptr; EMIT_PTR(tcache_ptr, op); COUNT_OP; EMIT_PTR(tcache_ptr, MIPS_NOP); COUNT_OP; } - emith_last_insns[0] = emith_last_insns[1] = -1; - emith_last_cnt = 0; return bp; } @@ -403,34 +416,56 @@ static void *emith_branch(u32 op) // flag emulation creates 2 (ie cmp #0/beq) up to 9 (ie adcf/ble) extra insns. // flag handling shortcuts may reduce this by 1-4 insns, see emith_cond_check() -static int emith_flg_rs, emith_flg_rt; // registers used in FNZ=rs-rt (cmp_r_r) +static int emith_cmp_rs, emith_cmp_rt; // registers used in cmp_r_r/cmp_r_imm +static s32 emith_cmp_imm; // immediate value used in cmp_r_imm +enum { _FHC=1, _FHV=2 } emith_flg_hint; // C/V flag usage hinted by compiler static int emith_flg_noV; // V flag known not to be set +#define EMITH_HINT_COND(cond) do { \ + /* only need to check cond>>1 since the lowest bit inverts the cond */ \ + unsigned _mv = BITMASK3(DCOND_VS>>1,DCOND_GE>>1,DCOND_GT>>1); \ + unsigned _mc = _mv | BITMASK2(DCOND_HS>>1,DCOND_HI>>1); \ + emith_flg_hint = (_mv & BITMASK1(cond >> 1) ? 
_FHV : 0); \ + emith_flg_hint |= (_mc & BITMASK1(cond >> 1) ? _FHC : 0); \ +} while (0) + // store minimal cc information: rd, rt^rs, carry // NB: the result *must* first go to FNZ, in case rd == rs or rd == rt. // NB: for adcf and sbcf, carry-in must be dealt with separately (see there) -static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub) +static void emith_set_arith_flags(int rd, int rs, int rt, s32 imm, int sub) { - if (sub && rd == FNZ && rt > AT && rs > AT) // is this cmp_r_r? - emith_flg_rs = rs, emith_flg_rt = rt; - else emith_flg_rs = emith_flg_rt = 0; - - if (sub) // C = sub:rt 0) // Nt^Ns - EMIT(MIPS_XOR_REG(FV, rt, rs)); - else if (imm < 0) - EMIT(MIPS_NOR_REG(FV, rt, Z0)); - else if (imm > 0) - EMIT(MIPS_OR_REG(FV, rt, Z0)); // Nt^Ns in FV, bit 31 - else emith_flg_noV = 1; // imm #0, never overflows + if (emith_flg_hint & _FHC) { + if (sub) // C = sub:rt= 0) // Nt^Ns in FV, bit 31 + EMIT(MIPS_XOR_REG(FV, rs, rt)); + else if (imm == 0) + emith_flg_noV = 1; // imm #0 can't overflow + else if ((imm < 0) == !sub) + EMIT(MIPS_NOR_REG(FV, rs, Z0)); + else if ((imm > 0) == !sub) + EMIT(MIPS_OR_REG(FV, rs, Z0)); + } // full V = Nd^Nt^Ns^C calculation is deferred until really needed - if (rd != FNZ) + if (rd && rd != FNZ) EMIT(MIPS_MOVE_REG(rd, FNZ)); // N,Z via result value in FNZ + emith_cmp_rs = emith_cmp_rt = -1; +} + +// since MIPS has less-than and compare-branch insns, handle cmp separately by +// storing the involved regs for later use in one of those MIPS insns. +// This works for all conditions but VC/VS, but this is fortunately never used. 
+static void emith_set_compare_flags(int rs, int rt, s32 imm) +{ + emith_cmp_rt = rt; + emith_cmp_rs = rs; + emith_cmp_imm = imm; } // data processing, register @@ -510,6 +545,13 @@ static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub) } else EMIT(MIPS_OR_REG(d, s1, s2)); \ } while (0) +#define emith_or_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSR_IMM(AT, s2, simm)); \ + EMIT(MIPS_OR_REG(d, s1, AT)); \ + } else EMIT(MIPS_OR_REG(d, s1, s2)); \ +} while (0) + #define emith_eor_r_r_r_lsl(d, s1, s2, simm) do { \ if (simm) { \ EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ @@ -533,7 +575,11 @@ static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub) #define emith_or_r_r_lsl(d, s, lslimm) \ emith_or_r_r_r_lsl(d, d, s, lslimm) +#define emith_or_r_r_lsr(d, s, lsrimm) \ + emith_or_r_r_r_lsr(d, d, s, lsrimm) +#define emith_eor_r_r_lsl(d, s, lslimm) \ + emith_eor_r_r_r_lsl(d, d, s, lslimm) #define emith_eor_r_r_lsr(d, s, lsrimm) \ emith_eor_r_r_r_lsr(d, d, s, lsrimm) @@ -570,13 +616,21 @@ static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub) EMIT(MIPS_NEG_REG(d, s)) #define emith_adc_r_r_r(d, s1, s2) do { \ - emith_add_r_r_r(AT, s1, FC); \ - emith_add_r_r_r(d, AT, s2); \ + emith_add_r_r_r(AT, s2, FC); \ + emith_add_r_r_r(d, s1, AT); \ +} while (0) + +#define emith_sbc_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(AT, s2, FC); \ + emith_sub_r_r_r(d, s1, AT); \ } while (0) #define emith_adc_r_r(d, s) \ emith_adc_r_r_r(d, d, s) +#define emith_negc_r_r(d, s) \ + emith_sbc_r_r_r(d, Z0, s) + // NB: the incoming carry Cin can cause Cout if s2+Cin=0 (or s1+Cin=0 FWIW) // moreover, if s2+Cin=0 caused Cout, s1+s2+Cin=s1+0 can't cause another Cout #define emith_adcf_r_r_r(d, s1, s2) do { \ @@ -606,16 +660,23 @@ static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub) #define emith_eor_r_r(d, s) \ emith_eor_r_r_r(d, d, s) -#define emith_tst_r_r_ptr(d, s) \ - emith_and_r_r_r(FNZ, d, s) +#define 
emith_tst_r_r_ptr(d, s) do { \ + if (d != s) { \ + emith_and_r_r_r(FNZ, d, s); \ + emith_cmp_rs = emith_cmp_rt = -1; \ + } else emith_cmp_rs = s, emith_cmp_rt = Z0; \ +} while (0) #define emith_tst_r_r(d, s) \ emith_tst_r_r_ptr(d, s) -#define emith_teq_r_r(d, s) \ - emith_eor_r_r_r(FNZ, d, s) +#define emith_teq_r_r(d, s) do { \ + emith_eor_r_r_r(FNZ, d, s); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) #define emith_cmp_r_r(d, s) \ - emith_subf_r_r_r(FNZ, d, s) + emith_set_compare_flags(d, s, 0) +// emith_subf_r_r_r(FNZ, d, s) #define emith_addf_r_r(d, s) \ emith_addf_r_r_r(d, d, s) @@ -705,8 +766,8 @@ static void emith_arith_imm(int op, int rd, int rs, u32 imm) emith_adcf_r_r_imm(r, r, imm) #define emith_cmp_r_imm(r, imm) \ - emith_subf_r_r_imm(FNZ, r, (s16)imm) - + emith_set_compare_flags(r, -1, imm) +// emith_subf_r_r_imm(FNZ, r, (s16)imm) #define emith_add_r_r_ptr_imm(d, s, imm) \ emith_arith_imm(OP_ADDIU, d, s, imm) @@ -716,7 +777,7 @@ static void emith_arith_imm(int op, int rd, int rs, u32 imm) #define emith_addf_r_r_imm(d, s, imm) do { \ emith_add_r_r_imm(FNZ, s, imm); \ - emith_set_arith_flags(d, s, 0, imm, 0); \ + emith_set_arith_flags(d, s, -1, imm, 0); \ } while (0) #define emith_adc_r_r_imm(d, s, imm) do { \ @@ -725,11 +786,16 @@ static void emith_arith_imm(int op, int rd, int rs, u32 imm) } while (0) #define emith_adcf_r_r_imm(d, s, imm) do { \ - emith_add_r_r_r(FNZ, s, FC); \ - EMIT(MIPS_SLTU_REG(AT, FNZ, FC)); \ - emith_add_r_r_imm(FNZ, FNZ, imm); \ - emith_set_arith_flags(d, s, 0, imm, 0); \ - emith_or_r_r(FC, AT); \ + if (imm == 0) { \ + emith_add_r_r_r(FNZ, s, FC); \ + emith_set_arith_flags(d, s, -1, 1, 0); \ + } else { \ + emith_add_r_r_r(FNZ, s, FC); \ + EMIT(MIPS_SLTU_REG(AT, FNZ, FC)); \ + emith_add_r_r_imm(FNZ, FNZ, imm); \ + emith_set_arith_flags(d, s, -1, imm, 0); \ + emith_or_r_r(FC, AT); \ + } \ } while (0) // NB: no SUBI in MIPS II, since ADDI takes a signed imm @@ -740,7 +806,7 @@ static void emith_arith_imm(int op, int rd, int 
rs, u32 imm) #define emith_subf_r_r_imm(d, s, imm) do { \ emith_sub_r_r_imm(FNZ, s, imm); \ - emith_set_arith_flags(d, s, 0, imm, 1); \ + emith_set_arith_flags(d, s, -1, imm, 1); \ } while (0) // logical, immediate @@ -777,8 +843,10 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) #define emith_bic_r_imm_c(cond, r, imm) \ emith_bic_r_imm(r, imm) -#define emith_tst_r_imm(r, imm) \ - emith_log_imm(OP_ANDI, FNZ, r, imm) +#define emith_tst_r_imm(r, imm) do { \ + emith_log_imm(OP_ANDI, FNZ, r, imm); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) #define emith_tst_r_imm_c(cond, r, imm) \ emith_tst_r_imm(r, imm) @@ -816,6 +884,17 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) EMIT(MIPS_OR_REG(d, d, AT)); \ } while (0) +#define emith_rorc(d) do { \ + emith_lsr(d, d, 1); \ + emith_lsl(AT, FC, 31); \ + emith_or_r_r(d, AT); \ +} while (0) + +#define emith_rolc(d) do { \ + emith_lsl(d, d, 1); \ + emith_or_r_r(d, FC); \ +} while (0) + // NB: all flag setting shifts make V undefined // NB: mips32r2 has EXT (useful for extracting C) #define emith_lslf(d, s, cnt) do { \ @@ -829,6 +908,7 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) emith_lsl(d, _s, 1); \ } \ emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ } while (0) #define emith_lsrf(d, s, cnt) do { \ @@ -842,6 +922,7 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) emith_lsr(d, _s, 1); \ } \ emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ } while (0) #define emith_asrf(d, s, cnt) do { \ @@ -855,18 +936,21 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) emith_asr(d, _s, 1); \ } \ emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ } while (0) #define emith_rolf(d, s, cnt) do { \ emith_rol(d, s, cnt); \ emith_and_r_r_imm(FC, d, 1); \ emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ } while (0) #define emith_rorf(d, s, cnt) do { \ emith_ror(d, s, cnt); \ emith_lsr(FC, d, 31); \ emith_move_r_r(FNZ, 
d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ } while (0) #define emith_rolcf(d) do { \ @@ -875,6 +959,7 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) emith_or_r_r(d, FC); \ emith_move_r_r(FC, AT); \ emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ } while (0) #define emith_rorcf(d) do { \ @@ -884,6 +969,7 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) emith_or_r_r(d, FC); \ emith_move_r_r(FC, AT); \ emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ } while (0) // signed/unsigned extend @@ -1108,26 +1194,84 @@ static void emith_lohi_nops(void) (((cond) >> 5) == OP__RT ? (cond) ^ 0x01 : (cond) ^ 0x20) // evaluate the emulated condition, returns a register/branch type pair -static int emith_cond_check(int cond, int *r) +static int emith_cmpr_check(int rs, int rt, int cond, int *r) { int b = 0; - // shortcut for comparing 2 registers - if (emith_flg_rs || emith_flg_rt) switch (cond) { - case DCOND_LS: EMIT(MIPS_SLTU_REG(AT, emith_flg_rs, emith_flg_rt)); + // condition check for comparing 2 registers + switch (cond) { + case DCOND_EQ: *r = rs; b = MIPS_BEQ|rt; break; + case DCOND_NE: *r = rs; b = MIPS_BNE|rt; break; + case DCOND_LO: EMIT(MIPS_SLTU_REG(AT, rs, rt)); + *r = AT, b = MIPS_BNE; break; // s < t unsigned + case DCOND_HS: EMIT(MIPS_SLTU_REG(AT, rs, rt)); + *r = AT, b = MIPS_BEQ; break; // s >= t unsigned + case DCOND_LS: EMIT(MIPS_SLTU_REG(AT, rt, rs)); *r = AT, b = MIPS_BEQ; break; // s <= t unsigned - case DCOND_HI: EMIT(MIPS_SLTU_REG(AT, emith_flg_rs, emith_flg_rt)); + case DCOND_HI: EMIT(MIPS_SLTU_REG(AT, rt, rs)); *r = AT, b = MIPS_BNE; break; // s > t unsigned - case DCOND_LT: EMIT(MIPS_SLT_REG(AT, emith_flg_rt, emith_flg_rs)); + case DCOND_LT: if (rt == 0) { *r = rs, b = MIPS_BLT; break; } // s < 0 + EMIT(MIPS_SLT_REG(AT, rs, rt)); *r = AT, b = MIPS_BNE; break; // s < t - case DCOND_GE: EMIT(MIPS_SLT_REG(AT, emith_flg_rt, emith_flg_rs)); + case DCOND_GE: if (rt == 0) { *r = rs, b = MIPS_BGE; 
break; } // s >= 0 + EMIT(MIPS_SLT_REG(AT, rs, rt)); *r = AT, b = MIPS_BEQ; break; // s >= t - case DCOND_LE: EMIT(MIPS_SLT_REG(AT, emith_flg_rs, emith_flg_rt)); + case DCOND_LE: if (rt == 0) { *r = rs, b = MIPS_BLE; break; } // s <= 0 + EMIT(MIPS_SLT_REG(AT, rt, rs)); *r = AT, b = MIPS_BEQ; break; // s <= t - case DCOND_GT: EMIT(MIPS_SLT_REG(AT, emith_flg_rs, emith_flg_rt)); + case DCOND_GT: if (rt == 0) { *r = rs, b = MIPS_BGT; break; } // s > 0 + EMIT(MIPS_SLT_REG(AT, rt, rs)); *r = AT, b = MIPS_BNE; break; // s > t } + return b; +} + +static int emith_cmpi_check(int rs, s32 imm, int cond, int *r) +{ + int b = 0; + + // condition check for comparing register with immediate + if (imm == 0) return emith_cmpr_check(rs, Z0, cond, r); + switch (cond) { + case DCOND_EQ: emith_move_r_imm(AT, imm); + *r = rs; b = MIPS_BEQ|AT; break; + case DCOND_NE: emith_move_r_imm(AT, imm); + *r = rs; b = MIPS_BNE|AT; break; + case DCOND_LO: EMIT(MIPS_SLTU_IMM(AT, rs, imm)); + *r = AT, b = MIPS_BNE; break; // s < imm unsigned + case DCOND_HS: EMIT(MIPS_SLTU_IMM(AT, rs, imm)); + *r = AT, b = MIPS_BEQ; break; // s >= imm unsigned + case DCOND_LS: emith_move_r_imm(AT, imm); + EMIT(MIPS_SLTU_REG(AT, AT, rs)); + *r = AT, b = MIPS_BEQ; break; // s <= imm unsigned + case DCOND_HI: emith_move_r_imm(AT, imm); + EMIT(MIPS_SLTU_REG(AT, AT, rs)); + *r = AT, b = MIPS_BNE; break; // s > imm unsigned + case DCOND_LT: EMIT(MIPS_SLT_IMM(AT, rs, imm)); + *r = AT, b = MIPS_BNE; break; // s < imm + case DCOND_GE: EMIT(MIPS_SLT_IMM(AT, rs, imm)); + *r = AT, b = MIPS_BEQ; break; // s >= imm + case DCOND_LE: emith_move_r_imm(AT, imm); + EMIT(MIPS_SLT_REG(AT, AT, rs)); + *r = AT, b = MIPS_BEQ; break; // s <= imm + case DCOND_GT: emith_move_r_imm(AT, imm); + EMIT(MIPS_SLT_REG(AT, AT, rs)); + *r = AT, b = MIPS_BNE; break; // s > imm + } + return b; +} + +static int emith_cond_check(int cond, int *r) +{ + int b = 0; + + if (emith_cmp_rs >= 0) { + if (emith_cmp_rt != -1) + b = 
emith_cmpr_check(emith_cmp_rs,emith_cmp_rt, cond,r); + else b = emith_cmpi_check(emith_cmp_rs,emith_cmp_imm,cond,r); + } + // shortcut for V known to be 0 if (!b && emith_flg_noV) switch (cond) { case DCOND_VS: *r = Z0; b = MIPS_BNE; break; // never @@ -1373,8 +1517,10 @@ static int emith_cond_check(int cond, int *r) #define emith_sh2_div1_step(rn, rm, sr) do { \ emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ EMITH_JMP3_START(DCOND_EQ); \ + EMITH_HINT_COND(DCOND_CS); \ emith_addf_r_r(rn, rm); \ EMITH_JMP3_MID(DCOND_EQ); \ + EMITH_HINT_COND(DCOND_CS); \ emith_subf_r_r(rn, rm); \ EMITH_JMP3_END(); \ emith_eor_r_r(sr, FC); \ @@ -1433,23 +1579,27 @@ static int emith_cond_check(int cond, int *r) } while (0) #define emith_write_sr(sr, srcr) do { \ - emith_lsr(sr, sr, 10); \ - emith_or_r_r_r_lsl(sr, sr, srcr, 22); \ - emith_ror(sr, sr, 22); \ + emith_lsr(sr, sr , 10); emith_lsl(sr, sr, 10); \ + emith_lsl(AT, srcr, 22); emith_lsr(AT, AT, 22); \ + emith_or_r_r(sr, AT); \ +} while (0) + +#define emith_carry_to_t(sr, is_sub) do { \ + emith_and_r_imm(sr, 0xfffffffe); \ + emith_or_r_r(sr, FC); \ } while (0) -#define emith_carry_to_t(srr, is_sub) do { \ - emith_lsr(sr, sr, 1); \ - emith_adc_r_r(sr, sr); \ +#define emith_t_to_carry(sr, is_sub) do { \ + emith_and_r_r_imm(FC, sr, 1); \ } while (0) #define emith_tpop_carry(sr, is_sub) do { \ emith_and_r_r_imm(FC, sr, 1); \ - emith_lsr(sr, sr, 1); \ + emith_eor_r_r(sr, FC); \ } while (0) #define emith_tpush_carry(sr, is_sub) \ - emith_adc_r_r(sr, sr) + emith_or_r_r(sr, FC) #ifdef T // T bit handling @@ -1463,9 +1613,61 @@ static void emith_clr_t_cond(int sr) static void emith_set_t_cond(int sr, int cond) { - EMITH_SJMP_START(emith_invert_cond(cond)); - emith_or_r_imm_c(cond, sr, T); - EMITH_SJMP_END(emith_invert_cond(cond)); + int b, r; + u8 *ptr; + u32 val = 0, inv = 0; + + // try to avoid jumping around if possible + if (emith_cmp_rs >= 0) { + if (emith_cmp_rt >= 0) + b = emith_cmpr_check(emith_cmp_rs, emith_cmp_rt, cond, &r); + else + 
b = emith_cmpi_check(emith_cmp_rs, emith_cmp_imm, cond, &r); + + // XXX this relies on the inner workings of cmp_check... + if (r == AT) + // result of slt check which returns either 0 or 1 in AT + val++, inv = (b == MIPS_BEQ); + } else { + b = emith_cond_check(cond, &r); + if (r == Z0) { + if (b == MIPS_BEQ || b == MIPS_BLE || b == MIPS_BGE) + emith_or_r_imm(sr, T); + return; + } else if (r == FC) + val++, inv = (b == MIPS_BEQ); + } + + if (!val) switch (b) { // cases: b..z r, aka cmp r,Z0 or cmp r,#0 + case MIPS_BEQ: EMIT(MIPS_SLTU_IMM(AT, r, 1)); r=AT; val++; break; + case MIPS_BNE: EMIT(MIPS_SLTU_REG(AT,Z0, r)); r=AT; val++; break; + case MIPS_BLT: EMIT(MIPS_SLT_REG(AT, r, Z0)); r=AT; val++; break; + case MIPS_BGE: EMIT(MIPS_SLT_REG(AT, r, Z0)); r=AT; val++; inv++; break; + case MIPS_BLE: EMIT(MIPS_SLT_REG(AT, Z0, r)); r=AT; val++; inv++; break; + case MIPS_BGT: EMIT(MIPS_SLT_REG(AT, Z0, r)); r=AT; val++; break; + default: // cases: beq/bne r,s, aka cmp r,s + if ((b>>5) == OP_BEQ) { + EMIT(MIPS_XOR_REG(AT, r, b&0x1f)); + EMIT(MIPS_SLTU_IMM(AT,AT, 1)); r=AT; val++; break; + } else if ((b>>5) == OP_BNE) { + EMIT(MIPS_XOR_REG(AT, r, b&0x1f)); + EMIT(MIPS_SLTU_IMM(AT,Z0,AT)); r=AT; val++; break; + } + } + if (val) { + emith_or_r_r(sr, r); + if (inv) + emith_eor_r_imm(sr, T); + return; + } + + // can't obtain result directly, use presumably slower jump !cond + or sr,T + b = emith_invert_branch(b); + ptr = emith_branch(MIPS_BCONDZ(b, r, 0)); + emith_or_r_imm(sr, T); + emith_flush(); // prohibit delay slot switching across jump targets + val = (u8 *)tcache_ptr - (u8 *)(ptr) - 4; + EMIT_PTR(ptr, MIPS_BCONDZ(b, r, val & 0x0003ffff)); } #define emith_get_t_cond() -1 diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 39f3a1d76..e7284499c 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -340,11 +340,29 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common rcache_free_tmp(tmp_); \ } else emith_or_r_r_r(d, s1, s2); \ } while (0) 
+#define emith_or_r_r_r_lsr(d, s1, s2, lsrimm) do { \ + if (lsrimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsr(tmp_, s2, lsrimm); \ + emith_or_r_r_r(d, s1, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_or_r_r_r(d, s1, s2); \ +} while (0) // _r_r_shift #define emith_or_r_r_lsl(d, s, lslimm) \ emith_or_r_r_r_lsl(d, d, s, lslimm) +#define emith_or_r_r_lsr(d, s, lsrimm) \ + emith_or_r_r_r_lsr(d, d, s, lsrimm) +#define emith_eor_r_r_lsl(d, s, lslimm) do { \ + if (lslimm) { \ + int tmp_ = rcache_get_tmp(); \ + emith_lsl(tmp_, s, lslimm); \ + emith_eor_r_r(d, tmp_); \ + rcache_free_tmp(tmp_); \ + } else emith_eor_r_r(d, s); \ +} while (0) #define emith_eor_r_r_lsr(d, s, lsrimm) do { \ if (lsrimm) { \ int tmp_ = rcache_get_tmp(); \ @@ -972,6 +990,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define EMITH_SJMP2_END(cond) \ EMITH_SJMP3_END() +#define EMITH_HINT_COND(cond) /**/ + #define emith_pass_arg_r(arg, reg) do { \ int rd = 7; \ host_arg2reg(rd, arg); \ @@ -1255,6 +1275,11 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common emith_rol(sr, sr, 1); \ } while (0) +#define emith_t_to_carry(sr, is_sub) do { \ + emith_ror(sr, sr, 1); \ + emith_rol(sr, sr, 1); \ +} while (0) + #define emith_tpop_carry(sr, is_sub) \ emith_lsr(sr, sr, 1) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 3cf7a0d91..2320c5010 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -45,6 +45,7 @@ #define REMAP_REGISTER 1 #define LOOP_DETECTION 1 #define LOOP_OPTIMIZER 1 +#define T_OPTIMIZER 1 // limits (per block) #define MAX_BLOCK_SIZE (BLOCK_INSN_LIMIT * 6 * 6) @@ -108,7 +109,7 @@ static int insns_compiled, hash_collisions, host_insn_count; #define GET_Rn() \ ((op >> 8) & 0x0f) -#define SHR_T SHR_SR // might make them separate someday +#define SHR_T 30 // separate T for not-used detection #define SHR_MEM 31 #define SHR_TMP -1 @@ -122,6 +123,7 @@ static int insns_compiled, hash_collisions, host_insn_count; #define 
I_SHIFT 4 #define Q_SHIFT 8 #define M_SHIFT 9 +#define T_SHIFT 11 static struct op_data { u8 op; @@ -263,7 +265,6 @@ static void REGPARM(3) *sh2_drc_log_entry(void *block, SH2 *sh2, u32 sr) return block; } #endif -// } debug #define TCACHE_BUFFERS 3 @@ -1527,7 +1528,7 @@ static void rcache_unmap_vreg(int x) FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, i, if (guest_regs[i].flags & GRF_DIRTY) { // if a dirty reg is unmapped save its value to context - if (~rcache_regs_discard & (1 << i)) + if ((~rcache_regs_discard | rcache_regs_now) & (1 << i)) emith_ctx_write(cache_regs[x].hreg, i * 4); guest_regs[i].flags &= ~GRF_DIRTY; } @@ -1565,7 +1566,7 @@ static void rcache_clean_vreg(int x) if (guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) { if (guest_regs[r].vreg != guest_regs[r].sreg && !cache_regs[guest_regs[r].sreg].locked && - (~rcache_regs_discard & (1 << r)) && + ((~rcache_regs_discard | rcache_regs_now) & (1 << r)) && !(rns & cache_regs[guest_regs[r].sreg].gregs)) { // statically mapped reg not in its sreg. move back to sreg rcache_evict_vreg(guest_regs[r].sreg); @@ -1578,7 +1579,7 @@ static void rcache_clean_vreg(int x) // cannot remap. 
keep dirty for writeback in unmap cache_regs[x].flags |= HRF_DIRTY; } else { - if (~rcache_regs_discard & (1 << r)) + if ((~rcache_regs_discard | rcache_regs_now) & (1 << r)) emith_ctx_write(cache_regs[x].hreg, r * 4); guest_regs[r].flags &= ~GRF_DIRTY; } @@ -1875,9 +1876,22 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr if ((guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) && guest_regs[r].sreg == dst && !tr->locked) { // split aliases if r is STATIC in sreg and dst isn't already locked - rcache_lock_vreg(dst); // lock to avoid evicting dst - x = rcache_allocate_vreg(rsp_d & ali); - rcache_unlock_vreg(dst); + int t; + FOR_ALL_BITS_SET_DO(ali, t, + if ((guest_regs[t].flags & (GRF_STATIC|GRF_PINNED)) && + !(ali & ~(1 << t)) && + !cache_regs[guest_regs[t].sreg].locked && + !(rsp_d & cache_regs[guest_regs[t].sreg].gregs)) { + // alias is a single STATIC and its sreg is available + x = guest_regs[t].sreg; + rcache_evict_vreg(x); + } else { + rcache_lock_vreg(dst); // lock to avoid evicting dst + x = rcache_allocate_vreg(rsp_d & ali); + rcache_unlock_vreg(dst); + } + break; + ) if (x >= 0) { src = x; rcache_move_vreg(src, dst); @@ -2855,11 +2869,11 @@ static void emit_do_static_regs(int is_write, int tmpr) } #define DELAY_SAVE_T(sr) { \ + int t_ = rcache_get_tmp(); \ emith_bic_r_imm(sr, T_save); \ - emith_tst_r_imm(sr, T); \ - EMITH_SJMP_START(DCOND_EQ); \ - emith_or_r_imm_c(DCOND_NE, sr, T_save); \ - EMITH_SJMP_END(DCOND_EQ); \ + emith_and_r_r_imm(t_, sr, 1); \ + emith_or_r_r_lsl(sr, t_, T_SHIFT); \ + rcache_free_tmp(t_); \ } #define FLUSH_CYCLES(sr) \ @@ -2961,6 +2975,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) ADD_TO_ARRAY(branch_target_pc, branch_target_count, pc, ); if (ops[i].op == OP_LDC && (ops[i].dest & BITMASK1(SHR_SR)) && pc+2 < end_pc) op_flags[i+1] |= OF_BTARGET; // RTE entrypoint in case of SR.IMASK change + // unify T and SR since rcache doesn't know about "virtual" guest regs + if (ops[i].source 
& BITMASK1(SHR_T)) ops[i].source |= BITMASK1(SHR_SR); + if (ops[i].dest & BITMASK1(SHR_T)) ops[i].dest |= BITMASK1(SHR_SR); #if LOOP_DETECTION // loop types detected: // 1. target: ... BRA target -> idle loop @@ -3014,15 +3031,15 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) drcf.pending_branch_indirect = 1; // conditions g,h - cond.branch // poll/idle loops terminate with their backwards branch to the loop start if (drcf.pending_branch_direct && !(op_flags[i+1] & OF_DELAY_OP)) { - m2 &= ~(m1 | BITMASK2(SHR_PC, SHR_SR)); // conditions d,e + g,h + m2 &= ~(m1 | BITMASK3(SHR_PC, SHR_SR, SHR_T)); // conditions d,e + g,h if (m2 || ((op == OF_IDLE_LOOP) == (drcf.pending_branch_indirect))) op = 0; // conditions not met op_flags[v] = (op_flags[v] & ~OF_LOOP) | op; // set loop type drcf.loop_type = 0; #if LOOP_OPTIMIZER if (op_flags[v] & OF_BASIC_LOOP) { - m3 &= ~rcache_regs_static & ~BITMASK4(SHR_PC, SHR_PR, SHR_SR, SHR_MEM); - if (m3 && count_bits(m3) < count_bits(rcache_hregs_reg) && + m3 &= ~rcache_regs_static & ~BITMASK5(SHR_PC, SHR_PR, SHR_SR, SHR_T, SHR_MEM); + if (m3 && count_bits(m3) < count_bits(rcache_vregs_reg) && pinned_loop_count < ARRAY_SIZE(pinned_loop_pc)-1) { pinned_loop_mask[pinned_loop_count] = m3; pinned_loop_pc[pinned_loop_count++] = base_pc + 2*v; @@ -3154,48 +3171,63 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_free_tmp(tmp3); #endif + // check cycles + sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); + #if LOOP_OPTIMIZER if (op_flags[i] & OF_BASIC_LOOP) { if (pinned_loop_pc[pinned_loop_count] == pc) { // pin needed regs on loop entry FOR_ALL_BITS_SET_DO(pinned_loop_mask[pinned_loop_count], v, rcache_pin_reg(v)); emith_flush(); + // store current PC as loop target pinned_loop_ptr[pinned_loop_count] = tcache_ptr; } else op_flags[i] &= ~OF_BASIC_LOOP; } -#endif - // check cycles - sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); - emith_cmp_r_imm(sr, 0); - -#if LOOP_OPTIMIZER - void *jp = NULL; if 
(op_flags[i] & OF_BASIC_LOOP) { // if exiting a pinned loop pinned regs must be written back to ctx // since they are reloaded in the loop entry code - jp = tcache_ptr; - emith_jump_cond_patchable(DCOND_GT, jp); // XXX need API for JMP_POS + emith_cmp_r_imm(sr, 0); + EMITH_JMP_START(DCOND_GT); rcache_save_pinned(); - } + + if (blx_target_count < ARRAY_SIZE(blx_target_pc)) { + // exit via stub in blx table (saves some 1-3 insns in the main flow) + blx_target_ptr[blx_target_count] = tcache_ptr; + blx_target_pc[blx_target_count] = pc|1; + blx_target_bl[blx_target_count++] = NULL; + emith_jump_patchable(tcache_ptr); + } else { + // blx table full, must inline exit code + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, pc); + emith_jump(sh2_drc_exit); + rcache_free_tmp(tmp); + } + EMITH_JMP_END(DCOND_GT); + } else #endif - if (blx_target_count < ARRAY_SIZE(blx_target_pc)) { - // exit via stub in blx table (saves some 1-3 insns in the main flow) - blx_target_pc[blx_target_count] = pc|1; - blx_target_bl[blx_target_count] = NULL; - blx_target_ptr[blx_target_count++] = tcache_ptr; - } else { - // blx table full, must inline exit code - tmp = rcache_get_tmp_arg(0); - emith_move_r_imm_c(DCOND_LE, tmp, pc); - rcache_free_tmp(tmp); + { + if (blx_target_count < ARRAY_SIZE(blx_target_pc)) { + // exit via stub in blx table (saves some 1-3 insns in the main flow) + blx_target_pc[blx_target_count] = pc|1; + blx_target_bl[blx_target_count] = NULL; + emith_cmp_r_imm(sr, 0); + blx_target_ptr[blx_target_count++] = tcache_ptr; + emith_jump_cond_patchable(DCOND_LE, tcache_ptr); + } else { + // blx table full, must inline exit code + tmp = rcache_get_tmp_arg(0); + emith_cmp_r_imm(sr, 0); + EMITH_SJMP_START(DCOND_GT); + emith_move_r_imm_c(DCOND_LE, tmp, pc); + emith_jump_cond(DCOND_LE, sh2_drc_exit); + EMITH_SJMP_END(DCOND_GT); + rcache_free_tmp(tmp); + } } - emith_jump_cond_patchable(DCOND_LE, tcache_ptr); -#if LOOP_OPTIMIZER - if (op_flags[i] & OF_BASIC_LOOP) - emith_jump_patch(jp, 
tcache_ptr, NULL); -#endif #if (DRC_DEBUG & 32) // block hit counter @@ -3328,7 +3360,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_set_usage_now(opd[0].source); // current insn rcache_set_usage_soon(soon); // insns 1-4 rcache_set_usage_late(late & ~soon); // insns 5-9 - rcache_set_usage_discard(write & ~(late|soon|opd[0].source)); + rcache_set_usage_discard(write & ~(late|soon)); if (v <= 9) // upcoming rcache_flush, start writing back unused dirty stuff rcache_clean_masked(rcache_dirty_mask() & ~(write|opd[0].dest)); @@ -3512,11 +3544,17 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { case 0: // CLRT 0000000000001000 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_set_t(sr, 0); +#if T_OPTIMIZER + if (~rcache_regs_discard & BITMASK1(SHR_T)) +#endif + emith_set_t(sr, 0); break; case 1: // SETT 0000000000011000 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_set_t(sr, 1); +#if T_OPTIMIZER + if (~rcache_regs_discard & BITMASK1(SHR_T)) +#endif + emith_set_t(sr, 1); break; case 2: // CLRMAC 0000000000101000 emit_move_r_imm32(SHR_MACL, 0); @@ -3602,20 +3640,16 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); + tmp = rcache_get_tmp(); emith_invalidate_t(); emith_bic_r_imm(sr, M|Q|T); - emith_tst_r_imm(tmp2, (1<<31)); - EMITH_SJMP_START(DCOND_EQ); - emith_or_r_imm_c(DCOND_NE, sr, Q); - EMITH_SJMP_END(DCOND_EQ); - emith_tst_r_imm(tmp3, (1<<31)); - EMITH_SJMP_START(DCOND_EQ); - emith_or_r_imm_c(DCOND_NE, sr, M); - EMITH_SJMP_END(DCOND_EQ); - emith_teq_r_r(tmp2, tmp3); - EMITH_SJMP_START(DCOND_PL); - emith_or_r_imm_c(DCOND_MI, sr, T); - EMITH_SJMP_END(DCOND_PL); + emith_lsr(tmp, tmp2, 31); // Q = Nn + emith_or_r_r_lsl(sr, tmp, Q_SHIFT); + emith_lsr(tmp, tmp3, 31); // M = Nm + emith_or_r_r_lsl(sr, tmp, M_SHIFT); + emith_eor_r_r_lsr(tmp, tmp2, 31); + 
emith_or_r_r(sr, tmp); // T = Q^M + rcache_free(tmp); goto end_op; case 0x08: // TST Rm,Rn 0010nnnnmmmm1000 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); @@ -3708,26 +3742,27 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); - emith_clr_t_cond(sr); - emith_cmp_r_r(tmp2, tmp3); switch (op & 0x07) { case 0x00: // CMP/EQ - emith_set_t_cond(sr, DCOND_EQ); + tmp = DCOND_EQ; break; case 0x02: // CMP/HS - emith_set_t_cond(sr, DCOND_HS); + tmp = DCOND_HS; break; case 0x03: // CMP/GE - emith_set_t_cond(sr, DCOND_GE); + tmp = DCOND_GE; break; case 0x06: // CMP/HI - emith_set_t_cond(sr, DCOND_HI); + tmp = DCOND_HI; break; case 0x07: // CMP/GT - emith_set_t_cond(sr, DCOND_GT); + tmp = DCOND_GT; break; } + emith_clr_t_cond(sr); + emith_cmp_r_r(tmp2, tmp3); + emith_set_t_cond(sr, tmp); goto end_op; case 0x04: // DIV1 Rm,Rn 0011nnnnmmmm0100 // Q1 = carry(Rn = (Rn << 1) | T) @@ -3738,29 +3773,27 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // Q = M ^ Q1 ^ Q2 // T = (Q == M) = !(Q ^ M) = !(Q1 ^ Q2) tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp4); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_sync_t(sr); + EMITH_HINT_COND(DCOND_CS); emith_tpop_carry(sr, 0); - emith_adcf_r_r_r(tmp2, tmp, tmp); + emith_adcf_r_r_r(tmp2, tmp4, tmp4); emith_tpush_carry(sr, 0); // keep Q1 in T for now - rcache_free(tmp); - tmp4 = rcache_get_tmp(); - emith_and_r_r_imm(tmp4, sr, M); - emith_eor_r_r_lsr(sr, tmp4, M_SHIFT - Q_SHIFT); // Q ^= M - rcache_free_tmp(tmp4); + rcache_free(tmp4); + tmp = rcache_get_tmp(); + emith_and_r_r_imm(tmp, sr, M); + emith_eor_r_r_lsr(sr, tmp, M_SHIFT - Q_SHIFT); // Q ^= M + rcache_free_tmp(tmp); // add or sub, invert T if carry to get Q1 ^ Q2 // in: (Q ^ M) passed in Q, Q1 in T 
emith_sh2_div1_step(tmp2, tmp3, sr); - emith_bic_r_imm(sr, Q); - emith_tst_r_imm(sr, M); - EMITH_SJMP_START(DCOND_EQ); - emith_or_r_imm_c(DCOND_NE, sr, Q); // Q = M - EMITH_SJMP_END(DCOND_EQ); - emith_tst_r_imm(sr, T); - EMITH_SJMP_START(DCOND_EQ); - emith_eor_r_imm_c(DCOND_NE, sr, Q); // Q = M ^ Q1 ^ Q2 - EMITH_SJMP_END(DCOND_EQ); + tmp = rcache_get_tmp(); + emith_bic_r_imm(sr, Q); // Q = M + emith_and_r_r_imm(tmp, sr, M); + emith_or_r_r_lsr(sr, tmp, M_SHIFT - Q_SHIFT); + emith_and_r_r_imm(tmp, sr, T); // Q = M ^ Q1 ^ Q2 + emith_eor_r_r_lsl(sr, tmp, Q_SHIFT); emith_eor_r_imm(sr, T); // T = !(Q1 ^ Q2) goto end_op; case 0x05: // DMULU.L Rm,Rn 0011nnnnmmmm0101 @@ -3791,14 +3824,28 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_sync_t(sr); - if (op & 4) { // adc - emith_tpop_carry(sr, 0); - emith_adcf_r_r_r(tmp, tmp3, tmp2); - emith_tpush_carry(sr, 0); - } else { - emith_tpop_carry(sr, 1); - emith_sbcf_r_r_r(tmp, tmp3, tmp2); - emith_tpush_carry(sr, 1); +#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) { + if (op & 4) { + emith_t_to_carry(sr, 0); + emith_adc_r_r_r(tmp, tmp3, tmp2); + } else { + emith_t_to_carry(sr, 1); + emith_sbc_r_r_r(tmp, tmp3, tmp2); + } + } else +#endif + { + EMITH_HINT_COND(DCOND_CS); + if (op & 4) { // adc + emith_tpop_carry(sr, 0); + emith_adcf_r_r_r(tmp, tmp3, tmp2); + emith_tpush_carry(sr, 0); + } else { + emith_tpop_carry(sr, 1); + emith_sbcf_r_r_r(tmp, tmp3, tmp2); + emith_tpush_carry(sr, 1); + } } goto end_op; case 0x0b: // SUBV Rm,Rn 0011nnnnmmmm1011 @@ -3806,12 +3853,23 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp3); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_clr_t_cond(sr); - if (op & 4) { - emith_addf_r_r_r(tmp, tmp3, tmp2); +#if T_OPTIMIZER + if (rcache_regs_discard & 
BITMASK1(SHR_T)) { + if (op & 4) + emith_add_r_r_r(tmp,tmp3,tmp2); + else + emith_sub_r_r_r(tmp,tmp3,tmp2); } else - emith_subf_r_r_r(tmp, tmp3, tmp2); - emith_set_t_cond(sr, DCOND_VS); +#endif + { + emith_clr_t_cond(sr); + EMITH_HINT_COND(DCOND_VS); + if (op & 4) + emith_addf_r_r_r(tmp, tmp3, tmp2); + else + emith_subf_r_r_r(tmp, tmp3, tmp2); + emith_set_t_cond(sr, DCOND_VS); + } goto end_op; case 0x0d: // DMULS.L Rm,Rn 0011nnnnmmmm1101 tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); @@ -3834,9 +3892,16 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 2: // SHAL Rn 0100nnnn00100000 tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_invalidate_t(); - emith_lslf(tmp, tmp2, 1); - emith_carry_to_t(sr, 0); +#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) + emith_lsl(tmp, tmp2, 1); + else +#endif + { + emith_invalidate_t(); + emith_lslf(tmp, tmp2, 1); + emith_carry_to_t(sr, 0); + } goto end_op; case 1: // DT Rn 0100nnnn00010000 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); @@ -3850,6 +3915,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #endif tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); emith_clr_t_cond(sr); + EMITH_HINT_COND(DCOND_EQ); emith_subf_r_r_imm(tmp, tmp2, 1); emith_set_t_cond(sr, DCOND_EQ); goto end_op; @@ -3862,12 +3928,22 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 2: // SHAR Rn 0100nnnn00100001 tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_invalidate_t(); - if (op & 0x20) { - emith_asrf(tmp, tmp2, 1); +#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) { + if (op & 0x20) + emith_asr(tmp,tmp2,1); + else + emith_lsr(tmp,tmp2,1); } else - emith_lsrf(tmp, tmp2, 1); - emith_carry_to_t(sr, 0); +#endif + { + emith_invalidate_t(); + if (op & 0x20) { + emith_asrf(tmp, tmp2, 1); + } else + emith_lsrf(tmp, tmp2, 1); + emith_carry_to_t(sr, 0); + } goto end_op; 
case 1: // CMP/PZ Rn 0100nnnn00010001 tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); @@ -3919,24 +3995,45 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x05: // ROTR Rn 0100nnnn00000101 tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - emith_invalidate_t(); - if (op & 1) { - emith_rorf(tmp, tmp2, 1); +#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) { + if (op & 1) + emith_ror(tmp, tmp2, 1); + else + emith_rol(tmp, tmp2, 1); } else - emith_rolf(tmp, tmp2, 1); - emith_carry_to_t(sr, 0); +#endif + { + emith_invalidate_t(); + if (op & 1) + emith_rorf(tmp, tmp2, 1); + else + emith_rolf(tmp, tmp2, 1); + emith_carry_to_t(sr, 0); + } goto end_op; case 0x24: // ROTCL Rn 0100nnnn00100100 case 0x25: // ROTCR Rn 0100nnnn00100101 tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, NULL); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_sync_t(sr); - emith_tpop_carry(sr, 0); - if (op & 1) { - emith_rorcf(tmp); +#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) { + emith_t_to_carry(sr, 0); + if (op & 1) + emith_rorc(tmp); + else + emith_rolc(tmp); } else - emith_rolcf(tmp); - emith_tpush_carry(sr, 0); +#endif + { + emith_tpop_carry(sr, 0); + if (op & 1) + emith_rorcf(tmp); + else + emith_rolcf(tmp); + emith_tpush_carry(sr, 0); + } goto end_op; case 0x15: // CMP/PL Rn 0100nnnn00010101 tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); @@ -4131,9 +4228,18 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x0a: // NEGC Rm,Rn 0110nnnnmmmm1010 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_sync_t(sr); - emith_tpop_carry(sr, 1); - emith_negcf_r_r(tmp2, tmp); - emith_tpush_carry(sr, 1); +#if T_OPTIMIZER + if (rcache_regs_discard & BITMASK1(SHR_T)) { + emith_t_to_carry(sr, 1); + emith_negc_r_r(tmp2, tmp); + } else +#endif + { + EMITH_HINT_COND(DCOND_CS); + emith_tpop_carry(sr, 1); + emith_negcf_r_r(tmp2, tmp); + emith_tpush_carry(sr, 1); + } break; case 0x0b: // NEG Rm,Rn 
0110nnnnmmmm1011 emith_neg_r_r(tmp2, tmp); @@ -4639,9 +4745,6 @@ static void sh2_generate_utils(void) host_arg2reg(arg2, 2); host_arg2reg(arg3, 3); emith_move_r_r(arg0, arg0); // nop - emith_move_r_r(arg1, arg1); // nop - emith_move_r_r(arg2, arg2); // nop - emith_move_r_r(arg3, arg3); // nop emith_flush(); // sh2_drc_write8(u32 a, u32 d) @@ -4665,6 +4768,7 @@ static void sh2_generate_utils(void) // d = sh2_drc_read8(u32 a) sh2_drc_read8 = (void *)tcache_ptr; emith_ctx_read_ptr(arg1, offsetof(SH2, read8_map)); + EMITH_HINT_COND(DCOND_CS); emith_sh2_rcall(arg0, arg1, arg2, arg3); EMITH_SJMP_START(DCOND_CS); emith_and_r_r_c(DCOND_CC, arg0, arg3); @@ -4679,6 +4783,7 @@ static void sh2_generate_utils(void) // d = sh2_drc_read16(u32 a) sh2_drc_read16 = (void *)tcache_ptr; emith_ctx_read_ptr(arg1, offsetof(SH2, read16_map)); + EMITH_HINT_COND(DCOND_CS); emith_sh2_rcall(arg0, arg1, arg2, arg3); EMITH_SJMP_START(DCOND_CS); emith_and_r_r_c(DCOND_CC, arg0, arg3); @@ -4692,6 +4797,7 @@ static void sh2_generate_utils(void) // d = sh2_drc_read32(u32 a) sh2_drc_read32 = (void *)tcache_ptr; emith_ctx_read_ptr(arg1, offsetof(SH2, read32_map)); + EMITH_HINT_COND(DCOND_CS); emith_sh2_rcall(arg0, arg1, arg2, arg3); EMITH_SJMP_START(DCOND_CS); emith_and_r_r_c(DCOND_CC, arg0, arg3); @@ -4706,6 +4812,7 @@ static void sh2_generate_utils(void) // d = sh2_drc_read8_poll(u32 a) sh2_drc_read8_poll = (void *)tcache_ptr; emith_ctx_read_ptr(arg1, offsetof(SH2, read8_map)); + EMITH_HINT_COND(DCOND_CS); emith_sh2_rcall(arg0, arg1, arg2, arg3); EMITH_SJMP_START(DCOND_CC); emith_move_r_r_ptr_c(DCOND_CS, arg1, CONTEXT_REG); @@ -4723,6 +4830,7 @@ static void sh2_generate_utils(void) // d = sh2_drc_read16_poll(u32 a) sh2_drc_read16_poll = (void *)tcache_ptr; emith_ctx_read_ptr(arg1, offsetof(SH2, read16_map)); + EMITH_HINT_COND(DCOND_CS); emith_sh2_rcall(arg0, arg1, arg2, arg3); EMITH_SJMP_START(DCOND_CC); emith_move_r_r_ptr_c(DCOND_CS, arg1, CONTEXT_REG); @@ -4739,6 +4847,7 @@ static void 
sh2_generate_utils(void) // d = sh2_drc_read32_poll(u32 a) sh2_drc_read32_poll = (void *)tcache_ptr; emith_ctx_read_ptr(arg1, offsetof(SH2, read32_map)); + EMITH_HINT_COND(DCOND_CS); emith_sh2_rcall(arg0, arg1, arg2, arg3); EMITH_SJMP_START(DCOND_CC); emith_move_r_r_ptr_c(DCOND_CS, arg1, CONTEXT_REG); @@ -4834,16 +4943,19 @@ static void sh2_generate_utils(void) emith_ctx_read(arg2, offsetof(SH2, rts_cache_idx)); emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg2, 0); emith_read_r_r_offs(arg3, arg1, offsetof(SH2, rts_cache)); - emith_cmp_r_r(arg0, arg3); #if (DRC_DEBUG & 128) + emith_cmp_r_r(arg0, arg3); EMITH_SJMP_START(DCOND_EQ); emith_move_r_ptr_imm(arg3, (uptr)&rcmiss); emith_read_r_r_offs_c(DCOND_NE, arg1, arg3, 0); emith_add_r_imm_c(DCOND_NE, arg1, 1); emith_write_r_r_offs_c(DCOND_NE, arg1, arg3, 0); + emith_jump_cond(DCOND_NE, sh2_drc_dispatcher); EMITH_SJMP_END(DCOND_EQ); -#endif +#else + emith_cmp_r_r(arg0, arg3); emith_jump_cond(DCOND_NE, sh2_drc_dispatcher); +#endif emith_read_r_r_offs_ptr(arg0, arg1, offsetof(SH2, rts_cache) + sizeof(void *)); emith_sub_r_imm(arg2, 2*sizeof(void *)); emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *)); @@ -4874,7 +4986,7 @@ static void sh2_generate_utils(void) emith_sub_r_imm(tmp, 4*2); rcache_clean(); // push SR - tmp = rcache_get_reg_arg(0, SHR_SP,&tmp2); + tmp = rcache_get_reg_arg(0, SHR_SP, &tmp2); emith_add_r_r_imm(tmp, tmp2, 4); tmp = rcache_get_reg_arg(1, SHR_SR, NULL); emith_clear_msb(tmp, tmp, 22); @@ -5478,6 +5590,8 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, else if ((lowest_mova && lowest_mova <= pc) || (lowest_literal && lowest_literal <= pc)) break; // text area collides with data area + else if ((op_flags[i] & OF_BTARGET) && dr_get_entry(pc, is_slave, &i_end)) + break; // branch target already compiled op = FETCH_OP(pc); switch ((op & 0xf000) >> 12) @@ -5490,19 +5604,19 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, switch 
(GET_Fx()) { case 0: // STC SR,Rn 0000nnnn00000010 - tmp = SHR_SR; + tmp = BITMASK2(SHR_SR, SHR_T); break; case 1: // STC GBR,Rn 0000nnnn00010010 - tmp = SHR_GBR; + tmp = BITMASK1(SHR_GBR); break; case 2: // STC VBR,Rn 0000nnnn00100010 - tmp = SHR_VBR; + tmp = BITMASK1(SHR_VBR); break; default: goto undefined; } opd->op = OP_MOVE; - opd->source = BITMASK1(tmp); + opd->source = tmp; opd->dest = BITMASK1(GET_Rn()); break; case 0x03: @@ -5549,7 +5663,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, opd->imm = 1; break; case 2: // CLRMAC 0000000000101000 - opd->dest = BITMASK3(SHR_T, SHR_MACL, SHR_MACH); + opd->dest = BITMASK2(SHR_MACL, SHR_MACH); break; default: goto undefined; @@ -5612,7 +5726,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, case 2: // RTE 0000000000101011 opd->op = OP_RTE; opd->source = BITMASK1(SHR_SP); - opd->dest = BITMASK3(SHR_SP, SHR_SR, SHR_PC); + opd->dest = BITMASK4(SHR_SP, SHR_SR, SHR_T, SHR_PC); opd->cycles = 4; next_is_delay = 1; end_block = !(op_flags[i+1+next_is_delay] & OF_BTARGET); @@ -5664,7 +5778,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, break; case 0x07: // DIV0S Rm,Rn 0010nnnnmmmm0111 opd->source = BITMASK2(GET_Rm(), GET_Rn()); - opd->dest = BITMASK1(SHR_SR); + opd->dest = BITMASK2(SHR_SR, SHR_T); break; case 0x08: // TST Rm,Rn 0010nnnnmmmm1000 opd->source = BITMASK2(GET_Rm(), GET_Rn()); @@ -5707,8 +5821,8 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, opd->dest = BITMASK1(SHR_T); break; case 0x04: // DIV1 Rm,Rn 0011nnnnmmmm0100 - opd->source = BITMASK3(GET_Rm(), GET_Rn(), SHR_SR); - opd->dest = BITMASK2(GET_Rn(), SHR_SR); + opd->source = BITMASK4(GET_Rm(), GET_Rn(), SHR_SR, SHR_T); + opd->dest = BITMASK3(GET_Rn(), SHR_SR, SHR_T); break; case 0x05: // DMULU.L Rm,Rn 0011nnnnmmmm0101 case 0x0d: // DMULS.L Rm,Rn 0011nnnnmmmm1101 @@ -5778,30 +5892,30 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 
*end_pc_out, switch (op & 0x3f) { case 0x02: // STS.L MACH,@-Rn 0100nnnn00000010 - tmp = SHR_MACH; + tmp = BITMASK1(SHR_MACH); break; case 0x12: // STS.L MACL,@-Rn 0100nnnn00010010 - tmp = SHR_MACL; + tmp = BITMASK1(SHR_MACL); break; case 0x22: // STS.L PR,@-Rn 0100nnnn00100010 - tmp = SHR_PR; + tmp = BITMASK1(SHR_PR); break; case 0x03: // STC.L SR,@-Rn 0100nnnn00000011 - tmp = SHR_SR; + tmp = BITMASK2(SHR_SR, SHR_T); opd->cycles = 2; break; case 0x13: // STC.L GBR,@-Rn 0100nnnn00010011 - tmp = SHR_GBR; + tmp = BITMASK1(SHR_GBR); opd->cycles = 2; break; case 0x23: // STC.L VBR,@-Rn 0100nnnn00100011 - tmp = SHR_VBR; + tmp = BITMASK1(SHR_VBR); opd->cycles = 2; break; default: goto undefined; } - opd->source = BITMASK2(GET_Rn(), tmp); + opd->source = BITMASK1(GET_Rn()) | tmp; opd->dest = BITMASK2(GET_Rn(), SHR_MEM); break; case 0x04: @@ -5831,26 +5945,26 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, switch (op & 0x3f) { case 0x06: // LDS.L @Rm+,MACH 0100mmmm00000110 - tmp = SHR_MACH; + tmp = BITMASK1(SHR_MACH); break; case 0x16: // LDS.L @Rm+,MACL 0100mmmm00010110 - tmp = SHR_MACL; + tmp = BITMASK1(SHR_MACL); break; case 0x26: // LDS.L @Rm+,PR 0100mmmm00100110 - tmp = SHR_PR; + tmp = BITMASK1(SHR_PR); break; case 0x07: // LDC.L @Rm+,SR 0100mmmm00000111 - tmp = SHR_SR; + tmp = BITMASK2(SHR_SR, SHR_T); opd->op = OP_LDC; opd->cycles = 3; break; case 0x17: // LDC.L @Rm+,GBR 0100mmmm00010111 - tmp = SHR_GBR; + tmp = BITMASK1(SHR_GBR); opd->op = OP_LDC; opd->cycles = 3; break; case 0x27: // LDC.L @Rm+,VBR 0100mmmm00100111 - tmp = SHR_VBR; + tmp = BITMASK1(SHR_VBR); opd->op = OP_LDC; opd->cycles = 3; break; @@ -5858,7 +5972,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, goto undefined; } opd->source = BITMASK2(GET_Rn(), SHR_MEM); - opd->dest = BITMASK2(GET_Rn(), tmp); + opd->dest = BITMASK1(GET_Rn()) | tmp; break; case 0x08: case 0x09: @@ -5931,20 +6045,20 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, 
u32 *end_pc_out, switch (GET_Fx()) { case 0: // LDC Rm,SR 0100mmmm00001110 - tmp = SHR_SR; + tmp = BITMASK2(SHR_SR, SHR_T); break; case 1: // LDC Rm,GBR 0100mmmm00011110 - tmp = SHR_GBR; + tmp = BITMASK1(SHR_GBR); break; case 2: // LDC Rm,VBR 0100mmmm00101110 - tmp = SHR_VBR; + tmp = BITMASK1(SHR_VBR); break; default: goto undefined; } opd->op = OP_LDC; opd->source = BITMASK1(GET_Rn()); - opd->dest = BITMASK1(tmp); + opd->dest = tmp; break; case 0x0f: // MAC.W @Rm+,@Rn+ 0100nnnnmmmm1111 @@ -6130,7 +6244,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, break; case 0x0300: // TRAPA #imm 11000011iiiiiiii opd->op = OP_TRAPA; - opd->source = BITMASK3(SHR_SP, SHR_PC, SHR_SR); + opd->source = BITMASK4(SHR_SP, SHR_PC, SHR_SR, SHR_T); opd->dest = BITMASK2(SHR_SP, SHR_PC); opd->imm = (op & 0xff); opd->cycles = 8; @@ -6256,9 +6370,6 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, last_btarget = 0; op = 0; // delay/poll insns counter for (i = 0, pc = base_pc; i < i_end; i++, pc += 2) { - int null; - if ((op_flags[i] & OF_BTARGET) && dr_get_entry(pc, is_slave, &null)) - break; // branch target already compiled opd = &ops[i]; crc += FETCH_OP(pc); diff --git a/pico/32x/32x.c b/pico/32x/32x.c index 9993bfa8a..7e2e039e5 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -38,17 +38,19 @@ void p32x_update_irls(SH2 *active_sh2, unsigned int m68k_cycles) if (active_sh2 != NULL) m68k_cycles = sh2_cycles_done_m68k(active_sh2); + // find top bit = highest irq number (0 <= irl <= 14/2) by binary search + // msh2 irqs = Pico32x.sh2irqs | Pico32x.sh2irqi[0]; - while ((irqs >>= 1)) - mlvl++; - mlvl *= 2; + if (irqs >= 0x10) mlvl += 8, irqs >>= 4; + if (irqs >= 0x04) mlvl += 4, irqs >>= 2; + if (irqs >= 0x02) mlvl += 2, irqs >>= 1; // ssh2 irqs = Pico32x.sh2irqs | Pico32x.sh2irqi[1]; - while ((irqs >>= 1)) - slvl++; - slvl *= 2; + if (irqs >= 0x10) slvl += 8, irqs >>= 4; + if (irqs >= 0x04) slvl += 4, irqs >>= 2; + if (irqs >= 0x02) 
slvl += 2, irqs >>= 1; mrun = sh2_irl_irq(&msh2, mlvl, msh2.state & SH2_STATE_RUN); if (mrun) { From b71d3dfaf14a20e873a9256f556cafcfa454963a Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 13 Nov 2019 21:05:35 +0100 Subject: [PATCH 078/174] sh2 drc: RISC-V (RV64IM) code emitter, some work on MIPS64 --- Makefile | 6 +- config.aarch64 | 14 -- config.caanoo | 16 -- config.caanoo47 | 16 -- config.dingux | 15 -- config.dingux54 | 15 -- config.gcw0 | 15 -- config.gp2x | 16 -- config.gp2x47 | 16 -- config.i386 | 14 -- config.x86 | 14 -- cpu/drc/emit_arm.c | 145 +++++++++-------- cpu/drc/emit_arm64.c | 4 +- cpu/drc/emit_mips.c | 174 ++++++++++++++------ cpu/drc/emit_x86.c | 12 +- cpu/sh2/compiler.c | 335 ++++++++++++++++++++------------------ cpu/sh2/compiler.h | 2 + pico/32x/32x.c | 2 +- platform/common/dismips.c | 2 +- platform/linux/emu.c | 2 +- tools/mkoffsets.sh | 2 +- 21 files changed, 394 insertions(+), 443 deletions(-) delete mode 100644 config.aarch64 delete mode 100644 config.caanoo delete mode 100644 config.caanoo47 delete mode 100644 config.dingux delete mode 100644 config.dingux54 delete mode 100644 config.gcw0 delete mode 100644 config.gp2x delete mode 100644 config.gp2x47 delete mode 100644 config.i386 delete mode 100644 config.x86 diff --git a/Makefile b/Makefile index 5b7e5a2ca..a4482d786 100644 --- a/Makefile +++ b/Makefile @@ -74,6 +74,10 @@ else ifneq (,$(findstring aarch64,$(ARCH))) use_fame ?= 1 use_cz80 ?= 1 use_sh2drc ?= 1 +else ifneq (,$(findstring riscv,$(ARCH))) +use_fame ?= 1 +use_cz80 ?= 1 +use_sh2drc ?= 1 endif -include Makefile.local @@ -288,7 +292,7 @@ pico/carthw_cfg.c: pico/carthw.cfg # random deps pico/carthw/svp/compiler.o : cpu/drc/emit_arm.c cpu/sh2/compiler.o : cpu/drc/emit_arm.c cpu/drc/emit_arm64.c -cpu/sh2/compiler.o : cpu/drc/emit_x86.c cpu/drc/emit_mips.c +cpu/sh2/compiler.o : cpu/drc/emit_x86.c cpu/drc/emit_mips.c cpu/drc/emit_riscv.c cpu/sh2/mame/sh2pico.o : cpu/sh2/mame/sh2.c pico/pico.o pico/cd/mcd.o pico/32x/32x.o : 
pico/pico_cmn.c pico/pico_int.h pico/memory.o pico/cd/memory.o pico/32x/memory.o : pico/pico_int.h pico/memory.h diff --git a/config.aarch64 b/config.aarch64 deleted file mode 100644 index 9631d64ec..000000000 --- a/config.aarch64 +++ /dev/null @@ -1,14 +0,0 @@ -# Automatically generated by configure -# Configured with: './configure' '--platform=generic' -CC = aarch64-linux-gnu-gcc -CXX = aarch64-linux-gnu-g++ -AS = aarch64-linux-gnu-as -STRIP = aarch64-linux-gnu-strip -CFLAGS += -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -ASFLAGS += -LDFLAGS += # --sysroot ${HOME}/opt/aarch64/debian-arm64 -LDLIBS += -lSDL -lasound -lpng -lz -lm -ldl - -ARCH = aarch64 -PLATFORM = generic -SOUND_DRIVERS = oss alsa sdl diff --git a/config.caanoo b/config.caanoo deleted file mode 100644 index 1ffc54da8..000000000 --- a/config.caanoo +++ /dev/null @@ -1,16 +0,0 @@ -# Automatically generated by configure -# Configured with: './configure' '--platform=gp2x' -CC = arm-gph-linux-gnueabi-gcc -CXX = arm-gph-linux-gnueabi-g++ -AS = arm-gph-linux-gnueabi-as -STRIP = arm-gph-linux-gnueabi-strip -CFLAGS += -I${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/include -I${HOME}/src/gp2x/armroot-eabi/include -D__GP2X__ -Wno-unused-result -CFLAGS += -mfloat-abi=soft -mcpu=arm920t -mtune=arm920t -mno-thumb-interwork -fno-stack-protector -fno-common -CFLAGS += -finline-limit=42 -fipa-pta -fno-ipa-pure-const -ASFLAGS += -mfloat-abi=soft -mcpu=arm920t -LDFLAGS += --sysroot ${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root -L${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L${HOME}/src/gp2x/armroot-eabi/lib -static -LDLIBS += -lpng -lm -ldl - -ARCH = arm -PLATFORM = gp2x -SOUND_DRIVERS = oss diff --git a/config.caanoo47 b/config.caanoo47 deleted file mode 100644 index 5bcf86084..000000000 --- a/config.caanoo47 +++ /dev/null @@ -1,16 +0,0 @@ -# Automatically generated by 
configure -# Configured with: './configure' '--platform=gp2x' -CC = arm-linux-gnueabi-gcc -CXX = arm-linux-gnueabi-g++ -AS = arm-linux-gnueabi-as -STRIP = arm-linux-gnueabi-strip -CFLAGS += -I${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/include -I${HOME}/src/gp2x/armroot-eabi/include -D__GP2X__ -Wno-unused-result -CFLAGS += -mfloat-abi=soft -mcpu=arm920t -mtune=arm920t -mno-thumb-interwork -fno-stack-protector -fno-common -CFLAGS += -finline-limit=42 -fipa-pta -fno-ipa-sra -fno-ipa-pure-const -ASFLAGS += -mfloat-abi=soft -mcpu=arm920t -LDFLAGS += -B${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/lib/gcc/arm-gph-linux-gnueabi/4.2.4 -B${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L${HOME}/src/gp2x/armroot-eabi/lib -static -LDLIBS += -lpng -lm -ldl - -ARCH = arm -PLATFORM = gp2x -SOUND_DRIVERS = oss diff --git a/config.dingux b/config.dingux deleted file mode 100644 index b981bd3f9..000000000 --- a/config.dingux +++ /dev/null @@ -1,15 +0,0 @@ -# Automatically generated by configure -# Configured with: './configure' '--platform=opendingux' -CC = mipsel-linux-gcc -CXX = mipsel-linux-g++ -AS = mipsel-linux-as -STRIP = mipsel-linux-strip -CFLAGS += -I${HOME}/opt/opendingux-toolchain/usr/include/ -I${HOME}/opt/opendingux-toolchain/usr/include/SDL -CFLAGS += -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -fno-stack-protector -ASFLAGS += -LDFLAGS += --sysroot ${HOME}/opt/opendingux-toolchain -L${HOME}/opt/opendingux-toolchain/lib -LDLIBS += -lSDL -lasound -lpng -lz -lm -ldl - -ARCH = mipsel -PLATFORM = opendingux -SOUND_DRIVERS = sdl diff --git a/config.dingux54 b/config.dingux54 deleted file mode 100644 index a232d952b..000000000 --- a/config.dingux54 +++ /dev/null @@ -1,15 +0,0 @@ -# Automatically generated by configure -# Configured with: './configure' '--platform=opendingux' -CC = 
mipsel-linux-gnu-gcc -CXX = mipsel-linux-gnu-g++ -AS = mipsel-linux-gnu-as -STRIP = mipsel-linux-gnu-strip -CFLAGS += -I${HOME}/opt/opendingux-toolchain/usr/include/ -I${HOME}/opt/opendingux-toolchain/usr/include/SDL -CFLAGS += -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -fno-stack-protector -ASFLAGS += -LDFLAGS += -B${HOME}/opt/opendingux-toolchain/usr/lib -B${HOME}/opt/opendingux-toolchain/lib -Wl,-rpath-link=${HOME}/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=${HOME}/opt/opendingux-toolchain/lib -LDLIBS += -lSDL -lasound -lpng -lz -lm -ldl - -ARCH = mipsel -PLATFORM = opendingux -SOUND_DRIVERS = sdl diff --git a/config.gcw0 b/config.gcw0 deleted file mode 100644 index cebe79a10..000000000 --- a/config.gcw0 +++ /dev/null @@ -1,15 +0,0 @@ -# Automatically generated by configure -# Configured with: './configure' '--platform=gcw0' -CC = mipsel-gcw0-linux-uclibc-gcc -CXX = mipsel-gcw0-linux-uclibc-g++ -AS = mipsel-gcw0-linux-uclibc-as -STRIP = mipsel-gcw0-linux-uclibc-strip -CFLAGS += -I${HOME}/opt/gcw0-toolchain/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/ -I${HOME}/opt/gcw0-toolchain/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/SDL -CFLAGS += -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -fno-stack-protector -ASFLAGS += -LDFLAGS += --sysroot ${HOME}/opt/gcw0-toolchain/usr/mipsel-gcw0-linux-uclibc/sysroot -LDLIBS += -lSDL -lasound -lpng -lz -lm -ldl - -ARCH = mipsel -PLATFORM = opendingux -SOUND_DRIVERS = sdl diff --git a/config.gp2x b/config.gp2x deleted file mode 100644 index cf99bd774..000000000 --- a/config.gp2x +++ /dev/null @@ -1,16 +0,0 @@ -# Automatically generated by configure -# Configured with: './configure' '--platform=gp2x' -CC = arm-open2x-linux-gcc -CXX = arm-open2x-linux-g++ -AS = arm-open2x-linux-as -STRIP = arm-open2x-linux-strip -CFLAGS += -I${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I${HOME}/src/gp2x/armroot/include -D__GP2X__ -CFLAGS += -mfloat-abi=soft -mcpu=arm920t -mtune=arm920t -mno-thumb-interwork 
-fno-stack-protector -fno-common -CFLAGS += -finline-limit=42 -fipa-cp -fno-ipa-pure-const -ASFLAGS += -mcpu=arm920t -mfloat-abi=soft -LDFLAGS += --sysroot ${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L${HOME}/src/gp2x/armroot/lib -static -LDLIBS += -lpng -lm -ldl - -ARCH = arm -PLATFORM = gp2x -SOUND_DRIVERS = oss diff --git a/config.gp2x47 b/config.gp2x47 deleted file mode 100644 index 8a86e850c..000000000 --- a/config.gp2x47 +++ /dev/null @@ -1,16 +0,0 @@ -# Automatically generated by configure -# Configured with: './configure' '--platform=gp2x' -CC = arm-linux-gnueabi-gcc -CXX = arm-linux-gnueabi-g++ -AS = arm-linux-gnueabi-as -STRIP = arm-linux-gnueabi-strip -CFLAGS += -I${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I${HOME}/src/gp2x/armroot/include -D__GP2X__ -Wno-unused-result -CFLAGS += -mabi=apcs-gnu -mfloat-abi=soft -mfpu=fpa -mcpu=arm920t -mtune=arm920t -mno-thumb-interwork -fno-stack-protector -fno-common -CFLAGS += -finline-limit=42 -fipa-pta -fno-ipa-sra -fno-ipa-pure-const -ASFLAGS += -mabi=apcs-gnu -mfloat-abi=soft -mfpu=fpa -mcpu=arm920t -LDFLAGS += -mabi=apcs-gnu -mfpu=fpa -B${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/usr/lib -L${HOME}/src/gp2x/armroot/lib -static -LDLIBS += -lpng -lm -ldl - -ARCH = arm -PLATFORM = gp2x -SOUND_DRIVERS = oss diff --git a/config.i386 b/config.i386 deleted file mode 100644 index 9c8c2e652..000000000 --- a/config.i386 +++ /dev/null @@ -1,14 +0,0 @@ -# Automatically generated by configure -# Configured with: './configure' '--platform=generic' -CC = gcc -CXX = g++ -AS = as -STRIP = strip -CFLAGS += -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -m32 -ASFLAGS += -LDFLAGS += -m32 -L/usr/lib/i386-linux-gnu -L${HOME}/opt/lib32 -LDLIBS += -lSDL-1.2 -lasound 
-lpng -lz -lm -ldl - -ARCH = i386 -PLATFORM = generic -SOUND_DRIVERS = oss alsa sdl diff --git a/config.x86 b/config.x86 deleted file mode 100644 index 454400110..000000000 --- a/config.x86 +++ /dev/null @@ -1,14 +0,0 @@ -# Automatically generated by configure -# Configured with: './configure' '--platform=generic' -CC = gcc -CXX = g++ -AS = as -STRIP = strip -CFLAGS += -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -ASFLAGS += -LDFLAGS += -L/usr/lib/x86_64-linux-gnu -LDLIBS += -lSDL-1.2 -lasound -lpng -lz -lm -ldl - -ARCH = x86_64 -PLATFORM = generic -SOUND_DRIVERS = oss alsa sdl diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 25a2c72fb..8f633fa3c 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -365,7 +365,7 @@ static void emith_flush(void) #define EOP_LDR_REG_LSL(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,0,1,rn,rd,shift_imm,A_AM1_LSL,rm) #define EOP_LDR_REG_LSL_WB(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,0,3,rn,rd,shift_imm,A_AM1_LSL,rm) -#define EOP_LDRB_REG_LSL(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,1,1,rn,rd,shift_imm,A_AM1_LSL,rm); +#define EOP_LDRB_REG_LSL(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,1,1,rn,rd,shift_imm,A_AM1_LSL,rm) #define EOP_STR_REG_LSL_WB(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,0,2,rn,rd,shift_imm,A_AM1_LSL,rm) #define EOP_LDRH_IMM2(cond,rd,rn,offset_8) EOP_C_AM3_IMM(cond,(offset_8) >= 0,1,rn,rd,0,1,abs(offset_8)) @@ -470,84 +470,89 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int if (cond == A_COND_NV) return; - switch (op) { - case A_OP_MOV: - rn = 0; - // count bits in imm and use MVN if more bits 1 than 0 - if (count_bits(imm) > 16) { - imm = ~imm; - op = A_OP_MVN; - } - // count insns needed for mov/orr #imm + do { + u32 u; + // try to get the topmost byte empty to possibly save an insn for (v = imm, ror2 = 0; (v >> 24) && ror2 < 32/2; ror2++) v = (v << 2) | (v >> 30); -#ifdef HAVE_ARMV7 - for (i = 2; i > 0; i--, v >>= 8) - while (v > 
0xff && !(v & 3)) - v >>= 2; - if (v) { // 3+ insns needed... - if (op == A_OP_MVN) - imm = ~imm; - // ...prefer movw/movt - EOP_MOVW(rd, imm); - if (imm & 0xffff0000) - EOP_MOVT(rd, imm); - return; - } -#else - for (i = 3; i > 0; i--, v >>= 8) - while (v > 0xff && !(v & 3)) - v >>= 2; - if (v) { // 4 insns needed... - if (op == A_OP_MVN) + + switch (op) { + case A_OP_MOV: + rn = 0; + // use MVN if more bits 1 than 0 + if (count_bits(imm) > 16) { imm = ~imm; - // ...emit literal load - int idx, o; - if (literal_iindex >= MAX_HOST_LITERALS) { - elprintf(EL_STATUS|EL_SVP|EL_ANOMALY, - "pool overflow"); - exit(1); + op = A_OP_MVN; + ror2 = -1; + break; + } + // count insns needed for mov/orr #imm +#ifdef HAVE_ARMV7 + for (i = 2, u = v; i > 0; i--, u >>= 8) + while (u > 0xff && !(u & 3)) + u >>= 2; + if (u) { // 3+ insns needed... + if (op == A_OP_MVN) + imm = ~imm; + // ...prefer movw/movt + EOP_MOVW(rd, imm); + if (imm & 0xffff0000) + EOP_MOVT(rd, imm); + return; } - idx = emith_pool_literal(imm, &o); - literal_insn[literal_iindex++] = (u32 *)tcache_ptr; - EOP_LDR_IMM2(cond, rd, PC, idx * sizeof(u32)); - if (o > 0) - EOP_C_DOP_IMM(cond, A_OP_ADD, 0, rd, rd, 0, o); - else if (o < 0) - EOP_C_DOP_IMM(cond, A_OP_SUB, 0, rd, rd, 0, -o); +#else + for (i = 2, u = v; i > 0; i--, u >>= 8) + while (u > 0xff && !(u & 3)) + u >>= 2; + if (u) { // 4 insns needed... + if (op == A_OP_MVN) + imm = ~imm; + // ...emit literal load + int idx, o; + if (literal_iindex >= MAX_HOST_LITERALS) { + elprintf(EL_STATUS|EL_SVP|EL_ANOMALY, + "pool overflow"); + exit(1); + } + idx = emith_pool_literal(imm, &o); + literal_insn[literal_iindex++] = (u32 *)tcache_ptr; + EOP_LDR_IMM2(cond, rd, PC, idx * sizeof(u32)); + if (o > 0) + EOP_C_DOP_IMM(cond, A_OP_ADD, 0,rd,rd,0,o); + else if (o < 0) + EOP_C_DOP_IMM(cond, A_OP_SUB, 0,rd,rd,0,-o); return; - } + } #endif - break; + break; - case A_OP_AND: - // AND must fit into 1 insn. 
if not, use BIC - for (v = imm, ror2 = 0; (v >> 8) && ror2 < 32/2; ror2++) - v = (v << 2) | (v >> 30); - if (v >> 8) { - imm = ~imm; - op = A_OP_BIC; - } - break; - - case A_OP_SUB: - case A_OP_ADD: - // count bits in imm and swap ADD and SUB if more bits 1 than 0 - if (s == 0 && count_bits(imm) > 16) { - imm = -imm; - op ^= (A_OP_ADD^A_OP_SUB); + case A_OP_AND: + // AND must fit into 1 insn. if not, use BIC + for (u = v; u > 0xff && !(u & 3); u >>= 2) ; + if (u >> 8) { + imm = ~imm; + op = A_OP_BIC; + ror2 = -1; + } + break; + + case A_OP_SUB: + case A_OP_ADD: + // swap ADD and SUB if more bits 1 than 0 + if (s == 0 && count_bits(imm) > 16) { + imm = -imm; + op ^= (A_OP_ADD^A_OP_SUB); + ror2 = -1; + } + case A_OP_EOR: + case A_OP_ORR: + case A_OP_BIC: + if (s == 0 && imm == 0 && rd == rn) + return; + break; } - case A_OP_EOR: - case A_OP_ORR: - case A_OP_BIC: - if (s == 0 && imm == 0 && rd == rn) - return; - break; - } + } while (ror2 < 0); - // try to get the topmost byte empty to possibly save an insn - for (v = imm, ror2 = 0; (v >> 24) && ror2 < 32/2; ror2++) - v = (v << 2) | (v >> 30); do { // shift down to get 'best' rot2 while (v > 0xff && !(v & 3)) diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index dc0cf5594..3f40d4cd7 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -134,9 +134,9 @@ enum { XT_UXTW=0x4, XT_UXTX=0x6, XT_LSL=0x7, XT_SXTW=0xc, XT_SXTX=0xe }; #define A64_TST_REG(rn, rm, stype, simm) \ A64_ANDS_REG(Z0, rn, rm, stype, simm) #define A64_MOV_REG(rd, rm, stype, simm) \ - A64_OR_REG(rd, Z0, rm, stype, simm); + A64_OR_REG(rd, Z0, rm, stype, simm) #define A64_MVN_REG(rd, rm, stype, simm) \ - A64_ORN_REG(rd, Z0, rm, stype, simm); + A64_ORN_REG(rd, Z0, rm, stype, simm) // rd = rn OP (rm EXTEND simm) #define A64_ADD_XREG(rd, rn, rm, xtopt, simm) \ diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index 825274742..6f07e509b 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -62,14 +62,17 @@ // opcode field 
(encoded in op) enum { OP__FN=000, OP__RT, OP_J, OP_JAL, OP_BEQ, OP_BNE, OP_BLEZ, OP_BGTZ }; enum { OP_ADDI=010, OP_ADDIU, OP_SLTI, OP_SLTIU, OP_ANDI, OP_ORI, OP_XORI, OP_LUI }; -enum { OP_LB=040, OP_LH, OP_LWL, OP_LW, OP_LBU, OP_LHU, OP_LWR }; -enum { OP_SB=050, OP_SH, OP_SWL, OP_SW, __(54), __(55), OP_SWR }; +enum { OP_LB=040, OP_LH, OP_LWL, OP_LW, OP_LBU, OP_LHU, OP_LWR, OP_LWU }; +enum { OP_SB=050, OP_SH, OP_SWL, OP_SW, OP_SDL, OP_SDR, OP_SWR }; +enum { OP_DADDI=030, OP_DADDIU, OP_LDL, OP_LDR, OP_SD=067, OP_LD=077 }; // function field (encoded in fn if opcode = OP__FN) enum { FN_SLL=000, __(01), FN_SRL, FN_SRA, FN_SLLV, __(05), FN_SRLV, FN_SRAV }; -enum { FN_MFHI=020, FN_MTHI, FN_MFLO, FN_MTLO }; -enum { FN_MULT=030, FN_MULTU, FN_DIV, FN_DIVU }; +enum { FN_JR=010, FN_JALR, FN_MOVZ, FN_MOVN, FN_SYNC=017 }; +enum { FN_MFHI=020, FN_MTHI, FN_MFLO, FN_MTLO, FN_DSSLV, __(25), FN_DSLRV, FN_DSRAV }; +enum { FN_MULT=030, FN_MULTU, FN_DIV, FN_DIVU, FN_DMULT, FN_DMULTU, FN_DDIV, FN_DDIVU }; enum { FN_ADD=040, FN_ADDU, FN_SUB, FN_SUBU, FN_AND, FN_OR, FN_XOR, FN_NOR }; -enum { FN_JR=010, FN_JALR, FN_MOVZ, FN_MOVN, FN_SYNC=017, FN_SLT=052, FN_SLTU }; +enum { FN_SLT=052, FN_SLTU, FN_DADD, FN_DADDU, FN_DSUB, FN_DSUBU }; +enum { FN_DSLL=070, __(71), FN_DSRL, FN_DSRA, FN_DSLL32, __(75), FN_DSRL32, FN_DSRA32 }; // rt field (encoded in rt if opcode = OP__RT) enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; @@ -85,8 +88,12 @@ enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; // rd = rs OP rt #define MIPS_ADD_REG(rd, rs, rt) \ MIPS_OP_REG(FN_ADDU, rd, rs, rt) +#define MIPS_DADD_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_DADDU, rd, rs, rt) #define MIPS_SUB_REG(rd, rs, rt) \ MIPS_OP_REG(FN_SUBU, rd, rs, rt) +#define MIPS_DSUB_REG(rd, rs, rt) \ + MIPS_OP_REG(FN_DSUBU, rd, rs, rt) #define MIPS_NEG_REG(rd, rt) \ MIPS_SUB_REG(rd, Z0, rt) @@ -122,6 +129,8 @@ enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; // rt = rs OP imm16 #define 
MIPS_ADD_IMM(rt, rs, imm16) \ MIPS_OP_IMM(OP_ADDIU, rt, rs, imm16) +#define MIPS_DADD_IMM(rt, rs, imm16) \ + MIPS_OP_IMM(OP_DADDIU, rt, rs, imm16) #define MIPS_XOR_IMM(rt, rs, imm16) \ MIPS_OP_IMM(OP_XORI, rt, rs, imm16) @@ -144,6 +153,11 @@ enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; #define MIPS_ASR_IMM(rd, rt, bits) \ MIPS_INSN(OP__FN, _, rt, rd, bits, FN_SRA) +#define MIPS_DLSL_IMM(rd, rt, bits) \ + MIPS_INSN(OP__FN, _, rt, rd, bits, FN_DSLL) +#define MIPS_DLSL32_IMM(rd, rt, bits) \ + MIPS_INSN(OP__FN, _, rt, rd, bits, FN_DSLL32) + // rt = (rs < imm16) #define MIPS_SLT_IMM(rt, rs, imm16) \ MIPS_OP_IMM(OP_SLTI, rt, rs, imm16) @@ -193,23 +207,45 @@ enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; // load/store indexed base +#define MIPS_LD(rt, rs, offs16) \ + MIPS_OP_IMM(OP_LD, rt, rs, (u16)(offs16)) #define MIPS_LW(rt, rs, offs16) \ - MIPS_INSN(OP_LW, rs, rt, _,_, (u16)(offs16)) + MIPS_OP_IMM(OP_LW, rt, rs, (u16)(offs16)) #define MIPS_LH(rt, rs, offs16) \ - MIPS_INSN(OP_LH, rs, rt, _,_, (u16)(offs16)) + MIPS_OP_IMM(OP_LH, rt, rs, (u16)(offs16)) #define MIPS_LB(rt, rs, offs16) \ - MIPS_INSN(OP_LB, rs, rt, _,_, (u16)(offs16)) + MIPS_OP_IMM(OP_LB, rt, rs, (u16)(offs16)) #define MIPS_LHU(rt, rs, offs16) \ - MIPS_INSN(OP_LHU, rs, rt, _,_, (u16)(offs16)) + MIPS_OP_IMM(OP_LHU, rt, rs, (u16)(offs16)) #define MIPS_LBU(rt, rs, offs16) \ - MIPS_INSN(OP_LBU, rs, rt, _,_, (u16)(offs16)) + MIPS_OP_IMM(OP_LBU, rt, rs, (u16)(offs16)) +#define MIPS_SD(rt, rs, offs16) \ + MIPS_OP_IMM(OP_SD, rt, rs, (u16)(offs16)) #define MIPS_SW(rt, rs, offs16) \ - MIPS_INSN(OP_SW, rs, rt, _,_, (u16)(offs16)) + MIPS_OP_IMM(OP_SW, rt, rs, (u16)(offs16)) #define MIPS_SH(rt, rs, offs16) \ - MIPS_INSN(OP_SH, rs, rt, _,_, (u16)(offs16)) + MIPS_OP_IMM(OP_SH, rt, rs, (u16)(offs16)) #define MIPS_SB(rt, rs, offs16) \ - MIPS_INSN(OP_SB, rs, rt, _,_, (u16)(offs16)) + MIPS_OP_IMM(OP_SB, rt, rs, (u16)(offs16)) + +// pointer operations + +#if __mips == 4 || 
__mips == 64 +#define OP_LP OP_LD +#define OP_SP OP_SD +#define OP_PADDIU OP_DADDIU +#define FN_PADDU FN_DADDU +#define FN_PSUBU FN_DSUBU +#define PTR_SCALE 3 +#else +#define OP_LP OP_LW +#define OP_SP OP_SW +#define OP_PADDIU OP_ADDIU +#define FN_PADDU FN_ADDU +#define FN_PSUBU FN_SUBU +#define PTR_SCALE 2 +#endif // XXX: tcache_ptr type for SVP and SH2 compilers differs.. #define EMIT_PTR(ptr, x) \ @@ -442,14 +478,14 @@ static void emith_set_arith_flags(int rd, int rs, int rt, s32 imm, int sub) if (emith_flg_hint & _FHV) { emith_flg_noV = 0; - if (rt >= 0) // Nt^Ns in FV, bit 31 + if (rt > Z0) // Nt^Ns in FV, bit 31 EMIT(MIPS_XOR_REG(FV, rs, rt)); - else if (imm == 0) + else if (rt == Z0 || imm == 0) emith_flg_noV = 1; // imm #0 can't overflow else if ((imm < 0) == !sub) EMIT(MIPS_NOR_REG(FV, rs, Z0)); else if ((imm > 0) == !sub) - EMIT(MIPS_OR_REG(FV, rs, Z0)); + EMIT(MIPS_XOR_REG(FV, rs, Z0)); } // full V = Nd^Nt^Ns^C calculation is deferred until really needed @@ -483,13 +519,17 @@ static void emith_set_compare_flags(int rs, int rt, s32 imm) EMIT(MIPS_MVN_REG(d, s)) #define emith_add_r_r_r_lsl_ptr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_OP_REG(FN_PADDU, d, s1, AT)); \ + } else EMIT(MIPS_OP_REG(FN_PADDU, d, s1, s2)); \ +} while (0) +#define emith_add_r_r_r_lsl(d, s1, s2, simm) do { \ if (simm) { \ EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ EMIT(MIPS_ADD_REG(d, s1, AT)); \ } else EMIT(MIPS_ADD_REG(d, s1, s2)); \ } while (0) -#define emith_add_r_r_r_lsl(d, s1, s2, simm) \ - emith_add_r_r_r_lsl_ptr(d, s1, s2, simm) #define emith_add_r_r_r_lsr(d, s1, s2, simm) do { \ if (simm) { \ @@ -498,6 +538,16 @@ static void emith_set_compare_flags(int rs, int rt, s32 imm) } else EMIT(MIPS_ADD_REG(d, s1, s2)); \ } while (0) +#define emith_addf_r_r_r_lsl_ptr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ + EMIT(MIPS_OP_REG(FN_PADDU, FNZ, s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 0); \ + } else { \ 
+ EMIT(MIPS_OP_REG(FN_PADDU, FNZ, s1, s2)); \ + emith_set_arith_flags(d, s1, s2, 0, 0); \ + } \ +} while (0) #define emith_addf_r_r_r_lsl(d, s1, s2, simm) do { \ if (simm) { \ EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ @@ -586,6 +636,8 @@ static void emith_set_compare_flags(int rs, int rt, s32 imm) #define emith_add_r_r_r(d, s1, s2) \ emith_add_r_r_r_lsl(d, s1, s2, 0) +#define emith_addf_r_r_r_ptr(d, s1, s2) \ + emith_addf_r_r_r_lsl_ptr(d, s1, s2, 0) #define emith_addf_r_r_r(d, s1, s2) \ emith_addf_r_r_r_lsl(d, s1, s2, 0) @@ -697,14 +749,26 @@ static void emith_set_compare_flags(int rs, int rt, s32 imm) // move immediate static void emith_move_imm(int r, uintptr_t imm) { - if ((s16)imm == imm) { +#if __mips == 4 || __mips == 64 + if ((s32)imm != imm) { + emith_move_imm(r, imm >> 32); + if (imm & 0xffff0000) { + EMIT(MIPS_DLSL_IMM(r, r, 16)); + EMIT(MIPS_OR_IMM(r, r, (imm >> 16) & 0xffff)); + EMIT(MIPS_DLSL_IMM(r, r, 16)); + } else EMIT(MIPS_DLSL32_IMM(r, r, 0)); + if (imm & 0x0000ffff) + EMIT(MIPS_OR_IMM(r, r, imm & 0xffff)); + } else +#endif + if ((s16)imm == imm) { EMIT(MIPS_ADD_IMM(r, Z0, imm)); - } else if (!(imm >> 16)) { + } else if (!((u32)imm >> 16)) { EMIT(MIPS_OR_IMM(r, Z0, imm)); } else { int s = Z0; - if (imm >> 16) { - EMIT(MIPS_MOVT_IMM(r, imm >> 16)); + if ((u32)imm >> 16) { + EMIT(MIPS_MOVT_IMM(r, (u32)imm >> 16)); s = r; } if ((u16)imm) @@ -729,17 +793,17 @@ static void emith_move_imm(int r, uintptr_t imm) } while (0) // arithmetic, immediate - can only be ADDI[U], since SUBI[U] doesn't exist -static void emith_arith_imm(int op, int rd, int rs, u32 imm) +static void emith_add_imm(int ptr, int rd, int rs, u32 imm) { if ((s16)imm == imm) { if (imm || rd != rs) - EMIT(MIPS_OP_IMM(op, rd, rs, imm)); + EMIT(MIPS_OP_IMM(ptr ? OP_PADDIU:OP_ADDIU, rd,rs,imm)); } else if ((s32)imm < 0) { emith_move_r_imm(AT, -imm); - EMIT(MIPS_OP_REG(FN_SUB + (op-OP_ADDI), rd, rs, AT)); + EMIT(MIPS_OP_REG((ptr ? 
FN_PSUBU:FN_SUBU), rd,rs,AT)); } else { emith_move_r_imm(AT, imm); - EMIT(MIPS_OP_REG(FN_ADD + (op-OP_ADDI), rd, rs, AT)); + EMIT(MIPS_OP_REG((ptr ? FN_PADDU:FN_ADDU), rd,rs,AT)); } } @@ -760,7 +824,7 @@ static void emith_arith_imm(int op, int rd, int rs, u32 imm) emith_subf_r_r_imm(r, r, imm) #define emith_adc_r_imm(r, imm) \ - emith_adc_r_r_imm(r, r, imm); + emith_adc_r_r_imm(r, r, imm) #define emith_adcf_r_imm(r, imm) \ emith_adcf_r_r_imm(r, r, imm) @@ -770,10 +834,10 @@ static void emith_arith_imm(int op, int rd, int rs, u32 imm) // emith_subf_r_r_imm(FNZ, r, (s16)imm) #define emith_add_r_r_ptr_imm(d, s, imm) \ - emith_arith_imm(OP_ADDIU, d, s, imm) + emith_add_imm(1, d, s, imm) #define emith_add_r_r_imm(d, s, imm) \ - emith_add_r_r_ptr_imm(d, s, imm) + emith_add_imm(0, d, s, imm) #define emith_addf_r_r_imm(d, s, imm) do { \ emith_add_r_r_imm(FNZ, s, imm); \ @@ -1043,22 +1107,24 @@ static void emith_lohi_nops(void) // load/store. offs has 16 bits signed, which is currently sufficient #define emith_read_r_r_offs_ptr(r, rs, offs) \ - EMIT(MIPS_LW(r, rs, offs)) + EMIT(MIPS_OP_IMM(OP_LP, r, rs, offs)) #define emith_read_r_r_offs_ptr_c(cond, r, rs, offs) \ emith_read_r_r_offs_ptr(r, rs, offs) #define emith_read_r_r_offs(r, rs, offs) \ - emith_read_r_r_offs_ptr(r, rs, offs) + EMIT(MIPS_LW(r, rs, offs)) #define emith_read_r_r_offs_c(cond, r, rs, offs) \ emith_read_r_r_offs(r, rs, offs) #define emith_read_r_r_r_ptr(r, rs, rm) do { \ emith_add_r_r_r(AT, rs, rm); \ - EMIT(MIPS_LW(r, AT, 0)); \ + EMIT(MIPS_OP_IMM(OP_LP, r, AT, 0)); \ } while (0) -#define emith_read_r_r_r(r, rs, rm) \ - emith_read_r_r_r_ptr(r, rs, rm) +#define emith_read_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + EMIT(MIPS_LW(r, AT, 0)); \ +} while (0) #define emith_read_r_r_r_c(cond, r, rs, rm) \ emith_read_r_r_r(r, rs, rm) @@ -1112,24 +1178,26 @@ static void emith_lohi_nops(void) #define emith_write_r_r_offs_ptr(r, rs, offs) \ - EMIT(MIPS_SW(r, rs, offs)) + EMIT(MIPS_OP_IMM(OP_SP, r, rs, 
offs)) #define emith_write_r_r_offs_ptr_c(cond, r, rs, offs) \ emith_write_r_r_offs_ptr(r, rs, offs) #define emith_write_r_r_r_ptr(r, rs, rm) do { \ emith_add_r_r_r(AT, rs, rm); \ - EMIT(MIPS_SW(r, AT, 0)); \ + EMIT(MIPS_OP_IMM(OP_SP, r, AT, 0)); \ } while (0) #define emith_write_r_r_r_ptr_c(cond, r, rs, rm) \ emith_write_r_r_r_ptr(r, rs, rm) #define emith_write_r_r_offs(r, rs, offs) \ - emith_write_r_r_offs_ptr(r, rs, offs) + EMIT(MIPS_SW(r, rs, offs)) #define emith_write_r_r_offs_c(cond, r, rs, offs) \ emith_write_r_r_offs(r, rs, offs) -#define emith_write_r_r_r(r, rs, rm) \ - emith_write_r_r_r_ptr(r, rs, rm) +#define emith_write_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + EMIT(MIPS_SW(r, AT, 0)); \ +} while (0) #define emith_write_r_r_r_c(cond, r, rs, rm) \ emith_write_r_r_r(r, rs, rm) @@ -1164,7 +1232,7 @@ static void emith_lohi_nops(void) int _c; u32 _m = mask & 0x300fffc; /* r2-r15,r24-r25 */ \ if (__builtin_parity(_m) == 1) _m |= 0x1; /* ABI align */ \ int _s = count_bits(_m) * 4, _o = _s; \ - if (_s) emith_sub_r_imm(SP, _s); \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, -_s); \ for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ if (_m & (1 << _c)) \ { _o -= 4; if (_c) emith_write_r_r_offs(_c, SP, _o); } \ @@ -1177,7 +1245,7 @@ static void emith_lohi_nops(void) for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ if (_m & (1 << _c)) \ { if (_c) emith_read_r_r_offs(_c, SP, _o); _o += 4; } \ - if (_s) emith_add_r_imm(SP, _s); \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, _s); \ } while (0) #define host_arg2reg(rd, arg) \ @@ -1343,8 +1411,8 @@ static int emith_cond_check(int cond, int *r) emith_jump_cond(cond, target) #define emith_jump_cond_inrange(target) \ - ((u8 *)target - (u8 *)tcache_ptr - 4 < 0x00020000U || \ - (u8 *)target - (u8 *)tcache_ptr - 4 >= 0xfffe0010U) // mind cond_check + ((u8 *)target - (u8 *)tcache_ptr - 4 < 0x20000 && \ + (u8 *)target - (u8 *)tcache_ptr - 4 >= -0x20000+0x10) //mind cond_check // NB: returns 
position of patch for cache maintenance #define emith_jump_patch(ptr, target, pos) do { \ @@ -1359,8 +1427,8 @@ static int emith_cond_check(int cond, int *r) } while (0) #define emith_jump_patch_inrange(ptr, target) \ - ((u8 *)target - (u8 *)ptr - 4 < 0x00020000U || \ - (u8 *)target - (u8 *)ptr - 4 >= 0xfffe0010U) // mind cond_check + ((u8 *)target - (u8 *)ptr - 4 < 0x20000 && \ + (u8 *)target - (u8 *)ptr - 4 >= -0x20000+0x10) // mind cond_check #define emith_jump_patch_size() 4 #define emith_jump_at(ptr, target) do { \ @@ -1410,7 +1478,7 @@ static int emith_cond_check(int cond, int *r) // NB: ABI SP alignment is 8 for compatibility with MIPS IV #define emith_push_ret(r) do { \ - emith_sub_r_imm(SP, 8+16); /* reserve new arg save area (16) */ \ + emith_add_r_r_ptr_imm(SP, SP, -8-16); /* ABI: 16 byte arg save area */ \ emith_write_r_r_offs(LR, SP, 4+16); \ if ((r) > 0) emith_write_r_r_offs(r, SP, 0+16); \ } while (0) @@ -1418,7 +1486,7 @@ static int emith_cond_check(int cond, int *r) #define emith_pop_and_ret(r) do { \ if ((r) > 0) emith_read_r_r_offs(r, SP, 0+16); \ emith_read_r_r_offs(LR, SP, 4+16); \ - emith_add_r_imm(SP, 8+16); \ + emith_add_r_r_ptr_imm(SP, SP, 8+16); \ emith_ret(); \ } while (0) @@ -1436,7 +1504,7 @@ static int emith_cond_check(int cond, int *r) int _c; u32 _m = 0xd0ff0000; \ if (__builtin_parity(_m) == 1) _m |= 0x1; /* ABI align for SP is 8 */ \ int _s = count_bits(_m) * 4 + 16, _o = _s; /* 16 byte arg save area */ \ - if (_s) emith_sub_r_imm(SP, _s); \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, -_s); \ for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ if (_m & (1 << _c)) \ { _o -= 4; if (_c) emith_write_r_r_offs(_c, SP, _o); } \ @@ -1448,23 +1516,23 @@ static int emith_cond_check(int cond, int *r) for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ if (_m & (1 << _c)) \ { if (_c) emith_read_r_r_offs(_c, SP, _o); _o += 4; } \ - if (_s) emith_add_r_imm(SP, _s); \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, _s); \ emith_ret(); \ 
} while (0) // NB: assumes a is in arg0, tab, func and mask are temp #define emith_sh2_rcall(a, tab, func, mask) do { \ emith_lsr(mask, a, SH2_READ_SHIFT); \ - emith_add_r_r_r_lsl_ptr(tab, tab, mask, 3); \ + emith_add_r_r_r_lsl_ptr(tab, tab, mask, PTR_SCALE+1); \ emith_read_r_r_offs_ptr(func, tab, 0); \ - emith_read_r_r_offs(mask, tab, 4); \ - emith_addf_r_r_r/*_ptr*/(func, func, func); \ + emith_read_r_r_offs(mask, tab, (1 << PTR_SCALE)); \ + emith_addf_r_r_r_ptr(func, func, func); \ } while (0) // NB: assumes a, val are in arg0 and arg1, tab and func are temp #define emith_sh2_wcall(a, val, tab, func) do { \ emith_lsr(func, a, SH2_WRITE_SHIFT); \ - emith_lsl(func, func, 2); \ + emith_lsl(func, func, PTR_SCALE); \ emith_read_r_r_r_ptr(func, tab, func); \ emith_move_r_r_ptr(6, CONTEXT_REG); /* arg2 */ \ emith_jump_reg(func); \ diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index e7284499c..9ed8b5638 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -448,11 +448,11 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common // fake conditionals (using SJMP instead) #define emith_move_r_imm_c(cond, r, imm) \ - emith_move_r_imm(r, imm); + emith_move_r_imm(r, imm) #define emith_add_r_imm_c(cond, r, imm) \ - emith_add_r_imm(r, imm); + emith_add_r_imm(r, imm) #define emith_sub_r_imm_c(cond, r, imm) \ - emith_sub_r_imm(r, imm); + emith_sub_r_imm(r, imm) #define emith_or_r_imm_c(cond, r, imm) \ emith_or_r_imm(r, imm) #define emith_eor_r_imm_c(cond, r, imm) \ @@ -468,11 +468,11 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define emith_ror_c(cond, d, s, cnt) \ emith_ror(d, s, cnt) #define emith_and_r_r_c(cond, d, s) \ - emith_and_r_r(d, s); + emith_and_r_r(d, s) #define emith_add_r_r_imm_c(cond, d, s, imm) \ - emith_add_r_r_imm(d, s, imm); + emith_add_r_r_imm(d, s, imm) #define emith_sub_r_r_imm_c(cond, d, s, imm) \ - emith_sub_r_r_imm(d, s, imm); + emith_sub_r_r_imm(d, s, imm) #define emith_read8_r_r_r_c(cond, 
r, rs, rm) \ emith_read8_r_r_r(r, rs, rm) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 2320c5010..d1cde69ef 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -47,13 +47,9 @@ #define LOOP_OPTIMIZER 1 #define T_OPTIMIZER 1 -// limits (per block) -#define MAX_BLOCK_SIZE (BLOCK_INSN_LIMIT * 6 * 6) - -// max literal offset from the block end #define MAX_LITERAL_OFFSET 0x200 // max. MOVA, MOV @(PC) offset -#define MAX_LITERALS (BLOCK_INSN_LIMIT / 4) -#define MAX_LOCAL_BRANCHES (BLOCK_INSN_LIMIT / 4) +#define MAX_LOCAL_TARGETS (BLOCK_INSN_LIMIT / 4) +#define MAX_LOCAL_BRANCHES (BLOCK_INSN_LIMIT / 2) // debug stuff // 01 - warnings/errors @@ -294,7 +290,7 @@ struct block_link { u32 target_pc; void *jump; // insn address void *blx; // block link/exit area if any - u8 jdisp[8]; // jump backup buffer + u8 jdisp[12]; // jump backup buffer struct block_link *next; // either in block_entry->links or unresolved struct block_link *o_next; // ...in block_entry->o_links struct block_link *prev; @@ -443,6 +439,8 @@ static void rcache_free_tmp(int hr); #include "../drc/emit_arm64.c" #elif defined(__mips__) #include "../drc/emit_mips.c" +#elif defined(__riscv__) || defined(__riscv) +#include "../drc/emit_riscv.c" #elif defined(__i386__) #include "../drc/emit_x86.c" #elif defined(__x86_64__) @@ -1207,45 +1205,10 @@ static void dr_flush_tcache(int tcid) static void *dr_failure(void) { - lprintf("recompilation failed\n"); + printf("recompilation failed\n"); exit(1); } -#define ADD_TO_ARRAY(array, count, item, failcode) { \ - if (count >= ARRAY_SIZE(array)) { \ - dbg(1, "warning: " #array " overflow"); \ - failcode; \ - } else \ - array[count++] = item; \ -} - -static inline int find_in_array(u32 *array, size_t size, u32 what) -{ - size_t i; - for (i = 0; i < size; i++) - if (what == array[i]) - return i; - - return -1; -} - -static int find_in_sorted_array(u32 *array, size_t size, u32 what) -{ - // binary search in sorted array - int left = 0, right = size-1; - 
while (left <= right) - { - int middle = (left + right) / 2; - if (array[middle] == what) - return middle; - else if (array[middle] < what) - left = middle + 1; - else - right = middle - 1; - } - return -1; -} - // --------------------------------------------------------------- // NB rcache allocation dependencies: @@ -2868,6 +2831,88 @@ static void emit_do_static_regs(int is_write, int tmpr) } } +// block local link stuff +struct linkage { + u32 pc; + void *ptr; + struct block_link *bl; + u32 mask; +}; + +static inline int find_in_linkage(const struct linkage *array, int size, u32 pc) +{ + size_t i; + for (i = 0; i < size; i++) + if (pc == array[i].pc) + return i; + + return -1; +} + +static int find_in_sorted_linkage(const struct linkage *array, int size, u32 pc) +{ + // binary search in sorted array + int left = 0, right = size-1; + while (left <= right) + { + int middle = (left + right) / 2; + if (array[middle].pc == pc) + return middle; + else if (array[middle].pc < pc) + left = middle + 1; + else + right = middle - 1; + } + return -1; +} + +static void emit_branch_linkage_code(SH2 *sh2, struct block_desc *block, int tcache_id, + const struct linkage *targets, int target_count, + const struct linkage *links, int link_count) +{ + struct block_link *bl; + int u, v, tmp; + + for (u = 0; u < link_count; u++) { + emith_pool_check(); + // look up local branch targets + v = find_in_sorted_linkage(targets, target_count, links[u].pc); + if (v >= 0) { + if (! 
targets[v].ptr) { + // forward branch not yet resolved, prepare external linking + emith_jump_patch(links[u].ptr, tcache_ptr, NULL); + bl = dr_prepare_ext_branch(block->entryp, links[u].pc, sh2->is_slave, tcache_id); + if (bl) { + emith_flush(); // flush to inhibit insn swapping + bl->type = BL_LDJMP; + } + + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, links[u].pc); + rcache_free_tmp(tmp); + emith_jump_patchable(sh2_drc_dispatcher); + } else if (emith_jump_patch_inrange(links[u].ptr, targets[v].ptr)) { + // inrange local branch + emith_jump_patch(links[u].ptr, targets[v].ptr, NULL); + } else { + // far local branch + emith_jump_patch(links[u].ptr, tcache_ptr, NULL); + emith_jump(targets[v].ptr); + } + } else { + // external or exit, emit blx area entry + void *target = (links[u].pc & 1 ? sh2_drc_exit : sh2_drc_dispatcher); + if (links[u].bl) + links[u].bl->blx = tcache_ptr; + emith_jump_patch(links[u].ptr, tcache_ptr, NULL); + tmp = rcache_get_tmp_arg(0); + emith_move_r_imm(tmp, links[u].pc & ~1); + rcache_free_tmp(tmp); + emith_jump(target); + } + } +} + #define DELAY_SAVE_T(sr) { \ int t_ = rcache_get_tmp(); \ emith_bic_r_imm(sr, T_save); \ @@ -2887,17 +2932,10 @@ static void *dr_get_pc_base(u32 pc, SH2 *sh2); static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { // branch targets in current block - u32 branch_target_pc[MAX_LOCAL_BRANCHES]; - void *branch_target_ptr[MAX_LOCAL_BRANCHES]; + struct linkage branch_targets[MAX_LOCAL_TARGETS]; int branch_target_count = 0; - // unresolved local forward branches, for fixup at block end - u32 branch_patch_pc[MAX_LOCAL_BRANCHES]; - void *branch_patch_ptr[MAX_LOCAL_BRANCHES]; - int branch_patch_count = 0; - // external branch targets with a block link/exit area - u32 blx_target_pc[MAX_LOCAL_BRANCHES]; - void *blx_target_ptr[MAX_LOCAL_BRANCHES]; - struct block_link *blx_target_bl[MAX_LOCAL_BRANCHES]; + // unresolved local or external targets with block link/exit area if needed + struct linkage 
blx_targets[MAX_LOCAL_BRANCHES]; int blx_target_count = 0; u8 op_flags[BLOCK_INSN_LIMIT]; @@ -2906,6 +2944,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) int delay_reg:8; u32 loop_type:8; u32 polling:8; + u32 pinning:1; u32 test_irq:1; u32 pending_branch_direct:1; u32 pending_branch_indirect:1; @@ -2914,23 +2953,20 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #if LOOP_OPTIMIZER // loops with pinned registers for optimzation // pinned regs are like statics and don't need saving/restoring inside a loop - u32 pinned_loop_pc[MAX_LOCAL_BRANCHES/16]; - void *pinned_loop_ptr[MAX_LOCAL_BRANCHES/16]; - u32 pinned_loop_mask[MAX_LOCAL_BRANCHES/16]; + struct linkage pinned_loops[MAX_LOCAL_TARGETS/16]; int pinned_loop_count = 0; #endif // PC of current, first, last SH2 insn u32 pc, base_pc, end_pc; u32 base_literals, end_literals; - void *block_entry_ptr; + u8 *block_entry_ptr; struct block_desc *block; struct block_entry *entry; struct block_link *bl; u16 *dr_pc_base; struct op_data *opd; int blkid_main = 0; - int skip_op = 0; int tmp, tmp2; int cycles; int i, v; @@ -2971,8 +3007,15 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) for (pc = base_pc, i = 0; pc < end_pc; i++, pc += 2) { if (op_flags[i] & OF_DELAY_OP) op_flags[i] &= ~OF_BTARGET; - if (op_flags[i] & OF_BTARGET) - ADD_TO_ARRAY(branch_target_pc, branch_target_count, pc, ); + if (op_flags[i] & OF_BTARGET) { + if (branch_target_count < ARRAY_SIZE(branch_targets)) + branch_targets[branch_target_count++] = (struct linkage) { .pc = pc }; + else { + printf("warning: linkage overflow\n"); + end_pc = pc; + break; + } + } if (ops[i].op == OP_LDC && (ops[i].dest & BITMASK1(SHR_SR)) && pc+2 < end_pc) op_flags[i+1] |= OF_BTARGET; // RTE entrypoint in case of SR.IMASK change // unify T and SR since rcache doesn't know about "virtual" guest regs @@ -3040,9 +3083,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (op_flags[v] & OF_BASIC_LOOP) { m3 &= 
~rcache_regs_static & ~BITMASK5(SHR_PC, SHR_PR, SHR_SR, SHR_T, SHR_MEM); if (m3 && count_bits(m3) < count_bits(rcache_vregs_reg) && - pinned_loop_count < ARRAY_SIZE(pinned_loop_pc)-1) { - pinned_loop_mask[pinned_loop_count] = m3; - pinned_loop_pc[pinned_loop_count++] = base_pc + 2*v; + pinned_loop_count < ARRAY_SIZE(pinned_loops)-1) { + pinned_loops[pinned_loop_count++] = + (struct linkage) { .mask = m3, .pc = base_pc + 2*v }; } else op_flags[v] &= ~OF_BASIC_LOOP; } @@ -3052,10 +3095,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #endif } - if (branch_target_count > 0) { - memset(branch_target_ptr, 0, sizeof(branch_target_ptr[0]) * branch_target_count); - } - tcache_ptr = dr_prepare_cache(tcache_id, (end_pc - base_pc) / 2); #if (DRC_DEBUG & 4) tcache_dsm_ptrs[tcache_id] = tcache_ptr; @@ -3076,7 +3115,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_invalidate_t(); drcf = (struct drcf) { 0 }; #if LOOP_OPTIMIZER - pinned_loop_pc[pinned_loop_count] = -1; + pinned_loops[pinned_loop_count].pc = -1; pinned_loop_count = 0; #endif @@ -3090,24 +3129,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) int tmp3, tmp4; int sr; - opd = &ops[i]; - op = FETCH_OP(pc); - -#if (DRC_DEBUG & 2) - insns_compiled++; -#endif -#if (DRC_DEBUG & 4) - DasmSH2(sh2dasm_buff, pc, op); - if (op_flags[i] & OF_BTARGET) { - if ((op_flags[i] & OF_LOOP) == OF_DELAY_LOOP) tmp3 = '+'; - else if ((op_flags[i] & OF_LOOP) == OF_POLL_LOOP) tmp3 = '='; - else if ((op_flags[i] & OF_LOOP) == OF_IDLE_LOOP) tmp3 = '~'; - else tmp3 = '*'; - } else if (drcf.loop_type) tmp3 = '.'; - else tmp3 = ' '; - printf("%c%08x %04x %s\n", tmp3, pc, op, sh2dasm_buff); -#endif - if (op_flags[i] & OF_BTARGET) { if (pc != base_pc) @@ -3143,9 +3164,9 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) break; } - v = find_in_sorted_array(branch_target_pc, branch_target_count, pc); + v = find_in_sorted_linkage(branch_targets, branch_target_count, pc); if (v 
>= 0) - branch_target_ptr[v] = tcache_ptr; + branch_targets[v].ptr = tcache_ptr; #if LOOP_DETECTION drcf.loop_type = op_flags[i] & OF_LOOP; drcf.delay_reg = -1; @@ -3176,12 +3197,13 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #if LOOP_OPTIMIZER if (op_flags[i] & OF_BASIC_LOOP) { - if (pinned_loop_pc[pinned_loop_count] == pc) { + if (pinned_loops[pinned_loop_count].pc == pc) { // pin needed regs on loop entry - FOR_ALL_BITS_SET_DO(pinned_loop_mask[pinned_loop_count], v, rcache_pin_reg(v)); + FOR_ALL_BITS_SET_DO(pinned_loops[pinned_loop_count].mask, v, rcache_pin_reg(v)); emith_flush(); // store current PC as loop target - pinned_loop_ptr[pinned_loop_count] = tcache_ptr; + pinned_loops[pinned_loop_count].ptr = tcache_ptr; + drcf.pinning = 1; } else op_flags[i] &= ~OF_BASIC_LOOP; } @@ -3193,11 +3215,10 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) EMITH_JMP_START(DCOND_GT); rcache_save_pinned(); - if (blx_target_count < ARRAY_SIZE(blx_target_pc)) { + if (blx_target_count < ARRAY_SIZE(blx_targets)) { // exit via stub in blx table (saves some 1-3 insns in the main flow) - blx_target_ptr[blx_target_count] = tcache_ptr; - blx_target_pc[blx_target_count] = pc|1; - blx_target_bl[blx_target_count++] = NULL; + blx_targets[blx_target_count++] = + (struct linkage) { .ptr = tcache_ptr, .pc = pc|1, .bl = NULL }; emith_jump_patchable(tcache_ptr); } else { // blx table full, must inline exit code @@ -3210,12 +3231,11 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } else #endif { - if (blx_target_count < ARRAY_SIZE(blx_target_pc)) { + if (blx_target_count < ARRAY_SIZE(blx_targets)) { // exit via stub in blx table (saves some 1-3 insns in the main flow) - blx_target_pc[blx_target_count] = pc|1; - blx_target_bl[blx_target_count] = NULL; emith_cmp_r_imm(sr, 0); - blx_target_ptr[blx_target_count++] = tcache_ptr; + blx_targets[blx_target_count++] = + (struct linkage) { .ptr = tcache_ptr, .pc = pc|1, .bl = NULL }; 
emith_jump_cond_patchable(DCOND_LE, tcache_ptr); } else { // blx table full, must inline exit code @@ -3282,13 +3302,40 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } #endif + // emit blx area if limits are approached + if (blx_target_count && (blx_target_count > ARRAY_SIZE(blx_targets)-4 || + !emith_jump_patch_inrange(blx_targets[0].ptr, tcache_ptr+0x100))) { + u8 *jp; + rcache_invalidate_tmp(); + jp = tcache_ptr; + emith_jump_patchable(tcache_ptr); + emit_branch_linkage_code(sh2, block, tcache_id, branch_targets, + branch_target_count, blx_targets, blx_target_count); + blx_target_count = 0; + do_host_disasm(tcache_id); + emith_jump_patch(jp, tcache_ptr, NULL); + } + emith_pool_check(); - pc += 2; - if (skip_op > 0) { - skip_op--; - continue; - } + opd = &ops[i]; + op = FETCH_OP(pc); +#if (DRC_DEBUG & 4) + DasmSH2(sh2dasm_buff, pc, op); + if (op_flags[i] & OF_BTARGET) { + if ((op_flags[i] & OF_LOOP) == OF_DELAY_LOOP) tmp3 = '+'; + else if ((op_flags[i] & OF_LOOP) == OF_POLL_LOOP) tmp3 = '='; + else if ((op_flags[i] & OF_LOOP) == OF_IDLE_LOOP) tmp3 = '~'; + else tmp3 = '*'; + } else if (drcf.loop_type) tmp3 = '.'; + else tmp3 = ' '; + printf("%c%08x %04x %s\n", tmp3, pc, op, sh2dasm_buff); +#endif + + pc += 2; +#if (DRC_DEBUG & 2) + insns_compiled++; +#endif if (op_flags[i] & OF_DELAY_OP) { @@ -4422,7 +4469,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emit_sync_t_to_sr(); emith_sh2_delay_loop(cycles, drcf.delay_reg); rcache_unlock_all(); // may lock delay_reg - drcf.polling = drcf.loop_type = 0; + drcf.polling = drcf.loop_type = drcf.pinning = 0; } #endif @@ -4464,33 +4511,39 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_sync_t(sr); // no modification of host status/flags between here and branching! 
- v = find_in_sorted_array(branch_target_pc, branch_target_count, target_pc); + v = find_in_sorted_linkage(branch_targets, branch_target_count, target_pc); if (v >= 0) { // local branch - if (branch_target_ptr[v]) { + if (branch_targets[v].ptr) { // local backward jump, link here now since host PC is already known - target = branch_target_ptr[v]; + target = branch_targets[v].ptr; #if LOOP_OPTIMIZER - if (pinned_loop_pc[pinned_loop_count] == target_pc) { + if (pinned_loops[pinned_loop_count].pc == target_pc) { // backward jump at end of optimized loop rcache_unpin_all(); - target = pinned_loop_ptr[pinned_loop_count]; + target = pinned_loops[pinned_loop_count].ptr; pinned_loop_count ++; } #endif - if (cond != -1) - emith_jump_cond(cond, target); - else { + if (cond != -1) { + if (emith_jump_patch_inrange(tcache_ptr, target)) { + emith_jump_cond(cond, target); + } else { + // not reachable directly, must use far branch + EMITH_JMP_START(emith_invert_cond(cond)); + emith_jump(target); + EMITH_JMP_END(emith_invert_cond(cond)); + } + } else { emith_jump(target); rcache_invalidate(); } - } else if (branch_patch_count < MAX_LOCAL_BRANCHES) { + } else if (blx_target_count < MAX_LOCAL_BRANCHES) { // local forward jump target = tcache_ptr; - branch_patch_pc[branch_patch_count] = target_pc; - branch_patch_ptr[branch_patch_count] = target; - branch_patch_count++; + blx_targets[blx_target_count++] = + (struct linkage) { .pc = target_pc, .ptr = target, .bl = NULL }; if (cond != -1) emith_jump_cond_patchable(cond, target); else { @@ -4498,7 +4551,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_invalidate(); } } else - dbg(1, "warning: too many local branches"); + dbg(1, "warning: too many unresolved branches"); } if (target == NULL) @@ -4507,13 +4560,12 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) bl = dr_prepare_ext_branch(block->entryp, target_pc, sh2->is_slave, tcache_id); if (cond != -1) { #if 1 - if (bl && blx_target_count < 
ARRAY_SIZE(blx_target_pc)) { + if (bl && blx_target_count < ARRAY_SIZE(blx_targets)) { // conditional jumps get a blx stub for the far jump - blx_target_pc[blx_target_count] = target_pc; - blx_target_bl[blx_target_count] = bl; - blx_target_ptr[blx_target_count++] = tcache_ptr; bl->type = BL_JCCBLX; target = tcache_ptr; + blx_targets[blx_target_count++] = + (struct linkage) { .pc = target_pc, .ptr = target, .bl = bl }; emith_jump_cond_patchable(cond, target); } else { // not linkable, or blx table full; inline jump @dispatcher @@ -4660,44 +4712,15 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } else rcache_flush(); - // emit blx area - for (i = 0; i < blx_target_count; i++) { - void *target = (blx_target_pc[i] & 1 ? sh2_drc_exit : sh2_drc_dispatcher); - - emith_pool_check(); - bl = blx_target_bl[i]; - if (bl) - bl->blx = tcache_ptr; - emith_jump_patch(blx_target_ptr[i], tcache_ptr, NULL); - tmp = rcache_get_tmp_arg(0); - emith_move_r_imm(tmp, blx_target_pc[i] & ~1); - emith_jump(target); - rcache_invalidate(); - } + // link unresolved branches, emitting blx area entries as needed + emit_branch_linkage_code(sh2, block, tcache_id, branch_targets, + branch_target_count, blx_targets, blx_target_count); emith_flush(); do_host_disasm(tcache_id); emith_pool_commit(0); - // link local branches - for (i = 0; i < branch_patch_count; i++) { - void *target; - int t; - t = find_in_sorted_array(branch_target_pc, branch_target_count, branch_patch_pc[i]); - target = branch_target_ptr[t]; - if (target == NULL) { - // flush pc and go back to dispatcher (this should no longer happen) - dbg(1, "stray branch to %08x %p", branch_patch_pc[i], tcache_ptr); - target = tcache_ptr; - tmp = rcache_get_tmp_arg(0); - emith_move_r_imm(tmp, branch_patch_pc[i]); - emith_jump(sh2_drc_dispatcher); - rcache_flush(); - } - emith_jump_patch(branch_patch_ptr[i], target, NULL); - } - // fill blx backup; do this last to backup final patched code for (i = 0; i < block->entry_count; i++) 
for (bl = block->entryp[i].o_links; bl; bl = bl->o_next) @@ -4927,7 +4950,7 @@ static void sh2_generate_utils(void) // pc = sh2_drc_dispatcher_call(u32 pc) sh2_drc_dispatcher_call = (void *)tcache_ptr; emith_ctx_read(arg2, offsetof(SH2, rts_cache_idx)); - emith_add_r_imm(arg2, 2*sizeof(void *)); + emith_add_r_imm(arg2, (u32)(2*sizeof(void *))); emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *)); emith_ctx_write(arg2, offsetof(SH2, rts_cache_idx)); emith_add_r_r_r_lsl_ptr(arg2, CONTEXT_REG, arg2, 0); @@ -4957,7 +4980,7 @@ static void sh2_generate_utils(void) emith_jump_cond(DCOND_NE, sh2_drc_dispatcher); #endif emith_read_r_r_offs_ptr(arg0, arg1, offsetof(SH2, rts_cache) + sizeof(void *)); - emith_sub_r_imm(arg2, 2*sizeof(void *)); + emith_sub_r_imm(arg2, (u32)(2*sizeof(void *))); emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *)); emith_ctx_write(arg2, offsetof(SH2, rts_cache_idx)); #if (DRC_DEBUG & 128) diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 415f01ba2..44620f489 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -41,6 +41,8 @@ unsigned short scan_block(unsigned int base_pc, int is_slave, #define DRC_SR_REG "r28" #elif defined(__mips__) #define DRC_SR_REG "s6" +#elif defined(__riscv__) || defined(__riscv) +#define DRC_SR_REG "s11" #elif defined(__i386__) #define DRC_SR_REG "edi" #elif defined(__x86_64__) diff --git a/pico/32x/32x.c b/pico/32x/32x.c index 7e2e039e5..896b5aa1a 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -506,7 +506,7 @@ void sync_sh2s_normal(unsigned int m68k_target) if (CYCLES_GT(now, ssh2.m68krcycles_done)) now = ssh2.m68krcycles_done; } - if (now - timer_cycles >= STEP_N) { + if (CYCLES_GT(now, timer_cycles+STEP_N)) { p32x_timers_do(now - timer_cycles); timer_cycles = now; } diff --git a/platform/common/dismips.c b/platform/common/dismips.c index 41c0f7a55..f9888f2a5 100644 --- a/platform/common/dismips.c +++ b/platform/common/dismips.c @@ -127,7 +127,7 @@ static 
const struct insn special_insns[] = { {0x38, S_IMM_DT, "dsll"}, {0x3A, S_IMM_DT, "dsrl"}, {0x3B, S_IMM_DT, "dsra"}, - {0x3D, S_IMM_DT, "dsll32"}, + {0x3C, S_IMM_DT, "dsll32"}, {0x3E, S_IMM_DT, "dsrl32"}, {0x3F, S_IMM_DT, "dsra32"}, }; diff --git a/platform/linux/emu.c b/platform/linux/emu.c index 887d78360..936652631 100644 --- a/platform/linux/emu.c +++ b/platform/linux/emu.c @@ -29,7 +29,7 @@ void pemu_prep_defconfig(void) void pemu_validate_config(void) { -#if !defined(__arm__) && !defined(__aarch64__) && !defined(__mips__) && !defined(__i386__) && !defined(__x86_64__) +#if !defined(__arm__) && !defined(__aarch64__) && !defined(__mips__) && !defined(__riscv__) && !defined(__riscv) && !defined(__i386__) && !defined(__x86_64__) PicoIn.opt &= ~POPT_EN_DRC; #endif } diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index 8a0557c7d..349b8605c 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ -16,7 +16,7 @@ compile_rodata () # $CC $CFLAGS -I .. -nostdlib -Wl,-edummy /tmp/getoffs.c \ # -o /tmp/getoffs.o || exit 1 # find the name of the .rodata section (in case -fdata-sections is used) - rosect=$(readelf -S /tmp/getoffs.o | grep '\.rodata' | + rosect=$(readelf -S /tmp/getoffs.o | grep '\.rodata\|\.sdata' | sed 's/^[^.]*././;s/ .*//') # read out .rodata section as hex string (should be only 4 or 8 bytes) ro=$(readelf -x $rosect /tmp/getoffs.o | grep '0x' | cut -c14-48 | From ef528087e705f89c4f86e4aa6f12b07cf6643e43 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 13 Nov 2019 21:58:48 +0100 Subject: [PATCH 079/174] sh2 drc: RISC-V (RV64IM) code emitter, some work on MIPS64 --- cpu/drc/emit_riscv.c | 1579 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1579 insertions(+) create mode 100644 cpu/drc/emit_riscv.c diff --git a/cpu/drc/emit_riscv.c b/cpu/drc/emit_riscv.c new file mode 100644 index 000000000..84c3ccb2c --- /dev/null +++ b/cpu/drc/emit_riscv.c @@ -0,0 +1,1579 @@ +/* + * Basic macros to emit RISC-V RV64IM instructions and some utils + * 
Copyright (C) 2019 kub + * + * This work is licensed under the terms of MAME license. + * See COPYING file in the top-level directory. + */ +#define HOST_REGS 32 + +// RISC-V ABI: params: x10-x17, return: x10-x11, temp: x1(ra),x5-x7,x28-x31 +// saved: x8(fp),x9,x18-x27, reserved: x0(zero), x4(tp), x3(gp), x2(sp) +// x28-x31(t3-t6) are used internally by the code emitter +#define RET_REG 10 // a0 +#define PARAM_REGS { 10, 11, 12, 13, 14, 15, 16, 17 } // a0-a7 +#define PRESERVED_REGS { 9, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 } // s1-s11 +#define TEMPORARY_REGS { 5, 6, 7 } // t0-t2 + +#define CONTEXT_REG 9 // s1 +#define STATIC_SH2_REGS { SHR_SR,27 , SHR_R0,26 , SHR_R0+1,25 } // host regs s11, s10, s9 + +// registers usable for user code: r1-r25, others reserved or special +#define Z0 0 // zero register +#define GP 3 // global pointer +#define SP 2 // stack pointer +#define FP 8 // frame pointer +#define LR 1 // link register +// internally used by code emitter: +#define AT 31 // used to hold intermediate results +#define FNZ 30 // emulated processor flags: N (bit 31), Z (all bits) +#define FC 29 // emulated processor flags: C (bit 0), others 0 +#define FV 28 // emulated processor flags: Nt^Ns (bit 31).
others x + + +// unified conditions; virtual, not corresponding to anything real on RISC-V +#define DCOND_EQ 0x0 +#define DCOND_NE 0x1 +#define DCOND_HS 0x2 +#define DCOND_LO 0x3 +#define DCOND_MI 0x4 +#define DCOND_PL 0x5 +#define DCOND_VS 0x6 +#define DCOND_VC 0x7 +#define DCOND_HI 0x8 +#define DCOND_LS 0x9 +#define DCOND_GE 0xa +#define DCOND_LT 0xb +#define DCOND_GT 0xc +#define DCOND_LE 0xd + +#define DCOND_CS DCOND_LO +#define DCOND_CC DCOND_HS + +// unified insn +#define R5_INSN(b25, b20, b15, b12, b7, op) \ + (((b25)<<25)|((b20)<<20)|((b15)<<15)|((b12)<<12)|((b7)<<7)|((op)<<0)) + +#define _ 0 //marker for "field unused" +#define _CB(v,l,s,d) ((((v)>>(s))&((1<<(l))-1))<<(d)) // copy l bits + +#define R5_R_INSN(op, f1, f2, rd, rs, rt) \ + R5_INSN(f2, rt, rs, f1, rd, op) +#define R5_I_INSN(op, f1, rd, rs, imm) \ + R5_INSN(_, _CB(imm,12,0,0), rs, f1, rd, op) +#define R5_S_INSN(op, f1, rt, rs, imm) \ + R5_INSN(_CB(imm,7,5,0), rt, rs, f1, _CB(imm,5,0,0), op) +#define R5_U_INSN(op, rd, imm) \ + R5_INSN(_,_,_, _CB(imm,20,12,0), rd, op) +// oy vey... 
R5 immediate encoding in branches is really unwieldy :-/ +#define R5_B_INSN(op, f1, rt, rs, imm) \ + R5_INSN(_CB(imm,1,12,6)|_CB(imm,6,5,0), rt, rs, f1, \ + _CB(imm,4,1,1)|_CB(imm,1,11,0), op) +#define R5_J_INSN(op, rd, imm) \ + R5_INSN(_CB(imm,1,20,6)|_CB(imm,6,5,0), _CB(imm,4,1,1)|_CB(imm,1,11,0),\ + _CB(imm,8,12,0), rd, op) + +// opcode +enum { OP_LUI=0x37, OP_JAL=0x6f, OP_JALR=0x67, OP_BCOND=0x63, OP_LD=0x03, + OP_ST=0x23, OP_IMM=0x13, OP_IMM32=0x1b, OP_REG=0x33, OP_REG32=0x3b }; +// func3 +enum { F1_ADD, F1_SL, F1_SLT, F1_SLTU, F1_XOR, F1_SR, F1_OR, F1_AND }; +enum { F1_BEQ, F1_BNE, F1_BLT=4, F1_BGE, F1_BLTU, F1_BGEU }; +enum { F1_B, F1_H, F1_W, F1_D, F1_BU, F1_HU, F1_WU }; +enum { F1_MUL, F1_MULH, F1_MULHSU, F1_MULHU, F1_DIV, F1_DIVU, F1_REM, F1_REMU }; +// func7 +enum { F2_ALT=0x20, F2_MULDIV=0x01 }; + +#define __(n) o##n // enum marker for "undefined" + +#define R5_NOP R5_I_INSN(OP_IMM, F1_ADD, Z0, Z0, 0) // nop: ADDI r0, r0, #0 + +// arithmetic/logical + +// rd = rs OP rt +#define R5_ADD_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_ADD, _, rd, rs, rt) +#define R5_SUB_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_ADD, F2_ALT, rd, rs, rt) + +#define R5_NEG_REG(rd, rt) \ + R5_SUB_REG(rd, Z0, rt) + +#define R5_XOR_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_XOR, _, rd, rs, rt) +#define R5_OR_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_OR , _, rd, rs, rt) +#define R5_AND_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_AND, _, rd, rs, rt) + +// rd = rs SHIFT rt +#define R5_LSL_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_SL , _, rd, rs, rt) +#define R5_LSR_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_SR , _, rd, rs, rt) +#define R5_ASR_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_SR , F2_ALT, rd, rs, rt) + +// rd = (rs < rt) +#define R5_SLT_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_SLT, _, rd, rs, rt) +#define R5_SLTU_REG(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_SLTU,_, rd, rs, rt) + +// rd = rs OP imm12 +#define R5_ADD_IMM(rd, rs, imm12) \ + R5_I_INSN(OP_IMM, F1_ADD , rd, rs, imm12) + +#define 
R5_XOR_IMM(rd, rs, imm12) \ + R5_I_INSN(OP_IMM, F1_XOR , rd, rs, imm12) +#define R5_OR_IMM(rd, rs, imm12) \ + R5_I_INSN(OP_IMM, F1_OR , rd, rs, imm12) +#define R5_AND_IMM(rd, rs, imm12) \ + R5_I_INSN(OP_IMM, F1_AND , rd, rs, imm12) + +#define R5_MOV_REG(rd, rs) \ + R5_ADD_IMM(rd, rs, 0) +#define R5_MVN_REG(rd, rs) \ + R5_XOR_IMM(rd, rs, -1) + +// rd = (imm12 << (0|12)) +#define R5_MOV_IMM(rd, imm12) \ + R5_OR_IMM(rd, Z0, imm12) +#define R5_MOVT_IMM(rd, imm20) \ + R5_U_INSN(OP_LUI, rd, imm20) + +// rd = rs SHIFT imm5/imm6 +#define R5_LSL_IMM(rd, rs, bits) \ + R5_R_INSN(OP_IMM, F1_SL , _, rd, rs, bits) +#define R5_LSR_IMM(rd, rs, bits) \ + R5_R_INSN(OP_IMM, F1_SR , _, rd, rs, bits) +#define R5_ASR_IMM(rd, rs, bits) \ + R5_R_INSN(OP_IMM, F1_SR , F2_ALT, rd, rs, bits) + +// rd = (rs < imm12) +#define R5_SLT_IMM(rd, rs, imm12) \ + R5_I_INSN(OP_IMM, F1_SLT , rd, rs, imm12) +#define R5_SLTU_IMM(rd, rs, imm12) \ + R5_I_INSN(OP_IMM, F1_SLTU, rd, rs, imm12) + +// multiplication + +#define R5_MULHU(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_MULHU, F2_MULDIV, rd, rs, rt) +#define R5_MULHS(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_MULH, F2_MULDIV, rd, rs, rt) +#define R5_MUL(rd, rs, rt) \ + R5_R_INSN(OP_REG, F1_MUL, F2_MULDIV, rd, rs, rt) + +// branching + +#define R5_J(imm20) \ + R5_J_INSN(OP_JAL, Z0, imm20) +#define R5_JAL(rd, imm20) \ + R5_J_INSN(OP_JAL, rd, imm20) +#define R5_JR(rs, offs12) \ + R5_I_INSN(OP_JALR, _, Z0, rs, offs12) +#define R5_JALR(rd, rs, offs12) \ + R5_I_INSN(OP_JALR, _, rd, rs, offs12) + +// conditional branches; no condition code, these compare rs against rt +#define R5_BCOND(cond, rs, rt, offs13) \ + R5_B_INSN(OP_BCOND, cond, rt, rs, offs13) +#define R5_BCONDZ(cond, rs, offs13) \ + R5_B_INSN(OP_BCOND, cond, Z0, rs, offs13) +#define R5_B(offs13) \ + R5_BCOND(F1_BEQ, Z0, Z0, offs13) + +// load/store indexed base + +#define R5_LW(rd, rs, offs12) \ + R5_I_INSN(OP_LD, F1_W, rd, rs, offs12) +#define R5_LH(rd, rs, offs12) \ + R5_I_INSN(OP_LD, F1_H, rd, rs, offs12) 
+#define R5_LB(rd, rs, offs12) \ + R5_I_INSN(OP_LD, F1_B, rd, rs, offs12) +#define R5_LHU(rd, rs, offs12) \ + R5_I_INSN(OP_LD, F1_HU, rd, rs, offs12) +#define R5_LBU(rd, rs, offs12) \ + R5_I_INSN(OP_LD, F1_BU, rd, rs, offs12) + +#define R5_SW(rt, rs, offs12) \ + R5_S_INSN(OP_ST, F1_W, rt, rs, offs12) +#define R5_SH(rt, rs, offs12) \ + R5_S_INSN(OP_ST, F1_H, rt, rs, offs12) +#define R5_SB(rt, rs, offs12) \ + R5_S_INSN(OP_ST, F1_B, rt, rs, offs12) + +// pointer operations + +#if __riscv_xlen == 64 +#define R5_OP32 (OP_REG32 ^ OP_REG) +#define F1_P F1_D +#define PTR_SCALE 3 + +// NB: must split 64 bit result into 2 32 bit registers +// NB: this expects 32 bit values in s1+s2, correctly sign extended to 64 bits +// unsigned: a plain MUL of sign extended operands gives the signed product, so +// shift both operands into the upper halves and take MULHU; this yields the +// exact unsigned 64 bit product whatever the upper operand bits are. +// clobbers AT; dlo is used as scratch, so dlo must not be AT (dhi may be). +// NOTE(review): results are left zero extended in dlo/dhi as before - confirm +// this is what the users of the result expect. +#define EMIT_R5_MULLU_REG(dlo, dhi, s1, s2) do { \ + EMIT(R5_LSL_IMM(AT, s1, 32)); \ + EMIT(R5_LSL_IMM(dlo, s2, 32)); \ + EMIT(R5_MULHU(dlo, dlo, AT)); \ + EMIT(R5_LSR_IMM(dhi, dlo, 32)); \ + EMIT(R5_LSL_IMM(dlo, dlo, 32)); \ + EMIT(R5_LSR_IMM(dlo, dlo, 32)); \ +} while (0) + +// signed: MUL of the sign extended operands already is the signed 64 bit product +#define EMIT_R5_MULLS_REG(dlo, dhi, s1, s2) do { \ + EMIT(R5_MUL(dlo, s1, s2)); \ + EMIT(R5_LSR_IMM(dhi, dlo, 32)); \ + EMIT(R5_LSL_IMM(dlo, dlo, 32)); \ + EMIT(R5_LSR_IMM(dlo, dlo, 32)); \ +} while (0) +#else +#define R5_OP32 0 +#define F1_P F1_W +#define PTR_SCALE 2 + +#define EMIT_R5_MULLU_REG(dlo, dhi, s1, s2) do { \ + int at = (dhi == s1 || dhi == s2 ? AT : dhi); \ + EMIT(R5_MULHU(at, s1, s2)); \ + EMIT(R5_MUL(dlo, s1, s2)); \ + if (at != dhi) emith_move_r_r(dhi, at); \ +} while (0) + +#define EMIT_R5_MULLS_REG(dlo, dhi, s1, s2) do { \ + int at = (dhi == s1 || dhi == s2 ?
AT : dhi); \ + EMIT(R5_MULHS(at, s1, s2)); \ + EMIT(R5_MUL(dlo, s1, s2)); \ + if (at != dhi) emith_move_r_r(dhi, at); \ +} while (0) +#endif + +#define R5_ADDW_REG(rd, rs, rt) (R5_ADD_REG(rd, rs, rt)^R5_OP32) +#define R5_SUBW_REG(rd, rs, rt) (R5_SUB_REG(rd, rs, rt)^R5_OP32) +#define R5_LSLW_REG(rd, rs, rt) (R5_LSL_REG(rd, rs, rt)^R5_OP32) +#define R5_LSRW_REG(rd, rs, rt) (R5_LSR_REG(rd, rs, rt)^R5_OP32) +#define R5_ASRW_REG(rd, rs, rt) (R5_ASR_REG(rd, rs, rt)^R5_OP32) + +#define R5_NEGW_REG(rd, rt) (R5_NEG_REG(rd, rt) ^R5_OP32) +#define R5_MULW(rd, rs, rt) (R5_MUL(rd, rs, rt) ^R5_OP32) + +#define R5_ADDW_IMM(rd, rs, imm) (R5_ADD_IMM(rd, rs, imm) ^R5_OP32) +#define R5_LSLW_IMM(rd, rs, bits) (R5_LSL_IMM(rd, rs, bits)^R5_OP32) +#define R5_LSRW_IMM(rd, rs, bits) (R5_LSR_IMM(rd, rs, bits)^R5_OP32) +#define R5_ASRW_IMM(rd, rs, bits) (R5_ASR_IMM(rd, rs, bits)^R5_OP32) + +// XXX: tcache_ptr type for SVP and SH2 compilers differs.. +#define EMIT_PTR(ptr, x) \ + do { \ + *(u32 *)(ptr) = x; \ + ptr = (void *)((u8 *)(ptr) + sizeof(u32)); \ + } while (0) + +#define EMIT(op) \ + do { \ + EMIT_PTR(tcache_ptr, op); \ + COUNT_OP; \ + } while (0) + +// if-then-else conditional execution helpers +#define JMP_POS(ptr) { \ + ptr = tcache_ptr; \ + EMIT(R5_B(0)); \ +} + +#define JMP_EMIT(cond, ptr) { \ + u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr); \ + EMIT_PTR(ptr, R5_BCOND(cond_m, cond_r, cond_s, val_ & 0x00001fff)); \ +} + +#define JMP_EMIT_NC(ptr) { \ + u32 val_ = (u8 *)tcache_ptr - (u8 *)(ptr); \ + EMIT_PTR(ptr, R5_B(val_ & 0x00001fff)); \ +} + +#define EMITH_JMP_START(cond) { \ + int cond_r, cond_s, cond_m = emith_cond_check(cond, &cond_r, &cond_s); \ + u8 *cond_ptr; \ + JMP_POS(cond_ptr) + +#define EMITH_JMP_END(cond) \ + JMP_EMIT(cond, cond_ptr); \ +} + +#define EMITH_JMP3_START(cond) { \ + int cond_r, cond_s, cond_m = emith_cond_check(cond, &cond_r, &cond_s); \ + u8 *cond_ptr, *else_ptr; \ + JMP_POS(cond_ptr) + +#define EMITH_JMP3_MID(cond) \ + JMP_POS(else_ptr); \ + 
JMP_EMIT(cond, cond_ptr); + +#define EMITH_JMP3_END() \ + JMP_EMIT_NC(else_ptr); \ +} + +// "simple" jump (no more than a few insns) +// ARM32 will use conditional instructions here +#define EMITH_SJMP_START EMITH_JMP_START +#define EMITH_SJMP_END EMITH_JMP_END + +#define EMITH_SJMP3_START EMITH_JMP3_START +#define EMITH_SJMP3_MID EMITH_JMP3_MID +#define EMITH_SJMP3_END EMITH_JMP3_END + +#define EMITH_SJMP2_START(cond) \ + EMITH_SJMP3_START(cond) +#define EMITH_SJMP2_MID(cond) \ + EMITH_SJMP3_MID(cond) +#define EMITH_SJMP2_END(cond) \ + EMITH_SJMP3_END() + + +// flag register emulation. this is modelled after arm/x86. +// the FNZ register stores the result of the last flag setting operation for +// N and Z flag, used for EQ,NE,MI,PL branches. +// the FC register stores the C flag (used for HI,HS,LO,LS,CC,CS). +// the FV register stores information for V flag calculation (used for +// GT,GE,LT,LE,VC,VS). V flag is costly and only fully calculated when needed. +// the core registers may be temp registers, since the condition after calls +// is undefined anyway. + +// flag emulation creates 2 (e.g. cmp #0/beq) up to 9 (e.g. adcf/ble) extra insns. +// flag handling shortcuts may reduce this by 1-4 insns, see emith_cond_check() +static int emith_cmp_rs, emith_cmp_rt; // registers used in cmp_r_r/cmp_r_imm +static s32 emith_cmp_imm; // immediate value used in cmp_r_imm +enum { _FHC=1, _FHV=2 } emith_flg_hint; // C/V flag usage hinted by compiler +static int emith_flg_noV; // V flag known not to be set + +#define EMITH_HINT_COND(cond) do { \ + /* only need to check cond>>1 since the lowest bit inverts the cond */ \ + unsigned _mv = BITMASK3(DCOND_VS>>1,DCOND_GE>>1,DCOND_GT>>1); \ + unsigned _mc = _mv | BITMASK2(DCOND_HS>>1,DCOND_HI>>1); \ + emith_flg_hint = (_mv & BITMASK1(cond >> 1) ? _FHV : 0); \ + emith_flg_hint |= (_mc & BITMASK1(cond >> 1) ?
_FHC : 0); \ +} while (0) + +// store minimal cc information: rd, rt^rs, carry +// NB: the result *must* first go to FNZ, in case rd == rs or rd == rt. +// NB: for adcf and sbcf, carry-in must be dealt with separately (see there) +static void emith_set_arith_flags(int rd, int rs, int rt, s32 imm, int sub) +{ + if (emith_flg_hint & _FHC) { + if (sub) // C = sub:rt Z0) // Nt^Ns in FV, bit 31 + EMIT(R5_XOR_REG(FV, rs, rt)); + else if (rt == Z0 || imm == 0) + emith_flg_noV = 1; // imm #0 can't overflow + else if ((imm < 0) == !sub) + EMIT(R5_XOR_IMM(FV, rs, -1)); + else if ((imm > 0) == !sub) + EMIT(R5_XOR_REG(FV, rs, Z0)); + } + // full V = Nd^Nt^Ns^C calculation is deferred until really needed + + if (rd && rd != FNZ) + EMIT(R5_MOV_REG(rd, FNZ)); // N,Z via result value in FNZ + emith_cmp_rs = emith_cmp_rt = -1; +} + +// since R5 has less-than and compare-branch insns, handle cmp separately by +// storing the involved regs for later use in one of those R5 insns. +// This works for all conditions but VC/VS, but this is fortunately never used. 
+static void emith_set_compare_flags(int rs, int rt, s32 imm) +{ + emith_cmp_rt = rt; + emith_cmp_rs = rs; + emith_cmp_imm = imm; +} + +// data processing, register +#define emith_move_r_r_ptr(d, s) \ + EMIT(R5_MOV_REG(d, s)) +#define emith_move_r_r_ptr_c(cond, d, s) \ + emith_move_r_r_ptr(d, s) + +#define emith_move_r_r(d, s) \ + emith_move_r_r_ptr(d, s) +#define emith_move_r_r_c(cond, d, s) \ + emith_move_r_r(d, s) + +#define emith_mvn_r_r(d, s) \ + EMIT(R5_MVN_REG(d, s)) + +#define emith_add_r_r_r_lsl_ptr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSL_IMM(AT, s2, simm)); \ + EMIT(R5_ADD_REG(d, s1, AT)); \ + } else EMIT(R5_ADD_REG(d, s1, s2)); \ +} while (0) +#define emith_add_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSLW_IMM(AT, s2, simm)); \ + EMIT(R5_ADDW_REG(d, s1, AT)); \ + } else EMIT(R5_ADDW_REG(d, s1, s2)); \ +} while (0) + +#define emith_add_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSRW_IMM(AT, s2, simm)); \ + EMIT(R5_ADDW_REG(d, s1, AT)); \ + } else EMIT(R5_ADDW_REG(d, s1, s2)); \ +} while (0) + +#define emith_addf_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSLW_IMM(AT, s2, simm)); \ + EMIT(R5_ADDW_REG(FNZ, s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 0); \ + } else { \ + EMIT(R5_ADDW_REG(FNZ, s1, s2)); \ + emith_set_arith_flags(d, s1, s2, 0, 0); \ + } \ +} while (0) + +#define emith_addf_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSRW_IMM(AT, s2, simm)); \ + EMIT(R5_ADDW_REG(FNZ, s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 0); \ + } else { \ + EMIT(R5_ADDW_REG(FNZ, s1, s2)); \ + emith_set_arith_flags(d, s1, s2, 0, 0); \ + } \ +} while (0) + +#define emith_sub_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSLW_IMM(AT, s2, simm)); \ + EMIT(R5_SUBW_REG(d, s1, AT)); \ + } else EMIT(R5_SUBW_REG(d, s1, s2)); \ +} while (0) + +#define emith_subf_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSLW_IMM(AT, s2, simm)); \ + EMIT(R5_SUBW_REG(FNZ, 
s1, AT)); \ + emith_set_arith_flags(d, s1, AT, 0, 1); \ + } else { \ + EMIT(R5_SUBW_REG(FNZ, s1, s2)); \ + emith_set_arith_flags(d, s1, s2, 0, 1); \ + } \ +} while (0) + +#define emith_or_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSLW_IMM(AT, s2, simm)); \ + EMIT(R5_OR_REG(d, s1, AT)); \ + } else EMIT(R5_OR_REG(d, s1, s2)); \ +} while (0) + +#define emith_or_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSRW_IMM(AT, s2, simm)); \ + EMIT(R5_OR_REG(d, s1, AT)); \ + } else EMIT(R5_OR_REG(d, s1, s2)); \ +} while (0) + +#define emith_eor_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSLW_IMM(AT, s2, simm)); \ + EMIT(R5_XOR_REG(d, s1, AT)); \ + } else EMIT(R5_XOR_REG(d, s1, s2)); \ +} while (0) + +#define emith_eor_r_r_r_lsr(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSRW_IMM(AT, s2, simm)); \ + EMIT(R5_XOR_REG(d, s1, AT)); \ + } else EMIT(R5_XOR_REG(d, s1, s2)); \ +} while (0) + +#define emith_and_r_r_r_lsl(d, s1, s2, simm) do { \ + if (simm) { \ + EMIT(R5_LSLW_IMM(AT, s2, simm)); \ + EMIT(R5_AND_REG(d, s1, AT)); \ + } else EMIT(R5_AND_REG(d, s1, s2)); \ +} while (0) + +#define emith_or_r_r_lsl(d, s, lslimm) \ + emith_or_r_r_r_lsl(d, d, s, lslimm) +#define emith_or_r_r_lsr(d, s, lsrimm) \ + emith_or_r_r_r_lsr(d, d, s, lsrimm) + +#define emith_eor_r_r_lsl(d, s, lslimm) \ + emith_eor_r_r_r_lsl(d, d, s, lslimm) +#define emith_eor_r_r_lsr(d, s, lsrimm) \ + emith_eor_r_r_r_lsr(d, d, s, lsrimm) + +#define emith_add_r_r_r(d, s1, s2) \ + emith_add_r_r_r_lsl(d, s1, s2, 0) + +#define emith_addf_r_r_r_ptr(d, s1, s2) \ + emith_addf_r_r_r_lsl(d, s1, s2, 0) +#define emith_addf_r_r_r(d, s1, s2) \ + emith_addf_r_r_r_ptr(d, s1, s2) + +#define emith_sub_r_r_r(d, s1, s2) \ + emith_sub_r_r_r_lsl(d, s1, s2, 0) + +#define emith_subf_r_r_r(d, s1, s2) \ + emith_subf_r_r_r_lsl(d, s1, s2, 0) + +#define emith_or_r_r_r(d, s1, s2) \ + emith_or_r_r_r_lsl(d, s1, s2, 0) + +#define emith_eor_r_r_r(d, s1, s2) \ + emith_eor_r_r_r_lsl(d, s1, s2, 0) + 
+#define emith_and_r_r_r(d, s1, s2) \ + emith_and_r_r_r_lsl(d, s1, s2, 0) + +#define emith_add_r_r_ptr(d, s) \ + emith_add_r_r_r_lsl_ptr(d, d, s, 0) +#define emith_add_r_r(d, s) \ + emith_add_r_r_r(d, d, s) + +#define emith_sub_r_r(d, s) \ + emith_sub_r_r_r(d, d, s) + +#define emith_neg_r_r(d, s) \ + EMIT(R5_NEGW_REG(d, s)) + +#define emith_adc_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(AT, s2, FC); \ + emith_add_r_r_r(d, s1, AT); \ +} while (0) + +#define emith_sbc_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(AT, s2, FC); \ + emith_sub_r_r_r(d, s1, AT); \ +} while (0) + +#define emith_adc_r_r(d, s) \ + emith_adc_r_r_r(d, d, s) + +#define emith_negc_r_r(d, s) \ + emith_sbc_r_r_r(d, Z0, s) + +// NB: the incoming carry Cin can cause Cout if s2+Cin=0 (or s1+Cin=0 FWIW) +// moreover, if s2+Cin=0 caused Cout, s1+s2+Cin=s1+0 can't cause another Cout +#define emith_adcf_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(FNZ, s2, FC); \ + EMIT(R5_SLTU_REG(AT, FNZ, FC)); \ + emith_add_r_r_r(FNZ, s1, FNZ); \ + emith_set_arith_flags(d, s1, s2, 0, 0); \ + emith_or_r_r(FC, AT); \ +} while (0) + +#define emith_sbcf_r_r_r(d, s1, s2) do { \ + emith_add_r_r_r(FNZ, s2, FC); \ + EMIT(R5_SLTU_REG(AT, FNZ, FC)); \ + emith_sub_r_r_r(FNZ, s1, FNZ); \ + emith_set_arith_flags(d, s1, s2, 0, 1); \ + emith_or_r_r(FC, AT); \ +} while (0) + +#define emith_and_r_r(d, s) \ + emith_and_r_r_r(d, d, s) +#define emith_and_r_r_c(cond, d, s) \ + emith_and_r_r(d, s) + +#define emith_or_r_r(d, s) \ + emith_or_r_r_r(d, d, s) + +#define emith_eor_r_r(d, s) \ + emith_eor_r_r_r(d, d, s) + +#define emith_tst_r_r_ptr(d, s) do { \ + if (d != s) { \ + emith_and_r_r_r(FNZ, d, s); \ + emith_cmp_rs = emith_cmp_rt = -1; \ + } else emith_cmp_rs = s, emith_cmp_rt = Z0; \ +} while (0) +#define emith_tst_r_r(d, s) \ + emith_tst_r_r_ptr(d, s) + +#define emith_teq_r_r(d, s) do { \ + emith_eor_r_r_r(FNZ, d, s); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +#define emith_cmp_r_r(d, s) \ + emith_set_compare_flags(d, s, 0) +// 
emith_subf_r_r_r(FNZ, d, s) + +#define emith_addf_r_r(d, s) \ + emith_addf_r_r_r(d, d, s) + +#define emith_subf_r_r(d, s) \ + emith_subf_r_r_r(d, d, s) + +#define emith_adcf_r_r(d, s) \ + emith_adcf_r_r_r(d, d, s) + +#define emith_sbcf_r_r(d, s) \ + emith_sbcf_r_r_r(d, d, s) + +#define emith_negcf_r_r(d, s) \ + emith_sbcf_r_r_r(d, Z0, s) + + +// move immediate +static void emith_move_imm(int r, uintptr_t imm) +{ + u32 lui = imm + _CB(imm,1,11,12); + if (lui >> 12) { + // take out the effect of the sign extension of ADDI + EMIT(R5_MOVT_IMM(r, lui)); + if (imm & 0xfff) + EMIT(R5_ADD_IMM(r, r, imm)); + } else + EMIT(R5_ADD_IMM(r, Z0, imm)); +} + +#define emith_move_r_ptr_imm(r, imm) \ + emith_move_imm(r, (uintptr_t)(imm)) + +#define emith_move_r_imm(r, imm) \ + emith_move_imm(r, (u32)(imm)) +#define emith_move_r_imm_c(cond, r, imm) \ + emith_move_r_imm(r, imm) + +#define emith_move_r_imm_s8_patchable(r, imm) \ + EMIT(R5_ADD_IMM(r, Z0, (s8)(imm))) +#define emith_move_r_imm_s8_patch(ptr, imm) do { \ + u32 *ptr_ = (u32 *)ptr; \ + while ((*ptr_ & 0xff07f) != R5_ADD_IMM(Z0, Z0, 0)) ptr_++; \ + EMIT_PTR(ptr_, (*ptr_ & 0x000fffff) | ((u16)(s8)(imm)<<20)); \ +} while (0) + +// arithmetic/logical, immediate - R5 always takes a signed 12 bit immediate + +static void emith_op_imm(int f1, int rd, int rs, u32 imm) +{ + int op32 = (f1 == F1_ADD ? 
R5_OP32 : 0); + if ((imm + _CB(imm,1,11,12)) >> 12) { + emith_move_r_imm(AT, imm); + EMIT(R5_R_INSN(OP_REG^op32, f1&7,_, rd, rs, AT)); + } else if (imm + (f1 == F1_AND) || rd != rs) + EMIT(R5_I_INSN(OP_IMM^op32, f1&7, rd, rs, imm)); +} + +// arithmetic, immediate - can only be ADDI, since SUBI doesn't exist +// the 2 operand forms forward to the 3 operand forms below with d == s +#define emith_add_r_imm(r, imm) \ + emith_add_r_r_imm(r, r, imm) +#define emith_add_r_imm_c(cond, r, imm) \ + emith_add_r_imm(r, imm) + +#define emith_addf_r_imm(r, imm) \ + emith_addf_r_r_imm(r, r, imm) + +#define emith_sub_r_imm(r, imm) \ + emith_sub_r_r_imm(r, r, imm) +#define emith_sub_r_imm_c(cond, r, imm) \ + emith_sub_r_imm(r, imm) + +#define emith_subf_r_imm(r, imm) \ + emith_subf_r_r_imm(r, r, imm) + +#define emith_adc_r_imm(r, imm) \ + emith_adc_r_r_imm(r, r, imm) + +#define emith_adcf_r_imm(r, imm) \ + emith_adcf_r_r_imm(r, r, imm) + +#define emith_cmp_r_imm(r, imm) \ + emith_set_compare_flags(r, -1, imm) +// emith_subf_r_r_imm(FNZ, r, imm) + +// NB: F1_ADD|F2_ALT != F1_ADD, so emith_op_imm emits the full width add here +#define emith_add_r_r_ptr_imm(d, s, imm) \ + emith_op_imm(F1_ADD|F2_ALT, d, s, imm) + +#define emith_add_r_r_imm(d, s, imm) \ + emith_op_imm(F1_ADD, d, s, imm) + +#define emith_addf_r_r_imm(d, s, imm) do { \ + emith_add_r_r_imm(FNZ, s, imm); \ + emith_set_arith_flags(d, s, -1, imm, 0); \ +} while (0) + +#define emith_adc_r_r_imm(d, s, imm) do { \ + emith_add_r_r_r(AT, s, FC); \ + emith_add_r_r_imm(d, AT, imm); \ +} while (0) + +#define emith_adcf_r_r_imm(d, s, imm) do { \ + if (imm == 0) { \ + emith_add_r_r_r(FNZ, s, FC); \ + emith_set_arith_flags(d, s, -1, 1, 0); \ + } else { \ + emith_add_r_r_r(FNZ, s, FC); \ + EMIT(R5_SLTU_REG(AT, FNZ, FC)); \ + emith_add_r_r_imm(FNZ, FNZ, imm); \ + emith_set_arith_flags(d, s, -1, imm, 0); \ + emith_or_r_r(FC, AT); \ + } \ +} while (0) + +// NB: no SUBI in R5, since ADDI takes a signed imm +#define emith_sub_r_r_imm(d, s, imm) \ + emith_add_r_r_imm(d, s, -(imm)) +#define emith_sub_r_r_imm_c(cond, d, s, imm) \ + emith_sub_r_r_imm(d, s, imm) + +#define emith_subf_r_r_imm(d, s,
imm) do { \ + emith_sub_r_r_imm(FNZ, s, imm); \ + emith_set_arith_flags(d, s, -1, imm, 1); \ +} while (0) + +// logical, immediate +#define emith_and_r_imm(r, imm) \ + emith_op_imm(F1_AND, r, r, imm) + +#define emith_or_r_imm(r, imm) \ + emith_op_imm(F1_OR, r, r, imm) +#define emith_or_r_imm_c(cond, r, imm) \ + emith_or_r_imm(r, imm) + +#define emith_eor_r_imm_ptr(r, imm) \ + emith_op_imm(F1_XOR, r, r, imm) +#define emith_eor_r_imm_ptr_c(cond, r, imm) \ + emith_eor_r_imm_ptr(r, imm) + +#define emith_eor_r_imm(r, imm) \ + emith_eor_r_imm_ptr(r, imm) +#define emith_eor_r_imm_c(cond, r, imm) \ + emith_eor_r_imm(r, imm) + +/* NB: BIC #imm not available in R5; use AND #~imm instead */ +#define emith_bic_r_imm(r, imm) \ + emith_op_imm(F1_AND, r, r, ~(imm)) +#define emith_bic_r_imm_c(cond, r, imm) \ + emith_bic_r_imm(r, imm) + +#define emith_tst_r_imm(r, imm) do { \ + emith_op_imm(F1_AND, FNZ, r, imm); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) +#define emith_tst_r_imm_c(cond, r, imm) \ + emith_tst_r_imm(r, imm) + +#define emith_and_r_r_imm(d, s, imm) \ + emith_op_imm(F1_AND, d, s, imm) + +#define emith_or_r_r_imm(d, s, imm) \ + emith_op_imm(F1_OR, d, s, imm) + +#define emith_eor_r_r_imm(d, s, imm) \ + emith_op_imm(F1_XOR, d, s, imm) + +// shift +#define emith_lsl(d, s, cnt) \ + EMIT(R5_LSLW_IMM(d, s, cnt)) + +#define emith_lsr(d, s, cnt) \ + EMIT(R5_LSRW_IMM(d, s, cnt)) + +#define emith_asr(d, s, cnt) \ + EMIT(R5_ASRW_IMM(d, s, cnt)) + +#define emith_ror(d, s, cnt) do { \ + EMIT(R5_LSLW_IMM(AT, s, 32-(cnt))); \ + EMIT(R5_LSRW_IMM(d, s, cnt)); \ + EMIT(R5_OR_REG(d, d, AT)); \ +} while (0) +#define emith_ror_c(cond, d, s, cnt) \ + emith_ror(d, s, cnt) + +#define emith_rol(d, s, cnt) do { \ + EMIT(R5_LSRW_IMM(AT, s, 32-(cnt))); \ + EMIT(R5_LSLW_IMM(d, s, cnt)); \ + EMIT(R5_OR_REG(d, d, AT)); \ +} while (0) + +#define emith_rorc(d) do { \ + emith_lsr(d, d, 1); \ + emith_lsl(AT, FC, 31); \ + emith_or_r_r(d, AT); \ +} while (0) + +#define emith_rolc(d) do { \ + 
emith_lsl(d, d, 1); \ + emith_or_r_r(d, FC); \ +} while (0) + +// NB: all flag setting shifts make V undefined +#define emith_lslf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_lsl(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_lsr(FC, _s, 31); \ + emith_lsl(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +#define emith_lsrf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_lsr(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_and_r_r_imm(FC, _s, 1); \ + emith_lsr(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +#define emith_asrf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_asr(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_and_r_r_imm(FC, _s, 1); \ + emith_asr(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +#define emith_rolf(d, s, cnt) do { \ + emith_rol(d, s, cnt); \ + emith_and_r_r_imm(FC, d, 1); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +#define emith_rorf(d, s, cnt) do { \ + emith_ror(d, s, cnt); \ + emith_lsr(FC, d, 31); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +#define emith_rolcf(d) do { \ + emith_lsr(AT, d, 31); \ + emith_lsl(d, d, 1); \ + emith_or_r_r(d, FC); \ + emith_move_r_r(FC, AT); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +#define emith_rorcf(d) do { \ + emith_and_r_r_imm(AT, d, 1); \ + emith_lsr(d, d, 1); \ + emith_lsl(FC, FC, 31); \ + emith_or_r_r(d, FC); \ + emith_move_r_r(FC, AT); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_rs = emith_cmp_rt = -1; \ +} while (0) + +// signed/unsigned extend + +#define emith_clear_msb(d, s, count) /* bits to clear */ do { \ + u32 t; \ + if ((count) >= 21) { \ + t = (count) - 21; \ + t = 0x7ff >> t; \ + emith_and_r_r_imm(d, s, t); \ + } else { \ + 
emith_lsl(d, s, count); \ + emith_lsr(d, d, count); \ + } \ +} while (0) +#define emith_clear_msb_c(cond, d, s, count) \ + emith_clear_msb(d, s, count) + +#define emith_sext(d, s, count) /* bits to keep */ do { \ + emith_lsl(d, s, 32-(count)); \ + emith_asr(d, d, 32-(count)); \ +} while (0) + +// multiply Rd = Rn*Rm (+ Ra) + +#define emith_mul(d, s1, s2) \ + EMIT(R5_MULW(d, s1, s2)) + +#define emith_mul_u64(dlo, dhi, s1, s2) \ + EMIT_R5_MULLU_REG(dlo, dhi, s1, s2) + +#define emith_mul_s64(dlo, dhi, s1, s2) \ + EMIT_R5_MULLS_REG(dlo, dhi, s1, s2) + +#define emith_mula_s64(dlo, dhi, s1, s2) do { \ + int t_ = rcache_get_tmp(); \ + EMIT_R5_MULLS_REG(t_, AT, s1, s2); \ + emith_add_r_r(dhi, AT); \ + emith_add_r_r(dlo, t_); \ + EMIT(R5_SLTU_REG(AT, dlo, t_)); \ + emith_add_r_r(dhi, AT); \ + rcache_free_tmp(t_); \ +} while (0) +#define emith_mula_s64_c(cond, dlo, dhi, s1, s2) \ + emith_mula_s64(dlo, dhi, s1, s2) + +// load/store. offs has 12 bits signed, hence larger offs may use a temp +// larger offs: AT = rs + upper part (LUI/ADD), the low 12 bits go into the insn +static void emith_ld_offs(int sz, int rd, int rs, int o12) +{ + if (o12 >= -0x800 && o12 < 0x800) { + EMIT(R5_I_INSN(OP_LD, sz, rd, rs, o12)); + } else { + EMIT(R5_MOVT_IMM(AT, o12 + _CB(o12,1,11,12))); + EMIT(R5_R_INSN(OP_REG, F1_ADD,_, AT, rs, AT)); + EMIT(R5_I_INSN(OP_LD, sz, rd, AT, o12)); + } +} + +#define emith_read_r_r_offs_ptr(r, rs, offs) \ + emith_ld_offs(F1_P, r, rs, offs) +#define emith_read_r_r_offs_ptr_c(cond, r, rs, offs) \ + emith_read_r_r_offs_ptr(r, rs, offs) + +#define emith_read_r_r_offs(r, rs, offs) \ + emith_ld_offs(F1_W, r, rs, offs) +#define emith_read_r_r_offs_c(cond, r, rs, offs) \ + emith_read_r_r_offs(r, rs, offs) + +// NOTE(review): emith_add_r_r_r is the 32 bit ADDW form on RV64 - confirm that +// rs+rm never needs the upper 32 bits of a host address in the r_r_r accessors +#define emith_read_r_r_r_ptr(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + emith_ld_offs(F1_P, r, AT, 0); \ +} while (0) +#define emith_read_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + emith_ld_offs(F1_W, r, AT, 0); \ +} while (0) +#define emith_read_r_r_r_c(cond, r, rs, rm) \ + emith_read_r_r_r(r, rs, rm) + +#define
emith_read8_r_r_offs(r, rs, offs) \ + emith_ld_offs(F1_BU, r, rs, offs) +#define emith_read8_r_r_offs_c(cond, r, rs, offs) \ + emith_read8_r_r_offs(r, rs, offs) + +#define emith_read8_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + emith_ld_offs(F1_BU, r, AT, 0); \ +} while (0) +#define emith_read8_r_r_r_c(cond, r, rs, rm) \ + emith_read8_r_r_r(r, rs, rm) + +#define emith_read16_r_r_offs(r, rs, offs) \ + emith_ld_offs(F1_HU, r, rs, offs) +#define emith_read16_r_r_offs_c(cond, r, rs, offs) \ + emith_read16_r_r_offs(r, rs, offs) + +#define emith_read16_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + emith_ld_offs(F1_HU, r, AT, 0); \ +} while (0) +#define emith_read16_r_r_r_c(cond, r, rs, rm) \ + emith_read16_r_r_r(r, rs, rm) + +#define emith_read8s_r_r_offs(r, rs, offs) \ + emith_ld_offs(F1_B, r, rs, offs) +#define emith_read8s_r_r_offs_c(cond, r, rs, offs) \ + emith_read8s_r_r_offs(r, rs, offs) + +#define emith_read8s_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + emith_ld_offs(F1_B, r, AT, 0); \ +} while (0) +#define emith_read8s_r_r_r_c(cond, r, rs, rm) \ + emith_read8s_r_r_r(r, rs, rm) + +#define emith_read16s_r_r_offs(r, rs, offs) \ + emith_ld_offs(F1_H, r, rs, offs) +#define emith_read16s_r_r_offs_c(cond, r, rs, offs) \ + emith_read16s_r_r_offs(r, rs, offs) + +#define emith_read16s_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + emith_ld_offs(F1_H, r, AT, 0); \ +} while (0) +#define emith_read16s_r_r_r_c(cond, r, rs, rm) \ + emith_read16s_r_r_r(r, rs, rm) + +// store rt to [rs + o12]; offsets outside the signed 12 bit range -0x800..0x7ff +// are split: AT = rs + upper part (LUI/ADD), the low 12 bits go into the insn +static void emith_st_offs(int sz, int rt, int rs, int o12) +{ + if (o12 >= -0x800 && o12 < 0x800) { + EMIT(R5_S_INSN(OP_ST, sz, rt, rs, o12)); + } else { + EMIT(R5_MOVT_IMM(AT, o12 + _CB(o12,1,11,12))); + EMIT(R5_R_INSN(OP_REG, F1_ADD,_, AT, rs, AT)); + EMIT(R5_S_INSN(OP_ST, sz, rt, AT, o12)); + } +} + +#define emith_write_r_r_offs_ptr(r, rs, offs) \ + emith_st_offs(F1_P, r, rs, offs) +#define emith_write_r_r_offs_ptr_c(cond, r, rs, offs) \ +
emith_write_r_r_offs_ptr(r, rs, offs) + +#define emith_write_r_r_r_ptr(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + emith_st_offs(F1_P, r, AT, 0); \ +} while (0) +#define emith_write_r_r_r_ptr_c(cond, r, rs, rm) \ + emith_write_r_r_r_ptr(r, rs, rm) + +#define emith_write_r_r_offs(r, rs, offs) \ + emith_st_offs(F1_W, r, rs, offs) +#define emith_write_r_r_offs_c(cond, r, rs, offs) \ + emith_write_r_r_offs(r, rs, offs) + +#define emith_write_r_r_r(r, rs, rm) do { \ + emith_add_r_r_r(AT, rs, rm); \ + emith_st_offs(F1_W, r, AT, 0); \ +} while (0) +#define emith_write_r_r_r_c(cond, r, rs, rm) \ + emith_write_r_r_r(r, rs, rm) + +#define emith_ctx_read_ptr(r, offs) \ + emith_read_r_r_offs_ptr(r, CONTEXT_REG, offs) + +#define emith_ctx_read(r, offs) \ + emith_read_r_r_offs(r, CONTEXT_REG, offs) +#define emith_ctx_read_c(cond, r, offs) \ + emith_ctx_read(r, offs) + +#define emith_ctx_write_ptr(r, offs) \ + emith_write_r_r_offs_ptr(r, CONTEXT_REG, offs) + +#define emith_ctx_write(r, offs) \ + emith_write_r_r_offs(r, CONTEXT_REG, offs) + +#define emith_ctx_read_multiple(r, offs, cnt, tmpr) do { \ + int r_ = r, offs_ = offs, cnt_ = cnt; \ + for (; cnt_ > 0; r_++, offs_ += 4, cnt_--) \ + emith_ctx_read(r_, offs_); \ +} while (0) + +#define emith_ctx_write_multiple(r, offs, cnt, tmpr) do { \ + int r_ = r, offs_ = offs, cnt_ = cnt; \ + for (; cnt_ > 0; r_++, offs_ += 4, cnt_--) \ + emith_ctx_write(r_, offs_); \ +} while (0) + +// function call handling +#define emith_save_caller_regs(mask) do { \ + int _c; u32 _m = mask & 0x3fce0; /* x5-x7,x10-x17 */ \ + _c = count_bits(_m)&3; _m |= (1<<((4-_c)&3))-1; /* ABI align */ \ + int _s = count_bits(_m) * 4, _o = _s; \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, -_s); \ + for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + if (_m & (1 << _c)) \ + { _o -= 4; if (_c) emith_write_r_r_offs(_c, SP, _o); } \ +} while (0) + +#define emith_restore_caller_regs(mask) do { \ + int _c; u32 _m = mask & 0x3fce0; \ + _c = 
count_bits(_m)&3; _m |= (1<<((4-_c)&3))-1; /* ABI align */ \ + int _s = count_bits(_m) * 4, _o = 0; \ + for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ + if (_m & (1 << _c)) \ + { if (_c) emith_read_r_r_offs(_c, SP, _o); _o += 4; } \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, _s); \ +} while (0) + +#define host_arg2reg(rd, arg) \ + rd = (arg+10) + +#define emith_pass_arg_r(arg, reg) \ + emith_move_r_r(arg, reg) + +#define emith_pass_arg_imm(arg, imm) \ + emith_move_r_imm(arg, imm) + +// branching +#define emith_invert_branch(cond) /* inverted conditional branch */ \ + ((cond) ^ 0x01) + +// evaluate the emulated condition, returns a register/branch type pair +static int emith_cmpr_check(int rs, int rt, int cond, int *r, int *s) +{ + int b = -1; + + // condition check for comparing 2 registers + switch (cond) { + case DCOND_EQ: *r = rs; *s = rt; b = F1_BEQ; break; + case DCOND_NE: *r = rs; *s = rt; b = F1_BNE; break; + case DCOND_LO: *r = rs, *s = rt, b = F1_BLTU; break; // s < t, u + case DCOND_HS: *r = rs, *s = rt, b = F1_BGEU; break; // s >= t, u + case DCOND_LS: *r = rt, *s = rs, b = F1_BGEU; break; // s <= t, u + case DCOND_HI: *r = rt, *s = rs, b = F1_BLTU; break; // s > t, u + case DCOND_LT: *r = rs, *s = rt, b = F1_BLT; break; // s < t + case DCOND_GE: *r = rs, *s = rt, b = F1_BGE; break; // s >= t + case DCOND_LE: *r = rt, *s = rs, b = F1_BGE; break; // s <= t + case DCOND_GT: *r = rt, *s = rs, b = F1_BLT; break; // s > t + } + + return b; +} + +static int emith_cmpi_check(int rs, s32 imm, int cond, int *r, int *s) +{ + int b = -1; + + // condition check for comparing register with immediate + if (imm == 0) return emith_cmpr_check(rs, Z0, cond, r, s); + + emith_move_r_imm(AT, imm); + switch (cond) { + case DCOND_EQ: *r = AT, *s = rs, b = F1_BEQ; break; + case DCOND_NE: *r = AT, *s = rs, b = F1_BNE; break; + case DCOND_LO: *r = rs, *s = AT, b = F1_BLTU; break; // s < imm, u + case DCOND_HS: *r = rs, *s = AT, b = F1_BGEU; break; // s >= imm, u + case 
DCOND_LS: *r = AT, *s = rs, b = F1_BGEU; break; // s <= imm, u + case DCOND_HI: *r = AT, *s = rs, b = F1_BLTU; break; // s > imm, u + case DCOND_LT: *r = rs, *s = AT, b = F1_BLT; break; // s < imm + case DCOND_GE: *r = rs, *s = AT, b = F1_BGE; break; // s >= imm + case DCOND_LE: *r = AT, *s = rs, b = F1_BGE; break; // s <= imm + case DCOND_GT: *r = AT, *s = rs, b = F1_BLT; break; // s > imm + } + return b; +} + +static int emith_cond_check(int cond, int *r, int *s) +{ + int b = -1; + + *s = Z0; + if (emith_cmp_rs >= 0) { + if (emith_cmp_rt != -1) + b = emith_cmpr_check(emith_cmp_rs,emith_cmp_rt, cond,r,s); + else b = emith_cmpi_check(emith_cmp_rs,emith_cmp_imm,cond,r,s); + } + + // shortcut for V known to be 0 + if (b < 0 && emith_flg_noV) switch (cond) { + case DCOND_VS: *r = Z0; b = F1_BNE; break; // never + case DCOND_VC: *r = Z0; b = F1_BEQ; break; // always + case DCOND_LT: *r = FNZ, b = F1_BLT; break; // N + case DCOND_GE: *r = FNZ, b = F1_BGE; break; // !N + case DCOND_LE: *r = Z0, *s = FNZ, b = F1_BGE; break; // N || Z + case DCOND_GT: *r = Z0, *s = FNZ, b = F1_BLT; break; // !N && !Z + } + + // the full monty if no shortcut + if (b < 0) switch (cond) { + // conditions using NZ + case DCOND_EQ: *r = FNZ; b = F1_BEQ; break; // Z + case DCOND_NE: *r = FNZ; b = F1_BNE; break; // !Z + case DCOND_MI: *r = FNZ; b = F1_BLT; break; // N + case DCOND_PL: *r = FNZ; b = F1_BGE; break; // !N + // conditions using C + case DCOND_LO: *r = FC; b = F1_BNE; break; // C + case DCOND_HS: *r = FC; b = F1_BEQ; break; // !C + // conditions using CZ + case DCOND_LS: // C || Z + case DCOND_HI: // !C && !Z + EMIT(R5_ADD_IMM(AT, FC, -1)); // !C && !Z + EMIT(R5_AND_REG(AT, FNZ, AT)); + *r = AT, b = (cond == DCOND_HI ? F1_BNE : F1_BEQ); + break; + + // conditions using V + case DCOND_VS: // V + case DCOND_VC: // !V + EMIT(R5_XOR_REG(AT, FV, FNZ)); // V = Nt^Ns^Nd^C + EMIT(R5_LSRW_IMM(AT, AT, 31)); + EMIT(R5_XOR_REG(AT, AT, FC)); + *r = AT, b = (cond == DCOND_VS ? 
F1_BNE : F1_BEQ); + break; + // conditions using VNZ + case DCOND_LT: // N^V + case DCOND_GE: // !(N^V) + EMIT(R5_LSRW_IMM(AT, FV, 31)); // Nd^V = Nt^Ns^C + EMIT(R5_XOR_REG(AT, FC, AT)); + *r = AT, b = (cond == DCOND_LT ? F1_BNE : F1_BEQ); + break; + case DCOND_LE: // (N^V) || Z + case DCOND_GT: // !(N^V) && !Z + EMIT(R5_LSRW_IMM(AT, FV, 31)); // Nd^V = Nt^Ns^C + EMIT(R5_XOR_REG(AT, FC, AT)); + EMIT(R5_ADD_IMM(AT, AT, -1)); // !(Nd^V) && !Z + EMIT(R5_AND_REG(AT, FNZ, AT)); + *r = AT, b = (cond == DCOND_GT ? F1_BNE : F1_BEQ); + break; + } + return b; +} + +// NB: R5 unconditional jumps have only +/- 1MB range, hence use reg jumps +#define emith_jump(target) do { \ + uintptr_t target_ = (uintptr_t)(target); \ + EMIT(R5_MOVT_IMM(AT, target_ + _CB(target_,1,11,12))); \ + EMIT(R5_JR(AT, target_)); \ +} while (0) +#define emith_jump_patchable(target) \ + emith_jump(target) + +// NB: R5 conditional branches have only +/- 4KB range +#define emith_jump_cond(cond, target) do { \ + int r_, s_, mcond_ = emith_cond_check(cond, &r_, &s_); \ + u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \ + EMIT(R5_BCOND(mcond_,r_,s_,disp_ & 0x00001fff)); \ +} while (0) +#define emith_jump_cond_patchable(cond, target) \ + emith_jump_cond(cond, target) + +#define emith_jump_cond_inrange(target) \ + ((u8 *)target - (u8 *)tcache_ptr < 0x1000 && \ + (u8 *)target - (u8 *)tcache_ptr >= -0x1000+0x10) // mind cond_check + +// NB: returns position of patch for cache maintenance +#define emith_jump_patch(ptr, target, pos) do { \ + u32 *ptr_ = (u32 *)ptr; /* must skip condition check code */ \ + while ((*ptr_&0x77) != OP_JALR && (*ptr_&0x77) != OP_BCOND) ptr_ ++; \ + if ((*ptr_&0x77) == OP_BCOND) { \ + u32 *p_ = ptr_, disp_ = (u8 *)target - (u8 *)ptr_; \ + u32 f1_ = _CB(*ptr_,3,12,0); \ + u32 r_ = _CB(*ptr_,5,15,0), s_ = _CB(*ptr_,5,20,0); \ + EMIT_PTR(p_, R5_BCOND(f1_, r_, s_, disp_ & 0x00001fff)); \ + } else { \ + u32 *p_ = -- ptr_; \ + uintptr_t target_ = (uintptr_t)(target); \ + EMIT_PTR(p_, 
R5_MOVT_IMM(AT, target_ + _CB(target_,1,11,12))); \ + EMIT_PTR(p_, R5_JR(AT, target_)); \ + } \ + if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)(ptr_); \ +} while (0) + +#define emith_jump_patch_inrange(ptr, target) \ + ((u8 *)target - (u8 *)ptr < 0x1000 && \ + (u8 *)target - (u8 *)ptr >= -0x1000+0x10) // mind cond_check +#define emith_jump_patch_size() 8 + +#define emith_jump_at(ptr, target) do { \ + uintptr_t target_ = (uintptr_t)(target); \ + u32 *ptr_ = (u32 *)ptr; \ + EMIT_PTR(ptr_, R5_MOVT_IMM(AT, target_ + _CB(target_,1,11,12))); \ + EMIT_PTR(ptr_, R5_JR(AT, target_)); \ +} while (0) +#define emith_jump_at_size() 8 + +#define emith_jump_reg(r) \ + EMIT(R5_JR(r, 0)) +#define emith_jump_reg_c(cond, r) \ + emith_jump_reg(r) + +#define emith_jump_ctx(offs) do { \ + emith_ctx_read_ptr(AT, offs); \ + emith_jump_reg(AT); \ +} while (0) +#define emith_jump_ctx_c(cond, offs) \ + emith_jump_ctx(offs) + +#define emith_call(target) do { \ + uintptr_t target_ = (uintptr_t)(target); \ + EMIT(R5_MOVT_IMM(AT, target_ + _CB(target_,1,11,12))); \ + EMIT(R5_JALR(LR, AT, target_)); \ +} while (0) +#define emith_call_cond(cond, target) \ + emith_call(target) + +#define emith_call_reg(r) \ + EMIT(R5_JALR(LR, r, 0)) + +#define emith_call_ctx(offs) do { \ + emith_ctx_read_ptr(AT, offs); \ + emith_call_reg(AT); \ +} while (0) + +#define emith_call_cleanup() /**/ + +#define emith_ret() \ + EMIT(R5_JR(LR, 0)) +#define emith_ret_c(cond) \ + emith_ret() + +#define emith_ret_to_ctx(offs) \ + emith_ctx_write_ptr(LR, offs) + +#define emith_add_r_ret(r) \ + emith_add_r_r_ptr(r, LR) + +#define emith_push_ret(r) do { \ + emith_add_r_r_ptr_imm(SP, SP, -16); /* ABI requires 16 byte aligment */\ + emith_write_r_r_offs(LR, SP, 4); \ + if ((r) > 0) emith_write_r_r_offs(r, SP, 0); \ +} while (0) + +#define emith_pop_and_ret(r) do { \ + if ((r) > 0) emith_read_r_r_offs(r, SP, 0); \ + emith_read_r_r_offs(LR, SP, 4); \ + emith_add_r_r_ptr_imm(SP, SP, 16); \ + emith_ret(); \ +} while (0) + + +// 
emitter ABI stuff +#define emith_pool_check() /**/ +#define emith_pool_commit(j) /**/ +#define emith_insn_ptr() ((u8 *)tcache_ptr) +#define emith_flush() /**/ +#define host_instructions_updated(base, end) __builtin___clear_cache(base, end) +#define emith_update_cache() /**/ +#define emith_rw_offs_max() 0x7ff + +// SH2 drc specific +#define emith_sh2_drc_entry() do { \ + int _c; u32 _m = 0x0ffc0202; /* x1,x9,x18-x27 */ \ + _c = count_bits(_m)&3; _m |= (1<<((4-_c)&3))-1; /* ABI align */ \ + int _s = count_bits(_m) * 4, _o = _s; \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, -_s); \ + for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + if (_m & (1 << _c)) \ + { _o -= 4; if (_c) emith_write_r_r_offs(_c, SP, _o); } \ +} while (0) +#define emith_sh2_drc_exit() do { \ + int _c; u32 _m = 0x0ffc0202; \ + _c = count_bits(_m)&3; _m |= (1<<((4-_c)&3))-1; /* ABI align */ \ + int _s = count_bits(_m) * 4, _o = 0; \ + for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ + if (_m & (1 << _c)) \ + { if (_c) emith_read_r_r_offs(_c, SP, _o); _o += 4; } \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, _s); \ + emith_ret(); \ +} while (0) + +// NB: assumes a is in arg0, tab, func and mask are temp +#define emith_sh2_rcall(a, tab, func, mask) do { \ + emith_lsr(mask, a, SH2_READ_SHIFT); \ + emith_add_r_r_r_lsl_ptr(tab, tab, mask, PTR_SCALE+1); \ + emith_read_r_r_offs_ptr(func, tab, 0); \ + emith_read_r_r_offs(mask, tab, 1 << PTR_SCALE); \ + emith_addf_r_r_r_ptr(func, func, func); \ +} while (0) + +// NB: assumes a, val are in arg0 and arg1, tab and func are temp +#define emith_sh2_wcall(a, val, tab, func) do { \ + emith_lsr(func, a, SH2_WRITE_SHIFT); \ + emith_lsl(func, func, PTR_SCALE); \ + emith_read_r_r_r_ptr(func, tab, func); \ + emith_move_r_r_ptr(12, CONTEXT_REG); /* arg2 */ \ + emith_jump_reg(func); \ +} while (0) + +#define emith_sh2_delay_loop(cycles, reg) do { \ + int sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); \ + int t1 = rcache_get_tmp(); \ + int t2 = 
rcache_get_tmp(); \ + int t3 = rcache_get_tmp(); \ + /* if (sr < 0) return */ \ + emith_cmp_r_imm(sr, 0); \ + EMITH_JMP_START(DCOND_LE); \ + /* turns = sr.cycles / cycles */ \ + emith_asr(t2, sr, 12); \ + emith_move_r_imm(t3, (u32)((1ULL<<32) / (cycles)) + 1); \ + emith_mul_u64(t1, t2, t2, t3); /* multiply by 1/x */ \ + rcache_free_tmp(t3); \ + if (reg >= 0) { \ + /* if (reg <= turns) turns = reg-1 */ \ + t3 = rcache_get_reg(reg, RC_GR_RMW, NULL); \ + emith_cmp_r_r(t3, t2); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_sub_r_r_imm_c(DCOND_LS, t2, t3, 1); \ + EMITH_SJMP_END(DCOND_HI); \ + /* if (reg <= 1) turns = 0 */ \ + emith_cmp_r_imm(t3, 1); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_move_r_imm_c(DCOND_LS, t2, 0); \ + EMITH_SJMP_END(DCOND_HI); \ + /* reg -= turns */ \ + emith_sub_r_r(t3, t2); \ + } \ + /* sr.cycles -= turns * cycles; */ \ + emith_move_r_imm(t1, cycles); \ + emith_mul(t1, t2, t1); \ + emith_sub_r_r_r_lsl(sr, sr, t1, 12); \ + EMITH_JMP_END(DCOND_LE); \ + rcache_free_tmp(t1); \ + rcache_free_tmp(t2); \ +} while (0) + +/* + * if Q + * t = carry(Rn += Rm) + * else + * t = carry(Rn -= Rm) + * T ^= t + */ +#define emith_sh2_div1_step(rn, rm, sr) do { \ + emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ + EMITH_JMP3_START(DCOND_EQ); \ + EMITH_HINT_COND(DCOND_CS); \ + emith_addf_r_r(rn, rm); \ + EMITH_JMP3_MID(DCOND_EQ); \ + EMITH_HINT_COND(DCOND_CS); \ + emith_subf_r_r(rn, rm); \ + EMITH_JMP3_END(); \ + emith_eor_r_r(sr, FC); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macl(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* MACH top 16 bits unused if saturated. sign ext for overfl detect */ \ + emith_sext(mh, mh, 16); \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 17 bits of MACH aren't all 1 or 0 */ \ + /* to check: add MACH >> 31 to MACH >> 15. 
this is 0 if no overflow */ \ + emith_asr(rn, mh, 15); \ + emith_add_r_r_r_lsr(rn, rn, mh, 31); /* sum = (MACH>>31)+(MACH>>15) */ \ + emith_teq_r_r(rn, Z0); /* (need only N and Z flags) */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> ov */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x0000); /* -overflow */ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x8000); \ + EMITH_SJMP_START(DCOND_PL); /* sum > 0 -> +ovl */ \ + emith_sub_r_imm_c(DCOND_MI, ml, 1); /* 0xffffffff */ \ + emith_sub_r_imm_c(DCOND_MI, mh, 1); /* 0x00007fff */ \ + EMITH_SJMP_END(DCOND_PL); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macw(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* XXX: MACH should be untouched when S is set? */ \ + emith_asr(mh, ml, 31); /* sign ext MACL to MACH for ovrfl check */ \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 33 bits of MACH:MACL aren't all 1 or 0 */ \ + /* to check: add MACL[31] to MACH. this is 0 if no overflow */ \ + emith_lsr(rn, ml, 31); \ + emith_add_r_r(rn, mh); /* sum = MACH + ((MACL>>31)&1) */ \ + emith_teq_r_r(rn, Z0); /* (need only N and Z flags) */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> overflow */ \ + /* XXX: LSB signalling only in SH1, or in SH2 too? 
*/ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x00000001); /* LSB of MACH */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x80000000); /* negative ovrfl */ \ + EMITH_SJMP_START(DCOND_PL); /* sum > 0 -> positive ovrfl */ \ + emith_sub_r_imm_c(DCOND_MI, ml, 1); /* 0x7fffffff */ \ + EMITH_SJMP_END(DCOND_PL); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +#define emith_write_sr(sr, srcr) do { \ + emith_lsr(sr, sr , 10); emith_lsl(sr, sr, 10); \ + emith_lsl(AT, srcr, 22); emith_lsr(AT, AT, 22); \ + emith_or_r_r(sr, AT); \ +} while (0) + +#define emith_carry_to_t(sr, is_sub) do { \ + emith_and_r_imm(sr, 0xfffffffe); \ + emith_or_r_r(sr, FC); \ +} while (0) + +#define emith_t_to_carry(sr, is_sub) do { \ + emith_and_r_r_imm(FC, sr, 1); \ +} while (0) + +#define emith_tpop_carry(sr, is_sub) do { \ + emith_and_r_r_imm(FC, sr, 1); \ + emith_eor_r_r(sr, FC); \ +} while (0) + +#define emith_tpush_carry(sr, is_sub) \ + emith_or_r_r(sr, FC) + +#ifdef T +// T bit handling +#define emith_invert_cond(cond) \ + ((cond) ^ 1) + +static void emith_clr_t_cond(int sr) +{ + emith_bic_r_imm(sr, T); +} + +static void emith_set_t_cond(int sr, int cond) +{ + int b, r, s; + u8 *ptr; + u32 val = 0, inv = 0; + + // try to avoid jumping around if possible + if (emith_cmp_rs >= 0) { + if (emith_cmp_rt >= 0) + b = emith_cmpr_check(emith_cmp_rs, emith_cmp_rt, cond, &r, &s); + else + b = emith_cmpi_check(emith_cmp_rs, emith_cmp_imm, cond, &r, &s); + } else { + b = emith_cond_check(cond, &r, &s); + if (r == Z0) { + if (b == F1_BEQ || b == F1_BGE || b == F1_BGEU) + emith_or_r_imm(sr, T); + return; + } else if (r == FC) + val++, inv = (b == F1_BEQ); + } + + if (!val) switch (b) { + case F1_BEQ: if (s == Z0) { EMIT(R5_SLTU_IMM(AT,r ,1)); r=AT; val++; break; } + EMIT(R5_XOR_REG(AT, r, s)); + EMIT(R5_SLTU_IMM(AT,AT, 1)); r=AT; val++; break; + case F1_BNE: if (s == Z0) { EMIT(R5_SLTU_IMM(AT,Z0,r)); r=AT; val++; break; } + EMIT(R5_XOR_REG(AT, r, s)); + EMIT(R5_SLTU_IMM(AT,Z0,AT)); r=AT; 
val++; break; + case F1_BLTU: EMIT(R5_SLTU_REG(AT, r, s)); r=AT; val++; break; + case F1_BGEU: EMIT(R5_SLTU_REG(AT, r, s)); r=AT; val++; inv++; break; + case F1_BLT: EMIT(R5_SLT_REG(AT, r, s)); r=AT; val++; break; + case F1_BGE: EMIT(R5_SLT_REG(AT, r, s)); r=AT; val++; inv++; break; + } + if (val) { + emith_or_r_r(sr, r); + if (inv) + emith_eor_r_imm(sr, T); + return; + } + + // can't obtain result directly, use presumably slower jump !cond + or sr,T + b = emith_invert_branch(b); + ptr = tcache_ptr; + EMIT(R5_BCOND(b, r, s, 0)); + emith_or_r_imm(sr, T); + val = (u8 *)tcache_ptr - (u8 *)(ptr); + EMIT_PTR(ptr, R5_BCOND(b, r, s, val & 0x00001fff)); +} + +#define emith_get_t_cond() -1 + +#define emith_sync_t(sr) ((void)sr) + +#define emith_invalidate_t() + +static void emith_set_t(int sr, int val) +{ + if (val) + emith_or_r_imm(sr, T); + else + emith_bic_r_imm(sr, T); +} + +static int emith_tst_t(int sr, int tf) +{ + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; +} +#endif From 81a39828a1d1eaa425b1c98d7eab66ace3f8abe9 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 19 Nov 2019 21:56:50 +0100 Subject: [PATCH 080/174] sh2 drc, improved memory management --- cpu/sh2/compiler.c | 372 +++++++++++++++++++++++++++------------------ 1 file changed, 221 insertions(+), 151 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index d1cde69ef..58ddd86f8 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -262,26 +262,20 @@ static void REGPARM(3) *sh2_drc_log_entry(void *block, SH2 *sh2, u32 sr) } #endif -#define TCACHE_BUFFERS 3 // we have 3 translation cache buffers, split from one drc/cmn buffer. // BIOS shares tcache with data array because it's only used for init // and can be discarded early -// XXX: need to tune sizes -static const int tcache_sizes[TCACHE_BUFFERS] = { - DRC_TCACHE_SIZE * 30 / 32, // ROM (rarely used), DRAM - DRC_TCACHE_SIZE / 32, // BIOS, data array in master sh2 - DRC_TCACHE_SIZE / 32, // ... 
slave -}; - -static u8 *tcache_bases[TCACHE_BUFFERS]; -static u8 *tcache_ptrs[TCACHE_BUFFERS]; -static u8 *tcache_limit[TCACHE_BUFFERS]; +#define TCACHE_BUFFERS 3 -// ptr for code emiters -static u8 *tcache_ptr; -#define MAX_BLOCK_ENTRIES (BLOCK_INSN_LIMIT / 6) +struct ring_buffer { + u8 *base; // ring buffer memory + unsigned item_sz; // size of one buffer item + unsigned size; // number of itmes in ring + int first, next; // read and write pointers + int used; // number of used items in ring +}; enum { BL_JMP=1, BL_LDJMP, BL_JCCBLX }; struct block_link { @@ -326,13 +320,35 @@ struct block_desc { int refcount; #endif int entry_count; - struct block_entry entryp[MAX_BLOCK_ENTRIES]; + struct block_entry *entryp; +}; + +struct block_list { + struct block_desc *block; // block reference + struct block_list *next; // pointers for doubly linked list + struct block_list *prev; + struct block_list **head; // list head (for removing from list) + struct block_list *l_next; +}; + +static u8 *tcache_ptr; // ptr for code emitters + +// XXX: need to tune sizes + +static struct ring_buffer tcache_ring[TCACHE_BUFFERS]; +static const int tcache_sizes[TCACHE_BUFFERS] = { + DRC_TCACHE_SIZE * 30 / 32, // ROM (rarely used), DRAM + DRC_TCACHE_SIZE / 32, // BIOS, data array in master sh2 + DRC_TCACHE_SIZE / 32, // ... slave }; #define BLOCK_MAX_COUNT(tcid) ((tcid) ? 256 : 32*256) +static struct ring_buffer block_ring[TCACHE_BUFFERS]; static struct block_desc *block_tables[TCACHE_BUFFERS]; -static int block_counts[TCACHE_BUFFERS]; -static int block_limit[TCACHE_BUFFERS]; + +#define ENTRY_MAX_COUNT(tcid) ((tcid) ? 8*512 : 256*512) +static struct ring_buffer entry_ring[TCACHE_BUFFERS]; +static struct block_entry *entry_tables[TCACHE_BUFFERS]; // we have block_link_pool to avoid using mallocs #define BLOCK_LINK_MAX_COUNT(tcid) ((tcid) ? 512 : 32*512) @@ -345,15 +361,6 @@ static struct block_link *blink_free[TCACHE_BUFFERS]; #define RAM_SIZE(tcid) ((tcid) ? 
0x1000 : 0x40000) #define INVAL_PAGE_SIZE 0x100 -struct block_list { - struct block_desc *block; - struct block_list *next; - struct block_list *prev; - struct block_list **head; - struct block_list *l_next; -}; -struct block_list *blist_free; - static struct block_list *inactive_blocks[TCACHE_BUFFERS]; // array of pointers to block_lists for RAM and 2 data arrays @@ -366,6 +373,11 @@ static struct block_entry **hash_tables[TCACHE_BUFFERS]; #define HASH_FUNC(hash_tab, addr, mask) \ (hash_tab)[((addr) >> 1) & (mask)] +#define BLOCK_LIST_MAX_COUNT (64*1024) +static struct block_list *block_list_pool; +static int block_list_pool_count; +static struct block_list *blist_free; + #if (DRC_DEBUG & 128) #if BRANCH_CACHE int bchit, bcmiss; @@ -429,7 +441,7 @@ static void rcache_free_tmp(int hr); // there must be at least the free (not context or statically mapped) amount of // PRESERVED/TEMPORARY registers used by handlers in worst case (currently 4). // there must be at least 3 PARAM, and PARAM+TEMPORARY must be at least 4. -// SR and R0 should by all means be statically mapped. +// SR must and R0 should by all means be statically mapped. // XXX the static definition of SR MUST match that in compiler.h // PC and PR must not be statically mapped (accessed in context by utils). 
@@ -544,6 +556,72 @@ static struct block_entry *dr_get_entry(u32 pc, int is_slave, int *tcache_id) // --------------------------------------------------------------- +// ring buffer management +#define RING_INIT(r,m,n) *(r) = (struct ring_buffer) { .base = (u8 *)m, \ + .item_sz = sizeof(*(m)), .size = n }; + +static void *ring_alloc(struct ring_buffer *rb, int count) +{ + // allocate space in ring buffer + void *p; + + p = rb->base + rb->next * rb->item_sz; + if (rb->next+count > rb->size) { + rb->used += rb->size - rb->next; + p = rb->base; // wrap if overflow at end + rb->next = count; + } else { + rb->next += count; + if (rb->next == rb->size) rb->next = 0; + } + + rb->used += count; + return p; +} + +static void ring_wrap(struct ring_buffer *rb) +{ + // insufficient space at end of buffer memory, wrap around + rb->used += rb->size - rb->next; + rb->next = 0; +} + +static void ring_free(struct ring_buffer *rb, int count) +{ + // free oldest space in ring buffer + rb->first += count; + if (rb->first >= rb->size) rb->first -= rb->size; + + rb->used -= count; +} + +static void ring_free_p(struct ring_buffer *rb, void *p) +{ + // free ring buffer space upto given pointer + rb->first = ((u8 *)p - rb->base) / rb->item_sz; + + rb->used = rb->next - rb->first; + if (rb->used < 0) rb->used += rb->size; +} + +static void *ring_reset(struct ring_buffer *rb) +{ + // reset to initial state + rb->first = rb->next = rb->used = 0; + return rb->base + rb->next * rb->item_sz; +} + +static void *ring_first(struct ring_buffer *rb) +{ + return rb->base + rb->first * rb->item_sz; +} + +static void *ring_next(struct ring_buffer *rb) +{ + return rb->base + rb->next * rb->item_sz; +} + + // block management static void add_to_block_list(struct block_list **blist, struct block_desc *block) { @@ -552,13 +630,14 @@ static void add_to_block_list(struct block_list **blist, struct block_desc *bloc if (blist_free) { added = blist_free; blist_free = added->next; + } else if 
(block_list_pool_count >= BLOCK_LIST_MAX_COUNT) { + printf( "block list overflow\n"); + exit(1); } else { - added = malloc(sizeof(*added)); - } - if (!added) { - elprintf(EL_ANOMALY, "drc OOM (1)"); - return; + added = block_list_pool + block_list_pool_count; + block_list_pool_count ++; } + added->block = block; added->l_next = block->list; block->list = added; @@ -954,6 +1033,7 @@ static void dr_rm_block_entry(struct block_desc *bd, int tcache_id, u32 nolit, i rm_from_block_lists(bd); bd->addr = bd->size = bd->addr_lit = bd->size_lit = 0; bd->entry_count = 0; + bd->entryp = NULL; } emith_update_cache(); } @@ -976,26 +1056,28 @@ static struct block_desc *dr_find_inactive_block(int tcache_id, u16 crc, return NULL; } -static struct block_desc *dr_add_block(u32 addr, int size, +static struct block_desc *dr_add_block(int entries, u32 addr, int size, u32 addr_lit, int size_lit, u16 crc, int is_slave, int *blk_id) { struct block_entry *be; struct block_desc *bd; int tcache_id; - int *bcount; // do a lookup to get tcache_id and override check be = dr_get_entry(addr, is_slave, &tcache_id); if (be != NULL) dbg(1, "block override for %08x", addr); - bcount = &block_counts[tcache_id]; - if (*bcount == block_limit[tcache_id]) { + if (block_ring[tcache_id].used + 1 > block_ring[tcache_id].size || + entry_ring[tcache_id].used + entries > entry_ring[tcache_id].size) { dbg(1, "bd overflow for tcache %d", tcache_id); return NULL; } - bd = &block_tables[tcache_id][*bcount]; + *blk_id = block_ring[tcache_id].next; + bd = ring_alloc(&block_ring[tcache_id], 1); + bd->entryp = ring_alloc(&entry_ring[tcache_id], entries); + bd->addr = addr; bd->size = size; bd->addr_lit = addr_lit; @@ -1009,11 +1091,6 @@ static struct block_desc *dr_add_block(u32 addr, int size, bd->refcount = 0; #endif - *blk_id = *bcount; - (*bcount)++; - if (*bcount >= BLOCK_MAX_COUNT(tcache_id)) - *bcount = 0; - return bd; } @@ -1094,45 +1171,54 @@ static void REGPARM(3) *dr_lookup_block(u32 pc, SH2 *sh2, int 
*tcache_id) static void dr_free_oldest_block(int tcache_id) { - struct block_desc *bd; + struct block_desc *bf; - if (block_limit[tcache_id] >= BLOCK_MAX_COUNT(tcache_id)) { - // block desc wrap around - block_limit[tcache_id] = 0; - } - bd = &block_tables[tcache_id][block_limit[tcache_id]]; + bf = ring_first(&block_ring[tcache_id]); + if (bf->addr && bf->entry_count) + dr_rm_block_entry(bf, tcache_id, 0, 1); + ring_free(&block_ring[tcache_id], 1); - if (bd->tcache_ptr && bd->tcache_ptr < tcache_ptrs[tcache_id]) { - // cache wrap around - tcache_ptrs[tcache_id] = bd->tcache_ptr; + if (block_ring[tcache_id].used) { + bf = ring_first(&block_ring[tcache_id]); + ring_free_p(&entry_ring[tcache_id], bf->entryp); + ring_free_p(&tcache_ring[tcache_id], bf->tcache_ptr); + } else { + // reset since size of code block isn't known if no successor block exists + ring_reset(&block_ring[tcache_id]); + ring_reset(&entry_ring[tcache_id]); + ring_reset(&tcache_ring[tcache_id]); } - - if (bd->addr && bd->entry_count) - dr_rm_block_entry(bd, tcache_id, 0, 1); - - block_limit[tcache_id]++; - if (block_limit[tcache_id] >= BLOCK_MAX_COUNT(tcache_id)) - block_limit[tcache_id] = 0; - bd = &block_tables[tcache_id][block_limit[tcache_id]]; - if (bd->tcache_ptr >= tcache_ptrs[tcache_id]) - tcache_limit[tcache_id] = bd->tcache_ptr; - else - tcache_limit[tcache_id] = tcache_bases[tcache_id] + tcache_sizes[tcache_id]; } -static u8 *dr_prepare_cache(int tcache_id, int insn_count) +static inline void dr_reserve_cache(int tcache_id, struct ring_buffer *rb, int count) { - u8 *limit = tcache_limit[tcache_id]; - - // if no block desc available - if (block_counts[tcache_id] == block_limit[tcache_id]) + // while not enough space available + if (rb->next + count >= rb->size){ + // not enough space in rest of buffer -> wrap around + while (rb->first >= rb->next && rb->used) + dr_free_oldest_block(tcache_id); + if (rb->first == 0 && rb->used) + dr_free_oldest_block(tcache_id); + ring_wrap(rb); + } + while 
(rb->first >= rb->next && rb->next + count > rb->first && rb->used) dr_free_oldest_block(tcache_id); +} - // while not enough cache space left (limit - tcache_ptr < max space needed) - while (tcache_limit[tcache_id] - tcache_ptrs[tcache_id] < insn_count * 128) +static u8 *dr_prepare_cache(int tcache_id, int insn_count, int entry_count) +{ + int bf = block_ring[tcache_id].first; + + // reserve one block desc + if (block_ring[tcache_id].used >= block_ring[tcache_id].size) dr_free_oldest_block(tcache_id); + // reserve block entries + dr_reserve_cache(tcache_id, &entry_ring[tcache_id], entry_count); + // reserve cache space + dr_reserve_cache(tcache_id, &tcache_ring[tcache_id], insn_count*128); - if (limit != tcache_limit[tcache_id]) { + if (bf != block_ring[tcache_id].first) { + // deleted some block(s), clear branch cache and return stack #if BRANCH_CACHE if (tcache_id) memset32(sh2s[tcache_id-1].branch_cache, -1, sizeof(sh2s[0].branch_cache)/4); @@ -1152,29 +1238,27 @@ static u8 *dr_prepare_cache(int tcache_id, int insn_count) } #endif } - return (u8 *)tcache_ptrs[tcache_id]; + + return ring_next(&tcache_ring[tcache_id]); } static void dr_flush_tcache(int tcid) { int i; #if (DRC_DEBUG & 1) - int tc_used, bl_used; - - tc_used = tcache_sizes[tcid] - (tcache_limit[tcid] - tcache_ptrs[tcid]); - bl_used = BLOCK_MAX_COUNT(tcid) - (block_limit[tcid] - block_counts[tcid]); - elprintf(EL_STATUS, "tcache #%d flush! (%d/%d, bds %d/%d)", tcid, tc_used, - tcache_sizes[tcid], bl_used, BLOCK_MAX_COUNT(tcid)); + elprintf(EL_STATUS, "tcache #%d flush! 
(%d/%d, bds %d/%d bes %d/%d)", tcid, + tcache_ring[tcid].used, tcache_ring[tcid].size, block_ring[tcid].used, + block_ring[tcid].size, entry_ring[tcid].used, entry_ring[tcid].size); #endif - block_counts[tcid] = 0; - block_limit[tcid] = BLOCK_MAX_COUNT(tcid) - 1; + ring_reset(&tcache_ring[tcid]); + ring_reset(&block_ring[tcid]); + ring_reset(&entry_ring[tcid]); + block_link_pool_counts[tcid] = 0; blink_free[tcid] = NULL; memset(unresolved_links[tcid], 0, sizeof(*unresolved_links[0]) * HASH_TABLE_SIZE(tcid)); memset(hash_tables[tcid], 0, sizeof(*hash_tables[0]) * HASH_TABLE_SIZE(tcid)); - tcache_ptrs[tcid] = tcache_bases[tcid]; - tcache_limit[tcid] = tcache_bases[tcid] + tcache_sizes[tcid]; if (Pico32xMem->sdram != NULL) { if (tcid == 0) { // ROM, RAM memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram)); @@ -1195,7 +1279,7 @@ static void dr_flush_tcache(int tcid) } } #if (DRC_DEBUG & 4) - tcache_dsm_ptrs[tcid] = tcache_bases[tcid]; + tcache_dsm_ptrs[tcid] = tcache_ring[tcid].base; #endif for (i = 0; i < RAM_SIZE(tcid) / INVAL_PAGE_SIZE; i++) @@ -3095,13 +3179,13 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #endif } - tcache_ptr = dr_prepare_cache(tcache_id, (end_pc - base_pc) / 2); + tcache_ptr = dr_prepare_cache(tcache_id, (end_pc - base_pc) / 2, branch_target_count); #if (DRC_DEBUG & 4) tcache_dsm_ptrs[tcache_id] = tcache_ptr; #endif - block = dr_add_block(base_pc, end_pc - base_pc, base_literals, - end_literals - base_literals, crc, sh2->is_slave, &blkid_main); + block = dr_add_block(branch_target_count, base_pc, end_pc - base_pc, + base_literals, end_literals-base_literals, crc, sh2->is_slave, &blkid_main); if (block == NULL) return NULL; @@ -3143,7 +3227,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // make block entry v = block->entry_count; entry = &block->entryp[v]; - if (v < ARRAY_SIZE(block->entryp)) + if (v < branch_target_count) { entry = &block->entryp[v]; entry->pc = pc; @@ -4726,7 +4810,7 @@ 
static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) for (bl = block->entryp[i].o_links; bl; bl = bl->o_next) memcpy(bl->jdisp, bl->blx ?: bl->jump, emith_jump_at_size()); - tcache_ptrs[tcache_id] = tcache_ptr; + ring_alloc(&tcache_ring[tcache_id], tcache_ptr - block_entry_ptr); host_instructions_updated(block_entry_ptr, tcache_ptr); dr_activate_block(block, tcache_id, sh2->is_slave); @@ -4736,10 +4820,10 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) dbg(2, " block #%d,%d -> %p tcache %d/%d, insns %d -> %d %.3f", tcache_id, blkid_main, tcache_ptr, - tcache_ptr - tcache_bases[tcache_id], tcache_sizes[tcache_id], + tcache_ring[tcache_id].used, tcache_ring[tcache_id].size, insns_compiled, host_insn_count, (float)host_insn_count / insns_compiled); if ((sh2->pc & 0xc6000000) == 0x02000000) { // ROM - dbg(2, " hash collisions %d/%d", hash_collisions, block_counts[tcache_id]); + dbg(2, " hash collisions %d/%d", hash_collisions, block_ring[tcache_id].used); Pico32x.emu_flags |= P32XF_DRC_ROM_C; } /* @@ -5220,10 +5304,7 @@ static void block_stats(void) printf("block stats:\n"); for (b = 0; b < ARRAY_SIZE(block_tables); b++) { - for (i = 0; i < block_counts[b]; i++) - if (block_tables[b][i].addr != 0) - total += block_tables[b][i].refcount; - for (i = block_limit[b]; i < BLOCK_MAX_COUNT(b); i++) + for (i = block_ring[b].first; i != block_ring[b].next; i = (i+1)%block_ring[b].size) if (block_tables[b][i].addr != 0) total += block_tables[b][i].refcount; } @@ -5233,20 +5314,11 @@ static void block_stats(void) struct block_desc *blk, *maxb = NULL; int max = 0; for (b = 0; b < ARRAY_SIZE(block_tables); b++) { - for (i = 0; i < block_counts[b]; i++) { - blk = &block_tables[b][i]; - if (blk->addr != 0 && blk->refcount > max) { - max = blk->refcount; - maxb = blk; - } - } - for (i = block_limit[b]; i < BLOCK_MAX_COUNT(b); i++) { - blk = &block_tables[b][i]; - if (blk->addr != 0 && blk->refcount > max) { + for (i = block_ring[b].first; i != 
block_ring[b].next; i = (i+1)%block_ring[b].size) + if ((blk = &block_tables[b][i])->addr != 0 && blk->refcount > max) { max = blk->refcount; maxb = blk; } - } } if (maxb == NULL) break; @@ -5255,12 +5327,9 @@ static void block_stats(void) maxb->refcount = 0; } - for (b = 0; b < ARRAY_SIZE(block_tables); b++) { - for (i = 0; i < block_counts[b]; i++) - block_tables[b][i].refcount = 0; - for (i = block_limit[b]; i < BLOCK_MAX_COUNT(b); i++) + for (b = 0; b < ARRAY_SIZE(block_tables); b++) + for (i = block_ring[b].first; i != block_ring[b].next; i = (i+1)%block_ring[b].size) block_tables[b][i].refcount = 0; - } #endif } @@ -5272,10 +5341,7 @@ void entry_stats(void) printf("block entry stats:\n"); for (b = 0; b < ARRAY_SIZE(block_tables); b++) { - for (i = 0; i < block_counts[b]; i++) - for (j = 0; j < block_tables[b][i].entry_count; j++) - total += block_tables[b][i].entryp[j].entry_count; - for (i = block_limit[b]; i < BLOCK_MAX_COUNT(b); i++) + for (i = block_ring[b].first; i != block_ring[b].next; i = (i+1)%block_ring[b].size) for (j = 0; j < block_tables[b][i].entry_count; j++) total += block_tables[b][i].entryp[j].entry_count; } @@ -5286,15 +5352,7 @@ void entry_stats(void) struct block_entry *maxb = NULL; int max = 0; for (b = 0; b < ARRAY_SIZE(block_tables); b++) { - for (i = 0; i < block_counts[b]; i++) { - blk = &block_tables[b][i]; - for (j = 0; j < blk->entry_count; j++) - if (blk->entryp[j].entry_count > max) { - max = blk->entryp[j].entry_count; - maxb = &blk->entryp[j]; - } - } - for (i = block_limit[b]; i < BLOCK_MAX_COUNT(b); i++) { + for (i = block_ring[b].first; i != block_ring[b].next; i = (i+1)%block_ring[b].size) { blk = &block_tables[b][i]; for (j = 0; j < blk->entry_count; j++) if (blk->entryp[j].entry_count > max) { @@ -5311,10 +5369,7 @@ void entry_stats(void) } for (b = 0; b < ARRAY_SIZE(block_tables); b++) { - for (i = 0; i < block_counts[b]; i++) - for (j = 0; j < block_tables[b][i].entry_count; j++) - 
block_tables[b][i].entryp[j].entry_count = 0; - for (i = block_limit[b]; i < BLOCK_MAX_COUNT(b); i++) + for (i = block_ring[b].first; i != block_ring[b].next; i = (i+1)%block_ring[b].size) for (j = 0; j < block_tables[b][i].entry_count; j++) block_tables[b][i].entryp[j].entry_count = 0; } @@ -5432,6 +5487,9 @@ int sh2_drc_init(SH2 *sh2) block_tables[i] = calloc(BLOCK_MAX_COUNT(i), sizeof(*block_tables[0])); if (block_tables[i] == NULL) goto fail; + entry_tables[i] = calloc(ENTRY_MAX_COUNT(i), sizeof(*entry_tables[0])); + if (entry_tables[i] == NULL) + goto fail; block_link_pool[i] = calloc(BLOCK_LINK_MAX_COUNT(i), sizeof(*block_link_pool[0])); if (block_link_pool[i] == NULL) @@ -5449,33 +5507,39 @@ int sh2_drc_init(SH2 *sh2) unresolved_links[i] = calloc(HASH_TABLE_SIZE(i), sizeof(*unresolved_links[0])); if (unresolved_links[i] == NULL) goto fail; +//atexit(sh2_drc_finish); + + RING_INIT(&block_ring[i], block_tables[i], BLOCK_MAX_COUNT(i)); + RING_INIT(&entry_ring[i], entry_tables[i], ENTRY_MAX_COUNT(i)); } - memset(block_counts, 0, sizeof(block_counts)); - for (i = 0; i < ARRAY_SIZE(block_counts); i++) { - block_limit[i] = BLOCK_MAX_COUNT(i) - 1; - } + + block_list_pool = calloc(BLOCK_LIST_MAX_COUNT, sizeof(*block_list_pool)); + if (block_list_pool == NULL) + goto fail; + block_list_pool_count = 0; + blist_free = NULL; + memset(block_link_pool_counts, 0, sizeof(block_link_pool_counts)); - for (i = 0; i < ARRAY_SIZE(blink_free); i++) { - blink_free[i] = NULL; - } + memset(blink_free, 0, sizeof(blink_free)); drc_cmn_init(); rcache_init(); + tcache_ptr = tcache; sh2_generate_utils(); host_instructions_updated(tcache, tcache_ptr); emith_update_cache(); - tcache_bases[0] = tcache_ptrs[0] = tcache_ptr; - tcache_limit[0] = tcache_bases[0] + tcache_sizes[0] - (tcache_ptr-tcache); - for (i = 1; i < ARRAY_SIZE(tcache_bases); i++) { - tcache_bases[i] = tcache_ptrs[i] = tcache_bases[i - 1] + tcache_sizes[i - 1]; - tcache_limit[i] = tcache_bases[i] + tcache_sizes[i]; + i = 
tcache_ptr - tcache; + RING_INIT(&tcache_ring[0], tcache_ptr, tcache_sizes[0] - i); + for (i = 1; i < ARRAY_SIZE(tcache_ring); i++) { + RING_INIT(&tcache_ring[i], tcache_ring[i-1].base + tcache_sizes[i-1], + tcache_sizes[i]); } #if (DRC_DEBUG & 4) for (i = 0; i < ARRAY_SIZE(block_tables); i++) - tcache_dsm_ptrs[i] = tcache_bases[i]; + tcache_dsm_ptrs[i] = tcache_ring[i].base; // disasm the utils tcache_dsm_ptrs[0] = tcache; do_host_disasm(0); @@ -5498,7 +5562,6 @@ int sh2_drc_init(SH2 *sh2) void sh2_drc_finish(SH2 *sh2) { - struct block_list *bl, *bn; int i; if (block_tables[0] == NULL) @@ -5514,17 +5577,22 @@ void sh2_drc_finish(SH2 *sh2) for (i = 0; i < TCACHE_BUFFERS; i++) { printf("~~~ tcache %d\n", i); #if 0 - tcache_dsm_ptrs[i] = tcache_bases[i]; - tcache_ptr = tcache_ptrs[i]; - do_host_disasm(i); - if (tcache_limit[i] < tcache_bases[i] + tcache_sizes[i]) { - tcache_dsm_ptrs[i] = tcache_limit[i]; - tcache_ptr = tcache_bases[i] + tcache_sizes[i]; + if (tcache_ring[i].first < tcache_ring[i].next) { + tcache_dsm_ptrs[i] = tcache_ring[i].first; + tcache_ptr = tcache_ring[i].next; + do_host_disasm(i); + } else if (tcache_ring[i].used) { + tcache_dsm_ptrs[i] = tcache_ring[i].first; + tcache_ptr = tcache_ring[i].base + tcache_ring[i].size; + do_host_disasm(i); + tcache_dsm_ptrs[i] = tcache_ring[i].base; + tcache_ptr = tcache_ring[i].next; do_host_disasm(i); } #endif printf("max links: %d\n", block_link_pool_counts[i]); } + printf("max block list: %d\n", block_list_pool_count); #endif sh2_drc_flush_all(); @@ -5533,6 +5601,9 @@ void sh2_drc_finish(SH2 *sh2) if (block_tables[i] != NULL) free(block_tables[i]); block_tables[i] = NULL; + if (entry_tables[i] != NULL) + free(entry_tables[i]); + entry_tables[i] = NULL; if (block_link_pool[i] != NULL) free(block_link_pool[i]); block_link_pool[i] = NULL; @@ -5548,10 +5619,9 @@ void sh2_drc_finish(SH2 *sh2) } } - for (bl = blist_free; bl; bl = bn) { - bn = bl->next; - free(bl); - } + if (block_list_pool != NULL) + 
free(block_list_pool); + block_list_pool = NULL; blist_free = NULL; drc_cmn_cleanup(); From a1efdc9eed5e441e576823d16f78f46edcdafc84 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 19 Nov 2019 21:59:44 +0100 Subject: [PATCH 081/174] sh2 drc, small improvements and bug fixes for code emitters --- cpu/drc/emit_arm.c | 20 ++++--- cpu/drc/emit_arm64.c | 14 +++-- cpu/drc/emit_mips.c | 32 +++++++---- cpu/drc/emit_riscv.c | 129 ++++++++++++++++++++++++++++++++++++------- cpu/drc/emit_x86.c | 11 +++- cpu/sh2/compiler.c | 71 ++++++++++++------------ 6 files changed, 194 insertions(+), 83 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 8f633fa3c..8ea148eb8 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -478,6 +478,7 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int switch (op) { case A_OP_MOV: + case A_OP_MVN: rn = 0; // use MVN if more bits 1 than 0 if (count_bits(imm) > 16) { @@ -501,7 +502,7 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int return; } #else - for (i = 2, u = v; i > 0; i--, u >>= 8) + for (i = 3, u = v; i > 0; i--, u >>= 8) while (u > 0xff && !(u & 3)) u >>= 2; if (u) { // 4 insns needed... 
@@ -1387,22 +1388,25 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) } while (0) /* + * T = carry(Rn = (Rn << 1) | T) * if Q - * t = carry(Rn += Rm) + * T ^= !carry(Rn += Rm) * else - * t = carry(Rn -= Rm) - * T ^= t + * T ^= !carry(Rn -= Rm) */ #define emith_sh2_div1_step(rn, rm, sr) do { \ void *jmp0, *jmp1; \ + emith_tpop_carry(sr, 0); /* Rn = 2*Rn+T */\ + emith_adcf_r_r_r(rn, rn, rn); \ + emith_tpush_carry(sr, 0); \ emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ JMP_POS(jmp0); /* beq do_sub */ \ - emith_addf_r_r(rn, rm); \ - emith_eor_r_imm_c(A_COND_CS, sr, T); \ + emith_addf_r_r(rn, rm); /* Rn += Rm */ \ + emith_eor_r_imm_c(A_COND_CC, sr, T); \ JMP_POS(jmp1); /* b done */ \ JMP_EMIT(A_COND_EQ, jmp0); /* do_sub: */ \ - emith_subf_r_r(rn, rm); \ - emith_eor_r_imm_c(A_COND_CC, sr, T); \ + emith_subf_r_r(rn, rm); /* Rn -= Rm */ \ + emith_eor_r_imm_c(A_COND_CS, sr, T); \ JMP_EMIT(A_COND_AL, jmp1); /* done: */ \ } while (0) diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 3f40d4cd7..8f4718ee5 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -372,7 +372,7 @@ enum { AM_IDX, AM_IDXPOST, AM_IDXREG, AM_IDXPRE }; #define EMITH_HINT_COND(cond) /**/ -// "simple" jump (no more then a few insns) +// "simple" jump (no more than a few insns) // ARM32 will use conditional instructions here #define EMITH_SJMP_START EMITH_JMP_START #define EMITH_SJMP_END EMITH_JMP_END @@ -1240,22 +1240,26 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) } while (0) /* + * T = carry(Rn = (Rn << 1) | T) * if Q - * t = carry(Rn += Rm) + * t = !carry(Rn += Rm) * else - * t = carry(Rn -= Rm) + * t = !carry(Rn -= Rm) * T ^= t */ #define emith_sh2_div1_step(rn, rm, sr) do { \ int tmp_ = rcache_get_tmp(); \ - emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ + emith_tpop_carry(sr, 0); \ + emith_adcf_r_r_r(rn, rn, rn); \ + emith_tpush_carry(sr, 0); \ + emith_tst_r_imm(sr, Q); \ EMITH_SJMP3_START(DCOND_EQ); \ emith_addf_r_r(rn, rm); \ 
emith_adc_r_r_r(tmp_, Z0, Z0); \ + emith_eor_r_imm(tmp_, 1); \ EMITH_SJMP3_MID(DCOND_EQ); \ emith_subf_r_r(rn, rm); \ emith_adc_r_r_r(tmp_, Z0, Z0); \ - emith_eor_r_imm(tmp_, 1); \ EMITH_SJMP3_END(); \ emith_eor_r_r(sr, tmp_); \ rcache_free_tmp(tmp_); \ diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index 6f07e509b..c9c006c84 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -7,9 +7,10 @@ */ #define HOST_REGS 32 -// MIPS ABI: params: r4-r7, return: r2-r3, temp: r1(at),r8-r15,r24-r25,r31(ra), +// MIPS32 ABI: params: r4-r7, return: r2-r3, temp: r1(at),r8-r15,r24-r25,r31(ra) // saved: r16-r23,r30, reserved: r0(zero), r26-r27(irq), r28(gp), r29(sp) // r1,r15,r24,r25(at,t7-t9) are used internally by the code emitter +// MIPSN32/MIPS64 ABI: params: r4-r11, no caller-reserved save area on stack #define RET_REG 2 // v0 #define PARAM_REGS { 4, 5, 6, 7 } // a0-a3 #define PRESERVED_REGS { 16, 17, 18, 19, 20, 21, 22, 23 } // s0-s7 @@ -424,7 +425,7 @@ static void *emith_branch(u32 op) JMP_EMIT_NC(else_ptr); \ } -// "simple" jump (no more then a few insns) +// "simple" jump (no more than a few insns) // ARM32 will use conditional instructions here #define EMITH_SJMP_START EMITH_JMP_START #define EMITH_SJMP_END EMITH_JMP_END @@ -761,7 +762,7 @@ static void emith_move_imm(int r, uintptr_t imm) EMIT(MIPS_OR_IMM(r, r, imm & 0xffff)); } else #endif - if ((s16)imm == imm) { + if ((s16)imm == imm) { EMIT(MIPS_ADD_IMM(r, Z0, imm)); } else if (!((u32)imm >> 16)) { EMIT(MIPS_OR_IMM(r, Z0, imm)); @@ -1576,22 +1577,31 @@ static int emith_cond_check(int cond, int *r) } while (0) /* + * T = !carry(Rn = (Rn << 1) | T) * if Q - * t = carry(Rn += Rm) + * C = carry(Rn += Rm) * else - * t = carry(Rn -= Rm) - * T ^= t + * C = carry(Rn -= Rm) + * T ^= C */ #define emith_sh2_div1_step(rn, rm, sr) do { \ + int t_ = rcache_get_tmp(); \ + emith_and_r_r_imm(AT, sr, T); \ + emith_lsr(FC, rn, 31); /*Rn = (Rn<<1)+T*/ \ + emith_lsl(t_, rn, 1); \ + emith_or_r_r(t_, AT); \ + 
emith_or_r_imm(sr, T); /* T = !carry */ \ + emith_eor_r_r(sr, FC); \ emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ EMITH_JMP3_START(DCOND_EQ); \ - EMITH_HINT_COND(DCOND_CS); \ - emith_addf_r_r(rn, rm); \ + emith_add_r_r_r(rn, t_, rm); \ + EMIT(MIPS_SLTU_REG(FC, rn, t_)); \ EMITH_JMP3_MID(DCOND_EQ); \ - EMITH_HINT_COND(DCOND_CS); \ - emith_subf_r_r(rn, rm); \ + emith_sub_r_r_r(rn, t_, rm); \ + EMIT(MIPS_SLTU_REG(FC, t_, rn)); \ EMITH_JMP3_END(); \ - emith_eor_r_r(sr, FC); \ + emith_eor_r_r(sr, FC); /* T ^= carry */ \ + rcache_free_tmp(t_); \ } while (0) /* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ diff --git a/cpu/drc/emit_riscv.c b/cpu/drc/emit_riscv.c index 84c3ccb2c..b66d6350b 100644 --- a/cpu/drc/emit_riscv.c +++ b/cpu/drc/emit_riscv.c @@ -7,7 +7,7 @@ */ #define HOST_REGS 32 -// RISC-V ABI: params: x10-x17, return: r10-x11, temp: x1(ra),x5-x7,x28-x31 +// RISC-V ABI: params: x10-x17, return: x10-x11, temp: x1(ra),x5-x7,x28-x31 // saved: x8(fp),x9,x18-x27, reserved: x0(zero), x4(tp), x3(gp), x2(sp) // x28-x31(t3-t6) are used internally by the code emitter #define RET_REG 10 // a0 @@ -74,13 +74,14 @@ _CB(imm,8,12,0), rd, op) // opcode -enum { OP_LUI=0x37, OP_JAL=0x6f, OP_JALR=0x67, OP_BCOND=0x63, OP_LD=0x03, - OP_ST=0x23, OP_IMM=0x13, OP_IMM32=0x1b, OP_REG=0x33, OP_REG32=0x3b }; +enum { OP_LUI=0x37, OP_AUIPC=0x17, OP_JAL=0x6f, // 20-bit immediate + OP_JALR=0x67, OP_BCOND=0x63, OP_LD=0x03, OP_ST=0x23, // 12-bit immediate + OP_IMM=0x13, OP_REG=0x33, OP_IMM32=0x1b, OP_REG32=0x3b }; // func3 -enum { F1_ADD, F1_SL, F1_SLT, F1_SLTU, F1_XOR, F1_SR, F1_OR, F1_AND }; -enum { F1_BEQ, F1_BNE, F1_BLT=4, F1_BGE, F1_BLTU, F1_BGEU }; -enum { F1_B, F1_H, F1_W, F1_D, F1_BU, F1_HU, F1_WU }; +enum { F1_ADD, F1_SL, F1_SLT, F1_SLTU, F1_XOR, F1_SR, F1_OR, F1_AND };// IMM/REG enum { F1_MUL, F1_MULH, F1_MULHSU, F1_MULHU, F1_DIV, F1_DIVU, F1_REM, F1_REMU }; +enum { F1_BEQ, F1_BNE, F1_BLT=4, F1_BGE, F1_BLTU, F1_BGEU }; // BCOND +enum { F1_B, F1_H, F1_W, F1_D, 
F1_BU, F1_HU, F1_WU }; // LD/ST // func7 enum { F2_ALT=0x20, F2_MULDIV=0x01 }; @@ -141,6 +142,8 @@ enum { F2_ALT=0x20, F2_MULDIV=0x01 }; R5_OR_IMM(rd, Z0, imm12) #define R5_MOVT_IMM(rd, imm20) \ R5_U_INSN(OP_LUI, rd, imm20) +#define R5_MOVA_IMM(rd, imm20) \ + R5_U_INSN(OP_AUIPC, rd, imm20) // rd = rs SHIFT imm5/imm6 #define R5_LSL_IMM(rd, rs, bits) \ @@ -212,8 +215,10 @@ enum { F2_ALT=0x20, F2_MULDIV=0x01 }; #define PTR_SCALE 3 // NB: must split 64 bit result into 2 32 bit registers -// NB: this expects 32 bit values in s1+s2, correctly sign extended to 64 bits +// NB: expects 32 bit values in s1+s2, correctly sign extended to 64 bits #define EMIT_R5_MULLU_REG(dlo, dhi, s1, s2) do { \ + /*EMIT(R5_ADDW_IMM(s1, s1, 0));*/ \ + /*EMIT(R5_ADDW_IMM(s2, s2, 0));*/ \ EMIT(R5_MUL(dlo, s1, s2)); \ EMIT(R5_LSR_IMM(dhi, dlo, 32)); \ EMIT(R5_LSL_IMM(dlo, dlo, 32)); \ @@ -307,7 +312,7 @@ enum { F2_ALT=0x20, F2_MULDIV=0x01 }; JMP_EMIT_NC(else_ptr); \ } -// "simple" jump (no more then a few insns) +// "simple" jump (no more than a few insns) // ARM32 will use conditional instructions here #define EMITH_SJMP_START EMITH_JMP_START #define EMITH_SJMP_END EMITH_JMP_END @@ -620,6 +625,67 @@ static void emith_set_compare_flags(int rs, int rt, s32 imm) // move immediate +#define MAX_HOST_LITERALS 32 // pool must be smaller than 4 KB +static uintptr_t literal_pool[MAX_HOST_LITERALS]; +static u32 *literal_insn[MAX_HOST_LITERALS]; +static int literal_pindex, literal_iindex; + +static inline int emith_pool_literal(uintptr_t imm) +{ + int idx = literal_pindex - 8; // max look behind in pool + // see if one of the last literals was the same (or close enough) + for (idx = (idx < 0 ? 
0 : idx); idx < literal_pindex; idx++) + if (imm == literal_pool[idx]) + break; + if (idx == literal_pindex) // store new literal + literal_pool[literal_pindex++] = imm; + return idx; +} + +static void emith_pool_commit(int jumpover) +{ + int i, sz = literal_pindex * sizeof(uintptr_t); + u8 *pool = (u8 *)tcache_ptr; + + // nothing to commit if pool is empty + if (sz == 0) + return; + // align pool to pointer size + if (jumpover) + pool += sizeof(u32); + i = (uintptr_t)pool & (sizeof(void *)-1); + pool += (i ? sizeof(void *)-i : 0); + // need branch over pool if not at block end + if (jumpover) + EMIT(R5_B(sz + (pool-(u8 *)tcache_ptr))); + // safety check - pool must be after insns and reachable + if ((u32)(pool - (u8 *)literal_insn[0] + 8) > 0x7ff) { + elprintf(EL_STATUS|EL_SVP|EL_ANOMALY, + "pool offset out of range"); + exit(1); + } + // copy pool and adjust addresses in insns accessing the pool + memcpy(pool, literal_pool, sz); + for (i = 0; i < literal_iindex; i++) { + *literal_insn[i] += ((u8 *)pool - (u8 *)literal_insn[i]) << 20; + } + // count pool constants as insns for statistics + for (i = 0; i < literal_pindex * sizeof(uintptr_t)/sizeof(u32); i++) + COUNT_OP; + + tcache_ptr = (void *)((u8 *)pool + sz); + literal_pindex = literal_iindex = 0; +} + +static void emith_pool_check(void) +{ + // check if pool must be committed + if (literal_iindex > MAX_HOST_LITERALS-4 || (literal_pindex && + (u8 *)tcache_ptr - (u8 *)literal_insn[0] > 0x700)) + // pool full, or displacement is approaching the limit + emith_pool_commit(1); +} + static void emith_move_imm(int r, uintptr_t imm) { u32 lui = imm + _CB(imm,1,11,12); @@ -632,8 +698,24 @@ static void emith_move_imm(int r, uintptr_t imm) EMIT(R5_ADD_IMM(r, Z0, imm)); } +static void emith_move_ptr_imm(int r, uintptr_t imm) +{ +#if __riscv_xlen == 64 + if ((s32)imm != imm) { + int idx; + if (literal_iindex >= MAX_HOST_LITERALS) + emith_pool_commit(1); + idx = emith_pool_literal(imm); + EMIT(R5_MOVA_IMM(AT, 0)); // loads 
PC of MOVA insn... + 4 in LD + literal_insn[literal_iindex++] = (u32 *)tcache_ptr; + EMIT(R5_I_INSN(OP_LD, F1_P, r, AT, idx*sizeof(uintptr_t) + 4)); + } else +#endif + emith_move_imm(r, imm); +} + #define emith_move_r_ptr_imm(r, imm) \ - emith_move_imm(r, (uintptr_t)(imm)) + emith_move_ptr_imm(r, (uintptr_t)(imm)) #define emith_move_r_imm(r, imm) \ emith_move_imm(r, (u32)(imm)) @@ -644,7 +726,6 @@ static void emith_move_imm(int r, uintptr_t imm) EMIT(R5_ADD_IMM(r, Z0, (s8)(imm))) #define emith_move_r_imm_s8_patch(ptr, imm) do { \ u32 *ptr_ = (u32 *)ptr; \ - while ((*ptr_ & 0xff07f) != R5_ADD_IMM(Z0, Z0, 0)) ptr_++; \ EMIT_PTR(ptr_, (*ptr_ & 0x000fffff) | ((u16)(s8)(imm)<<20)); \ } while (0) @@ -1235,7 +1316,6 @@ static int emith_cond_check(int cond, int *r, int *s) // NB: returns position of patch for cache maintenance #define emith_jump_patch(ptr, target, pos) do { \ u32 *ptr_ = (u32 *)ptr; /* must skip condition check code */ \ - while ((*ptr_&0x77) != OP_JALR && (*ptr_&0x77) != OP_BCOND) ptr_ ++; \ if ((*ptr_&0x77) == OP_BCOND) { \ u32 *p_ = ptr_, disp_ = (u8 *)target - (u8 *)ptr_; \ u32 f1_ = _CB(*ptr_,3,12,0); \ @@ -1319,8 +1399,6 @@ static int emith_cond_check(int cond, int *r, int *s) // emitter ABI stuff -#define emith_pool_check() /**/ -#define emith_pool_commit(j) /**/ #define emith_insn_ptr() ((u8 *)tcache_ptr) #define emith_flush() /**/ #define host_instructions_updated(base, end) __builtin___clear_cache(base, end) @@ -1404,22 +1482,31 @@ static int emith_cond_check(int cond, int *r, int *s) } while (0) /* + * T = !carry(Rn = (Rn << 1) | T) * if Q - * t = carry(Rn += Rm) + * C = carry(Rn += Rm) * else - * t = carry(Rn -= Rm) - * T ^= t + * C = carry(Rn -= Rm) + * T ^= C */ #define emith_sh2_div1_step(rn, rm, sr) do { \ + int t_ = rcache_get_tmp(); \ + emith_and_r_r_imm(AT, sr, T); \ + emith_lsr(FC, rn, 31); /*Rn = (Rn<<1)+T*/ \ + emith_lsl(t_, rn, 1); \ + emith_or_r_r(t_, AT); \ + emith_or_r_imm(sr, T); /* T = !carry */ \ + emith_eor_r_r(sr, FC); \ 
emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ EMITH_JMP3_START(DCOND_EQ); \ - EMITH_HINT_COND(DCOND_CS); \ - emith_addf_r_r(rn, rm); \ + emith_add_r_r_r(rn, t_, rm); \ + EMIT(R5_SLTU_REG(FC, rn, t_)); \ EMITH_JMP3_MID(DCOND_EQ); \ - EMITH_HINT_COND(DCOND_CS); \ - emith_subf_r_r(rn, rm); \ + emith_sub_r_r_r(rn, t_, rm); \ + EMIT(R5_SLTU_REG(FC, t_, rn)); \ EMITH_JMP3_END(); \ - emith_eor_r_r(sr, FC); \ + emith_eor_r_r(sr, FC); /* T ^= carry */ \ + rcache_free_tmp(t_); \ } while (0) /* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 9ed8b5638..0b3f76970 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -974,7 +974,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common JMP8_EMIT_NC(else_ptr); \ } -// "simple" jump (no more then a few insns) +// "simple" jump (no more than a few insns) // ARM will use conditional instructions here #define EMITH_SJMP_START EMITH_JMP_START #define EMITH_SJMP_END EMITH_JMP_END @@ -1287,15 +1287,19 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common emith_adc_r_r(sr, sr) /* + * T = carry(Rn = (Rn << 1) | T) * if Q * t = carry(Rn += Rm) * else * t = carry(Rn -= Rm) - * T ^= t + * T = !(T ^ t) */ #define emith_sh2_div1_step(rn, rm, sr) do { \ u8 *jmp0, *jmp1; \ int tmp_ = rcache_get_tmp(); \ + emith_tpop_carry(sr, 0); /* Rn = 2*Rn+T */\ + emith_adcf_r_r_r(rn, rn, rn); \ + emith_tpush_carry(sr, 0); /* T = C1 */ \ emith_eor_r_r(tmp_, tmp_); \ emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ JMP8_POS(jmp0); /* je do_sub */ \ @@ -1305,7 +1309,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common emith_sub_r_r(rn, rm); \ JMP8_EMIT_NC(jmp1); /* done: */ \ emith_adc_r_r(tmp_, tmp_); \ - emith_eor_r_r(sr, tmp_); \ + emith_eor_r_r(sr, tmp_);/* T = !(C1^C2) */\ + emith_eor_r_imm(sr, T); \ rcache_free_tmp(tmp_); \ } while (0) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 
58ddd86f8..a12dfe967 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -2957,20 +2957,18 @@ static void emit_branch_linkage_code(SH2 *sh2, struct block_desc *block, int tca struct block_link *bl; int u, v, tmp; + emith_flush(); for (u = 0; u < link_count; u++) { emith_pool_check(); // look up local branch targets - v = find_in_sorted_linkage(targets, target_count, links[u].pc); - if (v >= 0) { - if (! targets[v].ptr) { + if (links[u].mask & 0x2) { + v = find_in_sorted_linkage(targets, target_count, links[u].pc); + if (v < 0 || ! targets[v].ptr) { // forward branch not yet resolved, prepare external linking emith_jump_patch(links[u].ptr, tcache_ptr, NULL); bl = dr_prepare_ext_branch(block->entryp, links[u].pc, sh2->is_slave, tcache_id); - if (bl) { - emith_flush(); // flush to inhibit insn swapping + if (bl) bl->type = BL_LDJMP; - } - tmp = rcache_get_tmp_arg(0); emith_move_r_imm(tmp, links[u].pc); rcache_free_tmp(tmp); @@ -2985,7 +2983,7 @@ static void emit_branch_linkage_code(SH2 *sh2, struct block_desc *block, int tca } } else { // external or exit, emit blx area entry - void *target = (links[u].pc & 1 ? sh2_drc_exit : sh2_drc_dispatcher); + void *target = (links[u].mask & 0x1 ? 
sh2_drc_exit : sh2_drc_dispatcher); if (links[u].bl) links[u].bl->blx = tcache_ptr; emith_jump_patch(links[u].ptr, tcache_ptr, NULL); @@ -3024,6 +3022,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) u8 op_flags[BLOCK_INSN_LIMIT]; + enum flg_states { FLG_UNKNOWN, FLG_UNUSED, FLG_0, FLG_1 }; struct drcf { int delay_reg:8; u32 loop_type:8; @@ -3032,6 +3031,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) u32 test_irq:1; u32 pending_branch_direct:1; u32 pending_branch_indirect:1; + u32 Tflag:2, Mflag:2; } drcf = { 0, }; #if LOOP_OPTIMIZER @@ -3169,7 +3169,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (m3 && count_bits(m3) < count_bits(rcache_vregs_reg) && pinned_loop_count < ARRAY_SIZE(pinned_loops)-1) { pinned_loops[pinned_loop_count++] = - (struct linkage) { .mask = m3, .pc = base_pc + 2*v }; + (struct linkage) { .pc = base_pc + 2*v, .mask = m3 }; } else op_flags[v] &= ~OF_BASIC_LOOP; } @@ -3220,6 +3220,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); FLUSH_CYCLES(sr); emith_sync_t(sr); + drcf.Mflag = FLG_UNKNOWN; rcache_flush(); emith_flush(); } @@ -3302,7 +3303,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if (blx_target_count < ARRAY_SIZE(blx_targets)) { // exit via stub in blx table (saves some 1-3 insns in the main flow) blx_targets[blx_target_count++] = - (struct linkage) { .ptr = tcache_ptr, .pc = pc|1, .bl = NULL }; + (struct linkage) { .pc = pc, .ptr = tcache_ptr, .mask = 0x1 }; emith_jump_patchable(tcache_ptr); } else { // blx table full, must inline exit code @@ -3319,7 +3320,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // exit via stub in blx table (saves some 1-3 insns in the main flow) emith_cmp_r_imm(sr, 0); blx_targets[blx_target_count++] = - (struct linkage) { .ptr = tcache_ptr, .pc = pc|1, .bl = NULL }; + (struct linkage) { .pc = pc, .ptr = tcache_ptr, .mask = 0x1 }; 
emith_jump_cond_patchable(DCOND_LE, tcache_ptr); } else { // blx table full, must inline exit code @@ -3704,6 +3705,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_invalidate_t(); emith_bic_r_imm(sr, M|Q|T); + drcf.Mflag = FLG_0; break; case 2: // MOVT Rn 0000nnnn00101001 sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); @@ -3781,6 +3783,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) emith_eor_r_r_lsr(tmp, tmp2, 31); emith_or_r_r(sr, tmp); // T = Q^M rcache_free(tmp); + drcf.Mflag = FLG_UNKNOWN; goto end_op; case 0x08: // TST Rm,Rn 0010nnnnmmmm1000 sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); @@ -3846,17 +3849,16 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); tmp = rcache_get_reg(SHR_MACL, RC_GR_WRITE, NULL); + tmp4 = rcache_get_tmp(); if (op & 1) { emith_sext(tmp, tmp2, 16); - } else + emith_sext(tmp4, tmp3, 16); + } else { emith_clear_msb(tmp, tmp2, 16); - tmp2 = rcache_get_tmp(); - if (op & 1) { - emith_sext(tmp2, tmp3, 16); - } else - emith_clear_msb(tmp2, tmp3, 16); - emith_mul(tmp, tmp, tmp2); - rcache_free_tmp(tmp2); + emith_clear_msb(tmp4, tmp3, 16); + } + emith_mul(tmp, tmp, tmp4); + rcache_free_tmp(tmp4); goto end_op; } goto default_; @@ -3904,28 +3906,27 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // Q = M ^ Q1 ^ Q2 // T = (Q == M) = !(Q ^ M) = !(Q1 ^ Q2) tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); - tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp4); + tmp2 = rcache_get_reg(GET_Rn(), RC_GR_RMW, NULL); sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); emith_sync_t(sr); - EMITH_HINT_COND(DCOND_CS); - emith_tpop_carry(sr, 0); - emith_adcf_r_r_r(tmp2, tmp4, tmp4); - emith_tpush_carry(sr, 0); // keep Q1 in T for now - rcache_free(tmp4); tmp = rcache_get_tmp(); - emith_and_r_r_imm(tmp, sr, M); - emith_eor_r_r_lsr(sr, tmp, 
M_SHIFT - Q_SHIFT); // Q ^= M + if (drcf.Mflag != FLG_0) { + emith_and_r_r_imm(tmp, sr, M); + emith_eor_r_r_lsr(sr, tmp, M_SHIFT - Q_SHIFT); // Q ^= M + } rcache_free_tmp(tmp); - // add or sub, invert T if carry to get Q1 ^ Q2 - // in: (Q ^ M) passed in Q, Q1 in T + // shift Rn, add T, add or sub Rm, set T = !(Q1 ^ Q2) + // in: (Q ^ M) passed in Q emith_sh2_div1_step(tmp2, tmp3, sr); tmp = rcache_get_tmp(); - emith_bic_r_imm(sr, Q); // Q = M - emith_and_r_r_imm(tmp, sr, M); - emith_or_r_r_lsr(sr, tmp, M_SHIFT - Q_SHIFT); - emith_and_r_r_imm(tmp, sr, T); // Q = M ^ Q1 ^ Q2 + emith_or_r_imm(sr, Q); // Q = !T + emith_and_r_r_imm(tmp, sr, T); emith_eor_r_r_lsl(sr, tmp, Q_SHIFT); - emith_eor_r_imm(sr, T); // T = !(Q1 ^ Q2) + if (drcf.Mflag != FLG_0) { // Q = M ^ !T = M ^ Q1 ^ Q2 + emith_and_r_r_imm(tmp, sr, M); + emith_eor_r_r_lsr(sr, tmp, M_SHIFT - Q_SHIFT); + } + rcache_free_tmp(tmp); goto end_op; case 0x05: // DMULU.L Rm,Rn 0011nnnnmmmm0101 tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); @@ -4627,7 +4628,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // local forward jump target = tcache_ptr; blx_targets[blx_target_count++] = - (struct linkage) { .pc = target_pc, .ptr = target, .bl = NULL }; + (struct linkage) { .pc = target_pc, .ptr = target, .mask = 0x2 }; if (cond != -1) emith_jump_cond_patchable(cond, target); else { From 58e4b59f4b7d09b6df2ece98676548fed4524db6 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 20 Nov 2019 01:01:33 +0100 Subject: [PATCH 082/174] sh2 drc: fixed some RISC-V bugs --- README.md | 8 ++++---- cpu/drc/emit_riscv.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 8154f7dc0..67f60c2cf 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ This is my foray into dynamic recompilation using PicoDrive, a Megadrive / Genesis / Sega CD / Mega CD / 32X / SMS emulator. 
-I added support for MIPS (mips32r1) and ARM64 (aarch64) to the recompiler, as -well as spent much effort to optimize the code generated by the DRC. +I added support for MIPS (mips32r1), ARM64 (aarch64) and RISC-V (RV64IM) to the +SH2 recompiler, as well as spent much effort to optimize the DRC-generated code. I also optimized SH2 memory access inside the emulator, and did some work on M68K/SH2 CPU synchronization to fix some problems and speed up the emulator. -It got a bit out of hand. I ended up doing fixes and optimzations all over the +It got a bit out of hand. I ended up doing fixes and optimizations all over the place, mainly for 32X and CD, 32X graphics handling, and probably some more, -see the commit history. +see the commit history. As a result, 32X emulation speed has improved a lot. ### compiling diff --git a/cpu/drc/emit_riscv.c b/cpu/drc/emit_riscv.c index b66d6350b..fe4da0350 100644 --- a/cpu/drc/emit_riscv.c +++ b/cpu/drc/emit_riscv.c @@ -220,9 +220,9 @@ enum { F2_ALT=0x20, F2_MULDIV=0x01 }; /*EMIT(R5_ADDW_IMM(s1, s1, 0));*/ \ /*EMIT(R5_ADDW_IMM(s2, s2, 0));*/ \ EMIT(R5_MUL(dlo, s1, s2)); \ - EMIT(R5_LSR_IMM(dhi, dlo, 32)); \ + EMIT(R5_ASR_IMM(dhi, dlo, 32)); \ EMIT(R5_LSL_IMM(dlo, dlo, 32)); \ - EMIT(R5_LSR_IMM(dlo, dlo, 32)); \ + EMIT(R5_ASR_IMM(dlo, dlo, 32)); \ } while (0) #define EMIT_R5_MULLS_REG(dlo, dhi, s1, s2) \ @@ -1322,7 +1322,7 @@ static int emith_cond_check(int cond, int *r, int *s) u32 r_ = _CB(*ptr_,5,15,0), s_ = _CB(*ptr_,5,20,0); \ EMIT_PTR(p_, R5_BCOND(f1_, r_, s_, disp_ & 0x00001fff)); \ } else { \ - u32 *p_ = -- ptr_; \ + u32 *p_ = ptr_; \ uintptr_t target_ = (uintptr_t)(target); \ EMIT_PTR(p_, R5_MOVT_IMM(AT, target_ + _CB(target_,1,11,12))); \ EMIT_PTR(p_, R5_JR(AT, target_)); \ From 62f827c4540f478194f1026a7ae1ff20ac1b538a Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 27 Nov 2019 21:02:53 +0100 Subject: [PATCH 083/174] sh2 drc: bug fixing --- cpu/drc/emit_arm64.c | 2 +- cpu/drc/emit_mips.c | 10 ++++++---- 
cpu/drc/emit_riscv.c | 9 ++++----- cpu/sh2/compiler.c | 28 +++++++++++++++++----------- 4 files changed, 28 insertions(+), 21 deletions(-) diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 8f4718ee5..7a8327479 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -25,7 +25,7 @@ #define PR 18 // platform register // All operations but ptr ops are using the lower 32 bits of the A64 registers. -// The upper 32 bits are only used in ptr ops. +// The upper 32 bits are only used in ptr ops and are zeroed by A64 32 bit ops. #define A64_COND_EQ 0x0 diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index c9c006c84..062737f62 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -33,6 +33,8 @@ #define FC 24 // emulated processor flags: C (bit 0), others 0 #define FV 25 // emulated processor flags: Nt^Ns (bit 31). others x +// All operations but ptr ops are using the lower 32 bits of the registers. +// The upper 32 bits always contain the sign extension from the lower 32 bits. 
// unified conditions; virtual, not corresponding to anything real on MIPS #define DCOND_EQ 0x0 @@ -1095,10 +1097,10 @@ static void emith_lohi_nops(void) emith_lohi_nops(); \ EMIT(MIPS_MULT(s1, s2)); \ EMIT(MIPS_MFLO(AT)); \ - emith_add_r_r(dlo, AT); \ - EMIT(MIPS_SLTU_REG(t_, dlo, AT)); \ - EMIT(MIPS_MFHI(AT)); \ + EMIT(MIPS_MFHI(t_)); \ last_lohi = (u8 *)tcache_ptr; \ + emith_add_r_r(dlo, AT); \ + EMIT(MIPS_SLTU_REG(AT, dlo, AT)); \ emith_add_r_r(dhi, AT); \ emith_add_r_r(dhi, t_); \ rcache_free_tmp(t_); \ @@ -1479,7 +1481,7 @@ static int emith_cond_check(int cond, int *r) // NB: ABI SP alignment is 8 for compatibility with MIPS IV #define emith_push_ret(r) do { \ - emith_add_r_r_ptr_imm(SP, SP, -8-16); /* ABI: 16 byte arg save area */ \ + emith_add_r_r_ptr_imm(SP, SP, -8-16); /* O32: 16 byte arg save area */ \ emith_write_r_r_offs(LR, SP, 4+16); \ if ((r) > 0) emith_write_r_r_offs(r, SP, 0+16); \ } while (0) diff --git a/cpu/drc/emit_riscv.c b/cpu/drc/emit_riscv.c index fe4da0350..0f614f18d 100644 --- a/cpu/drc/emit_riscv.c +++ b/cpu/drc/emit_riscv.c @@ -30,6 +30,8 @@ #define FC 29 // emulated processor flags: C (bit 0), others 0 #define FV 28 // emulated processor flags: Nt^Ns (bit 31). others x +// All operations but ptr ops are using the lower 32 bits of the registers. +// The upper 32 bits always contain the sign extension from the lower 32 bits. 
// unified conditions; virtual, not corresponding to anything real on RISC-V #define DCOND_EQ 0x0 @@ -217,12 +219,9 @@ enum { F2_ALT=0x20, F2_MULDIV=0x01 }; // NB: must split 64 bit result into 2 32 bit registers // NB: expects 32 bit values in s1+s2, correctly sign extended to 64 bits #define EMIT_R5_MULLU_REG(dlo, dhi, s1, s2) do { \ - /*EMIT(R5_ADDW_IMM(s1, s1, 0));*/ \ - /*EMIT(R5_ADDW_IMM(s2, s2, 0));*/ \ EMIT(R5_MUL(dlo, s1, s2)); \ EMIT(R5_ASR_IMM(dhi, dlo, 32)); \ - EMIT(R5_LSL_IMM(dlo, dlo, 32)); \ - EMIT(R5_ASR_IMM(dlo, dlo, 32)); \ + EMIT(R5_ADDW_IMM(dlo, dlo, 0)); \ } while (0) #define EMIT_R5_MULLS_REG(dlo, dhi, s1, s2) \ @@ -633,7 +632,7 @@ static int literal_pindex, literal_iindex; static inline int emith_pool_literal(uintptr_t imm) { int idx = literal_pindex - 8; // max look behind in pool - // see if one of the last literals was the same (or close enough) + // see if one of the last literals was the same for (idx = (idx < 0 ? 0 : idx); idx < literal_pindex; idx++) if (imm == literal_pool[idx]) break; diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index a12dfe967..57bfc212d 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -7,21 +7,24 @@ * See COPYING file in the top-level directory. 
* * notes: - * - tcache, block descriptor, link buffer overflows result in sh2_translate() - * failure, followed by full tcache invalidation for that region + * - tcache, block descriptor, block entry buffer overflows result in oldest + * blocks being deleted until enough space is available + * - link and list element buffer overflows result in failure and exit * - jumps between blocks are tracked for SMC handling (in block_entry->links), - * except jumps between different tcaches + * except jumps from global to CPU-local tcaches * * implemented: * - static register allocation * - remaining register caching and tracking in temporaries * - block-local branch linking - * - block linking (except between tcaches) + * - block linking * - some constant propagation + * - call stack caching for host block entry address + * - delay, poll, and idle loop detection and handling + * - some T/M flag optimizations where the value is known or isn't used * * TODO: * - better constant propagation - * - stack caching? 
* - bug fixing */ #include @@ -1068,7 +1071,7 @@ static struct block_desc *dr_add_block(int entries, u32 addr, int size, if (be != NULL) dbg(1, "block override for %08x", addr); - if (block_ring[tcache_id].used + 1 > block_ring[tcache_id].size || + if (block_ring[tcache_id].used + 1 > block_ring[tcache_id].size || entry_ring[tcache_id].used + entries > entry_ring[tcache_id].size) { dbg(1, "bd overflow for tcache %d", tcache_id); return NULL; @@ -3014,13 +3017,13 @@ static void *dr_get_pc_base(u32 pc, SH2 *sh2); static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { // branch targets in current block - struct linkage branch_targets[MAX_LOCAL_TARGETS]; + static struct linkage branch_targets[MAX_LOCAL_TARGETS]; int branch_target_count = 0; // unresolved local or external targets with block link/exit area if needed - struct linkage blx_targets[MAX_LOCAL_BRANCHES]; + static struct linkage blx_targets[MAX_LOCAL_BRANCHES]; int blx_target_count = 0; - u8 op_flags[BLOCK_INSN_LIMIT]; + static u8 op_flags[BLOCK_INSN_LIMIT]; enum flg_states { FLG_UNKNOWN, FLG_UNUSED, FLG_0, FLG_1 }; struct drcf { @@ -3037,7 +3040,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #if LOOP_OPTIMIZER // loops with pinned registers for optimzation // pinned regs are like statics and don't need saving/restoring inside a loop - struct linkage pinned_loops[MAX_LOCAL_TARGETS/16]; + static struct linkage pinned_loops[MAX_LOCAL_TARGETS/16]; int pinned_loop_count = 0; #endif @@ -3479,6 +3482,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // no sense in looking any further than the next rcache flush tmp = ((op_flags[i+v] & OF_BTARGET) || (op_flags[i+v-1] & OF_DELAY_OP) || (OP_ISBRACND(opd[v-1].op) && !(op_flags[i+v] & OF_DELAY_OP))); + // XXX looking behind cond branch to avoid evicting regs used later? 
if (pc + 2*v <= end_pc && !tmp) { // (pc already incremented above) late |= opd[v].source & ~write; // ignore source regs after they have been written to @@ -4636,6 +4640,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_invalidate(); } } else + // no space for resolving forward branch, handle it as external dbg(1, "warning: too many unresolved branches"); } @@ -4657,6 +4662,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) EMITH_JMP_START(emith_invert_cond(cond)); if (bl) { bl->jump = tcache_ptr; + emith_flush(); // flush to inhibit insn swapping bl->type = BL_LDJMP; } tmp = rcache_get_tmp_arg(0); @@ -5534,7 +5540,7 @@ int sh2_drc_init(SH2 *sh2) i = tcache_ptr - tcache; RING_INIT(&tcache_ring[0], tcache_ptr, tcache_sizes[0] - i); for (i = 1; i < ARRAY_SIZE(tcache_ring); i++) { - RING_INIT(&tcache_ring[i], tcache_ring[i-1].base + tcache_sizes[i-1], + RING_INIT(&tcache_ring[i], tcache_ring[i-1].base + tcache_ring[i-1].size, tcache_sizes[i]); } From f042878251c7ea6743942e2a29f91007db96a4f8 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 27 Nov 2019 23:05:27 +0100 Subject: [PATCH 084/174] release 1.95 --- platform/common/menu_pico.c | 1 + platform/common/version.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/platform/common/menu_pico.c b/platform/common/menu_pico.c index dc7ceda44..327190a55 100644 --- a/platform/common/menu_pico.c +++ b/platform/common/menu_pico.c @@ -938,6 +938,7 @@ static const char credits[] = "MAME devs: SH2, YM2612 and SN76496 cores\n" "Eke, Stef: some Sega CD code\n" "Inder, ketchupgun: graphics\n" + "Irixxxx: SH2 drc improvements\n" #ifdef __GP2X__ "Squidge: mmuhack\n" "Dzz: ARM940 sample\n" diff --git a/platform/common/version.h b/platform/common/version.h index 8b3adbf85..a8c3034b0 100644 --- a/platform/common/version.h +++ b/platform/common/version.h @@ -1 +1 @@ -#define VERSION "1.93+" +#define VERSION "1.95" From 5e1f7e7e8bee71477b085fdb7e854c8b157f3981 Mon Sep 17 00:00:00 
2001 From: kub Date: Mon, 2 Dec 2019 22:31:14 +0100 Subject: [PATCH 085/174] sh2 drc, tentative MIPS32/64 Release 2 support --- cpu/drc/emit_arm64.c | 23 +++--- cpu/drc/emit_mips.c | 144 ++++++++++++++++++++++++++++---------- cpu/drc/emit_riscv.c | 4 +- platform/common/dismips.c | 75 +++++++++++++++++--- 4 files changed, 186 insertions(+), 60 deletions(-) diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 7a8327479..c827fe2ca 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -221,10 +221,15 @@ enum { XT_UXTW=0x4, XT_UXTX=0x6, XT_LSL=0x7, XT_SXTW=0xc, XT_SXTX=0xe }; #define A64_ROR_IMM(rd, rn, bits) /* EXTR */ \ A64_INSN(0x9,0x0,0x6,_,rn,_,bits,rn,rd) -#define A64_SXT_IMM(rd, rn, bits) \ - A64_INSN(0x9,0x0,0x4,0,0,_,bits-1,rn,rd) -#define A64_UXT_IMM(rd, rn, bits) \ - A64_INSN(0x9,0x2,0x4,0,0,_,bits-1,rn,rd) +#define A64_SXT_IMM(rd, rn, bits) /* SBFM */ \ + A64_INSN(0x9,0x0,0x4,_,0,_,bits-1,rn,rd) +#define A64_UXT_IMM(rd, rn, bits) /* UBFM */ \ + A64_INSN(0x9,0x2,0x4,_,0,_,bits-1,rn,rd) + +#define A64_BFX_IMM(rd, rn, lsb, bits) /* UBFM */ \ + A64_INSN(0x9,0x2,0x4,_,lsb,_,bits-1,rn,rd) +#define A64_BFI_IMM(rd, rn, lsb, bits) /* BFM */ \ + A64_INSN(0x9,0x1,0x4,_,(32-lsb)&31,_,bits-1,rn,rd) // multiplication @@ -1302,8 +1307,7 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) EMITH_SJMP_START(DCOND_EQ); \ /* overflow if top 33 bits of MACH:MACL aren't all 1 or 0 */ \ /* to check: add MACL[31] to MACH. this is 0 if no overflow */ \ - emith_lsr(rn, ml, 31); \ - emith_addf_r_r(rn, mh); /* sum = MACH + ((MACL>>31)&1) */ \ + emith_addf_r_r_r_lsr(rn, mh, ml, 31); /* sum = MACH + (MACL>>31) */ \ EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> overflow */ \ /* XXX: LSB signalling only in SH1, or in SH2 too? 
*/ \ emith_move_r_imm_c(DCOND_NE, mh, 0x00000001); /* LSB of MACH */ \ @@ -1315,11 +1319,8 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) EMITH_SJMP_END(DCOND_EQ); \ } while (0) -#define emith_write_sr(sr, srcr) do { \ - emith_lsr(sr, sr, 10); \ - emith_or_r_r_r_lsl(sr, sr, srcr, 22); \ - emith_ror(sr, sr, 22); \ -} while (0) +#define emith_write_sr(sr, srcr) \ + EMIT(A64_BFI_IMM(sr, srcr, 0, 10)) #define emith_carry_to_t(srr, is_sub) do { \ emith_lsr(sr, sr, 1); \ diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index 062737f62..753c31229 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -1,5 +1,5 @@ /* - * Basic macros to emit MIPS II/MIPS32 Release 1 instructions and some utils + * Basic macros to emit MIPS32/MIPS64 Release 1 or 2 instructions and some utils * Copyright (C) 2019 kub * * This work is licensed under the terms of MAME license. @@ -65,9 +65,10 @@ // opcode field (encoded in op) enum { OP__FN=000, OP__RT, OP_J, OP_JAL, OP_BEQ, OP_BNE, OP_BLEZ, OP_BGTZ }; enum { OP_ADDI=010, OP_ADDIU, OP_SLTI, OP_SLTIU, OP_ANDI, OP_ORI, OP_XORI, OP_LUI }; +enum { OP_DADDI=030, OP_DADDIU, OP_LDL, OP_LDR, OP__FN2=034, OP__FN3=037 }; enum { OP_LB=040, OP_LH, OP_LWL, OP_LW, OP_LBU, OP_LHU, OP_LWR, OP_LWU }; enum { OP_SB=050, OP_SH, OP_SWL, OP_SW, OP_SDL, OP_SDR, OP_SWR }; -enum { OP_DADDI=030, OP_DADDIU, OP_LDL, OP_LDR, OP_SD=067, OP_LD=077 }; +enum { OP_SD=067, OP_LD=077 }; // function field (encoded in fn if opcode = OP__FN) enum { FN_SLL=000, __(01), FN_SRL, FN_SRA, FN_SLLV, __(05), FN_SRLV, FN_SRAV }; enum { FN_JR=010, FN_JALR, FN_MOVZ, FN_MOVN, FN_SYNC=017 }; @@ -76,39 +77,54 @@ enum { FN_MULT=030, FN_MULTU, FN_DIV, FN_DIVU, FN_DMULT, FN_DMULTU, FN_DDIV, FN_ enum { FN_ADD=040, FN_ADDU, FN_SUB, FN_SUBU, FN_AND, FN_OR, FN_XOR, FN_NOR }; enum { FN_SLT=052, FN_SLTU, FN_DADD, FN_DADDU, FN_DSUB, FN_DSUBU }; enum { FN_DSLL=070, __(71), FN_DSRL, FN_DSRA, FN_DSLL32, __(75), FN_DSRL32, FN_DSRA32 }; +// function field (encoded 
in fn if opcode = OP__FN2) +enum { FN2_MADD=000, FN2_MADDU, FN2_MUL, __(03), FN2_MSUB, FN2_MSUBU }; +enum { FN2_CLZ=040, FN2_CLO, FN2_DCLZ=044, FN2_DCLO }; +// function field (encoded in fn if opcode = OP__FN3) +enum { FN3_EXT=000, FN3_DEXTM, FN3_DEXTU, FN3_DEXT, FN3_INS, FN3_DINSM, FN3_DINSU, FN3_DINS }; +enum { FN3_BSHFL=040, FN3_DBSHFL=044 }; // rt field (encoded in rt if opcode = OP__RT) enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; +// bit shuffle function (encoded in sa if function = FN3_BSHFL) +enum { BS_SBH=002, BS_SHD=005, BS_SEB=020, BS_SEH=030 }; +// r (rotate) bit function (encoded in rs/sa if function = FN_SRL/FN_SRLV) +enum { RB_SRL=0, RB_ROTR=1 }; + #define MIPS_NOP 000 // null operation: SLL r0, r0, #0 // arithmetic/logical -#define MIPS_OP_REG(op, rd, rs, rt) \ - MIPS_INSN(OP__FN, rs, rt, rd, _, op) // R-type, SPECIAL +#define MIPS_OP_REG(op, sa, rd, rs, rt) \ + MIPS_INSN(OP__FN, rs, rt, rd, sa, op) // R-type, SPECIAL +#define MIPS_OP2_REG(op, sa, rd, rs, rt) \ + MIPS_INSN(OP__FN2, rs, rt, rd, sa, op) // R-type, SPECIAL2 +#define MIPS_OP3_REG(op, sa, rd, rs, rt) \ + MIPS_INSN(OP__FN3, rs, rt, rd, sa, op) // R-type, SPECIAL3 #define MIPS_OP_IMM(op, rt, rs, imm) \ MIPS_INSN(op, rs, rt, _, _, (u16)(imm)) // I-type // rd = rs OP rt #define MIPS_ADD_REG(rd, rs, rt) \ - MIPS_OP_REG(FN_ADDU, rd, rs, rt) + MIPS_OP_REG(FN_ADDU,_, rd, rs, rt) #define MIPS_DADD_REG(rd, rs, rt) \ - MIPS_OP_REG(FN_DADDU, rd, rs, rt) + MIPS_OP_REG(FN_DADDU,_, rd, rs, rt) #define MIPS_SUB_REG(rd, rs, rt) \ - MIPS_OP_REG(FN_SUBU, rd, rs, rt) + MIPS_OP_REG(FN_SUBU,_, rd, rs, rt) #define MIPS_DSUB_REG(rd, rs, rt) \ - MIPS_OP_REG(FN_DSUBU, rd, rs, rt) + MIPS_OP_REG(FN_DSUBU,_, rd, rs, rt) #define MIPS_NEG_REG(rd, rt) \ MIPS_SUB_REG(rd, Z0, rt) #define MIPS_XOR_REG(rd, rs, rt) \ - MIPS_OP_REG(FN_XOR, rd, rs, rt) + MIPS_OP_REG(FN_XOR,_, rd, rs, rt) #define MIPS_OR_REG(rd, rs, rt) \ - MIPS_OP_REG(FN_OR, rd, rs, rt) + MIPS_OP_REG(FN_OR,_, rd, rs, rt) #define 
MIPS_AND_REG(rd, rs, rt) \ - MIPS_OP_REG(FN_AND, rd, rs, rt) + MIPS_OP_REG(FN_AND,_, rd, rs, rt) #define MIPS_NOR_REG(rd, rs, rt) \ - MIPS_OP_REG(FN_NOR, rd, rs, rt) + MIPS_OP_REG(FN_NOR,_, rd, rs, rt) #define MIPS_MOVE_REG(rd, rs) \ MIPS_OR_REG(rd, rs, Z0) @@ -117,17 +133,29 @@ enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; // rd = rt SHIFT rs #define MIPS_LSL_REG(rd, rt, rs) \ - MIPS_OP_REG(FN_SLLV, rd, rs, rt) + MIPS_OP_REG(FN_SLLV,_, rd, rs, rt) #define MIPS_LSR_REG(rd, rt, rs) \ - MIPS_OP_REG(FN_SRLV, rd, rs, rt) + MIPS_OP_REG(FN_SRLV,RB_SRL, rd, rs, rt) #define MIPS_ASR_REG(rd, rt, rs) \ - MIPS_OP_REG(FN_SRAV, rd, rs, rt) + MIPS_OP_REG(FN_SRAV,_, rd, rs, rt) +#define MIPS_ROR_REG(rd, rt, rs) \ + MIPS_OP_REG(FN_SRLV,RB_ROTR, rd, rs, rt) + +#define MIPS_SEB_REG(rd, rt) \ + MIPS_OP3_REG(FN3_BSHFL, BS_SEB, rd, _, rt) +#define MIPS_SEH_REG(rd, rt) \ + MIPS_OP3_REG(FN3_BSHFL, BS_SEH, rd, _, rt) + +#define MIPS_EXT_IMM(rt, rs, lsb, sz) \ + MIPS_OP3_REG(FN3_EXT, lsb, (sz)-1, rs, rt) +#define MIPS_INS_IMM(rt, rs, lsb, sz) \ + MIPS_OP3_REG(FN3_INS, lsb, (lsb)+(sz)-1, rs, rt) // rd = (rs < rt) #define MIPS_SLT_REG(rd, rs, rt) \ - MIPS_OP_REG(FN_SLT, rd, rs, rt) + MIPS_OP_REG(FN_SLT,_, rd, rs, rt) #define MIPS_SLTU_REG(rd, rs, rt) \ - MIPS_OP_REG(FN_SLTU, rd, rs, rt) + MIPS_OP_REG(FN_SLTU,_, rd, rs, rt) // rt = rs OP imm16 #define MIPS_ADD_IMM(rt, rs, imm16) \ @@ -152,9 +180,11 @@ enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; #define MIPS_LSL_IMM(rd, rt, bits) \ MIPS_INSN(OP__FN, _, rt, rd, bits, FN_SLL) #define MIPS_LSR_IMM(rd, rt, bits) \ - MIPS_INSN(OP__FN, _, rt, rd, bits, FN_SRL) + MIPS_INSN(OP__FN, RB_SRL, rt, rd, bits, FN_SRL) #define MIPS_ASR_IMM(rd, rt, bits) \ MIPS_INSN(OP__FN, _, rt, rd, bits, FN_SRA) +#define MIPS_ROR_IMM(rd, rt, bits) \ + MIPS_INSN(OP__FN, RB_ROTR, rt, rd, bits, FN_SRL) #define MIPS_DLSL_IMM(rd, rt, bits) \ MIPS_INSN(OP__FN, _, rt, rd, bits, FN_DSLL) @@ -170,13 +200,17 @@ enum { RT_BLTZ=000, 
RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; // multiplication #define MIPS_MULT(rt, rs) \ - MIPS_OP_REG(FN_MULT, _, rs, rt) + MIPS_OP_REG(FN_MULT,_, _, rs, rt) #define MIPS_MULTU(rt, rs) \ - MIPS_OP_REG(FN_MULTU, _, rs, rt) + MIPS_OP_REG(FN_MULTU,_, _, rs, rt) +#define MIPS_MADD(rt, rs) \ + MIPS_OP2_REG(FN_MADD,_, _, rs, rt) +#define MIPS_MADDU(rt, rs) \ + MIPS_OP2_REG(FN_MADDU,_, _, rs, rt) #define MIPS_MFLO(rd) \ - MIPS_OP_REG(FN_MFLO, rd, _, _) + MIPS_OP_REG(FN_MFLO,_, rd, _, _) #define MIPS_MFHI(rd) \ - MIPS_OP_REG(FN_MFHI, rd, _, _) + MIPS_OP_REG(FN_MFHI,_, rd, _, _) // branching @@ -185,9 +219,9 @@ enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; #define MIPS_JAL(abs26) \ MIPS_INSN(OP_JAL, _,_,_,_, (abs26) >> 2) #define MIPS_JR(rs) \ - MIPS_OP_REG(FN_JR,_,rs,_) + MIPS_OP_REG(FN_JR,_, _,rs,_) #define MIPS_JALR(rd, rs) \ - MIPS_OP_REG(FN_JALR,rd,rs,_) + MIPS_OP_REG(FN_JALR,_, rd,rs,_) // conditional branches; no condition code, these compare rs against rt or Z0 #define MIPS_BEQ (OP_BEQ << 5) // rs == rt (rt in lower 5 bits) @@ -234,7 +268,7 @@ enum { RT_BLTZ=000, RT_BGEZ, RT_BLTZAL=020, RT_BGEZAL, RT_SYNCI=037 }; // pointer operations -#if __mips == 4 || __mips == 64 +#if _MIPS_SZPTR == 64 #define OP_LP OP_LD #define OP_SP OP_SD #define OP_PADDIU OP_DADDIU @@ -524,8 +558,8 @@ static void emith_set_compare_flags(int rs, int rt, s32 imm) #define emith_add_r_r_r_lsl_ptr(d, s1, s2, simm) do { \ if (simm) { \ EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ - EMIT(MIPS_OP_REG(FN_PADDU, d, s1, AT)); \ - } else EMIT(MIPS_OP_REG(FN_PADDU, d, s1, s2)); \ + EMIT(MIPS_OP_REG(FN_PADDU,_, d, s1, AT)); \ + } else EMIT(MIPS_OP_REG(FN_PADDU,_, d, s1, s2)); \ } while (0) #define emith_add_r_r_r_lsl(d, s1, s2, simm) do { \ if (simm) { \ @@ -544,10 +578,10 @@ static void emith_set_compare_flags(int rs, int rt, s32 imm) #define emith_addf_r_r_r_lsl_ptr(d, s1, s2, simm) do { \ if (simm) { \ EMIT(MIPS_LSL_IMM(AT, s2, simm)); \ - EMIT(MIPS_OP_REG(FN_PADDU, FNZ, s1, 
AT)); \ + EMIT(MIPS_OP_REG(FN_PADDU,_, FNZ, s1, AT)); \ emith_set_arith_flags(d, s1, AT, 0, 0); \ } else { \ - EMIT(MIPS_OP_REG(FN_PADDU, FNZ, s1, s2)); \ + EMIT(MIPS_OP_REG(FN_PADDU,_, FNZ, s1, s2)); \ emith_set_arith_flags(d, s1, s2, 0, 0); \ } \ } while (0) @@ -752,7 +786,7 @@ static void emith_set_compare_flags(int rs, int rt, s32 imm) // move immediate static void emith_move_imm(int r, uintptr_t imm) { -#if __mips == 4 || __mips == 64 +#if _MIPS_SZPTR == 64 if ((s32)imm != imm) { emith_move_imm(r, imm >> 32); if (imm & 0xffff0000) { @@ -803,10 +837,10 @@ static void emith_add_imm(int ptr, int rd, int rs, u32 imm) EMIT(MIPS_OP_IMM(ptr ? OP_PADDIU:OP_ADDIU, rd,rs,imm)); } else if ((s32)imm < 0) { emith_move_r_imm(AT, -imm); - EMIT(MIPS_OP_REG((ptr ? FN_PSUBU:FN_SUBU), rd,rs,AT)); + EMIT(MIPS_OP_REG((ptr ? FN_PSUBU:FN_SUBU),_, rd,rs,AT)); } else { emith_move_r_imm(AT, imm); - EMIT(MIPS_OP_REG((ptr ? FN_PADDU:FN_ADDU), rd,rs,AT)); + EMIT(MIPS_OP_REG((ptr ? FN_PADDU:FN_ADDU),_, rd,rs,AT)); } } @@ -881,7 +915,7 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) { if (imm >> 16) { emith_move_r_imm(AT, imm); - EMIT(MIPS_OP_REG(FN_AND + (op-OP_ANDI), rd, rs, AT)); + EMIT(MIPS_OP_REG(FN_AND + (op-OP_ANDI),_, rd, rs, AT)); } else if (op == OP_ANDI || imm || rd != rs) EMIT(MIPS_OP_IMM(op, rd, rs, imm)); } @@ -936,20 +970,31 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) #define emith_asr(d, s, cnt) \ EMIT(MIPS_ASR_IMM(d, s, cnt)) -// NB: mips32r2 has ROT (SLR with R bit set) +#if defined(__mips_isa_rev) && __mips_isa_rev >= 2 +#define emith_ror(d, s, cnt) do { \ + EMIT(MIPS_ROR_IMM(d, s, cnt)); \ +} while (0) +#else #define emith_ror(d, s, cnt) do { \ EMIT(MIPS_LSL_IMM(AT, s, 32-(cnt))); \ EMIT(MIPS_LSR_IMM(d, s, cnt)); \ EMIT(MIPS_OR_REG(d, d, AT)); \ } while (0) +#endif #define emith_ror_c(cond, d, s, cnt) \ emith_ror(d, s, cnt) +#if defined(__mips_isa_rev) && __mips_isa_rev >= 2 +#define emith_rol(d, s, cnt) do { \ + EMIT(MIPS_ROR_IMM(d, s, 
32-(cnt))); \ +} while (0) +#else #define emith_rol(d, s, cnt) do { \ EMIT(MIPS_LSR_IMM(AT, s, 32-(cnt))); \ EMIT(MIPS_LSL_IMM(d, s, cnt)); \ EMIT(MIPS_OR_REG(d, d, AT)); \ } while (0) +#endif #define emith_rorc(d) do { \ emith_lsr(d, d, 1); \ @@ -963,7 +1008,6 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) } while (0) // NB: all flag setting shifts make V undefined -// NB: mips32r2 has EXT (useful for extracting C) #define emith_lslf(d, s, cnt) do { \ int _s = s; \ if ((cnt) > 1) { \ @@ -1040,7 +1084,10 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) } while (0) // signed/unsigned extend -// NB: mips32r2 has EXT and INS +#if defined(__mips_isa_rev) && __mips_isa_rev >= 2 +#define emith_clear_msb(d, s, count) /* bits to clear */ \ + EMIT(MIPS_EXT_IMM(d, s, 0, 32-(count))) +#else #define emith_clear_msb(d, s, count) /* bits to clear */ do { \ u32 t; \ if ((count) >= 16) { \ @@ -1052,14 +1099,27 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) emith_lsr(d, d, count); \ } \ } while (0) +#endif #define emith_clear_msb_c(cond, d, s, count) \ emith_clear_msb(d, s, count) -// NB: mips32r2 has SE[BH]H +#if defined(__mips_isa_rev) && __mips_isa_rev >= 2 +#define emith_sext(d, s, count) /* bits to keep */ do { \ + if (count == 8) \ + EMIT(MIPS_SEB_REG(d, s)); \ + else if (count == 16) \ + EMIT(MIPS_SEH_REG(d, s)); \ + else { \ + emith_lsl(d, s, 32-(count)); \ + emith_asr(d, d, 32-(count)); \ + } \ +} while (0) +#else #define emith_sext(d, s, count) /* bits to keep */ do { \ emith_lsl(d, s, 32-(count)); \ emith_asr(d, d, 32-(count)); \ } while (0) +#endif // multiply Rd = Rn*Rm (+ Ra); NB: next 2 insns after MFLO/MFHI mustn't be MULT static u8 *last_lohi; @@ -1658,16 +1718,26 @@ static int emith_cond_check(int cond, int *r) EMITH_SJMP_END(DCOND_EQ); \ } while (0) +#if defined(__mips_isa_rev) && __mips_isa_rev >= 2 +#define emith_write_sr(sr, srcr) \ + EMIT(MIPS_INS_IMM(sr, srcr, 0, 10)) +#else #define emith_write_sr(sr, srcr) do { \ 
emith_lsr(sr, sr , 10); emith_lsl(sr, sr, 10); \ emith_lsl(AT, srcr, 22); emith_lsr(AT, AT, 22); \ emith_or_r_r(sr, AT); \ } while (0) +#endif +#if defined(__mips_isa_rev) && __mips_isa_rev >= 2 +#define emith_carry_to_t(sr, is_sub) \ + EMIT(MIPS_INS_IMM(sr, FC, 0, 1)) +#else #define emith_carry_to_t(sr, is_sub) do { \ emith_and_r_imm(sr, 0xfffffffe); \ emith_or_r_r(sr, FC); \ } while (0) +#endif #define emith_t_to_carry(sr, is_sub) do { \ emith_and_r_r_imm(FC, sr, 1); \ diff --git a/cpu/drc/emit_riscv.c b/cpu/drc/emit_riscv.c index 0f614f18d..ed45e01ca 100644 --- a/cpu/drc/emit_riscv.c +++ b/cpu/drc/emit_riscv.c @@ -1499,10 +1499,10 @@ static int emith_cond_check(int cond, int *r, int *s) emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ EMITH_JMP3_START(DCOND_EQ); \ emith_add_r_r_r(rn, t_, rm); \ - EMIT(R5_SLTU_REG(FC, rn, t_)); \ + EMIT(R5_SLTU_REG(FC, rn, t_)); \ EMITH_JMP3_MID(DCOND_EQ); \ emith_sub_r_r_r(rn, t_, rm); \ - EMIT(R5_SLTU_REG(FC, t_, rn)); \ + EMIT(R5_SLTU_REG(FC, t_, rn)); \ EMITH_JMP3_END(); \ emith_eor_r_r(sr, FC); /* T ^= carry */ \ rcache_free_tmp(t_); \ diff --git a/platform/common/dismips.c b/platform/common/dismips.c index f9888f2a5..61c70bfe2 100644 --- a/platform/common/dismips.c +++ b/platform/common/dismips.c @@ -56,13 +56,15 @@ static char *const register_names[32] = { enum insn_type { REG_DTS, REG_TS, // 3, 2, or 1 regs - REG_DS, REG_D, REG_S, + REG_DS, REG_DT, REG_D, REG_S, S_IMM_DT, // 2 regs with shift amount + F_IMM_TS, // 2 regs with bitfield spec B_IMM_S, B_IMM_TS, // pc-relative branches with 1 or 2 regs J_IMM, // region-relative jump - A_IMM_TS, // arithmetic immediate with 1 or 2 regs - L_IMM_T, L_IMM_TS, // logical immediate with 2 regs + A_IMM_TS, // arithmetic immediate with 2 regs + L_IMM_T, L_IMM_TS, // logical immediate with 1 or 2 regs M_IMM_TS, // memory indexed with 2 regs + SR_BIT = 0x80 // shift right with R-bit }; struct insn { @@ -77,10 +79,10 @@ struct insn { #define OP_SPECIAL 0x00 static const struct insn 
special_insns[] = { {0x00, S_IMM_DT, "sll"}, - {0x02, S_IMM_DT, "srl"}, + {0x02, S_IMM_DT|SR_BIT, "srl\0rotr"}, {0x03, S_IMM_DT, "sra"}, {0x04, REG_DTS, "sllv"}, - {0x06, REG_DTS, "srlv"}, + {0x06, REG_DTS|SR_BIT, "srlv\0rotrv"}, {0x07, REG_DTS, "srav"}, {0x08, REG_S, "jr"}, {0x09, REG_DS, "jalr"}, @@ -94,7 +96,7 @@ static const struct insn special_insns[] = { {0x12, REG_D, "mflo"}, {0x13, REG_S, "mtlo"}, {0x14, REG_DTS, "dsllv"}, - {0x16, REG_DTS, "dslrv"}, + {0x16, REG_DTS|SR_BIT, "dsrlv\0drotrv"}, {0x17, REG_DTS, "dsrav"}, {0x18, REG_TS, "mult"}, {0x19, REG_TS, "multu"}, @@ -125,10 +127,10 @@ static const struct insn special_insns[] = { // {0x34, REG_TS, "teq" }, // {0x36, REG_TS, "tne" }, {0x38, S_IMM_DT, "dsll"}, - {0x3A, S_IMM_DT, "dsrl"}, + {0x3A, S_IMM_DT|SR_BIT, "dsrl\0drotrv"}, {0x3B, S_IMM_DT, "dsra"}, {0x3C, S_IMM_DT, "dsll32"}, - {0x3E, S_IMM_DT, "dsrl32"}, + {0x3E, S_IMM_DT|SR_BIT, "dsrl32\0drotr32"}, {0x3F, S_IMM_DT, "dsra32"}, }; @@ -146,6 +148,32 @@ static const struct insn special2_insns[] = { {0x25, REG_DS, "dclo" }, }; +// instructions with opcode SPECIAL3 (R-type) +#define OP_SPECIAL3 0x1F +static const struct insn special3_insns[] = { + {0x00, F_IMM_TS, "ext" }, + {0x01, F_IMM_TS, "dextm" }, + {0x02, F_IMM_TS, "dextu" }, + {0x03, F_IMM_TS, "dext" }, + {0x04, F_IMM_TS, "ins" }, + {0x05, F_IMM_TS, "dinsm" }, + {0x06, F_IMM_TS, "dinsu" }, + {0x07, F_IMM_TS, "dins" }, +}; + +// instruction with opcode SPECIAL3 and function *BSHFL +#define FN_BSHFL 0x20 +static const struct insn bshfl_insns[] = { + {0x02, REG_DT, "wsbh" }, + {0x10, REG_DT, "seb" }, + {0x18, REG_DT, "seh" }, +}; +#define FN_DBSHFL 0x24 +static const struct insn dbshfl_insns[] = { + {0x02, REG_DT, "dsbh" }, + {0x05, REG_DT, "dshd" }, +}; + // instructions with opcode REGIMM (I-type) #define OP_REGIMM 0x01 static const struct insn regimm_insns[] = { @@ -240,6 +268,20 @@ static const struct insn *decode_insn(uint32_t insn) op = insn & 0x3f; pi = special2_insns; r = 
ARRAY_SIZE(special2_insns)-1; + } else if (op == OP_SPECIAL3) { + op = insn & 0x3f; + if (op == FN_BSHFL) { + op = (insn >> 6) & 0x1f; + pi = bshfl_insns; + r = ARRAY_SIZE(bshfl_insns)-1; + } else if (op == FN_DBSHFL) { + op = (insn >> 6) & 0x1f; + pi = dbshfl_insns; + r = ARRAY_SIZE(dbshfl_insns)-1; + } else { + pi = special3_insns; + r = ARRAY_SIZE(special3_insns)-1; + } } else if (op == OP_REGIMM) { op = (insn>>16) & 0x1f; pi = regimm_insns; @@ -280,7 +322,7 @@ int dismips(uintptr_t pc, uint32_t insn, char *buf, size_t buflen, uintptr_t *sy char *rs = register_names[(insn >> 21) & 0x1f]; char *rt = register_names[(insn >> 16) & 0x1f]; char *rd = register_names[(insn >> 11) & 0x1f]; - int sa = (insn >> 6) & 0x1f; + int sa = (insn >> 6) & 0x1f, sb = (insn >> 11) & 0x1f; int imm = (int16_t) insn; *sym = 0; @@ -289,10 +331,12 @@ int dismips(uintptr_t pc, uint32_t insn, char *buf, size_t buflen, uintptr_t *sy return 0; } - switch (pi->type) { + switch (pi->type & ~SR_BIT) { case REG_DTS: if ((insn & 0x3f) == 0x25 /*OR*/ && (insn & 0x1f0000) == 0 /*zero*/) snprintf(buf, buflen, "move %s, %s", rd, rs); + else if ((pi->type & SR_BIT) && (insn & (1<<6))) + snprintf(buf, buflen, "%s %s, %s, %s", pi->name+strlen(pi->name)+1, rd, rs, rt); else snprintf(buf, buflen, "%s %s, %s, %s", pi->name, rd, rs, rt); break; @@ -302,6 +346,9 @@ int dismips(uintptr_t pc, uint32_t insn, char *buf, size_t buflen, uintptr_t *sy case REG_DS: snprintf(buf, buflen, "%s %s, %s", pi->name, rd, rs); break; + case REG_DT: + snprintf(buf, buflen, "%s %s, %s", pi->name, rd, rt); + break; case REG_D: snprintf(buf, buflen, "%s %s", pi->name, rd); break; @@ -311,9 +358,17 @@ int dismips(uintptr_t pc, uint32_t insn, char *buf, size_t buflen, uintptr_t *sy case S_IMM_DT: if (insn == 0x00000000) snprintf(buf, buflen, "nop"); + else if ((pi->type & SR_BIT) && (insn & (1<<21))) + snprintf(buf, buflen, "%s %s, %s, %d", pi->name+strlen(pi->name)+1, rd, rt, sa); else snprintf(buf, buflen, "%s %s, %s, %d", 
pi->name, rd, rt, sa); break; + case F_IMM_TS: + if (insn & 0x01) sb+=32; + if (insn & 0x02) sa+=32; + if (insn & 0x04) sb-=sa; + snprintf(buf, buflen, "%s %s, %s, %d, %d", pi->name, rt, rs, sa, sb+1); + break; case B_IMM_S: *sym = b_target(pc, insn); snprintf(buf, buflen, "%s %s, 0x%lx", pi->name, rs, *sym); From 753eae054edabf849caba72238f81b6ccd9391ba Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 3 Dec 2019 23:52:13 +0100 Subject: [PATCH 086/174] remove textrels with -fPIC/-fPIE (for android/ios) --- cpu/DrZ80/{drz80.s => drz80.S} | 293 +++++++++++----------- cpu/drc/emit_arm64.c | 4 +- cpu/drc/emit_mips.c | 10 +- pico/32x/draw_arm.S | 18 +- pico/arm_features.h | 32 +++ pico/cd/memory_arm.S | 110 ++++---- pico/memory_arm.S | 27 +- pico/sound/{ym2612_arm.s => ym2612_arm.S} | 59 ++--- platform/common/common.mak | 4 +- platform/common/dismips.c | 2 +- 10 files changed, 298 insertions(+), 261 deletions(-) rename cpu/DrZ80/{drz80.s => drz80.S} (90%) rename pico/sound/{ym2612_arm.s => ym2612_arm.S} (95%) diff --git a/cpu/DrZ80/drz80.s b/cpu/DrZ80/drz80.S similarity index 90% rename from cpu/DrZ80/drz80.s rename to cpu/DrZ80/drz80.S index c2a64df3f..4d592b169 100644 --- a/cpu/DrZ80/drz80.s +++ b/cpu/DrZ80/drz80.S @@ -5,6 +5,8 @@ ;@ For commercial use, separate licencing terms must be obtained. 
+#include "../../pico/arm_features.h" + .data .align 4 @@ -102,6 +104,7 @@ DrZ80Ver: .long 0x0001 ;@--------------------------------------- .text + PIC_LDR_INIT() .if DRZ80_XMAP @@ -1370,7 +1373,7 @@ DrZ80Run: blne DoInterrupt .endif - ldr opcodes,MAIN_opcodes_POINTER2 + PIC_LDR(opcodes, r0, MAIN_opcodes) cmp z80_icount,#0 ;@ irq might have used all cycles ldrplb r0,[z80pc],#1 @@ -1382,11 +1385,7 @@ z80_execute_end: stmia cpucontext,{z80pc-z80sp} ;@ save Z80 registers mov r0,z80_icount ldmia sp!,{r4-r12,pc} ;@ restore registers from stack and return to C code - -MAIN_opcodes_POINTER2: .word MAIN_opcodes -.if INTERRUPT_MODE -Interrupt_local: .word Interrupt -.endif +.pool DoInterrupt: .if INTERRUPT_MODE @@ -1395,8 +1394,9 @@ DoInterrupt: ;@ save everything back into DrZ80 context stmia cpucontext,{z80pc-z80sp} ;@ save Z80 registers stmfd sp!,{r3,r4,r5,lr} ;@ save rest of regs on stack + PIC_LDR(r2, r3, Interrupt) mov lr,pc - ldr pc,Interrupt_local + bx r2 ldmfd sp!,{r3,r4,r5,lr} ;@ load regs from stack ;@ reload regs from DrZ80 context ldmia cpucontext,{z80pc-z80sp} ;@ load Z80 registers @@ -4469,7 +4469,6 @@ opcode_2_6: and z80hl,z80hl,#0xFF<<16 orr z80hl,z80hl,r1, lsl #24 fetch 7 -DAATABLE_LOCAL: .word DAATable ;@DAA opcode_2_7: mov r1,z80a, lsr #24 @@ -4479,13 +4478,14 @@ opcode_2_7: orrne r1,r1,#512 tst z80f,#1<= 2 -#define emith_ror(d, s, cnt) do { \ - EMIT(MIPS_ROR_IMM(d, s, cnt)); \ -} while (0) +#define emith_ror(d, s, cnt) \ + EMIT(MIPS_ROR_IMM(d, s, cnt)) #else #define emith_ror(d, s, cnt) do { \ EMIT(MIPS_LSL_IMM(AT, s, 32-(cnt))); \ @@ -985,9 +984,8 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) emith_ror(d, s, cnt) #if defined(__mips_isa_rev) && __mips_isa_rev >= 2 -#define emith_rol(d, s, cnt) do { \ - EMIT(MIPS_ROR_IMM(d, s, 32-(cnt))); \ -} while (0) +#define emith_rol(d, s, cnt) \ + EMIT(MIPS_ROR_IMM(d, s, 32-(cnt))) #else #define emith_rol(d, s, cnt) do { \ EMIT(MIPS_LSR_IMM(AT, s, 32-(cnt))); \ diff --git a/pico/32x/draw_arm.S 
b/pico/32x/draw_arm.S index f351d8e00..ad5d428b1 100644 --- a/pico/32x/draw_arm.S +++ b/pico/32x/draw_arm.S @@ -7,6 +7,7 @@ @* See COPYING file in the top-level directory. @* +#include "pico/arm_features.h" #include "pico/pico_int_offs.h" .extern Pico32x @@ -17,11 +18,12 @@ .text .align 2 + PIC_LDR_INIT() .macro call_scan_prep cond est @ &Pico.est .if \cond - ldr r4, =PicoScan32xBegin - ldr r5, =PicoScan32xEnd + PIC_LDR(r4, r6, PicoScan32xBegin) + PIC_LDR(r5, r6, PicoScan32xEnd) ldr r6, [\est, #OFS_EST_DrawLineDest] ldr r4, [r4] ldr r5, [r5] @@ -66,8 +68,8 @@ \name: stmfd sp!, {r4-r11,lr} - ldr lr,=Pico - ldr r10,=Pico32x + PIC_LDR(lr, r9, Pico) + PIC_LDR(r10,r9, Pico32x) ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB] ldrh r10,[r10, #0x40] @ Pico32x.vdp_regs[0] add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd @@ -192,8 +194,8 @@ \name: stmfd sp!, {r4-r11,lr} - ldr lr,=Pico - ldr r10,=Pico32xMem + PIC_LDR(lr, r9, Pico) + PIC_LDR(r10,r9, Pico32xMem) ldr r9,=OFS_PMEM32x_pal_native ldr r10, [r10] ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB] @@ -361,8 +363,8 @@ \name: stmfd sp!, {r4-r11,lr} - ldr lr,=Pico - ldr r10,=Pico32xMem + PIC_LDR(lr, r9, Pico) + PIC_LDR(r10,r9, Pico32xMem) ldr r9,=OFS_PMEM32x_pal_native ldr r10, [r10] ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB] diff --git a/pico/arm_features.h b/pico/arm_features.h index fdec52298..4b456f456 100644 --- a/pico/arm_features.h +++ b/pico/arm_features.h @@ -49,4 +49,36 @@ #endif +// indexed branch (XB) via branch table (BT) +#ifdef __PIC__ +#define PIC_XB(c,r,s) add##c pc, r, s +#define PIC_BT(a) b a +#else +#define PIC_XB(c,r,s) ldr##c pc, [pc, r, s] +#define PIC_BT(a) .word a +#endif + +// load data address (LDR) either via literal pool or via GOT +#ifdef __PIC__ +// can't use pool loads since ldr= only allows symbol or constants, not expr :-( +#define PIC_LDR_INIT() \ + .ifndef PIC_LDR_DEF; PIC_LDR_DEF=1; \ + .macro pic_ldr r t a; \ + ldr \r, [pc, $.LD\@-.-8]; \ + ldr \t, [pc, $.LD\@-.-4]; \ + .LP\@:add \r, pc; \ 
+ ldr \r, [\r, \t]; \ + add pc, $4; \ + .LD\@:.word _GLOBAL_OFFSET_TABLE_-.LP\@-8; \ + .word \a(GOT); \ + .endm; \ + .endif; +#define PIC_LDR(r,t,a) \ + pic_ldr r, t, a +#else +#define PIC_LDR_INIT() +#define PIC_LDR(r,t,a) \ + ldr r, =a +#endif + #endif /* __ARM_FEATURES_H__ */ diff --git a/pico/cd/memory_arm.S b/pico/cd/memory_arm.S index 04920b625..95ad09ff3 100644 --- a/pico/cd/memory_arm.S +++ b/pico/cd/memory_arm.S @@ -6,6 +6,7 @@ @* See COPYING file in the top-level directory. @* +#include "../arm_features.h" #include "../pico_int_offs.h" .equiv PCM_STEP_SHIFT, 11 @@ -65,6 +66,7 @@ .extern PicoWrite16_io .extern m68k_comm_check + PIC_LDR_INIT() @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ @@ -73,16 +75,16 @@ @ r0=addr[in,out], r1,r2=tmp .macro cell_map ands r1, r0, #0x01c000 - ldrne pc, [pc, r1, lsr #12] - beq 0f @ most common? - .long 0f - .long 0f - .long 0f - .long 0f - .long 1f - .long 1f - .long 2f - .long 3f + PIC_XB(ne ,r1, lsr #12) + b 0f @ most common? + PIC_BT(0f) + PIC_BT(0f) + PIC_BT(0f) + PIC_BT(0f) + PIC_BT(1f) + PIC_BT(1f) + PIC_BT(2f) + PIC_BT(3f) 1: @ x16 cells and r1, r0, #0x7e00 @ col and r2, r0, #0x01fc @ row @@ -128,7 +130,7 @@ PicoReadM68k8_cell1: @ 0x220000 - 0x23ffff, cell arranged mov r3, #0x0e0000 0: cell_map - ldr r1, =Pico + PIC_LDR(r1, r2, Pico) add r0, r0, r3 ldr r1, [r1, #OFS_Pico_rom] @ Pico.mcd (used everywhere) eor r0, r0, #1 @@ -141,26 +143,26 @@ PicoRead8_mcd_io: cmp r1, #0x2000 @ a120xx? 
bne PicoRead8_io - ldr r1, =Pico + PIC_LDR(r1, r2, Pico) and r0, r0, #0x3f ldr r1, [r1, #OFS_Pico_rom] @ Pico.mcd cmp r0, #0x0e - ldrlt pc, [pc, r0, lsl #2] + PIC_XB(lt ,r0, lsl #2) b m_m68k_read8_hi - .long m_m68k_read8_r00 - .long m_m68k_read8_r01 - .long m_m68k_read8_r02 - .long m_m68k_read8_r03 - .long m_m68k_read8_r04 - .long m_read_null @ unused bits - .long m_m68k_read8_r06 - .long m_m68k_read8_r07 - .long m_m68k_read8_r08 - .long m_m68k_read8_r09 - .long m_read_null @ reserved - .long m_read_null - .long m_m68k_read8_r0c - .long m_m68k_read8_r0d + PIC_BT(m_m68k_read8_r00) + PIC_BT(m_m68k_read8_r01) + PIC_BT(m_m68k_read8_r02) + PIC_BT(m_m68k_read8_r03) + PIC_BT(m_m68k_read8_r04) + PIC_BT(m_read_null) @ unused bits + PIC_BT(m_m68k_read8_r06) + PIC_BT(m_m68k_read8_r07) + PIC_BT(m_m68k_read8_r08) + PIC_BT(m_m68k_read8_r09) + PIC_BT(m_read_null) @ reserved + PIC_BT(m_read_null) + PIC_BT(m_m68k_read8_r0c) + PIC_BT(m_m68k_read8_r0d) m_m68k_read8_r00: add r1, r1, #0x110000 ldr r0, [r1, #0x30] @@ -238,7 +240,7 @@ PicoReadM68k16_cell1: @ 0x220000 - 0x23ffff, cell arranged mov r3, #0x0e0000 0: cell_map - ldr r1, =Pico + PIC_LDR(r1, r2, Pico) add r0, r0, r3 ldr r1, [r1, #OFS_Pico_rom] @ Pico.mcd bic r0, r0, #1 @@ -252,19 +254,19 @@ PicoRead16_mcd_io: bne PicoRead16_io m_m68k_read16_m68k_regs: - ldr r1, =Pico + PIC_LDR(r1, r2, Pico) and r0, r0, #0x3e ldr r1, [r1, #OFS_Pico_rom] @ Pico.mcd cmp r0, #0x0e - ldrlt pc, [pc, r0, lsl #1] + PIC_XB(lt ,r0, lsl #1) b m_m68k_read16_hi - .long m_m68k_read16_r00 - .long m_m68k_read16_r02 - .long m_m68k_read16_r04 - .long m_m68k_read16_r06 - .long m_m68k_read16_r08 - .long m_read_null @ reserved - .long m_m68k_read16_r0c + PIC_BT(m_m68k_read16_r00) + PIC_BT(m_m68k_read16_r02) + PIC_BT(m_m68k_read16_r04) + PIC_BT(m_m68k_read16_r06) + PIC_BT(m_m68k_read16_r08) + PIC_BT(m_read_null) @ reserved + PIC_BT(m_m68k_read16_r0c) m_m68k_read16_r00: add r1, r1, #0x110000 ldr r0, [r1, #0x30] @@ -329,7 +331,7 @@ PicoWriteM68k8_cell1: @ 0x220000 - 
0x23ffff, cell arranged 0: mov r3, r1 cell_map - ldr r2, =Pico + PIC_LDR(r2, r1, Pico) add r0, r0, r12 ldr r2, [r2, #OFS_Pico_rom] @ Pico.mcd ldr r2, [r2] @@ -357,7 +359,7 @@ PicoWriteM68k16_cell1: @ 0x220000 - 0x23ffff, cell arranged 0: mov r3, r1 cell_map - ldr r1, =Pico + PIC_LDR(r1, r2, Pico) add r0, r0, r12 ldr r1, [r1, #OFS_Pico_rom] @ Pico.mcd bic r0, r0, #1 @@ -399,7 +401,7 @@ PicoReadS68k8_dec0: @ 0x080000 - 0x0bffff PicoReadS68k8_dec1: mov r3, #0x0a0000 @ + ^ / 2 0: - ldr r2, =Pico + PIC_LDR(r2, r1, Pico) eor r0, r0, #2 ldr r2, [r2, #OFS_Pico_rom] @ Pico.mcd movs r0, r0, lsr #1 @ +4-6 <<16 @@ -431,7 +433,7 @@ m_s68k_read8_regs: bx lr m_s68k_read8_comm: - ldr r1, =Pico + PIC_LDR(r1, r2, Pico) ldr r1, [r1, #OFS_Pico_rom] @ Pico.mcd add r1, r1, #0x110000 ldrb r1, [r1, r0] @@ -444,7 +446,7 @@ m_s68k_read8_pcm: bne m_read_null @ must not trash r3 and r12 - ldr r1, =Pico + PIC_LDR(r1, r2, Pico) bic r0, r0, #0xff0000 ldr r1, [r1, #OFS_Pico_rom] @ Pico.mcd mov r2, #0x110000 @@ -479,7 +481,7 @@ PicoReadS68k16_dec0: @ 0x080000 - 0x0bffff PicoReadS68k16_dec1: mov r3, #0x0a0000 @ + ^ / 2 0: - ldr r2, =Pico + PIC_LDR(r2, r1, Pico) eor r0, r0, #2 ldr r2, [r2, #OFS_Pico_rom] @ Pico.mcd mov r0, r0, lsr #1 @ +4-6 <<16 @@ -505,12 +507,11 @@ m_s68k_read16_regs: mov r0, #1 b cdc_host_r - @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ .macro m_s68k_write8_2M_decode - ldr r2, =Pico + PIC_LDR(r2, ip, Pico) eor r0, r0, #2 ldr r2, [r2, #OFS_Pico_rom] @ Pico.mcd movs r0, r0, lsr #1 @ +4-6 <<16 @@ -594,7 +595,7 @@ m_s68k_write8_pcm: bxlt lr m_s68k_write8_pcm_ram: - ldr r3, =Pico + PIC_LDR(r3, r2, Pico) bic r0, r0, #0x00e000 ldr r3, [r3, #OFS_Pico_rom] @ Pico.mcd mov r0, r0, lsr #1 @@ -608,12 +609,11 @@ m_s68k_write8_pcm_ram: strb r1, [r3, r0] bx lr - @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ .macro m_s68k_write16_2M_decode - ldr r2, =Pico + PIC_LDR(r2, ip, Pico) eor r0, r0, #2 ldr r2, [r2, #OFS_Pico_rom] @ 
Pico.mcd mov r0, r0, lsr #1 @ +4-6 <<16 @@ -694,7 +694,7 @@ m_s68k_write16_regs: bne s68k_reg_write16 m_s68k_write16_regs_spec: @ special case - ldr r2, =Pico + PIC_LDR(r2, r0, Pico) mov r0, #0x110000 ldr r2, [r2, #OFS_Pico_rom] @ Pico.mcd add r0, r0, #0x00000f @@ -707,7 +707,7 @@ m_s68k_write16_regs_spec: @ special case .global s68k_write16 s68k_read8: - ldr r3, =s68k_read8_map + PIC_LDR(r3, r2, s68k_read8_map) bic r0, r0, #0xff000000 mov r2, r0, lsr #16 ldr r3, [r3, r2, lsl #2] @@ -718,7 +718,7 @@ s68k_read8: bx r3 s68k_read16: - ldr r3, =s68k_read16_map + PIC_LDR(r3, r2, s68k_read16_map) bic r0, r0, #0xff000000 mov r2, r0, lsr #16 ldr r3, [r3, r2, lsl #2] @@ -729,7 +729,7 @@ s68k_read16: bx r3 s68k_read32: - ldr r3, =s68k_read16_map + PIC_LDR(r3, r2, s68k_read16_map) bic r0, r0, #0xff000000 mov r2, r0, lsr #16 ldr r3, [r3, r2, lsl #2] @@ -755,7 +755,7 @@ s68k_read32: bx lr s68k_write8: - ldr r3, =s68k_write8_map + PIC_LDR(r3, r2, s68k_write8_map) bic r0, r0, #0xff000000 mov r2, r0, lsr #16 ldr r3, [r3, r2, lsl #2] @@ -766,7 +766,7 @@ s68k_write8: bx r3 s68k_write16: - ldr r3, =s68k_write16_map + PIC_LDR(r3, r2, s68k_write16_map) bic r0, r0, #0xff000000 mov r2, r0, lsr #16 ldr r3, [r3, r2, lsl #2] @@ -777,7 +777,7 @@ s68k_write16: bx r3 s68k_write32: - ldr r3, =s68k_write16_map + PIC_LDR(r3, r2, s68k_write16_map) bic r0, r0, #0xff000000 mov r2, r0, lsr #16 ldr r3, [r3, r2, lsl #2] diff --git a/pico/memory_arm.S b/pico/memory_arm.S index 333780c10..ebeb346b7 100644 --- a/pico/memory_arm.S +++ b/pico/memory_arm.S @@ -7,6 +7,7 @@ * See COPYING file in the top-level directory. 
*/ +#include "arm_features.h" #include "pico_int_offs.h" .equ SRR_MAPPED, (1 << 0) @@ -24,8 +25,10 @@ .global PicoWrite8_io .global PicoWrite16_io + PIC_LDR_INIT() + PicoRead8_sram: @ u32 a - ldr r3, =Pico + PIC_LDR(r3, r1, Pico) ldr r1, [r3, #OFS_Pico_sv_end] cmp r0, r1 bgt m_read8_nosram @@ -74,7 +77,7 @@ m_read8_not_io: cmp r2, #0x1000 bne PicoRead8_32x - ldr r3, =Pico + PIC_LDR(r3, r1, Pico) mov r1, r0 ldr r0, [r3, #OFS_Pico_m_rotate] add r0, r0, #1 @@ -97,7 +100,7 @@ m_read8_not_io: @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ PicoRead16_sram: @ u32 a, u32 d - ldr r3, =Pico + PIC_LDR(r3, r1, Pico) ldr r1, [r3, #OFS_Pico_sv_end] cmp r0, r1 bgt m_read16_nosram @@ -142,7 +145,7 @@ m_read16_not_io: cmp r2, #0x1000 bne PicoRead16_32x - ldr r3, =Pico + PIC_LDR(r3, r2, Pico) and r2, r0, #0xff00 ldr r0, [r3, #OFS_Pico_m_rotate] add r0, r0, #1 @@ -184,7 +187,7 @@ m_write8_not_z80ctl: eor r2, r2, #0x003000 eors r2, r2, #0x0000f1 bne PicoWrite8_32x - ldr r3, =Pico + PIC_LDR(r3, r2, Pico) ldrb r2, [r3, #OFS_Pico_m_sram_reg] and r1, r1, #(SRR_MAPPED|SRR_READONLY) bic r2, r2, #(SRR_MAPPED|SRR_READONLY) @@ -214,7 +217,7 @@ m_write16_not_z80ctl: eor r2, r2, #0x003000 eors r2, r2, #0x0000f0 bne PicoWrite16_32x - ldr r3, =Pico + PIC_LDR(r3, r2, Pico) ldrb r2, [r3, #OFS_Pico_m_sram_reg] and r1, r1, #(SRR_MAPPED|SRR_READONLY) bic r2, r2, #(SRR_MAPPED|SRR_READONLY) @@ -228,7 +231,7 @@ m_write16_not_z80ctl: .global m68k_write16 m68k_read8: - ldr r3, =m68k_read8_map + PIC_LDR(r3, r2, m68k_read8_map) bic r0, r0, #0xff000000 mov r2, r0, lsr #16 ldr r3, [r3, r2, lsl #2] @@ -239,7 +242,7 @@ m68k_read8: bx r3 m68k_read16: - ldr r3, =m68k_read16_map + PIC_LDR(r3, r2, m68k_read16_map) bic r0, r0, #0xff000000 mov r2, r0, lsr #16 ldr r3, [r3, r2, lsl #2] @@ -250,7 +253,7 @@ m68k_read16: bx r3 m68k_read32: - ldr r3, =m68k_read16_map + PIC_LDR(r3, r2, m68k_read16_map) bic r0, r0, #0xff000000 mov r2, r0, lsr #16 ldr r3, [r3, r2, lsl #2] @@ -276,7 +279,7 @@ 
m68k_read32: bx lr m68k_write8: - ldr r3, =m68k_write8_map + PIC_LDR(r3, r2, m68k_write8_map) bic r0, r0, #0xff000000 mov r2, r0, lsr #16 ldr r3, [r3, r2, lsl #2] @@ -287,7 +290,7 @@ m68k_write8: bx r3 m68k_write16: - ldr r3, =m68k_write16_map + PIC_LDR(r3, r2, m68k_write16_map) bic r0, r0, #0xff000000 mov r2, r0, lsr #16 ldr r3, [r3, r2, lsl #2] @@ -298,7 +301,7 @@ m68k_write16: bx r3 m68k_write32: - ldr r3, =m68k_write16_map + PIC_LDR(r3, r2, m68k_write16_map) bic r0, r0, #0xff000000 mov r2, r0, lsr #16 ldr r3, [r3, r2, lsl #2] diff --git a/pico/sound/ym2612_arm.s b/pico/sound/ym2612_arm.S similarity index 95% rename from pico/sound/ym2612_arm.s rename to pico/sound/ym2612_arm.S index 9c436d41b..7d4c609a1 100644 --- a/pico/sound/ym2612_arm.s +++ b/pico/sound/ym2612_arm.S @@ -12,6 +12,8 @@ @ vim:filetype=armasm +#include "../arm_features.h" + .equiv SLOT1, 0 .equiv SLOT2, 2 .equiv SLOT3, 1 @@ -34,6 +36,7 @@ .text .align 2 + PIC_LDR_INIT() @ r5=slot, r1=eg_cnt, trashes: r0,r2,r3 @ writes output to routp, but only if vol_out changes @@ -556,8 +559,8 @@ upd_algo0: stmfd sp!, {r4-r10,lr} mov lr, r0 - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab + PIC_LDR(r3, ip, ym_sin_tab) + PIC_LDR(r5, ip, ym_tl_tab) ldmia lr, {r6-r7} ldr r10, [lr, #0x54] ldr r12, [lr, #0x4c] @@ -573,8 +576,8 @@ upd_algo1: stmfd sp!, {r4-r10,lr} mov lr, r0 - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab + PIC_LDR(r3, ip, ym_sin_tab) + PIC_LDR(r5, ip, ym_tl_tab) ldmia lr, {r6-r7} ldr r10, [lr, #0x54] ldr r12, [lr, #0x4c] @@ -590,8 +593,8 @@ upd_algo2: stmfd sp!, {r4-r10,lr} mov lr, r0 - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab + PIC_LDR(r3, ip, ym_sin_tab) + PIC_LDR(r5, ip, ym_tl_tab) ldmia lr, {r6-r7} ldr r10, [lr, #0x54] ldr r12, [lr, #0x4c] @@ -607,8 +610,8 @@ upd_algo3: stmfd sp!, {r4-r10,lr} mov lr, r0 - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab + PIC_LDR(r3, ip, ym_sin_tab) + PIC_LDR(r5, ip, ym_tl_tab) ldmia lr, {r6-r7} ldr r10, [lr, #0x54] ldr r12, [lr, #0x4c] @@ -624,8 +627,8 @@ upd_algo4: stmfd sp!, 
{r4-r10,lr} mov lr, r0 - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab + PIC_LDR(r3, ip, ym_sin_tab) + PIC_LDR(r5, ip, ym_tl_tab) ldmia lr, {r6-r7} ldr r10, [lr, #0x54] ldr r12, [lr, #0x4c] @@ -641,8 +644,8 @@ upd_algo5: stmfd sp!, {r4-r10,lr} mov lr, r0 - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab + PIC_LDR(r3, ip, ym_sin_tab) + PIC_LDR(r5, ip, ym_tl_tab) ldmia lr, {r6-r7} ldr r10, [lr, #0x54] ldr r12, [lr, #0x4c] @@ -658,8 +661,8 @@ upd_algo6: stmfd sp!, {r4-r10,lr} mov lr, r0 - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab + PIC_LDR(r3, ip, ym_sin_tab) + PIC_LDR(r5, ip, ym_tl_tab) ldmia lr, {r6-r7} ldr r10, [lr, #0x54] ldr r12, [lr, #0x4c] @@ -675,8 +678,8 @@ upd_algo7: stmfd sp!, {r4-r10,lr} mov lr, r0 - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab + PIC_LDR(r3, ip, ym_sin_tab) + PIC_LDR(r5, ip, ym_tl_tab) ldmia lr, {r6-r7} ldr r10, [lr, #0x54] ldr r12, [lr, #0x4c] @@ -692,8 +695,8 @@ upd_slot1: stmfd sp!, {r4-r10,lr} mov lr, r0 - ldr r3, =ym_sin_tab - ldr r5, =ym_tl_tab + PIC_LDR(r3, ip, ym_sin_tab) + PIC_LDR(r5, ip, ym_tl_tab) ldmia lr, {r6-r7} ldr r10, [lr, #0x54] ldr r12, [lr, #0x4c] @@ -781,7 +784,7 @@ eg_done: beq crl_loop @ -- SLOT1 -- - ldr r3, =ym_tl_tab + PIC_LDR(r3, r2, ym_tl_tab) @ lr=context, r12=pack (stereo, lastchan, disabled, lfo_enabled | pan_r, pan_l, ams[2] | AMmasks[4] | FB[4] | lfo_ampm[16]) @ r0-r2=scratch, r3=tl_tab, r5=scratch, r6-r7=vol_out[4], r10=op1_out @@ -789,16 +792,16 @@ eg_done: @ -- SLOT2+ -- and r0, r4, #7 - ldr pc, [pc, r0, lsl #2] + PIC_XB(,r0, lsl #2) nop - .word crl_algo0 - .word crl_algo1 - .word crl_algo2 - .word crl_algo3 - .word crl_algo4 - .word crl_algo5 - .word crl_algo6 - .word crl_algo7 + PIC_BT(crl_algo0) + PIC_BT(crl_algo1) + PIC_BT(crl_algo2) + PIC_BT(crl_algo3) + PIC_BT(crl_algo4) + PIC_BT(crl_algo5) + PIC_BT(crl_algo6) + PIC_BT(crl_algo7) .pool crl_algo0: diff --git a/platform/common/common.mak b/platform/common/common.mak index 35f6ac9ef..3c9ff81d4 100644 --- a/platform/common/common.mak +++ b/platform/common/common.mak @@ 
-59,7 +59,7 @@ SRCS_COMMON += $(R)pico/memory_arm.S endif ifeq "$(asm_ym2612)" "1" DEFINES += _ASM_YM2612_C -SRCS_COMMON += $(R)pico/sound/ym2612_arm.s +SRCS_COMMON += $(R)pico/sound/ym2612_arm.S endif ifeq "$(asm_misc)" "1" DEFINES += _ASM_MISC_C @@ -148,7 +148,7 @@ endif # --- Z80 --- ifeq "$(use_drz80)" "1" DEFINES += _USE_DRZ80 -SRCS_COMMON += $(R)cpu/DrZ80/drz80.s +SRCS_COMMON += $(R)cpu/DrZ80/drz80.S endif # ifeq "$(use_cz80)" "1" diff --git a/platform/common/dismips.c b/platform/common/dismips.c index 61c70bfe2..19c0b427f 100644 --- a/platform/common/dismips.c +++ b/platform/common/dismips.c @@ -1,5 +1,5 @@ /* - * very basic mips disassembler for MIPS32/MIPS64 Release 1, only for picodrive + * very basic mips disassembler for MIPS32/MIPS64 Release 2, only for picodrive * Copyright (C) 2019 kub * * This work is licensed under the terms of MAME license. From 26dd75aee8728e32bfb14a78a04784e0c30a08e3 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 11 Dec 2019 20:16:14 +0100 Subject: [PATCH 087/174] sh2 drc: cleanup, fix for drc crash, for mips code emitter --- cpu/drc/emit_mips.c | 110 ++++++++++++++++++------------------ cpu/sh2/compiler.c | 50 ++++++++++++---- cpu/sh2/compiler.h | 31 +++++----- cpu/sh2/sh2.h | 28 ++++----- pico/arm_features.h | 12 ++-- pico/pico_port.h | 2 + platform/common/disarm.c | 4 +- platform/common/disarm.h | 2 +- platform/common/dismips.c | 11 +++- platform/common/dismips.h | 2 +- platform/common/host_dasm.c | 4 +- tools/mkoffsets.sh | 15 +++-- 12 files changed, 151 insertions(+), 120 deletions(-) diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index 453801f15..765986a6b 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -20,6 +20,9 @@ #define STATIC_SH2_REGS { SHR_SR,22 , SHR_R0,21 , SHR_R0+1,20 } // NB: the ubiquitous JZ74[46]0 uses MIPS32 Release 1, a slight MIPS II superset +#ifndef __mips_isa_rev +#define __mips_isa_rev 1 // surprisingly not always defined +#endif // registers usable for user code: r1-r25, others 
reserved or special #define Z0 0 // zero register @@ -333,32 +336,49 @@ static int emith_is_b(u32 op) // B { return ((op>>26) & 074) == OP_BEQ || ((op>>26) == OP__RT && ((op>>16) & 036) == RT_BLTZ); } // register usage for dependency evaluation XXX better do this as in emit_arm? -static uint64_t emith_has_rs[3] = // OP__FN, OP__RT, others - { 0x00fffffffffa0ff0ULL, 0x000fff0fUL, 0xffffffff0f007ff0ULL }; -static uint64_t emith_has_rt[3] = // OP__FN, OP__RT, others - { 0xff00fffffff00cffULL, 0x00000000UL, 0x8000ff0000000030ULL }; -static uint64_t emith_has_rd[3] = // OP__FN, OP__RT, others (rt instead of rd) - { 0xff00fffffff50fffULL, 0x00000000UL, 0x119100ff0f00ff00ULL }; +static uint64_t emith_has_rs[5] = // OP__FN1-3, OP__RT, others + { 0x005ffcffffda0fd2ULL, 0x0000003300000037ULL, 0x00000000000000ffULL, + 0x800f5f0fUL, 0xf7ffffff0ff07ff0ULL }; +static uint64_t emith_has_rt[5] = // OP__FN1-3, OP__RT, others + { 0xdd5ffcffffd00cddULL, 0x0000000000000037ULL, 0x0000001100000000ULL, + 0x00000000UL, 0x80007f440c300030ULL }; +static uint64_t emith_has_rd[5] = // OP__FN1-3, OP__RT, others(rt instead of rd) + { 0xdd00fcff00d50edfULL, 0x0000003300000004ULL, 0x08000011000000ffULL, + 0x00000000UL, 0x119100ff0f00ff00ULL }; #define emith_has_(rx,ix,op,sa,m) \ (emith_has_##rx[ix] & (1ULL << (((op)>>(sa)) & (m)))) static int emith_rs(u32 op) { if ((op>>26) == OP__FN) return emith_has_(rs,0,op, 0,0x3f) ? (op>>21)&0x1f : 0; + if ((op>>26) == OP__FN2) + return emith_has_(rs,1,op, 0,0x3f) ? (op>>21)&0x1f : 0; + if ((op>>26) == OP__FN3) + return emith_has_(rs,2,op, 0,0x3f) ? (op>>21)&0x1f : 0; if ((op>>26) == OP__RT) - return emith_has_(rs,1,op,16,0x1f) ? (op>>21)&0x1f : 0; - return emith_has_(rs,2,op,26,0x3f) ? (op>>21)&0x1f : 0; + return emith_has_(rs,3,op,16,0x1f) ? (op>>21)&0x1f : 0; + return emith_has_(rs,4,op,26,0x3f) ? (op>>21)&0x1f : 0; } static int emith_rt(u32 op) { if ((op>>26) == OP__FN) return emith_has_(rt,0,op, 0,0x3f) ? 
(op>>16)&0x1f : 0; + if ((op>>26) == OP__FN2) + return emith_has_(rt,1,op, 0,0x3f) ? (op>>16)&0x1f : 0; + if ((op>>26) == OP__FN3) + return emith_has_(rt,2,op, 0,0x3f) ? (op>>16)&0x1f : 0; if ((op>>26) == OP__RT) return 0; - return emith_has_(rt,2,op,26,0x3f) ? (op>>16)&0x1f : 0; + return emith_has_(rt,4,op,26,0x3f) ? (op>>16)&0x1f : 0; } static int emith_rd(u32 op) - { int ret = emith_has_(rd,2,op,26,0x3f) ? (op>>16)&0x1f :-1; + { int ret = emith_has_(rd,4,op,26,0x3f) ? (op>>16)&0x1f :-1; if ((op>>26) == OP__FN) ret = emith_has_(rd,0,op, 0,0x3f) ? (op>>11)&0x1f :-1; + if ((op>>26) == OP__FN2) + ret = emith_has_(rd,1,op, 0,0x3f) ? (op>>11)&0x1f :-1; + if ((op>>26) == OP__FN3 && (op&0x3f) == FN3_BSHFL) + ret = emith_has_(rd,2,op, 0,0x3f) ? (op>>11)&0x1f :-1; + if ((op>>26) == OP__FN3 && (op&0x3f) != FN3_BSHFL) + ret = emith_has_(rd,2,op, 0,0x3f) ? (op>>16)&0x1f :-1; if ((op>>26) == OP__RT) ret = -1; return (ret ?: -1); // Z0 doesn't have dependencies @@ -970,29 +990,23 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) #define emith_asr(d, s, cnt) \ EMIT(MIPS_ASR_IMM(d, s, cnt)) -#if defined(__mips_isa_rev) && __mips_isa_rev >= 2 -#define emith_ror(d, s, cnt) \ - EMIT(MIPS_ROR_IMM(d, s, cnt)) -#else #define emith_ror(d, s, cnt) do { \ - EMIT(MIPS_LSL_IMM(AT, s, 32-(cnt))); \ - EMIT(MIPS_LSR_IMM(d, s, cnt)); \ - EMIT(MIPS_OR_REG(d, d, AT)); \ + if (__mips_isa_rev < 2) { \ + EMIT(MIPS_LSL_IMM(AT, s, 32-(cnt))); \ + EMIT(MIPS_LSR_IMM(d, s, cnt)); \ + EMIT(MIPS_OR_REG(d, d, AT)); \ + } else EMIT(MIPS_ROR_IMM(d, s, cnt)); \ } while (0) -#endif #define emith_ror_c(cond, d, s, cnt) \ emith_ror(d, s, cnt) -#if defined(__mips_isa_rev) && __mips_isa_rev >= 2 -#define emith_rol(d, s, cnt) \ - EMIT(MIPS_ROR_IMM(d, s, 32-(cnt))) -#else #define emith_rol(d, s, cnt) do { \ - EMIT(MIPS_LSR_IMM(AT, s, 32-(cnt))); \ - EMIT(MIPS_LSL_IMM(d, s, cnt)); \ - EMIT(MIPS_OR_REG(d, d, AT)); \ + if (__mips_isa_rev < 2) { \ + EMIT(MIPS_LSR_IMM(AT, s, 32-(cnt))); \ + 
EMIT(MIPS_LSL_IMM(d, s, cnt)); \ + EMIT(MIPS_OR_REG(d, d, AT)); \ + } else EMIT(MIPS_ROR_IMM(d, s, 32-(cnt))); \ } while (0) -#endif #define emith_rorc(d) do { \ emith_lsr(d, d, 1); \ @@ -1082,13 +1096,11 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) } while (0) // signed/unsigned extend -#if defined(__mips_isa_rev) && __mips_isa_rev >= 2 -#define emith_clear_msb(d, s, count) /* bits to clear */ \ - EMIT(MIPS_EXT_IMM(d, s, 0, 32-(count))) -#else #define emith_clear_msb(d, s, count) /* bits to clear */ do { \ u32 t; \ - if ((count) >= 16) { \ + if (__mips_isa_rev >= 2) \ + EMIT(MIPS_EXT_IMM(d, s, 0, 32-(count))); \ + else if ((count) >= 16) { \ t = (count) - 16; \ t = 0xffff >> t; \ emith_and_r_r_imm(d, s, t); \ @@ -1097,27 +1109,19 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm) emith_lsr(d, d, count); \ } \ } while (0) -#endif #define emith_clear_msb_c(cond, d, s, count) \ emith_clear_msb(d, s, count) -#if defined(__mips_isa_rev) && __mips_isa_rev >= 2 #define emith_sext(d, s, count) /* bits to keep */ do { \ - if (count == 8) \ + if (__mips_isa_rev >= 2 && count == 8) \ EMIT(MIPS_SEB_REG(d, s)); \ - else if (count == 16) \ + else if (__mips_isa_rev >= 2 && count == 16) \ EMIT(MIPS_SEH_REG(d, s)); \ else { \ emith_lsl(d, s, 32-(count)); \ emith_asr(d, d, 32-(count)); \ } \ } while (0) -#else -#define emith_sext(d, s, count) /* bits to keep */ do { \ - emith_lsl(d, s, 32-(count)); \ - emith_asr(d, d, 32-(count)); \ -} while (0) -#endif // multiply Rd = Rn*Rm (+ Ra); NB: next 2 insns after MFLO/MFHI mustn't be MULT static u8 *last_lohi; @@ -1716,26 +1720,20 @@ static int emith_cond_check(int cond, int *r) EMITH_SJMP_END(DCOND_EQ); \ } while (0) -#if defined(__mips_isa_rev) && __mips_isa_rev >= 2 -#define emith_write_sr(sr, srcr) \ - EMIT(MIPS_INS_IMM(sr, srcr, 0, 10)) -#else #define emith_write_sr(sr, srcr) do { \ - emith_lsr(sr, sr , 10); emith_lsl(sr, sr, 10); \ - emith_lsl(AT, srcr, 22); emith_lsr(AT, AT, 22); \ - emith_or_r_r(sr, 
AT); \ + if (__mips_isa_rev < 2) { \ + emith_lsr(sr, sr , 10); emith_lsl(sr, sr, 10); \ + emith_lsl(AT, srcr, 22); emith_lsr(AT, AT, 22); \ + emith_or_r_r(sr, AT); \ + } else EMIT(MIPS_INS_IMM(sr, srcr, 0, 10)); \ } while (0) -#endif -#if defined(__mips_isa_rev) && __mips_isa_rev >= 2 -#define emith_carry_to_t(sr, is_sub) \ - EMIT(MIPS_INS_IMM(sr, FC, 0, 1)) -#else #define emith_carry_to_t(sr, is_sub) do { \ - emith_and_r_imm(sr, 0xfffffffe); \ - emith_or_r_r(sr, FC); \ + if (__mips_isa_rev < 2) { \ + emith_and_r_imm(sr, 0xfffffffe); \ + emith_or_r_r(sr, FC); \ + } else EMIT(MIPS_INS_IMM(sr, FC, 0, 1)); \ } while (0) -#endif #define emith_t_to_carry(sr, is_sub) do { \ emith_and_r_r_imm(FC, sr, 1); \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 57bfc212d..ca9a05500 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -440,7 +440,7 @@ static int rcache_get_tmp(void); static void rcache_free_tmp(int hr); // Note: Register assignment goes by ABI convention. Caller save registers are -// TEMPORARY, the others are PRESERVED. Unusable regs are omitted. +// TEMPORARY, callee save registers are PRESERVED. Unusable regs are omitted. // there must be at least the free (not context or statically mapped) amount of // PRESERVED/TEMPORARY registers used by handlers in worst case (currently 4). // there must be at least 3 PARAM, and PARAM+TEMPORARY must be at least 4. 
@@ -496,6 +496,11 @@ static void REGPARM(2) (*sh2_drc_write8)(u32 a, u32 d); static void REGPARM(2) (*sh2_drc_write16)(u32 a, u32 d); static void REGPARM(2) (*sh2_drc_write32)(u32 a, u32 d); +#ifdef DRC_SR_REG +void REGPARM(1) (*sh2_drc_save_sr)(SH2 *sh2); +void REGPARM(1) (*sh2_drc_restore_sr)(SH2 *sh2); +#endif + // flags for memory access #define MF_SIZEMASK 0x03 // size of access #define MF_POSTINCR 0x10 // post increment (for read_rr) @@ -1578,7 +1583,7 @@ static void rcache_unmap_vreg(int x) FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, i, if (guest_regs[i].flags & GRF_DIRTY) { // if a dirty reg is unmapped save its value to context - if ((~rcache_regs_discard | rcache_regs_now) & (1 << i)) + if (~rcache_regs_discard & (1 << i)) emith_ctx_write(cache_regs[x].hreg, i * 4); guest_regs[i].flags &= ~GRF_DIRTY; } @@ -3107,6 +3112,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) op_flags[i+1] |= OF_BTARGET; // RTE entrypoint in case of SR.IMASK change // unify T and SR since rcache doesn't know about "virtual" guest regs if (ops[i].source & BITMASK1(SHR_T)) ops[i].source |= BITMASK1(SHR_SR); + if (ops[i].dest & BITMASK1(SHR_T)) ops[i].source |= BITMASK1(SHR_SR); if (ops[i].dest & BITMASK1(SHR_T)) ops[i].dest |= BITMASK1(SHR_SR); #if LOOP_DETECTION // loop types detected: @@ -5028,7 +5034,6 @@ static void sh2_generate_utils(void) emith_move_r_r_ptr(arg0, CONTEXT_REG); emith_ctx_read(arg1, offsetof(SH2, drc_tmp)); // tcache_id emith_call(sh2_translate); -/* just after lookup function, jump to address returned */ emith_tst_r_r_ptr(RET_REG, RET_REG); EMITH_SJMP_START(DCOND_EQ); emith_jump_reg_c(DCOND_NE, RET_REG); @@ -5057,8 +5062,8 @@ static void sh2_generate_utils(void) emith_ctx_read(arg2, offsetof(SH2, rts_cache_idx)); emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg2, 0); emith_read_r_r_offs(arg3, arg1, offsetof(SH2, rts_cache)); -#if (DRC_DEBUG & 128) emith_cmp_r_r(arg0, arg3); +#if (DRC_DEBUG & 128) EMITH_SJMP_START(DCOND_EQ); 
emith_move_r_ptr_imm(arg3, (uptr)&rcmiss); emith_read_r_r_offs_c(DCOND_NE, arg1, arg3, 0); @@ -5067,7 +5072,6 @@ static void sh2_generate_utils(void) emith_jump_cond(DCOND_NE, sh2_drc_dispatcher); EMITH_SJMP_END(DCOND_EQ); #else - emith_cmp_r_r(arg0, arg3); emith_jump_cond(DCOND_NE, sh2_drc_dispatcher); #endif emith_read_r_r_offs_ptr(arg0, arg1, offsetof(SH2, rts_cache) + sizeof(void *)); @@ -5109,7 +5113,7 @@ static void sh2_generate_utils(void) emith_call(p32x_sh2_write32); // XXX: use sh2_drc_write32? // push PC rcache_get_reg_arg(0, SHR_SP, NULL); - emith_ctx_read(arg1, SHR_PC * 4); + rcache_get_reg_arg(1, SHR_PC, NULL); emith_move_r_r_ptr(arg2, CONTEXT_REG); rcache_invalidate_tmp(); emith_call(p32x_sh2_write32); @@ -5143,6 +5147,24 @@ static void sh2_generate_utils(void) emith_jump(sh2_drc_dispatcher); emith_flush(); +#ifdef DRC_SR_REG + // sh2_drc_save_sr(SH2 *sh2) + sh2_drc_save_sr = (void *)tcache_ptr; + tmp = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); + emith_write_r_r_offs(tmp, arg0, SHR_SR * 4); + rcache_invalidate(); + emith_ret(); + emith_flush(); + + // sh2_drc_restore_sr(SH2 *sh2) + sh2_drc_restore_sr = (void *)tcache_ptr; + tmp = rcache_get_reg(SHR_SR, RC_GR_WRITE, NULL); + emith_read_r_r_offs(tmp, arg0, SHR_SR * 4); + rcache_flush(); + emith_ret(); + emith_flush(); +#endif + #ifdef PDB_NET // debug #define MAKE_READ_WRAPPER(func) { \ @@ -5204,6 +5226,10 @@ static void sh2_generate_utils(void) host_dasm_new_symbol(sh2_drc_read8_poll); host_dasm_new_symbol(sh2_drc_read16_poll); host_dasm_new_symbol(sh2_drc_read32_poll); +#ifdef DRC_SR_REG + host_dasm_new_symbol(sh2_drc_save_sr); + host_dasm_new_symbol(sh2_drc_restore_sr); +#endif #endif #if DRC_DEBUG @@ -5273,12 +5299,12 @@ static void sh2_smc_rm_blocks(u32 a, int len, int tcache_id, u32 shift) #endif } -void sh2_drc_wcheck_ram(unsigned int a, unsigned len, SH2 *sh2) +void sh2_drc_wcheck_ram(u32 a, unsigned len, SH2 *sh2) { sh2_smc_rm_blocks(a, len, 0, SH2_DRCBLK_RAM_SHIFT); } -void 
sh2_drc_wcheck_da(unsigned int a, unsigned len, SH2 *sh2) +void sh2_drc_wcheck_da(u32 a, unsigned len, SH2 *sh2) { sh2_smc_rm_blocks(a, len, 1 + sh2->is_slave, SH2_DRCBLK_DA_SHIFT); } @@ -5295,7 +5321,7 @@ int sh2_execute_drc(SH2 *sh2c, int cycles) sh2_drc_entry(sh2c); // TODO: irq cycles - ret_cycles = (signed int)sh2c->sr >> 12; + ret_cycles = (int32_t)sh2c->sr >> 12; if (ret_cycles > 0) dbg(1, "warning: drc returned with cycles: %d", ret_cycles); @@ -5777,6 +5803,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, break; case 1: // DIV0U 0000000000011001 CHECK_UNHANDLED_BITS(0xf00, undefined); + opd->source = BITMASK1(SHR_SR); opd->dest = BITMASK2(SHR_SR, SHR_T); break; case 2: // MOVT Rn 0000nnnn00101001 @@ -5877,7 +5904,7 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, opd->dest = BITMASK2(GET_Rn(), SHR_MEM); break; case 0x07: // DIV0S Rm,Rn 0010nnnnmmmm0111 - opd->source = BITMASK2(GET_Rm(), GET_Rn()); + opd->source = BITMASK3(SHR_SR, GET_Rm(), GET_Rn()); opd->dest = BITMASK2(SHR_SR, SHR_T); break; case 0x08: // TST Rm,Rn 0010nnnnmmmm1000 @@ -6470,6 +6497,9 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, last_btarget = 0; op = 0; // delay/poll insns counter for (i = 0, pc = base_pc; i < i_end; i++, pc += 2) { + int null; + if ((op_flags[i] & OF_BTARGET) && dr_get_entry(pc, is_slave, &null)) + break; // branch target already compiled opd = &ops[i]; crc += FETCH_OP(pc); diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 44620f489..804f2a70f 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -1,7 +1,7 @@ int sh2_drc_init(SH2 *sh2); void sh2_drc_finish(SH2 *sh2); -void sh2_drc_wcheck_ram(unsigned int a, unsigned len, SH2 *sh2); -void sh2_drc_wcheck_da(unsigned int a, unsigned len, SH2 *sh2); +void sh2_drc_wcheck_ram(uint32_t a, unsigned len, SH2 *sh2); +void sh2_drc_wcheck_da(uint32_t a, unsigned len, SH2 *sh2); #ifdef DRC_SH2 void sh2_drc_mem_setup(SH2 *sh2); @@ 
-28,13 +28,13 @@ void sh2_drc_frame(void); #define OF_DELAY_LOOP (2 << 2) #define OF_POLL_LOOP (3 << 2) -unsigned short scan_block(unsigned int base_pc, int is_slave, - unsigned char *op_flags, unsigned int *end_pc, - unsigned int *base_literals, unsigned int *end_literals); +unsigned short scan_block(uint32_t base_pc, int is_slave, + unsigned char *op_flags, uint32_t *end_pc, + uint32_t *base_literals, uint32_t *end_literals); -#if defined(DRC_SH2) -// direct access to some host CPU registers used by the DRC -// XXX MUST match definitions for SHR_SR in cpu/sh2/compiler.c +#if defined(DRC_SH2) && defined(__GNUC__) +// direct access to some host CPU registers used by the DRC +// XXX MUST match definitions for SHR_SR in cpu/drc/emit_*.c #if defined(__arm__) #define DRC_SR_REG "r10" #elif defined(__aarch64__) @@ -47,19 +47,20 @@ unsigned short scan_block(unsigned int base_pc, int is_slave, #define DRC_SR_REG "edi" #elif defined(__x86_64__) #define DRC_SR_REG "ebx" -#else -#warning "direct DRC register access not available for this host" #endif #endif #ifdef DRC_SR_REG -#define DRC_DECLARE_SR register int sh2_sr asm(DRC_SR_REG) +extern void REGPARM(1) (*sh2_drc_save_sr)(SH2 *sh2); +extern void REGPARM(1) (*sh2_drc_restore_sr)(SH2 *sh2); + +#define DRC_DECLARE_SR register int32_t sh2_sr asm(DRC_SR_REG) #define DRC_SAVE_SR(sh2) \ - if ((sh2->state & (SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN) \ - sh2->sr = sh2_sr; + if (likely((sh2->state & (SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN)) \ + sh2_drc_save_sr(sh2) #define DRC_RESTORE_SR(sh2) \ - if ((sh2->state & (SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN) \ - sh2_sr = sh2->sr; + if (likely((sh2->state & (SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN)) \ + sh2_drc_restore_sr(sh2) #else #define DRC_DECLARE_SR #define DRC_SAVE_SR(sh2) diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index 5f1a88411..2d73db59a 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -14,13 +14,13 @@ typedef enum { typedef struct SH2_ 
{ // registers. this MUST correlate with enum sh2_reg_e. - unsigned int r[16] ALIGNED(32); - unsigned int pc; // 40 - unsigned int ppc; - unsigned int pr; - unsigned int sr; - unsigned int gbr, vbr; // 50 - unsigned int mach, macl; // 58 + uint32_t r[16] ALIGNED(32); + uint32_t pc; // 40 + uint32_t ppc; + uint32_t pr; + uint32_t sr; + uint32_t gbr, vbr; // 50 + uint32_t mach, macl; // 58 // common const void *read8_map; @@ -48,14 +48,14 @@ typedef struct SH2_ #define SH2_STATE_VPOLL (1 << 3) // polling VDP #define SH2_STATE_RPOLL (1 << 4) // polling address in SDRAM unsigned int state; - unsigned int poll_addr; + uint32_t poll_addr; int poll_cycles; int poll_cnt; // DRC branch cache. size must be 2^n and <=128 int rts_cache_idx; - struct { unsigned int pc; void *code; } rts_cache[16]; - struct { unsigned int pc; void *code; } branch_cache[128]; + struct { uint32_t pc; void *code; } rts_cache[16]; + struct { uint32_t pc; void *code; } branch_cache[128]; // interpreter stuff int icount; // cycles left in current timeslice @@ -79,15 +79,15 @@ typedef struct SH2_ unsigned int mult_m68k_to_sh2; unsigned int mult_sh2_to_m68k; - unsigned char data_array[0x1000]; // cache (can be used as RAM) - unsigned int peri_regs[0x200/4]; // periphereal regs + uint8_t data_array[0x1000]; // cache (can be used as RAM) + uint32_t peri_regs[0x200/4]; // periphereal regs } SH2; #define CYCLE_MULT_SHIFT 10 #define C_M68K_TO_SH2(xsh2, c) \ - (int)(((unsigned long long)(c) * (xsh2)->mult_m68k_to_sh2) >> CYCLE_MULT_SHIFT) + (int)(((uint64_t)(c) * (xsh2)->mult_m68k_to_sh2) >> CYCLE_MULT_SHIFT) #define C_SH2_TO_M68K(xsh2, c) \ - (int)(((unsigned long long)(c+3U) * (xsh2)->mult_sh2_to_m68k) >> CYCLE_MULT_SHIFT) + (int)(((uint64_t)(c+3U) * (xsh2)->mult_sh2_to_m68k) >> CYCLE_MULT_SHIFT) int sh2_init(SH2 *sh2, int is_slave, SH2 *other_sh2); void sh2_finish(SH2 *sh2); diff --git a/pico/arm_features.h b/pico/arm_features.h index 4b456f456..b772b77c5 100644 --- a/pico/arm_features.h +++ 
b/pico/arm_features.h @@ -60,9 +60,8 @@ // load data address (LDR) either via literal pool or via GOT #ifdef __PIC__ -// can't use pool loads since ldr= only allows symbol or constants, not expr :-( +// can't use pool loads since ldr= only allows a symbol or a constant expr :-( #define PIC_LDR_INIT() \ - .ifndef PIC_LDR_DEF; PIC_LDR_DEF=1; \ .macro pic_ldr r t a; \ ldr \r, [pc, $.LD\@-.-8]; \ ldr \t, [pc, $.LD\@-.-4]; \ @@ -71,14 +70,11 @@ add pc, $4; \ .LD\@:.word _GLOBAL_OFFSET_TABLE_-.LP\@-8; \ .word \a(GOT); \ - .endm; \ - .endif; -#define PIC_LDR(r,t,a) \ - pic_ldr r, t, a + .endm; +#define PIC_LDR(r,t,a) pic_ldr r, t, a #else #define PIC_LDR_INIT() -#define PIC_LDR(r,t,a) \ - ldr r, =a +#define PIC_LDR(r,t,a) ldr r, =a #endif #endif /* __ARM_FEATURES_H__ */ diff --git a/pico/pico_port.h b/pico/pico_port.h index e26e6ca2c..af9ce8534 100644 --- a/pico/pico_port.h +++ b/pico/pico_port.h @@ -17,10 +17,12 @@ #define NOINLINE __attribute__((noinline)) #define ALIGNED(n) __attribute__((aligned(n))) #define unlikely(x) __builtin_expect((x), 0) +#define likely(x) __builtin_expect(!!(x), 1) #else #define NOINLINE #define ALIGNED(n) #define unlikely(x) (x) +#define likely(x) (x) #endif #ifdef _MSC_VER diff --git a/platform/common/disarm.c b/platform/common/disarm.c index 37fd810e6..249922064 100644 --- a/platform/common/disarm.c +++ b/platform/common/disarm.c @@ -435,7 +435,7 @@ static int software_interrupt(unsigned int pc, unsigned int insn, char *buf, siz return 1; } -int disarm(uintptr_t pc, uint32_t insn, char *buf, size_t buf_len, uintptr_t *addr) +int disarm(uintptr_t pc, uint32_t insn, char *buf, size_t buf_len, unsigned long *addr) { *addr = 0; @@ -467,7 +467,7 @@ int disarm(uintptr_t pc, uint32_t insn, char *buf, size_t buf_len, uintptr_t *ad return block_data_transfer(pc, insn, buf, buf_len); if ((insn & 0x0e000000) == 0x0a000000) { - *addr = (long)pc + 8 + ((long)(insn << 8) >> 6); + *addr = (unsigned long)pc+8 + ((unsigned long)(insn << 8) >> 6); return 
branch(pc, insn, buf, buf_len); } diff --git a/platform/common/disarm.h b/platform/common/disarm.h index f11708949..a07675fd0 100644 --- a/platform/common/disarm.h +++ b/platform/common/disarm.h @@ -23,6 +23,6 @@ #ifndef DISARM_H #define DISARM_H -int disarm(uintptr_t pc, uint32_t insn, char *buf, size_t buf_len, uintptr_t *sym); +int disarm(uintptr_t pc, uint32_t insn, char *buf, size_t buf_len, unsigned long *sym); #endif /* DISARM_H */ diff --git a/platform/common/dismips.c b/platform/common/dismips.c index 19c0b427f..dc06ce80e 100644 --- a/platform/common/dismips.c +++ b/platform/common/dismips.c @@ -6,8 +6,9 @@ * See COPYING file in the top-level directory. */ -// XXX unimplemented: SYSCALL, BREAK, SYNC, SDBBP, T*, CACHE, PREF, -// MOVF/MOVT, LWC*/LDC*, SWC*/SDC*, COP*. +// unimplemented insns: MOV[FT], SYSCALL, BREAK, SYNC, SYNCI, T*, SDBBP, RDHWR, +// CACHE, PREF, LWC*/LDC*, SWC*/SDC*, and all of COP* (fpu, mmu, irq, exc, ...) +// unimplemented variants of insns: EHB, SSNOP (both SLL zero), JALR.HB, JR.HB // however, it's certainly good enough for anything picodrive DRC throws at it. 
#include @@ -79,6 +80,7 @@ struct insn { #define OP_SPECIAL 0x00 static const struct insn special_insns[] = { {0x00, S_IMM_DT, "sll"}, +// {0x01, , "movf\0movt"}, {0x02, S_IMM_DT|SR_BIT, "srl\0rotr"}, {0x03, S_IMM_DT, "sra"}, {0x04, REG_DTS, "sllv"}, @@ -146,6 +148,7 @@ static const struct insn special2_insns[] = { {0x21, REG_DS, "clo" }, {0x24, REG_DS, "dclz" }, {0x25, REG_DS, "dclo" }, +// {0x37, , "sdbbp" }, }; // instructions with opcode SPECIAL3 (R-type) @@ -159,6 +162,7 @@ static const struct insn special3_insns[] = { {0x05, F_IMM_TS, "dinsm" }, {0x06, F_IMM_TS, "dinsu" }, {0x07, F_IMM_TS, "dins" }, +// {0x3b, , "rdhwr" }, }; // instruction with opcode SPECIAL3 and function *BSHFL @@ -192,6 +196,7 @@ static const struct insn regimm_insns[] = { {0x12, B_IMM_S, "bltzall"}, {0x13, B_IMM_S, "bgezall"}, {0x13, B_IMM_S, "bgezall"}, +// {0x1f, , "synci" }, }; // instructions with other opcodes (I-type) @@ -316,7 +321,7 @@ static unsigned long j_target(unsigned long pc, uint32_t insn) } // main disassembler function -int dismips(uintptr_t pc, uint32_t insn, char *buf, size_t buflen, uintptr_t *sym) +int dismips(uintptr_t pc, uint32_t insn, char *buf, size_t buflen, unsigned long *sym) { const struct insn *pi = decode_insn(insn); char *rs = register_names[(insn >> 21) & 0x1f]; diff --git a/platform/common/dismips.h b/platform/common/dismips.h index b547003b9..8d1059254 100644 --- a/platform/common/dismips.h +++ b/platform/common/dismips.h @@ -1,6 +1,6 @@ #ifndef DISMIPS_H #define DISMIPS_H -int dismips(uintptr_t pc, uint32_t insn, char *buf, size_t buf_len, uintptr_t *sym); +int dismips(uintptr_t pc, uint32_t insn, char *buf, size_t buf_len, unsigned long *sym); #endif /* DISMIPS_H */ diff --git a/platform/common/host_dasm.c b/platform/common/host_dasm.c index fc3cbe677..2084aa91d 100644 --- a/platform/common/host_dasm.c +++ b/platform/common/host_dasm.c @@ -37,14 +37,14 @@ void host_dasm(void *addr, int len) void *end = (char *)addr + len; const char *name; char 
buf[64]; - long insn, symaddr; + unsigned long insn, symaddr; while (addr < end) { name = lookup_name(addr); if (name != NULL) printf("%s:\n", name); - insn = *(long *)addr; + insn = *(unsigned long *)addr; printf(" %08lx %08lx ", (long)addr, insn); if(disasm((unsigned)addr, insn, buf, sizeof(buf), &symaddr)) { diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index 349b8605c..8a1092e0a 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ -12,15 +12,12 @@ ENDIAN= compile_rodata () { $CC $CFLAGS -I .. -c /tmp/getoffs.c -o /tmp/getoffs.o || exit 1 - # echo 'void dummy(void) { asm(""::"r" (&val)); }' >> /tmp/getoffs.c - # $CC $CFLAGS -I .. -nostdlib -Wl,-edummy /tmp/getoffs.c \ - # -o /tmp/getoffs.o || exit 1 # find the name of the .rodata section (in case -fdata-sections is used) rosect=$(readelf -S /tmp/getoffs.o | grep '\.rodata\|\.sdata' | sed 's/^[^.]*././;s/ .*//') - # read out .rodata section as hex string (should be only 4 or 8 bytes) + # read out .rodata section as hex string (should be only 4 bytes) ro=$(readelf -x $rosect /tmp/getoffs.o | grep '0x' | cut -c14-48 | - tr -d ' \n') + tr -d ' \n' | cut -c1-8) if [ "$ENDIAN" = "le" ]; then # swap needed for le target hex="" @@ -41,16 +38,18 @@ get_define () # prefix struct member member... 
struct=$1; shift field=$(echo $* | sed 's/ /./g') name=$(echo $* | sed 's/ /_/g') - echo '#include "pico/pico_int.h"' > /tmp/getoffs.c + echo '#include <stdint.h>' > /tmp/getoffs.c + echo '#include "pico/pico_int.h"' >> /tmp/getoffs.c echo "static const struct $struct p;" >> /tmp/getoffs.c - echo "const int val = (char *)&p.$field - (char*)&p;" >>/tmp/getoffs.c + echo "const int32_t val = (char *)&p.$field - (char*)&p;" >>/tmp/getoffs.c compile_rodata line=$(printf "#define %-20s 0x%04x" $prefix$name $rodata) } if echo $CFLAGS | grep -qe -flto; then CFLAGS="$CFLAGS -fno-lto"; fi # determine endianess -echo "const int val = 1;" >/tmp/getoffs.c +echo '#include <stdint.h>' >/tmp/getoffs.c +echo "const int32_t val = 1;" >>/tmp/getoffs.c compile_rodata ENDIAN=$(if [ "$rodata" -eq 1 ]; then echo be; else echo le; fi) # output header From cb20bbd83906e9b5434a0484c4b9106b2e2de40b Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 13 Dec 2019 18:23:03 +0100 Subject: [PATCH 088/174] sh2 drc: fix speed regression --- cpu/drc/emit_arm.c | 2 +- cpu/drc/emit_arm64.c | 2 +- cpu/drc/emit_mips.c | 2 +- cpu/drc/emit_riscv.c | 2 +- cpu/drc/emit_x86.c | 2 +- cpu/sh2/compiler.h | 32 ++++++++++++++++++++++++-------- cpu/sh2/sh2.h | 1 + pico/32x/memory.c | 10 +++++----- 8 files changed, 35 insertions(+), 18 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 8ea148eb8..af9491f13 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -20,7 +20,7 @@ #define TEMPORARY_REGS { 12, 14 } #define CONTEXT_REG 11 -#define STATIC_SH2_REGS { SHR_SR,10 , SHR_R0,8 , SHR_R0+1,9 } +#define STATIC_SH2_REGS { SHR_SR,10 , SHR_R(0),8 , SHR_R(1),9 } // XXX: tcache_ptr type for SVP and SH2 compilers differs..
#define EMIT_PTR(ptr, x) \ diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 26fede3a6..8d1a7dd1a 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -15,7 +15,7 @@ #define TEMPORARY_REGS { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 } #define CONTEXT_REG 29 -#define STATIC_SH2_REGS { SHR_SR,28 , SHR_R0,27 , SHR_R0+1,26 } +#define STATIC_SH2_REGS { SHR_SR,28 , SHR_R(0),27 , SHR_R(1),26 } // R31 doesn't exist, it aliases either with zero or SP #define SP 31 // stack pointer diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index 765986a6b..8cb094deb 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -17,7 +17,7 @@ #define TEMPORARY_REGS { 2, 3, 8, 9, 10, 11, 12, 13, 14 } // v0-v1,t0-t6 #define CONTEXT_REG 23 // s7 -#define STATIC_SH2_REGS { SHR_SR,22 , SHR_R0,21 , SHR_R0+1,20 } +#define STATIC_SH2_REGS { SHR_SR,22 , SHR_R(0),21 , SHR_R(1),20 } // NB: the ubiquitous JZ74[46]0 uses MIPS32 Release 1, a slight MIPS II superset #ifndef __mips_isa_rev diff --git a/cpu/drc/emit_riscv.c b/cpu/drc/emit_riscv.c index ed45e01ca..90234b229 100644 --- a/cpu/drc/emit_riscv.c +++ b/cpu/drc/emit_riscv.c @@ -16,7 +16,7 @@ #define TEMPORARY_REGS { 5, 6, 7 } // t0-t2 #define CONTEXT_REG 9 // s1 -#define STATIC_SH2_REGS { SHR_SR,27 , SHR_R0,26 , SHR_R0+1,25 } +#define STATIC_SH2_REGS { SHR_SR,27 , SHR_R(0),26 , SHR_R(1),25 } // registers usable for user code: r1-r25, others reserved or special #define Z0 0 // zero register diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 0b3f76970..ec13551e3 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -1072,7 +1072,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define PARAM_REGS { xCX, xDX, xR8, xR9 } #define PRESERVED_REGS { xSI, xDI, xR12, xR13, xR14, xR15, xBX, xBP } #define TEMPORARY_REGS { xAX, xR10, xR11 } -#define STATIC_SH2_REGS { SHR_SR,xBX , SHR_R0,xR15 , SH2_R0+1,xR14 } +#define STATIC_SH2_REGS { SHR_SR,xBX , SHR_R(0),xR15 , SH2_R(1),xR14 } #define 
host_arg2reg(rd, arg) \ switch (arg) { \ diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 804f2a70f..dd37d4707 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -33,34 +33,50 @@ unsigned short scan_block(uint32_t base_pc, int is_slave, uint32_t *base_literals, uint32_t *end_literals); #if defined(DRC_SH2) && defined(__GNUC__) -// direct access to some host CPU registers used by the DRC -// XXX MUST match definitions for SHR_SR in cpu/drc/emit_*.c +// direct access to some host CPU registers used by the DRC if gcc is used. +// XXX MUST match SHR_SR definitions in cpu/drc/emit_*.c; should be moved there +// XXX yuck, there's no portable way to determine register size. Use long long +// if target is 64 bit and data model is ILP32 or LLP64(windows), else long #if defined(__arm__) #define DRC_SR_REG "r10" +#define DRC_REG_LL 0 // 32 bit #elif defined(__aarch64__) #define DRC_SR_REG "r28" +#define DRC_REG_LL (__ILP32__ || _WIN32) #elif defined(__mips__) #define DRC_SR_REG "s6" +#define DRC_REG_LL (_MIPS_SIM == _ABIN32) #elif defined(__riscv__) || defined(__riscv) #define DRC_SR_REG "s11" +#define DRC_REG_LL 0 // no ABI for (__ILP32__ && __riscv_xlen != 32) #elif defined(__i386__) #define DRC_SR_REG "edi" +#define DRC_REG_LL 0 // 32 bit #elif defined(__x86_64__) -#define DRC_SR_REG "ebx" +#define DRC_SR_REG "rbx" +#define DRC_REG_LL (__ILP32__ || _WIN32) #endif #endif #ifdef DRC_SR_REG +// XXX this is more clear but produces too much overhead for slow platforms extern void REGPARM(1) (*sh2_drc_save_sr)(SH2 *sh2); extern void REGPARM(1) (*sh2_drc_restore_sr)(SH2 *sh2); -#define DRC_DECLARE_SR register int32_t sh2_sr asm(DRC_SR_REG) +// NB: sh2_sr MUST have register size if optimizing with -O3 (-fif-conversion) +#if DRC_REG_LL +#define DRC_DECLARE_SR register long long _sh2_sr asm(DRC_SR_REG) +#else +#define DRC_DECLARE_SR register long _sh2_sr asm(DRC_SR_REG) +#endif #define DRC_SAVE_SR(sh2) \ - if (likely((sh2->state & (SH2_STATE_RUN|SH2_STATE_SLEEP)) 
== SH2_STATE_RUN)) \ - sh2_drc_save_sr(sh2) + if (likely((sh2->state&(SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN)) \ + sh2->sr = (s32)_sh2_sr +// sh2_drc_save_sr(sh2) #define DRC_RESTORE_SR(sh2) \ - if (likely((sh2->state & (SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN)) \ - sh2_drc_restore_sr(sh2) + if (likely((sh2->state&(SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN)) \ + _sh2_sr = (s32)sh2->sr +// sh2_drc_restore_sr(sh2) #else #define DRC_DECLARE_SR #define DRC_SAVE_SR(sh2) diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index 2d73db59a..2f2dfd922 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -10,6 +10,7 @@ typedef enum { SHR_GBR, SHR_VBR, SHR_MACH, SHR_MACL, SH2_REGS // register set size } sh2_reg_e; +#define SHR_R(n) (SHR_R0+(n)) typedef struct SH2_ { diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 44bc72d7d..30d9b577c 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -254,14 +254,14 @@ static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2) sh2_poll_rd[hix] = rd; sh2_poll_wr[hix] = wr; } -u32 REGPARM(3) p32x_sh2_poll_memory8(unsigned int a, u32 d, SH2 *sh2) +u32 REGPARM(3) p32x_sh2_poll_memory8(u32 a, u32 d, SH2 *sh2) { int shift = (a & 1 ? 
0 : 8); d = (s8)(p32x_sh2_poll_memory16(a & ~1, d << shift, sh2) >> shift); return d; } -u32 REGPARM(3) p32x_sh2_poll_memory16(unsigned int a, u32 d, SH2 *sh2) +u32 REGPARM(3) p32x_sh2_poll_memory16(u32 a, u32 d, SH2 *sh2) { unsigned char *p = sh2->p_drcblk_ram; unsigned int cycles; @@ -281,7 +281,7 @@ u32 REGPARM(3) p32x_sh2_poll_memory16(unsigned int a, u32 d, SH2 *sh2) return d; } -u32 REGPARM(3) p32x_sh2_poll_memory32(unsigned int a, u32 d, SH2 *sh2) +u32 REGPARM(3) p32x_sh2_poll_memory32(u32 a, u32 d, SH2 *sh2) { unsigned char *p = sh2->p_drcblk_ram; unsigned int cycles; @@ -2017,9 +2017,9 @@ int p32x_sh2_memcpy(u32 dst, u32 src, int count, int size, SH2 *sh2) // ----------------------------------------------------------------- -static void z80_md_bank_write_32x(unsigned int a, unsigned char d) +static void z80_md_bank_write_32x(u32 a, unsigned char d) { - unsigned int addr68k; + u32 addr68k; addr68k = Pico.m.z80_bank68k << 15; addr68k += a & 0x7fff; From 4cd464bbde47055d100cfc27535f2c3932352d5b Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 21 Dec 2019 16:33:52 +0100 Subject: [PATCH 089/174] sh2 drc: optimize T bit handling for A64 --- Makefile | 5 ++-- cpu/drc/emit_arm64.c | 64 +++++++++++++++++++++++++++++--------------- cpu/drc/emit_riscv.c | 5 +--- cpu/sh2/compiler.c | 12 ++++----- 4 files changed, 53 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index a4482d786..ed5fd4b69 100644 --- a/Makefile +++ b/Makefile @@ -42,10 +42,11 @@ endif ifeq ("$(PLATFORM)",$(filter "$(PLATFORM)","gp2x" "opendingux" "rpi1")) # very small caches, avoid optimization options making the binary much bigger -CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp +CFLAGS += -finline-limit=43 -fno-unroll-loops -fno-ipa-cp -ffast-math # this gets you about 20% better execution speed on 32bit arm/mips -CFLAGS += -fno-common -fno-stack-protector -fno-guess-branch-probability -fno-caller-saves -fno-tree-loop-if-convert -ffast-math +CFLAGS += -fno-common 
-fno-stack-protector -fno-guess-branch-probability -fno-caller-saves -fno-tree-loop-if-convert -fno-regmove endif +#OBJS += align.o # default settings ifeq "$(ARCH)" "arm" diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 8d1a7dd1a..2e873161a 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -44,10 +44,11 @@ #define A64_COND_LE 0xd #define A64_COND_CS A64_COND_HS #define A64_COND_CC A64_COND_LO +// "fake" conditions for T bit handling #define A64_COND_AL 0xe #define A64_COND_NV 0xf -/* unified conditions */ +// DRC conditions #define DCOND_EQ A64_COND_EQ #define DCOND_NE A64_COND_NE #define DCOND_MI A64_COND_MI @@ -261,6 +262,13 @@ enum { XT_UXTW=0x4, XT_UXTX=0x6, XT_LSL=0x7, XT_SXTW=0xc, XT_SXTX=0xe }; #define A64_BCOND(cond, offs19) \ A64_INSN(0xa,0x2,_,_,_,_,_,(offs19) >> 2,(cond)) +// conditional select + +#define A64_CINC(cond, rn, rm) \ + A64_INSN(0xd,0x0,0x2,0,rm,(cond)^1,0x1,rm,rn) /* CSINC */ +#define A64_CSET(cond, rn) \ + A64_CINC(cond, rn, Z0) + // load pc-relative #define A64_LDRLIT_IMM(rd, offs19) \ @@ -1356,38 +1364,52 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #ifdef T // T bit handling +static int tcond = -1; + #define emith_invert_cond(cond) \ ((cond) ^ 1) -static void emith_clr_t_cond(int sr) -{ - emith_bic_r_imm(sr, T); -} +#define emith_clr_t_cond(sr) \ + (void)sr -static void emith_set_t_cond(int sr, int cond) -{ - EMITH_SJMP_START(emith_invert_cond(cond)); - emith_or_r_imm_c(cond, sr, T); - EMITH_SJMP_END(emith_invert_cond(cond)); -} +#define emith_set_t_cond(sr, cond) \ + tcond = cond -#define emith_get_t_cond() -1 +#define emith_get_t_cond() \ + tcond -#define emith_sync_t(sr) ((void)sr) +#define emith_invalidate_t() \ + tcond = -1 -#define emith_invalidate_t() +#define emith_set_t(sr, val) \ + tcond = ((val) ? 
A64_COND_AL: A64_COND_NV) -static void emith_set_t(int sr, int val) +static void emith_sync_t(int sr) { - if (val) - emith_or_r_imm(sr, T); - else - emith_bic_r_imm(sr, T); + if (tcond == A64_COND_AL) + emith_or_r_imm(sr, T); + else if (tcond == A64_COND_NV) + emith_bic_r_imm(sr, T); + else if (tcond >= 0) { + int tmp = rcache_get_tmp(); + EMIT(A64_CSET(tcond, tmp)); + EMIT(A64_BFI_IMM(sr, tmp, 0, 1)); // assumes SR.T = bit 0 + rcache_free_tmp(tmp); + } + tcond = -1; } static int emith_tst_t(int sr, int tf) { - emith_tst_r_imm(sr, T); - return tf ? DCOND_NE: DCOND_EQ; + if (tcond < 0) { + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; + } else if (tcond >= A64_COND_AL) { + // MUST sync because A64_COND_AL/NV isn't a real condition + emith_sync_t(sr); + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; + } else + return tf ? tcond : emith_invert_cond(tcond); } #endif diff --git a/cpu/drc/emit_riscv.c b/cpu/drc/emit_riscv.c index 90234b229..69ed530ea 100644 --- a/cpu/drc/emit_riscv.c +++ b/cpu/drc/emit_riscv.c @@ -87,8 +87,6 @@ enum { F1_B, F1_H, F1_W, F1_D, F1_BU, F1_HU, F1_WU }; // LD/ST // func7 enum { F2_ALT=0x20, F2_MULDIV=0x01 }; -#define __(n) o##n // enum marker for "undefined" - #define R5_NOP R5_I_INSN(OP_IMM, F1_ADD, Z0, Z0, 0) // nop: ADDI r0, r0, #0 // arithmetic/logical @@ -687,9 +685,8 @@ static void emith_pool_check(void) static void emith_move_imm(int r, uintptr_t imm) { - u32 lui = imm + _CB(imm,1,11,12); + u32 lui = imm + _CB(imm,1,11,12); // compensate for ADDI sign extension if (lui >> 12) { - // take out the effect of the sign extension of ADDI EMIT(R5_MOVT_IMM(r, lui)); if (imm & 0xfff) EMIT(R5_ADD_IMM(r, r, imm)); diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index ca9a05500..bd3e5b43b 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -446,7 +446,6 @@ static void rcache_free_tmp(int hr); // there must be at least 3 PARAM, and PARAM+TEMPORARY must be at least 4. 
// SR must and R0 should by all means be statically mapped. // XXX the static definition of SR MUST match that in compiler.h -// PC and PR must not be statically mapped (accessed in context by utils). #ifdef __arm__ #include "../drc/emit_arm.c" @@ -3365,7 +3364,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_get_reg_arg(2, SHR_SR, NULL); tmp2 = rcache_get_tmp_arg(0); tmp3 = rcache_get_tmp_arg(1); - tmp4 = rcache_get_tmp_arg(3); + tmp4 = rcache_get_tmp(); emith_move_r_ptr_imm(tmp2, tcache_ptr); emith_move_r_r_ptr(tmp3, CONTEXT_REG); emith_move_r_imm(tmp4, pc); @@ -5049,11 +5048,12 @@ static void sh2_generate_utils(void) emith_add_r_imm(arg2, (u32)(2*sizeof(void *))); emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *)); emith_ctx_write(arg2, offsetof(SH2, rts_cache_idx)); - emith_add_r_r_r_lsl_ptr(arg2, CONTEXT_REG, arg2, 0); - emith_ctx_read(arg3, SHR_PR * 4); + emith_add_r_r_r_lsl_ptr(arg3, CONTEXT_REG, arg2, 0); + rcache_get_reg_arg(2, SHR_PR, NULL); emith_add_r_ret(arg1); - emith_write_r_r_offs_ptr(arg1, arg2, offsetof(SH2, rts_cache)+sizeof(void *)); - emith_write_r_r_offs(arg3, arg2, offsetof(SH2, rts_cache)); + emith_write_r_r_offs_ptr(arg1, arg3, offsetof(SH2, rts_cache)+sizeof(void *)); + emith_write_r_r_offs(arg2, arg3, offsetof(SH2, rts_cache)); + rcache_flush(); emith_ret(); emith_flush(); From 732e6504a3b7c1f77ef888a472edd6007d1d47f1 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 21 Dec 2019 22:54:40 +0100 Subject: [PATCH 090/174] sh2 drc: updates from mame for ym2612 sound --- pico/sound/ym2612.c | 72 +++++++++++++++++++++++++++-------------- pico/sound/ym2612_arm.S | 14 +++++--- platform/gp2x/emu.c | 2 +- 3 files changed, 57 insertions(+), 31 deletions(-) diff --git a/pico/sound/ym2612.c b/pico/sound/ym2612.c index 0867f558b..56408524e 100644 --- a/pico/sound/ym2612.c +++ b/pico/sound/ym2612.c @@ -5,6 +5,8 @@ ** ** SSG-EG was also removed, because it's rarely used, Sega2.doc even does not ** document it 
("proprietary") and tells to write 0 to SSG-EG control register. +** +** updated with fixes from mame 0.216 (file version 1.5.1) (kub) */ /* @@ -148,7 +150,7 @@ void memset32(int *dest, int c, int count); #define FREQ_SH 16 /* 16.16 fixed point (frequency calculations) */ #define EG_SH 16 /* 16.16 fixed point (envelope generator timing) */ -#define LFO_SH 25 /* 7.25 fixed point (LFO calculations) */ +#define LFO_SH 24 /* 8.24 fixed point (LFO calculations) */ #define TIMER_SH 16 /* 16.16 fixed point (timers calculations) */ #define ENV_BITS 10 @@ -287,8 +289,8 @@ O(18),O(18),O(18),O(18),O(18),O(18),O(18),O(18), O(18),O(18),O(18),O(18),O(18),O(18),O(18),O(18), /* rates 00-11 */ -O( 0),O( 1),O( 2),O( 3), -O( 0),O( 1),O( 2),O( 3), +O(18),O(18),O( 0),O( 0), +O( 0),O( 0),O( 2),O( 2), O( 0),O( 1),O( 2),O( 3), O( 0),O( 1),O( 2),O( 3), O( 0),O( 1),O( 2),O( 3), @@ -328,10 +330,10 @@ O(16),O(16),O(16),O(16),O(16),O(16),O(16),O(16) #define O(a) (a*1) static const UINT8 eg_rate_shift[32+64+32]={ /* Envelope Generator counter shifts (32 + 64 rates + 32 RKS) */ /* 32 infinite time rates */ -O(0),O(0),O(0),O(0),O(0),O(0),O(0),O(0), -O(0),O(0),O(0),O(0),O(0),O(0),O(0),O(0), -O(0),O(0),O(0),O(0),O(0),O(0),O(0),O(0), -O(0),O(0),O(0),O(0),O(0),O(0),O(0),O(0), +O(11),O(11),O(11),O(11),O(11),O(11),O(11),O(11), +O(11),O(11),O(11),O(11),O(11),O(11),O(11),O(11), +O(11),O(11),O(11),O(11),O(11),O(11),O(11),O(11), +O(11),O(11),O(11),O(11),O(11),O(11),O(11),O(11), /* rates 00-11 */ O(11),O(11),O(11),O(11), @@ -560,7 +562,13 @@ INLINE void FM_KEYON(int c , int s ) { SLOT->key = 1; SLOT->phase = 0; /* restart Phase Generator */ - SLOT->state = EG_ATT; /* phase -> Attack */ + if (SLOT->ar + SLOT->ksr < 32+62) { + SLOT->state = (SLOT->volume > MIN_ATT_INDEX) ? EG_ATT : + ((SLOT->sl == MIN_ATT_INDEX) ? EG_SUS : EG_DEC); + } else { + SLOT->volume = MIN_ATT_INDEX; + SLOT->state = (SLOT->sl == MIN_ATT_INDEX) ? 
EG_SUS : EG_DEC; + } ym2612.slot_mask |= (1<eg_pack_ar = eg_inc_pack[eg_sel_ar] | (eg_sh_ar<<24); @@ -656,6 +664,9 @@ INLINE void set_sl_rr(FM_SLOT *SLOT, int v) SLOT->sl = sl_table[ v>>4 ]; + if (SLOT->state == EG_DEC && (SLOT->volume >= (INT32)(SLOT->sl))) + SLOT->state = EG_SUS; + SLOT->rr = 34 + ((v&0x0f)<<2); eg_sh_rr = eg_rate_shift [SLOT->rr + SLOT->ksr]; @@ -715,12 +726,12 @@ INLINE int advance_lfo(int lfo_ampm, UINT32 lfo_cnt_old, UINT32 lfo_cnt) if (prev_pos != pos) { lfo_ampm &= 0xff; - /* triangle */ + /* triangle (inverted) */ /* AM: 0 to 126 step +2, 126 to 0 step -2 */ if (pos<64) - lfo_ampm |= ((pos&63) * 2) << 8; /* 0 - 126 */ + lfo_ampm |= ((pos^63) * 2) << 8; /* 0 - 126 */ else - lfo_ampm |= (126 - (pos&63)*2) << 8; + lfo_ampm |= ((pos&63) * 2) << 8; } else { @@ -759,7 +770,7 @@ INLINE void update_eg_phase(UINT16 *vol_out, FM_SLOT *SLOT, UINT32 eg_cnt) if ( volume <= MIN_ATT_INDEX ) { volume = MIN_ATT_INDEX; - SLOT->state = EG_DEC; + SLOT->state = (SLOT->sl == MIN_ATT_INDEX) ? 
EG_SUS: EG_DEC; } break; @@ -1124,22 +1135,29 @@ static int chan_render(int *buffer, int length, int c, UINT32 flags) // flags: s { UINT8 blk; UINT32 fn; - int kc,fc; + int kc,fc,fdt; - blk = block_fnum >> 11; block_fnum = block_fnum*2 + lfo_fn_table_index_offset; - + blk = (block_fnum&0x7000) >> 12; fn = block_fnum & 0xfff; /* keyscale code */ - kc = (blk<<2) | opn_fktable[fn >> 8]; + kc = (blk<<2) | opn_fktable[(fn >> 7) & 0xf]; /* phase increment counter */ - fc = fn_table[fn]>>(7-blk); - - crct.incr1 = ((fc+crct.CH->SLOT[SLOT1].DT[kc])*crct.CH->SLOT[SLOT1].mul) >> 1; - crct.incr2 = ((fc+crct.CH->SLOT[SLOT2].DT[kc])*crct.CH->SLOT[SLOT2].mul) >> 1; - crct.incr3 = ((fc+crct.CH->SLOT[SLOT3].DT[kc])*crct.CH->SLOT[SLOT3].mul) >> 1; - crct.incr4 = ((fc+crct.CH->SLOT[SLOT4].DT[kc])*crct.CH->SLOT[SLOT4].mul) >> 1; + fc = (fn_table[fn]>>(7-blk)); + + fdt = fc + crct.CH->SLOT[SLOT1].DT[kc]; + if (fdt < 0) fdt += fn_table[0x7ff*2] >> 2; + crct.incr1 = (fdt*crct.CH->SLOT[SLOT1].mul) >> 1; + fdt = fc + crct.CH->SLOT[SLOT2].DT[kc]; + if (fdt < 0) fdt += fn_table[0x7ff*2] >> 2; + crct.incr2 = (fdt*crct.CH->SLOT[SLOT2].mul) >> 1; + fdt = fc + crct.CH->SLOT[SLOT3].DT[kc]; + if (fdt < 0) fdt += fn_table[0x7ff*2] >> 2; + crct.incr3 = (fdt*crct.CH->SLOT[SLOT3].mul) >> 1; + fdt = fc + crct.CH->SLOT[SLOT4].DT[kc]; + if (fdt < 0) fdt += fn_table[0x7ff*2] >> 2; + crct.incr4 = (fdt*crct.CH->SLOT[SLOT4].mul) >> 1; } else /* LFO phase modulation = zero */ { @@ -1201,7 +1219,7 @@ INLINE void refresh_fc_eg_slot(FM_SLOT *SLOT, int fc, int kc) else { eg_sh = 0; - eg_sel = 17; + eg_sel = 18; } SLOT->eg_pack_ar = eg_inc_pack[eg_sel] | (eg_sh<<24); @@ -1256,7 +1274,7 @@ static void init_timetables(const UINT8 *dttable) /* DeTune table */ for (d = 0;d <= 3;d++){ for (i = 0;i <= 31;i++){ - rate = ((double)dttable[d*32 + i]) * SIN_LEN * ym2612.OPN.ST.freqbase * (1< Date: Tue, 31 Dec 2019 10:55:40 +0100 Subject: [PATCH 091/174] add DC filter to sound mixer to remove potential PCM DC offset --- 
pico/sound/mix.c | 73 ++++++++++++++++++++++++++++++---------- pico/sound/mix.h | 1 + pico/sound/mix_arm.S | 79 +++++++++++++++++++++++++++++++++++--------- pico/sound/sound.c | 2 ++ 4 files changed, 121 insertions(+), 34 deletions(-) diff --git a/pico/sound/mix.c b/pico/sound/mix.c index 202ba3551..242cb375f 100644 --- a/pico/sound/mix.c +++ b/pico/sound/mix.c @@ -6,41 +6,72 @@ * See COPYING file in the top-level directory. */ +#include "string.h" + #define MAXOUT (+32767) #define MINOUT (-32768) /* limitter */ -#define Limit(val, max,min) { \ - if ( val > max ) val = max; \ - else if ( val < min ) val = min; \ +#define Limit16(val) { \ + val -= (val >> 2); \ + if ((short)val != val) val = (val < 0 ? MINOUT : MAXOUT); \ } int mix_32_to_16l_level; -void mix_32_to_16l_stereo_core(short *dest, int *src, int count, int level) +static struct iir2 { // 2-pole IIR + int x[2]; // sample buffer + int y[2]; // filter intermediates +} lfi2, rfi2; + +// NB ">>" rounds to -infinity, "/" to 0. To compensate the effect possibly use +// "-(-y>>n)" (round to +infinity) instead of "y>>n" in places. + +// NB uses Q12 fixpoint; samples mustn't have more than 20 bits for this. +#define QB 12 + + +// exponential moving average filter for DC filtering +// y[n] = (x[n]-y[n-1])*(1/8192) (corner approx. 
20Hz, gain 1) +static inline int filter_exp(struct iir2 *fi2, int x) { - int l, r; + int xf = (x<<QB) - fi2->y[0]; + fi2->y[0] += xf >> 13; + xf -= xf >> 2; // level reduction to avoid clipping from overshoot + return xf>>QB; +} - for (; count > 0; count--) - { - l = r = *dest; - l += *src++ >> level; - r += *src++ >> level; - Limit( l, MAXOUT, MINOUT ); - Limit( r, MAXOUT, MINOUT ); - *dest++ = l; - *dest++ = r; - } +// unfiltered (for testing) +static inline int filter_null(struct iir2 *fi2, int x) +{ + return x; +} + +#define mix_32_to_16l_stereo_core(dest, src, count, lv, fl) { \ + int l, r; \ + \ + for (; count > 0; count--) \ + { \ + l = r = *dest; \ + l += *src++ >> lv; \ + r += *src++ >> lv; \ + l = fl(&lfi2, l); \ + r = fl(&rfi2, r); \ + Limit16(l); \ + Limit16(r); \ + *dest++ = l; \ + *dest++ = r; \ + } \ } void mix_32_to_16l_stereo_lvl(short *dest, int *src, int count) { - mix_32_to_16l_stereo_core(dest, src, count, mix_32_to_16l_level); + mix_32_to_16l_stereo_core(dest, src, count, mix_32_to_16l_level, filter_exp); } void mix_32_to_16l_stereo(short *dest, int *src, int count) { - mix_32_to_16l_stereo_core(dest, src, count, 0); + mix_32_to_16l_stereo_core(dest, src, count, 0, filter_exp); } void mix_32_to_16_mono(short *dest, int *src, int count) @@ -51,7 +82,8 @@ void mix_32_to_16_mono(short *dest, int *src, int count) { l = *dest; l += *src++; - Limit( l, MAXOUT, MINOUT ); + l = filter_exp(&lfi2, l); + Limit16(l); *dest++ = l; } } @@ -87,3 +119,8 @@ void mix_16h_to_32_s2(int *dest_buf, short *mp3_buf, int count) } } +void mix_reset(void) +{ + memset(&lfi2, 0, sizeof(lfi2)); + memset(&rfi2, 0, sizeof(rfi2)); +} diff --git a/pico/sound/mix.h b/pico/sound/mix.h index b9315114c..e128bad17 100644 --- a/pico/sound/mix.h +++ b/pico/sound/mix.h @@ -8,3 +8,4 @@ void mix_32_to_16_mono(short *dest, int *src, int count); extern int mix_32_to_16l_level; void mix_32_to_16l_stereo_lvl(short *dest, int *src, int count); +void mix_reset(void); diff --git a/pico/sound/mix_arm.S
b/pico/sound/mix_arm.S index 5088e61bb..bb7388d6b 100644 --- a/pico/sound/mix_arm.S +++ b/pico/sound/mix_arm.S @@ -166,13 +166,6 @@ m16_32_s2_no_unal2: @ limit and shift up by 16 @ reg=int_sample, lr=1, r3=tmp, kills flags .macro Limitsh reg -@ movs r4, r3, asr #16 -@ cmnne r4, #1 -@ beq c32_16_no_overflow -@ tst r4, r4 -@ mov r3, #0x8000 -@ subpl r3, r3, #1 - add r3, lr, \reg, asr #15 bics r3, r3, #1 @ in non-overflow conditions r3 is 0 or 1 moveq \reg, \reg, lsl #16 @@ -180,20 +173,30 @@ m16_32_s2_no_unal2: subpl \reg, \reg, #0x00010000 .endm +@ filter out DC offset +@ in=int_sample (max 20 bit), y=filter memory, r3=tmp +.macro DCfilt in y + rsb r3, \y, \in, asl #12 @ fixpoint 20.12 + add \y, \y, r3, asr #13 + sub \in, \in, \y, asr #12 + sub \in, \in, \in, asr #2 @ reduce audio lvl some +.endm @ mix 32bit audio (with 16bits really used, upper bits indicate overflow) with normal 16 bit audio with left channel only @ warning: this function assumes dest is word aligned .global mix_32_to_16l_stereo @ short *dest, int *src, int count mix_32_to_16l_stereo: - stmfd sp!, {r4-r8,lr} - - mov lr, #1 + stmfd sp!, {r4-r8,r10-r11,lr} mov r2, r2, lsl #1 subs r2, r2, #4 bmi m32_16l_st_end + mov lr, #1 + ldr r12, =filter + ldmia r12, {r10-r11} + m32_16l_st_loop: ldmia r0, {r8,r12} ldmia r1!, {r4-r7} @@ -203,6 +206,10 @@ m32_16l_st_loop: add r5, r5, r8, asr #16 add r6, r6, r12,asr #16 add r7, r7, r12,asr #16 + DCfilt r4, r10 + DCfilt r5, r11 + DCfilt r6, r10 + DCfilt r7, r11 Limitsh r4 Limitsh r5 Limitsh r6 @@ -221,13 +228,17 @@ m32_16l_st_end: ldmia r1!,{r4,r5} add r4, r4, r6 add r5, r5, r6 + DCfilt r4, r10 + DCfilt r5, r11 Limitsh r4 Limitsh r5 orr r4, r5, r4, lsr #16 str r4, [r0], #4 m32_16l_st_no_unal2: - ldmfd sp!, {r4-r8,lr} + ldr r12, =filter + stmia r12, {r10-r11} + ldmfd sp!, {r4-r8,r10-r11,lr} bx lr @@ -235,9 +246,11 @@ m32_16l_st_no_unal2: .global mix_32_to_16_mono @ short *dest, int *src, int count mix_32_to_16_mono: - stmfd sp!, {r4-r8,lr} + stmfd sp!, 
{r4-r8,r10-r11,lr} mov lr, #1 + ldr r12, =filter + ldr r10, [r12] @ check if dest is word aligned tst r0, #2 @@ -262,6 +275,10 @@ m32_16_mo_loop: add r7, r7, r12,asr #16 mov r12,r12,lsl #16 add r6, r6, r12,asr #16 + DCfilt r4, r10 + DCfilt r5, r10 + DCfilt r6, r10 + DCfilt r7, r10 Limitsh r4 Limitsh r5 Limitsh r6 @@ -281,6 +298,8 @@ m32_16_mo_end: add r5, r5, r6, asr #16 mov r6, r6, lsl #16 add r4, r4, r6, asr #16 + DCfilt r4, r10 + DCfilt r5, r10 Limitsh r4 Limitsh r5 orr r4, r5, r4, lsr #16 @@ -288,14 +307,18 @@ m32_16_mo_end: m32_16_mo_no_unal2: tst r2, #1 - ldmeqfd sp!, {r4-r8,pc} + beq m32_16_mo_no_unal ldrsh r5, [r0] ldr r4, [r1], #4 add r4, r4, r5 + DCfilt r4, r10 Limit r4 strh r4, [r0], #2 - ldmfd sp!, {r4-r8,lr} +m32_16_mo_no_unal: + ldr r12, =filter + str r10, [r12] + ldmfd sp!, {r4-r8,r10-r11,lr} bx lr @@ -315,11 +338,13 @@ mix_32_to_16l_level: .global mix_32_to_16l_stereo_lvl @ short *dest, int *src, int count mix_32_to_16l_stereo_lvl: - stmfd sp!, {r4-r9,lr} + stmfd sp!, {r4-r11,lr} ldr r9, =mix_32_to_16l_level mov lr, #1 ldr r9, [r9] + ldr r12, =filter + ldm r12, {r10-r11} mov r2, r2, lsl #1 subs r2, r2, #4 @@ -338,6 +363,10 @@ m32_16l_st_l_loop: mov r5, r5, asr r9 mov r6, r6, asr r9 mov r7, r7, asr r9 + DCfilt r4, r10 + DCfilt r5, r11 + DCfilt r6, r10 + DCfilt r7, r11 Limitsh r4 Limitsh r5 Limitsh r6 @@ -358,15 +387,33 @@ m32_16l_st_l_end: add r5, r5, r6 mov r4, r4, asr r9 mov r5, r5, asr r9 + DCfilt r4, r10 + DCfilt r5, r11 Limitsh r4 Limitsh r5 orr r4, r5, r4, lsr #16 str r4, [r0], #4 m32_16l_st_l_no_unal2: - ldmfd sp!, {r4-r9,lr} + ldr r12, =filter + stmia r12, {r10-r11} + ldmfd sp!, {r4-r11,lr} + bx lr + +.global mix_reset @ void +mix_reset: + ldr r0, =filter + mov r1, #0 + str r1, [r0] + str r1, [r0, #4] bx lr +.data + DCfilt r4, r10 + DCfilt r5, r11 +filter: + .ds 8 + #endif /* __GP2X__ */ @ vim:filetype=armasm diff --git a/pico/sound/sound.c b/pico/sound/sound.c index 0d2ae0f55..376fc9a90 100644 --- a/pico/sound/sound.c +++ 
b/pico/sound/sound.c @@ -156,6 +156,8 @@ PICO_INTERNAL void PsndReset(void) // Reset low pass filter lpf_lp = 0; lpf_rp = 0; + + mix_reset(); } From 65ae6dfdc9a4418a6e1ab484f13717bf29ec559e Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 8 Jan 2020 00:49:13 +0100 Subject: [PATCH 092/174] audio: added SSG-EG to YM2612, plus some timing changes for SN76496+YM2612 --- Makefile | 2 +- cpu/drc/emit_arm64.c | 2 +- pico/memory.c | 6 +- pico/pico.h | 2 +- pico/pico_cmn.c | 29 +-- pico/pico_int.h | 7 +- pico/sms.c | 8 +- pico/sound/mix.c | 7 +- pico/sound/mix_arm.S | 6 +- pico/sound/sound.c | 150 ++++++-------- pico/sound/ym2612.c | 367 ++++++++++++++++++++++++----------- pico/sound/ym2612.h | 22 ++- pico/sound/ym2612_arm.S | 420 +++++++++++++++++++++------------------- 13 files changed, 572 insertions(+), 456 deletions(-) diff --git a/Makefile b/Makefile index ed5fd4b69..b38b54c02 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ endif ifeq ("$(PLATFORM)",$(filter "$(PLATFORM)","gp2x" "opendingux" "rpi1")) # very small caches, avoid optimization options making the binary much bigger -CFLAGS += -finline-limit=43 -fno-unroll-loops -fno-ipa-cp -ffast-math +CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp -ffast-math # this gets you about 20% better execution speed on 32bit arm/mips CFLAGS += -fno-common -fno-stack-protector -fno-guess-branch-probability -fno-caller-saves -fno-tree-loop-if-convert -fno-regmove endif diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 2e873161a..f4645bc15 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -1393,7 +1393,7 @@ static void emith_sync_t(int sr) else if (tcond >= 0) { int tmp = rcache_get_tmp(); EMIT(A64_CSET(tcond, tmp)); - EMIT(A64_BFI_IMM(sr, tmp, 0, 1)); // assumes SR.T = bit 0 + EMIT(A64_BFI_IMM(sr, tmp, __builtin_ffs(T)-1, 1)); rcache_free_tmp(tmp); } tcond = -1; diff --git a/pico/memory.c b/pico/memory.c index cc82f7898..9fe3a0852 100644 --- a/pico/memory.c +++ b/pico/memory.c @@ -546,7 
+546,7 @@ static void PicoWrite8_z80(u32 a, u32 d) } if ((a & 0x6000) == 0x4000) { // FM Sound if (PicoIn.opt & POPT_EN_FM) - Pico.m.status |= ym2612_write_local(a & 3, d & 0xff, 0) & 1; + ym2612_write_local(a & 3, d & 0xff, 0); return; } // TODO: probably other VDP access too? Maybe more mirrors? @@ -1059,6 +1059,8 @@ static int ym2612_write_local(u32 a, u32 d, int is_from_z80) break; } + int scanline = get_scanline(is_from_z80); + PsndDoFM(scanline); #ifdef __GP2X__ if (PicoIn.opt & POPT_EXT_FM) return YM2612Write_940(a, d, get_scanline(is_from_z80)); @@ -1224,7 +1226,7 @@ static unsigned char z80_md_bank_read(unsigned short a) static void z80_md_ym2612_write(unsigned int a, unsigned char data) { if (PicoIn.opt & POPT_EN_FM) - Pico.m.status |= ym2612_write_local(a, data, 1) & 1; + ym2612_write_local(a, data, 1); } static void z80_md_vdp_br_write(unsigned int a, unsigned char data) diff --git a/pico/pico.h b/pico/pico.h index fda1c43a8..09249f153 100644 --- a/pico/pico.h +++ b/pico/pico.h @@ -70,7 +70,7 @@ extern void *p32x_bios_g, *p32x_bios_m, *p32x_bios_s; #define POPT_EN_DRC (1<<17) #define POPT_DIS_SPRITE_LIM (1<<18) #define POPT_DIS_IDLE_DET (1<<19) -#define POPT_EN_32X (1<<20) +#define POPT_EN_32X (1<<20) // x0 0000 #define POPT_EN_PWM (1<<21) #define POPT_PWM_IRQ_OPT (1<<22) diff --git a/pico/pico_cmn.c b/pico/pico_cmn.c index 1f89da905..5fa0b16f2 100644 --- a/pico/pico_cmn.c +++ b/pico/pico_cmn.c @@ -88,7 +88,6 @@ static void do_timing_hacks_vb(void) static int PicoFrameHints(void) { struct PicoVideo *pv = &Pico.video; - int line_sample = Pico.m.pal ? 68 : 93; int vdp_slots = (Pico.video.reg[12] & 1) ? 
18 : 16; int lines, y, lines_vis, skip; int vcnt_wrap, vcnt_adj; @@ -150,23 +149,6 @@ static int PicoFrameHints(void) } } - // get samples from sound chips - if ((y == 224 || y == line_sample) && PicoIn.sndOut) - { - cycles = SekCyclesDone(); - - if (Pico.m.z80Run && !Pico.m.z80_reset && (PicoIn.opt&POPT_EN_Z80)) - PicoSyncZ80(cycles); -#ifdef PICO_CD - if (PicoIn.AHW & PAHW_MCD) - pcd_sync_s68k(cycles, 0); -#endif -#ifdef PICO_32X - p32x_sync_sh2s(cycles); -#endif - PsndGetSamples(y); - } - // Run scanline: Pico.t.m68c_line_start = Pico.t.m68c_aim; do_timing_hacks_as(pv, vdp_slots); @@ -238,10 +220,6 @@ static int PicoFrameHints(void) p32x_start_blank(); #endif - // get samples from sound chips - if (y == 224 && PicoIn.sndOut) - PsndGetSamples(y); - // Run scanline: CPUS_RUN(CYCLES_M68K_LINE - CYCLES_M68K_VINT_LAG); @@ -298,7 +276,7 @@ static int PicoFrameHints(void) pv->status |= ((pv->reg[1] >> 3) ^ SR_VB) & SR_VB; // forced blanking // last scanline - Pico.m.scanline = y; + Pico.m.scanline = y++; pv->v_counter = 0xff; pv->lwrite_cnt = 0; @@ -337,6 +315,11 @@ static int PicoFrameHints(void) #ifdef PICO_32X p32x_sync_sh2s(cycles); #endif + + // get samples from sound chips + if (PicoIn.sndOut) + PsndGetSamples(y); + timers_cycle(); pv->hint_cnt = hint; diff --git a/pico/pico_int.h b/pico/pico_int.h index 0fc458efb..d3da72ce4 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -336,7 +336,7 @@ struct PicoMisc unsigned char eeprom_cycle; // EEPROM cycle number unsigned char eeprom_slave; // EEPROM slave word for X24C02 and better SRAMs unsigned char eeprom_status; - unsigned char status; // rapid_ym2612, multi_ym_updates + unsigned char pad1; // was ym2612 status unsigned short dma_xfers; // 18 unsigned char eeprom_wb[2]; // EEPROM latch/write buffer unsigned int frame_count; // 1c for movies and idle det @@ -433,6 +433,8 @@ struct PicoSound int len_e_cnt; short dac_line; short psg_line; + unsigned int fm_mult; // samples per line in Q16 + unsigned int fm_pos; // 
last FM position in Q16 }; // run tools/mkoffsets pico/pico_int_offs.h if you change these @@ -872,9 +874,10 @@ PICO_INTERNAL void PsndReset(void); PICO_INTERNAL void PsndStartFrame(void); PICO_INTERNAL void PsndDoDAC(int line_to); PICO_INTERNAL void PsndDoPSG(int line_to); +PICO_INTERNAL void PsndDoFM(int line_to); PICO_INTERNAL void PsndClear(void); PICO_INTERNAL void PsndGetSamples(int y); -PICO_INTERNAL void PsndGetSamplesMS(void); +PICO_INTERNAL void PsndGetSamplesMS(int y); // sms.c #ifndef NO_SMS diff --git a/pico/sms.c b/pico/sms.c index 2800e2094..b016f197b 100644 --- a/pico/sms.c +++ b/pico/sms.c @@ -320,16 +320,12 @@ void PicoFrameMS(void) } } - // 224 because of how it's done for MD... - if (y == 224 && PicoIn.sndOut) - PsndGetSamplesMS(); - cycles_aim += cycles_line; cycles_done += z80_run((cycles_aim - cycles_done) >> 8) << 8; } - if (PicoIn.sndOut && Pico.snd.psg_line < lines) - PsndDoPSG(lines - 1); + if (PicoIn.sndOut) + PsndGetSamplesMS(lines); } void PicoFrameDrawOnlyMS(void) diff --git a/pico/sound/mix.c b/pico/sound/mix.c index 242cb375f..4b4bbdd81 100644 --- a/pico/sound/mix.c +++ b/pico/sound/mix.c @@ -12,16 +12,15 @@ #define MINOUT (-32768) /* limitter */ -#define Limit16(val) { \ - val -= (val >> 2); \ - if ((short)val != val) val = (val < 0 ? MINOUT : MAXOUT); \ -} +#define Limit16(val) \ + if ((short)val != val) val = (val < 0 ? MINOUT : MAXOUT) int mix_32_to_16l_level; static struct iir2 { // 2-pole IIR int x[2]; // sample buffer int y[2]; // filter intermediates + int i; } lfi2, rfi2; // NB ">>" rounds to -infinity, "/" to 0. 
To compensate the effect possibly use diff --git a/pico/sound/mix_arm.S b/pico/sound/mix_arm.S index bb7388d6b..104b30655 100644 --- a/pico/sound/mix_arm.S +++ b/pico/sound/mix_arm.S @@ -400,6 +400,8 @@ m32_16l_st_l_no_unal2: ldmfd sp!, {r4-r11,lr} bx lr +#endif /* __GP2X__ */ + .global mix_reset @ void mix_reset: ldr r0, =filter @@ -409,11 +411,7 @@ mix_reset: bx lr .data - DCfilt r4, r10 - DCfilt r5, r11 filter: .ds 8 -#endif /* __GP2X__ */ - @ vim:filetype=armasm diff --git a/pico/sound/sound.c b/pico/sound/sound.c index 376fc9a90..3c948436e 100644 --- a/pico/sound/sound.c +++ b/pico/sound/sound.c @@ -98,52 +98,17 @@ void (*low_pass_filter)(int *buf32, int length) = low_pass_filter_stereo; static void dac_recalculate(void) { int lines = Pico.m.pal ? 313 : 262; - int mid = Pico.m.pal ? 68 : 93; - int i, dac_cnt, pos, len; + int i, pos; - if (Pico.snd.len <= lines) - { - // shrinking algo - dac_cnt = -Pico.snd.len; - len=1; pos=0; - dac_info[225] = 1; - - for(i=226; i != 225; i++) - { - if (i >= lines) i = 0; - if(dac_cnt < 0) { - pos++; - dac_cnt += lines; - } - dac_cnt -= Pico.snd.len; - dac_info[i] = pos; - } - } - else + pos = 0; // Q16 + + for(i = 0; i <= lines; i++) { - // stretching - dac_cnt = Pico.snd.len; - pos=0; - for(i = 225; i != 224; i++) - { - if (i >= lines) i = 0; - len=0; - while(dac_cnt >= 0) { - dac_cnt -= lines; - len++; - } - if (i == mid) // midpoint - while(pos+len < Pico.snd.len/2) { - dac_cnt -= lines; - len++; - } - dac_cnt += Pico.snd.len; - pos += len; - dac_info[i] = pos; - } + dac_info[i] = ((pos+(1<<15)) >> 16); // round to nearest + pos += Pico.snd.fm_mult; } - for (i = lines; i < sizeof(dac_info) / sizeof(dac_info[0]); i++) - dac_info[i] = dac_info[0]; + for (i = lines+1; i < sizeof(dac_info) / sizeof(dac_info[0]); i++) + dac_info[i] = dac_info[i-1]; } @@ -166,6 +131,7 @@ void PsndRerate(int preserve_state) { void *state = NULL; int target_fps = Pico.m.pal ? 50 : 60; + int target_lines = Pico.m.pal ? 
313 : 262; if (preserve_state) { state = malloc(0x204); @@ -192,6 +158,9 @@ void PsndRerate(int preserve_state) Pico.snd.len_e_add = ((PicoIn.sndRate - Pico.snd.len * target_fps) << 16) / target_fps; Pico.snd.len_e_cnt = 0; + // samples per line + Pico.snd.fm_mult = 65536.0 * PicoIn.sndRate / (target_fps*target_lines); + // recalculate dac info dac_recalculate(); @@ -223,8 +192,7 @@ PICO_INTERNAL void PsndStartFrame(void) } Pico.snd.dac_line = Pico.snd.psg_line = 0; - Pico.m.status &= ~1; - dac_info[224] = Pico.snd.len_use; + Pico.snd.fm_pos = 0; } PICO_INTERNAL void PsndDoDAC(int line_to) @@ -233,9 +201,6 @@ PICO_INTERNAL void PsndDoDAC(int line_to) int dout = ym2612.dacout; int line_from = Pico.snd.dac_line; - if (line_to >= 313) - line_to = 312; - pos = dac_info[line_from]; pos1 = dac_info[line_to + 1]; len = pos1 - pos; @@ -262,14 +227,9 @@ PICO_INTERNAL void PsndDoPSG(int line_to) int pos, pos1, len; int stereo = 0; - if (line_to >= 313) - line_to = 312; - pos = dac_info[line_from]; pos1 = dac_info[line_to + 1]; len = pos1 - pos; - //elprintf(EL_STATUS, "%3d %3d %3d %3d %3d", - // pos, pos1, len, line_from, line_to); if (len <= 0) return; @@ -285,6 +245,34 @@ PICO_INTERNAL void PsndDoPSG(int line_to) SN76496Update(PicoIn.sndOut + pos, len, stereo); } +PICO_INTERNAL void PsndDoFM(int line_to) +{ + int pos, len; + int stereo = 0; + + // Q16, number of samples to fill in buffer + len = ((line_to-1) * Pico.snd.fm_mult) - Pico.snd.fm_pos; + + // don't do this too often (no more than 256 per sec) + if (len >> 16 <= PicoIn.sndRate >> 9) + return; + + // update position and calculate buffer offset and length + pos = Pico.snd.fm_pos >> 16; + Pico.snd.fm_pos += len; + len = (Pico.snd.fm_pos >> 16) - pos; + + // fill buffer + if (PicoIn.opt & POPT_EN_STEREO) { + stereo = 1; + pos <<= 1; + } + if (PicoIn.opt & POPT_EN_FM) + YM2612UpdateOne(PsndBuffer + pos, len, stereo, 1); + else + memset32(PsndBuffer + pos, 0, len<> 3; + int fmlen = (Pico.snd.fm_pos >> 16) - offset; 
offset <<= stereo; + buf32 = PsndBuffer+offset; pprof_start(sound); @@ -362,14 +351,15 @@ static int PsndRender(int offset, int length) return length; } - // Add in the stereo FM buffer - if (PicoIn.opt & POPT_EN_FM) { - buf32_updated = YM2612UpdateOne(buf32, length, stereo, 1); - } else - memset32(buf32, 0, length< 0) { + int *fmbuf = buf32 + (fmlen << stereo); + if (PicoIn.opt & POPT_EN_FM) + YM2612UpdateOne(fmbuf, length-fmlen, stereo, 1); + else + memset32(fmbuf, 0, (length-fmlen)< max ) val = max; \ - else if ( val < min ) val = min; \ -} - - /* TL_TAB_LEN is calculated as: * 13 - sinus amplitude bits (Y axis) * 2 - sinus sign bit (Y axis) @@ -289,8 +281,8 @@ O(18),O(18),O(18),O(18),O(18),O(18),O(18),O(18), O(18),O(18),O(18),O(18),O(18),O(18),O(18),O(18), /* rates 00-11 */ -O(18),O(18),O( 0),O( 0), -O( 0),O( 0),O( 2),O( 2), +O(18),O(18),O( 2),O( 3), +O( 0),O( 1),O( 2),O( 3), O( 0),O( 1),O( 2),O( 3), O( 0),O( 1),O( 2),O( 3), O( 0),O( 1),O( 2),O( 3), @@ -554,6 +546,13 @@ INLINE void set_timers( int v ) ym2612.OPN.ST.status &= ~1; } +INLINE void recalc_volout(FM_SLOT *SLOT) +{ + INT16 vol_out = SLOT->volume; + if ((SLOT->ssg&0x0c) == 0x0c) + vol_out = (0x200 - SLOT->volume) & MAX_ATT_INDEX; + SLOT->vol_out = vol_out + SLOT->tl; +} INLINE void FM_KEYON(int c , int s ) { @@ -562,13 +561,15 @@ INLINE void FM_KEYON(int c , int s ) { SLOT->key = 1; SLOT->phase = 0; /* restart Phase Generator */ + SLOT->ssg ^= SLOT->ssgn; + SLOT->ssgn = 0; + SLOT->state = (SLOT->sl == MIN_ATT_INDEX) ? EG_SUS : EG_DEC; if (SLOT->ar + SLOT->ksr < 32+62) { - SLOT->state = (SLOT->volume > MIN_ATT_INDEX) ? EG_ATT : - ((SLOT->sl == MIN_ATT_INDEX) ? EG_SUS : EG_DEC); + if (SLOT->volume > MIN_ATT_INDEX) SLOT->state = EG_ATT; } else { SLOT->volume = MIN_ATT_INDEX; - SLOT->state = (SLOT->sl == MIN_ATT_INDEX) ? 
EG_SUS : EG_DEC; } + recalc_volout(SLOT); ym2612.slot_mask |= (1<key ) { SLOT->key = 0; - if (SLOT->state>EG_REL) + if (SLOT->state>EG_REL) { SLOT->state = EG_REL;/* phase -> Release */ + if (SLOT->ssg&0x08) { + if (SLOT->ssg&0x04) + SLOT->volume = (0x200 - SLOT->volume); + if (SLOT->volume >= 0x200) { + SLOT->volume = MAX_ATT_INDEX; + SLOT->state = EG_OFF; + } + } + } + SLOT->vol_out = SLOT->volume + SLOT->tl; } } @@ -597,12 +608,15 @@ INLINE void set_det_mul(FM_CH *CH, FM_SLOT *SLOT, int v) INLINE void set_tl(FM_SLOT *SLOT, int v) { SLOT->tl = (v&0x7f)<<(ENV_BITS-7); /* 7bit TL */ + if (SLOT->state > EG_REL) + recalc_volout(SLOT); } /* set attack rate & key scale */ INLINE void set_ar_ksr(FM_CH *CH, FM_SLOT *SLOT, int v) { UINT8 old_KSR = SLOT->KSR; + int eg_sh_ar, eg_sel_ar; SLOT->ar = (v&0x1f) ? 32 + ((v&0x1f)<<1) : 0; @@ -611,24 +625,20 @@ INLINE void set_ar_ksr(FM_CH *CH, FM_SLOT *SLOT, int v) { CH->SLOT[SLOT1].Incr=-1; } + + /* refresh Attack rate */ + if ((SLOT->ar + SLOT->ksr) < 32+62) + { + eg_sh_ar = eg_rate_shift [SLOT->ar + SLOT->ksr ]; + eg_sel_ar = eg_rate_select[SLOT->ar + SLOT->ksr ]; + } else { - int eg_sh_ar, eg_sel_ar; - - /* refresh Attack rate */ - if ((SLOT->ar + SLOT->ksr) < 32+62) - { - eg_sh_ar = eg_rate_shift [SLOT->ar + SLOT->ksr ]; - eg_sel_ar = eg_rate_select[SLOT->ar + SLOT->ksr ]; - } - else - { - eg_sh_ar = 0; - eg_sel_ar = 18; - } - - SLOT->eg_pack_ar = eg_inc_pack[eg_sel_ar] | (eg_sh_ar<<24); + eg_sh_ar = 0; + eg_sel_ar = 18; } + + SLOT->eg_pack_ar = eg_inc_pack[eg_sel_ar] | (eg_sh_ar<<24); } /* set decay rate */ @@ -750,7 +760,7 @@ INLINE int advance_lfo(int lfo_ampm, UINT32 lfo_cnt_old, UINT32 lfo_cnt) return lfo_ampm; } -INLINE void update_eg_phase(UINT16 *vol_out, FM_SLOT *SLOT, UINT32 eg_cnt) +INLINE void update_eg_phase(FM_SLOT *SLOT, UINT32 eg_cnt) { INT32 volume = SLOT->volume; UINT32 pack = SLOT->eg_pack[SLOT->state - 1]; @@ -763,44 +773,113 @@ INLINE void update_eg_phase(UINT16 *vol_out, FM_SLOT *SLOT, UINT32 eg_cnt) 
eg_inc_val = pack >> ((eg_cnt >> shift) & 7) * 3; eg_inc_val = (1 << (eg_inc_val & 7)) >> 1; - switch (SLOT->state) - { - case EG_ATT: /* attack phase */ - volume += ( ~volume * eg_inc_val ) >> 4; - if ( volume <= MIN_ATT_INDEX ) + if (SLOT->ssg&0x08) { + switch (SLOT->state) { - volume = MIN_ATT_INDEX; - SLOT->state = (SLOT->sl == MIN_ATT_INDEX) ? EG_SUS: EG_DEC; - } - break; + case EG_ATT: /* attack phase */ + volume += ( ~volume * eg_inc_val ) >> 4; + if ( volume <= MIN_ATT_INDEX ) + { + volume = MIN_ATT_INDEX; + SLOT->state = (SLOT->sl == MIN_ATT_INDEX) ? EG_SUS: EG_DEC; + } + break; - case EG_DEC: /* decay phase */ - volume += eg_inc_val; - if ( volume >= (INT32) SLOT->sl ) - SLOT->state = EG_SUS; - break; + case EG_DEC: /* decay phase */ + if (volume < 0x200) + volume += 4*eg_inc_val; + if ( volume >= (INT32) SLOT->sl ) + SLOT->state = EG_SUS; + break; - case EG_SUS: /* sustain phase */ - volume += eg_inc_val; - if ( volume >= MAX_ATT_INDEX ) - { - volume = MAX_ATT_INDEX; - /* do not change SLOT->state (verified on real chip) */ + case EG_SUS: /* sustain phase */ + if (volume < 0x200) + volume += 4*eg_inc_val; + break; + + case EG_REL: /* release phase */ + if (volume < 0x200) + volume += 4*eg_inc_val; + if ( volume >= 0x200 ) + { + volume = MAX_ATT_INDEX; + SLOT->state = EG_OFF; + } + break; } - break; - case EG_REL: /* release phase */ - volume += eg_inc_val; - if ( volume >= MAX_ATT_INDEX ) + SLOT->vol_out = volume + SLOT->tl; + if ((SLOT->ssg&0x04) && (SLOT->state > EG_REL)) + SLOT->vol_out = ((0x200 - volume) & MAX_ATT_INDEX) + SLOT->tl; + } else { + switch (SLOT->state) { - volume = MAX_ATT_INDEX; - SLOT->state = EG_OFF; + case EG_ATT: /* attack phase */ + volume += ( ~volume * eg_inc_val ) >> 4; + if ( volume <= MIN_ATT_INDEX ) + { + volume = MIN_ATT_INDEX; + SLOT->state = (SLOT->sl == MIN_ATT_INDEX) ? 
EG_SUS: EG_DEC; + } + break; + + case EG_DEC: /* decay phase */ + volume += eg_inc_val; + if ( volume >= (INT32) SLOT->sl ) + SLOT->state = EG_SUS; + break; + + case EG_SUS: /* sustain phase */ + volume += eg_inc_val; + if ( volume >= MAX_ATT_INDEX ) + { + volume = MAX_ATT_INDEX; + /* do not change SLOT->state (verified on real chip) */ + } + break; + + case EG_REL: /* release phase */ + volume += eg_inc_val; + if ( volume >= MAX_ATT_INDEX ) + { + volume = MAX_ATT_INDEX; + SLOT->state = EG_OFF; + } + break; } - break; - } + SLOT->vol_out = volume + SLOT->tl; + } SLOT->volume = volume; - *vol_out = SLOT->tl + volume; /* tl is 7bit<<3, volume 0-1023 (0-2039 total) */ +} + +INLINE void update_ssg_eg_phase(FM_SLOT *SLOT) +{ + if (SLOT->ssg&0x01) { + if (SLOT->ssg&0x02) { + SLOT->ssg ^= SLOT->ssgn ^ 4; + SLOT->ssgn = 4; + } + + if (SLOT->state != EG_ATT && !(SLOT->ssg&0x04)) + SLOT->volume = MAX_ATT_INDEX; + } else { + if (SLOT->ssg&0x02) { + SLOT->ssg ^= 4; + SLOT->ssgn ^= 4; + } else + SLOT->phase = 0; + + if (SLOT->state != EG_ATT) { + SLOT->state = (SLOT->sl == MIN_ATT_INDEX) ? 
EG_SUS : EG_DEC; + if (SLOT->ar + SLOT->ksr < 32+62) { + if (SLOT->volume > MIN_ATT_INDEX) SLOT->state = EG_ATT; + } else { + SLOT->volume = MIN_ATT_INDEX; + } + } + } + recalc_volout(SLOT); } #endif @@ -846,6 +925,16 @@ static void chan_render_loop(chan_rend_context *ct, int *buffer, int length) { int smp = 0; /* produced sample */ unsigned int eg_out, eg_out2, eg_out4; + FM_SLOT *SLOT; + + SLOT = &ct->CH->SLOT[SLOT1]; + if ((SLOT->ssg&0x08) && SLOT->state > EG_REL && SLOT->volume >= 0x200) update_ssg_eg_phase(SLOT); + SLOT = &ct->CH->SLOT[SLOT2]; + if ((SLOT->ssg&0x08) && SLOT->state > EG_REL && SLOT->volume >= 0x200) update_ssg_eg_phase(SLOT); + SLOT = &ct->CH->SLOT[SLOT3]; + if ((SLOT->ssg&0x08) && SLOT->state > EG_REL && SLOT->volume >= 0x200) update_ssg_eg_phase(SLOT); + SLOT = &ct->CH->SLOT[SLOT4]; + if ((SLOT->ssg&0x08) && SLOT->state > EG_REL && SLOT->volume >= 0x200) update_ssg_eg_phase(SLOT); if (ct->pack & 8) { /* LFO enabled ? (test Earthworm Jim in between demo 1 and 2) */ ct->pack = (ct->pack&0xffff) | (advance_lfo(ct->pack >> 16, ct->lfo_cnt, ct->lfo_cnt + ct->lfo_inc) << 16); @@ -857,12 +946,58 @@ static void chan_render_loop(chan_rend_context *ct, int *buffer, int length) { ct->eg_timer -= EG_TIMER_OVERFLOW; ct->eg_cnt++; - - if (ct->CH->SLOT[SLOT1].state != EG_OFF) update_eg_phase(&ct->vol_out1, &ct->CH->SLOT[SLOT1], ct->eg_cnt); - if (ct->CH->SLOT[SLOT2].state != EG_OFF) update_eg_phase(&ct->vol_out2, &ct->CH->SLOT[SLOT2], ct->eg_cnt); - if (ct->CH->SLOT[SLOT3].state != EG_OFF) update_eg_phase(&ct->vol_out3, &ct->CH->SLOT[SLOT3], ct->eg_cnt); - if (ct->CH->SLOT[SLOT4].state != EG_OFF) update_eg_phase(&ct->vol_out4, &ct->CH->SLOT[SLOT4], ct->eg_cnt); + if (ct->eg_cnt >= 4096) ct->eg_cnt = 1; + + SLOT = &ct->CH->SLOT[SLOT1]; + SLOT->vol_ipol = SLOT->vol_out; + if (SLOT->state != EG_OFF) update_eg_phase(SLOT, ct->eg_cnt); + SLOT = &ct->CH->SLOT[SLOT2]; + SLOT->vol_ipol = SLOT->vol_out; + if (SLOT->state != EG_OFF) update_eg_phase(SLOT, ct->eg_cnt); 
+ SLOT = &ct->CH->SLOT[SLOT3]; + SLOT->vol_ipol = SLOT->vol_out; + if (SLOT->state != EG_OFF) update_eg_phase(SLOT, ct->eg_cnt); + SLOT = &ct->CH->SLOT[SLOT4]; + SLOT->vol_ipol = SLOT->vol_out; + if (SLOT->state != EG_OFF) update_eg_phase(SLOT, ct->eg_cnt); + } +#if 0 + UINT32 ifrac0 = ct->eg_timer / (EG_TIMER_OVERFLOW>>EG_SH); + UINT32 ifrac1 = (1<CH->SLOT[SLOT1]; + ct->vol_out1 = (SLOT->vol_ipol*ifrac1 + SLOT->vol_out*ifrac0) >> EG_SH; + SLOT = &ct->CH->SLOT[SLOT2]; + ct->vol_out2 = (SLOT->vol_ipol*ifrac1 + SLOT->vol_out*ifrac0) >> EG_SH; + SLOT = &ct->CH->SLOT[SLOT3]; + ct->vol_out3 = (SLOT->vol_ipol*ifrac1 + SLOT->vol_out*ifrac0) >> EG_SH; + SLOT = &ct->CH->SLOT[SLOT4]; + ct->vol_out4 = (SLOT->vol_ipol*ifrac1 + SLOT->vol_out*ifrac0) >> EG_SH; +#else + switch (ct->eg_timer >> EG_SH) + { + case 0: + ct->vol_out1 = ct->CH->SLOT[SLOT1].vol_ipol; + ct->vol_out2 = ct->CH->SLOT[SLOT2].vol_ipol; + ct->vol_out3 = ct->CH->SLOT[SLOT3].vol_ipol; + ct->vol_out4 = ct->CH->SLOT[SLOT4].vol_ipol; + break; + case (EG_TIMER_OVERFLOW>>EG_SH)-1: + ct->vol_out1 = ct->CH->SLOT[SLOT1].vol_out; + ct->vol_out2 = ct->CH->SLOT[SLOT2].vol_out; + ct->vol_out3 = ct->CH->SLOT[SLOT3].vol_out; + ct->vol_out4 = ct->CH->SLOT[SLOT4].vol_out; + break; + default: + ct->vol_out1 = (ct->CH->SLOT[SLOT1].vol_ipol + + ct->CH->SLOT[SLOT1].vol_out) >> 1; + ct->vol_out2 = (ct->CH->SLOT[SLOT2].vol_ipol + + ct->CH->SLOT[SLOT2].vol_out) >> 1; + ct->vol_out3 = (ct->CH->SLOT[SLOT3].vol_ipol + + ct->CH->SLOT[SLOT3].vol_out) >> 1; + ct->vol_out4 = (ct->CH->SLOT[SLOT4].vol_ipol + + ct->CH->SLOT[SLOT4].vol_out) >> 1; } +#endif if (ct->pack & 4) continue; /* output disabled */ @@ -892,7 +1027,7 @@ static void chan_render_loop(chan_rend_context *ct, int *buffer, int length) if (ct->pack & (1<<(SLOT4+8))) eg_out4 += add; } - switch( ct->CH->ALGO ) + switch( ct->algo&0x7 ) { case 0: { @@ -1086,6 +1221,33 @@ static void chan_render_finish(void) ym2612.OPN.lfo_cnt = crct.lfo_cnt; } +static UINT32 update_lfo_phase(FM_SLOT 
*SLOT, UINT32 block_fnum) +{ + UINT32 fnum_lfo; + INT32 lfo_fn_table_index_offset; + UINT8 blk; + UINT32 fn; + int fc,fdt; + + fnum_lfo = ((block_fnum & 0x7f0) >> 4) * 32 * 8; + lfo_fn_table_index_offset = lfo_pm_table[ fnum_lfo + crct.CH->pms + ((crct.pack>>16)&0xff) ]; + if (lfo_fn_table_index_offset) /* LFO phase modulation active */ + { + block_fnum = block_fnum*2 + lfo_fn_table_index_offset; + blk = (block_fnum&0x7000) >> 12; + fn = block_fnum & 0xfff; + + /* phase increment counter */ + fc = (fn_table[fn]>>(7-blk)); + + fdt = fc + SLOT->DT[crct.CH->kcode]; + if (fdt < 0) fdt += fn_table[0x7ff*2] >> 2; + + return (fdt * SLOT->mul) >> 1; + } else + return SLOT->Incr; +} + static int chan_render(int *buffer, int length, int c, UINT32 flags) // flags: stereo, ?, disabled, ?, pan_r, pan_l { crct.CH = &ym2612.CH[c]; @@ -1114,58 +1276,22 @@ static int chan_render(int *buffer, int length, int c, UINT32 flags) // flags: s crct.phase3 = crct.CH->SLOT[SLOT3].phase; crct.phase4 = crct.CH->SLOT[SLOT4].phase; - /* current output from EG circuit (without AM from LFO) */ - crct.vol_out1 = crct.CH->SLOT[SLOT1].tl + ((UINT32)crct.CH->SLOT[SLOT1].volume); - crct.vol_out2 = crct.CH->SLOT[SLOT2].tl + ((UINT32)crct.CH->SLOT[SLOT2].volume); - crct.vol_out3 = crct.CH->SLOT[SLOT3].tl + ((UINT32)crct.CH->SLOT[SLOT3].volume); - crct.vol_out4 = crct.CH->SLOT[SLOT4].tl + ((UINT32)crct.CH->SLOT[SLOT4].volume); - crct.op1_out = crct.CH->op1_out; crct.algo = crct.CH->ALGO & 7; - if(crct.CH->pms) + if(crct.CH->pms && (ym2612.OPN.ST.mode & 0xC0) && c == 2) { + /* 3 slot mode */ + crct.incr1 = update_lfo_phase(&crct.CH->SLOT[SLOT1], ym2612.OPN.SL3.block_fnum[1]); + crct.incr2 = update_lfo_phase(&crct.CH->SLOT[SLOT2], ym2612.OPN.SL3.block_fnum[2]); + crct.incr3 = update_lfo_phase(&crct.CH->SLOT[SLOT3], ym2612.OPN.SL3.block_fnum[0]); + crct.incr4 = update_lfo_phase(&crct.CH->SLOT[SLOT4], crct.CH->block_fnum); + } + else if(crct.CH->pms) { - /* add support for 3 slot mode */ - UINT32 block_fnum = 
crct.CH->block_fnum; - - UINT32 fnum_lfo = ((block_fnum & 0x7f0) >> 4) * 32 * 8; - INT32 lfo_fn_table_index_offset = lfo_pm_table[ fnum_lfo + crct.CH->pms + ((crct.pack>>16)&0xff) ]; - - if (lfo_fn_table_index_offset) /* LFO phase modulation active */ - { - UINT8 blk; - UINT32 fn; - int kc,fc,fdt; - - block_fnum = block_fnum*2 + lfo_fn_table_index_offset; - blk = (block_fnum&0x7000) >> 12; - fn = block_fnum & 0xfff; - - /* keyscale code */ - kc = (blk<<2) | opn_fktable[(fn >> 7) & 0xf]; - /* phase increment counter */ - fc = (fn_table[fn]>>(7-blk)); - - fdt = fc + crct.CH->SLOT[SLOT1].DT[kc]; - if (fdt < 0) fdt += fn_table[0x7ff*2] >> 2; - crct.incr1 = (fdt*crct.CH->SLOT[SLOT1].mul) >> 1; - fdt = fc + crct.CH->SLOT[SLOT2].DT[kc]; - if (fdt < 0) fdt += fn_table[0x7ff*2] >> 2; - crct.incr2 = (fdt*crct.CH->SLOT[SLOT2].mul) >> 1; - fdt = fc + crct.CH->SLOT[SLOT3].DT[kc]; - if (fdt < 0) fdt += fn_table[0x7ff*2] >> 2; - crct.incr3 = (fdt*crct.CH->SLOT[SLOT3].mul) >> 1; - fdt = fc + crct.CH->SLOT[SLOT4].DT[kc]; - if (fdt < 0) fdt += fn_table[0x7ff*2] >> 2; - crct.incr4 = (fdt*crct.CH->SLOT[SLOT4].mul) >> 1; - } - else /* LFO phase modulation = zero */ - { - crct.incr1 = crct.CH->SLOT[SLOT1].Incr; - crct.incr2 = crct.CH->SLOT[SLOT2].Incr; - crct.incr3 = crct.CH->SLOT[SLOT3].Incr; - crct.incr4 = crct.CH->SLOT[SLOT4].Incr; - } + crct.incr1 = update_lfo_phase(&crct.CH->SLOT[SLOT1], crct.CH->block_fnum); + crct.incr2 = update_lfo_phase(&crct.CH->SLOT[SLOT2], crct.CH->block_fnum); + crct.incr3 = update_lfo_phase(&crct.CH->SLOT[SLOT3], crct.CH->block_fnum); + crct.incr4 = update_lfo_phase(&crct.CH->SLOT[SLOT4], crct.CH->block_fnum); } else /* no LFO phase modulation */ { @@ -1297,8 +1423,13 @@ static void reset_channels(FM_CH *CH) CH[c].fc = 0; for(s = 0 ; s < 4 ; s++ ) { + CH[c].SLOT[s].Incr = -1; + CH[c].SLOT[s].key = 0; + CH[c].SLOT[s].phase = 0; + CH[c].SLOT[s].ssg = CH[c].SLOT[s].ssgn = 0; CH[c].SLOT[s].state= EG_OFF; CH[c].SLOT[s].volume = MAX_ATT_INDEX; + 
CH[c].SLOT[s].vol_out = MAX_ATT_INDEX; } CH[c].mem_value = CH[c].op1_out = 0; } @@ -1503,8 +1634,10 @@ static int OPNWriteReg(int r, int v) break; case 0x90: /* SSG-EG */ - // removed. - ret = 0; + SLOT->ssg = v&0x0f; + SLOT->ssg ^= SLOT->ssgn; + if (SLOT->state > EG_REL) + recalc_volout(SLOT); break; case 0xa0: diff --git a/pico/sound/ym2612.h b/pico/sound/ym2612.h index bbe6b1a48..3a1ea7a9b 100644 --- a/pico/sound/ym2612.h +++ b/pico/sound/ym2612.h @@ -53,6 +53,11 @@ typedef struct }; UINT32 eg_pack[4]; }; + + UINT8 ssg; /* 0x30 SSG-EG waveform */ + UINT8 ssgn; + UINT16 vol_out; /* 0x32 current output from EG (without LFO) */ + UINT16 vol_ipol; /* 0x34 interpolator memory */ } FM_SLOT; @@ -176,21 +181,22 @@ int YM2612PicoStateLoad2(int *tat, int *tbt); #else /* GP2X specific */ #include "../../platform/gp2x/940ctl.h" -#define YM2612Init(baseclock,rate) { \ +#define YM2612Init(baseclock,rate) do { \ if (PicoIn.opt&POPT_EXT_FM) YM2612Init_940(baseclock, rate); \ else YM2612Init_(baseclock, rate); \ -} -#define YM2612ResetChip() { \ +} while (0) +#define YM2612ResetChip() do { \ if (PicoIn.opt&POPT_EXT_FM) YM2612ResetChip_940(); \ else YM2612ResetChip_(); \ -} -#define YM2612UpdateOne(buffer,length,stereo,is_buf_empty) \ +} while (0) +#define YM2612UpdateOne(buffer,length,stereo,is_buf_empty) do { \ (PicoIn.opt&POPT_EXT_FM) ? 
YM2612UpdateOne_940(buffer, length, stereo, is_buf_empty) : \ - YM2612UpdateOne_(buffer, length, stereo, is_buf_empty); -#define YM2612PicoStateLoad() { \ + YM2612UpdateOne_(buffer, length, stereo, is_buf_empty); \ +} while (0) +#define YM2612PicoStateLoad() do { \ if (PicoIn.opt&POPT_EXT_FM) YM2612PicoStateLoad_940(); \ else YM2612PicoStateLoad_(); \ -} +} while (0) #endif /* __GP2X__ */ diff --git a/pico/sound/ym2612_arm.S b/pico/sound/ym2612_arm.S index 9b807928a..86e5f1c07 100644 --- a/pico/sound/ym2612_arm.S +++ b/pico/sound/ym2612_arm.S @@ -1,6 +1,7 @@ /* * PicoDrive * (C) notaz, 2006 + * (C) kub, 2020 added SSG-EG and simple output rate interpolation * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. @@ -18,7 +19,7 @@ .equiv SLOT2, 2 .equiv SLOT3, 1 .equiv SLOT4, 3 -.equiv SLOT_STRUCT_SIZE, 0x30 +.equiv SLOT_STRUCT_SIZE, 0x38 .equiv TL_TAB_LEN, 0x1A00 @@ -28,11 +29,11 @@ .equiv EG_REL, 1 .equiv EG_OFF, 0 -.equiv EG_SH, 16 @ 16.16 fixed point (envelope generator timing) +.equiv EG_SH, 16 @ 16.16 fixed point (envelope generator timing) .equiv EG_TIMER_OVERFLOW, (3*(1<= (INT32) SLOT->sl ) + strgeb r3, [r5,#0x17] @ state + b 10f + +4: @ EG_ATT + subs r3, r3, #1 @ eg_inc_val_shift - 1 + mvnpl r2, r0 + movpl r2, r2, lsl r3 + addpl r0, r0, r2, asr #4 + cmp r0, #0 @ if (volume <= MIN_ATT_INDEX) + bgt 10f + ldr r2, [r5,#0x1c] + mov r0, #0 + cmp r2, #0 + movne r3, #EG_DEC + moveq r3, #EG_SUS + strb r3, [r5,#0x17] @ state + b 10f + +1: @ EG_REL + mov r2, #0x200 + cmp r0, r2 @ if ( volume >= 0x200 ) + movge r0, #1024 + subge r0, #1 + movge r3, #EG_OFF + strgeb r3, [r5,#0x17] @ state + +10: @ finish + strh r0, [r5,#0x1a] @ volume + ldrb r2, [r5,#0x30] @ ssg + ldrb r3, [r5,#0x17] @ state + cmp r2, #0x0c @ if ( ssg&0x04 && state > EG_REL ) + cmpge r3, #EG_REL+1 + rsbge r0, r0, #0x200 @ volume = (0x200-volume) & MAX_ATT + lslge r0, r0, #10 + lsrge r0, r0, #10 + +11: + ldrh r3, [r5,#0x18] @ tl + add r0, r0, r3 @ volume 
+= tl + strh r0, [r5,#0x32] @ vol_out .if \slot == SLOT1 mov r6, r6, lsr #16 - add r0, r0, r3 orr r6, r0, r6, lsl #16 .elseif \slot == SLOT2 mov r6, r6, lsl #16 - add r0, r0, r3 mov r0, r0, lsl #16 orr r6, r0, r6, lsr #16 .elseif \slot == SLOT3 mov r7, r7, lsr #16 - add r0, r0, r3 orr r7, r0, r7, lsl #16 .elseif \slot == SLOT4 mov r7, r7, lsl #16 - add r0, r0, r3 mov r0, r0, lsl #16 orr r7, r0, r7, lsr #16 .endif @@ -137,6 +202,63 @@ 0: @ EG_OFF .endm +@ r5=slot, trashes: r0,r2,r3 +.macro update_ssg_eg + ldrh r0, [r5,#0x30] @ ssg+ssgn + ldrb r2, [r5,#0x17] @ state + ldrh r3, [r5,#0x1a] @ volume + tst r0, #0x08 @ ssg enabled? + beq 9f + cmp r2, #EG_REL @ state > EG_REL? + ble 9f + cmp r3, #0x200 @ volume >= 0x200? + blt 9f + + tst r0, #0x01 + beq 1f + + tst r0, #0x02 + eorne r0, r0, lsr #8 @ ssg ^= ssgn ^ 4 + eorne r0, r0, #0x4 + orrne r0, r0, #0x400 @ ssgn = 4 + strneh r0, [r5,#0x30] + + eor r0, r0, #0x4 @ if ( !(ssg&0x04 ) + tst r0, #0x4 + cmpne r2, #EG_ATT @ if ( state != EG_ATT ) + movne r0, #0x400 + subne r0, r0, #1 + strneh r0, [r5,#0x1a] @ volume = MAX_ATT + b 9f + +1: tst r0, #0x02 + eorne r0, r0, #0x4 @ ssg ^= 4 + eorne r0, r0, #0x400 @ ssgn ^= 4 + strneh r0, [r5,#0x30] + moveq r3, #0 + streq r3, [r5,#0x0c] @ phase = 0 + + cmp r2, #EG_ATT @ if ( state != EG_ATT ) + beq 9f + + ldr r3, [r5,#0x1c] @ sl + mov r2, #EG_SUS @ state = sl==MIN_ATT ? 
EG_SUS:EG_DEC + cmp r3, #0 + + ldr r0, [r5,#0x04] @ ar + ldr r3, [r5,#0x14] @ ksr + movne r2, #EG_DEC + add r0, r0, r3 + cmp r0, #32+62 @ if ( ar+ksr >= 32+62 ) + ldrlt r0, [r5,#0x1a] + movge r0, #0 + strgeh r0, [r5,#0x1a] @ volume = MIN_ATT + + cmp r0, #0 + movgt r2, #EG_ATT + strb r2, [r5,#0x17] @ state +9: +.endm @ r12=lfo_ampm[31:16], r1=lfo_cnt_old, r2=lfo_cnt, r3=scratch .macro advance_lfo_m @@ -532,187 +654,6 @@ .endm -/* -.global update_eg_phase @ FM_SLOT *SLOT, UINT32 eg_cnt - -update_eg_phase: - stmfd sp!, {r5,r6} - mov r5, r0 @ slot - ldrh r3, [r5,#0x18] @ tl - ldrh r6, [r5,#0x1a] @ volume - add r6, r6, r3 - update_eg_phase_slot SLOT1 - mov r0, r6 - ldmfd sp!, {r5,r6} - bx lr -.pool - - -.global advance_lfo @ int lfo_ampm, UINT32 lfo_cnt_old, UINT32 lfo_cnt - -advance_lfo: - mov r12, r0, lsl #16 - advance_lfo_m - mov r0, r12, lsr #16 - bx lr -.pool - - -.global upd_algo0 @ chan_rend_context *c -upd_algo0: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - PIC_LDR(r3, ip, ym_sin_tab) - PIC_LDR(r5, ip, ym_tl_tab) - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_algo0_m - - ldmfd sp!, {r4-r10,pc} -.pool - - -.global upd_algo1 @ chan_rend_context *c -upd_algo1: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - PIC_LDR(r3, ip, ym_sin_tab) - PIC_LDR(r5, ip, ym_tl_tab) - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_algo1_m - - ldmfd sp!, {r4-r10,pc} -.pool - - -.global upd_algo2 @ chan_rend_context *c -upd_algo2: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - PIC_LDR(r3, ip, ym_sin_tab) - PIC_LDR(r5, ip, ym_tl_tab) - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_algo2_m - - ldmfd sp!, {r4-r10,pc} -.pool - - -.global upd_algo3 @ chan_rend_context *c -upd_algo3: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - PIC_LDR(r3, ip, ym_sin_tab) - PIC_LDR(r5, ip, ym_tl_tab) - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_algo3_m - - ldmfd sp!, {r4-r10,pc} -.pool - - -.global upd_algo4 @ 
chan_rend_context *c -upd_algo4: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - PIC_LDR(r3, ip, ym_sin_tab) - PIC_LDR(r5, ip, ym_tl_tab) - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_algo4_m - - ldmfd sp!, {r4-r10,pc} -.pool - - -.global upd_algo5 @ chan_rend_context *c -upd_algo5: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - PIC_LDR(r3, ip, ym_sin_tab) - PIC_LDR(r5, ip, ym_tl_tab) - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_algo5_m - - ldmfd sp!, {r4-r10,pc} -.pool - - -.global upd_algo6 @ chan_rend_context *c -upd_algo6: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - PIC_LDR(r3, ip, ym_sin_tab) - PIC_LDR(r5, ip, ym_tl_tab) - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_algo6_m - - ldmfd sp!, {r4-r10,pc} -.pool - - -.global upd_algo7 @ chan_rend_context *c -upd_algo7: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - PIC_LDR(r3, ip, ym_sin_tab) - PIC_LDR(r5, ip, ym_tl_tab) - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_algo7_m - - ldmfd sp!, {r4-r10,pc} -.pool - - -.global upd_slot1 @ chan_rend_context *c -upd_slot1: - stmfd sp!, {r4-r10,lr} - mov lr, r0 - - PIC_LDR(r3, ip, ym_sin_tab) - PIC_LDR(r5, ip, ym_tl_tab) - ldmia lr, {r6-r7} - ldr r10, [lr, #0x54] - ldr r12, [lr, #0x4c] - - upd_slot1_m - str r10, [lr, #0x38] - - ldmfd sp!, {r4-r10,pc} -.pool -*/ - - @ lr=context, r12=pack (stereo, lastchan, disabled, lfo_enabled | pan_r, pan_l, ams[2] | AMmasks[4] | FB[4] | lfo_ampm[16]) @ r0-r2=scratch, r3=sin_tab/scratch, r4=(length<<8)|unused[4],was_update,algo[3], r5=tl_tab/slot, @ r6-r7=vol_out[4], r8=eg_timer, r9=eg_timer_add[31:16], r10=op1_out, r11=buffer @@ -730,14 +671,21 @@ chan_render_loop: add r0, lr, #0x44 ldmia r0, {r8,r9} @ eg_timer, eg_timer_add ldr r10, [lr, #0x54] @ op1_out - ldmia lr, {r6,r7} @ load volumes +@ ldmia lr, {r6,r7} @ load volumes + ldr r5, [lr, #0x40] @ CH + ldrh r6, [r5, #0x32] @ vol_out values for all slots + ldrh r2, [r5, #0x32+SLOT_STRUCT_SIZE*2] 
+ ldrh r7, [r5, #0x32+SLOT_STRUCT_SIZE] + ldrh r3, [r5, #0x32+SLOT_STRUCT_SIZE*3] + orr r6, r6, r2, lsl #16 + orr r7, r7, r3, lsl #16 tst r12, #8 @ lfo? beq crl_loop crl_loop_lfo: add r0, lr, #0x30 - ldmia r0, {r1,r2} + ldmia r0, {r1,r2} @ lfo_cnt, lfo_inc subs r4, r4, #0x100 bmi crl_loop_end @@ -754,15 +702,29 @@ crl_loop: subs r4, r4, #0x100 bmi crl_loop_end + @ -- SSG -- + add r0, lr, #0x3c + ldmia r0, {r1,r5} @ eg_cnt, CH + + @ r5=slot, trashes: r0,r2,r3 + update_ssg_eg + add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT2 (2) + update_ssg_eg + sub r5, r5, #SLOT_STRUCT_SIZE @ SLOT3 (1) + update_ssg_eg + add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT4 (3) + update_ssg_eg + sub r5, r5, #SLOT_STRUCT_SIZE*3 + @ -- EG -- add r8, r8, r9 cmp r8, #EG_TIMER_OVERFLOW bcc eg_done - add r0, lr, #0x3c - ldmia r0, {r1,r5} @ eg_cnt, CH eg_loop: sub r8, r8, #EG_TIMER_OVERFLOW add r1, r1, #1 + cmp r1, #4096 + movge r1, #1 @ SLOT1 (0) @ r5=slot, r1=eg_cnt, trashes: r0,r2,r3 update_eg_phase_slot SLOT1 @@ -774,8 +736,8 @@ eg_loop: update_eg_phase_slot SLOT4 cmp r8, #EG_TIMER_OVERFLOW - subcs r5, r5, #SLOT_STRUCT_SIZE*3 - bcs eg_loop + sub r5, r5, #SLOT_STRUCT_SIZE*3 + bhs eg_loop str r1, [lr, #0x3c] eg_done: @@ -787,6 +749,66 @@ eg_done: cmp r0, #0x4 beq crl_loop + @ output interpolation +#if 0 + @ basic interpolator, interpolate in middle region, else use closer value + mov r3, r8, lsr #EG_SH @ eg_timer, [0..3<>EG_SH)/2 + bgt 0f @ mix is vol_out + + ldrh r0, [r5,#0x34] @ SLOT1 vol_ipol + lsleq r2, r6, #16 + addeq r0, r0, r2, lsr #16 + lsreq r0, r0, #1 + mov r6, r6, lsr #16 + orr r6, r0, r6, lsl #16 + + ldrh r0, [r5,#0x34+SLOT_STRUCT_SIZE*2] @ SLOT2 vol_ipol + addeq r0, r0, r6, lsr #16 + lsreq r0, r0, #1 + mov r6, r6, lsl #16 + orr r6, r6, r0 + ror r6, r6, #16 + + ldrh r0, [r5,#0x34+SLOT_STRUCT_SIZE] @ SLOT3 vol_ipol + lsleq r2, r7, #16 + addeq r0, r0, r2, lsr #16 + lsreq r0, r0, #1 + mov r7, r7, lsr #16 + orr r7, r0, r7, lsl #16 + + ldrh r0, [r5,#0x34+SLOT_STRUCT_SIZE*3] @ SLOT4 vol_ipol + addeq r0, 
r0, r7, lsr #16 + lsreq r0, r0, #1 + mov r7, r7, lsl #16 + orr r7, r7, r0 + ror r7, r7, #16 +#elif 0 + @ super-basic... just take value closest to sample point + mov r3, r8, lsr #EG_SH-1 @ eg_timer, [0..3<>EG_SH) + bgt 0f @ mix is vol_out + + ldrh r0, [r5,#0x34] @ SLOT1 vol_ipol + mov r6, r6, lsr #16 + orr r6, r0, r6, lsl #16 + + ldrh r0, [r5,#0x34+SLOT_STRUCT_SIZE*2] @ SLOT2 vol_ipol + mov r6, r6, lsl #16 + orr r6, r6, r0 + ror r6, r6, #16 + + ldrh r0, [r5,#0x34+SLOT_STRUCT_SIZE] @ SLOT3 vol_ipol + mov r7, r7, lsr #16 + orr r7, r0, r7, lsl #16 + + ldrh r0, [r5,#0x34+SLOT_STRUCT_SIZE*3] @ SLOT4 vol_ipol + mov r7, r7, lsl #16 + orr r7, r7, r0 + ror r7, r7, #16 +#endif +0: + @ -- SLOT1 -- PIC_LDR(r3, r2, ym_tl_tab) From 44a60b45939b774750edbf95ffebe9baafde28ac Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 14 Jan 2020 22:49:03 +0100 Subject: [PATCH 093/174] bug fixes in drc, audio, display --- cpu/sh2/compiler.c | 15 ++------- cpu/sh2/compiler.h | 1 - pico/32x/32x.c | 1 - pico/32x/sh2soc.c | 14 ++++++--- pico/draw2.c | 13 ++++++++ pico/draw2_arm.S | 23 +++++++++++--- pico/pico.h | 1 + pico/sound/mix_arm.S | 6 ++-- pico/sound/sound.c | 24 ++++++-------- pico/sound/ym2612.c | 37 ++++++++++++++++------ pico/sound/ym2612.h | 5 +-- pico/sound/ym2612_arm.S | 66 ++++++++++++++++++--------------------- platform/common/dismips.c | 8 +++-- platform/linux/emu.c | 5 ++- 14 files changed, 128 insertions(+), 91 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index bd3e5b43b..043204241 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -703,8 +703,8 @@ static void add_to_hashlist(struct block_entry *be, int tcache_id) #if (DRC_DEBUG & 2) if (be->next != NULL) { - printf(" %08x: entry hash collision with %08x\n", - be->pc, be->next->pc); + printf(" %08x@%p: entry hash collision with %08x@%p\n", + be->pc, be->tcache_ptr, be->next->pc, be->next->tcache_ptr); hash_collisions++; } #endif @@ -5323,7 +5323,7 @@ int sh2_execute_drc(SH2 *sh2c, int cycles) // TODO: 
irq cycles ret_cycles = (int32_t)sh2c->sr >> 12; if (ret_cycles > 0) - dbg(1, "warning: drc returned with cycles: %d", ret_cycles); + dbg(1, "warning: drc returned with cycles: %d, pc %08x", ret_cycles, sh2c->pc); sh2c->sr &= 0x3f3; return ret_cycles; @@ -5506,10 +5506,6 @@ void sh2_drc_mem_setup(SH2 *sh2) sh2->p_drcblk_ram = Pico32xMem->drcblk_ram; } -void sh2_drc_frame(void) -{ -} - int sh2_drc_init(SH2 *sh2) { int i; @@ -5716,8 +5712,6 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, else if ((lowest_mova && lowest_mova <= pc) || (lowest_literal && lowest_literal <= pc)) break; // text area collides with data area - else if ((op_flags[i] & OF_BTARGET) && dr_get_entry(pc, is_slave, &i_end)) - break; // branch target already compiled op = FETCH_OP(pc); switch ((op & 0xf000) >> 12) @@ -6497,9 +6491,6 @@ u16 scan_block(u32 base_pc, int is_slave, u8 *op_flags, u32 *end_pc_out, last_btarget = 0; op = 0; // delay/poll insns counter for (i = 0, pc = base_pc; i < i_end; i++, pc += 2) { - int null; - if ((op_flags[i] & OF_BTARGET) && dr_get_entry(pc, is_slave, &null)) - break; // branch target already compiled opd = &ops[i]; crc += FETCH_OP(pc); diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index dd37d4707..00a8707b9 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -6,7 +6,6 @@ void sh2_drc_wcheck_da(uint32_t a, unsigned len, SH2 *sh2); #ifdef DRC_SH2 void sh2_drc_mem_setup(SH2 *sh2); void sh2_drc_flush_all(void); -void sh2_drc_frame(void); #else #define sh2_drc_mem_setup(x) #define sh2_drc_flush_all() diff --git a/pico/32x/32x.c b/pico/32x/32x.c index 896b5aa1a..aa45ba7bd 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -580,7 +580,6 @@ void PicoFrame32x(void) PicoFrameStart(); PicoFrameHints(); - sh2_drc_frame(); elprintf(EL_32X, "poll: %02x %02x %02x", Pico32x.emu_flags & 3, msh2.state, ssh2.state); diff --git a/pico/32x/sh2soc.c b/pico/32x/sh2soc.c index dd834bfbe..cf11666dd 100644 --- a/pico/32x/sh2soc.c +++ 
b/pico/32x/sh2soc.c @@ -137,11 +137,15 @@ static void dmac_memcpy(struct dma_chan *chan, SH2 *sh2) if (!up || chan->tcr < 4) return; - // XXX Mars Check Program fills a 64K buffer, then copies 32K longwords from - // DRAM to SDRAM in 4-longword mode, which is 128K. This overwrites a comm - // area in SDRAM, which is why the check fails. - // Is this a buswidth mismatch problem? As a kludge, usw 16-bit width xfers - if (size == 3 && (chan->sar & 0xdf000000) == 0x04000000) size = 1; +#if MARS_CHECK_HACK + // XXX Mars Check Program copies 32K longwords (128KB) from a 64KB buffer in + // ROM or DRAM to SDRAM in 4-longword mode, overwriting an SDRAM comm area in + // turn, which crashes the test on emulators without CPU cache emulation. + // This may be a bug in Mars Check. As a kludge limit the transfer to 64KB, + // which is what the check program test uses for checking the result. + // A better way would clearly be to have a mechanism to patch the ROM... + if (size == 3 && chan->tcr == 32768 && chan->dar == 0x06020000) size = 1; +#endif if (size == 3) size = 2; // 4-word xfer mode still counts in words // XXX check TCR being a multiple of 4 in 4-word xfer mode? // XXX check alignment of sar/dar, generating a bus error if unaligned? 
diff --git a/pico/draw2.c b/pico/draw2.c index f0e0518e7..38a90ef3b 100644 --- a/pico/draw2.c +++ b/pico/draw2.c @@ -157,6 +157,8 @@ static void DrawWindowFull(int start, int end, int prio, struct PicoEState *est) { nametab=(pvid->reg[3]&0x3e)<<9; // 32-cell mode nametab_step = 1<<5; + if (!(PicoIn.opt&POPT_DIS_32C_BORDER)) + scrpos += 32; } nametab += nametab_step*start; @@ -240,6 +242,8 @@ static void DrawLayerFull(int plane, int *hcache, int planestart, int planeend, else nametab=(pvid->reg[4]&0x07)<<12; // B scrpos = est->Draw2FB; + if (!(pvid->reg[12]&1) && !(PicoIn.opt&POPT_DIS_32C_BORDER)) + scrpos += 32; scrpos+=8*LINE_WIDTH*(planestart-START_ROW); // Get vertical scroll value: @@ -315,6 +319,8 @@ static void DrawTilesFromCacheF(int *hc, struct PicoEState *est) short blank=-1; // The tile we know is blank unsigned char *scrpos = est->Draw2FB, *pd = 0; + if (!(Pico.video.reg[12]&1) && !(PicoIn.opt&POPT_DIS_32C_BORDER)) + scrpos += 32; // *hcache++ = code|(dx<<16)|(trow<<27); // cache it scrpos+=(*hc++)*LINE_WIDTH - START_ROW*LINE_WIDTH*8; @@ -377,6 +383,8 @@ static void DrawSpriteFull(unsigned int *sprite, struct PicoEState *est) while(sy <= START_ROW*8) { sy+=8; tile+=tdeltay; height--; } scrpos = est->Draw2FB; + if (!(Pico.video.reg[12]&1) && !(PicoIn.opt&POPT_DIS_32C_BORDER)) + scrpos += 32; scrpos+=(sy-START_ROW*8)*LINE_WIDTH; for (; height > 0; height--, sy+=8, tile+=tdeltay) @@ -502,6 +510,11 @@ static void DrawDisplayFull(void) maxw = 264; maxcolc = 32; } + // 32C border for centering? (for asm) + est->rendstatus &= ~PDRAW_BORDER_32; + if ((est->rendstatus&PDRAW_32_COLS) && !(PicoIn.opt&POPT_DIS_32C_BORDER)) + est->rendstatus |= PDRAW_BORDER_32; + // horizontal window? 
if ((win=pvid->reg[0x12])) { diff --git a/pico/draw2_arm.S b/pico/draw2_arm.S index 6b094495a..ded0d5a5a 100644 --- a/pico/draw2_arm.S +++ b/pico/draw2_arm.S @@ -414,7 +414,10 @@ DrawLayerFull: ldr r11,[sp, #9*4] @ est sub r4, r9, #(START_ROW<<24) + ldr r7, [r11, #OFS_EST_rendstatus] ldr r11, [r11, #OFS_EST_Draw2FB] + tst r7, #0x100 @ H32 border mode? + addne r11, r11, #32 mov r4, r4, asr #24 mov r7, #328*8 mla r11, r4, r7, r11 @ scrpos+=8*328*(planestart-START_ROW); @@ -590,8 +593,11 @@ DrawTilesFromCacheF: mov r9, #0xff000000 @ r9=prevcode=-1 mvn r6, #0 @ r6=prevy=-1 + ldr r7, [r1, #OFS_EST_rendstatus] ldr r4, [r1, #OFS_EST_Draw2FB] ldr r2, [r0], #4 @ read y offset + tst r7, #0x100 @ H32 border mode? + addne r4, r4, #32 mov r7, #328 mla r2, r7, r2, r4 sub r12, r2, #(328*8*START_ROW) @ r12=scrpos @@ -688,13 +694,18 @@ DrawWindowFull: ldr r4, [r11, #OFS_Pico_video_reg+12] mov r5, #1 @ nametab_step + ldr r11, [r3, #OFS_EST_Draw2FB] tst r4, #1 @ 40 cell mode? andne r12, r12, #0xf000 @ 0x3c<<10 - andeq r12, r12, #0xf800 movne r5, r5, lsl #7 - moveq r5, r5, lsl #6 @ nametab_step - - and r4, r0, #0xff + bne 0f + ldr r7, [r3, #OFS_EST_rendstatus] + and r12, r12, #0xf800 + mov r5, r5, lsl #6 @ nametab_step + tst r7, #0x100 + addne r11, r11, #32 @ center screen in H32 mode + +0: and r4, r0, #0xff mla r12, r5, r4, r12 @ nametab += nametab_step*start; ldr r10, [r3, #OFS_EST_PicoMem_vram] @@ -715,7 +726,6 @@ DrawWindowFull: mov r9, #0xff000000 @ r9=prevcode=-1 - ldr r11, [r3, #OFS_EST_Draw2FB] and r4, r0, #0xff add r11, r11, #328*8 sub r4, r4, #START_ROW @@ -915,8 +925,11 @@ DrawSpriteFull: and r3, lr, #0x6000 mov r3, r3, lsr #9 @ r3=pal=((code>>9)&0x30); + ldr r0, [r1, #OFS_EST_rendstatus] ldr r11, [r1, #OFS_EST_Draw2FB] ldr r10, [r1, #OFS_EST_PicoMem_vram] + tst r0, #0x100 @ H32 border mode? 
+ addne r11, r11, #32 sub r1, r12, #(START_ROW*8) mov r0, #328 mla r11, r1, r0, r11 @ scrpos+=(sy-START_ROW*8)*328; diff --git a/pico/pico.h b/pico/pico.h index 09249f153..6f231cf08 100644 --- a/pico/pico.h +++ b/pico/pico.h @@ -206,6 +206,7 @@ void PicoDoHighPal555(int sh, int line, struct PicoEState *est); #define PDRAW_PLANE_HI_PRIO (1<<6) // have layer with all hi prio tiles (mk3) #define PDRAW_SHHI_DONE (1<<7) // layer sh/hi already processed #define PDRAW_32_COLS (1<<8) // 32 column mode +#define PDRAW_BORDER_32 (1<<9) // center H32 in buffer (32 px border) extern int rendstatus_old; extern int rendlines; diff --git a/pico/sound/mix_arm.S b/pico/sound/mix_arm.S index 104b30655..a1558d743 100644 --- a/pico/sound/mix_arm.S +++ b/pico/sound/mix_arm.S @@ -176,10 +176,10 @@ m16_32_s2_no_unal2: @ filter out DC offset @ in=int_sample (max 20 bit), y=filter memory, r3=tmp .macro DCfilt in y - rsb r3, \y, \in, asl #12 @ fixpoint 20.12 + rsb r3, \y, \in, lsl #12 @ fixpoint 20.12 add \y, \y, r3, asr #13 - sub \in, \in, \y, asr #12 - sub \in, \in, \in, asr #2 @ reduce audio lvl some + sub r3, r3, r3, asr #2 @ reduce audio lvl some + asr \in, r3, #12 .endm @ mix 32bit audio (with 16bits really used, upper bits indicate overflow) with normal 16 bit audio with left channel only diff --git a/pico/sound/sound.c b/pico/sound/sound.c index 3c948436e..98b4bf2e8 100644 --- a/pico/sound/sound.c +++ b/pico/sound/sound.c @@ -104,7 +104,7 @@ static void dac_recalculate(void) for(i = 0; i <= lines; i++) { - dac_info[i] = ((pos+(1<<15)) >> 16); // round to nearest + dac_info[i] = ((pos+0x8000) >> 16); // round to nearest pos += Pico.snd.fm_mult; } for (i = lines+1; i < sizeof(dac_info) / sizeof(dac_info[0]); i++) @@ -156,10 +156,10 @@ void PsndRerate(int preserve_state) // calculate Pico.snd.len Pico.snd.len = PicoIn.sndRate / target_fps; Pico.snd.len_e_add = ((PicoIn.sndRate - Pico.snd.len * target_fps) << 16) / target_fps; - Pico.snd.len_e_cnt = 0; + Pico.snd.len_e_cnt = 0; // Q16 - 
// samples per line - Pico.snd.fm_mult = 65536.0 * PicoIn.sndRate / (target_fps*target_lines); + // samples per line (Q16) + Pico.snd.fm_mult = 65536LL * PicoIn.sndRate / (target_fps*target_lines); // recalculate dac info dac_recalculate(); @@ -250,7 +250,7 @@ PICO_INTERNAL void PsndDoFM(int line_to) int pos, len; int stereo = 0; - // Q16, number of samples to fill in buffer + // Q16, number of samples since last call len = ((line_to-1) * Pico.snd.fm_mult) - Pico.snd.fm_pos; // don't do this too often (no more than 256 per sec) @@ -258,9 +258,9 @@ PICO_INTERNAL void PsndDoFM(int line_to) return; // update position and calculate buffer offset and length - pos = Pico.snd.fm_pos >> 16; + pos = (Pico.snd.fm_pos+0x8000) >> 16; Pico.snd.fm_pos += len; - len = (Pico.snd.fm_pos >> 16) - pos; + len = ((Pico.snd.fm_pos+0x8000) >> 16) - pos; // fill buffer if (PicoIn.opt & POPT_EN_STEREO) { @@ -269,8 +269,6 @@ PICO_INTERNAL void PsndDoFM(int line_to) } if (PicoIn.opt & POPT_EN_FM) YM2612UpdateOne(PsndBuffer + pos, len, stereo, 1); - else - memset32(PsndBuffer + pos, 0, len<> 3; - int fmlen = (Pico.snd.fm_pos >> 16) - offset; + int fmlen = ((Pico.snd.fm_pos+0x8000) >> 16) - offset; offset <<= stereo; buf32 = PsndBuffer+offset; @@ -356,15 +356,11 @@ static int PsndRender(int offset, int length) int *fmbuf = buf32 + (fmlen << stereo); if (PicoIn.opt & POPT_EN_FM) YM2612UpdateOne(fmbuf, length-fmlen, stereo, 1); - else - memset32(fmbuf, 0, (length-fmlen)<ssg ^= SLOT->ssgn; SLOT->ssgn = 0; SLOT->state = (SLOT->sl == MIN_ATT_INDEX) ? EG_SUS : EG_DEC; - if (SLOT->ar + SLOT->ksr < 32+62) { + if (SLOT->ar_ksr < 32+62) { if (SLOT->volume > MIN_ATT_INDEX) SLOT->state = EG_ATT; } else { SLOT->volume = MIN_ATT_INDEX; @@ -619,6 +619,7 @@ INLINE void set_ar_ksr(FM_CH *CH, FM_SLOT *SLOT, int v) int eg_sh_ar, eg_sel_ar; SLOT->ar = (v&0x1f) ? 
32 + ((v&0x1f)<<1) : 0; + SLOT->ar_ksr = SLOT->ar + SLOT->ksr; SLOT->KSR = 3-(v>>6); if (SLOT->KSR != old_KSR) @@ -627,10 +628,10 @@ INLINE void set_ar_ksr(FM_CH *CH, FM_SLOT *SLOT, int v) } /* refresh Attack rate */ - if ((SLOT->ar + SLOT->ksr) < 32+62) + if ((SLOT->ar_ksr) < 32+62) { - eg_sh_ar = eg_rate_shift [SLOT->ar + SLOT->ksr ]; - eg_sel_ar = eg_rate_select[SLOT->ar + SLOT->ksr ]; + eg_sh_ar = eg_rate_shift [SLOT->ar_ksr]; + eg_sel_ar = eg_rate_select[SLOT->ar_ksr]; } else { @@ -872,7 +873,7 @@ INLINE void update_ssg_eg_phase(FM_SLOT *SLOT) if (SLOT->state != EG_ATT) { SLOT->state = (SLOT->sl == MIN_ATT_INDEX) ? EG_SUS : EG_DEC; - if (SLOT->ar + SLOT->ksr < 32+62) { + if (SLOT->ar_ksr < 32+62) { if (SLOT->volume > MIN_ATT_INDEX) SLOT->state = EG_ATT; } else { SLOT->volume = MIN_ATT_INDEX; @@ -972,7 +973,7 @@ static void chan_render_loop(chan_rend_context *ct, int *buffer, int length) ct->vol_out3 = (SLOT->vol_ipol*ifrac1 + SLOT->vol_out*ifrac0) >> EG_SH; SLOT = &ct->CH->SLOT[SLOT4]; ct->vol_out4 = (SLOT->vol_ipol*ifrac1 + SLOT->vol_out*ifrac0) >> EG_SH; -#else +#elif 1 switch (ct->eg_timer >> EG_SH) { case 0: @@ -997,6 +998,23 @@ static void chan_render_loop(chan_rend_context *ct, int *buffer, int length) ct->vol_out4 = (ct->CH->SLOT[SLOT4].vol_ipol + ct->CH->SLOT[SLOT4].vol_out) >> 1; } +#elif 0 + if (ct->eg_timer >> (EG_SH-1) < EG_TIMER_OVERFLOW >> EG_SH) { + ct->vol_out1 = ct->CH->SLOT[SLOT1].vol_ipol; + ct->vol_out2 = ct->CH->SLOT[SLOT2].vol_ipol; + ct->vol_out3 = ct->CH->SLOT[SLOT3].vol_ipol; + ct->vol_out4 = ct->CH->SLOT[SLOT4].vol_ipol; + } else { + ct->vol_out1 = ct->CH->SLOT[SLOT1].vol_out; + ct->vol_out2 = ct->CH->SLOT[SLOT2].vol_out; + ct->vol_out3 = ct->CH->SLOT[SLOT3].vol_out; + ct->vol_out4 = ct->CH->SLOT[SLOT4].vol_out; + } +#else + ct->vol_out1 = ct->CH->SLOT[SLOT1].vol_out; + ct->vol_out2 = ct->CH->SLOT[SLOT2].vol_out; + ct->vol_out3 = ct->CH->SLOT[SLOT3].vol_out; + ct->vol_out4 = ct->CH->SLOT[SLOT4].vol_out; #endif if (ct->pack & 4) 
continue; /* output disabled */ @@ -1335,12 +1353,13 @@ INLINE void refresh_fc_eg_slot(FM_SLOT *SLOT, int fc, int kc) { int eg_sh, eg_sel; SLOT->ksr = ksr; + SLOT->ar_ksr = SLOT->ar + ksr; /* calculate envelope generator rates */ - if ((SLOT->ar + ksr) < 32+62) + if ((SLOT->ar_ksr) < 32+62) { - eg_sh = eg_rate_shift [SLOT->ar + ksr ]; - eg_sel = eg_rate_select[SLOT->ar + ksr ]; + eg_sh = eg_rate_shift [SLOT->ar_ksr]; + eg_sel = eg_rate_select[SLOT->ar_ksr]; } else { diff --git a/pico/sound/ym2612.h b/pico/sound/ym2612.h index 3a1ea7a9b..73e693f92 100644 --- a/pico/sound/ym2612.h +++ b/pico/sound/ym2612.h @@ -56,8 +56,9 @@ typedef struct UINT8 ssg; /* 0x30 SSG-EG waveform */ UINT8 ssgn; - UINT16 vol_out; /* 0x32 current output from EG (without LFO) */ - UINT16 vol_ipol; /* 0x34 interpolator memory */ + UINT16 ar_ksr; /* 0x32 ar+ksr */ + UINT16 vol_out; /* 0x34 current output from EG (without LFO) */ + UINT16 vol_ipol; /* 0x36 interpolator memory */ } FM_SLOT; diff --git a/pico/sound/ym2612_arm.S b/pico/sound/ym2612_arm.S index 86e5f1c07..4cb928509 100644 --- a/pico/sound/ym2612_arm.S +++ b/pico/sound/ym2612_arm.S @@ -42,10 +42,10 @@ @ r5=slot, r1=eg_cnt, trashes: r0,r2,r3 @ writes output to routp, but only if vol_out changes .macro update_eg_phase_slot slot - ldrh r0, [r5,#0x32] @ vol_out + ldrh r0, [r5,#0x34] @ vol_out ldrb r2, [r5,#0x17] @ state add r3, r5, #0x1c - strh r0, [r5,#0x34] @ vol_ipol + strh r0, [r5,#0x36] @ vol_ipol tst r2, r2 beq 0f @ EG_OFF @@ -182,7 +182,7 @@ 11: ldrh r3, [r5,#0x18] @ tl add r0, r0, r3 @ volume += tl - strh r0, [r5,#0x32] @ vol_out + strh r0, [r5,#0x34] @ vol_out .if \slot == SLOT1 mov r6, r6, lsr #16 orr r6, r0, r6, lsl #16 @@ -207,11 +207,9 @@ ldrh r0, [r5,#0x30] @ ssg+ssgn ldrb r2, [r5,#0x17] @ state ldrh r3, [r5,#0x1a] @ volume - tst r0, #0x08 @ ssg enabled? - beq 9f - cmp r2, #EG_REL @ state > EG_REL? - ble 9f - cmp r3, #0x200 @ volume >= 0x200? 
+ cmp r0, #0x08 @ ssg enabled && + cmpge r2, #EG_REL+1 @ state > EG_REL && + cmpge r3, #0x200 @ volume >= 0x200? blt 9f tst r0, #0x01 @@ -226,35 +224,33 @@ eor r0, r0, #0x4 @ if ( !(ssg&0x04 ) tst r0, #0x4 cmpne r2, #EG_ATT @ if ( state != EG_ATT ) - movne r0, #0x400 - subne r0, r0, #1 - strneh r0, [r5,#0x1a] @ volume = MAX_ATT + movne r3, #0x400 + subne r3, r3, #1 + strneh r3, [r5,#0x1a] @ volume = MAX_ATT b 9f 1: tst r0, #0x02 eorne r0, r0, #0x4 @ ssg ^= 4 eorne r0, r0, #0x400 @ ssgn ^= 4 strneh r0, [r5,#0x30] - moveq r3, #0 - streq r3, [r5,#0x0c] @ phase = 0 + moveq r0, #0 + streq r0, [r5,#0x0c] @ phase = 0 cmp r2, #EG_ATT @ if ( state != EG_ATT ) beq 9f - ldr r3, [r5,#0x1c] @ sl + ldr r0, [r5,#0x1c] @ sl mov r2, #EG_SUS @ state = sl==MIN_ATT ? EG_SUS:EG_DEC - cmp r3, #0 + cmp r0, #0 - ldr r0, [r5,#0x04] @ ar - ldr r3, [r5,#0x14] @ ksr + ldrh r0, [r5,#0x32] @ ar+ksr movne r2, #EG_DEC - add r0, r0, r3 cmp r0, #32+62 @ if ( ar+ksr >= 32+62 ) - ldrlt r0, [r5,#0x1a] - movge r0, #0 - strgeh r0, [r5,#0x1a] @ volume = MIN_ATT + movge r3, #0 + strgeh r3, [r5,#0x1a] @ volume = MIN_ATT + bge 9f - cmp r0, #0 + cmp r3, #0 movgt r2, #EG_ATT strb r2, [r5,#0x17] @ state 9: @@ -673,10 +669,10 @@ chan_render_loop: ldr r10, [lr, #0x54] @ op1_out @ ldmia lr, {r6,r7} @ load volumes ldr r5, [lr, #0x40] @ CH - ldrh r6, [r5, #0x32] @ vol_out values for all slots - ldrh r2, [r5, #0x32+SLOT_STRUCT_SIZE*2] - ldrh r7, [r5, #0x32+SLOT_STRUCT_SIZE] - ldrh r3, [r5, #0x32+SLOT_STRUCT_SIZE*3] + ldrh r6, [r5, #0x34] @ vol_out values for all slots + ldrh r2, [r5, #0x34+SLOT_STRUCT_SIZE*2] + ldrh r7, [r5, #0x34+SLOT_STRUCT_SIZE] + ldrh r3, [r5, #0x34+SLOT_STRUCT_SIZE*3] orr r6, r6, r2, lsl #16 orr r7, r7, r3, lsl #16 @@ -756,28 +752,28 @@ eg_done: cmp r3, #(EG_TIMER_OVERFLOW>>EG_SH)/2 bgt 0f @ mix is vol_out - ldrh r0, [r5,#0x34] @ SLOT1 vol_ipol + ldrh r0, [r5,#0x36] @ SLOT1 vol_ipol lsleq r2, r6, #16 addeq r0, r0, r2, lsr #16 lsreq r0, r0, #1 mov r6, r6, lsr #16 orr r6, r0, r6, lsl #16 - ldrh 
r0, [r5,#0x34+SLOT_STRUCT_SIZE*2] @ SLOT2 vol_ipol + ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*2] @ SLOT2 vol_ipol addeq r0, r0, r6, lsr #16 lsreq r0, r0, #1 mov r6, r6, lsl #16 orr r6, r6, r0 ror r6, r6, #16 - ldrh r0, [r5,#0x34+SLOT_STRUCT_SIZE] @ SLOT3 vol_ipol + ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE] @ SLOT3 vol_ipol lsleq r2, r7, #16 addeq r0, r0, r2, lsr #16 lsreq r0, r0, #1 mov r7, r7, lsr #16 orr r7, r0, r7, lsl #16 - ldrh r0, [r5,#0x34+SLOT_STRUCT_SIZE*3] @ SLOT4 vol_ipol + ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*3] @ SLOT4 vol_ipol addeq r0, r0, r7, lsr #16 lsreq r0, r0, #1 mov r7, r7, lsl #16 @@ -787,22 +783,22 @@ eg_done: @ super-basic... just take value closest to sample point mov r3, r8, lsr #EG_SH-1 @ eg_timer, [0..3<>EG_SH) - bgt 0f @ mix is vol_out + bge 0f @ mix is vol_out - ldrh r0, [r5,#0x34] @ SLOT1 vol_ipol + ldrh r0, [r5,#0x36] @ SLOT1 vol_ipol mov r6, r6, lsr #16 orr r6, r0, r6, lsl #16 - ldrh r0, [r5,#0x34+SLOT_STRUCT_SIZE*2] @ SLOT2 vol_ipol + ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*2] @ SLOT2 vol_ipol mov r6, r6, lsl #16 orr r6, r6, r0 ror r6, r6, #16 - ldrh r0, [r5,#0x34+SLOT_STRUCT_SIZE] @ SLOT3 vol_ipol + ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE] @ SLOT3 vol_ipol mov r7, r7, lsr #16 orr r7, r0, r7, lsl #16 - ldrh r0, [r5,#0x34+SLOT_STRUCT_SIZE*3] @ SLOT4 vol_ipol + ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*3] @ SLOT4 vol_ipol mov r7, r7, lsl #16 orr r7, r7, r0 ror r7, r7, #16 diff --git a/platform/common/dismips.c b/platform/common/dismips.c index dc06ce80e..d855ad6b8 100644 --- a/platform/common/dismips.c +++ b/platform/common/dismips.c @@ -368,10 +368,12 @@ int dismips(uintptr_t pc, uint32_t insn, char *buf, size_t buflen, unsigned long else snprintf(buf, buflen, "%s %s, %s, %d", pi->name, rd, rt, sa); break; + //dext: pos,size-1 dextm: pos,size-33 dextu: pos-32,size-1 + //dins: pos,pos+size-1 dinsm: pos,pos+size-33 dinsu: pos-32,pos+size-33 case F_IMM_TS: - if (insn & 0x01) sb+=32; - if (insn & 0x02) sa+=32; - if (insn & 0x04) sb-=sa; + if (insn & 0x01) 
sb+=32; // ...m + if (insn & 0x02) sa+=32; // ...u + if (insn & 0x04) sb-=sa; // ins snprintf(buf, buflen, "%s %s, %s, %d, %d", pi->name, rt, rs, sa, sb+1); break; case B_IMM_S: diff --git a/platform/linux/emu.c b/platform/linux/emu.c index 936652631..5e4dd72a2 100644 --- a/platform/linux/emu.c +++ b/platform/linux/emu.c @@ -176,7 +176,10 @@ void plat_debug_cat(char *str) void emu_video_mode_change(int start_line, int line_count, int is_32cols) { // clear whole screen in all buffers - memset32(g_screen_ptr, 0, g_screen_ppitch * g_screen_height * 2 / 4); + if (currentConfig.renderer != RT_16BIT && !(PicoIn.AHW & PAHW_32X)) + memset32(Pico.est.Draw2FB, 0, (320+8) * (8+240+8) / 4); + else + memset32(g_screen_ptr, 0, g_screen_ppitch * g_screen_height * 2 / 4); } void pemu_loop_prep(void) From 0b7d8138277a32e4c46e03fc459b49cc04db722a Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 14 Jan 2020 23:00:44 +0100 Subject: [PATCH 094/174] emulator timing fixes, VDP DMA fixes, improved DAC audio --- cpu/cz80/cz80.c | 2 ++ cpu/cz80/cz80_op.c | 7 +++--- pico/32x/32x.c | 3 ++- pico/debug.c | 22 +++++------------ pico/memory.c | 17 +++++-------- pico/pico.c | 33 ++++++++++++++----------- pico/pico_cmn.c | 60 +++++++++++++++++++++++----------------------- pico/pico_int.h | 10 ++++---- pico/sound/sound.c | 55 ++++++++++++++++++++++++++++++------------ pico/videoport.c | 7 +++--- 10 files changed, 118 insertions(+), 98 deletions(-) diff --git a/cpu/cz80/cz80.c b/cpu/cz80/cz80.c index 0326b0b84..6b9afcde9 100644 --- a/cpu/cz80/cz80.c +++ b/cpu/cz80/cz80.c @@ -288,6 +288,8 @@ INT32 Cz80_Exec(cz80_struc *CPU, INT32 cycles) #if CZ80_ENCRYPTED_ROM CPU->OPBase = OPBase; #endif + if (CPU->HaltState) + CPU->ICount = 0; cycles -= CPU->ICount; #if !CZ80_EMULATE_R_EXACTLY zR = (zR + (cycles >> 2)) & 0x7f; diff --git a/cpu/cz80/cz80_op.c b/cpu/cz80/cz80_op.c index f84f8e754..5d623caf2 100644 --- a/cpu/cz80/cz80_op.c +++ b/cpu/cz80/cz80_op.c @@ -687,13 +687,14 @@ switch (Opcode) OP(0x76): // HALT 
OP_HALT: CPU->HaltState = 1; - CPU->ICount = 0; +// CPU->ICount = 0; goto Cz80_Check_Interrupt; OP(0xf3): // DI OP_DI: zIFF = 0; - RET(4) + USE_CYCLES(4) + goto Cz80_Exec_nocheck; OP(0xfb): // EI OP_EI: @@ -712,8 +713,6 @@ switch (Opcode) if (CPU->IRQState) { afterEI = 1; - CPU->ExtraCycles += 1 - CPU->ICount; - CPU->ICount = 1; } } else zIFF2 = (1 << 2); diff --git a/pico/32x/32x.c b/pico/32x/32x.c index aa45ba7bd..0f0cc4f5c 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -269,7 +269,8 @@ void p32x_schedule_hint(SH2 *sh2, unsigned int m68k_cycles) return; // nobody cares // note: when Pico.m.scanline is 224, SH2s might // still be at scanline 93 (or so) - if (!(Pico32x.sh2_regs[0] & 0x80) && Pico.m.scanline > 224) + if (!(Pico32x.sh2_regs[0] & 0x80) && + Pico.m.scanline > (Pico.video.reg[1] & 0x08 ? 240 : 224)) return; after = (Pico32x.sh2_regs[4 / 2] + 1) * 488; diff --git a/pico/debug.c b/pico/debug.c index 50cbaf387..e617d9086 100644 --- a/pico/debug.c +++ b/pico/debug.c @@ -369,42 +369,32 @@ void PDebugDumpMem(void) void PDebugZ80Frame(void) { - int lines, line_sample; + int lines; if (PicoIn.AHW & PAHW_SMS) return; - if (Pico.m.pal) { + if (Pico.m.pal) lines = 313; - line_sample = 68; - } else { + else lines = 262; - line_sample = 93; - } z80_resetCycles(); PsndStartFrame(); - if (/*Pico.m.z80Run &&*/ !Pico.m.z80_reset && (PicoIn.opt&POPT_EN_Z80)) - PicoSyncZ80(Pico.t.m68c_cnt + line_sample * 488); - if (PicoIn.sndOut) - PsndGetSamples(line_sample); - if (/*Pico.m.z80Run &&*/ !Pico.m.z80_reset && (PicoIn.opt&POPT_EN_Z80)) { PicoSyncZ80(Pico.t.m68c_cnt + 224 * 488); z80_int(); } - if (PicoIn.sndOut) - PsndGetSamples(224); // sync z80 if (/*Pico.m.z80Run &&*/ !Pico.m.z80_reset && (PicoIn.opt&POPT_EN_Z80)) { Pico.t.m68c_cnt += Pico.m.pal ? 
151809 : 127671; // cycles adjusted for converter PicoSyncZ80(Pico.t.m68c_cnt); } - if (PicoIn.sndOut && ym2612.dacen && Pico.snd.dac_line < lines) - PsndDoDAC(lines - 1); - PsndDoPSG(lines - 1); + + if (PicoIn.sndOut) + PsndGetSamples(lines); timers_cycle(); Pico.t.m68c_aim = Pico.t.m68c_cnt; diff --git a/pico/memory.c b/pico/memory.c index 9fe3a0852..1d9b91351 100644 --- a/pico/memory.c +++ b/pico/memory.c @@ -943,11 +943,11 @@ static int ym2612_write_local(u32 a, u32 d, int is_from_z80) a &= 3; if (a == 1 && ym2612.OPN.ST.address == 0x2a) /* DAC data */ { - int scanline = get_scanline(is_from_z80); - //elprintf(EL_STATUS, "%03i -> %03i dac w %08x z80 %i", Pico.snd.dac_line, scanline, d, is_from_z80); + int cycles = is_from_z80 ? z80_cyclesDone() : z80_cycles_from_68k(); + //elprintf(EL_STATUS, "%03i dac w %08x z80 %i", cycles, d, is_from_z80); ym2612.dacout = ((int)d - 0x80) << 6; if (ym2612.dacen) - PsndDoDAC(scanline); + PsndDoDAC(cycles); return 0; } @@ -1029,13 +1029,9 @@ static int ym2612_write_local(u32 a, u32 d, int is_from_z80) return 0; } case 0x2b: { /* DAC Sel (YM2612) */ - int scanline = get_scanline(is_from_z80); - if (ym2612.dacen != (d & 0x80)) { - ym2612.dacen = d & 0x80; - Pico.snd.dac_line = scanline; - } + ym2612.dacen = d & 0x80; #ifdef __GP2X__ - if (PicoIn.opt & POPT_EXT_FM) YM2612Write_940(a, d, scanline); + if (PicoIn.opt & POPT_EXT_FM) YM2612Write_940(a, d, get_scanline(is_from_z80)); #endif return 0; } @@ -1059,8 +1055,7 @@ static int ym2612_write_local(u32 a, u32 d, int is_from_z80) break; } - int scanline = get_scanline(is_from_z80); - PsndDoFM(scanline); + PsndDoFM(get_scanline(is_from_z80)); #ifdef __GP2X__ if (PicoIn.opt & POPT_EXT_FM) return YM2612Write_940(a, d, get_scanline(is_from_z80)); diff --git a/pico/pico.c b/pico/pico.c index f6b43cd69..2a16a0e23 100644 --- a/pico/pico.c +++ b/pico/pico.c @@ -224,40 +224,45 @@ void PicoLoopPrepare(void) // this table is wrong and should be removed // keeping it for now to compensate wrong 
timing elswhere, mainly for Outrunners -static const int dma_timings[] = { - 83, 166, 83, 83, // vblank: 32cell: dma2vram dma2[vs|c]ram vram_fill vram_copy - 102, 204, 102, 102, // vblank: 40cell: - 8, 16, 8, 8, // active: 32cell: - 17, 18, 9, 9 // ... +static const int dma_timings[] = { // Q16 + // dma2vram dma2[vs|c]ram vram_fill vram_copy + // VRAM has half the width of VSRAM/CRAM, thus half the performance + ( 83<<16)/488, (166<<16)/488, (165<<16)/488, ( 83<<16)/488, // vblank 32cell + (102<<16)/488, (204<<16)/488, (203<<16)/488, (102<<16)/488, // vblank 40cell + ( 8<<16)/488, ( 16<<16)/488, ( 15<<16)/488, ( 8<<16)/488, // active 32cell + ( 9<<16)/488, ( 18<<16)/488, ( 17<<16)/488, ( 9<<16)/488 // active 40cell }; -static const int dma_bsycles[] = { - (488<<8)/83, (488<<8)/166, (488<<8)/83, (488<<8)/83, - (488<<8)/102, (488<<8)/204, (488<<8)/102, (488<<8)/102, - (488<<8)/8, (488<<8)/16, (488<<8)/8, (488<<8)/8, - (488<<8)/9, (488<<8)/18, (488<<8)/9, (488<<8)/9 +static const int dma_bsycles[] = { // Q16 + (488<<16)/83, (488<<16)/166, (488<<16)/165, (488<<16)/83, + (488<<16)/102, (488<<16)/204, (488<<16)/203, (488<<16)/102, + (488<<16)/8, (488<<16)/16, (488<<16)/15, (488<<16)/8, + (488<<16)/9, (488<<16)/18, (488<<16)/17, (488<<16)/9 }; // grossly inaccurate.. FIXME FIXXXMEE -PICO_INTERNAL int CheckDMA(void) +PICO_INTERNAL int CheckDMA(int cycles) { int burn = 0, xfers_can, dma_op = Pico.video.reg[0x17]>>6; // see gens for 00 and 01 modes int xfers = Pico.m.dma_xfers; int dma_op1; + // safety pin + if (cycles <= 0) return 0; + if(!(dma_op&2)) dma_op = (Pico.video.type==1) ? 0 : 1; // setting dma_timings offset here according to Gens dma_op1 = dma_op; if(Pico.video.reg[12] & 1) dma_op |= 4; // 40 cell mode? if(!(Pico.video.status&8)&&(Pico.video.reg[1]&0x40)) dma_op|=8; // active display? 
- xfers_can = dma_timings[dma_op]; + xfers_can = (dma_timings[dma_op] * cycles + 0xff) >> 16; if(xfers <= xfers_can) { Pico.video.status &= ~SR_DMA; if (!(dma_op & 2)) - burn = xfers * dma_bsycles[dma_op] >> 8; // have to be approximate because can't afford division.. + burn = xfers * dma_bsycles[dma_op] >> 16; Pico.m.dma_xfers = 0; } else { - if(!(dma_op&2)) burn = 488; + if(!(dma_op&2)) burn = cycles; Pico.m.dma_xfers -= xfers_can; } diff --git a/pico/pico_cmn.c b/pico/pico_cmn.c index 5fa0b16f2..8c22c9773 100644 --- a/pico/pico_cmn.c +++ b/pico/pico_cmn.c @@ -22,25 +22,29 @@ #endif // sync m68k to Pico.t.m68c_aim -static void SekSyncM68k(void) +static void SekExecM68k(int cyc_do) { - int cyc_do; - pprof_start(m68k); - pevt_log_m68k_o(EVT_RUN_START); - - while ((cyc_do = Pico.t.m68c_aim - Pico.t.m68c_cnt) > 0) { - Pico.t.m68c_cnt += cyc_do; + Pico.t.m68c_cnt += cyc_do; #if defined(EMU_C68K) - PicoCpuCM68k.cycles = cyc_do; - CycloneRun(&PicoCpuCM68k); - Pico.t.m68c_cnt -= PicoCpuCM68k.cycles; + PicoCpuCM68k.cycles = cyc_do; + CycloneRun(&PicoCpuCM68k); + Pico.t.m68c_cnt -= PicoCpuCM68k.cycles; #elif defined(EMU_M68K) - Pico.t.m68c_cnt += m68k_execute(cyc_do) - cyc_do; + Pico.t.m68c_cnt += m68k_execute(cyc_do) - cyc_do; #elif defined(EMU_F68K) - Pico.t.m68c_cnt += fm68k_emulate(&PicoCpuFM68k, cyc_do, 0) - cyc_do; + Pico.t.m68c_cnt += fm68k_emulate(&PicoCpuFM68k, cyc_do, 0) - cyc_do; #endif - } +} + +static void SekSyncM68k(void) +{ + int cyc_do; + pprof_start(m68k); + pevt_log_m68k_o(EVT_RUN_START); + + while ((cyc_do = Pico.t.m68c_aim - Pico.t.m68c_cnt) > 0) + SekExecM68k(cyc_do); SekCyclesLeft = 0; @@ -68,7 +72,7 @@ static void do_hint(struct PicoVideo *pv) } } -static void do_timing_hacks_as(struct PicoVideo *pv, int vdp_slots) +static void do_timing_hacks_as(struct PicoVideo *pv, int vdp_slots, int cycles) { pv->lwrite_cnt += vdp_slots - Pico.m.dma_xfers * 2; // wrong *2 if (pv->lwrite_cnt > vdp_slots) @@ -76,13 +80,13 @@ static void do_timing_hacks_as(struct 
PicoVideo *pv, int vdp_slots) else if (pv->lwrite_cnt < 0) pv->lwrite_cnt = 0; if (Pico.m.dma_xfers) - SekCyclesBurn(CheckDMA()); + SekCyclesBurn(CheckDMA(cycles)); } -static void do_timing_hacks_vb(void) +static void do_timing_hacks_vb(int cycles) { if (unlikely(Pico.m.dma_xfers)) - SekCyclesBurn(CheckDMA()); + SekCyclesBurn(CheckDMA(cycles)); } static int PicoFrameHints(void) @@ -151,7 +155,7 @@ static int PicoFrameHints(void) // Run scanline: Pico.t.m68c_line_start = Pico.t.m68c_aim; - do_timing_hacks_as(pv, vdp_slots); + do_timing_hacks_as(pv, vdp_slots, CYCLES_M68K_LINE); CPUS_RUN(CYCLES_M68K_LINE); if (PicoLineHook) PicoLineHook(); @@ -192,19 +196,18 @@ static int PicoFrameHints(void) // also delay between F bit (bit 7) is set in SR and IRQ happens (Ex-Mutants) // also delay between last H-int and V-int (Golden Axe 3) Pico.t.m68c_line_start = Pico.t.m68c_aim; - do_timing_hacks_vb(); + do_timing_hacks_vb(CYCLES_M68K_VINT_LAG); CPUS_RUN(CYCLES_M68K_VINT_LAG); pv->status |= SR_F; pv->pending_ints |= 0x20; if (pv->reg[1] & 0x20) { - Pico.t.m68c_aim = Pico.t.m68c_cnt + 11; // HACK - SekSyncM68k(); + SekExecM68k(11); // HACK elprintf(EL_INTS, "vint: @ %06x [%u]", SekPc, SekCyclesDone()); SekInterrupt(6); } - cycles = SekCyclesDone(); + cycles = Pico.t.m68c_aim; if (Pico.m.z80Run && !Pico.m.z80_reset && (PicoIn.opt&POPT_EN_Z80)) { PicoSyncZ80(cycles); elprintf(EL_INTS, "zint"); @@ -221,6 +224,7 @@ static int PicoFrameHints(void) #endif // Run scanline: + do_timing_hacks_vb(CYCLES_M68K_LINE - CYCLES_M68K_VINT_LAG); CPUS_RUN(CYCLES_M68K_LINE - CYCLES_M68K_VINT_LAG); if (PicoLineHook) PicoLineHook(); @@ -256,7 +260,7 @@ static int PicoFrameHints(void) // Run scanline: Pico.t.m68c_line_start = Pico.t.m68c_aim; - do_timing_hacks_vb(); + do_timing_hacks_vb(CYCLES_M68K_LINE); CPUS_RUN(CYCLES_M68K_LINE); if (PicoLineHook) PicoLineHook(); @@ -267,7 +271,7 @@ static int PicoFrameHints(void) unsigned int l = PicoIn.overclockM68k * lines / 100; while (l-- > 0) { Pico.t.m68c_cnt 
-= CYCLES_M68K_LINE; - do_timing_hacks_vb(); + do_timing_hacks_vb(CYCLES_M68K_LINE); SekSyncM68k(); } } @@ -293,20 +297,16 @@ static int PicoFrameHints(void) // Run scanline: Pico.t.m68c_line_start = Pico.t.m68c_aim; - do_timing_hacks_as(pv, vdp_slots); + do_timing_hacks_as(pv, vdp_slots, CYCLES_M68K_LINE); CPUS_RUN(CYCLES_M68K_LINE); if (PicoLineHook) PicoLineHook(); pevt_log_m68k_o(EVT_NEXT_LINE); // sync cpus - cycles = SekCyclesDone(); + cycles = Pico.t.m68c_aim; if (Pico.m.z80Run && !Pico.m.z80_reset && (PicoIn.opt&POPT_EN_Z80)) PicoSyncZ80(cycles); - if (PicoIn.sndOut && ym2612.dacen && Pico.snd.dac_line < lines) - PsndDoDAC(lines - 1); - if (PicoIn.sndOut && Pico.snd.psg_line < lines) - PsndDoPSG(lines - 1); #ifdef PICO_CD if (PicoIn.AHW & PAHW_MCD) diff --git a/pico/pico_int.h b/pico/pico_int.h index d3da72ce4..58d3da889 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -193,7 +193,7 @@ extern struct DrZ80 drZ80; #define z80_int_assert(a) Cz80_Set_IRQ(&CZ80, 0, (a) ? ASSERT_LINE : CLEAR_LINE) #define z80_nmi() Cz80_Set_IRQ(&CZ80, IRQ_LINE_NMI, 0) -#define z80_cyclesLeft (CZ80.ICount - CZ80.ExtraCycles) +#define z80_cyclesLeft CZ80.ICount #define z80_subCLeft(c) CZ80.ICount -= c #define z80_pc() Cz80_Get_Reg(&CZ80, CZ80_PC) @@ -431,7 +431,9 @@ struct PicoSound short len_use; // adjusted int len_e_add; // for non-int samples/frame int len_e_cnt; - short dac_line; + int dac_val, dac_val2; // last DAC sample + unsigned int dac_mult; // z80 clocks per line in Q16 + unsigned int dac_pos; // last DAC position in Q16 short psg_line; unsigned int fm_mult; // samples per line in Q16 unsigned int fm_pos; // last FM position in Q16 @@ -738,7 +740,7 @@ extern struct Pico Pico; extern struct PicoMem PicoMem; extern void (*PicoResetHook)(void); extern void (*PicoLineHook)(void); -PICO_INTERNAL int CheckDMA(void); +PICO_INTERNAL int CheckDMA(int cycles); PICO_INTERNAL void PicoDetectRegion(void); PICO_INTERNAL void PicoSyncZ80(unsigned int m68k_cycles_done); @@ -872,7 
+874,7 @@ PICO_INTERNAL_ASM void wram_1M_to_2M(unsigned char *m); // sound/sound.c PICO_INTERNAL void PsndReset(void); PICO_INTERNAL void PsndStartFrame(void); -PICO_INTERNAL void PsndDoDAC(int line_to); +PICO_INTERNAL void PsndDoDAC(int cycle_to); PICO_INTERNAL void PsndDoPSG(int line_to); PICO_INTERNAL void PsndDoFM(int line_to); PICO_INTERNAL void PsndClear(void); diff --git a/pico/sound/sound.c b/pico/sound/sound.c index 98b4bf2e8..688812829 100644 --- a/pico/sound/sound.c +++ b/pico/sound/sound.c @@ -160,6 +160,8 @@ void PsndRerate(int preserve_state) // samples per line (Q16) Pico.snd.fm_mult = 65536LL * PicoIn.sndRate / (target_fps*target_lines); + // samples per z80 clock (Q20) + Pico.snd.dac_mult = 16 * Pico.snd.fm_mult * 15/7 / 488; // recalculate dac info dac_recalculate(); @@ -191,34 +193,46 @@ PICO_INTERNAL void PsndStartFrame(void) Pico.snd.len_use++; } - Pico.snd.dac_line = Pico.snd.psg_line = 0; - Pico.snd.fm_pos = 0; + Pico.snd.psg_line = 0; } -PICO_INTERNAL void PsndDoDAC(int line_to) +PICO_INTERNAL void PsndDoDAC(int cyc_to) { - int pos, pos1, len; + int pos, len; int dout = ym2612.dacout; - int line_from = Pico.snd.dac_line; - pos = dac_info[line_from]; - pos1 = dac_info[line_to + 1]; - len = pos1 - pos; + // number of samples to fill in buffer (Q20) + len = (cyc_to * Pico.snd.dac_mult) - Pico.snd.dac_pos; + + // update position and calculate buffer offset and length + pos = (Pico.snd.dac_pos+0x80000) >> 20; + Pico.snd.dac_pos += len; + len = ((Pico.snd.dac_pos+0x80000) >> 20) - pos; + + // avoid loss of the 1st sample of a new block (Q rounding issues) + if (pos+len == 0) + len = 1, Pico.snd.dac_pos += 0x80000; if (len <= 0) return; - Pico.snd.dac_line = line_to + 1; - if (!PicoIn.sndOut) return; + // fill buffer, applying a rather weak order 1 bessel IIR on the way + // y[n] = (x[n] + x[n-1])*(1/2) (3dB cutoff at 11025 Hz, no gain) + // 1 sample delay for correct IIR filtering over audio frame boundaries if (PicoIn.opt & POPT_EN_STEREO) { 
short *d = PicoIn.sndOut + pos*2; - for (; len > 0; len--, d+=2) *d += dout; + // left channel only, mixed ro right channel in mixing phase + *d++ += Pico.snd.dac_val2; d++; + while (--len) *d++ += Pico.snd.dac_val, d++; } else { short *d = PicoIn.sndOut + pos; - for (; len > 0; len--, d++) *d += dout; + *d++ += Pico.snd.dac_val2; + while (--len) *d++ += Pico.snd.dac_val; } + Pico.snd.dac_val2 = (Pico.snd.dac_val + dout) >> 1; + Pico.snd.dac_val = dout; } PICO_INTERNAL void PsndDoPSG(int line_to) @@ -332,6 +346,8 @@ PICO_INTERNAL void PsndClear(void) } if (!(PicoIn.opt & POPT_EN_FM)) memset32(PsndBuffer, 0, PicoIn.opt & POPT_EN_STEREO ? len*2 : len); + // drop pos remainder to avoid rounding errors (not entirely correct though) + Pico.snd.dac_pos = Pico.snd.fm_pos = 0; } @@ -340,6 +356,7 @@ static int PsndRender(int offset, int length) int *buf32; int stereo = (PicoIn.opt & 8) >> 3; int fmlen = ((Pico.snd.fm_pos+0x8000) >> 16) - offset; + int daclen = ((Pico.snd.dac_pos+0x80000) >> 20) - offset; offset <<= stereo; buf32 = PsndBuffer+offset; @@ -351,6 +368,15 @@ static int PsndRender(int offset, int length) return length; } + // Fill up DAC output in case of missing samples (Q16 rounding errors) + if (length-daclen > 0) { + short *dacbuf = PicoIn.sndOut + (daclen << stereo); + for (; length-daclen > 0; daclen++) { + *dacbuf++ += Pico.snd.dac_val; + if (stereo) dacbuf++; + } + } + // Add in parts of the FM buffer not yet done if (length-fmlen > 0) { int *fmbuf = buf32 + (fmlen << stereo); @@ -396,8 +422,8 @@ PICO_INTERNAL void PsndGetSamples(int y) { static int curr_pos = 0; - if (ym2612.dacen && Pico.snd.dac_line < y) - PsndDoDAC(y - 1); + if (ym2612.dacen) + PsndDoDAC(cycles_68k_to_z80(Pico.t.m68c_aim - Pico.t.m68c_frame_start)); PsndDoPSG(y - 1); curr_pos = PsndRender(0, Pico.snd.len_use); @@ -406,7 +432,6 @@ PICO_INTERNAL void PsndGetSamples(int y) PicoIn.writeSound(curr_pos * ((PicoIn.opt & POPT_EN_STEREO) ? 
4 : 2)); // clear sound buffer PsndClear(); - Pico.snd.dac_line = y; } PICO_INTERNAL void PsndGetSamplesMS(int y) diff --git a/pico/videoport.c b/pico/videoport.c index d18c2cf9d..d196ee4ff 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -97,7 +97,7 @@ static void DmaSlow(int len, unsigned int source) Pico.m.dma_xfers = len; if (Pico.m.dma_xfers < len) // lame 16bit var Pico.m.dma_xfers = ~0; - SekCyclesBurnRun(CheckDMA()); + SekCyclesBurnRun(CheckDMA(488 - (SekCyclesDone()-Pico.t.m68c_line_start))); if ((source & 0xe00000) == 0xe00000) { // Ram base = (u16 *)PicoMem.ram; @@ -344,7 +344,8 @@ static NOINLINE void CommandChange(void) static void DrawSync(int blank_on) { - if (Pico.m.scanline < 224 && !(PicoIn.opt & POPT_ALT_RENDERER) && + int lines = Pico.video.reg[1]&0x08 ? 240 : 224; + if (Pico.m.scanline < lines && !(PicoIn.opt & POPT_ALT_RENDERER) && !PicoIn.skipFrame && Pico.est.DrawScanline <= Pico.m.scanline) { //elprintf(EL_ANOMALY, "sync"); PicoDrawSync(Pico.m.scanline, blank_on); @@ -363,7 +364,7 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) { case 0x00: // Data port 0 or 2 // try avoiding the sync.. - if (Pico.m.scanline < 224 && (pvid->reg[1]&0x40) && + if (Pico.m.scanline < (pvid->reg[1]&0x08 ? 
240 : 224) && (pvid->reg[1]&0x40) && !(!pvid->pending && ((pvid->command & 0xc00000f0) == 0x40000010 && PicoMem.vsram[pvid->addr>>1] == d)) ) From 23b1e02a9deed507597f77cb7a43427e09a90d53 Mon Sep 17 00:00:00 2001 From: kub Date: Sun, 26 Jan 2020 20:12:18 +0100 Subject: [PATCH 095/174] audio fixes for overdrive demo --- cpu/cz80/cz80.c | 3 ++- cpu/cz80/cz80_op.c | 1 - pico/memory.c | 8 +++++--- pico/pico_int.h | 11 ++++++----- pico/sound/ym2612_arm.S | 3 ++- 5 files changed, 15 insertions(+), 11 deletions(-) diff --git a/cpu/cz80/cz80.c b/cpu/cz80/cz80.c index 6b9afcde9..51abc40fe 100644 --- a/cpu/cz80/cz80.c +++ b/cpu/cz80/cz80.c @@ -278,7 +278,8 @@ INT32 Cz80_Exec(cz80_struc *CPU, INT32 cycles) CPU->ICount -= CPU->ExtraCycles; CPU->ExtraCycles = 0; } - goto Cz80_Exec; + if (!CPU->HaltState) + goto Cz80_Exec; } } else CPU->ICount = 0; diff --git a/cpu/cz80/cz80_op.c b/cpu/cz80/cz80_op.c index 5d623caf2..317e9587b 100644 --- a/cpu/cz80/cz80_op.c +++ b/cpu/cz80/cz80_op.c @@ -687,7 +687,6 @@ switch (Opcode) OP(0x76): // HALT OP_HALT: CPU->HaltState = 1; -// CPU->ICount = 0; goto Cz80_Check_Interrupt; OP(0xf3): // DI diff --git a/pico/memory.c b/pico/memory.c index 1d9b91351..d61491c14 100644 --- a/pico/memory.c +++ b/pico/memory.c @@ -733,8 +733,10 @@ static void PicoWrite8_vdp(u32 a, u32 d) static void PicoWrite16_vdp(u32 a, u32 d) { - if ((a & 0x00f9) == 0x0010) // PSG Sound + if ((a & 0x00f9) == 0x0010) { // PSG Sound psg_write_68k(d); + return; + } if ((a & 0x00e0) == 0x0000) { PicoVideoWrite(a, d); return; @@ -898,10 +900,10 @@ void ym2612_sync_timers(int z80_cycles, int mode_old, int mode_new) int xcycles = z80_cycles << 8; /* check for overflows */ - if ((mode_old & 4) && xcycles > Pico.t.timer_a_next_oflow) + if ((mode_old & 4) && xcycles >= Pico.t.timer_a_next_oflow) ym2612.OPN.ST.status |= 1; - if ((mode_old & 8) && xcycles > Pico.t.timer_b_next_oflow) + if ((mode_old & 8) && xcycles >= Pico.t.timer_b_next_oflow) ym2612.OPN.ST.status |= 2; /* update timer a 
*/ diff --git a/pico/pico_int.h b/pico/pico_int.h index 58d3da889..70bfa7104 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -822,10 +822,10 @@ void ym2612_pack_state(void); void ym2612_unpack_state(void); #define TIMER_NO_OFLOW 0x70000000 -// tA = 72 * (1024 - NA) / M -#define TIMER_A_TICK_ZCYCLES 17203 -// tB = 1152 * (256 - NA) / M -#define TIMER_B_TICK_ZCYCLES 262800 // 275251 broken, see Dai Makaimura +// tA = 72 * (1024 - NA) / M, with M = mclock/2 -> tick = 72 * 2/mclock +#define TIMER_A_TICK_ZCYCLES 17203 // zcycles = Q8*tick*zclock = Q8*77*2*7/15 +// tB = 1152 * (256 - NA) / M, +#define TIMER_B_TICK_ZCYCLES 275251 // zcycles = Q8*1152*2*7/15 #define timers_cycle() \ if (Pico.t.timer_a_next_oflow > 0 && Pico.t.timer_a_next_oflow < TIMER_NO_OFLOW) \ @@ -837,7 +837,8 @@ void ym2612_unpack_state(void); #define timers_reset() \ Pico.t.timer_a_next_oflow = Pico.t.timer_b_next_oflow = TIMER_NO_OFLOW; \ Pico.t.timer_a_step = TIMER_A_TICK_ZCYCLES * 1024; \ - Pico.t.timer_b_step = TIMER_B_TICK_ZCYCLES * 256; + Pico.t.timer_b_step = TIMER_B_TICK_ZCYCLES * 256; \ + ym2612.OPN.ST.status &= ~3; // videoport.c diff --git a/pico/sound/ym2612_arm.S b/pico/sound/ym2612_arm.S index 4cb928509..e3ec370d0 100644 --- a/pico/sound/ym2612_arm.S +++ b/pico/sound/ym2612_arm.S @@ -206,8 +206,9 @@ .macro update_ssg_eg ldrh r0, [r5,#0x30] @ ssg+ssgn ldrb r2, [r5,#0x17] @ state + and r3, r0, #0x08 + cmp r3, #0x08 @ ssg enabled && ldrh r3, [r5,#0x1a] @ volume - cmp r0, #0x08 @ ssg enabled && cmpge r2, #EG_REL+1 @ state > EG_REL && cmpge r3, #0x200 @ volume >= 0x200? 
blt 9f From ffc5179571591ba5cafccb6ba9eba42f3fbd6d5c Mon Sep 17 00:00:00 2001 From: kub Date: Sun, 26 Jan 2020 20:40:07 +0100 Subject: [PATCH 096/174] sprite rendering improvements for masking and limit edge cases --- pico/draw.c | 111 +++++++++++++++++++++++++++++------------------ pico/draw2.c | 3 +- pico/draw_arm.S | 63 ++++++++++++++++++--------- pico/pico_int.h | 4 +- pico/videoport.c | 3 +- 5 files changed, 118 insertions(+), 66 deletions(-) diff --git a/pico/draw.c b/pico/draw.c index 06c54807b..2922914fd 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -53,7 +53,11 @@ static int HighPreSpr[80*2+1]; // slightly preprocessed sprites #define SPRL_HAVE_LO 0x40 // *lo* #define SPRL_MAY_HAVE_OP 0x20 // may have operator sprites on the line #define SPRL_LO_ABOVE_HI 0x10 // low priority sprites may be on top of hi -unsigned char HighLnSpr[240][3 + MAX_LINE_SPRITES]; // sprite_count, ^flags, tile_count, [spritep]... +#define SPRL_HAVE_X 0x08 // have sprites with x != 0 +#define SPRL_TILE_OVFL 0x04 // tile limit exceeded on previous line +#define SPRL_HAVE_MASK0 0x02 // have sprite with x == 0 in 1st slot +#define SPRL_MASKED 0x01 // lo prio masking by sprite with x == 0 active +unsigned char HighLnSpr[240][4+MAX_LINE_SPRITES+1]; // sprite_count, ^flags, tile_count, sprites_total, [spritep]..., last_width int rendstatus_old; int rendlines; @@ -706,7 +710,7 @@ static void DrawTilesFromCache(int *hc, int sh, int rlim, struct PicoEState *est // Index + 0 : hhhhvvvv ab--hhvv yyyyyyyy yyyyyyyy // a: offscreen h, b: offs. v, h: horiz. 
size // Index + 4 : xxxxxxxx xxxxxxxx pccvhnnn nnnnnnnn // x: x coord + 8 -static void DrawSprite(int *sprite, int sh) +static void DrawSprite(int *sprite, int sh, int w) { void (*fTileFunc)(unsigned char *pd, unsigned int pack, int pal); unsigned char *pd = Pico.est.HighCol; @@ -746,6 +750,7 @@ static void DrawSprite(int *sprite, int sh) else fTileFunc=TileNorm; } + if (w) width = w; // tile limit for (; width; width--,sx+=8,tile+=delta) { unsigned int pack; @@ -833,12 +838,13 @@ static NOINLINE void DrawAllSpritesInterlace(int pri, int sh) struct PicoVideo *pvid=&Pico.video; int i,u,table,link=0,sline=Pico.est.DrawScanline<<1; unsigned int *sprites[80]; // Sprite index + int max_sprites = Pico.video.reg[12]&1 ? 80 : 64; table=pvid->reg[5]&0x7f; if (pvid->reg[12]&1) table&=0x7e; // Lowest bit 0 in 40-cell mode table<<=8; // Get sprite table address/2 - for (i=u=0; u < 80 && i < 21; u++) + for (i = u = 0; u < max_sprites && link < max_sprites; u++) { unsigned int *sprite; int code, sx, sy, height; @@ -888,15 +894,18 @@ static void DrawSpritesSHi(unsigned char *sprited, const struct PicoEState *est) void (*fTileFunc)(unsigned char *pd, unsigned int pack, int pal); unsigned char *pd = Pico.est.HighCol; unsigned char *p; - int cnt; + int cnt, w; cnt = sprited[0] & 0x7f; if (cnt == 0) return; - p = &sprited[3]; + p = &sprited[4]; + if ((sprited[1] & (SPRL_TILE_OVFL|SPRL_HAVE_MASK0)) == (SPRL_TILE_OVFL|SPRL_HAVE_MASK0)) + return; // masking effective due to tile overflow // Go through sprites backwards: - for (cnt--; cnt >= 0; cnt--) + w = p[cnt]; // possibly clipped width of last sprite + for (cnt--; cnt >= 0; cnt--, w = 0) { int *sprite, code, pal, tile, sx, sy; int offs, delta, width, height, row; @@ -940,6 +949,7 @@ static void DrawSpritesSHi(unsigned char *sprited, const struct PicoEState *est) tile &= 0x7ff; tile<<=4; tile+=(row&7)<<1; // Tile address delta<<=4; // Delta of address + if (w) width = w; // tile limit for (; width; width--,sx+=8,tile+=delta) { 
unsigned int pack; @@ -967,7 +977,9 @@ static void DrawSpritesHiAS(unsigned char *sprited, int sh) if (cnt == 0) return; memset(mb, 0xff, sizeof(mb)); - p = &sprited[3]; + p = &sprited[4]; + if ((sprited[1] & (SPRL_TILE_OVFL|SPRL_HAVE_MASK0)) == (SPRL_TILE_OVFL|SPRL_HAVE_MASK0)) + return; // masking effective due to tile overflow // Go through sprites: for (entry = 0; entry < cnt; entry++) @@ -1019,6 +1031,7 @@ static void DrawSpritesHiAS(unsigned char *sprited, int sh) tile &= 0x7ff; tile<<=4; tile+=(row&7)<<1; // Tile address delta<<=4; // Delta of address + if (entry+1 == cnt) width = p[entry+1]; // last sprite width limited? for (; width; width--,sx+=8,tile+=delta) { unsigned int pack; @@ -1065,10 +1078,10 @@ static NOINLINE void PrepareSprites(int full) { int pack; // updates: tilecode, sx - for (u=0; u < max_sprites && (pack = *pd); u++, pd+=2) + for (u=0; u < max_sprites && link < max_sprites && (pack = *pd); u++, pd+=2) { unsigned int *sprite; - int code2, sx, sy, height; + int code2, sx, sy, height, width; sprite=(unsigned int *)(PicoMem.vram+((table+(link<<2))&0x7ffc)); // Find sprite @@ -1078,25 +1091,29 @@ static NOINLINE void PrepareSprites(int full) sx -= 0x78; // Get X coordinate + 8 sy = (pack << 16) >> 16; height = (pack >> 24) & 0xf; + width = (pack >> 28); if (sy < max_lines && - sy + (height<<3) > est->DrawScanline && // sprite onscreen (y)? - (sx > -24 || sx < max_width)) // onscreen x + sy + (height<<3) > est->DrawScanline) // sprite onscreen (y)? { int y = (sy >= est->DrawScanline) ? sy : est->DrawScanline; int entry = ((pd - HighPreSpr) / 2) | ((code2>>8)&0x80); for (; y < sy + (height<<3) && y < max_lines; y++) { int i, cnt; - cnt = HighLnSpr[y][0] & 0x7f; - if (cnt >= max_line_sprites) continue; // sprite limit? + cnt = HighLnSpr[y][0]; + if (HighLnSpr[y][3] >= max_line_sprites) continue; // sprite limit? 
for (i = 0; i < cnt; i++) - if (((HighLnSpr[y][3+i] ^ entry) & 0x7f) == 0) goto found; + if (((HighLnSpr[y][4+i] ^ entry) & 0x7f) == 0) goto found; // this sprite was previously missing - HighLnSpr[y][3+cnt] = entry; - HighLnSpr[y][0] = cnt + 1; + HighLnSpr[y][3] ++; + if (sx > -24 && sx < max_width) { // onscreen x + HighLnSpr[y][4+cnt] = entry; // XXX wrong sequence? + HighLnSpr[y][5+cnt] = width; // XXX should count tiles for limit + HighLnSpr[y][0] = cnt + 1; + } found:; if (entry & 0x80) HighLnSpr[y][1] |= SPRL_HAVE_HI; @@ -1118,7 +1135,7 @@ found:; for (u = 0; u < max_lines; u++) *((int *)&HighLnSpr[u][0]) = 0; - for (u = 0; u < max_sprites; u++) + for (u = 0; u < max_sprites && link < max_sprites; u++) { unsigned int *sprite; int code, code2, sx, sy, hv, height, width; @@ -1138,7 +1155,7 @@ found:; if (sy < max_lines && sy + (height<<3) > est->DrawScanline) // sprite onscreen (y)? { - int entry, y, sx_min, onscr_x, maybe_op = 0; + int entry, y, w, sx_min, onscr_x, maybe_op = 0; sx_min = 8-(width<<3); onscr_x = sx_min < sx && sx < max_width; @@ -1149,29 +1166,36 @@ found:; y = (sy >= est->DrawScanline) ? sy : est->DrawScanline; for (; y < sy + (height<<3) && y < max_lines; y++) { - unsigned char *p = &HighLnSpr[y][0]; + unsigned char *p = &HighLnSpr[y][0]; int cnt = p[0]; - if (cnt >= max_line_sprites) continue; // sprite limit? - - if (p[2] >= max_line_sprites*2) { // tile limit? - p[0] |= 0x80; - continue; + if (p[3] >= max_line_sprites) continue; // sprite limit? + if ((p[1] & SPRL_MASKED) && !(entry & 0x80)) continue; // masked? + + w = width; + if (p[2] + width > max_line_sprites*2) { // tile limit? 
+ if (y+1 < 240) HighLnSpr[y+1][1] |= SPRL_TILE_OVFL; + if (p[2] >= max_line_sprites*2) continue; + w = max_line_sprites*2 - p[2]; } - p[2] += width; + p[2] += w; + p[3] ++; if (sx == -0x78) { - if (cnt > 0) - p[0] |= 0x80; // masked, no more sprites for this line - continue; - } - // must keep the first sprite even if it's offscreen, for masking - if (cnt > 0 && !onscr_x) continue; // offscreen x + if (p[1] & (SPRL_HAVE_X|SPRL_TILE_OVFL)) + p[1] |= SPRL_MASKED; // masked, no more low sprites for this line + if (!(p[1] & SPRL_HAVE_X) && cnt == 0) + p[1] |= SPRL_HAVE_MASK0; // 1st sprite is masking + } else + p[1] |= SPRL_HAVE_X; + + if (!onscr_x) continue; // offscreen x - p[3+cnt] = entry; + p[4+cnt] = entry; + p[5+cnt] = w; // width clipped by tile limit for sprite renderer p[0] = cnt + 1; p[1] |= (entry & 0x80) ? SPRL_HAVE_HI : SPRL_HAVE_LO; p[1] |= maybe_op; // there might be op sprites on this line - if (cnt > 0 && (code2 & 0x8000) && !(p[3+cnt-1]&0x80)) + if (cnt > 0 && (code2 & 0x8000) && !(p[4+cnt-1]&0x80)) p[1] |= SPRL_LO_ABOVE_HI; } } @@ -1189,9 +1213,10 @@ found:; for (u = 0; u < max_lines; u++) { int y; - printf("c%03i: %2i, %2i: ", u, HighLnSpr[u][0] & 0x7f, HighLnSpr[u][2]); - for (y = 0; y < HighLnSpr[u][0] & 0x7f; y++) - printf(" %i", HighLnSpr[u][y+3]); + printf("c%03i: f %x c %2i/%2i w %2i: ", u, HighLnSpr[u][1], + HighLnSpr[u][0], HighLnSpr[u][3], HighLnSpr[u][2]); + for (y = 0; y < HighLnSpr[u][0]; y++) + printf(" %i", HighLnSpr[u][y+4]); printf("\n"); } #endif @@ -1203,20 +1228,22 @@ static void DrawAllSprites(unsigned char *sprited, int prio, int sh, struct PicoEState *est) { unsigned char *p; - int cnt; + int cnt, w = sprited[2]; cnt = sprited[0] & 0x7f; if (cnt == 0) return; - p = &sprited[3]; + p = &sprited[4]; + if ((sprited[1] & (SPRL_TILE_OVFL|SPRL_HAVE_MASK0)) == (SPRL_TILE_OVFL|SPRL_HAVE_MASK0)) + return; // masking effective due to tile overflow // Go through sprites backwards: - for (cnt--; cnt >= 0; cnt--) + w = p[cnt]; // possibly 
clipped width of last sprite + for (cnt--; cnt >= 0; cnt--, w = 0) { - int offs; + int *sp = HighPreSpr + (p[cnt]&0x7f) * 2; if ((p[cnt] >> 7) != prio) continue; - offs = (p[cnt]&0x7f) * 2; - DrawSprite(HighPreSpr + offs, sh); + DrawSprite(sp, sh, w); } } diff --git a/pico/draw2.c b/pico/draw2.c index 38a90ef3b..85e2b2759 100644 --- a/pico/draw2.c +++ b/pico/draw2.c @@ -420,12 +420,13 @@ static void DrawAllSpritesFull(int prio, int maxwidth) int i,u,link=0; unsigned int *sprites[80]; // Sprites int y_min=START_ROW*8, y_max=END_ROW*8; // for a simple sprite masking + int max_sprites = Pico.video.reg[12]&1 ? 80 : 64; table=pvid->reg[5]&0x7f; if (pvid->reg[12]&1) table&=0x7e; // Lowest bit 0 in 40-cell mode table<<=8; // Get sprite table address/2 - for (i=u=0; u < 80; u++) + for (i = u = 0; u < max_sprites && link < max_sprites; u++) { unsigned int *sprite=NULL; int code, code2, sx, sy, height; diff --git a/pico/draw_arm.S b/pico/draw_arm.S index 2efc804c4..fb6d0950f 100644 --- a/pico/draw_arm.S +++ b/pico/draw_arm.S @@ -942,17 +942,23 @@ DrawTilesFromCache: .global DrawSpritesSHi DrawSpritesSHi: - ldr r3, [r0] + ldrb r3, [r0] mov r12,#0xff ands r3, r3, #0x7f bxeq lr - stmfd sp!, {r1,r4-r11,lr} @ +est - strb r12,[r0,#2] @ set end marker - add r10,r0, #3 @ r10=HighLnSpr end + stmfd sp!, {r1,r3-r11,lr} @ +est + strb r12,[r0,#3] @ set end marker + ldrb r12,[r0,#1] + add r10,r0, #4 @ r10=HighLnSpr end + mvn r12,r12 + tst r12,#0x6 @ masking in slot 1 and tile ovfl? 
+ ldmeqfd sp!, {r1,r3-r11,pc} add r10,r10,r3 @ r10=HighLnSpr end + ldrb r12,[r10,#0] @ width of last sprite ldr r11,[r1, #OFS_EST_HighCol] + str r12,[sp, #4] mov r12,#0xf ldr lr, [r1, #OFS_EST_PicoMem_vram] @@ -963,7 +969,7 @@ DrawSpriteSHi: ldr r7, [sp] @ est ldr r1, [r7, #OFS_EST_HighPreSpr] cmp r0, #0xff - ldmeqfd sp!, {r1,r4-r11,pc} @ end of list + ldmeqfd sp!, {r1,r3-r11,pc} @ end of list and r0, r0, #0x7f add r0, r1, r0, lsl #3 @@ -1007,10 +1013,16 @@ DrawSpriteSHi: and r7, r7, #7 add r8, r8, r7, lsl #1 @ tile+=(row&7)<<1; // Tile address + ldr r0, [sp, #4] + add r6, r6, #1 @ inc now + cmp r0, #0 @ check width of last sprite + movne r6, r0 + movne r0, #0 + strne r0, [sp, #4] + mov r5, r5, lsl #4 @ delta<<=4; // Delta of address mov r3, r4, lsr #9 @ r3=pal=((code>>9)&0x30); - add r6, r6, #1 @ inc now adds r0, r2, #0 @ mov sx to r0 and set ZV flags b .dsprShi_loop_enter @@ -1126,11 +1138,18 @@ DrawAllSprites: @ time to do some real work stmfd sp!, {r1,r3-r11,lr} @ +sh|prio<<1 +est mov r12,#0xff - strb r12,[r0,#2] @ set end marker - add r10,r0, #3 + strb r12,[r0,#3] @ set end marker + ldrb r12,[r0,#1] + add r10,r0 ,#4 + mvn r12,r12 + tst r12,#0x6 @ masking in slot 1 and tile ovfl? + ldmeqfd sp!, {r1,r3-r11,pc} add r10,r10,r2 @ r10=HighLnSpr end + ldrb r12,[r10,#0] @ width of last sprite ldr r11,[r3, #OFS_EST_HighCol] + orr r1 ,r1 ,r12,lsl #24 + str r1, [sp] mov r12,#0xf ldr lr, [r3, #OFS_EST_PicoMem_vram] @@ -1140,13 +1159,13 @@ DrawAllSprites: DrawSprite: @ draw next sprite ldrb r0, [r10,#-1]! 
- ldr r8, [sp] @ sh|prio<<1 + ldr r4, [sp] @ sh|prio<<1|lastw<<24 ldr r7, [sp, #4] @ est - mov r2, r0, lsr #7 + mov r2, r0, lsl #24 cmp r0, #0xff ldmeqfd sp!, {r1,r3-r11,pc} @ end of list - cmp r2, r8, lsr #1 - bne DrawSprite @ wrong priority + eor r2, r2, r4, lsl #30 + bmi DrawSprite @ wrong priority ldr r1, [r7, #OFS_EST_HighPreSpr] and r0, r0, #0x7f add r0, r1, r0, lsl #3 @@ -1158,20 +1177,20 @@ DrawSprite: mov r5, r3, lsr #24 and r5, r5, #7 @ r5=height - mov r4, r3, lsl #16 @ r4=sy<<16 (tmp) + mov r8, r3, lsl #16 @ r8=sy<<16 (tmp) ldr r9, [r0, #4] - sub r7, r7, r4, asr #16 @ r7=row=DrawScanline-sy + sub r7, r7, r8, asr #16 @ r7=row=DrawScanline-sy mov r2, r9, asr #16 @ r2=sx mov r9, r9, lsl #16 mov r9, r9, lsr #16 - orr r9, r9, r8, lsl #31 @ r9=code|sh[31] + orr r9, r9, r4, lsl #31 @ r9=code|sh[31] tst r9, #0x1000 - movne r4, r5, lsl #3 - subne r4, r4, #1 - subne r7, r4, r7 @ if (code&0x1000) row=(height<<3)-1-row; // Flip Y + movne r8, r5, lsl #3 + subne r8, r8, #1 + subne r7, r8, r7 @ if (code&0x1000) row=(height<<3)-1-row; // Flip Y add r8, r9, r7, lsr #3 @ tile+=row>>3; // Tile number increases going down tst r9, #0x0800 @@ -1183,7 +1202,12 @@ DrawSprite: and r7, r7, #7 add r8, r8, r7, lsl #1 @ tile+=(row&7)<<1; // Tile address -.dspr_continue: + add r6, r6, #1 @ inc now + cmp r4, #0x1000000 @ check width of last sprite + movhs r6, r4, lsr #24 + bichs r4, r4, #0xff000000 + strhs r4, [sp] + @ cache some stuff to avoid mem access mov r5, r5, lsl #4 @ delta<<=4; // Delta of address and r4, r9, #0x6000 @@ -1193,7 +1217,6 @@ DrawSprite: mov r3, r4, lsr #9 @ r3=pal=((code>>9)&0x30); orrmi r3, r3, #0x40 @ for sh/hi - add r6, r6, #1 @ inc now adds r0, r2, #0 @ mov sx to r0 and set ZV flags b .dspr_loop_enter diff --git a/pico/pico_int.h b/pico/pico_int.h index 70bfa7104..a24fc6f60 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -667,8 +667,8 @@ void FinalizeLine555(int sh, int line, struct PicoEState *est); void PicoDrawSetOutBufMD(void *dest, int increment); 
extern int (*PicoScanBegin)(unsigned int num); extern int (*PicoScanEnd)(unsigned int num); -#define MAX_LINE_SPRITES 29 -extern unsigned char HighLnSpr[240][3 + MAX_LINE_SPRITES]; +#define MAX_LINE_SPRITES 27 // +1 last sprite width, +4 hdr; total 32 +extern unsigned char HighLnSpr[240][4+MAX_LINE_SPRITES+1]; extern void *DrawLineDestBase; extern int DrawLineDestIncrement; diff --git a/pico/videoport.c b/pico/videoport.c index d196ee4ff..c2fbd0cae 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -200,6 +200,7 @@ static void DmaSlow(int len, unsigned int source) a = (a + inc) & 0x1ffff; } Pico.video.addr_u = a >> 16; + Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; break; default: @@ -266,6 +267,7 @@ static NOINLINE void DmaFill(int data) // Increment address register a = (u16)(a + inc); } + Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; break; case 3: // cram case 5: { // vsram @@ -289,7 +291,6 @@ static NOINLINE void DmaFill(int data) Pico.video.reg[0x15] = source; Pico.video.reg[0x16] = source >> 8; - Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; } static NOINLINE void CommandDma(void) From 9c1d9b17ce59c2493fc7433e61bf6402c9b5fd8b Mon Sep 17 00:00:00 2001 From: kub Date: Sun, 26 Jan 2020 20:43:05 +0100 Subject: [PATCH 097/174] added debug reg sprite plane support (fixes some issues in overdrive 2 demo) --- pico/draw.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/pico/draw.c b/pico/draw.c index 2922914fd..dfbd53cc3 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -223,6 +223,14 @@ TileFlipMakerAS(TileFlipAS_onlymark, pix_sh_as_onlymark) TileNormMaker(TileNorm_and, pix_and) TileFlipMaker(TileFlip_and, pix_and) +// forced sprite draw (through debug reg) +#define pix_sh_and(x) /* XXX is there S/H with forced draw? 
*/ \ + if (t>=0xe) pd[x]=(pd[x]&0x3f)|(t<<6); /* c0 shadow, 80 hilight */ \ + else pd[x] = (pd[x] & 0xc0) | (pd[x] & (pal | t)) + +TileNormMaker(TileNormSH_and, pix_sh_and) +TileFlipMaker(TileFlipSH_and, pix_sh_and) + // -------------------------------------------- #ifndef _ASM_DRAW_C @@ -1045,6 +1053,66 @@ static void DrawSpritesHiAS(unsigned char *sprited, int sh) } } +static void DrawSpritesForced(unsigned char *sprited) +{ + void (*fTileFunc)(unsigned char *pd, unsigned int pack, int pal); + unsigned char *pd = Pico.est.HighCol; + unsigned char *p; + int entry, cnt; + + cnt = sprited[0] & 0x7f; + if (cnt == 0) return; + + p = &sprited[4]; + if ((sprited[1] & (SPRL_TILE_OVFL|SPRL_HAVE_MASK0)) == (SPRL_TILE_OVFL|SPRL_HAVE_MASK0)) + return; // masking effective due to tile overflow + + // Go through sprites: + for (entry = 0; entry < cnt; entry++) + { + int *sprite, code, pal, tile, sx, sy; + int offs, delta, width, height, row; + + offs = (p[entry] & 0x7f) * 2; + sprite = HighPreSpr + offs; + code = sprite[1]; + pal = (code>>9)&0x30; + + if (code&0x800) fTileFunc = TileFlipSH_and; + else fTileFunc = TileNormSH_and; + + // parse remaining sprite data + sy=sprite[0]; + sx=code>>16; // X + width=sy>>28; + height=(sy>>24)&7; // Width and height in tiles + sy=(sy<<16)>>16; // Y + + row=Pico.est.DrawScanline-sy; // Row of the sprite we are on + + if (code&0x1000) row=(height<<3)-1-row; // Flip Y + + tile=code + (row>>3); // Tile number increases going down + delta=height; // Delta to increase tile by going right + if (code&0x0800) { tile+=delta*(width-1); delta=-delta; } // Flip X + + tile &= 0x7ff; tile<<=4; tile+=(row&7)<<1; // Tile address + delta<<=4; // Delta of address + + if (entry+1 == cnt) width = p[entry+1]; // last sprite width limited? 
+ for (; width; width--,sx+=8,tile+=delta) + { + unsigned int pack; + + if(sx<=0) continue; + if(sx>=328) break; // Offscreen + + pack = *(unsigned int *)(PicoMem.vram + (tile & 0x7fff)); + fTileFunc(pd + sx, pack, pal); + } + } +} + // Index + 0 : ----hhvv -lllllll -------y yyyyyyyy // Index + 4 : -------x xxxxxxxx pccvhnnn nnnnnnnn @@ -1529,6 +1597,8 @@ static int DrawDisplay(int sh) DrawTilesFromCacheForced(HighCacheB); else if (pvid->debug_p & PVD_FORCE_A) DrawTilesFromCacheForced(HighCacheA); + else if (pvid->debug_p & PVD_FORCE_S) + DrawSpritesForced(sprited); #if 0 { @@ -1621,7 +1691,7 @@ static void PicoLine(int line, int offs, int sh, int bgc) return; } - if (Pico.video.debug_p & (PVD_FORCE_A | PVD_FORCE_B)) + if (Pico.video.debug_p & (PVD_FORCE_A | PVD_FORCE_B | PVD_FORCE_S)) bgc = 0x3f; // Draw screen: From 85d333ab79e2a585e11ff865865d44ed6a073046 Mon Sep 17 00:00:00 2001 From: kub Date: Sun, 26 Jan 2020 20:46:21 +0100 Subject: [PATCH 098/174] VDP timing improvements --- pico/pico.c | 3 ++- pico/pico_cmn.c | 2 +- pico/pico_int.h | 5 ++--- pico/videoport.c | 10 ++++++---- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pico/pico.c b/pico/pico.c index 2a16a0e23..b65b7de86 100644 --- a/pico/pico.c +++ b/pico/pico.c @@ -254,7 +254,7 @@ PICO_INTERNAL int CheckDMA(int cycles) dma_op1 = dma_op; if(Pico.video.reg[12] & 1) dma_op |= 4; // 40 cell mode? if(!(Pico.video.status&8)&&(Pico.video.reg[1]&0x40)) dma_op|=8; // active display? 
- xfers_can = (dma_timings[dma_op] * cycles + 0xff) >> 16; + xfers_can = (dma_timings[dma_op] * cycles + 0x8000) >> 16; if(xfers <= xfers_can) { Pico.video.status &= ~SR_DMA; @@ -265,6 +265,7 @@ PICO_INTERNAL int CheckDMA(int cycles) if(!(dma_op&2)) burn = cycles; Pico.m.dma_xfers -= xfers_can; } + Pico.t.dma_end = SekCyclesDone() + burn; elprintf(EL_VDPDMA, "~Dma %i op=%i can=%i burn=%i [%u]", Pico.m.dma_xfers, dma_op1, xfers_can, burn, SekCyclesDone()); diff --git a/pico/pico_cmn.c b/pico/pico_cmn.c index 8c22c9773..b7e7d8358 100644 --- a/pico/pico_cmn.c +++ b/pico/pico_cmn.c @@ -56,10 +56,10 @@ static void SekSyncM68k(void) static __inline void SekRunM68k(int cyc) { Pico.t.m68c_aim += cyc; + Pico.t.m68c_cnt += cyc >> 6; // refresh slowdowns cyc = Pico.t.m68c_aim - Pico.t.m68c_cnt; if (cyc <= 0) return; - Pico.t.m68c_cnt += cyc >> 6; // refresh slowdowns SekSyncM68k(); } diff --git a/pico/pico_int.h b/pico/pico_int.h index a24fc6f60..357de4a9f 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -137,9 +137,7 @@ extern m68ki_cpu_core PicoCpuMM68k, PicoCpuMS68k; // burn cycles while not in SekRun() and while in #define SekCyclesBurn(c) Pico.t.m68c_cnt += c -#define SekCyclesBurnRun(c) { \ - SekCyclesLeft -= c; \ -} +#define SekCyclesBurnRun(c) SekCyclesLeft -= c // note: sometimes may extend timeslice to delay an irq #define SekEndRun(after) { \ @@ -421,6 +419,7 @@ struct PicoTiming unsigned int z80c_aim; int z80_scanline; + unsigned int dma_end; // end of current DMA op (m68k cycles) int timer_a_next_oflow, timer_a_step; // in z80 cycles int timer_b_next_oflow, timer_b_step; }; diff --git a/pico/videoport.c b/pico/videoport.c index c2fbd0cae..16a731192 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -376,12 +376,12 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) pvid->pending=0; } - if (!(pvid->status & SR_VB) && !(PicoIn.opt&POPT_DIS_VDP_FIFO)) + if (!(pvid->status & SR_VB) && (pvid->reg[1]&0x40) && 
!(PicoIn.opt&POPT_DIS_VDP_FIFO)) { int use = pvid->type == 1 ? 2 : 1; pvid->lwrite_cnt -= use; if (pvid->lwrite_cnt < 0) - SekCyclesLeft = 0; + SekCyclesBurnRun(488 - (SekCyclesDone()-Pico.t.m68c_line_start)); elprintf(EL_ASVDP, "VDP data write: [%04x] %04x [%u] {%i} #%i @ %06x", Pico.video.addr, d, SekCyclesDone(), Pico.video.type, pvid->lwrite_cnt, SekPc); } @@ -509,9 +509,11 @@ static u32 SrLow(const struct PicoVideo *pv) { unsigned int c, d = pv->status; - c = SekCyclesDone() - Pico.t.m68c_line_start - 39; - if (c < 92) + c = SekCyclesDone(); + if (c - Pico.t.m68c_line_start - 39 < 92) d |= SR_HB; + if (CYCLES_GT(c, Pico.t.dma_end)) + d &= ~SR_DMA; return d; } From e0a39a3dd165b86eeb3b4b8c7d2995f03d5dcf99 Mon Sep 17 00:00:00 2001 From: kub Date: Sun, 26 Jan 2020 20:48:25 +0100 Subject: [PATCH 099/174] improved VRAM128K support (overdrive 2) --- pico/videoport.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/pico/videoport.c b/pico/videoport.c index 16a731192..cdc5796ce 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -22,11 +22,12 @@ static __inline void AutoIncrement(void) Pico.video.addr=(unsigned short)(Pico.video.addr+Pico.video.reg[0xf]); } -static NOINLINE void VideoWrite128(u32 a, u16 d) +static NOINLINE unsigned int VideoWrite128(u32 a, u16 d) { // nasty a = ((a & 2) >> 1) | ((a & 0x400) >> 9) | (a & 0x3FC) | ((a & 0x1F800) >> 1); ((u8 *)PicoMem.vram)[a] = d; + return a; } static void VideoWrite(u16 d) @@ -38,16 +39,19 @@ static void VideoWrite(u16 d) case 1: if (a & 1) d = (u16)((d << 8) | (d >> 8)); PicoMem.vram [(a >> 1) & 0x7fff] = d; - if (a - ((unsigned)(Pico.video.reg[5]&0x7f) << 9) < 0x400) + if ((unsigned)(a - ((Pico.video.reg[5]&0x7f) << 9)) < 0x400) Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; break; case 3: if (PicoMem.cram [(a >> 1) & 0x3f] != d) Pico.m.dirtyPal = 1; PicoMem.cram [(a >> 1) & 0x3f] = d; break; case 5: PicoMem.vsram[(a >> 1) & 0x3f] = d; break; - case 0x81: - a |= 
Pico.video.addr_u << 16; - VideoWrite128(a, d); - break; + case 0x81: if (a & 1) + d = (u16)((d << 8) | (d >> 8)); + a |= Pico.video.addr_u << 16; + a = VideoWrite128(a, d); + if ((unsigned)(a - ((Pico.video.reg[5]&0x7f) << 9)) < 0x400) + Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; + break; //default:elprintf(EL_ANOMALY, "VDP write %04x with bad type %i", d, Pico.video.type); break; } @@ -276,6 +280,16 @@ static NOINLINE void DmaFill(int data) if (!once++) elprintf(EL_STATUS|EL_ANOMALY|EL_VDPDMA, "TODO: cram/vsram fill"); } + case 0x81: + for (l = len; l; l--) { + VideoWrite128(a, data); + + // Increment address register + a = (a + inc) & 0x1ffff; + } + Pico.video.addr_u = a >> 16; + Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; + break; default: a += len * inc; break; From b1640ba1d1a361464196c59f2843bb379b53c342 Mon Sep 17 00:00:00 2001 From: kub Date: Sun, 26 Jan 2020 20:49:20 +0100 Subject: [PATCH 100/174] regression fix for gp2x 8bit fast mode --- platform/common/arm_utils.s | 1 + platform/linux/blit.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/platform/common/arm_utils.s b/platform/common/arm_utils.s index 9e8d9f250..6696e5afe 100644 --- a/platform/common/arm_utils.s +++ b/platform/common/arm_utils.s @@ -141,6 +141,7 @@ vidcpy_m2: movne lr, #64 tstne r3, r3 addne r0, r0, #32 + addne r1, r1, #32 vidCpyM2_loop_out: mov r6, #10 diff --git a/platform/linux/blit.c b/platform/linux/blit.c index 96326fe13..82bc4ba53 100644 --- a/platform/linux/blit.c +++ b/platform/linux/blit.c @@ -61,10 +61,11 @@ void vidcpy_m2(void *dest, void *src, int m32col, int with_32c_border) for (i = 0; i < 224; i++) { ps += 8; + ps += 32; pd += 32; for (u = 0; u < 256; u++) *pd++ = *ps++; - ps += 64; + ps += 32; pd += 32; } } else { From 134ae4b2dd012da3338e0b2b4ede545d381507c9 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 7 Feb 2020 19:55:05 +0100 Subject: [PATCH 101/174] new hvcounter tables as per spritesmind.net threads --- pico/misc.c | 136 
+++++++++++++++++++++++++--------------------------- 1 file changed, 64 insertions(+), 72 deletions(-) diff --git a/pico/misc.c b/pico/misc.c index 47842e3fa..ab282c247 100644 --- a/pico/misc.c +++ b/pico/misc.c @@ -8,84 +8,76 @@ #include "pico_int.h" -// H-counter table for hvcounter reads in 40col mode -// based on Gens code +// H-counter table for hvcounter reads in 40col mode, starting at HINT const unsigned char hcounts_40[] = { -0x07,0x07,0x08,0x08,0x08,0x09,0x09,0x0a,0x0a,0x0b,0x0b,0x0b,0x0c,0x0c,0x0d,0x0d, -0x0e,0x0e,0x0e,0x0f,0x0f,0x10,0x10,0x10,0x11,0x11,0x12,0x12,0x13,0x13,0x13,0x14, -0x14,0x15,0x15,0x15,0x16,0x16,0x17,0x17,0x18,0x18,0x18,0x19,0x19,0x1a,0x1a,0x1b, -0x1b,0x1b,0x1c,0x1c,0x1d,0x1d,0x1d,0x1e,0x1e,0x1f,0x1f,0x20,0x20,0x20,0x21,0x21, -0x22,0x22,0x23,0x23,0x23,0x24,0x24,0x25,0x25,0x25,0x26,0x26,0x27,0x27,0x28,0x28, -0x28,0x29,0x29,0x2a,0x2a,0x2a,0x2b,0x2b,0x2c,0x2c,0x2d,0x2d,0x2d,0x2e,0x2e,0x2f, -0x2f,0x30,0x30,0x30,0x31,0x31,0x32,0x32,0x32,0x33,0x33,0x34,0x34,0x35,0x35,0x35, -0x36,0x36,0x37,0x37,0x38,0x38,0x38,0x39,0x39,0x3a,0x3a,0x3a,0x3b,0x3b,0x3c,0x3c, -0x3d,0x3d,0x3d,0x3e,0x3e,0x3f,0x3f,0x3f,0x40,0x40,0x41,0x41,0x42,0x42,0x42,0x43, -0x43,0x44,0x44,0x45,0x45,0x45,0x46,0x46,0x47,0x47,0x47,0x48,0x48,0x49,0x49,0x4a, -0x4a,0x4a,0x4b,0x4b,0x4c,0x4c,0x4d,0x4d,0x4d,0x4e,0x4e,0x4f,0x4f,0x4f,0x50,0x50, -0x51,0x51,0x52,0x52,0x52,0x53,0x53,0x54,0x54,0x55,0x55,0x55,0x56,0x56,0x57,0x57, -0x57,0x58,0x58,0x59,0x59,0x5a,0x5a,0x5a,0x5b,0x5b,0x5c,0x5c,0x5c,0x5d,0x5d,0x5e, -0x5e,0x5f,0x5f,0x5f,0x60,0x60,0x61,0x61,0x62,0x62,0x62,0x63,0x63,0x64,0x64,0x64, -0x65,0x65,0x66,0x66,0x67,0x67,0x67,0x68,0x68,0x69,0x69,0x6a,0x6a,0x6a,0x6b,0x6b, -0x6c,0x6c,0x6c,0x6d,0x6d,0x6e,0x6e,0x6f,0x6f,0x6f,0x70,0x70,0x71,0x71,0x71,0x72, -0x72,0x73,0x73,0x74,0x74,0x74,0x75,0x75,0x76,0x76,0x77,0x77,0x77,0x78,0x78,0x79, -0x79,0x79,0x7a,0x7a,0x7b,0x7b,0x7c,0x7c,0x7c,0x7d,0x7d,0x7e,0x7e,0x7f,0x7f,0x7f, -0x80,0x80,0x81,0x81,0x81,0x82,0x82,0x83,0x83,0x84,0x84,0x84,0x85,0x85,0x86,0x86, 
-0x86,0x87,0x87,0x88,0x88,0x89,0x89,0x89,0x8a,0x8a,0x8b,0x8b,0x8c,0x8c,0x8c,0x8d, -0x8d,0x8e,0x8e,0x8e,0x8f,0x8f,0x90,0x90,0x91,0x91,0x91,0x92,0x92,0x93,0x93,0x94, -0x94,0x94,0x95,0x95,0x96,0x96,0x96,0x97,0x97,0x98,0x98,0x99,0x99,0x99,0x9a,0x9a, -0x9b,0x9b,0x9b,0x9c,0x9c,0x9d,0x9d,0x9e,0x9e,0x9e,0x9f,0x9f,0xa0,0xa0,0xa1,0xa1, -0xa1,0xa2,0xa2,0xa3,0xa3,0xa3,0xa4,0xa4,0xa5,0xa5,0xa6,0xa6,0xa6,0xa7,0xa7,0xa8, -0xa8,0xa9,0xa9,0xa9,0xaa,0xaa,0xab,0xab,0xab,0xac,0xac,0xad,0xad,0xae,0xae,0xae, -0xaf,0xaf,0xb0,0xb0, -0xe4,0xe4,0xe4,0xe5,0xe5,0xe6,0xe6,0xe6,0xe7,0xe7,0xe8,0xe8,0xe9,0xe9,0xe9,0xea, -0xea,0xeb,0xeb,0xeb,0xec,0xec,0xed,0xed,0xee,0xee,0xee,0xef,0xef,0xf0,0xf0,0xf1, -0xf1,0xf1,0xf2,0xf2,0xf3,0xf3,0xf3,0xf4,0xf4,0xf5,0xf5,0xf6,0xf6,0xf6,0xf7,0xf7, -0xf8,0xf8,0xf9,0xf9,0xf9,0xfa,0xfa,0xfb,0xfb,0xfb,0xfc,0xfc,0xfd,0xfd,0xfe,0xfe, -0xfe,0xff,0xff,0x00,0x00,0x00,0x01,0x01,0x02,0x02,0x03,0x03,0x03,0x04,0x04,0x05, -0x05,0x06,0x06,0x06, -0x07,0x07,0x08,0x08,0x08,0x09,0x09,0x0a,0x0a,0x0b,0x0b,0x0b,0x0c,0x0c,0x0d,0x0d, -0x0e,0x0e,0x0e,0x0f,0x0f,0x10,0x10,0x10, +0xa5,0xa5,0xa5,0xa6,0xa6,0xa7,0xa7,0xa8,0xa8,0xa8,0xa9,0xa9,0xaa,0xaa,0xab,0xab, +0xac,0xac,0xac,0xad,0xad,0xae,0xae,0xaf,0xaf,0xaf,0xb0,0xb0,0xb1,0xb1,0xb2,0xb2, +0xb3,0xb3,0xb3,0xb4,0xb4,0xb5,0xb5,0xb6,0xe4,0xe4,0xe5,0xe5,0xe6,0xe6,0xe7,0xe7, +0xe7,0xe8,0xe8,0xe8,0xe9,0xe9,0xe9,0xea,0xea,0xeb,0xeb,0xeb,0xec,0xec,0xec,0xed, +0xed,0xed,0xee,0xee,0xee,0xef,0xef,0xf0,0xf0,0xf0,0xf1,0xf1,0xf1,0xf2,0xf2,0xf2, +0xf3,0xf3,0xf3,0xf4,0xf4,0xf5,0xf5,0xf5,0xf6,0xf6,0xf6,0xf7,0xf7,0xf7,0xf8,0xf8, +0xf9,0xf9,0xfa,0xfa,0xfb,0xfb,0xfb,0xfc,0xfc,0xfd,0xfd,0xfe,0xfe,0xfe,0xff,0xff, +0x00,0x00,0x01,0x01,0x02,0x02,0x02,0x03,0x03,0x04,0x04,0x05,0x05,0x05,0x06,0x06, +0x07,0x07,0x08,0x08,0x09,0x09,0x09,0x0a,0x0a,0x0b,0x0b,0x0c,0x0c,0x0c,0x0d,0x0d, +0x0e,0x0e,0x0f,0x0f,0x10,0x10,0x10,0x11,0x11,0x12,0x12,0x13,0x13,0x13,0x14,0x14, +0x15,0x15,0x16,0x16,0x17,0x17,0x17,0x18,0x18,0x19,0x19,0x1a,0x1a,0x1a,0x1b,0x1b, 
+0x1c,0x1c,0x1d,0x1d,0x1e,0x1e,0x1e,0x1f,0x1f,0x20,0x20,0x21,0x21,0x21,0x22,0x22, +0x23,0x23,0x24,0x24,0x25,0x25,0x25,0x26,0x26,0x27,0x27,0x28,0x28,0x28,0x29,0x29, +0x2a,0x2a,0x2b,0x2b,0x2c,0x2c,0x2c,0x2d,0x2d,0x2e,0x2e,0x2f,0x2f,0x2f,0x30,0x30, +0x31,0x31,0x32,0x32,0x33,0x33,0x33,0x34,0x34,0x35,0x35,0x36,0x36,0x36,0x37,0x37, +0x38,0x38,0x39,0x39,0x3a,0x3a,0x3a,0x3b,0x3b,0x3c,0x3c,0x3d,0x3d,0x3d,0x3e,0x3e, +0x3f,0x3f,0x40,0x40,0x41,0x41,0x41,0x42,0x42,0x43,0x43,0x44,0x44,0x44,0x45,0x45, +0x46,0x46,0x47,0x47,0x48,0x48,0x48,0x49,0x49,0x4a,0x4a,0x4b,0x4b,0x4b,0x4c,0x4c, +0x4d,0x4d,0x4e,0x4e,0x4f,0x4f,0x4f,0x50,0x50,0x51,0x51,0x52,0x52,0x52,0x53,0x53, +0x54,0x54,0x55,0x55,0x56,0x56,0x56,0x57,0x57,0x58,0x58,0x59,0x59,0x59,0x5a,0x5a, +0x5b,0x5b,0x5c,0x5c,0x5d,0x5d,0x5d,0x5e,0x5e,0x5f,0x5f,0x60,0x60,0x60,0x61,0x61, +0x62,0x62,0x63,0x63,0x64,0x64,0x64,0x65,0x65,0x66,0x66,0x67,0x67,0x67,0x68,0x68, +0x69,0x69,0x6a,0x6a,0x6b,0x6b,0x6b,0x6c,0x6c,0x6d,0x6d,0x6e,0x6e,0x6e,0x6f,0x6f, +0x70,0x70,0x71,0x71,0x72,0x72,0x72,0x73,0x73,0x74,0x74,0x75,0x75,0x75,0x76,0x76, +0x77,0x77,0x78,0x78,0x79,0x79,0x79,0x7a,0x7a,0x7b,0x7b,0x7c,0x7c,0x7c,0x7d,0x7d, +0x7e,0x7e,0x7f,0x7f,0x80,0x80,0x80,0x81,0x81,0x82,0x82,0x83,0x83,0x83,0x84,0x84, +0x85,0x85,0x86,0x86,0x87,0x87,0x87,0x88,0x88,0x89,0x89,0x8a,0x8a,0x8a,0x8b,0x8b, +0x8c,0x8c,0x8d,0x8d,0x8e,0x8e,0x8e,0x8f,0x8f,0x90,0x90,0x91,0x91,0x91,0x92,0x92, +0x93,0x93,0x94,0x94,0x95,0x95,0x95,0x96,0x96,0x97,0x97,0x98,0x98,0x98,0x99,0x99, +0x9a,0x9a,0x9b,0x9b,0x9c,0x9c,0x9c,0x9d,0x9d,0x9e,0x9e,0x9f,0x9f,0x9f,0xa0,0xa0, +0xa1,0xa1,0xa2,0xa2,0xa3,0xa3,0xa3,0xa4,0xa5,0xa5,0xa5,0xa6,0xa6,0xa7,0xa7,0xa8, }; -// H-counter table for hvcounter reads in 32col mode +// H-counter table for hvcounter reads in 32col mode, starting at HINT const unsigned char hcounts_32[] = { -0x05,0x05,0x05,0x06,0x06,0x07,0x07,0x07,0x08,0x08,0x08,0x09,0x09,0x09,0x0a,0x0a, -0x0a,0x0b,0x0b,0x0b,0x0c,0x0c,0x0c,0x0d,0x0d,0x0d,0x0e,0x0e,0x0f,0x0f,0x0f,0x10, 
-0x10,0x10,0x11,0x11,0x11,0x12,0x12,0x12,0x13,0x13,0x13,0x14,0x14,0x14,0x15,0x15, -0x15,0x16,0x16,0x17,0x17,0x17,0x18,0x18,0x18,0x19,0x19,0x19,0x1a,0x1a,0x1a,0x1b, -0x1b,0x1b,0x1c,0x1c,0x1c,0x1d,0x1d,0x1d,0x1e,0x1e,0x1f,0x1f,0x1f,0x20,0x20,0x20, -0x21,0x21,0x21,0x22,0x22,0x22,0x23,0x23,0x23,0x24,0x24,0x24,0x25,0x25,0x26,0x26, -0x26,0x27,0x27,0x27,0x28,0x28,0x28,0x29,0x29,0x29,0x2a,0x2a,0x2a,0x2b,0x2b,0x2b, -0x2c,0x2c,0x2c,0x2d,0x2d,0x2e,0x2e,0x2e,0x2f,0x2f,0x2f,0x30,0x30,0x30,0x31,0x31, -0x31,0x32,0x32,0x32,0x33,0x33,0x33,0x34,0x34,0x34,0x35,0x35,0x36,0x36,0x36,0x37, -0x37,0x37,0x38,0x38,0x38,0x39,0x39,0x39,0x3a,0x3a,0x3a,0x3b,0x3b,0x3b,0x3c,0x3c, -0x3d,0x3d,0x3d,0x3e,0x3e,0x3e,0x3f,0x3f,0x3f,0x40,0x40,0x40,0x41,0x41,0x41,0x42, -0x42,0x42,0x43,0x43,0x43,0x44,0x44,0x45,0x45,0x45,0x46,0x46,0x46,0x47,0x47,0x47, -0x48,0x48,0x48,0x49,0x49,0x49,0x4a,0x4a,0x4a,0x4b,0x4b,0x4b,0x4c,0x4c,0x4d,0x4d, -0x4d,0x4e,0x4e,0x4e,0x4f,0x4f,0x4f,0x50,0x50,0x50,0x51,0x51,0x51,0x52,0x52,0x52, -0x53,0x53,0x53,0x54,0x54,0x55,0x55,0x55,0x56,0x56,0x56,0x57,0x57,0x57,0x58,0x58, -0x58,0x59,0x59,0x59,0x5a,0x5a,0x5a,0x5b,0x5b,0x5c,0x5c,0x5c,0x5d,0x5d,0x5d,0x5e, -0x5e,0x5e,0x5f,0x5f,0x5f,0x60,0x60,0x60,0x61,0x61,0x61,0x62,0x62,0x62,0x63,0x63, -0x64,0x64,0x64,0x65,0x65,0x65,0x66,0x66,0x66,0x67,0x67,0x67,0x68,0x68,0x68,0x69, -0x69,0x69,0x6a,0x6a,0x6a,0x6b,0x6b,0x6c,0x6c,0x6c,0x6d,0x6d,0x6d,0x6e,0x6e,0x6e, -0x6f,0x6f,0x6f,0x70,0x70,0x70,0x71,0x71,0x71,0x72,0x72,0x72,0x73,0x73,0x74,0x74, -0x74,0x75,0x75,0x75,0x76,0x76,0x76,0x77,0x77,0x77,0x78,0x78,0x78,0x79,0x79,0x79, -0x7a,0x7a,0x7b,0x7b,0x7b,0x7c,0x7c,0x7c,0x7d,0x7d,0x7d,0x7e,0x7e,0x7e,0x7f,0x7f, -0x7f,0x80,0x80,0x80,0x81,0x81,0x81,0x82,0x82,0x83,0x83,0x83,0x84,0x84,0x84,0x85, -0x85,0x85,0x86,0x86,0x86,0x87,0x87,0x87,0x88,0x88,0x88,0x89,0x89,0x89,0x8a,0x8a, -0x8b,0x8b,0x8b,0x8c,0x8c,0x8c,0x8d,0x8d,0x8d,0x8e,0x8e,0x8e,0x8f,0x8f,0x8f,0x90, -0x90,0x90,0x91,0x91, -0xe8,0xe8,0xe8,0xe9,0xe9,0xe9,0xea,0xea,0xea,0xeb,0xeb,0xeb,0xec,0xec,0xec,0xed, 
-0xed,0xed,0xee,0xee,0xee,0xef,0xef,0xf0,0xf0,0xf0,0xf1,0xf1,0xf1,0xf2,0xf2,0xf2, -0xf3,0xf3,0xf3,0xf4,0xf4,0xf4,0xf5,0xf5,0xf5,0xf6,0xf6,0xf6,0xf7,0xf7,0xf8,0xf8, -0xf8,0xf9,0xf9,0xf9,0xfa,0xfa,0xfa,0xfb,0xfb,0xfb,0xfc,0xfc,0xfc,0xfd,0xfd,0xfd, -0xfe,0xfe,0xfe,0xff,0xff,0x00,0x00,0x00,0x01,0x01,0x01,0x02,0x02,0x02,0x03,0x03, -0x03,0x04,0x04,0x04, -0x05,0x05,0x05,0x06,0x06,0x07,0x07,0x07,0x08,0x08,0x08,0x09,0x09,0x09,0x0a,0x0a, -0x0a,0x0b,0x0b,0x0b,0x0c,0x0c,0x0c,0x0d, +0x85,0x85,0x85,0x86,0x86,0x86,0x87,0x87,0x87,0x88,0x88,0x88,0x89,0x89,0x89,0x8a, +0x8a,0x8a,0x8b,0x8b,0x8c,0x8c,0x8c,0x8d,0x8d,0x8d,0x8e,0x8e,0x8e,0x8f,0x8f,0x8f, +0x90,0x90,0x90,0x91,0x91,0x91,0x92,0x92,0x93,0x93,0x93,0xe9,0xe9,0xe9,0xea,0xea, +0xea,0xeb,0xeb,0xeb,0xec,0xec,0xec,0xed,0xed,0xed,0xee,0xee,0xef,0xef,0xef,0xf0, +0xf0,0xf0,0xf1,0xf1,0xf1,0xf2,0xf2,0xf2,0xf3,0xf3,0xf3,0xf4,0xf4,0xf4,0xf5,0xf5, +0xf6,0xf6,0xf6,0xf7,0xf7,0xf7,0xf8,0xf8,0xf8,0xf9,0xf9,0xf9,0xfa,0xfa,0xfa,0xfb, +0xfb,0xfb,0xfc,0xfc,0xfd,0xfd,0xfd,0xfe,0xfe,0xfe,0xff,0xff,0xff,0x00,0x00,0x00, +0x01,0x01,0x01,0x02,0x02,0x02,0x03,0x03,0x04,0x04,0x04,0x05,0x05,0x05,0x06,0x06, +0x06,0x07,0x07,0x07,0x08,0x08,0x08,0x09,0x09,0x09,0x0a,0x0a,0x0b,0x0b,0x0b,0x0c, +0x0c,0x0c,0x0d,0x0d,0x0d,0x0e,0x0e,0x0e,0x0f,0x0f,0x0f,0x10,0x10,0x10,0x11,0x11, +0x12,0x12,0x12,0x13,0x13,0x13,0x14,0x14,0x14,0x15,0x15,0x15,0x16,0x16,0x16,0x17, +0x17,0x17,0x18,0x18,0x19,0x19,0x19,0x1a,0x1a,0x1a,0x1b,0x1b,0x1b,0x1c,0x1c,0x1c, +0x1d,0x1d,0x1d,0x1e,0x1e,0x1e,0x1f,0x1f,0x20,0x20,0x20,0x21,0x21,0x21,0x22,0x22, +0x22,0x23,0x23,0x23,0x24,0x24,0x24,0x25,0x25,0x25,0x26,0x26,0x27,0x27,0x27,0x28, +0x28,0x28,0x29,0x29,0x29,0x2a,0x2a,0x2a,0x2b,0x2b,0x2b,0x2c,0x2c,0x2c,0x2d,0x2d, +0x2e,0x2e,0x2e,0x2f,0x2f,0x2f,0x30,0x30,0x30,0x31,0x31,0x31,0x32,0x32,0x32,0x33, +0x33,0x33,0x34,0x34,0x35,0x35,0x35,0x36,0x36,0x36,0x37,0x37,0x37,0x38,0x38,0x38, +0x39,0x39,0x39,0x3a,0x3a,0x3a,0x3b,0x3b,0x3c,0x3c,0x3c,0x3d,0x3d,0x3d,0x3e,0x3e, 
+0x3e,0x3f,0x3f,0x3f,0x40,0x40,0x40,0x41,0x41,0x41,0x42,0x42,0x43,0x43,0x43,0x44, +0x44,0x44,0x45,0x45,0x45,0x46,0x46,0x46,0x47,0x47,0x47,0x48,0x48,0x48,0x49,0x49, +0x4a,0x4a,0x4a,0x4b,0x4b,0x4b,0x4c,0x4c,0x4c,0x4d,0x4d,0x4d,0x4e,0x4e,0x4e,0x4f, +0x4f,0x4f,0x50,0x50,0x51,0x51,0x51,0x52,0x52,0x52,0x53,0x53,0x53,0x54,0x54,0x54, +0x55,0x55,0x55,0x56,0x56,0x56,0x57,0x57,0x58,0x58,0x58,0x59,0x59,0x59,0x5a,0x5a, +0x5a,0x5b,0x5b,0x5b,0x5c,0x5c,0x5c,0x5d,0x5d,0x5d,0x5e,0x5e,0x5f,0x5f,0x5f,0x60, +0x60,0x60,0x61,0x61,0x61,0x62,0x62,0x62,0x63,0x63,0x63,0x64,0x64,0x64,0x65,0x65, +0x66,0x66,0x66,0x67,0x67,0x67,0x68,0x68,0x68,0x69,0x69,0x69,0x6a,0x6a,0x6a,0x6b, +0x6b,0x6b,0x6c,0x6c,0x6d,0x6d,0x6d,0x6e,0x6e,0x6e,0x6f,0x6f,0x6f,0x70,0x70,0x70, +0x71,0x71,0x71,0x72,0x72,0x72,0x73,0x73,0x74,0x74,0x74,0x75,0x75,0x75,0x76,0x76, +0x76,0x77,0x77,0x77,0x78,0x78,0x78,0x79,0x79,0x79,0x7a,0x7a,0x7b,0x7b,0x7b,0x7c, +0x7c,0x7c,0x7d,0x7d,0x7d,0x7e,0x7e,0x7e,0x7f,0x7f,0x7f,0x80,0x80,0x80,0x81,0x81, +0x82,0x82,0x82,0x83,0x83,0x83,0x84,0x84,0x85,0x85,0x85,0x86,0x86,0x86,0x87,0x87, }; - #ifndef _ASM_MISC_C PICO_INTERNAL_ASM void memcpy16bswap(unsigned short *dest, void *src, int count) { From 76403c0a201ad0aa2899c88b6ff8d9cb36dfa84c Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 7 Feb 2020 22:10:18 +0100 Subject: [PATCH 102/174] revised VDP fifo implementation --- pico/debug.c | 6 + pico/pico.c | 57 +----- pico/pico_cmn.c | 37 ++-- pico/pico_int.h | 12 +- pico/videoport.c | 456 +++++++++++++++++++++++++++++++++++++++++------ 5 files changed, 430 insertions(+), 138 deletions(-) diff --git a/pico/debug.c b/pico/debug.c index e617d9086..e4b5232ec 100644 --- a/pico/debug.c +++ b/pico/debug.c @@ -43,6 +43,12 @@ char *PDebugMain(void) !!(Pico.sv.flags & SRF_ENABLED), !!(Pico.sv.flags & SRF_EEPROM), Pico.sv.eeprom_type); MVP; sprintf(dstrp, "sram range: %06x-%06x, reg: %02x\n", Pico.sv.start, Pico.sv.end, Pico.m.sram_reg); MVP; sprintf(dstrp, "pend int: v:%i, h:%i, vdp status: %04x\n", 
bit(pv->pending_ints,5), bit(pv->pending_ints,4), pv->status); MVP; + sprintf(dstrp, "VDP regs 00-07: %02x %02x %02x %02x %02x %02x %02x %02x\n",reg[0],reg[1],reg[2],reg[3],reg[4],reg[5],reg[6],reg[7]); MVP; + sprintf(dstrp, "VDP regs 08-0f: %02x %02x %02x %02x %02x %02x %02x %02x\n",reg[8],reg[9],reg[10],reg[11],reg[12],reg[13],reg[14],reg[15]); MVP; + sprintf(dstrp, "VDP regs 10-17: %02x %02x %02x %02x %02x %02x %02x %02x\n",reg[16],reg[17],reg[18],reg[19],reg[20],reg[21],reg[22],reg[23]); MVP; + sprintf(dstrp, "VDP regs 18-1f: %02x %02x %02x %02x %02x %02x %02x %02x\n",reg[24],reg[25],reg[26],reg[27],reg[28],reg[29],reg[30],reg[31]); MVP; + r = (reg[5]<<9)+(reg[6]<<11); + sprintf(dstrp, "sprite #0: %04x %04x %04x %04x\n",PicoMem.vram[r/2],PicoMem.vram[r/2+1],PicoMem.vram[r/2+2],PicoMem.vram[r/2+3]); MVP; sprintf(dstrp, "pal: %i, hw: %02x, frame#: %i, cycles: %u\n", Pico.m.pal, Pico.m.hardware, Pico.m.frame_count, SekCyclesDone()); MVP; sprintf(dstrp, "M68k: PC: %06x, SR: %04x, irql: %i\n", SekPc, SekSr, SekIrqLevel); MVP; for (r = 0; r < 8; r++) { diff --git a/pico/pico.c b/pico/pico.c index b65b7de86..9db2fc641 100644 --- a/pico/pico.c +++ b/pico/pico.c @@ -67,6 +67,7 @@ void PicoPower(void) memset(&Pico.video,0,sizeof(Pico.video)); memset(&Pico.m,0,sizeof(Pico.m)); + memset(&Pico.t,0,sizeof(Pico.t)); Pico.video.pending_ints=0; z80_reset(); @@ -182,8 +183,7 @@ int PicoReset(void) PsndReset(); // pal must be known here // create an empty "dma" to cause 68k exec start at random frame location - if (Pico.m.dma_xfers == 0 && !(PicoIn.opt & POPT_DIS_VDP_FIFO)) - Pico.m.dma_xfers = rand() & 0x1fff; + PicoVideoFIFOWrite(rand() & 0x1fff, 0, 0, PVS_CPURD); SekFinishIdleDet(); @@ -222,57 +222,6 @@ void PicoLoopPrepare(void) rendstatus_old = -1; } -// this table is wrong and should be removed -// keeping it for now to compensate wrong timing elswhere, mainly for Outrunners -static const int dma_timings[] = { // Q16 - // dma2vram dma2[vs|c]ram vram_fill vram_copy - // VRAM 
has half the width of VSRAM/CRAM, thus half the performance - ( 83<<16)/488, (166<<16)/488, (165<<16)/488, ( 83<<16)/488, // vblank 32cell - (102<<16)/488, (204<<16)/488, (203<<16)/488, (102<<16)/488, // vblank 40cell - ( 8<<16)/488, ( 16<<16)/488, ( 15<<16)/488, ( 8<<16)/488, // active 32cell - ( 9<<16)/488, ( 18<<16)/488, ( 17<<16)/488, ( 9<<16)/488 // active 40cell -}; - -static const int dma_bsycles[] = { // Q16 - (488<<16)/83, (488<<16)/166, (488<<16)/165, (488<<16)/83, - (488<<16)/102, (488<<16)/204, (488<<16)/203, (488<<16)/102, - (488<<16)/8, (488<<16)/16, (488<<16)/15, (488<<16)/8, - (488<<16)/9, (488<<16)/18, (488<<16)/17, (488<<16)/9 -}; - -// grossly inaccurate.. FIXME FIXXXMEE -PICO_INTERNAL int CheckDMA(int cycles) -{ - int burn = 0, xfers_can, dma_op = Pico.video.reg[0x17]>>6; // see gens for 00 and 01 modes - int xfers = Pico.m.dma_xfers; - int dma_op1; - - // safety pin - if (cycles <= 0) return 0; - - if(!(dma_op&2)) dma_op = (Pico.video.type==1) ? 0 : 1; // setting dma_timings offset here according to Gens - dma_op1 = dma_op; - if(Pico.video.reg[12] & 1) dma_op |= 4; // 40 cell mode? - if(!(Pico.video.status&8)&&(Pico.video.reg[1]&0x40)) dma_op|=8; // active display? 
- xfers_can = (dma_timings[dma_op] * cycles + 0x8000) >> 16; - if(xfers <= xfers_can) - { - Pico.video.status &= ~SR_DMA; - if (!(dma_op & 2)) - burn = xfers * dma_bsycles[dma_op] >> 16; - Pico.m.dma_xfers = 0; - } else { - if(!(dma_op&2)) burn = cycles; - Pico.m.dma_xfers -= xfers_can; - } - Pico.t.dma_end = SekCyclesDone() + burn; - - elprintf(EL_VDPDMA, "~Dma %i op=%i can=%i burn=%i [%u]", - Pico.m.dma_xfers, dma_op1, xfers_can, burn, SekCyclesDone()); - //dprintf("~aim: %i, cnt: %i", Pico.t.m68c_aim, Pico.t.m68c_cnt); - return burn; -} - #include "pico_cmn.c" /* sync z80 to 68k */ @@ -319,7 +268,7 @@ void PicoFrame(void) goto end; } - //if(Pico.video.reg[12]&0x2) Pico.video.status ^= 0x10; // change odd bit in interlace mode + //if(Pico.video.reg[12]&0x2) Pico.video.status ^= SR_ODD; // change odd bit in interlace mode PicoFrameStart(); PicoFrameHints(); diff --git a/pico/pico_cmn.c b/pico/pico_cmn.c index b7e7d8358..753898401 100644 --- a/pico/pico_cmn.c +++ b/pico/pico_cmn.c @@ -72,27 +72,19 @@ static void do_hint(struct PicoVideo *pv) } } -static void do_timing_hacks_as(struct PicoVideo *pv, int vdp_slots, int cycles) +static void do_timing_hacks_end(struct PicoVideo *pv) { - pv->lwrite_cnt += vdp_slots - Pico.m.dma_xfers * 2; // wrong *2 - if (pv->lwrite_cnt > vdp_slots) - pv->lwrite_cnt = vdp_slots; - else if (pv->lwrite_cnt < 0) - pv->lwrite_cnt = 0; - if (Pico.m.dma_xfers) - SekCyclesBurn(CheckDMA(cycles)); + PicoVideoFIFOSync(488); } -static void do_timing_hacks_vb(int cycles) +static void do_timing_hacks_start(struct PicoVideo *pv) { - if (unlikely(Pico.m.dma_xfers)) - SekCyclesBurn(CheckDMA(cycles)); + SekCyclesBurn(PicoVideoFIFOHint()); // prolong cpu HOLD if necessary } static int PicoFrameHints(void) { struct PicoVideo *pv = &Pico.video; - int vdp_slots = (Pico.video.reg[12] & 1) ? 
18 : 16; int lines, y, lines_vis, skip; int vcnt_wrap, vcnt_adj; unsigned int cycles; @@ -155,8 +147,9 @@ static int PicoFrameHints(void) // Run scanline: Pico.t.m68c_line_start = Pico.t.m68c_aim; - do_timing_hacks_as(pv, vdp_slots, CYCLES_M68K_LINE); + do_timing_hacks_start(pv); CPUS_RUN(CYCLES_M68K_LINE); + do_timing_hacks_end(pv); if (PicoLineHook) PicoLineHook(); pevt_log_m68k_o(EVT_NEXT_LINE); @@ -175,10 +168,6 @@ static int PicoFrameHints(void) #endif } - // VDP FIFO - pv->lwrite_cnt = 0; - Pico.video.status |= SR_EMPT; - memcpy(PicoIn.padInt, PicoIn.pad, sizeof(PicoIn.padInt)); PAD_DELAY(); @@ -196,7 +185,7 @@ static int PicoFrameHints(void) // also delay between F bit (bit 7) is set in SR and IRQ happens (Ex-Mutants) // also delay between last H-int and V-int (Golden Axe 3) Pico.t.m68c_line_start = Pico.t.m68c_aim; - do_timing_hacks_vb(CYCLES_M68K_VINT_LAG); + do_timing_hacks_start(pv); CPUS_RUN(CYCLES_M68K_VINT_LAG); pv->status |= SR_F; @@ -224,8 +213,8 @@ static int PicoFrameHints(void) #endif // Run scanline: - do_timing_hacks_vb(CYCLES_M68K_LINE - CYCLES_M68K_VINT_LAG); CPUS_RUN(CYCLES_M68K_LINE - CYCLES_M68K_VINT_LAG); + do_timing_hacks_end(pv); if (PicoLineHook) PicoLineHook(); pevt_log_m68k_o(EVT_NEXT_LINE); @@ -260,8 +249,9 @@ static int PicoFrameHints(void) // Run scanline: Pico.t.m68c_line_start = Pico.t.m68c_aim; - do_timing_hacks_vb(CYCLES_M68K_LINE); + do_timing_hacks_start(pv); CPUS_RUN(CYCLES_M68K_LINE); + do_timing_hacks_end(pv); if (PicoLineHook) PicoLineHook(); pevt_log_m68k_o(EVT_NEXT_LINE); @@ -271,8 +261,9 @@ static int PicoFrameHints(void) unsigned int l = PicoIn.overclockM68k * lines / 100; while (l-- > 0) { Pico.t.m68c_cnt -= CYCLES_M68K_LINE; - do_timing_hacks_vb(CYCLES_M68K_LINE); + do_timing_hacks_start(pv); SekSyncM68k(); + do_timing_hacks_end(pv); } } @@ -282,7 +273,6 @@ static int PicoFrameHints(void) // last scanline Pico.m.scanline = y++; pv->v_counter = 0xff; - pv->lwrite_cnt = 0; PAD_DELAY(); @@ -297,8 +287,9 @@ static int 
PicoFrameHints(void) // Run scanline: Pico.t.m68c_line_start = Pico.t.m68c_aim; - do_timing_hacks_as(pv, vdp_slots, CYCLES_M68K_LINE); + do_timing_hacks_start(pv); CPUS_RUN(CYCLES_M68K_LINE); + do_timing_hacks_end(pv); if (PicoLineHook) PicoLineHook(); pevt_log_m68k_o(EVT_NEXT_LINE); diff --git a/pico/pico_int.h b/pico/pico_int.h index 357de4a9f..b3ce8a722 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -296,6 +296,10 @@ extern SH2 sh2s[2]; // not part of real SR #define PVS_ACTIVE (1 << 16) #define PVS_VB2 (1 << 17) // ignores forced blanking +#define PVS_CPUWR (1 << 18) // CPU hold by FIFO full +#define PVS_CPURD (1 << 19) // CPU hold by FIFO not empty +#define PVS_DMAPEND (1 << 20) // DMA operation waiting for start +#define PVS_DMAFILL (1 << 21) // DMA fill is in progress struct PicoVideo { @@ -306,7 +310,7 @@ struct PicoVideo unsigned short addr; // Read/Write address unsigned int status; // Status bits (SR) and extra flags unsigned char pending_ints; // pending interrupts: ??VH???? - signed char lwrite_cnt; // VDP write count during active display line + signed char pad1; // was VDP write count unsigned short v_counter; // V-counter unsigned short debug; // raw debug register unsigned char debug_p; // ... 
parsed: PVD_* @@ -335,7 +339,7 @@ struct PicoMisc unsigned char eeprom_slave; // EEPROM slave word for X24C02 and better SRAMs unsigned char eeprom_status; unsigned char pad1; // was ym2612 status - unsigned short dma_xfers; // 18 + unsigned short pad2; // 18 was dma_xfers unsigned char eeprom_wb[2]; // EEPROM latch/write buffer unsigned int frame_count; // 1c for movies and idle det }; @@ -419,7 +423,6 @@ struct PicoTiming unsigned int z80c_aim; int z80_scanline; - unsigned int dma_end; // end of current DMA op (m68k cycles) int timer_a_next_oflow, timer_a_step; // in z80 cycles int timer_b_next_oflow, timer_b_step; }; @@ -850,6 +853,9 @@ unsigned char PicoVideoRead8CtlL(void); unsigned char PicoVideoRead8HV_H(void); unsigned char PicoVideoRead8HV_L(void); extern int (*PicoDmaHook)(unsigned int source, int len, unsigned short **base, unsigned int *mask); +void PicoVideoFIFOSync(int cycles); +int PicoVideoFIFOHint(void); +int PicoVideoFIFOWrite(int count, int byte_p, unsigned sr_mask, unsigned sr_flags); // misc.c PICO_INTERNAL_ASM void memcpy16bswap(unsigned short *dest, void *src, int count); diff --git a/pico/videoport.c b/pico/videoport.c index cdc5796ce..881a74a31 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -14,9 +14,296 @@ extern const unsigned char hcounts_32[]; extern const unsigned char hcounts_40[]; +static unsigned hvlatch; // latched hvcounter value +static int blankline; // display disabled for this line int (*PicoDmaHook)(unsigned int source, int len, unsigned short **base, unsigned int *mask) = NULL; + +/* VDP FIFO implementation + * + * fifo_slot: last slot executed in this scanline + * fifo_cnt: #slots remaining for active FIFO write (#writes<<#bytep) + * fifo_total: #total FIFO entries pending + * fifo_data: last values transferred through fifo + * fifo_queue: fifo transfer queue (#writes, VRAM_byte_p) + * + * FIFO states: empty total=0 + * inuse total>0 && total<4 + * full total==4 + * wait total>4 + * Conditions: + * fifo_slot is 
always behind slot2cyc[cycles]. Advancing it beyond cycles + * implies blocking the 68k up to that slot. + * + * A FIFO write goes to the end of the fifo queue. There can be more pending + * writes than FIFO slots, but the 68k will be blocked in most of those cases. + * This is only about correct timing, data xfer must be handled by the caller. + * Blocking the CPU means burning cycles via SekCyclesBurn*(), which is to be + * executed by the caller. + * + * FIFOSync "executes" FIFO write slots up to the given cycle in the current + * scanline. A queue entry completely executed is removed from the queue. + * FIFOWrite pushes writes to the transfer queue. If it's a blocking write, 68k + * is blocked if more than 4 FIFO writes are pending. + * FIFORead executes a 68k read. 68k is blocked until the next transfer slot. + */ + +// FIFO transfer slots per line: H32 blank, H40 blank, H32 active, H40 active +static const short vdpslots[] = { 166, 204, 16, 18 }; +// mapping between slot# and 68k cycles in a blanked scanline +static const int vdpcyc2sl_bl[] = { (166<<16)/488, (204<<16)/488, (16<<16)/488, (18<<16)/488 }; +static const int vdpsl2cyc_bl[] = { (488<<16)/166, (488<<16)/204, (488<<16)/16, (488<<16)/18 }; + +// VDP transfer slots in active display 32col mode. 1 slot is 488/171 = 2.8538 +// 68k cycles. 
Only 16 of the 171 slots in a scanline can be used by CPU/DMA: +// (HINT=slot 0): 13,27,42,50,58,74,82,90,106,114,122,138,146,154,169,170 +const unsigned char vdpcyc2sl_32[] = { // 68k cycles/4 since HINT to slot # +// 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, + 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, + 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11, +11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14, +14,14,14,14,14,14,14,14,15,16,16,16,16,16,16,16, +}; +const unsigned char vdpsl2cyc_32[] = { // slot # to 68k cycles/4 since HINT + 0, 9, 19, 30, 35, 41, 52, 58, 64, 75, 81, 87, 98,104,110,120,121,123,123 +}; + +// VDP transfer slots in active display 40col mode. 1 slot is 488/210 = 2.3238 +// 68k cycles. Only 18 of the 210 slots in a scanline can be used by CPU/DMA: +// (HINT=0): 23,49,57,65,81,89,97,113,121,129,145,153,161,177,185,193,208,209 +const unsigned char vdpcyc2sl_40[] = { // 68k cycles/4 since HINT to slot # +// 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, + 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, + 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 8, 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10, +10,10,10,10,11,11,11,11,12,12,12,12,12,13,13,13, +13,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15, +16,16,16,16,16,16,16,16,17,18,18,18,18,18,18,18, +}; +const unsigned char vdpsl2cyc_40[] = { // slot # to 68k cycles/4 since HINT + 0, 13, 28, 33, 37, 47, 51, 56, 65, 70, 74, 84, 88, 93,102,107,112,120,121,123,123 +}; + +// NB code assumes fifo_* arrays have size 2^n +// last transferred FIFO data, ...x = index XXX currently only CPU +static short fifo_data[4], fifo_dx; +// queued FIFO transfers, ...x = index, ...l = queue length +// each entry has 2 values: 
[n]>>1=#writes, [n]&1=is VRAM byte access +static int fifo_queue[8], fifo_qx, fifo_ql; + +signed int fifo_cnt; // pending slots for current queue entry +unsigned short fifo_slot; // last executed slot in current scanline +unsigned int fifo_total; // total# of pending FIFO entries + +// sync FIFO to cycles +void PicoVideoFIFOSync(int cycles) +{ + struct PicoVideo *pv = &Pico.video; + int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); + int h40 = pv->reg[12] & 1; + const unsigned char *cs = h40 ? vdpcyc2sl_40 : vdpcyc2sl_32; + int slots, done; + + // calculate #slots since last executed slot + if (active) slots = cs[cycles/4]; + else slots = (cycles * vdpcyc2sl_bl[h40] + cycles) >> 16; + slots -= fifo_slot; + + // advance FIFO queue by #done slots + done = slots; + while (done > 0 && fifo_ql) { + int l = done, b = fifo_queue[fifo_qx&7] & 1; + if (l > fifo_cnt) + l = fifo_cnt; + fifo_total -= ((fifo_cnt & b) + l) >> b; + fifo_slot += l; + fifo_cnt -= l; + done -= l; + + if (fifo_cnt == 0) { + fifo_qx ++, fifo_ql --; + fifo_cnt= (fifo_queue[fifo_qx&7] >> 1) << (fifo_queue[fifo_qx&7] & 1); + } + } + + // release CPU and terminate DMA if FIFO isn't blocking the 68k anymore + if (fifo_total <= 4) { + pv->status &= ~PVS_CPUWR; + pv->command &= ~0x80; + if (!(pv->status & PVS_DMAPEND)) + pv->status &= ~(SR_DMA|PVS_DMAFILL); + } + if (fifo_total == 0) + pv->status &= ~PVS_CPURD; +} + +// drain FIFO, blocking 68k on the way. FIFO must be synced prior to drain. +int PicoVideoFIFODrain(int level, int cycles) +{ + struct PicoVideo *pv = &Pico.video; + int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); + int h40 = pv->reg[12] & 1; + const unsigned char *sc = h40 ? 
vdpsl2cyc_40 : vdpsl2cyc_32; + int maxsl = vdpslots[h40 + 2*active]; // max xfer slots in this scanline + int burn = 0; + + while (fifo_total > level && fifo_slot < maxsl) { + int b = fifo_queue[fifo_qx&7] & 1; + int cnt = (fifo_total-level) << b; + int last = fifo_slot; + int slot = (fifo_cnt<cnt?fifo_cnt:cnt) + last; // target slot + unsigned ocyc = cycles; + + if (slot > maxsl) { + // target in later scanline, advance to eol + slot = maxsl; + fifo_slot = maxsl; + cycles = 488; + } else { + // advance FIFO to target slot and CPU to cycles at that slot + fifo_slot = slot; + if (active) cycles = sc[slot]*4; + else cycles = ((slot * vdpsl2cyc_bl[h40] + slot) >> 16); + } + burn += cycles - ocyc; + + slot -= last; + fifo_total -= ((fifo_cnt & b) + slot) >> b; + fifo_cnt -= slot; + + if (fifo_cnt == 0) { + fifo_qx ++, fifo_ql --; + fifo_cnt= (fifo_queue[fifo_qx&7] >> 1) << (fifo_queue[fifo_qx&7] & 1); + } + } + + // release CPU and terminate DMA if FIFO isn't blocking the bus anymore + if (fifo_total <= 4) { + pv->status &= ~PVS_CPUWR; + pv->command &= ~0x80; + if (!(pv->status & PVS_DMAPEND)) + pv->status &= ~(SR_DMA|PVS_DMAFILL); + } + if (fifo_total == 0) + pv->status &= ~PVS_CPURD; + + return burn; +} + +// read VDP data port +int PicoVideoFIFORead(void) +{ + struct PicoVideo *pv = &Pico.video; + int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); + int h40 = pv->reg[12] & 1; + const unsigned char *cs = h40 ? vdpcyc2sl_40 : vdpcyc2sl_32; + const unsigned char *sc = h40 ? 
vdpsl2cyc_40 : vdpsl2cyc_32; + int lc = SekCyclesDone()-Pico.t.m68c_line_start+4; + int burn = 0; + + PicoVideoFIFOSync(lc); + + // advance FIFO and CPU until FIFO is empty + burn = PicoVideoFIFODrain(0, lc); + lc += burn; + if (fifo_total > 0) + pv->status |= PVS_CPURD; // target slot is in later scanline + else { + // use next VDP access slot for reading, block 68k until then + if (active) { + fifo_slot = cs[lc/4] + 1; + burn += sc[fifo_slot]*4; + } else { + fifo_slot = ((lc * vdpcyc2sl_bl[h40] + lc) >> 16) + 1; + burn += ((fifo_slot * vdpsl2cyc_bl[h40] + fifo_slot) >> 16); + } + burn -= lc; + } + + return burn; +} + +// write VDP data port +int PicoVideoFIFOWrite(int count, int byte_p, unsigned sr_mask,unsigned sr_flags) +{ + struct PicoVideo *pv = &Pico.video; + int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); + int h40 = pv->reg[12] & 1; + const unsigned char *cs = h40 ? vdpcyc2sl_40 : vdpcyc2sl_32; + int lc = SekCyclesDone()-Pico.t.m68c_line_start+4; + int burn = 0; + + PicoVideoFIFOSync(lc); + pv->status = (pv->status & ~sr_mask) | sr_flags; + + if (count) { + // update FIFO state if it was empty + if (fifo_total == 0 && count) { + if (active) fifo_slot = cs[lc/4]; + else fifo_slot = (lc * vdpcyc2sl_bl[h40] + lc) >> 16; + fifo_cnt = count << byte_p; + } + + // create xfer queue entry + int x = (fifo_qx + fifo_ql) & 7; + fifo_queue[x] = (count << 1) | byte_p; + fifo_ql ++; + fifo_total += count; + } + + // if CPU is waiting for the bus, advance CPU and FIFO until bus is free + if ((pv->status & (PVS_CPUWR|PVS_DMAFILL)) == PVS_CPUWR) + burn = PicoVideoFIFODrain(4, lc); + + return burn; +} + +// at HINT, advance FIFO to new scanline +int PicoVideoFIFOHint(void) +{ + struct PicoVideo *pv = &Pico.video; + int burn = 0; + + // reset slot to start of scanline + fifo_slot = 0; + + // if CPU is waiting for the bus, advance CPU and FIFO until bus is free + if (pv->status & PVS_CPURD) + burn = PicoVideoFIFORead(); + if (pv->status & PVS_CPUWR) + burn = 
PicoVideoFIFOWrite(0, 0, 0, 0); + + return burn; +} + +// switch FIFO mode between active/inactive display +void PicoVideoFIFOMode(int active) +{ + struct PicoVideo *pv = &Pico.video; + const unsigned char *cs = pv->reg[12]&1 ? vdpcyc2sl_40 : vdpcyc2sl_32; + int h40 = pv->reg[12] & 1; + int lc = SekCyclesDone() - Pico.t.m68c_line_start; + + PicoVideoFIFOSync(lc); + + if (fifo_total) { + // recalculate FIFO slot for new mode + if (!(pv->status & SR_VB) && active) + fifo_slot = cs[lc/4]; + else fifo_slot = ((lc * vdpcyc2sl_bl[h40] + lc) >> 16); + } +} + + +// VDP memory rd/wr + static __inline void AutoIncrement(void) { Pico.video.addr=(unsigned short)(Pico.video.addr+Pico.video.reg[0xf]); @@ -60,15 +347,19 @@ static void VideoWrite(u16 d) static unsigned int VideoRead(void) { - unsigned int a=0,d=0; + unsigned int a, d = fifo_data[(fifo_dx+1)&3]; a=Pico.video.addr; a>>=1; + SekCyclesBurnRun(PicoVideoFIFORead()); switch (Pico.video.type) { case 0: d=PicoMem.vram [a & 0x7fff]; break; - case 8: d=PicoMem.cram [a & 0x003f]; break; - case 4: d=PicoMem.vsram[a & 0x003f]; break; + case 8: d=(PicoMem.cram [a & 0x003f] & 0x0eee) | (d & ~0x0eee); break; + case 4: if ((a & 0x3f) >= 0x28) a = 0; + d=(PicoMem.vsram [a & 0x003f] & 0x07ff) | (d & ~0x07ff); break; + case 12:a=PicoMem.vram [a & 0x7fff]; if (Pico.video.addr&1) a >>= 8; + d=(a & 0x00ff) | (d & ~0x00ff); break; default:elprintf(EL_ANOMALY, "VDP read with bad type %i", Pico.video.type); break; } @@ -76,6 +367,8 @@ static unsigned int VideoRead(void) return d; } +// VDP DMA + static int GetDmaLength(void) { struct PicoVideo *pvid=&Pico.video; @@ -95,13 +388,11 @@ static void DmaSlow(int len, unsigned int source) u32 mask = 0x1ffff; elprintf(EL_VDPDMA, "DmaSlow[%i] %06x->%04x len %i inc=%i blank %i [%u] @ %06x", - Pico.video.type, source, a, len, inc, (Pico.video.status&8)||!(Pico.video.reg[1]&0x40), + Pico.video.type, source, a, len, inc, (Pico.video.status&SR_VB)||!(Pico.video.reg[1]&0x40), SekCyclesDone(), SekPc); - 
Pico.m.dma_xfers = len; - if (Pico.m.dma_xfers < len) // lame 16bit var - Pico.m.dma_xfers = ~0; - SekCyclesBurnRun(CheckDMA(488 - (SekCyclesDone()-Pico.t.m68c_line_start))); + SekCyclesBurnRun(PicoVideoFIFOWrite(len, Pico.video.type == 1, PVS_DMAPEND, + SR_DMA | PVS_CPUWR) + 8); if ((source & 0xe00000) == 0xe00000) { // Ram base = (u16 *)PicoMem.ram; @@ -224,14 +515,12 @@ static void DmaCopy(int len) int source; elprintf(EL_VDPDMA, "DmaCopy len %i [%u]", len, SekCyclesDone()); - Pico.m.dma_xfers = len; - if (Pico.m.dma_xfers < len) - Pico.m.dma_xfers = ~0; - Pico.video.status |= SR_DMA; + SekCyclesBurnRun(PicoVideoFIFOWrite(len, 1, PVS_CPUWR|PVS_DMAPEND, SR_DMA)); source =Pico.video.reg[0x15]; source|=Pico.video.reg[0x16]<<8; + // XXX implement VRAM 128k? Is this even working? for (; len; len--) { vr[a] = vr[source++ & 0xffff]; @@ -255,10 +544,7 @@ static NOINLINE void DmaFill(int data) len = GetDmaLength(); elprintf(EL_VDPDMA, "DmaFill len %i inc %i [%u]", len, inc, SekCyclesDone()); - Pico.m.dma_xfers = len; - if (Pico.m.dma_xfers < len) // lame 16bit var - Pico.m.dma_xfers = ~0; - Pico.video.status |= SR_DMA; + SekCyclesBurnRun(PicoVideoFIFOWrite(len, Pico.video.type == 1, PVS_CPUWR|PVS_DMAPEND, SR_DMA)); switch (Pico.video.type) { @@ -274,13 +560,24 @@ static NOINLINE void DmaFill(int data) Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; break; case 3: // cram + Pico.m.dirtyPal = 1; + for (l = len; l; l--) { + PicoMem.cram[(a/2) & 0x3f] = data; + + // Increment address register + a += inc; + } + break; case 5: { // vsram - // TODO: needs fifo; anyone using these? 
- static int once; - if (!once++) - elprintf(EL_STATUS|EL_ANOMALY|EL_VDPDMA, "TODO: cram/vsram fill"); + for (l = len; l; l--) { + PicoMem.vsram[(a/2) & 0x3f] = data; + + // Increment address register + a += inc; + } + break; } - case 0x81: + case 0x81: // vram 128k for (l = len; l; l--) { VideoWrite128(a, data); @@ -307,17 +604,22 @@ static NOINLINE void DmaFill(int data) } +// VDP command handling + static NOINLINE void CommandDma(void) { struct PicoVideo *pvid=&Pico.video; u32 len, method; u32 source; - if ((pvid->reg[1]&0x10)==0) return; // DMA not enabled - - if (Pico.m.dma_xfers) + pvid->status |= PVS_DMAPEND; + PicoVideoFIFOSync(SekCyclesDone()-Pico.t.m68c_line_start); + if (pvid->status & SR_DMA) { elprintf(EL_VDPDMA, "Dma overlap, left=%d @ %06x", - Pico.m.dma_xfers, SekPc); + fifo_total, SekPc); + fifo_total = fifo_ql = 0; + } + pvid->status |= SR_DMA; len = GetDmaLength(); source =Pico.video.reg[0x15]; @@ -329,9 +631,10 @@ static NOINLINE void CommandDma(void) DmaSlow(len, source << 1); // 68000 to VDP else if (method == 3) DmaCopy(len); // VRAM Copy - else + else { + pvid->status |= PVS_DMAFILL; return; - + } source += len; Pico.video.reg[0x13] = Pico.video.reg[0x14] = 0; Pico.video.reg[0x15] = source; @@ -357,13 +660,21 @@ static NOINLINE void CommandChange(void) pvid->addr_u = (u8)((cmd >> 2) & 1); } -static void DrawSync(int blank_on) +// VDP interface + +static void DrawSync(int skip) { int lines = Pico.video.reg[1]&0x08 ? 
240 : 224; - if (Pico.m.scanline < lines && !(PicoIn.opt & POPT_ALT_RENDERER) && - !PicoIn.skipFrame && Pico.est.DrawScanline <= Pico.m.scanline) { + int last = Pico.m.scanline - (skip || blankline == Pico.m.scanline); + + if (last < lines && !(PicoIn.opt & POPT_ALT_RENDERER) && + !PicoIn.skipFrame && Pico.est.DrawScanline <= last) { //elprintf(EL_ANOMALY, "sync"); - PicoDrawSync(Pico.m.scanline, blank_on); + if (blankline >= 0 && blankline < last) { + PicoDrawSync(blankline, 1); + blankline = -1; + } + PicoDrawSync(last, 0); } } @@ -390,19 +701,19 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) pvid->pending=0; } - if (!(pvid->status & SR_VB) && (pvid->reg[1]&0x40) && !(PicoIn.opt&POPT_DIS_VDP_FIFO)) + if (!(PicoIn.opt&POPT_DIS_VDP_FIFO)) { - int use = pvid->type == 1 ? 2 : 1; - pvid->lwrite_cnt -= use; - if (pvid->lwrite_cnt < 0) - SekCyclesBurnRun(488 - (SekCyclesDone()-Pico.t.m68c_line_start)); - elprintf(EL_ASVDP, "VDP data write: [%04x] %04x [%u] {%i} #%i @ %06x", - Pico.video.addr, d, SekCyclesDone(), Pico.video.type, pvid->lwrite_cnt, SekPc); + fifo_data[++fifo_dx&3] = d; + SekCyclesBurnRun(PicoVideoFIFOWrite(1, pvid->type == 1, 0, PVS_CPUWR)); + + elprintf(EL_ASVDP, "VDP data write: [%04x] %04x [%u] {%i} @ %06x", + Pico.video.addr, d, SekCyclesDone(), Pico.video.type, SekPc); } VideoWrite(d); - if ((pvid->command&0x80) && (pvid->reg[1]&0x10) && (pvid->reg[0x17]>>6)==2) - DmaFill(d); + // start DMA fill on write. NB VSRAM and CRAM fills use wrong FIFO data. 
+ if ((pvid->status & (PVS_DMAPEND|PVS_DMAFILL)) == (PVS_DMAPEND|PVS_DMAFILL)) + DmaFill(fifo_data[(fifo_dx + !!(pvid->type&~0x81))&3]); break; @@ -410,6 +721,8 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) if (pvid->pending) { // Low word of command: + if (!(pvid->reg[1]&0x10)) + d = (d&~0x80)|(pvid->command&0x80); pvid->command &= 0xffff0000; pvid->command |= d; pvid->pending = 0; @@ -427,16 +740,24 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) // Register write: int num=(d>>8)&0x1f; int dold=pvid->reg[num]; - int blank_on = 0; + int skip=0; pvid->type=0; // register writes clear command (else no Sega logo in Golden Axe II) if (num > 0x0a && !(pvid->reg[1]&4)) { elprintf(EL_ANOMALY, "%02x written to reg %02x in SMS mode @ %06x", d, num, SekPc); return; } - if (num == 1 && !(d&0x40) && SekCyclesDone() - Pico.t.m68c_line_start <= 488-390) - blank_on = 1; - DrawSync(blank_on); + if (num == 0 && !(pvid->reg[0]&2) && (d&2)) + hvlatch = PicoVideoRead(0x08); + if (num == 1 && ((pvid->reg[1]^d)&0x40)) { + PicoVideoFIFOMode(d & 0x40); + // handle line blanking before line rendering + if (SekCyclesDone() - Pico.t.m68c_line_start <= 488-390) { + skip = 1; + blankline = d&0x40 ? -1 : Pico.m.scanline; + } + } + DrawSync(skip); pvid->reg[num]=(unsigned char)d; switch (num) { @@ -519,15 +840,23 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) } } -static u32 SrLow(const struct PicoVideo *pv) +static u32 VideoSr(const struct PicoVideo *pv) { unsigned int c, d = pv->status; + unsigned int hp = pv->reg[12]&1 ? 32:40; // HBLANK start + unsigned int hl = pv->reg[12]&1 ? 94:84; // HBLANK length c = SekCyclesDone(); - if (c - Pico.t.m68c_line_start - 39 < 92) + if (c - Pico.t.m68c_line_start - hp < hl) d |= SR_HB; - if (CYCLES_GT(c, Pico.t.dma_end)) - d &= ~SR_DMA; + + PicoVideoFIFOSync(c-Pico.t.m68c_line_start); + if (pv->status & SR_DMA) + d |= SR_EMPT; // unused by DMA, or rather flags not updated? 
+ else if (fifo_total >= 4) + d |= SR_FULL; + else if (!fifo_total) + d |= SR_EMPT; return d; } @@ -538,8 +867,11 @@ PICO_INTERNAL_ASM unsigned int PicoVideoRead(unsigned int a) if (a == 0x04) // control port { struct PicoVideo *pv = &Pico.video; - unsigned int d = SrLow(pv); - pv->pending = 0; + unsigned int d = VideoSr(pv); + if (pv->pending) { + CommandChange(); + pv->pending = 0; + } elprintf(EL_SR, "SR read: %04x [%u] @ %06x", d, SekCyclesDone(), SekPc); return d; } @@ -564,12 +896,14 @@ PICO_INTERNAL_ASM unsigned int PicoVideoRead(unsigned int a) unsigned int d; d = (SekCyclesDone() - Pico.t.m68c_line_start) & 0x1ff; // FIXME - if (Pico.video.reg[12]&1) - d = hcounts_40[d]; - else d = hcounts_32[d]; + if (Pico.video.reg[0]&2) + d = hvlatch; + else if (Pico.video.reg[12]&1) + d = hcounts_40[d] | (Pico.video.v_counter << 8); + else d = hcounts_32[d] | (Pico.video.v_counter << 8); elprintf(EL_HVCNT, "hv: %02x %02x [%u] @ %06x", d, Pico.video.v_counter, SekCyclesDone(), SekPc); - return d | (Pico.video.v_counter << 8); + return d; } if (a==0x00) // data port @@ -592,16 +926,22 @@ unsigned char PicoVideoRead8DataL(void) unsigned char PicoVideoRead8CtlH(void) { - u8 d = (u8)(Pico.video.status >> 8); - Pico.video.pending = 0; + u8 d = VideoSr(&Pico.video) >> 8; + if (Pico.video.pending) { + CommandChange(); + Pico.video.pending = 0; + } elprintf(EL_SR, "SR read (h): %02x @ %06x", d, SekPc); return d; } unsigned char PicoVideoRead8CtlL(void) { - u8 d = SrLow(&Pico.video); - Pico.video.pending = 0; + u8 d = VideoSr(&Pico.video); + if (Pico.video.pending) { + CommandChange(); + Pico.video.pending = 0; + } elprintf(EL_SR, "SR read (l): %02x @ %06x", d, SekPc); return d; } From 3adc47cb46d1eb8522b650a0dfbd73249c92ddf5 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 8 Feb 2020 13:29:32 +0100 Subject: [PATCH 103/174] sh2 drc: fix for crash in generated code on x86_64 --- cpu/drc/emit_x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpu/drc/emit_x86.c 
b/cpu/drc/emit_x86.c index ec13551e3..80ec04445 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -1007,7 +1007,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define host_instructions_updated(base, end) (void)(base),(void)(end) #define emith_update_cache() /**/ -#define emith_rw_offs_max() 0xffffffff +// NB this MUST be <0x40000000 to avoid overflow in address calculations +#define emith_rw_offs_max() 0xfffffff // for better perfomance: <0x10000000 #ifdef __x86_64__ From af8222b7565c25e72a9bdd434e745d26745aaeb9 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 8 Feb 2020 15:20:05 +0100 Subject: [PATCH 104/174] vdp fifo: kludge for DMA fill interrupted by CPU --- pico/videoport.c | 109 +++++++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 51 deletions(-) diff --git a/pico/videoport.c b/pico/videoport.c index 881a74a31..0a6a103fb 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -26,7 +26,7 @@ int (*PicoDmaHook)(unsigned int source, int len, unsigned short **base, unsigned * fifo_cnt: #slots remaining for active FIFO write (#writes<<#bytep) * fifo_total: #total FIFO entries pending * fifo_data: last values transferred through fifo - * fifo_queue: fifo transfer queue (#writes, VRAM_byte_p) + * fifo_queue: fifo transfer queue (#writes, flags) * * FIFO states: empty total=0 * inuse total>0 && total<4 @@ -95,42 +95,66 @@ const unsigned char vdpsl2cyc_40[] = { // slot # to 68k cycles/4 since HINT // last transferred FIFO data, ...x = index XXX currently only CPU static short fifo_data[4], fifo_dx; // queued FIFO transfers, ...x = index, ...l = queue length -// each entry has 2 values: [n]>>1=#writes, [n]&1=is VRAM byte access +// each entry has 2 values: [n]>>2=#writes, [n]&3=flags:2=DMA fill 1=byte access static int fifo_queue[8], fifo_qx, fifo_ql; signed int fifo_cnt; // pending slots for current queue entry unsigned short fifo_slot; // last executed slot in current scanline unsigned int fifo_total; 
// total# of pending FIFO entries -// sync FIFO to cycles -void PicoVideoFIFOSync(int cycles) +// do the FIFO math +static __inline int AdvanceFIFOEntry(int slots) +{ + int l = slots, b = fifo_queue[fifo_qx&7] & 1; + + if (l > fifo_cnt) + l = fifo_cnt; + fifo_total -= ((fifo_cnt & b) + l) >> b; + fifo_cnt -= l; + + if (fifo_cnt == 0) { + fifo_qx ++, fifo_ql --; + fifo_cnt= (fifo_queue[fifo_qx&7] >> 2) << (fifo_queue[fifo_qx&7] & 1); + } + return l; +} + +static __inline int GetFIFOSlot(struct PicoVideo *pv, int cycles) { - struct PicoVideo *pv = &Pico.video; int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); int h40 = pv->reg[12] & 1; const unsigned char *cs = h40 ? vdpcyc2sl_40 : vdpcyc2sl_32; + + if (active) return cs[cycles/4]; + else return (cycles * vdpcyc2sl_bl[h40] + cycles) >> 16; +} + +static inline int GetFIFOCycles(struct PicoVideo *pv, int slot) +{ + int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); + int h40 = pv->reg[12] & 1; + const unsigned char *sc = h40 ? 
vdpsl2cyc_40 : vdpsl2cyc_32; + + if (active) return sc[slot]*4; + else return ((slot * vdpsl2cyc_bl[h40] + slot) >> 16); +} + +// sync FIFO to cycles +void PicoVideoFIFOSync(int cycles) +{ + struct PicoVideo *pv = &Pico.video; int slots, done; // calculate #slots since last executed slot - if (active) slots = cs[cycles/4]; - else slots = (cycles * vdpcyc2sl_bl[h40] + cycles) >> 16; + slots = GetFIFOSlot(pv, cycles); slots -= fifo_slot; // advance FIFO queue by #done slots done = slots; while (done > 0 && fifo_ql) { - int l = done, b = fifo_queue[fifo_qx&7] & 1; - if (l > fifo_cnt) - l = fifo_cnt; - fifo_total -= ((fifo_cnt & b) + l) >> b; + int l = AdvanceFIFOEntry(done); fifo_slot += l; - fifo_cnt -= l; done -= l; - - if (fifo_cnt == 0) { - fifo_qx ++, fifo_ql --; - fifo_cnt= (fifo_queue[fifo_qx&7] >> 1) << (fifo_queue[fifo_qx&7] & 1); - } } // release CPU and terminate DMA if FIFO isn't blocking the 68k anymore @@ -150,7 +174,6 @@ int PicoVideoFIFODrain(int level, int cycles) struct PicoVideo *pv = &Pico.video; int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); int h40 = pv->reg[12] & 1; - const unsigned char *sc = h40 ? 
vdpsl2cyc_40 : vdpsl2cyc_32; int maxsl = vdpslots[h40 + 2*active]; // max xfer slots in this scanline int burn = 0; @@ -169,19 +192,11 @@ int PicoVideoFIFODrain(int level, int cycles) } else { // advance FIFO to target slot and CPU to cycles at that slot fifo_slot = slot; - if (active) cycles = sc[slot]*4; - else cycles = ((slot * vdpsl2cyc_bl[h40] + slot) >> 16); + cycles = GetFIFOCycles(pv, slot); } burn += cycles - ocyc; - slot -= last; - fifo_total -= ((fifo_cnt & b) + slot) >> b; - fifo_cnt -= slot; - - if (fifo_cnt == 0) { - fifo_qx ++, fifo_ql --; - fifo_cnt= (fifo_queue[fifo_qx&7] >> 1) << (fifo_queue[fifo_qx&7] & 1); - } + AdvanceFIFOEntry(slot - last); } // release CPU and terminate DMA if FIFO isn't blocking the bus anymore @@ -201,10 +216,6 @@ int PicoVideoFIFODrain(int level, int cycles) int PicoVideoFIFORead(void) { struct PicoVideo *pv = &Pico.video; - int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); - int h40 = pv->reg[12] & 1; - const unsigned char *cs = h40 ? vdpcyc2sl_40 : vdpcyc2sl_32; - const unsigned char *sc = h40 ? 
vdpsl2cyc_40 : vdpsl2cyc_32; int lc = SekCyclesDone()-Pico.t.m68c_line_start+4; int burn = 0; @@ -217,43 +228,33 @@ int PicoVideoFIFORead(void) pv->status |= PVS_CPURD; // target slot is in later scanline else { // use next VDP access slot for reading, block 68k until then - if (active) { - fifo_slot = cs[lc/4] + 1; - burn += sc[fifo_slot]*4; - } else { - fifo_slot = ((lc * vdpcyc2sl_bl[h40] + lc) >> 16) + 1; - burn += ((fifo_slot * vdpsl2cyc_bl[h40] + fifo_slot) >> 16); - } - burn -= lc; + fifo_slot = GetFIFOSlot(pv, lc) + 1; + burn += GetFIFOCycles(pv, fifo_slot) - lc; } return burn; } // write VDP data port -int PicoVideoFIFOWrite(int count, int byte_p, unsigned sr_mask,unsigned sr_flags) +int PicoVideoFIFOWrite(int count, int flags, unsigned sr_mask,unsigned sr_flags) { struct PicoVideo *pv = &Pico.video; - int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); - int h40 = pv->reg[12] & 1; - const unsigned char *cs = h40 ? vdpcyc2sl_40 : vdpcyc2sl_32; int lc = SekCyclesDone()-Pico.t.m68c_line_start+4; int burn = 0; PicoVideoFIFOSync(lc); pv->status = (pv->status & ~sr_mask) | sr_flags; - if (count) { + if (count && fifo_ql < 8) { // update FIFO state if it was empty if (fifo_total == 0 && count) { - if (active) fifo_slot = cs[lc/4]; - else fifo_slot = (lc * vdpcyc2sl_bl[h40] + lc) >> 16; - fifo_cnt = count << byte_p; + fifo_slot = GetFIFOSlot(pv, lc); + fifo_cnt = count << (flags&1); } // create xfer queue entry int x = (fifo_qx + fifo_ql) & 7; - fifo_queue[x] = (count << 1) | byte_p; + fifo_queue[x] = (count << 2) | flags; fifo_ql ++; fifo_total += count; } @@ -261,6 +262,11 @@ int PicoVideoFIFOWrite(int count, int byte_p, unsigned sr_mask,unsigned sr_flags // if CPU is waiting for the bus, advance CPU and FIFO until bus is free if ((pv->status & (PVS_CPUWR|PVS_DMAFILL)) == PVS_CPUWR) burn = PicoVideoFIFODrain(4, lc); + else if (fifo_queue[fifo_qx&7]&2) { + // if interrupting a DMA fill terminate it + AdvanceFIFOEntry(fifo_cnt); + pv->status &= 
~PVS_DMAFILL; + } return burn; } @@ -515,7 +521,7 @@ static void DmaCopy(int len) int source; elprintf(EL_VDPDMA, "DmaCopy len %i [%u]", len, SekCyclesDone()); - SekCyclesBurnRun(PicoVideoFIFOWrite(len, 1, PVS_CPUWR|PVS_DMAPEND, SR_DMA)); + SekCyclesBurnRun(PicoVideoFIFOWrite(len, 1, PVS_CPUWR | PVS_DMAPEND, SR_DMA)); source =Pico.video.reg[0x15]; source|=Pico.video.reg[0x16]<<8; @@ -544,7 +550,8 @@ static NOINLINE void DmaFill(int data) len = GetDmaLength(); elprintf(EL_VDPDMA, "DmaFill len %i inc %i [%u]", len, inc, SekCyclesDone()); - SekCyclesBurnRun(PicoVideoFIFOWrite(len, Pico.video.type == 1, PVS_CPUWR|PVS_DMAPEND, SR_DMA)); + SekCyclesBurnRun(PicoVideoFIFOWrite(len, 2|(Pico.video.type == 1), + PVS_CPUWR | PVS_DMAPEND, SR_DMA)); switch (Pico.video.type) { From 511110a2445a3c788e5f9fe99e514e07ff8b9037 Mon Sep 17 00:00:00 2001 From: kub Date: Sun, 16 Feb 2020 08:32:29 +0100 Subject: [PATCH 105/174] fix compatibility with ancient gas --- README.md | 5 +---- cyclone_gp2x.patch | 41 ----------------------------------------- 2 files changed, 1 insertion(+), 45 deletions(-) delete mode 100644 cyclone_gp2x.patch diff --git a/README.md b/README.md index 67f60c2cf..a5d0ad3a2 100644 --- a/README.md +++ b/README.md @@ -35,10 +35,7 @@ opendingux|opendingux|CROSS_COMPILE=mipsel-linux- CFLAGS="-I$TC/usr/include -I$T opendingux|opendingux with ubuntu mips gcc 5.4|CROSS_COMPILE=mipsel-linux-gnu- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL" LDFLAGS="-B$TC/usr/lib -B$TC/lib -Wl,-rpath-link=$TC/usr/lib -Wl,-rpath-link=$TC/lib" ./configure --platform=opendingux gcw0|gcw0|CROSS_COMPILE=mipsel-gcw0-linux-uclibc- CFLAGS="-I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include -I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/SDL" LDFLAGS="--sysroot $TC/usr/mipsel-gcw0-linux-uclibc/sysroot" ./configure --platform=gcw0 -For gp2x, wiz, and caanoo you may need to compile libpng first, and additionally -cyclone_gp2x.patch may need to be applied to the cpu/cyclone submodule: - -> 
patch -d cpu/cyclone -p1 Date: Sun, 16 Feb 2020 08:42:45 +0100 Subject: [PATCH 106/174] 32X poll detection fix --- pico/32x/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 30d9b577c..f772d28dc 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -239,7 +239,7 @@ static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2) // NB this can take an eternity on m68k: mov.b , needs // 28 m68k-cycles (~80 sh2-cycles) to complete (observed in Metal Head) q = &fifo[(sh2_poll_wr[hix]-1) % PFIFO_SZ]; - if (rd != wr && q->a == a && !CYCLES_GT(cycles,q->cycles+30)) { + if (cpu < 0 && rd != wr && q->a == a && !CYCLES_GT(cycles,q->cycles+30)) { q->d = d; } else { // store write to poll address in fifo From 96e5e8af08f07460595a6c29b9c0a4644d4d44aa Mon Sep 17 00:00:00 2001 From: kub Date: Sun, 16 Feb 2020 13:48:51 +0100 Subject: [PATCH 107/174] vdp rendering fixes --- pico/draw.c | 20 ++++++++++++-------- pico/draw2.c | 2 +- pico/draw_arm.S | 15 ++++++++++----- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/pico/draw.c b/pico/draw.c index dfbd53cc3..9324a5a9b 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -339,12 +339,13 @@ static void DrawStripVSRam(struct TileStrip *ts, int plane_sh, int cellskip) oldcode = code; // Get tile address/2: addr=(code&0x7ff)<<4; - if (code&0x1000) addr+=14-ty; else addr+=ty; // Y-flip pal=((code>>9)&0x30)|((plane_sh<<5)&0x40); } - pack = *(unsigned int *)(PicoMem.vram + addr); + if (code & 0x1000) ty ^= 0xe; // Y-flip + pack = *(unsigned int *)(PicoMem.vram + addr+ty); + if (!pack) { blank = code; continue; @@ -394,7 +395,7 @@ void DrawStripInterlace(struct TileStrip *ts) if (code!=oldcode) { oldcode = code; // Get tile address/2: - addr=(code&0x7ff)<<5; + addr=(code&0x3ff)<<5; if (code&0x1000) addr+=30-ty; else addr+=ty; // Y-flip // pal=Pico.cram+((code>>9)&0x30); @@ -449,8 +450,11 @@ static void DrawLayer(int plane_sh, int *hcache, int 
cellskip, int maxcells, else ts.nametab=(pvid->reg[2]&0x38)<< 9; // A htab=pvid->reg[13]<<9; // Horizontal scroll table address - if ( pvid->reg[11]&2) htab+=est->DrawScanline<<1; // Offset by line - if ((pvid->reg[11]&1)==0) htab&=~0xf; // Offset by tile + switch (pvid->reg[11]&3) { + case 1: htab += (est->DrawScanline<<1) & 0x0f; break; + case 2: htab += (est->DrawScanline<<1) & ~0x0f; break; // Offset by tile + case 3: htab += (est->DrawScanline<<1); break; // Offset by line + } htab+=plane_sh&1; // A or B // Get horizontal scroll value, will be masked later @@ -626,9 +630,9 @@ static void DrawTilesFromCache(int *hc, int sh, int rlim, struct PicoEState *est if (!sh) { - short blank=-1; // The tile we know is blank + int blank=-1; // The tile we know is blank while ((code=*hc++)) { - if (!(code & 0x8000) || (short)code == blank) + if (!(code & 0x8000) || (unsigned short)code == blank) continue; // Get tile address/2: addr = (code & 0x7ff) << 4; @@ -636,7 +640,7 @@ static void DrawTilesFromCache(int *hc, int sh, int rlim, struct PicoEState *est pack = *(unsigned int *)(PicoMem.vram + addr); if (!pack) { - blank = (short)code; + blank = (unsigned short)code; continue; } diff --git a/pico/draw2.c b/pico/draw2.c index 85e2b2759..910697707 100644 --- a/pico/draw2.c +++ b/pico/draw2.c @@ -20,7 +20,7 @@ #define LINE_WIDTH 328 #endif -static unsigned char PicoDraw2FB_[(8+320) * (8+240+8)]; +static unsigned char PicoDraw2FB_[(8+320) * (8+240+8) + 8]; static int HighCache2A[41*(TILE_ROWS+1)+1+1]; // caches for high layers static int HighCache2B[41*(TILE_ROWS+1)+1+1]; diff --git a/pico/draw_arm.S b/pico/draw_arm.S index fb6d0950f..967bf6aa4 100644 --- a/pico/draw_arm.S +++ b/pico/draw_arm.S @@ -342,11 +342,15 @@ DrawLayer: mov r4, r8, lsr #8 @ pvid->reg[13] mov r4, r4, lsl #10 @ htab=pvid->reg[13]<<9; (halfwords) - tst r7, #2 - addne r4, r4, r2, lsl #2 @ htab+=DrawScanline<<1; // Offset by line - tst r7, #1 - biceq r4, r4, #0x1f @ htab&=~0xf; // Offset by tile - add r4, r4, 
r0, lsl #1 @ htab+=plane + + ands r3, r7, #0x03 + beq 0f + cmp r3, #2 + mov r3, r2, lsl #2 @ htab+=DrawScanline<<1; // Offset by line + biceq r3, #0x1f @ htab&=~0xf; // Offset by tile + andlt r3, #0x1f + add r4, r4, r3 +0: add r4, r4, r0, lsl #1 @ htab+=plane bic r4, r4, #0x00ff0000 @ just in case ldrh r3, [lr, r4] @ r3=hscroll @@ -599,6 +603,7 @@ DrawLayer: tst r7, #0x8000 bne .DrawStrip_vs_hiprio + orr r7, r7, r10, lsl #24 @ code | (ty << 24) cmp r7, r9 beq .DrawStrip_vs_samecode @ we know stuff about this tile already From 867ca1b96908dba98322ca86de1875d2e12954ac Mon Sep 17 00:00:00 2001 From: kub Date: Sun, 16 Feb 2020 14:08:48 +0100 Subject: [PATCH 108/174] vdp fifo, tentative fix for broken save/load --- pico/pico_int.h | 9 ++++- pico/state.c | 8 +++- pico/videoport.c | 98 +++++++++++++++++++++++++++++++----------------- 3 files changed, 77 insertions(+), 38 deletions(-) diff --git a/pico/pico_int.h b/pico/pico_int.h index b3ce8a722..703292243 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -316,7 +316,10 @@ struct PicoVideo unsigned char debug_p; // ... 
parsed: PVD_* unsigned char addr_u; // bit16 of .addr unsigned char hint_cnt; - unsigned char pad[0x0b]; + unsigned char pad2; + unsigned short hv_latch; // latched hvcounter value + signed int fifo_cnt; // pending xfers for current FIFO queue entry + unsigned char pad[0x04]; }; struct PicoMisc @@ -339,7 +342,7 @@ struct PicoMisc unsigned char eeprom_slave; // EEPROM slave word for X24C02 and better SRAMs unsigned char eeprom_status; unsigned char pad1; // was ym2612 status - unsigned short pad2; // 18 was dma_xfers + unsigned short dma_xfers; // 18 unused (was VDP DMA transfer count) unsigned char eeprom_wb[2]; // EEPROM latch/write buffer unsigned int frame_count; // 1c for movies and idle det }; @@ -856,6 +859,8 @@ extern int (*PicoDmaHook)(unsigned int source, int len, unsigned short **base, u void PicoVideoFIFOSync(int cycles); int PicoVideoFIFOHint(void); int PicoVideoFIFOWrite(int count, int byte_p, unsigned sr_mask, unsigned sr_flags); +void PicoVideoSave(void); +void PicoVideoLoad(void); // misc.c PICO_INTERNAL_ASM void memcpy16bswap(unsigned short *dest, void *src, int count); diff --git a/pico/state.c b/pico/state.c index dc15bc05f..3deb5aa72 100644 --- a/pico/state.c +++ b/pico/state.c @@ -254,6 +254,8 @@ static int state_save(void *file) CHECKED_WRITE_BUFF(CHUNK_ZRAM, PicoMem.zram); CHECKED_WRITE_BUFF(CHUNK_CRAM, PicoMem.cram); CHECKED_WRITE_BUFF(CHUNK_MISC, Pico.m); + + PicoVideoSave(); CHECKED_WRITE_BUFF(CHUNK_VIDEO, Pico.video); z80_pack(buff_z80); @@ -437,7 +439,11 @@ static int state_load(void *file) case CHUNK_CRAM: CHECKED_READ_BUFF(PicoMem.cram); break; case CHUNK_VSRAM: CHECKED_READ_BUFF(PicoMem.vsram); break; case CHUNK_MISC: CHECKED_READ_BUFF(Pico.m); break; - case CHUNK_VIDEO: CHECKED_READ_BUFF(Pico.video); break; + case CHUNK_VIDEO: + CHECKED_READ_BUFF(Pico.video); + PicoVideoLoad(); + break; + case CHUNK_IOPORTS: CHECKED_READ_BUFF(PicoMem.ioports); break; case CHUNK_PSG: CHECKED_READ2(28*4, sn76496_regs); break; case CHUNK_FM: diff --git 
a/pico/videoport.c b/pico/videoport.c index 0a6a103fb..f64ac6939 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -14,7 +14,6 @@ extern const unsigned char hcounts_32[]; extern const unsigned char hcounts_40[]; -static unsigned hvlatch; // latched hvcounter value static int blankline; // display disabled for this line int (*PicoDmaHook)(unsigned int source, int len, unsigned short **base, unsigned int *mask) = NULL; @@ -70,7 +69,7 @@ const unsigned char vdpcyc2sl_32[] = { // 68k cycles/4 since HINT to slot # 14,14,14,14,14,14,14,14,15,16,16,16,16,16,16,16, }; const unsigned char vdpsl2cyc_32[] = { // slot # to 68k cycles/4 since HINT - 0, 9, 19, 30, 35, 41, 52, 58, 64, 75, 81, 87, 98,104,110,120,121,123,123 + 0, 9, 19, 30, 35, 41, 52, 58, 64, 75, 81, 87, 98,104,110,120,121,123 }; // VDP transfer slots in active display 40col mode. 1 slot is 488/210 = 2.3238 @@ -88,33 +87,37 @@ const unsigned char vdpcyc2sl_40[] = { // 68k cycles/4 since HINT to slot # 16,16,16,16,16,16,16,16,17,18,18,18,18,18,18,18, }; const unsigned char vdpsl2cyc_40[] = { // slot # to 68k cycles/4 since HINT - 0, 13, 28, 33, 37, 47, 51, 56, 65, 70, 74, 84, 88, 93,102,107,112,120,121,123,123 + 0, 13, 28, 33, 37, 47, 51, 56, 65, 70, 74, 84, 88, 93,102,107,112,120,121,123 }; // NB code assumes fifo_* arrays have size 2^n // last transferred FIFO data, ...x = index XXX currently only CPU -static short fifo_data[4], fifo_dx; +static short fifo_data[4], fifo_dx; // XXX must go into save? + // queued FIFO transfers, ...x = index, ...l = queue length // each entry has 2 values: [n]>>2=#writes, [n]&3=flags:2=DMA fill 1=byte access -static int fifo_queue[8], fifo_qx, fifo_ql; +static int fifo_queue[8], fifo_qx, fifo_ql; // XXX must go into save? 
+unsigned int fifo_total; // total# of pending FIFO entries -signed int fifo_cnt; // pending slots for current queue entry unsigned short fifo_slot; // last executed slot in current scanline -unsigned int fifo_total; // total# of pending FIFO entries // do the FIFO math -static __inline int AdvanceFIFOEntry(int slots) +static __inline int AdvanceFIFOEntry(struct PicoVideo *pv, int slots) { int l = slots, b = fifo_queue[fifo_qx&7] & 1; - if (l > fifo_cnt) - l = fifo_cnt; - fifo_total -= ((fifo_cnt & b) + l) >> b; - fifo_cnt -= l; + if (l > pv->fifo_cnt) + l = pv->fifo_cnt; + fifo_total -= ((pv->fifo_cnt & b) + l) >> b; + pv->fifo_cnt -= l; - if (fifo_cnt == 0) { - fifo_qx ++, fifo_ql --; - fifo_cnt= (fifo_queue[fifo_qx&7] >> 2) << (fifo_queue[fifo_qx&7] & 1); + if (pv->fifo_cnt == 0) { + if (fifo_ql) + fifo_qx ++, fifo_ql --; + if (fifo_ql) + pv->fifo_cnt= (fifo_queue[fifo_qx&7] >> 2) << (fifo_queue[fifo_qx&7] & 1); + else + fifo_total = 0; } return l; } @@ -129,7 +132,7 @@ static __inline int GetFIFOSlot(struct PicoVideo *pv, int cycles) else return (cycles * vdpcyc2sl_bl[h40] + cycles) >> 16; } -static inline int GetFIFOCycles(struct PicoVideo *pv, int slot) +static __inline int GetFIFOCycles(struct PicoVideo *pv, int slot) { int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); int h40 = pv->reg[12] & 1; @@ -146,13 +149,12 @@ void PicoVideoFIFOSync(int cycles) int slots, done; // calculate #slots since last executed slot - slots = GetFIFOSlot(pv, cycles); - slots -= fifo_slot; + slots = GetFIFOSlot(pv, cycles) - fifo_slot; // advance FIFO queue by #done slots done = slots; - while (done > 0 && fifo_ql) { - int l = AdvanceFIFOEntry(done); + while (done > 0 && pv->fifo_cnt) { + int l = AdvanceFIFOEntry(pv, done); fifo_slot += l; done -= l; } @@ -181,7 +183,7 @@ int PicoVideoFIFODrain(int level, int cycles) int b = fifo_queue[fifo_qx&7] & 1; int cnt = (fifo_total-level) << b; int last = fifo_slot; - int slot = (fifo_cntfifo_cntfifo_cnt:cnt) + last; // target 
slot unsigned ocyc = cycles; if (slot > maxsl) { @@ -196,7 +198,7 @@ int PicoVideoFIFODrain(int level, int cycles) } burn += cycles - ocyc; - AdvanceFIFOEntry(slot - last); + AdvanceFIFOEntry(pv, slot - last); } // release CPU and terminate DMA if FIFO isn't blocking the bus anymore @@ -249,7 +251,7 @@ int PicoVideoFIFOWrite(int count, int flags, unsigned sr_mask,unsigned sr_flags) // update FIFO state if it was empty if (fifo_total == 0 && count) { fifo_slot = GetFIFOSlot(pv, lc); - fifo_cnt = count << (flags&1); + pv->fifo_cnt = count << (flags&1); } // create xfer queue entry @@ -263,8 +265,8 @@ int PicoVideoFIFOWrite(int count, int flags, unsigned sr_mask,unsigned sr_flags) if ((pv->status & (PVS_CPUWR|PVS_DMAFILL)) == PVS_CPUWR) burn = PicoVideoFIFODrain(4, lc); else if (fifo_queue[fifo_qx&7]&2) { - // if interrupting a DMA fill terminate it - AdvanceFIFOEntry(fifo_cnt); + // if interrupting a DMA fill terminate it XXX wrong, changes fill data + AdvanceFIFOEntry(pv, pv->fifo_cnt); pv->status &= ~PVS_DMAFILL; } @@ -699,9 +701,9 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) // try avoiding the sync.. if (Pico.m.scanline < (pvid->reg[1]&0x08 ? 
240 : 224) && (pvid->reg[1]&0x40) && !(!pvid->pending && - ((pvid->command & 0xc00000f0) == 0x40000010 && PicoMem.vsram[pvid->addr>>1] == d)) + ((pvid->command & 0xc00000f0) == 0x40000010 && PicoMem.vsram[pvid->addr>>1] == (d & 0x7ff))) ) - DrawSync(0); + DrawSync(SekCyclesDone() - Pico.t.m68c_line_start <= 488-440); if (pvid->pending) { CommandChange(); @@ -736,7 +738,7 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) CommandChange(); // Check for dma: if (d & 0x80) { - DrawSync(0); + DrawSync(SekCyclesDone() - Pico.t.m68c_line_start <= 488-390); CommandDma(); } } @@ -747,7 +749,6 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) // Register write: int num=(d>>8)&0x1f; int dold=pvid->reg[num]; - int skip=0; pvid->type=0; // register writes clear command (else no Sega logo in Golden Axe II) if (num > 0x0a && !(pvid->reg[1]&4)) { elprintf(EL_ANOMALY, "%02x written to reg %02x in SMS mode @ %06x", d, num, SekPc); @@ -755,16 +756,14 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) } if (num == 0 && !(pvid->reg[0]&2) && (d&2)) - hvlatch = PicoVideoRead(0x08); + pvid->hv_latch = PicoVideoRead(0x08); if (num == 1 && ((pvid->reg[1]^d)&0x40)) { PicoVideoFIFOMode(d & 0x40); // handle line blanking before line rendering - if (SekCyclesDone() - Pico.t.m68c_line_start <= 488-390) { - skip = 1; + if (SekCyclesDone() - Pico.t.m68c_line_start <= 488-390) blankline = d&0x40 ? 
-1 : Pico.m.scanline; - } } - DrawSync(skip); + DrawSync(SekCyclesDone() - Pico.t.m68c_line_start <= 488-390); pvid->reg[num]=(unsigned char)d; switch (num) { @@ -904,7 +903,7 @@ PICO_INTERNAL_ASM unsigned int PicoVideoRead(unsigned int a) d = (SekCyclesDone() - Pico.t.m68c_line_start) & 0x1ff; // FIXME if (Pico.video.reg[0]&2) - d = hvlatch; + d = Pico.video.hv_latch; else if (Pico.video.reg[12]&1) d = hcounts_40[d] | (Pico.video.v_counter << 8); else d = hcounts_32[d] | (Pico.video.v_counter << 8); @@ -970,4 +969,33 @@ unsigned char PicoVideoRead8HV_L(void) return d; } +void PicoVideoSave(void) +{ + struct PicoVideo *pv = &Pico.video; + int l, x; + + // account for all outstanding xfers XXX kludge, entry attr's not saved + for (l = fifo_ql, x = fifo_qx + l-1; l > 1; l--, x--) + pv->fifo_cnt += (fifo_queue[x&7] >> 2) << (fifo_queue[x&7] & 1); +} + +void PicoVideoLoad(void) +{ + struct PicoVideo *pv = &Pico.video; + int l; + + // convert former dma_xfers (why was this in PicoMisc anyway?) + if (Pico.m.dma_xfers) { + pv->fifo_cnt = Pico.m.dma_xfers * (pv->type == 1 ? 
2 : 1); + fifo_total = Pico.m.dma_xfers; + Pico.m.dma_xfers = 0; + } + + // rebuild SAT cache XXX wrong since cache and memory can differ + for (l = 0; l < 80; l++) { + *((u16 *)VdpSATCache + 2*l ) = PicoMem.vram[(sat>>1) + l*4 ]; + *((u16 *)VdpSATCache + 2*l+1) = PicoMem.vram[(sat>>1) + l*4 + 1]; + } +} + // vim:shiftwidth=2:ts=2:expandtab From b2176c9d412b8870621e4e6e9e14cbe483fef16b Mon Sep 17 00:00:00 2001 From: kub Date: Sun, 16 Feb 2020 13:53:50 +0100 Subject: [PATCH 109/174] vdp sprite handling improvement (SAT cache) --- pico/draw.c | 222 +++++++++++++++++------------------------------ pico/pico_int.h | 1 + pico/videoport.c | 137 ++++++++++++++++++----------- 3 files changed, 170 insertions(+), 190 deletions(-) diff --git a/pico/draw.c b/pico/draw.c index 9324a5a9b..7b66b43c4 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -45,6 +45,8 @@ static int HighCacheA[41+1]; // caches for high layers static int HighCacheB[41+1]; static int HighPreSpr[80*2+1]; // slightly preprocessed sprites +unsigned int VdpSATCache[128]; // VDP sprite cache (1st 32 sprite attr bits) + #define LF_PLANE_1 (1 << 0) #define LF_SH (1 << 1) // must be = 2 #define LF_FORCE (1 << 2) @@ -1124,14 +1126,14 @@ static void DrawSpritesForced(unsigned char *sprited) // Index + 0 : hhhhvvvv ----hhvv yyyyyyyy yyyyyyyy // v, h: vert./horiz. 
size // Index + 4 : xxxxxxxx xxxxxxxx pccvhnnn nnnnnnnn // x: x coord + 8 -static NOINLINE void PrepareSprites(int full) +static NOINLINE void PrepareSprites(int max_lines) { const struct PicoVideo *pvid=&Pico.video; const struct PicoEState *est=&Pico.est; int u,link=0,sh; int table=0; int *pd = HighPreSpr; - int max_lines = 224, max_sprites = 80, max_width = 328; + int max_sprites = 80, max_width = 328; int max_line_sprites = 20; // 20 sprites, 40 tiles if (!(Pico.video.reg[12]&1)) @@ -1139,160 +1141,101 @@ static NOINLINE void PrepareSprites(int full) if (PicoIn.opt & POPT_DIS_SPRITE_LIM) max_line_sprites = MAX_LINE_SPRITES; - if (pvid->reg[1]&8) max_lines = 240; sh = Pico.video.reg[0xC]&8; // shadow/hilight? table=pvid->reg[5]&0x7f; if (pvid->reg[12]&1) table&=0x7e; // Lowest bit 0 in 40-cell mode table<<=8; // Get sprite table address/2 - if (!full) - { - int pack; - // updates: tilecode, sx - for (u=0; u < max_sprites && link < max_sprites && (pack = *pd); u++, pd+=2) - { - unsigned int *sprite; - int code2, sx, sy, height, width; - - sprite=(unsigned int *)(PicoMem.vram+((table+(link<<2))&0x7ffc)); // Find sprite + for (u = est->DrawScanline; u < max_lines; u++) + *((int *)&HighLnSpr[u][0]) = 0; - // parse sprite info - code2 = sprite[1]; - sx = (code2>>16)&0x1ff; - sx -= 0x78; // Get X coordinate + 8 - sy = (pack << 16) >> 16; - height = (pack >> 24) & 0xf; - width = (pack >> 28); + for (u = 0; u < max_sprites && link < max_sprites; u++) + { + unsigned int *sprite; + int code, code2, sx, sy, hv, height, width; - if (sy < max_lines && - sy + (height<<3) > est->DrawScanline) // sprite onscreen (y)? - { - int y = (sy >= est->DrawScanline) ? sy : est->DrawScanline; - int entry = ((pd - HighPreSpr) / 2) | ((code2>>8)&0x80); - for (; y < sy + (height<<3) && y < max_lines; y++) - { - int i, cnt; - cnt = HighLnSpr[y][0]; - if (HighLnSpr[y][3] >= max_line_sprites) continue; // sprite limit? 
- - for (i = 0; i < cnt; i++) - if (((HighLnSpr[y][4+i] ^ entry) & 0x7f) == 0) goto found; - - // this sprite was previously missing - HighLnSpr[y][3] ++; - if (sx > -24 && sx < max_width) { // onscreen x - HighLnSpr[y][4+cnt] = entry; // XXX wrong sequence? - HighLnSpr[y][5+cnt] = width; // XXX should count tiles for limit - HighLnSpr[y][0] = cnt + 1; - } -found:; - if (entry & 0x80) - HighLnSpr[y][1] |= SPRL_HAVE_HI; - else HighLnSpr[y][1] |= SPRL_HAVE_LO; - } - } + sprite=(unsigned int *)(PicoMem.vram+((table+(link<<2))&0x7ffc)); // Find sprite - code2 &= ~0xfe000000; - code2 -= 0x00780000; // Get X coordinate + 8 in upper 16 bits - pd[1] = code2; + // parse sprite info. the 1st half comes from the VDPs internal cache, + // the 2nd half is read from VRAM + code = VdpSATCache[link]; // normally but not always equal to sprite[0] + sy = (code&0x1ff)-0x80; + hv = (code>>24)&0xf; + height = (hv&3)+1; + width = (hv>>2)+1; - // Find next sprite - link=(sprite[0]>>16)&0x7f; - if (!link) break; // End of sprites - } - } - else - { - for (u = 0; u < max_lines; u++) - *((int *)&HighLnSpr[u][0]) = 0; + code2 = sprite[1]; + sx = (code2>>16)&0x1ff; + sx -= 0x78; // Get X coordinate + 8 - for (u = 0; u < max_sprites && link < max_sprites; u++) + if (sy < max_lines && sy + (height<<3) >= est->DrawScanline) // sprite onscreen (y)? { - unsigned int *sprite; - int code, code2, sx, sy, hv, height, width; + int entry, y, w, sx_min, onscr_x, maybe_op = 0; - sprite=(unsigned int *)(PicoMem.vram+((table+(link<<2))&0x7ffc)); // Find sprite + sx_min = 8-(width<<3); + onscr_x = sx_min < sx && sx < max_width; + if (sh && (code2 & 0x6000) == 0x6000) + maybe_op = SPRL_MAY_HAVE_OP; - // parse sprite info - code = sprite[0]; - sy = (code&0x1ff)-0x80; - hv = (code>>24)&0xf; - height = (hv&3)+1; - - width = (hv>>2)+1; - code2 = sprite[1]; - sx = (code2>>16)&0x1ff; - sx -= 0x78; // Get X coordinate + 8 - - if (sy < max_lines && sy + (height<<3) > est->DrawScanline) // sprite onscreen (y)? 
+ entry = ((pd - HighPreSpr) / 2) | ((code2>>8)&0x80); + y = (sy >= est->DrawScanline) ? sy : est->DrawScanline; + for (; y < sy + (height<<3) && y < max_lines; y++) { - int entry, y, w, sx_min, onscr_x, maybe_op = 0; - - sx_min = 8-(width<<3); - onscr_x = sx_min < sx && sx < max_width; - if (sh && (code2 & 0x6000) == 0x6000) - maybe_op = SPRL_MAY_HAVE_OP; - - entry = ((pd - HighPreSpr) / 2) | ((code2>>8)&0x80); - y = (sy >= est->DrawScanline) ? sy : est->DrawScanline; - for (; y < sy + (height<<3) && y < max_lines; y++) - { - unsigned char *p = &HighLnSpr[y][0]; - int cnt = p[0]; - if (p[3] >= max_line_sprites) continue; // sprite limit? - if ((p[1] & SPRL_MASKED) && !(entry & 0x80)) continue; // masked? - - w = width; - if (p[2] + width > max_line_sprites*2) { // tile limit? - if (y+1 < 240) HighLnSpr[y+1][1] |= SPRL_TILE_OVFL; - if (p[2] >= max_line_sprites*2) continue; - w = max_line_sprites*2 - p[2]; - } - p[2] += w; - p[3] ++; - - if (sx == -0x78) { - if (p[1] & (SPRL_HAVE_X|SPRL_TILE_OVFL)) - p[1] |= SPRL_MASKED; // masked, no more low sprites for this line - if (!(p[1] & SPRL_HAVE_X) && cnt == 0) - p[1] |= SPRL_HAVE_MASK0; // 1st sprite is masking - } else - p[1] |= SPRL_HAVE_X; - - if (!onscr_x) continue; // offscreen x - - p[4+cnt] = entry; - p[5+cnt] = w; // width clipped by tile limit for sprite renderer - p[0] = cnt + 1; - p[1] |= (entry & 0x80) ? SPRL_HAVE_HI : SPRL_HAVE_LO; - p[1] |= maybe_op; // there might be op sprites on this line - if (cnt > 0 && (code2 & 0x8000) && !(p[4+cnt-1]&0x80)) - p[1] |= SPRL_LO_ABOVE_HI; + unsigned char *p = &HighLnSpr[y][0]; + int cnt = p[0]; + if (p[3] >= max_line_sprites) continue; // sprite limit? + if ((p[1] & SPRL_MASKED) && !(entry & 0x80)) continue; // masked? + + w = width; + if (p[2] + width > max_line_sprites*2) { // tile limit? 
+ if (y+1 < 240) HighLnSpr[y+1][1] |= SPRL_TILE_OVFL; + if (p[2] >= max_line_sprites*2) continue; + w = max_line_sprites*2 - p[2]; } + p[2] += w; + p[3] ++; + + if (sx == -0x78) { + if (p[1] & (SPRL_HAVE_X|SPRL_TILE_OVFL)) + p[1] |= SPRL_MASKED; // masked, no more low sprites for this line + if (!(p[1] & SPRL_HAVE_X) && cnt == 0) + p[1] |= SPRL_HAVE_MASK0; // 1st sprite is masking + } else + p[1] |= SPRL_HAVE_X; + + if (!onscr_x) continue; // offscreen x + + p[4+cnt] = entry; + p[5+cnt] = w; // width clipped by tile limit for sprite renderer + p[0] = cnt + 1; + p[1] |= (entry & 0x80) ? SPRL_HAVE_HI : SPRL_HAVE_LO; + p[1] |= maybe_op; // there might be op sprites on this line + if (cnt > 0 && (code2 & 0x8000) && !(p[4+cnt-1]&0x80)) + p[1] |= SPRL_LO_ABOVE_HI; } + } - *pd++ = (width<<28)|(height<<24)|(hv<<16)|((unsigned short)sy); - *pd++ = (sx<<16)|((unsigned short)code2); + *pd++ = (width<<28)|(height<<24)|(hv<<16)|((unsigned short)sy); + *pd++ = (sx<<16)|((unsigned short)code2); - // Find next sprite - link=(code>>16)&0x7f; - if (!link) break; // End of sprites - } - *pd = 0; + // Find next sprite + link=(code>>16)&0x7f; + if (!link) break; // End of sprites + } + *pd = 0; #if 0 - for (u = 0; u < max_lines; u++) - { - int y; - printf("c%03i: f %x c %2i/%2i w %2i: ", u, HighLnSpr[u][1], - HighLnSpr[u][0], HighLnSpr[u][3], HighLnSpr[u][2]); - for (y = 0; y < HighLnSpr[u][0]; y++) - printf(" %i", HighLnSpr[u][y+4]); - printf("\n"); - } -#endif + for (u = 0; u < max_lines; u++) + { + int y; + printf("c%03i: f %x c %2i/%2i w %2i: ", u, HighLnSpr[u][1], + HighLnSpr[u][0], HighLnSpr[u][3], HighLnSpr[u][2]); + for (y = 0; y < HighLnSpr[u][0]; y++) + printf(" %i", HighLnSpr[u][y+4]); + printf("\n"); } +#endif } #ifndef _ASM_DRAW_C @@ -1505,12 +1448,11 @@ static int DrawDisplay(int sh) int win=0, edge=0, hvwind=0, lflags; int maxw, maxcells; - if (est->rendstatus & (PDRAW_SPRITES_MOVED|PDRAW_DIRTY_SPRITES)) { - // elprintf(EL_STATUS, "PrepareSprites(%i)", 
(est->rendstatus>>4)&1); - PrepareSprites(est->rendstatus & PDRAW_DIRTY_SPRITES); - est->rendstatus &= ~(PDRAW_SPRITES_MOVED|PDRAW_DIRTY_SPRITES); - } + if (!(est->DrawScanline & 15) || + (est->rendstatus & (PDRAW_SPRITES_MOVED|PDRAW_DIRTY_SPRITES))) + PrepareSprites((est->DrawScanline+16) & ~15); + est->rendstatus &= ~(PDRAW_SPRITES_MOVED|PDRAW_DIRTY_SPRITES); est->rendstatus &= ~(PDRAW_SHHI_DONE|PDRAW_PLANE_HI_PRIO); if (pvid->reg[12]&1) { @@ -1656,8 +1598,6 @@ PICO_INTERNAL void PicoFrameStart(void) if (PicoIn.opt & POPT_ALT_RENDERER) return; - - PrepareSprites(1); } static void DrawBlankedLine(int line, int offs, int sh, int bgc) diff --git a/pico/pico_int.h b/pico/pico_int.h index 703292243..12f35b562 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -676,6 +676,7 @@ extern int (*PicoScanEnd)(unsigned int num); extern unsigned char HighLnSpr[240][4+MAX_LINE_SPRITES+1]; extern void *DrawLineDestBase; extern int DrawLineDestIncrement; +extern unsigned int VdpSATCache[128]; // draw2.c void PicoDraw2Init(void); diff --git a/pico/videoport.c b/pico/videoport.c index f64ac6939..b9e0401b1 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -15,6 +15,8 @@ extern const unsigned char hcounts_32[]; extern const unsigned char hcounts_40[]; static int blankline; // display disabled for this line +static unsigned sat; // VRAM addr of sprite attribute table +static int satxbits; // index bits in SAT address int (*PicoDmaHook)(unsigned int source, int len, unsigned short **base, unsigned int *mask) = NULL; @@ -315,14 +317,37 @@ void PicoVideoFIFOMode(int active) static __inline void AutoIncrement(void) { Pico.video.addr=(unsigned short)(Pico.video.addr+Pico.video.reg[0xf]); + if (Pico.video.addr < Pico.video.reg[0xf]) Pico.video.addr_u ^= 1; } -static NOINLINE unsigned int VideoWrite128(u32 a, u16 d) +static __inline void UpdateSAT(u32 a, u32 d) +{ + Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; + if (!((a^sat) >> satxbits) && !(a & 4)) { + int num = (a >> 3) & 0x7f; + 
((u16 *)&VdpSATCache[num])[(a&3) >> 1] = d; + } +} + +static NOINLINE void VideoWriteVRAM128(u32 a, u16 d) { // nasty - a = ((a & 2) >> 1) | ((a & 0x400) >> 9) | (a & 0x3FC) | ((a & 0x1F800) >> 1); - ((u8 *)PicoMem.vram)[a] = d; - return a; + u32 b = ((a & 2) >> 1) | ((a & 0x400) >> 9) | (a & 0x3FC) | ((a & 0x1F800) >> 1); + + ((u8 *)PicoMem.vram)[b] = d; + if (!((u16)(b^sat) >> satxbits)) + Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; + + if (!((u16)(a^sat) >> satxbits)) + UpdateSAT(a, d); +} + +static void VideoWriteVRAM(u32 a, u16 d) +{ + PicoMem.vram [(u16)a >> 1] = d; + + if (!((u16)(a^sat) >> satxbits)) + UpdateSAT(a, d); } static void VideoWrite(u16 d) @@ -333,19 +358,15 @@ static void VideoWrite(u16 d) { case 1: if (a & 1) d = (u16)((d << 8) | (d >> 8)); - PicoMem.vram [(a >> 1) & 0x7fff] = d; - if ((unsigned)(a - ((Pico.video.reg[5]&0x7f) << 9)) < 0x400) - Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; + a |= Pico.video.addr_u << 16; + VideoWriteVRAM(a, d); break; case 3: if (PicoMem.cram [(a >> 1) & 0x3f] != d) Pico.m.dirtyPal = 1; - PicoMem.cram [(a >> 1) & 0x3f] = d; break; - case 5: PicoMem.vsram[(a >> 1) & 0x3f] = d; break; - case 0x81: if (a & 1) - d = (u16)((d << 8) | (d >> 8)); + PicoMem.cram [(a >> 1) & 0x3f] = d & 0xeee; break; + case 5: PicoMem.vsram[(a >> 1) & 0x3f] = d & 0x7ff; break; + case 0x81: a |= Pico.video.addr_u << 16; - a = VideoWrite128(a, d); - if ((unsigned)(a - ((Pico.video.reg[5]&0x7f) << 9)) < 0x400) - Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; + VideoWriteVRAM128(a, d); break; //default:elprintf(EL_ANOMALY, "VDP write %04x with bad type %i", d, Pico.video.type); break; } @@ -363,9 +384,10 @@ static unsigned int VideoRead(void) switch (Pico.video.type) { case 0: d=PicoMem.vram [a & 0x7fff]; break; - case 8: d=(PicoMem.cram [a & 0x003f] & 0x0eee) | (d & ~0x0eee); break; + case 8: d=PicoMem.cram [a & 0x003f] | (d & ~0x0eee); break; + case 4: if ((a & 0x3f) >= 0x28) a = 0; - d=(PicoMem.vsram [a & 0x003f] & 0x07ff) | (d & ~0x07ff); 
break; + d=PicoMem.vsram [a & 0x003f] | (d & ~0x07ff); break; case 12:a=PicoMem.vram [a & 0x7fff]; if (Pico.video.addr&1) a >>= 8; d=(a & 0x00ff) | (d & ~0x00ff); break; default:elprintf(EL_ANOMALY, "VDP read with bad type %i", Pico.video.type); break; @@ -391,7 +413,7 @@ static int GetDmaLength(void) static void DmaSlow(int len, unsigned int source) { u32 inc = Pico.video.reg[0xf]; - u32 a = Pico.video.addr; + u32 a = Pico.video.addr | (Pico.video.addr_u << 16); u16 *r, *base = NULL; u32 mask = 0x1ffff; @@ -451,26 +473,28 @@ static void DmaSlow(int len, unsigned int source) switch (Pico.video.type) { case 1: // vram +#if 0 r = PicoMem.vram; - if (inc == 2 && !(a & 1) && a + len * 2 < 0x10000 - && !(((source + len - 1) ^ source) & ~mask)) + if (inc == 2 && !(a & 1) && (a >> 16) == ((a + len*2) >> 16) && + (source & ~mask) == ((source + len-1) & ~mask) && + (a << 16 >= (sat+0x280) << 16 || (a + len*2) << 16 <= sat << 16)) { // most used DMA mode memcpy((char *)r + a, base + (source & mask), len * 2); a += len * 2; } else +#endif { for(; len; len--) { u16 d = base[source++ & mask]; if(a & 1) d=(d<<8)|(d>>8); - r[a >> 1] = d; + VideoWriteVRAM(a, d); // AutoIncrement - a = (u16)(a + inc); + a = (a+inc) & ~0x20000; } } - Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; break; case 3: // cram @@ -478,9 +502,9 @@ static void DmaSlow(int len, unsigned int source) r = PicoMem.cram; for (; len; len--) { - r[(a / 2) & 0x3f] = base[source++ & mask]; + r[(a / 2) & 0x3f] = base[source++ & mask] & 0xeee; // AutoIncrement - a += inc; + a = (a+inc) & ~0x20000; } break; @@ -488,22 +512,20 @@ static void DmaSlow(int len, unsigned int source) r = PicoMem.vsram; for (; len; len--) { - r[(a / 2) & 0x3f] = base[source++ & mask]; + r[(a / 2) & 0x3f] = base[source++ & mask] & 0x7ff; // AutoIncrement - a += inc; + a = (a+inc) & ~0x20000; } break; case 0x81: // vram 128k - a |= Pico.video.addr_u << 16; for(; len; len--) { - VideoWrite128(a, base[source++ & mask]); + u16 d = base[source++ & mask]; 
+ VideoWriteVRAM128(a, d); // AutoIncrement - a = (a + inc) & 0x1ffff; + a = (a+inc) & ~0x20000; } - Pico.video.addr_u = a >> 16; - Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; break; default: @@ -512,12 +534,13 @@ static void DmaSlow(int len, unsigned int source) break; } // remember addr - Pico.video.addr=(u16)a; + Pico.video.addr = a; + Pico.video.addr_u = a >> 16; } static void DmaCopy(int len) { - u16 a = Pico.video.addr; + u32 a = Pico.video.addr | (Pico.video.addr_u << 16); u8 *vr = (u8 *)PicoMem.vram; u8 inc = Pico.video.reg[0xf]; int source; @@ -528,21 +551,23 @@ static void DmaCopy(int len) source =Pico.video.reg[0x15]; source|=Pico.video.reg[0x16]<<8; - // XXX implement VRAM 128k? Is this even working? + // XXX implement VRAM 128k? Is this even working? count still in bytes? for (; len; len--) { - vr[a] = vr[source++ & 0xffff]; + vr[(u16)a] = vr[(u16)(source++)]; + if (!((u16)(a^sat) >> satxbits)) + UpdateSAT(a, ((u16 *)vr)[(u16)a >> 1]); // AutoIncrement - a=(u16)(a+inc); + a = (a+inc) & ~0x20000; } // remember addr - Pico.video.addr=a; - Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; + Pico.video.addr = a; + Pico.video.addr_u = a >> 16; } static NOINLINE void DmaFill(int data) { - u16 a = Pico.video.addr; + u32 a = Pico.video.addr | (Pico.video.addr_u << 16); u8 *vr = (u8 *)PicoMem.vram; u8 high = (u8)(data >> 8); u8 inc = Pico.video.reg[0xf]; @@ -561,40 +586,41 @@ static NOINLINE void DmaFill(int data) for (l = len; l; l--) { // Write upper byte to adjacent address // (here we are byteswapped, so address is already 'adjacent') - vr[a] = high; + vr[(u16)a] = high; + if (!((u16)(a^sat) >> satxbits)) + UpdateSAT(a, ((u16 *)vr)[(u16)a >> 1]); // Increment address register - a = (u16)(a + inc); + a = (a+inc) & ~0x20000; } - Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; break; case 3: // cram Pico.m.dirtyPal = 1; + data &= 0xeee; for (l = len; l; l--) { PicoMem.cram[(a/2) & 0x3f] = data; // Increment address register - a += inc; + a = (a+inc) & ~0x20000; } break; 
case 5: { // vsram + data &= 0x7ff; for (l = len; l; l--) { PicoMem.vsram[(a/2) & 0x3f] = data; // Increment address register - a += inc; + a = (a+inc) & ~0x20000; } break; } case 0x81: // vram 128k for (l = len; l; l--) { - VideoWrite128(a, data); + VideoWriteVRAM128(a, data); // Increment address register - a = (a + inc) & 0x1ffff; + a = (a+inc) & ~0x20000; } - Pico.video.addr_u = a >> 16; - Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; break; default: a += len * inc; @@ -603,6 +629,7 @@ static NOINLINE void DmaFill(int data) // remember addr Pico.video.addr = a; + Pico.video.addr_u = a >> 16; // register update Pico.video.reg[0x13] = Pico.video.reg[0x14] = 0; source = Pico.video.reg[0x15]; @@ -779,14 +806,21 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) pvid->status |= ((d >> 3) ^ SR_VB) & SR_VB; // forced blanking goto update_irq; case 0x05: - //elprintf(EL_STATUS, "spritep moved to %04x", (unsigned)(Pico.video.reg[5]&0x7f) << 9); + case 0x06: if (d^dold) Pico.est.rendstatus |= PDRAW_SPRITES_MOVED; break; case 0x0c: // renderers should update their palettes if sh/hi mode is changed if ((d^dold)&8) Pico.m.dirtyPal = 1; break; + default: + return; } + sat = ((pvid->reg[5]&0x7f) << 9) | ((pvid->reg[6]&0x20) << 11); + satxbits = 9; + if (Pico.video.reg[12]&1) + sat &= ~0x200, satxbits = 10; // H40, zero lowest SAT bit + //elprintf(EL_STATUS, "spritep moved to %04x", sat); return; update_irq: @@ -991,6 +1025,11 @@ void PicoVideoLoad(void) Pico.m.dma_xfers = 0; } + sat = ((pv->reg[5]&0x7f) << 9) | ((pv->reg[6]&0x20) << 11); + satxbits = 9; + if (pv->reg[12]&1) + sat &= ~0x200, satxbits = 10; // H40, zero lowest SAT bit + // rebuild SAT cache XXX wrong since cache and memory can differ for (l = 0; l < 80; l++) { *((u16 *)VdpSATCache + 2*l ) = PicoMem.vram[(sat>>1) + l*4 ]; From d0f5b4b4e2b54d8d5bf9a9ff64622bacdad4bcaa Mon Sep 17 00:00:00 2001 From: kub Date: Sun, 23 Feb 2020 11:33:02 +0100 Subject: [PATCH 110/174] vdp fifo, another revision 
--- pico/pico_int.h | 7 +- pico/videoport.c | 217 +++++++++++++++++++++++++---------------------- 2 files changed, 120 insertions(+), 104 deletions(-) diff --git a/pico/pico_int.h b/pico/pico_int.h index 12f35b562..65b56f1d2 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -296,10 +296,9 @@ extern SH2 sh2s[2]; // not part of real SR #define PVS_ACTIVE (1 << 16) #define PVS_VB2 (1 << 17) // ignores forced blanking -#define PVS_CPUWR (1 << 18) // CPU hold by FIFO full -#define PVS_CPURD (1 << 19) // CPU hold by FIFO full -#define PVS_DMAPEND (1 << 20) // DMA operation waiting for start -#define PVS_DMAFILL (1 << 21) // DMA fill is in progress +#define PVS_CPUWR (1 << 18) // CPU write blocked by FIFO full +#define PVS_CPURD (1 << 19) // CPU read blocked by FIFO not empty +#define PVS_DMAFILL (1 << 20) // DMA fill is waiting for fill data struct PicoVideo { diff --git a/pico/videoport.c b/pico/videoport.c index b9e0401b1..533c78806 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -37,8 +37,10 @@ int (*PicoDmaHook)(unsigned int source, int len, unsigned short **base, unsigned * fifo_slot is always behind slot2cyc[cycles]. Advancing it beyond cycles * implies blocking the 68k up to that slot. * - * A FIFO write goes to the end of the fifo queue. There can be more pending - * writes than FIFO slots, but the 68k will be blocked in most of those cases. + * A FIFO write goes to the end of the FIFO queue, but DMA running in background + * is always the last queue entry (transfers by CPU intervene and come 1st). + * There can be more pending writes than FIFO slots, but the CPU will be blocked + * until FIFO level (without background DMA) <= 4. * This is only about correct timing, data xfer must be handled by the caller. * Blocking the CPU means burning cycles via SekCyclesBurn*(), which is to be * executed by the caller. @@ -50,16 +52,14 @@ int (*PicoDmaHook)(unsigned int source, int len, unsigned short **base, unsigned * FIFORead executes a 68k read. 
68k is blocked until the next transfer slot. */ -// FIFO transfer slots per line: H32 blank, H40 blank, H32 active, H40 active -static const short vdpslots[] = { 166, 204, 16, 18 }; -// mapping between slot# and 68k cycles in a blanked scanline -static const int vdpcyc2sl_bl[] = { (166<<16)/488, (204<<16)/488, (16<<16)/488, (18<<16)/488 }; -static const int vdpsl2cyc_bl[] = { (488<<16)/166, (488<<16)/204, (488<<16)/16, (488<<16)/18 }; +// mapping between slot# and 68k cycles in a blanked scanline [H32, H40] +static const int vdpcyc2sl_bl[] = { (166<<16)/488, (204<<16)/488 }; +static const int vdpsl2cyc_bl[] = { (488<<16)/166, (488<<16)/204 }; // VDP transfer slots in active display 32col mode. 1 slot is 488/171 = 2.8538 // 68k cycles. Only 16 of the 171 slots in a scanline can be used by CPU/DMA: // (HINT=slot 0): 13,27,42,50,58,74,82,90,106,114,122,138,146,154,169,170 -const unsigned char vdpcyc2sl_32[] = { // 68k cycles/4 since HINT to slot # +static const unsigned char vdpcyc2sl_32[] = { // 68k cycles/4 to slot # // 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, @@ -70,14 +70,14 @@ const unsigned char vdpcyc2sl_32[] = { // 68k cycles/4 since HINT to slot # 11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14, 14,14,14,14,14,14,14,14,15,16,16,16,16,16,16,16, }; -const unsigned char vdpsl2cyc_32[] = { // slot # to 68k cycles/4 since HINT - 0, 9, 19, 30, 35, 41, 52, 58, 64, 75, 81, 87, 98,104,110,120,121,123 +static const unsigned char vdpsl2cyc_32[] = { // slot # to 68k cycles/4 + 0, 9, 19, 30, 35, 41, 52, 58, 64, 75, 81, 87, 98,104,110,120,121,131 }; // VDP transfer slots in active display 40col mode. 1 slot is 488/210 = 2.3238 // 68k cycles. 
Only 18 of the 210 slots in a scanline can be used by CPU/DMA: // (HINT=0): 23,49,57,65,81,89,97,113,121,129,145,153,161,177,185,193,208,209 -const unsigned char vdpcyc2sl_40[] = { // 68k cycles/4 since HINT to slot # +static const unsigned char vdpcyc2sl_40[] = { // 68k cycles/4 to slot # // 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, @@ -88,8 +88,8 @@ const unsigned char vdpcyc2sl_40[] = { // 68k cycles/4 since HINT to slot # 13,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15, 16,16,16,16,16,16,16,16,17,18,18,18,18,18,18,18, }; -const unsigned char vdpsl2cyc_40[] = { // slot # to 68k cycles/4 since HINT - 0, 13, 28, 33, 37, 47, 51, 56, 65, 70, 74, 84, 88, 93,102,107,112,120,121,123 +static const unsigned char vdpsl2cyc_40[] = { // slot # to 68k cycles/4 + 0, 13, 28, 33, 37, 47, 51, 56, 65, 70, 74, 84, 88, 93,102,107,112,120,121,135 }; // NB code assumes fifo_* arrays have size 2^n @@ -97,51 +97,79 @@ const unsigned char vdpsl2cyc_40[] = { // slot # to 68k cycles/4 since HINT static short fifo_data[4], fifo_dx; // XXX must go into save? // queued FIFO transfers, ...x = index, ...l = queue length -// each entry has 2 values: [n]>>2=#writes, [n]&3=flags:2=DMA fill 1=byte access +// each entry has 2 values: [n]>>3 = #writes, [n]&7 = flags static int fifo_queue[8], fifo_qx, fifo_ql; // XXX must go into save? -unsigned int fifo_total; // total# of pending FIFO entries +enum { FQ_BYTE = 1, FQ_BGDMA = 2, FQ_FGDMA = 4 }; // queue flags, NB: BYTE = 1! +unsigned int fifo_total; // total# of pending FIFO entries (w/o BGDMA) unsigned short fifo_slot; // last executed slot in current scanline +// map cycles to FIFO slot +static __inline int GetFIFOSlot(struct PicoVideo *pv, int cycles) +{ + int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); + int h40 = pv->reg[12] & 1; + + if (active) return (h40 ? 
vdpcyc2sl_40 : vdpcyc2sl_32)[cycles/4]; + else return (cycles * vdpcyc2sl_bl[h40] + cycles) >> 16; +} + +// map FIFO slot to cycles +static __inline int GetFIFOCycles(struct PicoVideo *pv, int slot) +{ + int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); + int h40 = pv->reg[12] & 1; + + if (active) return (h40 ? vdpsl2cyc_40 : vdpsl2cyc_32)[slot]*4; + else return ((slot * vdpsl2cyc_bl[h40] + slot) >> 16); +} + // do the FIFO math static __inline int AdvanceFIFOEntry(struct PicoVideo *pv, int slots) { - int l = slots, b = fifo_queue[fifo_qx&7] & 1; + int l = slots, b = fifo_queue[fifo_qx] & FQ_BYTE; + // advance currently active FIFO entry if (l > pv->fifo_cnt) l = pv->fifo_cnt; - fifo_total -= ((pv->fifo_cnt & b) + l) >> b; + if (!(fifo_queue[fifo_qx] & FQ_BGDMA)) + fifo_total -= ((pv->fifo_cnt & b) + l) >> b; pv->fifo_cnt -= l; + // if entry has been processed... if (pv->fifo_cnt == 0) { + if (fifo_ql) { + // terminate DMA if applicable + if ((pv->status & SR_DMA) && (fifo_queue[fifo_qx] & FQ_BGDMA)) { + pv->status &= ~SR_DMA; + pv->command &= ~0x80; + } + // remove entry from FIFO + fifo_qx ++, fifo_qx &= 7, fifo_ql --; + } + // start processing for next entry if there is one if (fifo_ql) - fifo_qx ++, fifo_ql --; - if (fifo_ql) - pv->fifo_cnt= (fifo_queue[fifo_qx&7] >> 2) << (fifo_queue[fifo_qx&7] & 1); + pv->fifo_cnt= (fifo_queue[fifo_qx] >> 3) << (fifo_queue[fifo_qx] & FQ_BYTE); else fifo_total = 0; } return l; } -static __inline int GetFIFOSlot(struct PicoVideo *pv, int cycles) -{ - int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); - int h40 = pv->reg[12] & 1; - const unsigned char *cs = h40 ? 
vdpcyc2sl_40 : vdpcyc2sl_32; - - if (active) return cs[cycles/4]; - else return (cycles * vdpcyc2sl_bl[h40] + cycles) >> 16; -} - -static __inline int GetFIFOCycles(struct PicoVideo *pv, int slot) +static __inline void SetFIFOState(struct PicoVideo *pv) { - int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); - int h40 = pv->reg[12] & 1; - const unsigned char *sc = h40 ? vdpsl2cyc_40 : vdpsl2cyc_32; - - if (active) return sc[slot]*4; - else return ((slot * vdpsl2cyc_bl[h40] + slot) >> 16); + // release CPU and terminate DMA if FIFO isn't blocking the 68k anymore + if (fifo_total == 0) + pv->status &= ~PVS_CPURD; + if (fifo_total <= 4) { + int x = (fifo_qx + fifo_ql - 1) & 7; + if ((pv->status & SR_DMA) && !(pv->status & PVS_DMAFILL) && + fifo_ql && !(fifo_queue[x] & FQ_BGDMA)) { + pv->status &= ~SR_DMA; + pv->command &= ~0x80; + } + pv->status &= ~PVS_CPUWR; + } } // sync FIFO to cycles @@ -161,57 +189,40 @@ void PicoVideoFIFOSync(int cycles) done -= l; } - // release CPU and terminate DMA if FIFO isn't blocking the 68k anymore - if (fifo_total <= 4) { - pv->status &= ~PVS_CPUWR; - pv->command &= ~0x80; - if (!(pv->status & PVS_DMAPEND)) - pv->status &= ~(SR_DMA|PVS_DMAFILL); - } - if (fifo_total == 0) - pv->status &= ~PVS_CPURD; + SetFIFOState(pv); } // drain FIFO, blocking 68k on the way. FIFO must be synced prior to drain. 
-int PicoVideoFIFODrain(int level, int cycles) +int PicoVideoFIFODrain(int level, int cycles, int bgdma) { struct PicoVideo *pv = &Pico.video; - int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); - int h40 = pv->reg[12] & 1; - int maxsl = vdpslots[h40 + 2*active]; // max xfer slots in this scanline + int maxsl = GetFIFOSlot(pv, 488); // max xfer slots in this scanline int burn = 0; - while (fifo_total > level && fifo_slot < maxsl) { - int b = fifo_queue[fifo_qx&7] & 1; - int cnt = (fifo_total-level) << b; + // process FIFO entries until low level is reached + while (fifo_total > level && fifo_slot < maxsl && + (!(fifo_queue[fifo_qx] & FQ_BGDMA) || bgdma)) { + int b = fifo_queue[fifo_qx] & FQ_BYTE; + int cnt = ((fifo_total-level) << b) - (pv->fifo_cnt & b); int last = fifo_slot; - int slot = (pv->fifo_cnt<cnt?pv->fifo_cnt:cnt) + last; // target slot + int slot = (pv->fifo_cnt < cnt ? pv->fifo_cnt : cnt) + last; // target slot unsigned ocyc = cycles; if (slot > maxsl) { // target in later scanline, advance to eol slot = maxsl; - fifo_slot = maxsl; cycles = 488; } else { // advance FIFO to target slot and CPU to cycles at that slot - fifo_slot = slot; cycles = GetFIFOCycles(pv, slot); } + fifo_slot = slot; burn += cycles - ocyc; AdvanceFIFOEntry(pv, slot - last); } - // release CPU and terminate DMA if FIFO isn't blocking the bus anymore - if (fifo_total <= 4) { - pv->status &= ~PVS_CPUWR; - pv->command &= ~0x80; - if (!(pv->status & PVS_DMAPEND)) - pv->status &= ~(SR_DMA|PVS_DMAFILL); - } - if (fifo_total == 0) - pv->status &= ~PVS_CPURD; + SetFIFOState(pv); return burn; } @@ -220,13 +231,13 @@ int PicoVideoFIFODrain(int level, int cycles) int PicoVideoFIFORead(void) { struct PicoVideo *pv = &Pico.video; - int lc = SekCyclesDone()-Pico.t.m68c_line_start+4; + int lc = SekCyclesDone()-Pico.t.m68c_line_start; int burn = 0; PicoVideoFIFOSync(lc); // advance FIFO and CPU until FIFO is empty - burn = PicoVideoFIFODrain(0, lc); + burn = PicoVideoFIFODrain(0, lc, 1); lc += 
burn; if (fifo_total > 0) pv->status |= PVS_CPURD; // target slot is in later scanline @@ -243,34 +254,41 @@ int PicoVideoFIFORead(void) int PicoVideoFIFOWrite(int count, int flags, unsigned sr_mask,unsigned sr_flags) { struct PicoVideo *pv = &Pico.video; - int lc = SekCyclesDone()-Pico.t.m68c_line_start+4; - int burn = 0; + int lc = SekCyclesDone()-Pico.t.m68c_line_start; + int burn = 0, x; PicoVideoFIFOSync(lc); pv->status = (pv->status & ~sr_mask) | sr_flags; if (count && fifo_ql < 8) { // update FIFO state if it was empty - if (fifo_total == 0 && count) { - fifo_slot = GetFIFOSlot(pv, lc); - pv->fifo_cnt = count << (flags&1); + if (fifo_ql == 0) { + fifo_slot = GetFIFOSlot(pv, lc+10); // FIFO latency ~4 vdp slots + pv->fifo_cnt = count << (flags & FQ_BYTE); } // create xfer queue entry - int x = (fifo_qx + fifo_ql) & 7; - fifo_queue[x] = (count << 2) | flags; + x = (fifo_qx + fifo_ql - 1) & 7; + if (fifo_ql && (fifo_queue[x] & FQ_BGDMA)) { + // CPU FIFO writes have priority over a background DMA Fill/Copy + fifo_queue[(x+1) & 7] = fifo_queue[x]; + if (fifo_ql == 1) { + // XXX if interrupting a DMA fill, fill data changes + int f = fifo_queue[x] & 7; + fifo_queue[(x+1) & 7] = (pv->fifo_cnt >> (f & FQ_BYTE) << 3) | f; + pv->fifo_cnt = count << (flags & FQ_BYTE); + } + } else + x = (x+1) & 7; + fifo_queue[x] = (count << 3) | flags; fifo_ql ++; - fifo_total += count; + if (!(flags & FQ_BGDMA)) + fifo_total += count; } // if CPU is waiting for the bus, advance CPU and FIFO until bus is free - if ((pv->status & (PVS_CPUWR|PVS_DMAFILL)) == PVS_CPUWR) - burn = PicoVideoFIFODrain(4, lc); - else if (fifo_queue[fifo_qx&7]&2) { - // if interrupting a DMA fill terminate it XXX wrong, changes fill data - AdvanceFIFOEntry(pv, pv->fifo_cnt); - pv->status &= ~PVS_DMAFILL; - } + if (pv->status & PVS_CPUWR) + burn = PicoVideoFIFODrain(4, lc, 0); return burn; } @@ -287,7 +305,7 @@ int PicoVideoFIFOHint(void) // if CPU is waiting for the bus, advance CPU and FIFO until bus is free 
if (pv->status & PVS_CPURD) burn = PicoVideoFIFORead(); - if (pv->status & PVS_CPUWR) + else if (pv->status & PVS_CPUWR) burn = PicoVideoFIFOWrite(0, 0, 0, 0); return burn; @@ -297,16 +315,15 @@ int PicoVideoFIFOHint(void) void PicoVideoFIFOMode(int active) { struct PicoVideo *pv = &Pico.video; - const unsigned char *cs = pv->reg[12]&1 ? vdpcyc2sl_40 : vdpcyc2sl_32; int h40 = pv->reg[12] & 1; int lc = SekCyclesDone() - Pico.t.m68c_line_start; PicoVideoFIFOSync(lc); - if (fifo_total) { + if (fifo_ql) { // recalculate FIFO slot for new mode if (!(pv->status & SR_VB) && active) - fifo_slot = cs[lc/4]; + fifo_slot = (pv->reg[12]&1 ? vdpcyc2sl_40 : vdpcyc2sl_32)[lc/4]; else fifo_slot = ((lc * vdpcyc2sl_bl[h40] + lc) >> 16); } } @@ -421,8 +438,8 @@ static void DmaSlow(int len, unsigned int source) Pico.video.type, source, a, len, inc, (Pico.video.status&SR_VB)||!(Pico.video.reg[1]&0x40), SekCyclesDone(), SekPc); - SekCyclesBurnRun(PicoVideoFIFOWrite(len, Pico.video.type == 1, PVS_DMAPEND, - SR_DMA | PVS_CPUWR) + 8); + SekCyclesBurnRun(PicoVideoFIFOWrite(len, FQ_FGDMA | (Pico.video.type == 1), + 0, SR_DMA| PVS_CPUWR)); if ((source & 0xe00000) == 0xe00000) { // Ram base = (u16 *)PicoMem.ram; @@ -546,7 +563,8 @@ static void DmaCopy(int len) int source; elprintf(EL_VDPDMA, "DmaCopy len %i [%u]", len, SekCyclesDone()); - SekCyclesBurnRun(PicoVideoFIFOWrite(len, 1, PVS_CPUWR | PVS_DMAPEND, SR_DMA)); + SekCyclesBurnRun(PicoVideoFIFOWrite(len, FQ_BGDMA | FQ_BYTE, + PVS_CPUWR, SR_DMA)); source =Pico.video.reg[0x15]; source|=Pico.video.reg[0x16]<<8; @@ -577,8 +595,8 @@ static NOINLINE void DmaFill(int data) len = GetDmaLength(); elprintf(EL_VDPDMA, "DmaFill len %i inc %i [%u]", len, inc, SekCyclesDone()); - SekCyclesBurnRun(PicoVideoFIFOWrite(len, 2|(Pico.video.type == 1), - PVS_CPUWR | PVS_DMAPEND, SR_DMA)); + SekCyclesBurnRun(PicoVideoFIFOWrite(len, FQ_BGDMA | (Pico.video.type == 1), + PVS_CPUWR | PVS_DMAFILL, SR_DMA)); switch (Pico.video.type) { @@ -648,7 +666,6 @@ static 
NOINLINE void CommandDma(void) u32 len, method; u32 source; - pvid->status |= PVS_DMAPEND; PicoVideoFIFOSync(SekCyclesDone()-Pico.t.m68c_line_start); if (pvid->status & SR_DMA) { elprintf(EL_VDPDMA, "Dma overlap, left=%d @ %06x", @@ -748,12 +765,14 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) VideoWrite(d); // start DMA fill on write. NB VSRAM and CRAM fills use wrong FIFO data. - if ((pvid->status & (PVS_DMAPEND|PVS_DMAFILL)) == (PVS_DMAPEND|PVS_DMAFILL)) + if (pvid->status & PVS_DMAFILL) DmaFill(fifo_data[(fifo_dx + !!(pvid->type&~0x81))&3]); break; case 0x04: // Control (command) port 4 or 6 + if (pvid->status & SR_DMA) + SekCyclesBurnRun(PicoVideoFIFORead()); // kludge, flush out running DMA if (pvid->pending) { // Low word of command: @@ -886,14 +905,12 @@ static u32 VideoSr(const struct PicoVideo *pv) unsigned int hp = pv->reg[12]&1 ? 32:40; // HBLANK start unsigned int hl = pv->reg[12]&1 ? 94:84; // HBLANK length - c = SekCyclesDone(); - if (c - Pico.t.m68c_line_start - hp < hl) + c = SekCyclesDone() - Pico.t.m68c_line_start; + if (c - hp < hl) d |= SR_HB; - PicoVideoFIFOSync(c-Pico.t.m68c_line_start); - if (pv->status & SR_DMA) - d |= SR_EMPT; // unused by DMA, or rather flags not updated? 
- else if (fifo_total >= 4) + PicoVideoFIFOSync(c); + if (fifo_total >= 4) d |= SR_FULL; else if (!fifo_total) d |= SR_EMPT; @@ -1010,7 +1027,7 @@ void PicoVideoSave(void) // account for all outstanding xfers XXX kludge, entry attr's not saved for (l = fifo_ql, x = fifo_qx + l-1; l > 1; l--, x--) - pv->fifo_cnt += (fifo_queue[x&7] >> 2) << (fifo_queue[x&7] & 1); + pv->fifo_cnt += (fifo_queue[x&7] >> 2) << (fifo_queue[x&7] & FQ_BYTE); } void PicoVideoLoad(void) From cdcc29af58424193c64dc21a46ce9fbe67f90469 Mon Sep 17 00:00:00 2001 From: kub Date: Sun, 23 Feb 2020 20:15:07 +0100 Subject: [PATCH 111/174] vdp sprite rendering fix --- pico/videoport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pico/videoport.c b/pico/videoport.c index 533c78806..122908e43 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -341,7 +341,7 @@ static __inline void UpdateSAT(u32 a, u32 d) { Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; if (!((a^sat) >> satxbits) && !(a & 4)) { - int num = (a >> 3) & 0x7f; + int num = (a-sat) >> 3; ((u16 *)&VdpSATCache[num])[(a&3) >> 1] = d; } } From b1395deb22668caf7bf143ce382af7a43a9cb2b2 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 25 Feb 2020 21:59:02 +0100 Subject: [PATCH 112/174] vdp fifo, refined timing --- pico/videoport.c | 60 +++++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/pico/videoport.c b/pico/videoport.c index 122908e43..fb6bd69ad 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -2,6 +2,7 @@ * PicoDrive * (c) Copyright Dave, 2004 * (C) notaz, 2006-2009 + * (C) kub, 2020 * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. @@ -58,38 +59,38 @@ static const int vdpsl2cyc_bl[] = { (488<<16)/166, (488<<16)/204 }; // VDP transfer slots in active display 32col mode. 1 slot is 488/171 = 2.8538 // 68k cycles. 
Only 16 of the 171 slots in a scanline can be used by CPU/DMA: -// (HINT=slot 0): 13,27,42,50,58,74,82,90,106,114,122,138,146,154,169,170 +// (HINT=slot 0): 11,25,40,48,56,72,80,88,104,112,120,136,144,152,167,168 static const unsigned char vdpcyc2sl_32[] = { // 68k cycles/4 to slot # // 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, - 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, - 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11, -11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14, -14,14,14,14,14,14,14,14,15,16,16,16,16,16,16,16, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, + 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9,10, +10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,11, +11,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14, +14,14,14,14,14,14,14,15,16,16,16,16,16,16,16,16, }; static const unsigned char vdpsl2cyc_32[] = { // slot # to 68k cycles/4 - 0, 9, 19, 30, 35, 41, 52, 58, 64, 75, 81, 87, 98,104,110,120,121,131 + 0, 8, 18, 28, 33, 39, 51, 56, 62, 74, 79, 85, 97,102,108,119,120,130 }; // VDP transfer slots in active display 40col mode. 1 slot is 488/210 = 2.3238 // 68k cycles. 
Only 18 of the 210 slots in a scanline can be used by CPU/DMA: -// (HINT=0): 23,49,57,65,81,89,97,113,121,129,145,153,161,177,185,193,208,209 +// (HINT=0): 21,47,55,63,79,87,95,111,119,127,143,151,159,175,183,191,206,207, static const unsigned char vdpcyc2sl_40[] = { // 68k cycles/4 to slot # // 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, - 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, - 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 8, 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10, -10,10,10,10,11,11,11,11,12,12,12,12,12,13,13,13, -13,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15, -16,16,16,16,16,16,16,16,17,18,18,18,18,18,18,18, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, + 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10, +10,10,10,11,11,11,11,12,12,12,12,12,13,13,13,13, +13,13,13,13,13,14,14,14,14,14,15,15,15,15,15,16, +16,16,16,16,16,16,16,17,18,18,18,18,18,18,18,18, }; static const unsigned char vdpsl2cyc_40[] = { // slot # to 68k cycles/4 - 0, 13, 28, 33, 37, 47, 51, 56, 65, 70, 74, 84, 88, 93,102,107,112,120,121,135 + 0, 12, 27, 32, 36, 46, 50, 55, 64, 69, 73, 83, 87, 92,101,106,111,119,120,134 }; // NB code assumes fifo_* arrays have size 2^n @@ -164,7 +165,7 @@ static __inline void SetFIFOState(struct PicoVideo *pv) if (fifo_total <= 4) { int x = (fifo_qx + fifo_ql - 1) & 7; if ((pv->status & SR_DMA) && !(pv->status & PVS_DMAFILL) && - fifo_ql && !(fifo_queue[x] & FQ_BGDMA)) { + (!fifo_ql || !(fifo_queue[x] & FQ_BGDMA))) { pv->status &= ~SR_DMA; pv->command &= ~0x80; } @@ -263,7 +264,7 @@ int PicoVideoFIFOWrite(int count, int flags, unsigned sr_mask,unsigned sr_flags) if (count && fifo_ql < 8) { // update FIFO state if it was empty if (fifo_ql == 0) { - fifo_slot = GetFIFOSlot(pv, lc+10); // FIFO latency 
~4 vdp slots + fifo_slot = GetFIFOSlot(pv, lc+9); // FIFO latency ~3 vdp slots pv->fifo_cnt = count << (flags & FQ_BYTE); } @@ -340,7 +341,7 @@ static __inline void AutoIncrement(void) static __inline void UpdateSAT(u32 a, u32 d) { Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; - if (!((a^sat) >> satxbits) && !(a & 4)) { + if (!(a & 4)) { int num = (a-sat) >> 3; ((u16 *)&VdpSATCache[num])[(a&3) >> 1] = d; } @@ -672,7 +673,6 @@ static NOINLINE void CommandDma(void) fifo_total, SekPc); fifo_total = fifo_ql = 0; } - pvid->status |= SR_DMA; len = GetDmaLength(); source =Pico.video.reg[0x15]; @@ -685,7 +685,7 @@ static NOINLINE void CommandDma(void) else if (method == 3) DmaCopy(len); // VRAM Copy else { - pvid->status |= PVS_DMAFILL; + pvid->status |= SR_DMA|PVS_DMAFILL; return; } source += len; @@ -747,7 +747,7 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) !(!pvid->pending && ((pvid->command & 0xc00000f0) == 0x40000010 && PicoMem.vsram[pvid->addr>>1] == (d & 0x7ff))) ) - DrawSync(SekCyclesDone() - Pico.t.m68c_line_start <= 488-440); + DrawSync(0); // XXX it's unclear when vscroll data is fetched from vsram? if (pvid->pending) { CommandChange(); @@ -902,8 +902,8 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) static u32 VideoSr(const struct PicoVideo *pv) { unsigned int c, d = pv->status; - unsigned int hp = pv->reg[12]&1 ? 32:40; // HBLANK start - unsigned int hl = pv->reg[12]&1 ? 94:84; // HBLANK length + unsigned int hp = pv->reg[12]&1 ? 15*488/210+1 : 15*488/171+1; // HBLANK start + unsigned int hl = pv->reg[12]&1 ? 
37*488/210+1 : 28*488/171+1; // HBLANK len c = SekCyclesDone() - Pico.t.m68c_line_start; if (c - hp < hl) @@ -1013,7 +1013,9 @@ unsigned char PicoVideoRead8HV_H(void) unsigned char PicoVideoRead8HV_L(void) { u32 d = (SekCyclesDone() - Pico.t.m68c_line_start) & 0x1ff; // FIXME - if (Pico.video.reg[12]&1) + if (Pico.video.reg[0]&2) + d = Pico.video.hv_latch; + else if (Pico.video.reg[12]&1) d = hcounts_40[d]; else d = hcounts_32[d]; elprintf(EL_HVCNT, "hcounter: %02x [%u] @ %06x", d, SekCyclesDone(), SekPc); From da1793b7051ee7630efe3c728ad582b6fe147ba6 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 26 Feb 2020 20:31:40 +0100 Subject: [PATCH 113/174] bugfix for ARM asm sprite rendering --- pico/draw.c | 2 +- pico/draw_arm.S | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pico/draw.c b/pico/draw.c index 7b66b43c4..02a3589ec 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -1243,7 +1243,7 @@ static void DrawAllSprites(unsigned char *sprited, int prio, int sh, struct PicoEState *est) { unsigned char *p; - int cnt, w = sprited[2]; + int cnt, w; cnt = sprited[0] & 0x7f; if (cnt == 0) return; diff --git a/pico/draw_arm.S b/pico/draw_arm.S index 967bf6aa4..0eb161e36 100644 --- a/pico/draw_arm.S +++ b/pico/draw_arm.S @@ -1169,7 +1169,9 @@ DrawSprite: mov r2, r0, lsl #24 cmp r0, #0xff ldmeqfd sp!, {r1,r3-r11,pc} @ end of list - eor r2, r2, r4, lsl #30 + eors r2, r2, r4, lsl #30 + bic r2, r4, #0xff000000 + str r2, [sp] bmi DrawSprite @ wrong priority ldr r1, [r7, #OFS_EST_HighPreSpr] and r0, r0, #0x7f @@ -1210,8 +1212,6 @@ DrawSprite: add r6, r6, #1 @ inc now cmp r4, #0x1000000 @ check width of last sprite movhs r6, r4, lsr #24 - bichs r4, r4, #0xff000000 - strhs r4, [sp] @ cache some stuff to avoid mem access mov r5, r5, lsl #4 @ delta<<=4; // Delta of address From ca2bd555c6a31ad83685eebb7f90dc6c84eefe90 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 26 Feb 2020 20:36:46 +0100 Subject: [PATCH 114/174] fix for EI insn in cz80 (partial revert of 43e1401) 
--- cpu/cz80/cz80_op.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpu/cz80/cz80_op.c b/cpu/cz80/cz80_op.c index 317e9587b..b1520088b 100644 --- a/cpu/cz80/cz80_op.c +++ b/cpu/cz80/cz80_op.c @@ -712,6 +712,8 @@ switch (Opcode) if (CPU->IRQState) { afterEI = 1; + CPU->ExtraCycles += 1 - CPU->ICount; + CPU->ICount = 1; } } else zIFF2 = (1 << 2); From 9145ace9c8ccf848eacb8304dc0e42232f97f69a Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 27 Feb 2020 21:19:37 +0100 Subject: [PATCH 115/174] fix for VINT while DMA is running --- pico/pico_cmn.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pico/pico_cmn.c b/pico/pico_cmn.c index 753898401..50a632ca7 100644 --- a/pico/pico_cmn.c +++ b/pico/pico_cmn.c @@ -191,7 +191,8 @@ static int PicoFrameHints(void) pv->status |= SR_F; pv->pending_ints |= 0x20; if (pv->reg[1] & 0x20) { - SekExecM68k(11); // HACK + if (Pico.t.m68c_cnt - Pico.t.m68c_aim < 60) // CPU blocked? + SekExecM68k(11); // HACK elprintf(EL_INTS, "vint: @ %06x [%u]", SekPc, SekCyclesDone()); SekInterrupt(6); } From e3bb43d261262dfde691f07caf81365d1ffdbd98 Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 27 Feb 2020 21:31:04 +0100 Subject: [PATCH 116/174] vdp, tentative fix for save/load compatibility --- pico/videoport.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pico/videoport.c b/pico/videoport.c index fb6bd69ad..264bb0bcc 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -101,9 +101,9 @@ static short fifo_data[4], fifo_dx; // XXX must go into save? // each entry has 2 values: [n]>>3 = #writes, [n]&7 = flags static int fifo_queue[8], fifo_qx, fifo_ql; // XXX must go into save? enum { FQ_BYTE = 1, FQ_BGDMA = 2, FQ_FGDMA = 4 }; // queue flags, NB: BYTE = 1! 
-unsigned int fifo_total; // total# of pending FIFO entries (w/o BGDMA) +static unsigned int fifo_total; // total# of pending FIFO entries (w/o BGDMA) -unsigned short fifo_slot; // last executed slot in current scanline +static unsigned short fifo_slot; // last executed slot in current scanline // map cycles to FIFO slot static __inline int GetFIFOSlot(struct PicoVideo *pv, int cycles) @@ -313,7 +313,7 @@ int PicoVideoFIFOHint(void) } // switch FIFO mode between active/inactive display -void PicoVideoFIFOMode(int active) +static void PicoVideoFIFOMode(int active) { struct PicoVideo *pv = &Pico.video; int h40 = pv->reg[12] & 1; @@ -671,7 +671,7 @@ static NOINLINE void CommandDma(void) if (pvid->status & SR_DMA) { elprintf(EL_VDPDMA, "Dma overlap, left=%d @ %06x", fifo_total, SekPc); - fifo_total = fifo_ql = 0; + pvid->fifo_cnt = fifo_total = fifo_ql = 0; } len = GetDmaLength(); @@ -1029,7 +1029,7 @@ void PicoVideoSave(void) // account for all outstanding xfers XXX kludge, entry attr's not saved for (l = fifo_ql, x = fifo_qx + l-1; l > 1; l--, x--) - pv->fifo_cnt += (fifo_queue[x&7] >> 2) << (fifo_queue[x&7] & FQ_BYTE); + pv->fifo_cnt += (fifo_queue[x&7] >> 3) << (fifo_queue[x&7] & FQ_BYTE); } void PicoVideoLoad(void) From 51d29ec555d9767f56a737b50a7c33b9e03550ab Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 29 Feb 2020 23:45:23 +0100 Subject: [PATCH 117/174] improved hi prio sprite rendering speed --- pico/draw.c | 97 ++++++++++++++++++++++------------------------------- 1 file changed, 41 insertions(+), 56 deletions(-) diff --git a/pico/draw.c b/pico/draw.c index 02a3589ec..1ec7db153 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -100,7 +100,7 @@ void blockcpy_or(void *dst, void *src, size_t n, int pat) #define blockcpy memcpy #endif -#define TileNormMaker_(pix_func) \ +#define TileNormMaker_(pix_func,ret) \ { \ unsigned int t; \ \ @@ -112,9 +112,10 @@ void blockcpy_or(void *dst, void *src, size_t n, int pat) t = (pack&0x0f000000)>>24; pix_func(5); \ t = 
(pack&0x00f00000)>>20; pix_func(6); \ t = (pack&0x000f0000)>>16; pix_func(7); \ + return ret; \ } -#define TileFlipMaker_(pix_func) \ +#define TileFlipMaker_(pix_func,ret) \ { \ unsigned int t; \ \ @@ -126,23 +127,24 @@ void blockcpy_or(void *dst, void *src, size_t n, int pat) t = (pack&0x000000f0)>> 4; pix_func(5); \ t = (pack&0x00000f00)>> 8; pix_func(6); \ t = (pack&0x0000f000)>>12; pix_func(7); \ + return ret; \ } #define TileNormMaker(funcname, pix_func) \ static void funcname(unsigned char *pd, unsigned int pack, int pal) \ -TileNormMaker_(pix_func) +TileNormMaker_(pix_func,) #define TileFlipMaker(funcname, pix_func) \ static void funcname(unsigned char *pd, unsigned int pack, int pal) \ -TileFlipMaker_(pix_func) +TileFlipMaker_(pix_func,) #define TileNormMakerAS(funcname, pix_func) \ -static void funcname(unsigned char *pd, unsigned char *mb, unsigned int pack, int pal) \ -TileNormMaker_(pix_func) +static unsigned funcname(unsigned char *pd, unsigned m, unsigned int pack, int pal) \ +TileNormMaker_(pix_func,m) #define TileFlipMakerAS(funcname, pix_func) \ -static void funcname(unsigned char *pd, unsigned char *mb, unsigned int pack, int pal) \ -TileFlipMaker_(pix_func) +static unsigned funcname(unsigned char *pd, unsigned m, unsigned int pack, int pal) \ +TileFlipMaker_(pix_func,m) #define pix_just_write(x) \ if (t) pd[x]=pal|t @@ -184,17 +186,19 @@ TileFlipMaker(TileFlipSH_onlyop_lp, pix_sh_onlyop) #endif +// AS: sprite mask bits in m shifted to bits 8-15, see DrawSpritesHiAS + // draw a sprite pixel (AS) #define pix_as(x) \ - if (t & mb[x]) mb[x] = 0, pd[x] = pal | t + if (t && (m & (1<<(x+8)))) m &= ~(1<<(x+8)), pd[x] = pal | t TileNormMakerAS(TileNormAS, pix_as) TileFlipMakerAS(TileFlipAS, pix_as) // draw a sprite pixel, process operator colors (AS) #define pix_sh_as(x) \ - if (t & mb[x]) { \ - mb[x] = 0; \ + if (t && (m & (1<<(x+8)))) { \ + m &= ~(1<<(x+8)); \ if (t>=0xe) pd[x]=(pd[x]&0x3f)|(t<<6); /* c0 shadow, 80 hilight */ \ else pd[x] = pal | t; \ } 
@@ -203,8 +207,8 @@ TileNormMakerAS(TileNormSH_AS, pix_sh_as) TileFlipMakerAS(TileFlipSH_AS, pix_sh_as) #define pix_sh_as_onlyop(x) \ - if (t & mb[x]) { \ - mb[x] = 0; \ + if (t && (m & (1<<(x+8)))) { \ + m &= ~(1<<(x+8)); \ pix_sh_onlyop(x); \ } @@ -213,7 +217,7 @@ TileFlipMakerAS(TileFlipSH_AS_onlyop_lp, pix_sh_as_onlyop) // mark pixel as sprite pixel (AS) #define pix_sh_as_onlymark(x) \ - if (t) mb[x] = 0 + if (t) m &= ~(1<<(x+8)) TileNormMakerAS(TileNormAS_onlymark, pix_sh_as_onlymark) TileFlipMakerAS(TileFlipAS_onlymark, pix_sh_as_onlymark) @@ -905,6 +909,10 @@ static NOINLINE void DrawAllSpritesInterlace(int pri, int sh) */ static void DrawSpritesSHi(unsigned char *sprited, const struct PicoEState *est) { + static void (*tilefuncs[2][2][2])(unsigned char *, unsigned, int) = { + { {NULL, NULL}, {TileNorm, TileFlip} }, + { {TileNormSH_onlyop_lp, TileFlipSH_onlyop_lp}, {TileNormSH, TileFlipSH} } + }; // [sh?][hi?][flip?] void (*fTileFunc)(unsigned char *pd, unsigned int pack, int pal); unsigned char *pd = Pico.est.HighCol; unsigned char *p; @@ -929,21 +937,8 @@ static void DrawSpritesSHi(unsigned char *sprited, const struct PicoEState *est) code = sprite[1]; pal = (code>>9)&0x30; - if (pal == 0x30) - { - if (code & 0x8000) // hi priority - { - if (code&0x800) fTileFunc=TileFlipSH; - else fTileFunc=TileNormSH; - } else { - if (code&0x800) fTileFunc=TileFlipSH_onlyop_lp; - else fTileFunc=TileNormSH_onlyop_lp; - } - } else { - if (!(code & 0x8000)) continue; // non-operator low sprite, already drawn - if (code&0x800) fTileFunc=TileFlip; - else fTileFunc=TileNorm; - } + fTileFunc = tilefuncs[pal == 0x30][!!(code & 0x8000)][!!(code & 0x800)]; + if (fTileFunc == NULL) continue; // non-operator low sprite, already drawn // parse remaining sprite data sy=sprite[0]; @@ -980,11 +975,15 @@ static void DrawSpritesSHi(unsigned char *sprited, const struct PicoEState *est) static void DrawSpritesHiAS(unsigned char *sprited, int sh) { - void (*fTileFunc)(unsigned char *pd, 
unsigned char *mb, - unsigned int pack, int pal); + static unsigned (*tilefuncs[2][2][2])(unsigned char *, unsigned, unsigned, int) = { + { {TileNormAS_onlymark, TileFlipAS_onlymark}, {TileNormAS, TileFlipAS} }, + { {TileNormSH_AS_onlyop_lp, TileFlipSH_AS_onlyop_lp}, {TileNormSH_AS, TileFlipSH_AS} } + }; // [sh?][hi?][flip?] + unsigned (*fTileFunc)(unsigned char *pd, unsigned m, unsigned int pack, int pal); unsigned char *pd = Pico.est.HighCol; - unsigned char mb[8+320+8]; - unsigned char *p; + unsigned char mb[1+320/8+1]; + unsigned char *p, *mp; + unsigned m; int entry, cnt; cnt = sprited[0] & 0x7f; @@ -1006,26 +1005,7 @@ static void DrawSpritesHiAS(unsigned char *sprited, int sh) code = sprite[1]; pal = (code>>9)&0x30; - if (sh && pal == 0x30) - { - if (code & 0x8000) // hi priority - { - if (code&0x800) fTileFunc = TileFlipSH_AS; - else fTileFunc = TileNormSH_AS; - } else { - if (code&0x800) fTileFunc = TileFlipSH_AS_onlyop_lp; - else fTileFunc = TileNormSH_AS_onlyop_lp; - } - } else { - if (code & 0x8000) // hi priority - { - if (code&0x800) fTileFunc = TileFlipAS; - else fTileFunc = TileNormAS; - } else { - if (code&0x800) fTileFunc = TileFlipAS_onlymark; - else fTileFunc = TileNormAS_onlymark; - } - } + fTileFunc = tilefuncs[(sh && pal == 0x30)][!!(code&0x8000)][!!(code&0x800)]; // parse remaining sprite data sy=sprite[0]; @@ -1054,8 +1034,12 @@ static void DrawSpritesHiAS(unsigned char *sprited, int sh) if(sx>=328) break; // Offscreen pack = *(unsigned int *)(PicoMem.vram + (tile & 0x7fff)); - fTileFunc(pd + sx, mb + sx, pack, pal); - } + + m = (m >> 8) | mp[1] << 8; // next mask byte + // shift mask bits to bits 8-15 for easier load/store handling + m = fTileFunc(pd + sx, m << (8-(sx&0x7)), pack, pal) >> (8-(sx&0x7)); + } + *mp = m >> 8; // write last mask byte } } @@ -1106,7 +1090,8 @@ static void DrawSpritesForced(unsigned char *sprited) delta<<=4; // Delta of address if (entry+1 == cnt) width = p[entry+1]; // last sprite width limited? 
- for (; width; width--,sx+=8,tile+=delta) + mp = mb+(sx>>3); + for (m = *mp << 8; width; width--, sx+=8, *mp++ = m, tile+=delta) { unsigned int pack; From c1d5d254e9aa1b0072fa86a27262e021c7301612 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 29 Feb 2020 23:47:14 +0100 Subject: [PATCH 118/174] more ARM asm sprite rendering bugfixes --- pico/draw_arm.S | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pico/draw_arm.S b/pico/draw_arm.S index 0eb161e36..de45f592e 100644 --- a/pico/draw_arm.S +++ b/pico/draw_arm.S @@ -317,9 +317,10 @@ DrawLayer: moveq r1, #0x0007 movgt r1, #0x00ff @ r1=ymask=(height<<8)|0xff; ...; // Y Mask in pixels - add r10, r10, #5 - cmp r10, #7 - subge r10, r10, #1 @ r10=shift[width] (5,6,6,7) + cmp r10, #2 + addlt r10, r10, #5 + moveq r10, #5 + movgt r10, #7 @ r10=shift[width] (5,6,5,7) ldr r2, [r12, #OFS_EST_DrawScanline] ldr lr, [r12, #OFS_EST_PicoMem_vram] @@ -366,7 +367,8 @@ DrawLayer: bne .DrawStrip_interlace tst r0, r0 - movne r7, r7, lsr #16 + moveq r7, r7, lsl #16 + mov r7, r7, lsr #16 @ Find the line in the name table add r2, r2, r7 @@ -699,8 +701,8 @@ DrawLayer: @ interlace mode 2? Sonic 2? 
.DrawStrip_interlace: tst r0, r0 - moveq r7, r7, lsl #21 - movne r7, r7, lsl #5 + movne r7, r7, lsr #16 + mov r7, r7, lsl #21 @ Find the line in the name table add r2, r7, r2, lsl #22 @ r2=(vscroll+(DrawScanline<<1))<<21 (11 bits); From e7fec045a73e9171eb2383d286178fa08ae2342f Mon Sep 17 00:00:00 2001 From: kub Date: Sun, 1 Mar 2020 18:50:55 +0100 Subject: [PATCH 119/174] vdp sprite rendering fixes --- pico/draw.c | 14 +++++++------- pico/draw_arm.S | 2 -- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pico/draw.c b/pico/draw.c index 1ec7db153..66c2b9f46 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -333,7 +333,7 @@ static void DrawStripVSRam(struct TileStrip *ts, int plane_sh, int cellskip) } code=PicoMem.vram[ts->nametab+nametabadd+(tilex&ts->xmask)]; - if (code==blank) continue; + if ((code<<16|ty)==blank) continue; if (code>>15) { // high priority tile int cval = code | (dx<<16) | (ty<<25); if(code&0x1000) cval^=7<<26; @@ -353,7 +353,7 @@ static void DrawStripVSRam(struct TileStrip *ts, int plane_sh, int cellskip) pack = *(unsigned int *)(PicoMem.vram + addr+ty); if (!pack) { - blank = code; + blank = code<<16|ty; continue; } @@ -638,7 +638,7 @@ static void DrawTilesFromCache(int *hc, int sh, int rlim, struct PicoEState *est { int blank=-1; // The tile we know is blank while ((code=*hc++)) { - if (!(code & 0x8000) || (unsigned short)code == blank) + if ((code<<16|code>>25) == blank) continue; // Get tile address/2: addr = (code & 0x7ff) << 4; @@ -646,7 +646,7 @@ static void DrawTilesFromCache(int *hc, int sh, int rlim, struct PicoEState *est pack = *(unsigned int *)(PicoMem.vram + addr); if (!pack) { - blank = (unsigned short)code; + blank = code<<16|code>>25; continue; } @@ -1026,7 +1026,8 @@ static void DrawSpritesHiAS(unsigned char *sprited, int sh) delta<<=4; // Delta of address if (entry+1 == cnt) width = p[entry+1]; // last sprite width limited? 
- for (; width; width--,sx+=8,tile+=delta) + mp = mb+(sx>>3); + for (m = *mp << 8; width; width--, sx+=8, *mp++ = m, tile+=delta) { unsigned int pack; @@ -1090,8 +1091,7 @@ static void DrawSpritesForced(unsigned char *sprited) delta<<=4; // Delta of address if (entry+1 == cnt) width = p[entry+1]; // last sprite width limited? - mp = mb+(sx>>3); - for (m = *mp << 8; width; width--, sx+=8, *mp++ = m, tile+=delta) + for (; width; width--,sx+=8,tile+=delta) { unsigned int pack; diff --git a/pico/draw_arm.S b/pico/draw_arm.S index de45f592e..860ab0f72 100644 --- a/pico/draw_arm.S +++ b/pico/draw_arm.S @@ -796,8 +796,6 @@ DrawTilesFromCache: bic r4, r1, #0xfe00 add r1, r11, r4 @ r1=pdest - movs r7, r6, lsl #16 - bpl .dtfc_loop @ !(code & 0x8000) cmp r5, r7, lsr #16 beq .dtfc_samecode @ if (code==prevcode) From 4eb847792595b2f406e9be9b051c4cf2891d19ec Mon Sep 17 00:00:00 2001 From: kub Date: Mon, 2 Mar 2020 19:40:07 +0100 Subject: [PATCH 120/174] ARM SVP drc revived --- pico/carthw/svp/compiler.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/pico/carthw/svp/compiler.c b/pico/carthw/svp/compiler.c index 06aa17919..df051e478 100644 --- a/pico/carthw/svp/compiler.c +++ b/pico/carthw/svp/compiler.c @@ -1438,12 +1438,9 @@ static int translate_op(unsigned int op, int *pc, int imm, int *end_cond, int *j } tr_mov16(0, *pc); tr_r0_to_STACK(*pc); - if (tmpv != A_COND_AL) { - u32 *real_ptr = tcache_ptr; - tcache_ptr = jump_op; - EOP_C_B(tr_neg_cond(tmpv),0,real_ptr - jump_op - 2); - tcache_ptr = real_ptr; - } + if (tmpv != A_COND_AL) + EOP_C_B_PTR(jump_op, tr_neg_cond(tmpv), 0, + tcache_ptr - jump_op - 2); tr_mov16_cond(tmpv, 0, imm); if (tmpv != A_COND_AL) tr_mov16_cond(tr_neg_cond(tmpv), 0, *pc); @@ -1712,12 +1709,8 @@ static void *emit_block_epilogue(int cycles, int cond, int pc, int end_pc) ssp_block_table[pc]; if (target != NULL) emith_jump(target); - else { - int ops = emith_jump(ssp_drc_next); - end_ptr = tcache_ptr; - // cause the next 
block to be emitted over jump instruction - tcache_ptr -= ops; - } + else + emith_jump(ssp_drc_next); } else { u32 *target1 = (pc < 0x400) ? From 22f7ee9407fa104b1a2f445de25f5a72709b1dba Mon Sep 17 00:00:00 2001 From: kub Date: Mon, 2 Mar 2020 23:48:55 +0100 Subject: [PATCH 121/174] arm asm sprite rendering: add line accidently deleted in ea431e9 --- pico/draw_arm.S | 1 + 1 file changed, 1 insertion(+) diff --git a/pico/draw_arm.S b/pico/draw_arm.S index 860ab0f72..8dc660c25 100644 --- a/pico/draw_arm.S +++ b/pico/draw_arm.S @@ -796,6 +796,7 @@ DrawTilesFromCache: bic r4, r1, #0xfe00 add r1, r11, r4 @ r1=pdest + movs r7, r6, lsl #16 cmp r5, r7, lsr #16 beq .dtfc_samecode @ if (code==prevcode) From c80d0518ce3a8c003c6231f246a3f93b9e22a9dc Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 3 Mar 2020 20:29:23 +0100 Subject: [PATCH 122/174] fix config file parsing for long filenames --- platform/common/config_file.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/platform/common/config_file.c b/platform/common/config_file.c index 0284cfd6f..da19fad1a 100644 --- a/platform/common/config_file.c +++ b/platform/common/config_file.c @@ -39,7 +39,7 @@ static char *mystrip(char *str); static int seek_sect(FILE *f, const char *section) { - char line[128], *tmp; + char line[640], *tmp; int len; len = strlen(section); @@ -100,7 +100,7 @@ int config_write(const char *fname) FILE *fn = NULL; menu_entry *me; int t; - char line[128]; + char line[640]; fn = fopen(fname, "w"); if (fn == NULL) @@ -169,7 +169,7 @@ int config_write(const char *fname) int config_writelrom(const char *fname) { - char line[128], *tmp, *optr = NULL; + char line[640], *tmp, *optr = NULL; char *old_data = NULL; int size; FILE *f; @@ -216,7 +216,7 @@ int config_writelrom(const char *fname) int config_readlrom(const char *fname) { - char line[128], *tmp; + char line[640], *tmp; int i, len, ret = -1; FILE *f; @@ -507,7 +507,7 @@ static void parse(const char *var, const char *val, int 
*keys_encountered) int config_readsect(const char *fname, const char *section) { - char line[128], *var, *val; + char line[640], *var, *val; int keys_encountered = 0; FILE *f; int ret; From 1be46899ca57e32d71b44cb6d15c25c0731c5bc8 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 3 Mar 2020 20:32:38 +0100 Subject: [PATCH 123/174] vdp, some small improvements --- pico/videoport.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/pico/videoport.c b/pico/videoport.c index 264bb0bcc..fd7a3a46b 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -53,6 +53,8 @@ int (*PicoDmaHook)(unsigned int source, int len, unsigned short **base, unsigned * FIFORead executes a 68k read. 68k is blocked until the next transfer slot. */ +// FIFO transfer slots per line: [active][h40] +static const short vdpslots[2][2] = {{ 166, 204 },{ 16, 18 }}; // mapping between slot# and 68k cycles in a blanked scanline [H32, H40] static const int vdpcyc2sl_bl[] = { (166<<16)/488, (204<<16)/488 }; static const int vdpsl2cyc_bl[] = { (488<<16)/166, (488<<16)/204 }; @@ -115,6 +117,14 @@ static __inline int GetFIFOSlot(struct PicoVideo *pv, int cycles) else return (cycles * vdpcyc2sl_bl[h40] + cycles) >> 16; } +static __inline int GetMaxFIFOSlot(struct PicoVideo *pv) +{ + int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); + int h40 = pv->reg[12] & 1; + + return vdpslots[active][h40]; +} + // map FIFO slot to cycles static __inline int GetFIFOCycles(struct PicoVideo *pv, int slot) { @@ -150,7 +160,7 @@ static __inline int AdvanceFIFOEntry(struct PicoVideo *pv, int slots) } // start processing for next entry if there is one if (fifo_ql) - pv->fifo_cnt= (fifo_queue[fifo_qx] >> 3) << (fifo_queue[fifo_qx] & FQ_BYTE); + pv->fifo_cnt = (fifo_queue[fifo_qx] >> 3) << (fifo_queue[fifo_qx] & FQ_BYTE); else fifo_total = 0; } @@ -190,14 +200,15 @@ void PicoVideoFIFOSync(int cycles) done -= l; } - SetFIFOState(pv); + if (done != slots) + SetFIFOState(pv); } 
// drain FIFO, blocking 68k on the way. FIFO must be synced prior to drain. int PicoVideoFIFODrain(int level, int cycles, int bgdma) { struct PicoVideo *pv = &Pico.video; - int maxsl = GetFIFOSlot(pv, 488); // max xfer slots in this scanline + int maxsl = GetMaxFIFOSlot(pv); // max xfer slots in this scanline int burn = 0; // process FIFO entries until low level is reached @@ -279,10 +290,17 @@ int PicoVideoFIFOWrite(int count, int flags, unsigned sr_mask,unsigned sr_flags) fifo_queue[(x+1) & 7] = (pv->fifo_cnt >> (f & FQ_BYTE) << 3) | f; pv->fifo_cnt = count << (flags & FQ_BYTE); } - } else + x = (x-1) & 7; + } + if (fifo_ql && (fifo_queue[x] & 7) == flags) { + // amalgamate entries if of same type + fifo_queue[x] += (count << 3); + if (fifo_ql == 1) pv->fifo_cnt += count << (flags & FQ_BYTE); + } else { + fifo_ql ++; x = (x+1) & 7; - fifo_queue[x] = (count << 3) | flags; - fifo_ql ++; + fifo_queue[x] = (count << 3) | flags; + } if (!(flags & FQ_BGDMA)) fifo_total += count; } @@ -340,9 +358,10 @@ static __inline void AutoIncrement(void) static __inline void UpdateSAT(u32 a, u32 d) { + unsigned num = (a-sat) >> 3; + Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; - if (!(a & 4)) { - int num = (a-sat) >> 3; + if (!(a & 4) && num < 128) { ((u16 *)&VdpSATCache[num])[(a&3) >> 1] = d; } } From 6d9aba2774c00f160ed61106a52e8dd4d0528618 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 3 Mar 2020 20:34:11 +0100 Subject: [PATCH 124/174] 32x, small improvement for poll detector --- pico/32x/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pico/32x/memory.c b/pico/32x/memory.c index f772d28dc..f4f0a18b9 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -239,7 +239,7 @@ static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2) // NB this can take an eternity on m68k: mov.b , needs // 28 m68k-cycles (~80 sh2-cycles) to complete (observed in Metal Head) q = &fifo[(sh2_poll_wr[hix]-1) % PFIFO_SZ]; - if (cpu < 0 && rd != wr && q->a 
== a && !CYCLES_GT(cycles,q->cycles+30)) { + if (rd != wr && q->a == a && !CYCLES_GT(cycles,q->cycles + (cpu<0 ? 30:4))) { q->d = d; } else { // store write to poll address in fifo From bef6e534916e64005e55c45391f462433f654aa5 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 3 Mar 2020 20:36:55 +0100 Subject: [PATCH 125/174] vdp rendering, tiny improvement --- pico/draw.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pico/draw.c b/pico/draw.c index 66c2b9f46..bcc4acaf6 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -636,19 +636,14 @@ static void DrawTilesFromCache(int *hc, int sh, int rlim, struct PicoEState *est if (!sh) { - int blank=-1; // The tile we know is blank while ((code=*hc++)) { - if ((code<<16|code>>25) == blank) - continue; // Get tile address/2: addr = (code & 0x7ff) << 4; addr += code >> 25; // y offset into tile pack = *(unsigned int *)(PicoMem.vram + addr); - if (!pack) { - blank = code<<16|code>>25; + if (!pack) continue; - } dx = (code >> 16) & 0x1ff; pal = ((code >> 9) & 0x30); From c2af49368c1653594c06973ce45923ccd0d407a1 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 14 Mar 2020 19:14:04 +0100 Subject: [PATCH 126/174] vdp rendering improvements --- pico/draw.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pico/draw.c b/pico/draw.c index bcc4acaf6..8343a3418 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -324,7 +324,7 @@ static void DrawStripVSRam(struct TileStrip *ts, int plane_sh, int cellskip) //if((cell&1)==0) { int line,vscroll; - vscroll=PicoMem.vsram[(plane_sh&1)+(cell&~1)]; + vscroll=PicoMem.vsram[(plane_sh&1)+(cell&0x3e)]; // Find the line in the name table line=(vscroll+scan)&ts->line&0xffff; // ts->line is really ymask .. 
@@ -479,6 +479,7 @@ static void DrawLayer(int plane_sh, int *hcache, int cellskip, int maxcells, // shit, we have 2-cell column based vscroll // luckily this doesn't happen too often ts.line=ymask|(shift[width]<<24); // save some stuff instead of line + PicoMem.vsram[(plane_sh & 1)+0x3e] = PicoMem.vsram[0x27]; // XXX really? DrawStripVSRam(&ts, plane_sh, cellskip); } else { vscroll = PicoMem.vsram[plane_sh & 1]; // Get vertical scroll value @@ -1022,7 +1023,7 @@ static void DrawSpritesHiAS(unsigned char *sprited, int sh) if (entry+1 == cnt) width = p[entry+1]; // last sprite width limited? mp = mb+(sx>>3); - for (m = *mp << 8; width; width--, sx+=8, *mp++ = m, tile+=delta) + for (m = *mp; width; width--, sx+=8, *mp++ = m, m >>= 8, tile+=delta) { unsigned int pack; @@ -1031,11 +1032,11 @@ static void DrawSpritesHiAS(unsigned char *sprited, int sh) pack = *(unsigned int *)(PicoMem.vram + (tile & 0x7fff)); - m = (m >> 8) | mp[1] << 8; // next mask byte + m |= mp[1] << 8; // next mask byte // shift mask bits to bits 8-15 for easier load/store handling m = fTileFunc(pd + sx, m << (8-(sx&0x7)), pack, pal) >> (8-(sx&0x7)); } - *mp = m >> 8; // write last mask byte + *mp = m; // write last mask byte } } @@ -1428,10 +1429,6 @@ static int DrawDisplay(int sh) int win=0, edge=0, hvwind=0, lflags; int maxw, maxcells; - if (!(est->DrawScanline & 15) || - (est->rendstatus & (PDRAW_SPRITES_MOVED|PDRAW_DIRTY_SPRITES))) - PrepareSprites((est->DrawScanline+16) & ~15); - est->rendstatus &= ~(PDRAW_SPRITES_MOVED|PDRAW_DIRTY_SPRITES); est->rendstatus &= ~(PDRAW_SHHI_DONE|PDRAW_PLANE_HI_PRIO); @@ -1646,6 +1643,8 @@ void PicoDrawSync(int to, int blank_last_line) if (to > 223) to = 223; } + if (Pico.est.DrawScanline <= to - blank_last_line) + PrepareSprites(to - blank_last_line + 1); for (line = Pico.est.DrawScanline; line < to; line++) PicoLine(line, offs, sh, bgc); From 38758e632ac62e832620e8a0b112f6d40c96a6d4 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 14 Mar 2020 19:30:28 +0100 
Subject: [PATCH 127/174] hvcounter table resolution reduced --- pico/misc.c | 94 +++++++++++++++++------------------------------- pico/videoport.c | 8 ++--- 2 files changed, 36 insertions(+), 66 deletions(-) diff --git a/pico/misc.c b/pico/misc.c index ab282c247..4837fd3e5 100644 --- a/pico/misc.c +++ b/pico/misc.c @@ -10,72 +10,42 @@ // H-counter table for hvcounter reads in 40col mode, starting at HINT const unsigned char hcounts_40[] = { -0xa5,0xa5,0xa5,0xa6,0xa6,0xa7,0xa7,0xa8,0xa8,0xa8,0xa9,0xa9,0xaa,0xaa,0xab,0xab, -0xac,0xac,0xac,0xad,0xad,0xae,0xae,0xaf,0xaf,0xaf,0xb0,0xb0,0xb1,0xb1,0xb2,0xb2, -0xb3,0xb3,0xb3,0xb4,0xb4,0xb5,0xb5,0xb6,0xe4,0xe4,0xe5,0xe5,0xe6,0xe6,0xe7,0xe7, -0xe7,0xe8,0xe8,0xe8,0xe9,0xe9,0xe9,0xea,0xea,0xeb,0xeb,0xeb,0xec,0xec,0xec,0xed, -0xed,0xed,0xee,0xee,0xee,0xef,0xef,0xf0,0xf0,0xf0,0xf1,0xf1,0xf1,0xf2,0xf2,0xf2, -0xf3,0xf3,0xf3,0xf4,0xf4,0xf5,0xf5,0xf5,0xf6,0xf6,0xf6,0xf7,0xf7,0xf7,0xf8,0xf8, -0xf9,0xf9,0xfa,0xfa,0xfb,0xfb,0xfb,0xfc,0xfc,0xfd,0xfd,0xfe,0xfe,0xfe,0xff,0xff, -0x00,0x00,0x01,0x01,0x02,0x02,0x02,0x03,0x03,0x04,0x04,0x05,0x05,0x05,0x06,0x06, -0x07,0x07,0x08,0x08,0x09,0x09,0x09,0x0a,0x0a,0x0b,0x0b,0x0c,0x0c,0x0c,0x0d,0x0d, -0x0e,0x0e,0x0f,0x0f,0x10,0x10,0x10,0x11,0x11,0x12,0x12,0x13,0x13,0x13,0x14,0x14, -0x15,0x15,0x16,0x16,0x17,0x17,0x17,0x18,0x18,0x19,0x19,0x1a,0x1a,0x1a,0x1b,0x1b, -0x1c,0x1c,0x1d,0x1d,0x1e,0x1e,0x1e,0x1f,0x1f,0x20,0x20,0x21,0x21,0x21,0x22,0x22, -0x23,0x23,0x24,0x24,0x25,0x25,0x25,0x26,0x26,0x27,0x27,0x28,0x28,0x28,0x29,0x29, -0x2a,0x2a,0x2b,0x2b,0x2c,0x2c,0x2c,0x2d,0x2d,0x2e,0x2e,0x2f,0x2f,0x2f,0x30,0x30, -0x31,0x31,0x32,0x32,0x33,0x33,0x33,0x34,0x34,0x35,0x35,0x36,0x36,0x36,0x37,0x37, -0x38,0x38,0x39,0x39,0x3a,0x3a,0x3a,0x3b,0x3b,0x3c,0x3c,0x3d,0x3d,0x3d,0x3e,0x3e, -0x3f,0x3f,0x40,0x40,0x41,0x41,0x41,0x42,0x42,0x43,0x43,0x44,0x44,0x44,0x45,0x45, -0x46,0x46,0x47,0x47,0x48,0x48,0x48,0x49,0x49,0x4a,0x4a,0x4b,0x4b,0x4b,0x4c,0x4c, 
-0x4d,0x4d,0x4e,0x4e,0x4f,0x4f,0x4f,0x50,0x50,0x51,0x51,0x52,0x52,0x52,0x53,0x53, -0x54,0x54,0x55,0x55,0x56,0x56,0x56,0x57,0x57,0x58,0x58,0x59,0x59,0x59,0x5a,0x5a, -0x5b,0x5b,0x5c,0x5c,0x5d,0x5d,0x5d,0x5e,0x5e,0x5f,0x5f,0x60,0x60,0x60,0x61,0x61, -0x62,0x62,0x63,0x63,0x64,0x64,0x64,0x65,0x65,0x66,0x66,0x67,0x67,0x67,0x68,0x68, -0x69,0x69,0x6a,0x6a,0x6b,0x6b,0x6b,0x6c,0x6c,0x6d,0x6d,0x6e,0x6e,0x6e,0x6f,0x6f, -0x70,0x70,0x71,0x71,0x72,0x72,0x72,0x73,0x73,0x74,0x74,0x75,0x75,0x75,0x76,0x76, -0x77,0x77,0x78,0x78,0x79,0x79,0x79,0x7a,0x7a,0x7b,0x7b,0x7c,0x7c,0x7c,0x7d,0x7d, -0x7e,0x7e,0x7f,0x7f,0x80,0x80,0x80,0x81,0x81,0x82,0x82,0x83,0x83,0x83,0x84,0x84, -0x85,0x85,0x86,0x86,0x87,0x87,0x87,0x88,0x88,0x89,0x89,0x8a,0x8a,0x8a,0x8b,0x8b, -0x8c,0x8c,0x8d,0x8d,0x8e,0x8e,0x8e,0x8f,0x8f,0x90,0x90,0x91,0x91,0x91,0x92,0x92, -0x93,0x93,0x94,0x94,0x95,0x95,0x95,0x96,0x96,0x97,0x97,0x98,0x98,0x98,0x99,0x99, -0x9a,0x9a,0x9b,0x9b,0x9c,0x9c,0x9c,0x9d,0x9d,0x9e,0x9e,0x9f,0x9f,0x9f,0xa0,0xa0, -0xa1,0xa1,0xa2,0xa2,0xa3,0xa3,0xa3,0xa4,0xa5,0xa5,0xa5,0xa6,0xa6,0xa7,0xa7,0xa8, +0xa5,0xa6,0xa7,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xae,0xaf,0xb0,0xb1,0xb2, +0xb3,0xb4,0xb5,0xb5,0xe4,0xe5,0xe6,0xe7,0xe8,0xe8,0xe9,0xea,0xea,0xeb,0xec,0xed, +0xed,0xee,0xef,0xef,0xf0,0xf1,0xf2,0xf2,0xf3,0xf4,0xf4,0xf5,0xf6,0xf7,0xf7,0xf8, +0xf9,0xfa,0xfb,0xfc,0xfd,0xfd,0xfe,0xff,0x00,0x01,0x02,0x03,0x04,0x04,0x05,0x06, +0x07,0x08,0x09,0x0a,0x0b,0x0b,0x0c,0x0d,0x0e,0x0f,0x10,0x11,0x12,0x12,0x13,0x14, +0x15,0x16,0x17,0x18,0x19,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x20,0x21,0x22, +0x23,0x24,0x25,0x26,0x27,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2e,0x2f,0x30, +0x31,0x32,0x33,0x34,0x35,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3c,0x3d,0x3e, +0x3f,0x40,0x41,0x42,0x43,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4a,0x4b,0x4c, +0x4d,0x4e,0x4f,0x50,0x51,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x58,0x59,0x5a, +0x5b,0x5c,0x5d,0x5e,0x5f,0x5f,0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x66,0x67,0x68, 
+0x69,0x6a,0x6b,0x6c,0x6d,0x6d,0x6e,0x6f,0x70,0x71,0x72,0x73,0x74,0x74,0x75,0x76, +0x77,0x78,0x79,0x7a,0x7b,0x7b,0x7c,0x7d,0x7e,0x7f,0x80,0x81,0x82,0x82,0x83,0x84, +0x85,0x86,0x87,0x88,0x89,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,0x90,0x90,0x91,0x92, +0x93,0x94,0x95,0x96,0x97,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9e,0x9f,0xa0, +0xa1,0xa2,0xa3,0xa4,0xa5,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xac,0xad,0xae, }; // H-counter table for hvcounter reads in 32col mode, starting at HINT const unsigned char hcounts_32[] = { -0x85,0x85,0x85,0x86,0x86,0x86,0x87,0x87,0x87,0x88,0x88,0x88,0x89,0x89,0x89,0x8a, -0x8a,0x8a,0x8b,0x8b,0x8c,0x8c,0x8c,0x8d,0x8d,0x8d,0x8e,0x8e,0x8e,0x8f,0x8f,0x8f, -0x90,0x90,0x90,0x91,0x91,0x91,0x92,0x92,0x93,0x93,0x93,0xe9,0xe9,0xe9,0xea,0xea, -0xea,0xeb,0xeb,0xeb,0xec,0xec,0xec,0xed,0xed,0xed,0xee,0xee,0xef,0xef,0xef,0xf0, -0xf0,0xf0,0xf1,0xf1,0xf1,0xf2,0xf2,0xf2,0xf3,0xf3,0xf3,0xf4,0xf4,0xf4,0xf5,0xf5, -0xf6,0xf6,0xf6,0xf7,0xf7,0xf7,0xf8,0xf8,0xf8,0xf9,0xf9,0xf9,0xfa,0xfa,0xfa,0xfb, -0xfb,0xfb,0xfc,0xfc,0xfd,0xfd,0xfd,0xfe,0xfe,0xfe,0xff,0xff,0xff,0x00,0x00,0x00, -0x01,0x01,0x01,0x02,0x02,0x02,0x03,0x03,0x04,0x04,0x04,0x05,0x05,0x05,0x06,0x06, -0x06,0x07,0x07,0x07,0x08,0x08,0x08,0x09,0x09,0x09,0x0a,0x0a,0x0b,0x0b,0x0b,0x0c, -0x0c,0x0c,0x0d,0x0d,0x0d,0x0e,0x0e,0x0e,0x0f,0x0f,0x0f,0x10,0x10,0x10,0x11,0x11, -0x12,0x12,0x12,0x13,0x13,0x13,0x14,0x14,0x14,0x15,0x15,0x15,0x16,0x16,0x16,0x17, -0x17,0x17,0x18,0x18,0x19,0x19,0x19,0x1a,0x1a,0x1a,0x1b,0x1b,0x1b,0x1c,0x1c,0x1c, -0x1d,0x1d,0x1d,0x1e,0x1e,0x1e,0x1f,0x1f,0x20,0x20,0x20,0x21,0x21,0x21,0x22,0x22, -0x22,0x23,0x23,0x23,0x24,0x24,0x24,0x25,0x25,0x25,0x26,0x26,0x27,0x27,0x27,0x28, -0x28,0x28,0x29,0x29,0x29,0x2a,0x2a,0x2a,0x2b,0x2b,0x2b,0x2c,0x2c,0x2c,0x2d,0x2d, -0x2e,0x2e,0x2e,0x2f,0x2f,0x2f,0x30,0x30,0x30,0x31,0x31,0x31,0x32,0x32,0x32,0x33, -0x33,0x33,0x34,0x34,0x35,0x35,0x35,0x36,0x36,0x36,0x37,0x37,0x37,0x38,0x38,0x38, -0x39,0x39,0x39,0x3a,0x3a,0x3a,0x3b,0x3b,0x3c,0x3c,0x3c,0x3d,0x3d,0x3d,0x3e,0x3e, 
-0x3e,0x3f,0x3f,0x3f,0x40,0x40,0x40,0x41,0x41,0x41,0x42,0x42,0x43,0x43,0x43,0x44, -0x44,0x44,0x45,0x45,0x45,0x46,0x46,0x46,0x47,0x47,0x47,0x48,0x48,0x48,0x49,0x49, -0x4a,0x4a,0x4a,0x4b,0x4b,0x4b,0x4c,0x4c,0x4c,0x4d,0x4d,0x4d,0x4e,0x4e,0x4e,0x4f, -0x4f,0x4f,0x50,0x50,0x51,0x51,0x51,0x52,0x52,0x52,0x53,0x53,0x53,0x54,0x54,0x54, -0x55,0x55,0x55,0x56,0x56,0x56,0x57,0x57,0x58,0x58,0x58,0x59,0x59,0x59,0x5a,0x5a, -0x5a,0x5b,0x5b,0x5b,0x5c,0x5c,0x5c,0x5d,0x5d,0x5d,0x5e,0x5e,0x5f,0x5f,0x5f,0x60, -0x60,0x60,0x61,0x61,0x61,0x62,0x62,0x62,0x63,0x63,0x63,0x64,0x64,0x64,0x65,0x65, -0x66,0x66,0x66,0x67,0x67,0x67,0x68,0x68,0x68,0x69,0x69,0x69,0x6a,0x6a,0x6a,0x6b, -0x6b,0x6b,0x6c,0x6c,0x6d,0x6d,0x6d,0x6e,0x6e,0x6e,0x6f,0x6f,0x6f,0x70,0x70,0x70, -0x71,0x71,0x71,0x72,0x72,0x72,0x73,0x73,0x74,0x74,0x74,0x75,0x75,0x75,0x76,0x76, -0x76,0x77,0x77,0x77,0x78,0x78,0x78,0x79,0x79,0x79,0x7a,0x7a,0x7b,0x7b,0x7b,0x7c, -0x7c,0x7c,0x7d,0x7d,0x7d,0x7e,0x7e,0x7e,0x7f,0x7f,0x7f,0x80,0x80,0x80,0x81,0x81, -0x82,0x82,0x82,0x83,0x83,0x83,0x84,0x84,0x85,0x85,0x85,0x86,0x86,0x86,0x87,0x87, +0x85,0x86,0x86,0x87,0x88,0x88,0x89,0x8a,0x8a,0x8b,0x8c,0x8d,0x8d,0x8e,0x8f,0x8f, +0x90,0x91,0x91,0x92,0x93,0xe9,0xe9,0xea,0xeb,0xeb,0xec,0xed,0xed,0xee,0xef,0xf0, +0xf0,0xf1,0xf2,0xf2,0xf3,0xf4,0xf4,0xf5,0xf6,0xf7,0xf7,0xf8,0xf9,0xf9,0xfa,0xfb, +0xfb,0xfc,0xfd,0xfe,0xfe,0xff,0x00,0x00,0x01,0x02,0x02,0x03,0x04,0x05,0x05,0x06, +0x07,0x07,0x08,0x09,0x09,0x0a,0x0b,0x0c,0x0c,0x0d,0x0e,0x0e,0x0f,0x10,0x10,0x11, +0x12,0x13,0x13,0x14,0x15,0x15,0x16,0x17,0x17,0x18,0x19,0x1a,0x1a,0x1b,0x1c,0x1c, +0x1d,0x1e,0x1e,0x1f,0x20,0x21,0x21,0x22,0x23,0x23,0x24,0x25,0x25,0x26,0x27,0x28, +0x28,0x29,0x2a,0x2a,0x2b,0x2c,0x2c,0x2d,0x2e,0x2f,0x2f,0x30,0x31,0x31,0x32,0x33, +0x33,0x34,0x35,0x36,0x36,0x37,0x38,0x38,0x39,0x3a,0x3a,0x3b,0x3c,0x3d,0x3d,0x3e, +0x3f,0x3f,0x40,0x41,0x41,0x42,0x43,0x44,0x44,0x45,0x46,0x46,0x47,0x48,0x48,0x49, +0x4a,0x4b,0x4b,0x4c,0x4d,0x4d,0x4e,0x4f,0x4f,0x50,0x51,0x52,0x52,0x53,0x54,0x54, 
+0x55,0x56,0x56,0x57,0x58,0x59,0x59,0x5a,0x5b,0x5b,0x5c,0x5d,0x5d,0x5e,0x5f,0x60, +0x60,0x61,0x62,0x62,0x63,0x64,0x64,0x65,0x66,0x67,0x67,0x68,0x69,0x69,0x6a,0x6b, +0x6b,0x6c,0x6d,0x6e,0x6e,0x6f,0x70,0x70,0x71,0x72,0x72,0x73,0x74,0x75,0x75,0x76, +0x77,0x77,0x78,0x79,0x79,0x7a,0x7b,0x7c,0x7c,0x7d,0x7e,0x7e,0x7f,0x80,0x80,0x81, +0x82,0x83,0x83,0x84,0x85,0x85,0x86,0x87,0x87,0x88,0x89,0x8a,0x8a,0x8b,0x8c,0x8c, }; #ifndef _ASM_MISC_C diff --git a/pico/videoport.c b/pico/videoport.c index fd7a3a46b..cbcea7965 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -975,8 +975,8 @@ PICO_INTERNAL_ASM unsigned int PicoVideoRead(unsigned int a) if (Pico.video.reg[0]&2) d = Pico.video.hv_latch; else if (Pico.video.reg[12]&1) - d = hcounts_40[d] | (Pico.video.v_counter << 8); - else d = hcounts_32[d] | (Pico.video.v_counter << 8); + d = hcounts_40[d/2] | (Pico.video.v_counter << 8); + else d = hcounts_32[d/2] | (Pico.video.v_counter << 8); elprintf(EL_HVCNT, "hv: %02x %02x [%u] @ %06x", d, Pico.video.v_counter, SekCyclesDone(), SekPc); return d; @@ -1035,8 +1035,8 @@ unsigned char PicoVideoRead8HV_L(void) if (Pico.video.reg[0]&2) d = Pico.video.hv_latch; else if (Pico.video.reg[12]&1) - d = hcounts_40[d]; - else d = hcounts_32[d]; + d = hcounts_40[d/2]; + else d = hcounts_32[d/2]; elprintf(EL_HVCNT, "hcounter: %02x [%u] @ %06x", d, SekCyclesDone(), SekPc); return d; } From e0216d53da5e27c21d8e19f1e59b4b42dfe84299 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 14 Mar 2020 19:52:27 +0100 Subject: [PATCH 128/174] vdp fifo speed optimization --- pico/misc.c | 129 ++++++++++++++++++++++++++++++ pico/pico.c | 1 + pico/pico_cmn.c | 2 + pico/pico_int.h | 3 + pico/videoport.c | 200 ++++++++++++++++++----------------------------- 5 files changed, 210 insertions(+), 125 deletions(-) diff --git a/pico/misc.c b/pico/misc.c index 4837fd3e5..74d4d8a8e 100644 --- a/pico/misc.c +++ b/pico/misc.c @@ -48,6 +48,135 @@ const unsigned char hcounts_32[] = { 
0x82,0x83,0x83,0x84,0x85,0x85,0x86,0x87,0x87,0x88,0x89,0x8a,0x8a,0x8b,0x8c,0x8c, }; +// VDP transfer slots for blanked and active display in 32col and 40col mode. +// 1 slot is 488/171 = 2.8538 68k cycles in h32, and 488/210 = 2.3238 in h40 +// In blanked display, all slots but 5(h32) / 6(h40) are usable for transfers, +// in active display only 16(h32) / 18(h40) slots can be used. + +// XXX inactive tables by slot#=cycles*maxslot#/488. should be through hv tables +// VDP transfer slots in inactive (blanked) display 32col mode. +// refresh slots: 250, 26, 58, 90, 122 -> 32, 64, 96, 128, 160 +const unsigned char vdpcyc2sl_32_bl[] = { // 68k cycles/2 to slot # +// 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 + 0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, + 10, 11, 12, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 19, 20, 21, + 21, 22, 23, 23, 24, 25, 25, 26, 27, 27, 28, 29, 29, 30, 31, 31, + 32, 33, 34, 34, 35, 36, 36, 37, 38, 38, 39, 40, 40, 41, 42, 42, + 43, 44, 44, 45, 46, 46, 47, 48, 48, 49, 50, 51, 51, 52, 53, 53, + 54, 55, 55, 56, 57, 57, 58, 59, 59, 60, 61, 61, 62, 63, 63, 64, + 65, 65, 66, 67, 68, 68, 69, 70, 70, 71, 72, 72, 73, 74, 74, 75, + 76, 76, 77, 78, 78, 79, 80, 80, 81, 82, 83, 83, 84, 85, 85, 86, + 87, 87, 88, 89, 89, 90, 91, 91, 92, 93, 93, 94, 95, 95, 96, 97, + 97, 98, 99,100,100,101,102,102,103,104,104,105,106,106,107,108, + 108,109,110,110,111,112,112,113,114,114,115,116,117,117,118,119, + 119,120,121,121,122,123,123,124,125,125,126,127,127,128,129,129, + 130,131,131,132,133,134,134,135,136,136,137,138,138,139,140,140, + 141,142,142,143,144,144,145,146,146,147,148,148,149,150,151,151, + 152,153,153,154,155,155,156,157,157,158,159,159,160,161,161,162, + 163,163,164,165,166,166,167,168,168,169,170,170,171,172,172,173, +}; +// VDP transfer slots in inactive (blanked) display 40col mode. 
+// refresh slots: 250, 26, 58, 90, 122, 154 -> 40, 72, 104, 136, 168, 200 +const unsigned char vdpcyc2sl_40_bl[] = { // 68k cycles/2 to slot # +// 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 + 0, 0, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10, 10, 11, 12, + 13, 14, 15, 15, 16, 17, 18, 19, 20, 20, 21, 22, 23, 24, 25, 25, + 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 35, 35, 36, 37, 38, 39, + 40, 40, 41, 42, 43, 44, 45, 45, 46, 47, 48, 49, 50, 51, 51, 52, + 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 61, 62, 63, 64, 65, 66, + 66, 67, 68, 69, 70, 71, 71, 72, 73, 74, 75, 76, 76, 77, 78, 79, + 80, 81, 81, 82, 83, 84, 85, 86, 86, 87, 88, 89, 90, 91, 91, 92, + 93, 94, 95, 96, 96, 97, 98, 99,100,101,102,102,103,104,105,106, + 107,107,108,109,110,111,112,112,113,114,115,116,117,117,118,119, + 120,121,122,122,123,124,125,126,127,127,128,129,130,131,132,132, + 133,134,135,136,137,137,138,139,140,141,142,142,143,144,145,146, + 147,147,148,149,150,151,152,153,153,154,155,156,157,158,158,159, + 160,161,162,163,163,164,165,166,167,168,168,169,170,171,172,173, + 173,174,175,176,177,178,178,179,180,181,182,183,183,184,185,186, + 187,188,188,189,190,191,192,193,193,194,195,196,197,198,198,199, + 200,201,202,203,204,204,205,206,207,208,209,209,210,211,212,213, +}; +// VDP transfer slots in active display 32col mode. 
Transfer slots (Hint=0): +// 11,25,40,48,56,72,80,88,104,112,120,136,144,152,167,168 +const unsigned char vdpcyc2sl_32[] = { // 68k cycles/2 to slot # +// 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, +}; +// VDP transfer slots in active display 40col mode. 
Transfer slots (Hint=0): +// 21,47,55,63,79,87,95,111,119,127,143,151,159,175,183,191,206,207 +const unsigned char vdpcyc2sl_40[] = { // 68k cycles/2 to slot # +// 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0 + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, // 32 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 64 + 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 96 + 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, // 128 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, // 160 + 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, // 192 + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 224 + 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, // 256 + 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, // 288 + 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, // 320 + 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, // 352 + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, // 384 + 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, // 416 + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, // 448 + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, // 480 +}; + +// XXX inactive tables by cyc=slot#*488/maxslot#. 
should be through hv tables +const unsigned short vdpsl2cyc_32_bl[] = { // slot # to 68k cycles/2 + 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, + 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43, 45, 46, + 48, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 64, 65, 67, 68, 70, + 71, 73, 74, 75, 77, 78, 80, 81, 83, 84, 86, 87, 89, 90, 92, 93, + 95, 96, 98, 99,100,102,103,105,106,108,109,111,112,114,115,117, + 118,120,121,122,124,125,127,128,130,131,133,134,136,137,139,140, + 142,143,145,146,147,149,150,152,153,155,156,158,159,161,162,164, + 165,167,168,170,171,172,174,175,177,178,180,181,183,184,186,187, + 189,190,192,193,195,196,197,199,200,202,203,205,206,208,209,211, + 212,214,215,217,218,220,221,222,224,225,227,228,230,231,233,234, + 236,237,239,240,242,243,244,246, +}; +const unsigned short vdpsl2cyc_40_bl[] = { // slot # to 68k cycles/2 + 0, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, + 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 32, 33, 34, 35, 36, 38, + 39, 40, 41, 42, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 56, 57, + 58, 59, 60, 61, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 75, 76, + 77, 78, 79, 81, 82, 83, 84, 85, 87, 88, 89, 90, 91, 93, 94, 95, + 96, 97, 99,100,101,102,103,105,106,107,108,109,111,112,113,114, + 115,117,118,119,120,121,122,124,125,126,127,128,130,131,132,133, + 134,136,137,138,139,140,142,143,144,145,146,148,149,150,151,152, + 154,155,156,157,158,160,161,162,163,164,166,167,168,169,170,172, + 173,174,175,176,178,179,180,181,182,183,185,186,187,188,189,191, + 192,193,194,195,197,198,199,200,201,203,204,205,206,207,209,210, + 211,212,213,215,216,217,218,219,221,222,223,224,225,227,228,229, + 230,231,233,234,235,236,237,239,240,241,242,243,244,246, +}; +const unsigned short vdpsl2cyc_32[] = { // slot # to 68k cycles/2 + 0, 16, 36, 56, 67, 79,102,113,125,148,159,171,194,205,217,239, + 240,260 +}; +const unsigned short vdpsl2cyc_40[] = { // slot # to 68k cycles/2 + 0, 24, 55, 64, 73, 92,101,110,129,138,147,166,175,184,203,212, + 
221,239,240,268 +}; + #ifndef _ASM_MISC_C PICO_INTERNAL_ASM void memcpy16bswap(unsigned short *dest, void *src, int count) { diff --git a/pico/pico.c b/pico/pico.c index 9db2fc641..87e22e59d 100644 --- a/pico/pico.c +++ b/pico/pico.c @@ -79,6 +79,7 @@ void PicoPower(void) Pico.video.reg[0] = Pico.video.reg[1] = 0x04; Pico.video.reg[0xc] = 0x81; Pico.video.reg[0xf] = 0x02; + PicoVideoFIFOMode(0, 1); if (PicoIn.AHW & PAHW_MCD) PicoPowerMCD(); diff --git a/pico/pico_cmn.c b/pico/pico_cmn.c index 50a632ca7..017c404b7 100644 --- a/pico/pico_cmn.c +++ b/pico/pico_cmn.c @@ -179,6 +179,7 @@ static int PicoFrameHints(void) } pv->status |= SR_VB | PVS_VB2; // go into vblank + PicoVideoFIFOMode(pv->reg[1]&0x40, pv->reg[12]&1); // the following SekRun is there for several reasons: // there must be a delay after vblank bit is set and irq is asserted (Mazin Saga) @@ -270,6 +271,7 @@ static int PicoFrameHints(void) pv->status &= ~(SR_VB | PVS_VB2); pv->status |= ((pv->reg[1] >> 3) ^ SR_VB) & SR_VB; // forced blanking + PicoVideoFIFOMode(pv->reg[1]&0x40, pv->reg[12]&1); // last scanline Pico.m.scanline = y++; diff --git a/pico/pico_int.h b/pico/pico_int.h index 65b56f1d2..c0f2c3437 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -299,6 +299,8 @@ extern SH2 sh2s[2]; #define PVS_CPUWR (1 << 18) // CPU write blocked by FIFO full #define PVS_CPURD (1 << 19) // CPU read blocked by FIFO not empty #define PVS_DMAFILL (1 << 20) // DMA fill is waiting for fill data +#define PVS_DMABG (1 << 21) // background DMA operation is running +#define PVS_FIFORUN (1 << 22) // FIFO is processing struct PicoVideo { @@ -858,6 +860,7 @@ unsigned char PicoVideoRead8HV_L(void); extern int (*PicoDmaHook)(unsigned int source, int len, unsigned short **base, unsigned int *mask); void PicoVideoFIFOSync(int cycles); int PicoVideoFIFOHint(void); +void PicoVideoFIFOMode(int active, int h40); int PicoVideoFIFOWrite(int count, int byte_p, unsigned sr_mask, unsigned sr_flags); void PicoVideoSave(void); void 
PicoVideoLoad(void); diff --git a/pico/videoport.c b/pico/videoport.c index cbcea7965..3ed7f5b4c 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -12,8 +12,11 @@ #define NEED_DMA_SOURCE #include "memory.h" -extern const unsigned char hcounts_32[]; -extern const unsigned char hcounts_40[]; +extern const unsigned char hcounts_32[], hcounts_40[]; +extern const unsigned char vdpcyc2sl_32_bl[], vdpcyc2sl_40_bl[]; +extern const unsigned char vdpcyc2sl_32[], vdpcyc2sl_40[]; +extern const unsigned short vdpsl2cyc_32_bl[], vdpsl2cyc_40_bl[]; +extern const unsigned short vdpsl2cyc_32[], vdpsl2cyc_40[]; static int blankline; // display disabled for this line static unsigned sat; // VRAM addr of sprite attribute table @@ -53,48 +56,6 @@ int (*PicoDmaHook)(unsigned int source, int len, unsigned short **base, unsigned * FIFORead executes a 68k read. 68k is blocked until the next transfer slot. */ -// FIFO transfer slots per line: [active][h40] -static const short vdpslots[2][2] = {{ 166, 204 },{ 16, 18 }}; -// mapping between slot# and 68k cycles in a blanked scanline [H32, H40] -static const int vdpcyc2sl_bl[] = { (166<<16)/488, (204<<16)/488 }; -static const int vdpsl2cyc_bl[] = { (488<<16)/166, (488<<16)/204 }; - -// VDP transfer slots in active display 32col mode. 1 slot is 488/171 = 2.8538 -// 68k cycles. 
Only 16 of the 171 slots in a scanline can be used by CPU/DMA: -// (HINT=slot 0): 11,25,40,48,56,72,80,88,104,112,120,136,144,152,167,168 -static const unsigned char vdpcyc2sl_32[] = { // 68k cycles/4 to slot # -// 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, - 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9,10, -10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,11, -11,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14, -14,14,14,14,14,14,14,15,16,16,16,16,16,16,16,16, -}; -static const unsigned char vdpsl2cyc_32[] = { // slot # to 68k cycles/4 - 0, 8, 18, 28, 33, 39, 51, 56, 62, 74, 79, 85, 97,102,108,119,120,130 -}; - -// VDP transfer slots in active display 40col mode. 1 slot is 488/210 = 2.3238 -// 68k cycles. Only 18 of the 210 slots in a scanline can be used by CPU/DMA: -// (HINT=0): 21,47,55,63,79,87,95,111,119,127,143,151,159,175,183,191,206,207, -static const unsigned char vdpcyc2sl_40[] = { // 68k cycles/4 to slot # -// 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, - 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 8, 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10, -10,10,10,11,11,11,11,12,12,12,12,12,13,13,13,13, -13,13,13,13,13,14,14,14,14,14,15,15,15,15,15,16, -16,16,16,16,16,16,16,17,18,18,18,18,18,18,18,18, -}; -static const unsigned char vdpsl2cyc_40[] = { // slot # to 68k cycles/4 - 0, 12, 27, 32, 36, 46, 50, 55, 64, 69, 73, 83, 87, 92,101,106,111,119,120,134 -}; - // NB code assumes fifo_* arrays have size 2^n // last transferred FIFO data, ...x = index XXX currently only CPU static short fifo_data[4], fifo_dx; // XXX must go into save? 
@@ -106,34 +67,10 @@ enum { FQ_BYTE = 1, FQ_BGDMA = 2, FQ_FGDMA = 4 }; // queue flags, NB: BYTE = 1! static unsigned int fifo_total; // total# of pending FIFO entries (w/o BGDMA) static unsigned short fifo_slot; // last executed slot in current scanline +static unsigned short fifo_maxslot;// #slots in scanline -// map cycles to FIFO slot -static __inline int GetFIFOSlot(struct PicoVideo *pv, int cycles) -{ - int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); - int h40 = pv->reg[12] & 1; - - if (active) return (h40 ? vdpcyc2sl_40 : vdpcyc2sl_32)[cycles/4]; - else return (cycles * vdpcyc2sl_bl[h40] + cycles) >> 16; -} - -static __inline int GetMaxFIFOSlot(struct PicoVideo *pv) -{ - int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); - int h40 = pv->reg[12] & 1; - - return vdpslots[active][h40]; -} - -// map FIFO slot to cycles -static __inline int GetFIFOCycles(struct PicoVideo *pv, int slot) -{ - int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40); - int h40 = pv->reg[12] & 1; - - if (active) return (h40 ? vdpsl2cyc_40 : vdpsl2cyc_32)[slot]*4; - else return ((slot * vdpsl2cyc_bl[h40] + slot) >> 16); -} +static const unsigned char *fifo_cyc2sl; +static const unsigned short *fifo_sl2cyc; // do the FIFO math static __inline int AdvanceFIFOEntry(struct PicoVideo *pv, int slots) @@ -149,20 +86,16 @@ static __inline int AdvanceFIFOEntry(struct PicoVideo *pv, int slots) // if entry has been processed... 
if (pv->fifo_cnt == 0) { - if (fifo_ql) { - // terminate DMA if applicable - if ((pv->status & SR_DMA) && (fifo_queue[fifo_qx] & FQ_BGDMA)) { - pv->status &= ~SR_DMA; - pv->command &= ~0x80; - } - // remove entry from FIFO + // remove entry from FIFO + if (fifo_ql) fifo_qx ++, fifo_qx &= 7, fifo_ql --; - } // start processing for next entry if there is one if (fifo_ql) pv->fifo_cnt = (fifo_queue[fifo_qx] >> 3) << (fifo_queue[fifo_qx] & FQ_BYTE); - else + else { // FIFO empty + pv->status &= ~PVS_FIFORUN; fifo_total = 0; + } } return l; } @@ -170,16 +103,20 @@ static __inline int AdvanceFIFOEntry(struct PicoVideo *pv, int slots) static __inline void SetFIFOState(struct PicoVideo *pv) { // release CPU and terminate DMA if FIFO isn't blocking the 68k anymore - if (fifo_total == 0) - pv->status &= ~PVS_CPURD; if (fifo_total <= 4) { - int x = (fifo_qx + fifo_ql - 1) & 7; - if ((pv->status & SR_DMA) && !(pv->status & PVS_DMAFILL) && - (!fifo_ql || !(fifo_queue[x] & FQ_BGDMA))) { + pv->status &= ~PVS_CPUWR; + if (!(pv->status & (PVS_DMABG|PVS_DMAFILL))) { pv->status &= ~SR_DMA; pv->command &= ~0x80; } - pv->status &= ~PVS_CPUWR; + } + if (fifo_total == 0) { + pv->status &= ~PVS_CPURD; + // terminate DMA if applicable + if (!(pv->status & (PVS_FIFORUN|PVS_DMAFILL))) { + pv->status &= ~(SR_DMA|PVS_DMABG); + pv->command &= ~0x80; + } } } @@ -190,7 +127,7 @@ void PicoVideoFIFOSync(int cycles) int slots, done; // calculate #slots since last executed slot - slots = GetFIFOSlot(pv, cycles) - fifo_slot; + slots = fifo_cyc2sl[cycles>>1] - fifo_slot; // advance FIFO queue by #done slots done = slots; @@ -208,31 +145,28 @@ void PicoVideoFIFOSync(int cycles) int PicoVideoFIFODrain(int level, int cycles, int bgdma) { struct PicoVideo *pv = &Pico.video; - int maxsl = GetMaxFIFOSlot(pv); // max xfer slots in this scanline + unsigned ocyc = cycles; int burn = 0; // process FIFO entries until low level is reached - while (fifo_total > level && fifo_slot < maxsl && + while (fifo_total > 
level && fifo_slot < fifo_maxslot && (!(fifo_queue[fifo_qx] & FQ_BGDMA) || bgdma)) { int b = fifo_queue[fifo_qx] & FQ_BYTE; int cnt = ((fifo_total-level) << b) - (pv->fifo_cnt & b); - int last = fifo_slot; - int slot = (pv->fifo_cnt < cnt ? pv->fifo_cnt : cnt) + last; // target slot - unsigned ocyc = cycles; + int slot = (pv->fifo_cntfifo_cnt:cnt) + fifo_slot; // target slot - if (slot > maxsl) { + if (slot > fifo_maxslot) { // target in later scanline, advance to eol - slot = maxsl; + slot = fifo_maxslot; cycles = 488; } else { // advance FIFO to target slot and CPU to cycles at that slot - cycles = GetFIFOCycles(pv, slot); + cycles = fifo_sl2cyc[slot]<<1; } + AdvanceFIFOEntry(pv, slot - fifo_slot); fifo_slot = slot; - burn += cycles - ocyc; - - AdvanceFIFOEntry(pv, slot - last); } + burn = cycles - ocyc; SetFIFOState(pv); @@ -246,17 +180,19 @@ int PicoVideoFIFORead(void) int lc = SekCyclesDone()-Pico.t.m68c_line_start; int burn = 0; - PicoVideoFIFOSync(lc); + if (pv->fifo_cnt) { + PicoVideoFIFOSync(lc); + // advance FIFO and CPU until FIFO is empty + burn = PicoVideoFIFODrain(0, lc, 1); + lc += burn; + } - // advance FIFO and CPU until FIFO is empty - burn = PicoVideoFIFODrain(0, lc, 1); - lc += burn; if (fifo_total > 0) pv->status |= PVS_CPURD; // target slot is in later scanline else { // use next VDP access slot for reading, block 68k until then - fifo_slot = GetFIFOSlot(pv, lc) + 1; - burn += GetFIFOCycles(pv, fifo_slot) - lc; + fifo_slot = fifo_cyc2sl[lc>>1] + 1; + burn += (fifo_sl2cyc[fifo_slot]<<1) - lc; } return burn; @@ -267,35 +203,41 @@ int PicoVideoFIFOWrite(int count, int flags, unsigned sr_mask,unsigned sr_flags) { struct PicoVideo *pv = &Pico.video; int lc = SekCyclesDone()-Pico.t.m68c_line_start; - int burn = 0, x; + int burn = 0, x, head = 0; - PicoVideoFIFOSync(lc); + if (pv->fifo_cnt) + PicoVideoFIFOSync(lc); pv->status = (pv->status & ~sr_mask) | sr_flags; if (count && fifo_ql < 8) { // update FIFO state if it was empty if (fifo_ql == 0) { - 
fifo_slot = GetFIFOSlot(pv, lc+9); // FIFO latency ~3 vdp slots + fifo_slot = fifo_cyc2sl[(lc+8)>>1]; // FIFO latency ~3 vdp slots pv->fifo_cnt = count << (flags & FQ_BYTE); + pv->status |= PVS_FIFORUN; } - // create xfer queue entry + // determine queue position for entry x = (fifo_qx + fifo_ql - 1) & 7; if (fifo_ql && (fifo_queue[x] & FQ_BGDMA)) { // CPU FIFO writes have priority over a background DMA Fill/Copy fifo_queue[(x+1) & 7] = fifo_queue[x]; - if (fifo_ql == 1) { + if (x == fifo_qx) { // overtaking to queue head? // XXX if interrupting a DMA fill, fill data changes int f = fifo_queue[x] & 7; fifo_queue[(x+1) & 7] = (pv->fifo_cnt >> (f & FQ_BYTE) << 3) | f; pv->fifo_cnt = count << (flags & FQ_BYTE); + head = 1; } x = (x-1) & 7; } - if (fifo_ql && (fifo_queue[x] & 7) == flags) { + + // create xfer queue entry + if (fifo_ql && !head && (fifo_queue[x] & 7) == flags) { // amalgamate entries if of same type fifo_queue[x] += (count << 3); - if (fifo_ql == 1) pv->fifo_cnt += count << (flags & FQ_BYTE); + if (x == fifo_qx) // modifiying fifo head, adjust count + pv->fifo_cnt += count << (flags & FQ_BYTE); } else { fifo_ql ++; x = (x+1) & 7; @@ -331,20 +273,25 @@ int PicoVideoFIFOHint(void) } // switch FIFO mode between active/inactive display -static void PicoVideoFIFOMode(int active) +void PicoVideoFIFOMode(int active, int h40) { + static const unsigned char *vdpcyc2sl[2][2] = + { {vdpcyc2sl_32_bl, vdpcyc2sl_40_bl} , {vdpcyc2sl_32, vdpcyc2sl_40} }; + static const unsigned short *vdpsl2cyc[2][2] = + { {vdpsl2cyc_32_bl, vdpsl2cyc_40_bl} , {vdpsl2cyc_32, vdpsl2cyc_40} }; + struct PicoVideo *pv = &Pico.video; - int h40 = pv->reg[12] & 1; int lc = SekCyclesDone() - Pico.t.m68c_line_start; + active = active && !(pv->status & PVS_VB2); - PicoVideoFIFOSync(lc); + if (fifo_maxslot) + PicoVideoFIFOSync(lc); - if (fifo_ql) { - // recalculate FIFO slot for new mode - if (!(pv->status & SR_VB) && active) - fifo_slot = (pv->reg[12]&1 ? 
vdpcyc2sl_40 : vdpcyc2sl_32)[lc/4]; - else fifo_slot = ((lc * vdpcyc2sl_bl[h40] + lc) >> 16); - } + fifo_cyc2sl = vdpcyc2sl[active][h40]; + fifo_sl2cyc = vdpsl2cyc[active][h40]; + // recalculate FIFO slot for new mode + fifo_slot = fifo_cyc2sl[lc>>1]-1; + fifo_maxslot = fifo_cyc2sl[488>>1]; } @@ -459,7 +406,7 @@ static void DmaSlow(int len, unsigned int source) SekCyclesDone(), SekPc); SekCyclesBurnRun(PicoVideoFIFOWrite(len, FQ_FGDMA | (Pico.video.type == 1), - 0, SR_DMA| PVS_CPUWR)); + PVS_DMABG, SR_DMA | PVS_CPUWR)); if ((source & 0xe00000) == 0xe00000) { // Ram base = (u16 *)PicoMem.ram; @@ -583,13 +530,13 @@ static void DmaCopy(int len) int source; elprintf(EL_VDPDMA, "DmaCopy len %i [%u]", len, SekCyclesDone()); + // XXX implement VRAM 128k? Is this even working? xfer/count still FQ_BYTE? SekCyclesBurnRun(PicoVideoFIFOWrite(len, FQ_BGDMA | FQ_BYTE, - PVS_CPUWR, SR_DMA)); + PVS_CPUWR, SR_DMA | PVS_DMABG)); source =Pico.video.reg[0x15]; source|=Pico.video.reg[0x16]<<8; - // XXX implement VRAM 128k? Is this even working? count still in bytes? for (; len; len--) { vr[(u16)a] = vr[(u16)(source++)]; @@ -616,7 +563,7 @@ static NOINLINE void DmaFill(int data) elprintf(EL_VDPDMA, "DmaFill len %i inc %i [%u]", len, inc, SekCyclesDone()); SekCyclesBurnRun(PicoVideoFIFOWrite(len, FQ_BGDMA | (Pico.video.type == 1), - PVS_CPUWR | PVS_DMAFILL, SR_DMA)); + PVS_CPUWR | PVS_DMAFILL, SR_DMA | PVS_DMABG)); switch (Pico.video.type) { @@ -823,11 +770,13 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) if (num == 0 && !(pvid->reg[0]&2) && (d&2)) pvid->hv_latch = PicoVideoRead(0x08); if (num == 1 && ((pvid->reg[1]^d)&0x40)) { - PicoVideoFIFOMode(d & 0x40); + PicoVideoFIFOMode(d & 0x40, pvid->reg[12]&1); // handle line blanking before line rendering if (SekCyclesDone() - Pico.t.m68c_line_start <= 488-390) blankline = d&0x40 ? 
-1 : Pico.m.scanline; } + if (num == 12 && ((pvid->reg[12]^d)&0x01)) + PicoVideoFIFOMode(pvid->reg[1]&0x40, d & 1); DrawSync(SekCyclesDone() - Pico.t.m68c_line_start <= 488-390); pvid->reg[num]=(unsigned char)d; switch (num) @@ -1058,6 +1007,7 @@ void PicoVideoLoad(void) // convert former dma_xfers (why was this in PicoMisc anyway?) if (Pico.m.dma_xfers) { + pv->status = SR_DMA|PVS_FIFORUN; pv->fifo_cnt = Pico.m.dma_xfers * (pv->type == 1 ? 2 : 1); fifo_total = Pico.m.dma_xfers; Pico.m.dma_xfers = 0; From 4a45cd8127b86316fa3c029c34e5198daa4aede0 Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 19 Mar 2020 22:45:06 +0100 Subject: [PATCH 129/174] vdp rendering fixes (debug register, vscroll) for overdrive 2 --- pico/draw.c | 258 +++++++++++++++++++++++++++++++++++++++--------- pico/draw_arm.S | 8 +- 2 files changed, 218 insertions(+), 48 deletions(-) diff --git a/pico/draw.c b/pico/draw.c index 8343a3418..babf40069 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -29,6 +29,7 @@ */ #include "pico_int.h" +#define FORCE // layer forcing via debug register? int (*PicoScanBegin)(unsigned int num) = NULL; int (*PicoScanEnd) (unsigned int num) = NULL; @@ -222,6 +223,7 @@ TileFlipMakerAS(TileFlipSH_AS_onlyop_lp, pix_sh_as_onlyop) TileNormMakerAS(TileNormAS_onlymark, pix_sh_as_onlymark) TileFlipMakerAS(TileFlipAS_onlymark, pix_sh_as_onlymark) +#ifdef FORCE // forced both layer draw (through debug reg) #define pix_and(x) \ pd[x] = (pd[x] & 0xc0) | (pd[x] & (pal | t)) @@ -230,12 +232,16 @@ TileNormMaker(TileNorm_and, pix_and) TileFlipMaker(TileFlip_and, pix_and) // forced sprite draw (through debug reg) -#define pix_sh_and(x) /* XXX is there S/H with forced draw? */ \ - if (t>=0xe) pd[x]=(pd[x]&0x3f)|(t<<6); /* c0 shadow, 80 hilight */ \ - else pd[x] = (pd[x] & 0xc0) | (pd[x] & (pal | t)) +#define pix_sh_as_and(x) /* XXX is there S/H with forced draw? 
*/ \ + if (m & (1<<(x+8))) { \ + m &= ~(1<<(x+8)); \ + if (t>=0xe) pd[x]=(pd[x]&0x3f)|(t<<6); /* c0 shadow, 80 hilight */ \ + else pd[x] = (pd[x] & 0xc0) | (pd[x] & (pal | t)); \ + } -TileNormMaker(TileNormSH_and, pix_sh_and) -TileFlipMaker(TileFlipSH_and, pix_sh_and) +TileNormMakerAS(TileNormSH_AS_and, pix_sh_as_and) +TileFlipMakerAS(TileFlipSH_AS_and, pix_sh_as_and) +#endif // -------------------------------------------- @@ -311,6 +317,7 @@ static void DrawStripVSRam(struct TileStrip *ts, int plane_sh, int cellskip) int adj = ((ts->hscroll ^ dx) >> 3) & 1; cell -= adj + 1; ts->cells -= adj; + PicoMem.vsram[0x3e] = PicoMem.vsram[0x3f] = plane_sh >> 16; } cell+=cellskip; tilex+=cellskip; @@ -479,7 +486,7 @@ static void DrawLayer(int plane_sh, int *hcache, int cellskip, int maxcells, // shit, we have 2-cell column based vscroll // luckily this doesn't happen too often ts.line=ymask|(shift[width]<<24); // save some stuff instead of line - PicoMem.vsram[(plane_sh & 1)+0x3e] = PicoMem.vsram[0x27]; // XXX really? 
+ plane_sh |= PicoMem.vsram[0x26+(~plane_sh&1)] << 16; DrawStripVSRam(&ts, plane_sh, cellskip); } else { vscroll = PicoMem.vsram[plane_sh & 1]; // Get vertical scroll value @@ -778,28 +785,6 @@ static void DrawSprite(int *sprite, int sh, int w) } #endif -static NOINLINE void DrawTilesFromCacheForced(const int *hc) -{ - unsigned char *pd = Pico.est.HighCol; - int code, addr, dx; - unsigned int pack; - int pal; - - // *ts->hc++ = code | (dx<<16) | (ty<<25); - while ((code = *hc++)) { - // Get tile address/2: - addr = (code & 0x7ff) << 4; - addr += (code >> 25) & 0x0e; // y offset into tile - - dx = (code >> 16) & 0x1ff; - pal = ((code >> 9) & 0x30); - pack = *(unsigned int *)(PicoMem.vram + addr); - - if (code & 0x0800) TileFlip_and(pd + dx, pack, pal); - else TileNorm_and(pd + dx, pack, pal); - } -} - static void DrawSpriteInterlace(unsigned int *sprite) { unsigned char *pd = Pico.est.HighCol; @@ -1040,16 +1025,181 @@ static void DrawSpritesHiAS(unsigned char *sprited, int sh) } } +#ifdef FORCE +static void DrawStripForced(struct TileStrip *ts, int lflags, int cellskip) +{ + unsigned char *pd = Pico.est.HighCol; + int tilex,dx,ty,code=0,addr=0,cells; + int oldcode=-1; + int pal=0,sh; + + // Draw tiles across screen: + sh = (lflags & LF_SH) << 5; // 0x40 + tilex=((-ts->hscroll)>>3)+cellskip; + ty=(ts->line&7)<<1; // Y-Offset into tile + dx=((ts->hscroll-1)&7)+1; + cells = ts->cells - cellskip; + if(dx != 8) cells++; // have hscroll, need to draw 1 cell more + dx+=cellskip<<3; + + for (; cells > 0; dx+=8, tilex++, cells--) + { + unsigned int pack; + + code = PicoMem.vram[ts->nametab + (tilex & ts->xmask)]; + + if (code!=oldcode) { + oldcode = code; + // Get tile address/2: + addr=(code&0x7ff)<<4; + addr+=ty; + if (code&0x1000) addr^=0xe; // Y-flip + + pal=((code>>9)&0x30)|sh; + } + + pack = *(unsigned int *)(PicoMem.vram + addr); + + if (code & 0x0800) TileFlip_and(pd + dx, pack, pal); + else TileNorm_and(pd + dx, pack, pal); + } +} + +// this is messy +static void 
DrawStripVSRamForced(struct TileStrip *ts, int plane_sh, int cellskip) +{ + unsigned char *pd = Pico.est.HighCol; + int tilex,dx,code=0,addr=0,cell=0; + int oldcode=-1; + int pal=0,scan=Pico.est.DrawScanline; + + // Draw tiles across screen: + tilex=(-ts->hscroll)>>3; + dx=((ts->hscroll-1)&7)+1; + if (ts->hscroll & 0x0f) { + int adj = ((ts->hscroll ^ dx) >> 3) & 1; + cell -= adj + 1; + ts->cells -= adj; + PicoMem.vsram[0x3e] = PicoMem.vsram[0x3f] = plane_sh >> 16; + } + cell+=cellskip; + tilex+=cellskip; + dx+=cellskip<<3; + + for (; cell < ts->cells; dx+=8,tilex++,cell++) + { + int nametabadd, ty; + unsigned int pack; + + //if((cell&1)==0) + { + int line,vscroll; + vscroll=PicoMem.vsram[(plane_sh&1)+(cell&0x3e)]; + + // Find the line in the name table + line=(vscroll+scan)&ts->line&0xffff; // ts->line is really ymask .. + nametabadd=(line>>3)<<(ts->line>>24); // .. and shift[width] + ty=(line&7)<<1; // Y-Offset into tile + } + + code=PicoMem.vram[ts->nametab+nametabadd+(tilex&ts->xmask)]; + + if (code!=oldcode) { + oldcode = code; + // Get tile address/2: + addr=(code&0x7ff)<<4; + + pal=((code>>9)&0x30)|((plane_sh<<5)&0x40); + } + + if (code & 0x1000) ty ^= 0xe; // Y-flip + pack = *(unsigned int *)(PicoMem.vram + addr+ty); + + if (code & 0x0800) TileFlip_and(pd + dx, pack, pal); + else TileNorm_and(pd + dx, pack, pal); + } +} + +static void DrawLayerForced(int plane_sh, int cellskip, int maxcells, + struct PicoEState *est) +{ + struct PicoVideo *pvid=&Pico.video; + const char shift[4]={5,6,5,7}; // 32,64 or 128 sized tilemaps (2 is invalid) + struct TileStrip ts; + int width, height, ymask; + int vscroll, htab; + + ts.cells=maxcells; + + // Work out the TileStrip to draw + + // Work out the name table size: 32 64 or 128 tiles (0-3) + width=pvid->reg[16]; + height=(width>>4)&3; width&=3; + + ts.xmask=(1<reg[4]&0x07)<<12; // B + else ts.nametab=(pvid->reg[2]&0x38)<< 9; // A + + htab=pvid->reg[13]<<9; // Horizontal scroll table address + switch (pvid->reg[11]&3) { + 
case 1: htab += (est->DrawScanline<<1) & 0x0f; break; + case 2: htab += (est->DrawScanline<<1) & ~0x0f; break; // Offset by tile + case 3: htab += (est->DrawScanline<<1); break; // Offset by line + } + htab+=plane_sh&1; // A or B + + // Get horizontal scroll value, will be masked later + ts.hscroll = PicoMem.vram[htab & 0x7fff]; + + if((pvid->reg[12]&6) == 6) { + // interlace mode 2 + vscroll = PicoMem.vsram[plane_sh & 1]; // Get vertical scroll value + + // Find the line in the name table + ts.line=(vscroll+(est->DrawScanline<<1))&((ymask<<1)|1); + ts.nametab+=(ts.line>>4)<reg[11]&4) { + // shit, we have 2-cell column based vscroll + // luckily this doesn't happen too often + ts.line=ymask|(shift[width]<<24); // save some stuff instead of line + plane_sh |= PicoMem.vsram[0x26+(~plane_sh&1)] << 16; + DrawStripVSRamForced(&ts, plane_sh, cellskip); + } else { + vscroll = PicoMem.vsram[plane_sh & 1]; // Get vertical scroll value + + // Find the line in the name table + ts.line=(vscroll+est->DrawScanline)&ymask; + ts.nametab+=(ts.line>>3)<>9)&0x30; - if (code&0x800) fTileFunc = TileFlipSH_and; - else fTileFunc = TileNormSH_and; + if (code&0x800) fTileFunc = TileFlipSH_AS_and; + else fTileFunc = TileNormSH_AS_and; // parse remaining sprite data sy=sprite[0]; @@ -1087,7 +1237,8 @@ static void DrawSpritesForced(unsigned char *sprited) delta<<=4; // Delta of address if (entry+1 == cnt) width = p[entry+1]; // last sprite width limited? 
- for (; width; width--,sx+=8,tile+=delta) + mp = mb+(sx>>3); + for (m = *mp; width; width--, sx+=8, *mp++ = m, m >>= 8, tile+=delta) { unsigned int pack; @@ -1095,10 +1246,25 @@ static void DrawSpritesForced(unsigned char *sprited) if(sx>=328) break; // Offscreen pack = *(unsigned int *)(PicoMem.vram + (tile & 0x7fff)); - fTileFunc(pd + sx, pack, pal); - } + + m |= mp[1] << 8; // next mask byte + // shift mask bits to bits 8-15 for easier load/store handling + m = fTileFunc(pd + sx, m << (8-(sx&0x7)), pack, pal) >> (8-(sx&0x7)); + } + *mp = m; // write last mask byte } + + // anything not covered by a sprite is off (XXX or bg?) + for (cnt = 1; cnt < sizeof(mb)-1; cnt++) + if (mb[cnt] == 0xff) + for (m = 0; m < 8; m++) + pd[8*cnt+m] = 0; + else if (mb[cnt]) + for (m = 0; m < 8; m++) + if (mb[cnt] & (1<debug_p & PVD_KILL_B)) { lflags = LF_PLANE_1 | (sh << 1); - if (pvid->debug_p & PVD_FORCE_B) - lflags |= LF_FORCE; DrawLayer(lflags, HighCacheB, 0, maxcells, est); } /* - layer A low - */ lflags = 0 | (sh << 1); - if (pvid->debug_p & PVD_FORCE_A) - lflags |= LF_FORCE; if (pvid->debug_p & PVD_KILL_A) ; else if (hvwind == 1) @@ -1516,12 +1678,16 @@ static int DrawDisplay(int sh) else if (sprited[1] & SPRL_HAVE_HI) DrawAllSprites(sprited, 1, 0, est); - if (pvid->debug_p & PVD_FORCE_B) - DrawTilesFromCacheForced(HighCacheB); - else if (pvid->debug_p & PVD_FORCE_A) - DrawTilesFromCacheForced(HighCacheA); - else if (pvid->debug_p & PVD_FORCE_S) +#ifdef FORCE + if (pvid->debug_p & PVD_FORCE_B) { + lflags = LF_PLANE_1 | (sh << 1); + DrawLayerForced(lflags, 0, maxcells, est); + } else if (pvid->debug_p & PVD_FORCE_A) { + lflags = (sh << 1); + DrawLayerForced(lflags, 0, maxcells, est); + } else if (pvid->debug_p & PVD_FORCE_S) DrawSpritesForced(sprited); +#endif #if 0 { diff --git a/pico/draw_arm.S b/pico/draw_arm.S index 8dc660c25..1a0f35133 100644 --- a/pico/draw_arm.S +++ b/pico/draw_arm.S @@ -545,8 +545,12 @@ DrawLayer: eor r3, r3, r7 sub r10,r10, #1<<24 @ cell-- // start 
from negative for hscroll tst r3, #0x08 + add_c24 r1, lr, (OFS_PMEM_vsram-OFS_PMEM_vram) + ldr r3, [r1, #0x4c] @ r3=vsram[0x26..0x27] subne r10,r10, #1<<16 @ cells-- subne r10,r10, #1<<24 @ cell-- // even more negative + ror r3, r3, #16 + str r3, [r1, #0x7c] @ vsram[0x3e..0x3f]=r3 0: tst r9, #1<<31 mov r3, #0 @@ -577,8 +581,8 @@ DrawLayer: @ calc offset and read tileline code to r7, also calc ty add_c24 r7, lr, (OFS_PMEM_vsram-OFS_PMEM_vram) - add r7, r7, r10,asr #23 @ vsram + ((cell&~1)<<1) - bic r7, r7, #3 + and r4, r10, #0x3e000000 + add r7, r7, r4, asr #23 @ vsram + ((cell&0x3e)<<1) tst r10,#0x8000 @ plane1? addne r7, r7, #2 ldrh r7, [r7] @ r7=vscroll From 4a6709352567c3ef299243173c05e667a98a5813 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 27 Mar 2020 19:09:05 +0100 Subject: [PATCH 130/174] ARM asm, symbol visibility fix --- pico/cd/memory_arm.S | 2 ++ pico/memory.h | 9 +++++++++ pico/memory_arm.S | 2 ++ 3 files changed, 13 insertions(+) diff --git a/pico/cd/memory_arm.S b/pico/cd/memory_arm.S index 95ad09ff3..0d1369ee8 100644 --- a/pico/cd/memory_arm.S +++ b/pico/cd/memory_arm.S @@ -703,8 +703,10 @@ m_s68k_write16_regs_spec: @ special case .global s68k_read8 .global s68k_read16 +.global s68k_read32 .global s68k_write8 .global s68k_write16 +.global s68k_write32 s68k_read8: PIC_LDR(r3, r2, s68k_read8_map) diff --git a/pico/memory.h b/pico/memory.h index d55267ba1..eba234712 100644 --- a/pico/memory.h +++ b/pico/memory.h @@ -25,8 +25,17 @@ typedef void (cpu68k_write_f)(u32 a, u32 d); extern u32 m68k_read8(u32 a); extern u32 m68k_read16(u32 a); +extern u32 m68k_read32(u32 a); extern void m68k_write8(u32 a, u8 d); extern void m68k_write16(u32 a, u16 d); +extern void m68k_write32(u32 a, u32 d); + +extern u32 s68k_read8(u32 a); +extern u32 s68k_read16(u32 a); +extern u32 s68k_read32(u32 a); +extern void s68k_write8(u32 a, u8 d); +extern void s68k_write16(u32 a, u16 d); +extern void s68k_write32(u32 a, u32 d); // z80 #define Z80_MEM_SHIFT 13 diff --git 
a/pico/memory_arm.S b/pico/memory_arm.S index ebeb346b7..607006ced 100644 --- a/pico/memory_arm.S +++ b/pico/memory_arm.S @@ -227,8 +227,10 @@ m_write16_not_z80ctl: .global m68k_read8 .global m68k_read16 +.global m68k_read32 .global m68k_write8 .global m68k_write16 +.global m68k_write32 m68k_read8: PIC_LDR(r3, r2, m68k_read8_map) From de206a43d52219e0efecc6eaf543152022857541 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 27 Mar 2020 19:22:19 +0100 Subject: [PATCH 131/174] vdp rendering, fix for CD (sprites from WORD RAM) --- pico/cd/gfx_dma.c | 2 +- pico/pico_int.h | 18 +++++++++++++++++ pico/videoport.c | 50 +++++++++++++++-------------------------------- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/pico/cd/gfx_dma.c b/pico/cd/gfx_dma.c index ff93a2dc0..354fc2136 100644 --- a/pico/cd/gfx_dma.c +++ b/pico/cd/gfx_dma.c @@ -28,7 +28,7 @@ PICO_INTERNAL void DmaSlowCell(unsigned int source, unsigned int a, int len, uns asrc = cell_map(source >> 2) << 2; asrc |= source & 2; // if(a&1) d=(d<<8)|(d>>8); // ?? 
- r[a>>1] = *(u16 *)(base + asrc); + VideoWriteVRAM(a, *(u16 *)(base + asrc)); source += 2; // AutoIncrement a=(u16)(a+inc); diff --git a/pico/pico_int.h b/pico/pico_int.h index c0f2c3437..5fed483dc 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -849,6 +849,24 @@ void ym2612_unpack_state(void); // videoport.c +extern unsigned SATaddr, SATmask; +static __inline void UpdateSAT(u32 a, u32 d) +{ + unsigned num = (a-SATaddr) >> 3; + + Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; + if (!(a & 4) && num < 128) { + ((u16 *)&VdpSATCache[num])[(a&3) >> 1] = d; + } +} +static __inline void VideoWriteVRAM(u32 a, u16 d) +{ + PicoMem.vram [(u16)a >> 1] = d; + + if (!((u16)(a^SATaddr) & SATmask)) + UpdateSAT(a, d); +} + PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d); PICO_INTERNAL_ASM unsigned int PicoVideoRead(unsigned int a); unsigned char PicoVideoRead8DataH(void); diff --git a/pico/videoport.c b/pico/videoport.c index 3ed7f5b4c..bb79c09f2 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -19,8 +19,8 @@ extern const unsigned short vdpsl2cyc_32_bl[], vdpsl2cyc_40_bl[]; extern const unsigned short vdpsl2cyc_32[], vdpsl2cyc_40[]; static int blankline; // display disabled for this line -static unsigned sat; // VRAM addr of sprite attribute table -static int satxbits; // index bits in SAT address + +unsigned SATaddr, SATmask; // VRAM addr of sprite attribute table int (*PicoDmaHook)(unsigned int source, int len, unsigned short **base, unsigned int *mask) = NULL; @@ -303,34 +303,16 @@ static __inline void AutoIncrement(void) if (Pico.video.addr < Pico.video.reg[0xf]) Pico.video.addr_u ^= 1; } -static __inline void UpdateSAT(u32 a, u32 d) -{ - unsigned num = (a-sat) >> 3; - - Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; - if (!(a & 4) && num < 128) { - ((u16 *)&VdpSATCache[num])[(a&3) >> 1] = d; - } -} - static NOINLINE void VideoWriteVRAM128(u32 a, u16 d) { // nasty u32 b = ((a & 2) >> 1) | ((a & 0x400) >> 9) | (a & 0x3FC) | ((a & 0x1F800) >> 1); 
((u8 *)PicoMem.vram)[b] = d; - if (!((u16)(b^sat) >> satxbits)) + if (!((u16)(b^SATaddr) & SATmask)) Pico.est.rendstatus |= PDRAW_DIRTY_SPRITES; - if (!((u16)(a^sat) >> satxbits)) - UpdateSAT(a, d); -} - -static void VideoWriteVRAM(u32 a, u16 d) -{ - PicoMem.vram [(u16)a >> 1] = d; - - if (!((u16)(a^sat) >> satxbits)) + if (!((u16)(a^SATaddr) & SATmask)) UpdateSAT(a, d); } @@ -461,7 +443,7 @@ static void DmaSlow(int len, unsigned int source) r = PicoMem.vram; if (inc == 2 && !(a & 1) && (a >> 16) == ((a + len*2) >> 16) && (source & ~mask) == ((source + len-1) & ~mask) && - (a << 16 >= (sat+0x280) << 16 || (a + len*2) << 16 <= sat << 16)) + (a << 16 >= (SATaddr+0x280)<<16 || (a + len*2) << 16 <= SATaddr<<16)) { // most used DMA mode memcpy((char *)r + a, base + (source & mask), len * 2); @@ -540,7 +522,7 @@ static void DmaCopy(int len) for (; len; len--) { vr[(u16)a] = vr[(u16)(source++)]; - if (!((u16)(a^sat) >> satxbits)) + if (!((u16)(a^SATaddr) & SATmask)) UpdateSAT(a, ((u16 *)vr)[(u16)a >> 1]); // AutoIncrement a = (a+inc) & ~0x20000; @@ -572,7 +554,7 @@ static NOINLINE void DmaFill(int data) // Write upper byte to adjacent address // (here we are byteswapped, so address is already 'adjacent') vr[(u16)a] = high; - if (!((u16)(a^sat) >> satxbits)) + if (!((u16)(a^SATaddr) & SATmask)) UpdateSAT(a, ((u16 *)vr)[(u16)a >> 1]); // Increment address register @@ -803,11 +785,11 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) default: return; } - sat = ((pvid->reg[5]&0x7f) << 9) | ((pvid->reg[6]&0x20) << 11); - satxbits = 9; + SATaddr = ((pvid->reg[5]&0x7f) << 9) | ((pvid->reg[6]&0x20) << 11); + SATmask = ~0x1ff; if (Pico.video.reg[12]&1) - sat &= ~0x200, satxbits = 10; // H40, zero lowest SAT bit - //elprintf(EL_STATUS, "spritep moved to %04x", sat); + SATaddr &= ~0x200, SATmask &= ~0x200; // H40, zero lowest SAT bit + //elprintf(EL_STATUS, "spritep moved to %04x", SATaddr); return; update_irq: @@ -1013,15 +995,15 @@ void PicoVideoLoad(void) 
Pico.m.dma_xfers = 0; } - sat = ((pv->reg[5]&0x7f) << 9) | ((pv->reg[6]&0x20) << 11); - satxbits = 9; + SATaddr = ((pv->reg[5]&0x7f) << 9) | ((pv->reg[6]&0x20) << 11); + SATmask = ~0x1ff; if (pv->reg[12]&1) - sat &= ~0x200, satxbits = 10; // H40, zero lowest SAT bit + SATaddr &= ~0x200, SATmask &= ~0x200; // H40, zero lowest SAT bit // rebuild SAT cache XXX wrong since cache and memory can differ for (l = 0; l < 80; l++) { - *((u16 *)VdpSATCache + 2*l ) = PicoMem.vram[(sat>>1) + l*4 ]; - *((u16 *)VdpSATCache + 2*l+1) = PicoMem.vram[(sat>>1) + l*4 + 1]; + *((u16 *)VdpSATCache + 2*l ) = PicoMem.vram[(SATaddr>>1) + l*4 ]; + *((u16 *)VdpSATCache + 2*l+1) = PicoMem.vram[(SATaddr>>1) + l*4 + 1]; } } From 05a7e6f4fb140f59ef309fbb2089c9fbdbadf9c2 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 27 Mar 2020 19:25:20 +0100 Subject: [PATCH 132/174] vdp rendering fixes --- pico/draw.c | 12 +++++++++--- pico/draw_arm.S | 11 ++++++++--- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/pico/draw.c b/pico/draw.c index babf40069..a18ec8d7c 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -486,7 +486,10 @@ static void DrawLayer(int plane_sh, int *hcache, int cellskip, int maxcells, // shit, we have 2-cell column based vscroll // luckily this doesn't happen too often ts.line=ymask|(shift[width]<<24); // save some stuff instead of line - plane_sh |= PicoMem.vsram[0x26+(~plane_sh&1)] << 16; + // vscroll value for leftmost cells in case of hscroll not on 16px boundary + // XXX it's unclear what exactly the hw is doing. Continue reading where it + // stopped last seems to work best (H40: 0x50 (wrap->0x00), H32 0x40). 
+ plane_sh |= PicoMem.vsram[(pvid->reg[12]&1?0x00:0x20) + (plane_sh&1)] << 16; DrawStripVSRam(&ts, plane_sh, cellskip); } else { vscroll = PicoMem.vsram[plane_sh & 1]; // Get vertical scroll value @@ -1173,7 +1176,10 @@ static void DrawLayerForced(int plane_sh, int cellskip, int maxcells, // shit, we have 2-cell column based vscroll // luckily this doesn't happen too often ts.line=ymask|(shift[width]<<24); // save some stuff instead of line - plane_sh |= PicoMem.vsram[0x26+(~plane_sh&1)] << 16; + // vscroll value for leftmost cells in case of hscroll not on 16px boundary + // XXX it's unclear what exactly the hw is doing. Continue reading where it + // stopped last seems to work best (H40: 0x50 (wrap->0x00), H32 0x40). + plane_sh |= PicoMem.vsram[(pvid->reg[12]&1?0x00:0x20) + (plane_sh&1)] << 16; DrawStripVSRamForced(&ts, plane_sh, cellskip); } else { vscroll = PicoMem.vsram[plane_sh & 1]; // Get vertical scroll value @@ -1191,7 +1197,7 @@ static void DrawSpritesForced(unsigned char *sprited) { unsigned (*fTileFunc)(unsigned char *pd, unsigned m, unsigned int pack, int pal); unsigned char *pd = Pico.est.HighCol; - unsigned char mb[1+320+1]; + unsigned char mb[1+320/8+1]; unsigned char *p, *mp; unsigned m; int entry, cnt; diff --git a/pico/draw_arm.S b/pico/draw_arm.S index 1a0f35133..2ae6dba63 100644 --- a/pico/draw_arm.S +++ b/pico/draw_arm.S @@ -523,6 +523,9 @@ DrawLayer: @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ .DrawStrip_vsscroll: + tst r8, #1 @ if h40: lflags |= 0x10000 + orrne r0, r0, #0x10000 + rsb r8, r3, #0 mov r8, r8, lsr #3 @ r8=tilex=(-ts->hscroll)>>3 bic r8, r8, #0x3fc00000 @@ -545,11 +548,13 @@ DrawLayer: eor r3, r3, r7 sub r10,r10, #1<<24 @ cell-- // start from negative for hscroll tst r3, #0x08 - add_c24 r1, lr, (OFS_PMEM_vsram-OFS_PMEM_vram) - ldr r3, [r1, #0x4c] @ r3=vsram[0x26..0x27] subne r10,r10, #1<<16 @ cells-- subne r10,r10, #1<<24 @ cell-- // even more negative - ror r3, r3, #16 + + add_c24 r1, lr, (OFS_PMEM_vsram-OFS_PMEM_vram) + tst r0, 
#0x10000 @ h40? + ldrne r3, [r1, #0x00] @ r3=vsram[0x00..0x01] + ldreq r3, [r1, #0x40] @ r3=vsram[0x20..0x21] str r3, [r1, #0x7c] @ vsram[0x3e..0x3f]=r3 0: tst r9, #1<<31 From 209db31bb44b8a585b5ecca0d23c08c683a63987 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 27 Mar 2020 19:27:05 +0100 Subject: [PATCH 133/174] fix for 68K cycle accounting --- pico/pico_cmn.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pico/pico_cmn.c b/pico/pico_cmn.c index 017c404b7..8863bb39c 100644 --- a/pico/pico_cmn.c +++ b/pico/pico_cmn.c @@ -35,6 +35,7 @@ static void SekExecM68k(int cyc_do) #elif defined(EMU_F68K) Pico.t.m68c_cnt += fm68k_emulate(&PicoCpuFM68k, cyc_do, 0) - cyc_do; #endif + SekCyclesLeft = 0; } static void SekSyncM68k(void) @@ -46,8 +47,6 @@ static void SekSyncM68k(void) while ((cyc_do = Pico.t.m68c_aim - Pico.t.m68c_cnt) > 0) SekExecM68k(cyc_do); - SekCyclesLeft = 0; - SekTrace(0); pevt_log_m68k_o(EVT_RUN_END); pprof_end(m68k); From 8095361c2b5d3f2a792e784e33d4a8d1bd3ed0d8 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 27 Mar 2020 19:32:45 +0100 Subject: [PATCH 134/174] vdp fifo speed optimization --- pico/videoport.c | 228 ++++++++++++++++++++++++++--------------------- 1 file changed, 125 insertions(+), 103 deletions(-) diff --git a/pico/videoport.c b/pico/videoport.c index bb79c09f2..401190e09 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -57,125 +57,142 @@ int (*PicoDmaHook)(unsigned int source, int len, unsigned short **base, unsigned */ // NB code assumes fifo_* arrays have size 2^n -// last transferred FIFO data, ...x = index XXX currently only CPU -static short fifo_data[4], fifo_dx; // XXX must go into save? +static struct VdpFIFO { // XXX this must go into save file! 
+ // last transferred FIFO data, ...x = index XXX currently only CPU + unsigned short fifo_data[4], fifo_dx; -// queued FIFO transfers, ...x = index, ...l = queue length -// each entry has 2 values: [n]>>3 = #writes, [n]&7 = flags -static int fifo_queue[8], fifo_qx, fifo_ql; // XXX must go into save? -enum { FQ_BYTE = 1, FQ_BGDMA = 2, FQ_FGDMA = 4 }; // queue flags, NB: BYTE = 1! -static unsigned int fifo_total; // total# of pending FIFO entries (w/o BGDMA) + // queued FIFO transfers, ...x = index, ...l = queue length + // each entry has 2 values: [n]>>3 = #writes, [n]&7 = flags (FQ_*) + unsigned int fifo_queue[8], fifo_qx, fifo_ql; + unsigned int fifo_total; // total# of pending FIFO entries (w/o BGDMA) + + unsigned short fifo_slot; // last executed slot in current scanline + unsigned short fifo_maxslot;// #slots in scanline -static unsigned short fifo_slot; // last executed slot in current scanline -static unsigned short fifo_maxslot;// #slots in scanline + const unsigned char *fifo_cyc2sl; + const unsigned short *fifo_sl2cyc; +} VdpFIFO; -static const unsigned char *fifo_cyc2sl; -static const unsigned short *fifo_sl2cyc; +enum { FQ_BYTE = 1, FQ_BGDMA = 2, FQ_FGDMA = 4 }; // queue flags, NB: BYTE = 1! // do the FIFO math -static __inline int AdvanceFIFOEntry(struct PicoVideo *pv, int slots) +static __inline int AdvanceFIFOEntry(struct VdpFIFO *vf, struct PicoVideo *pv, int slots) { - int l = slots, b = fifo_queue[fifo_qx] & FQ_BYTE; + int l = slots, b = vf->fifo_queue[vf->fifo_qx] & FQ_BYTE; + int cnt = pv->fifo_cnt; // advance currently active FIFO entry - if (l > pv->fifo_cnt) - l = pv->fifo_cnt; - if (!(fifo_queue[fifo_qx] & FQ_BGDMA)) - fifo_total -= ((pv->fifo_cnt & b) + l) >> b; - pv->fifo_cnt -= l; + if (l > cnt) + l = cnt; + if (!(vf->fifo_queue[vf->fifo_qx] & FQ_BGDMA)) + vf->fifo_total -= ((cnt & b) + l) >> b; + cnt -= l; // if entry has been processed... 
- if (pv->fifo_cnt == 0) { + if (cnt == 0) { // remove entry from FIFO - if (fifo_ql) - fifo_qx ++, fifo_qx &= 7, fifo_ql --; + if (vf->fifo_ql) + vf->fifo_qx = (vf->fifo_qx+1) & 7, vf->fifo_ql --; // start processing for next entry if there is one - if (fifo_ql) - pv->fifo_cnt = (fifo_queue[fifo_qx] >> 3) << (fifo_queue[fifo_qx] & FQ_BYTE); - else { // FIFO empty + if (vf->fifo_ql) { + b = vf->fifo_queue[vf->fifo_qx] & FQ_BYTE; + cnt = (vf->fifo_queue[vf->fifo_qx] >> 3) << b; + } else { // FIFO empty pv->status &= ~PVS_FIFORUN; - fifo_total = 0; + vf->fifo_total = 0; } } + + pv->fifo_cnt = cnt; return l; } -static __inline void SetFIFOState(struct PicoVideo *pv) +static __inline void SetFIFOState(struct VdpFIFO *vf, struct PicoVideo *pv) { + unsigned int st = pv->status, cmd = pv->command; // release CPU and terminate DMA if FIFO isn't blocking the 68k anymore - if (fifo_total <= 4) { - pv->status &= ~PVS_CPUWR; - if (!(pv->status & (PVS_DMABG|PVS_DMAFILL))) { - pv->status &= ~SR_DMA; - pv->command &= ~0x80; + if (vf->fifo_total <= 4) { + st &= ~PVS_CPUWR; + if (!(st & (PVS_DMABG|PVS_DMAFILL))) { + st &= ~SR_DMA; + cmd &= ~0x80; } } - if (fifo_total == 0) { - pv->status &= ~PVS_CPURD; + if (pv->fifo_cnt == 0) { + st &= ~PVS_CPURD; // terminate DMA if applicable - if (!(pv->status & (PVS_FIFORUN|PVS_DMAFILL))) { - pv->status &= ~(SR_DMA|PVS_DMABG); - pv->command &= ~0x80; + if (!(st & (PVS_FIFORUN|PVS_DMAFILL))) { + st &= ~(SR_DMA|PVS_DMABG); + cmd &= ~0x80; } } + pv->status = st; + pv->command = cmd; } // sync FIFO to cycles void PicoVideoFIFOSync(int cycles) { + struct VdpFIFO *vf = &VdpFIFO; struct PicoVideo *pv = &Pico.video; int slots, done; // calculate #slots since last executed slot - slots = fifo_cyc2sl[cycles>>1] - fifo_slot; + slots = vf->fifo_cyc2sl[cycles>>1] - vf->fifo_slot; // advance FIFO queue by #done slots done = slots; while (done > 0 && pv->fifo_cnt) { - int l = AdvanceFIFOEntry(pv, done); - fifo_slot += l; + int l = AdvanceFIFOEntry(vf, pv, 
done); + vf->fifo_slot += l; done -= l; } if (done != slots) - SetFIFOState(pv); + SetFIFOState(vf, pv); } // drain FIFO, blocking 68k on the way. FIFO must be synced prior to drain. -int PicoVideoFIFODrain(int level, int cycles, int bgdma) +static int PicoVideoFIFODrain(int level, int cycles, int bgdma) { + struct VdpFIFO *vf = &VdpFIFO; struct PicoVideo *pv = &Pico.video; unsigned ocyc = cycles; int burn = 0; +//int osl = fifo_slot; // process FIFO entries until low level is reached - while (fifo_total > level && fifo_slot < fifo_maxslot && - (!(fifo_queue[fifo_qx] & FQ_BGDMA) || bgdma)) { - int b = fifo_queue[fifo_qx] & FQ_BYTE; - int cnt = ((fifo_total-level) << b) - (pv->fifo_cnt & b); - int slot = (pv->fifo_cntfifo_cnt:cnt) + fifo_slot; // target slot - - if (slot > fifo_maxslot) { - // target in later scanline, advance to eol - slot = fifo_maxslot; + while (vf->fifo_slot < vf->fifo_maxslot && cycles < 488 && + (vf->fifo_total > level || (vf->fifo_queue[vf->fifo_qx] & bgdma))) { + int b = vf->fifo_queue[vf->fifo_qx] & FQ_BYTE; + int cnt = bgdma ? 
pv->fifo_cnt : ((vf->fifo_total-level)<fifo_cnt&b); + int slot = (pv->fifo_cntfifo_cnt:cnt) + vf->fifo_slot; + + if (slot > vf->fifo_maxslot) { + // target slot in later scanline, advance to eol + slot = vf->fifo_maxslot; cycles = 488; } else { // advance FIFO to target slot and CPU to cycles at that slot - cycles = fifo_sl2cyc[slot]<<1; + cycles = vf->fifo_sl2cyc[slot]<<1; + } + if (slot > vf->fifo_slot) { + AdvanceFIFOEntry(vf, pv, slot - vf->fifo_slot); + vf->fifo_slot = slot; } - AdvanceFIFOEntry(pv, slot - fifo_slot); - fifo_slot = slot; } - burn = cycles - ocyc; + if (cycles > ocyc) + burn = cycles - ocyc; - SetFIFOState(pv); + SetFIFOState(vf, pv); return burn; } // read VDP data port -int PicoVideoFIFORead(void) +static int PicoVideoFIFORead(void) { + struct VdpFIFO *vf = &VdpFIFO; struct PicoVideo *pv = &Pico.video; int lc = SekCyclesDone()-Pico.t.m68c_line_start; int burn = 0; @@ -183,16 +200,16 @@ int PicoVideoFIFORead(void) if (pv->fifo_cnt) { PicoVideoFIFOSync(lc); // advance FIFO and CPU until FIFO is empty - burn = PicoVideoFIFODrain(0, lc, 1); + burn = PicoVideoFIFODrain(0, lc, FQ_BGDMA); lc += burn; } - if (fifo_total > 0) + if (pv->fifo_cnt) pv->status |= PVS_CPURD; // target slot is in later scanline else { // use next VDP access slot for reading, block 68k until then - fifo_slot = fifo_cyc2sl[lc>>1] + 1; - burn += (fifo_sl2cyc[fifo_slot]<<1) - lc; + vf->fifo_slot = vf->fifo_cyc2sl[lc>>1] + 1; + burn += (vf->fifo_sl2cyc[vf->fifo_slot]<<1) - lc; } return burn; @@ -201,50 +218,51 @@ int PicoVideoFIFORead(void) // write VDP data port int PicoVideoFIFOWrite(int count, int flags, unsigned sr_mask,unsigned sr_flags) { + struct VdpFIFO *vf = &VdpFIFO; struct PicoVideo *pv = &Pico.video; int lc = SekCyclesDone()-Pico.t.m68c_line_start; - int burn = 0, x, head = 0; + int burn = 0; if (pv->fifo_cnt) PicoVideoFIFOSync(lc); pv->status = (pv->status & ~sr_mask) | sr_flags; - if (count && fifo_ql < 8) { - // update FIFO state if it was empty - if (fifo_ql == 
0) { - fifo_slot = fifo_cyc2sl[(lc+8)>>1]; // FIFO latency ~3 vdp slots - pv->fifo_cnt = count << (flags & FQ_BYTE); - pv->status |= PVS_FIFORUN; - } - + if (count && vf->fifo_ql < 8) { // determine queue position for entry - x = (fifo_qx + fifo_ql - 1) & 7; - if (fifo_ql && (fifo_queue[x] & FQ_BGDMA)) { + int x = (vf->fifo_qx + vf->fifo_ql - 1) & 7; + if (unlikely(vf->fifo_ql && (vf->fifo_queue[x] & FQ_BGDMA))) { // CPU FIFO writes have priority over a background DMA Fill/Copy - fifo_queue[(x+1) & 7] = fifo_queue[x]; - if (x == fifo_qx) { // overtaking to queue head? - // XXX if interrupting a DMA fill, fill data changes - int f = fifo_queue[x] & 7; - fifo_queue[(x+1) & 7] = (pv->fifo_cnt >> (f & FQ_BYTE) << 3) | f; - pv->fifo_cnt = count << (flags & FQ_BYTE); - head = 1; - } + // XXX if interrupting a DMA fill, fill data changes + if (x == vf->fifo_qx) { // overtaking to queue head? + int f = vf->fifo_queue[x] & 7; + vf->fifo_queue[(x+1) & 7] = (pv->fifo_cnt >> (f & FQ_BYTE) << 3) | f; + pv->status &= ~PVS_FIFORUN; + } else + // push background DMA back + vf->fifo_queue[(x+1) & 7] = vf->fifo_queue[x]; x = (x-1) & 7; } - // create xfer queue entry - if (fifo_ql && !head && (fifo_queue[x] & 7) == flags) { + if ((pv->status & PVS_FIFORUN) && (vf->fifo_queue[x] & 7) == flags) { // amalgamate entries if of same type - fifo_queue[x] += (count << 3); - if (x == fifo_qx) // modifiying fifo head, adjust count + vf->fifo_queue[x] += (count << 3); + if (x == vf->fifo_qx) pv->fifo_cnt += count << (flags & FQ_BYTE); } else { - fifo_ql ++; + // create new xfer queue entry + vf->fifo_ql ++; x = (x+1) & 7; - fifo_queue[x] = (count << 3) | flags; + vf->fifo_queue[x] = (count << 3) | flags; + } + + // update FIFO state if it was empty + if (!(pv->status & PVS_FIFORUN)) { + vf->fifo_slot = vf->fifo_cyc2sl[(lc+8)>>1]; // FIFO latency ~3 vdp slots + pv->status |= PVS_FIFORUN; + pv->fifo_cnt = count << (flags & FQ_BYTE); } if (!(flags & FQ_BGDMA)) - fifo_total += count; + 
vf->fifo_total += count; } // if CPU is waiting for the bus, advance CPU and FIFO until bus is free @@ -257,11 +275,12 @@ int PicoVideoFIFOWrite(int count, int flags, unsigned sr_mask,unsigned sr_flags) // at HINT, advance FIFO to new scanline int PicoVideoFIFOHint(void) { + struct VdpFIFO *vf = &VdpFIFO; struct PicoVideo *pv = &Pico.video; int burn = 0; // reset slot to start of scanline - fifo_slot = 0; + vf->fifo_slot = 0; // if CPU is waiting for the bus, advance CPU and FIFO until bus is free if (pv->status & PVS_CPURD) @@ -280,18 +299,19 @@ void PicoVideoFIFOMode(int active, int h40) static const unsigned short *vdpsl2cyc[2][2] = { {vdpsl2cyc_32_bl, vdpsl2cyc_40_bl} , {vdpsl2cyc_32, vdpsl2cyc_40} }; + struct VdpFIFO *vf = &VdpFIFO; struct PicoVideo *pv = &Pico.video; int lc = SekCyclesDone() - Pico.t.m68c_line_start; active = active && !(pv->status & PVS_VB2); - if (fifo_maxslot) + if (vf->fifo_maxslot) PicoVideoFIFOSync(lc); - fifo_cyc2sl = vdpcyc2sl[active][h40]; - fifo_sl2cyc = vdpsl2cyc[active][h40]; + vf->fifo_cyc2sl = vdpcyc2sl[active][h40]; + vf->fifo_sl2cyc = vdpsl2cyc[active][h40]; // recalculate FIFO slot for new mode - fifo_slot = fifo_cyc2sl[lc>>1]-1; - fifo_maxslot = fifo_cyc2sl[488>>1]; + vf->fifo_slot = vf->fifo_cyc2sl[lc>>1]-1; + vf->fifo_maxslot = vf->fifo_cyc2sl[488>>1]; } @@ -342,7 +362,7 @@ static void VideoWrite(u16 d) static unsigned int VideoRead(void) { - unsigned int a, d = fifo_data[(fifo_dx+1)&3]; + unsigned int a, d = VdpFIFO.fifo_data[(VdpFIFO.fifo_dx+1)&3]; a=Pico.video.addr; a>>=1; @@ -351,7 +371,6 @@ static unsigned int VideoRead(void) { case 0: d=PicoMem.vram [a & 0x7fff]; break; case 8: d=PicoMem.cram [a & 0x003f] | (d & ~0x0eee); break; - case 4: if ((a & 0x3f) >= 0x28) a = 0; d=PicoMem.vsram [a & 0x003f] | (d & ~0x07ff); break; case 12:a=PicoMem.vram [a & 0x7fff]; if (Pico.video.addr&1) a >>= 8; @@ -618,8 +637,9 @@ static NOINLINE void CommandDma(void) PicoVideoFIFOSync(SekCyclesDone()-Pico.t.m68c_line_start); if 
(pvid->status & SR_DMA) { elprintf(EL_VDPDMA, "Dma overlap, left=%d @ %06x", - fifo_total, SekPc); - pvid->fifo_cnt = fifo_total = fifo_ql = 0; + VdpFIFO.fifo_total, SekPc); + pvid->fifo_cnt = VdpFIFO.fifo_total = VdpFIFO.fifo_ql = 0; + pvid->status &= ~(PVS_FIFORUN|PVS_DMAFILL); } len = GetDmaLength(); @@ -704,7 +724,7 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) if (!(PicoIn.opt&POPT_DIS_VDP_FIFO)) { - fifo_data[++fifo_dx&3] = d; + VdpFIFO.fifo_data[++VdpFIFO.fifo_dx&3] = d; SekCyclesBurnRun(PicoVideoFIFOWrite(1, pvid->type == 1, 0, PVS_CPUWR)); elprintf(EL_ASVDP, "VDP data write: [%04x] %04x [%u] {%i} @ %06x", @@ -714,7 +734,7 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) // start DMA fill on write. NB VSRAM and CRAM fills use wrong FIFO data. if (pvid->status & PVS_DMAFILL) - DmaFill(fifo_data[(fifo_dx + !!(pvid->type&~0x81))&3]); + DmaFill(VdpFIFO.fifo_data[(VdpFIFO.fifo_dx + !!(pvid->type&~0x81))&3]); break; @@ -860,9 +880,9 @@ static u32 VideoSr(const struct PicoVideo *pv) d |= SR_HB; PicoVideoFIFOSync(c); - if (fifo_total >= 4) + if (VdpFIFO.fifo_total >= 4) d |= SR_FULL; - else if (!fifo_total) + else if (!VdpFIFO.fifo_total) d |= SR_EMPT; return d; } @@ -974,16 +994,18 @@ unsigned char PicoVideoRead8HV_L(void) void PicoVideoSave(void) { + struct VdpFIFO *vf = &VdpFIFO; struct PicoVideo *pv = &Pico.video; int l, x; // account for all outstanding xfers XXX kludge, entry attr's not saved - for (l = fifo_ql, x = fifo_qx + l-1; l > 1; l--, x--) - pv->fifo_cnt += (fifo_queue[x&7] >> 3) << (fifo_queue[x&7] & FQ_BYTE); + for (l = vf->fifo_ql, x = vf->fifo_qx + l-1; l > 1; l--, x--) + pv->fifo_cnt += (vf->fifo_queue[x&7] >> 3) << (vf->fifo_queue[x&7] & FQ_BYTE); } void PicoVideoLoad(void) { + struct VdpFIFO *vf = &VdpFIFO; struct PicoVideo *pv = &Pico.video; int l; @@ -991,7 +1013,7 @@ void PicoVideoLoad(void) if (Pico.m.dma_xfers) { pv->status = SR_DMA|PVS_FIFORUN; pv->fifo_cnt = Pico.m.dma_xfers * 
(pv->type == 1 ? 2 : 1); - fifo_total = Pico.m.dma_xfers; + vf->fifo_total = Pico.m.dma_xfers; Pico.m.dma_xfers = 0; } From a3c38fbde79a1f998fbc5983fbe3e256bdde7463 Mon Sep 17 00:00:00 2001 From: kub Date: Mon, 30 Mar 2020 23:54:11 +0200 Subject: [PATCH 135/174] fix for gp2x audio regression --- pico/memory.c | 2 +- pico/sound/sound.c | 27 ++++++++++++++------------- platform/gp2x/940ctl.c | 6 +++--- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/pico/memory.c b/pico/memory.c index d61491c14..0fa7b8de0 100644 --- a/pico/memory.c +++ b/pico/memory.c @@ -1057,11 +1057,11 @@ static int ym2612_write_local(u32 a, u32 d, int is_from_z80) break; } - PsndDoFM(get_scanline(is_from_z80)); #ifdef __GP2X__ if (PicoIn.opt & POPT_EXT_FM) return YM2612Write_940(a, d, get_scanline(is_from_z80)); #endif + PsndDoFM(get_scanline(is_from_z80)); return YM2612Write_(a, d); } diff --git a/pico/sound/sound.c b/pico/sound/sound.c index 688812829..f84309470 100644 --- a/pico/sound/sound.c +++ b/pico/sound/sound.c @@ -267,8 +267,8 @@ PICO_INTERNAL void PsndDoFM(int line_to) // Q16, number of samples since last call len = ((line_to-1) * Pico.snd.fm_mult) - Pico.snd.fm_pos; - // don't do this too often (no more than 256 per sec) - if (len >> 16 <= PicoIn.sndRate >> 9) + // don't do this too often (about every 4th scanline) + if (len >> 16 <= PicoIn.sndRate >> 12) return; // update position and calculate buffer offset and length @@ -355,22 +355,22 @@ static int PsndRender(int offset, int length) { int *buf32; int stereo = (PicoIn.opt & 8) >> 3; - int fmlen = ((Pico.snd.fm_pos+0x8000) >> 16) - offset; - int daclen = ((Pico.snd.dac_pos+0x80000) >> 20) - offset; + int fmlen = ((Pico.snd.fm_pos+0x8000) >> 16); + int daclen = ((Pico.snd.dac_pos+0x80000) >> 20); - offset <<= stereo; - buf32 = PsndBuffer+offset; + buf32 = PsndBuffer+(offset< 0) { short *dacbuf = PicoIn.sndOut + (daclen << stereo); + Pico.snd.dac_pos += (length-daclen) << 20; for (; length-daclen > 0; daclen++) { 
*dacbuf++ += Pico.snd.dac_val; if (stereo) dacbuf++; @@ -379,14 +379,15 @@ static int PsndRender(int offset, int length) // Add in parts of the FM buffer not yet done if (length-fmlen > 0) { - int *fmbuf = buf32 + (fmlen << stereo); + int *fmbuf = buf32 + ((fmlen-offset) << stereo); + Pico.snd.fm_pos += (length-fmlen) << 16; if (PicoIn.opt & POPT_EN_FM) YM2612UpdateOne(fmbuf, length-fmlen, stereo, 1); } // CD: PCM sound if (PicoIn.AHW & PAHW_MCD) { - pcd_pcm_update(buf32, length, stereo); + pcd_pcm_update(buf32, length-offset, stereo); } // CD: CDDA audio @@ -397,13 +398,13 @@ static int PsndRender(int offset, int length) { // note: only 44, 22 and 11 kHz supported, with forced stereo if (Pico_mcd->cdda_type == CT_MP3) - mp3_update(buf32, length, stereo); + mp3_update(buf32, length-offset, stereo); else - cdda_raw_update(buf32, length); + cdda_raw_update(buf32, length-offset); } if ((PicoIn.AHW & PAHW_32X) && (PicoIn.opt & POPT_EN_PWM)) - p32x_pwm_update(buf32, length, stereo); + p32x_pwm_update(buf32, length-offset, stereo); // Apply low pass filter, if required if (PicoIn.sndFilter == 1) { @@ -411,7 +412,7 @@ static int PsndRender(int offset, int length) } // convert + limit to normal 16bit output - PsndMix_32_to_16l(PicoIn.sndOut+offset, buf32, length); + PsndMix_32_to_16l(PicoIn.sndOut+(offset<writebuffsel ? shared_ctl->writebuff0 : shared_ctl->writebuff1; /* detect rapid ym updates */ - if (upd && !(writebuff_ptr & 0x80000000) && scanline < 224) + if (upd && !(writebuff_ptr & 0x80000000)) { - int mid = Pico.m.pal ? 68 : 93; - if (scanline > mid) { + int mid = (Pico.m.pal ? 
313 : 262) / 2; + if (scanline >= mid) { //printf("%05i:%03i: rapid ym\n", Pico.m.frame_count, scanline); writebuff[writebuff_ptr++ & 0xffff] = 0xfffe; writebuff_ptr |= 0x80000000; From f026daa6bd46c4242e37507bb3b8d9d428227e2c Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 2 Apr 2020 20:18:39 +0200 Subject: [PATCH 136/174] vdp DMA optimizations --- pico/videoport.c | 67 ++++++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/pico/videoport.c b/pico/videoport.c index 401190e09..dac74dc3a 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -163,7 +163,7 @@ static int PicoVideoFIFODrain(int level, int cycles, int bgdma) // process FIFO entries until low level is reached while (vf->fifo_slot < vf->fifo_maxslot && cycles < 488 && - (vf->fifo_total > level || (vf->fifo_queue[vf->fifo_qx] & bgdma))) { + ((vf->fifo_total > level) | (vf->fifo_queue[vf->fifo_qx] & bgdma))) { int b = vf->fifo_queue[vf->fifo_qx] & FQ_BYTE; int cnt = bgdma ? 
pv->fifo_cnt : ((vf->fifo_total-level)<fifo_cnt&b); int slot = (pv->fifo_cntfifo_cnt:cnt) + vf->fifo_slot; @@ -283,10 +283,10 @@ int PicoVideoFIFOHint(void) vf->fifo_slot = 0; // if CPU is waiting for the bus, advance CPU and FIFO until bus is free - if (pv->status & PVS_CPURD) - burn = PicoVideoFIFORead(); - else if (pv->status & PVS_CPUWR) + if (pv->status & PVS_CPUWR) burn = PicoVideoFIFOWrite(0, 0, 0, 0); + else if (pv->status & PVS_CPURD) + burn = PicoVideoFIFORead(); return burn; } @@ -458,27 +458,23 @@ static void DmaSlow(int len, unsigned int source) switch (Pico.video.type) { case 1: // vram -#if 0 r = PicoMem.vram; - if (inc == 2 && !(a & 1) && (a >> 16) == ((a + len*2) >> 16) && - (source & ~mask) == ((source + len-1) & ~mask) && - (a << 16 >= (SATaddr+0x280)<<16 || (a + len*2) << 16 <= SATaddr<<16)) + if (inc == 2 && !(a & 1) && (a & ~0xffff) == ((a + len*2-1) & ~0xffff) && + ((a >= SATaddr+0x280) | ((a + len*2-1) < SATaddr)) && + (source & ~mask) == ((source + len-1) & ~mask)) { // most used DMA mode memcpy((char *)r + a, base + (source & mask), len * 2); a += len * 2; + break; } - else -#endif + for(; len; len--) { - for(; len; len--) - { - u16 d = base[source++ & mask]; - if(a & 1) d=(d<<8)|(d>>8); - VideoWriteVRAM(a, d); - // AutoIncrement - a = (a+inc) & ~0x20000; - } + u16 d = base[source++ & mask]; + if(a & 1) d=(d<<8)|(d>>8); + VideoWriteVRAM(a, d); + // AutoIncrement + a = (a+inc) & ~0x20000; } break; @@ -569,6 +565,14 @@ static NOINLINE void DmaFill(int data) switch (Pico.video.type) { case 1: // vram + if (inc == 1 && (a & ~0xffff) == ((a + len-1) & ~0xffff) && + ((a >= SATaddr+0x280) | ((a + len-1) < SATaddr))) + { + // most used DMA mode + memset(vr + (u16)a, high, len); + a += len; + break; + } for (l = len; l; l--) { // Write upper byte to adjacent address // (here we are byteswapped, so address is already 'adjacent') @@ -662,9 +666,8 @@ static NOINLINE void CommandDma(void) Pico.video.reg[0x16] = source >> 8; } -static NOINLINE void 
CommandChange(void) +static NOINLINE void CommandChange(struct PicoVideo *pvid) { - struct PicoVideo *pvid = &Pico.video; unsigned int cmd, addr; cmd = pvid->command; @@ -718,7 +721,7 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) DrawSync(0); // XXX it's unclear when vscroll data is fetched from vsram? if (pvid->pending) { - CommandChange(); + CommandChange(pvid); pvid->pending=0; } @@ -749,7 +752,7 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d) pvid->command &= 0xffff0000; pvid->command |= d; pvid->pending = 0; - CommandChange(); + CommandChange(pvid); // Check for dma: if (d & 0x80) { DrawSync(SekCyclesDone() - Pico.t.m68c_line_start <= 488-390); @@ -896,7 +899,7 @@ PICO_INTERNAL_ASM unsigned int PicoVideoRead(unsigned int a) struct PicoVideo *pv = &Pico.video; unsigned int d = VideoSr(pv); if (pv->pending) { - CommandChange(); + CommandChange(pv); pv->pending = 0; } elprintf(EL_SR, "SR read: %04x [%u] @ %06x", d, SekCyclesDone(), SekPc); @@ -953,10 +956,11 @@ unsigned char PicoVideoRead8DataL(void) unsigned char PicoVideoRead8CtlH(void) { - u8 d = VideoSr(&Pico.video) >> 8; - if (Pico.video.pending) { - CommandChange(); - Pico.video.pending = 0; + struct PicoVideo *pv = &Pico.video; + u8 d = VideoSr(pv) >> 8; + if (pv->pending) { + CommandChange(pv); + pv->pending = 0; } elprintf(EL_SR, "SR read (h): %02x @ %06x", d, SekPc); return d; @@ -964,10 +968,11 @@ unsigned char PicoVideoRead8CtlH(void) unsigned char PicoVideoRead8CtlL(void) { - u8 d = VideoSr(&Pico.video); - if (Pico.video.pending) { - CommandChange(); - Pico.video.pending = 0; + struct PicoVideo *pv = &Pico.video; + u8 d = VideoSr(pv); + if (pv->pending) { + CommandChange(pv); + pv->pending = 0; } elprintf(EL_SR, "SR read (l): %02x @ %06x", d, SekPc); return d; From 314dba97977a4fc629800299880330045b208b15 Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 2 Apr 2020 20:33:56 +0200 Subject: [PATCH 137/174] ym2612 ARM, bug fixing and small optimizations 
--- pico/sound/ym2612_arm.S | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/pico/sound/ym2612_arm.S b/pico/sound/ym2612_arm.S index e3ec370d0..59abb74e2 100644 --- a/pico/sound/ym2612_arm.S +++ b/pico/sound/ym2612_arm.S @@ -15,6 +15,9 @@ #include "../arm_features.h" +@ very simple adaption YM2612 output rate to sample rate (~1M cycles @44100) +//#define INTERPOL + .equiv SLOT1, 0 .equiv SLOT2, 2 .equiv SLOT3, 1 @@ -42,10 +45,14 @@ @ r5=slot, r1=eg_cnt, trashes: r0,r2,r3 @ writes output to routp, but only if vol_out changes .macro update_eg_phase_slot slot +#if defined(INTERPOL) ldrh r0, [r5,#0x34] @ vol_out +#endif ldrb r2, [r5,#0x17] @ state add r3, r5, #0x1c +#if defined(INTERPOL) strh r0, [r5,#0x36] @ vol_ipol +#endif tst r2, r2 beq 0f @ EG_OFF @@ -59,11 +66,11 @@ bne 0f @ no volume change mov r3, r1, lsr r0 + ldrb r0, [r5,#0x30] @ ssg and r3, r3, #7 add r3, r3, r3, lsl #1 mov r3, r2, lsr r3 and r3, r3, #7 @ eg_inc_val shift, may be 0 - ldrb r0, [r5,#0x30] @ ssg ldrb r2, [r5,#0x17] @ state tst r0, #0x08 @ ssg enabled? @@ -124,8 +131,8 @@ b 11f 9: @ SSG-EG mode - cmp r2, #4 @ EG_ATT ldrh r0, [r5,#0x1a] @ volume, unsigned (0-1023) + cmp r2, #4 @ EG_ATT beq 4f cmp r0, #0x200 @ if ( volume < 0x200 ) @@ -170,9 +177,9 @@ strgeb r3, [r5,#0x17] @ state 10: @ finish - strh r0, [r5,#0x1a] @ volume ldrb r2, [r5,#0x30] @ ssg ldrb r3, [r5,#0x17] @ state + strh r0, [r5,#0x1a] @ volume cmp r2, #0x0c @ if ( ssg&0x04 && state > EG_REL ) cmpge r3, #EG_REL+1 rsbge r0, r0, #0x200 @ volume = (0x200-volume) & MAX_ATT @@ -206,10 +213,10 @@ .macro update_ssg_eg ldrh r0, [r5,#0x30] @ ssg+ssgn ldrb r2, [r5,#0x17] @ state - and r3, r0, #0x08 - cmp r3, #0x08 @ ssg enabled && ldrh r3, [r5,#0x1a] @ volume - cmpge r2, #EG_REL+1 @ state > EG_REL && + tst r0, #0x08 @ ssg enabled && + beq 9f + cmp r2, #EG_REL+1 @ state > EG_REL && cmpge r3, #0x200 @ volume >= 0x200? 
blt 9f @@ -222,7 +229,7 @@ orrne r0, r0, #0x400 @ ssgn = 4 strneh r0, [r5,#0x30] - eor r0, r0, #0x4 @ if ( !(ssg&0x04 ) + eor r0, r0, #0x4 @ if ( !(ssg&0x04) ) tst r0, #0x4 cmpne r2, #EG_ATT @ if ( state != EG_ATT ) movne r3, #0x400 @@ -747,7 +754,7 @@ eg_done: beq crl_loop @ output interpolation -#if 0 +#if 0 // too expensive on slow platforms @ basic interpolator, interpolate in middle region, else use closer value mov r3, r8, lsr #EG_SH @ eg_timer, [0..3<>EG_SH)/2 @@ -780,7 +787,7 @@ eg_done: mov r7, r7, lsl #16 orr r7, r7, r0 ror r7, r7, #16 -#elif 0 +#elif defined(INTERPOL) @ super-basic... just take value closest to sample point mov r3, r8, lsr #EG_SH-1 @ eg_timer, [0..3<>EG_SH) From 07b86136c4d44b693f79d074ab6cac33bb737e95 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 7 Apr 2020 20:47:38 +0200 Subject: [PATCH 138/174] vdp rendering, sprite caching optimization --- pico/draw.c | 12 ++++++++---- pico/draw_arm.S | 2 +- pico/pico.h | 5 +++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pico/draw.c b/pico/draw.c index a18ec8d7c..3f1857d97 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -1601,7 +1601,6 @@ static int DrawDisplay(int sh) int win=0, edge=0, hvwind=0, lflags; int maxw, maxcells; - est->rendstatus &= ~(PDRAW_SPRITES_MOVED|PDRAW_DIRTY_SPRITES); est->rendstatus &= ~(PDRAW_SHHI_DONE|PDRAW_PLANE_HI_PRIO); if (pvid->reg[12]&1) { @@ -1713,6 +1712,7 @@ PICO_INTERNAL void PicoFrameStart(void) { int offs = 8, lines = 224; int dirty = ((Pico.est.rendstatus & PDRAW_SONIC_MODE) || Pico.m.dirtyPal); + int sprep = Pico.est.rendstatus & (PDRAW_SPRITES_MOVED|PDRAW_DIRTY_SPRITES); // prepare to do this frame Pico.est.rendstatus = 0; @@ -1732,6 +1732,8 @@ PICO_INTERNAL void PicoFrameStart(void) lines, (Pico.video.reg[12] & 1) ? 
0 : 1); rendstatus_old = Pico.est.rendstatus; } + if (sprep) + Pico.est.rendstatus |= PDRAW_PARSE_SPRITES; Pico.est.HighCol = HighColBase + offs * HighColIncrement; Pico.est.DrawLineDest = (char *)DrawLineDestBase + offs * DrawLineDestIncrement; @@ -1804,6 +1806,7 @@ static void PicoLine(int line, int offs, int sh, int bgc) void PicoDrawSync(int to, int blank_last_line) { + struct PicoEState *est = &Pico.est; int line, offs = 0; int sh = (Pico.video.reg[0xC] & 8) >> 3; // shadow/hilight? int bgc = Pico.video.reg[7]; @@ -1815,10 +1818,11 @@ void PicoDrawSync(int to, int blank_last_line) if (to > 223) to = 223; } - if (Pico.est.DrawScanline <= to - blank_last_line) + if (est->DrawScanline <= to - blank_last_line && (est->rendstatus & + (PDRAW_SPRITES_MOVED|PDRAW_DIRTY_SPRITES|PDRAW_PARSE_SPRITES))) PrepareSprites(to - blank_last_line + 1); - for (line = Pico.est.DrawScanline; line < to; line++) + for (line = est->DrawScanline; line < to; line++) PicoLine(line, offs, sh, bgc); // last line @@ -1829,7 +1833,7 @@ void PicoDrawSync(int to, int blank_last_line) else PicoLine(line, offs, sh, bgc); line++; } - Pico.est.DrawScanline = line; + est->DrawScanline = line; pprof_end(draw); } diff --git a/pico/draw_arm.S b/pico/draw_arm.S index 2ae6dba63..9b5a4e322 100644 --- a/pico/draw_arm.S +++ b/pico/draw_arm.S @@ -14,7 +14,7 @@ .equ PDRAW_SPRITES_MOVED, (1<<0) .equ PDRAW_WND_DIFF_PRIO, (1<<1) -.equ PDRAW_ACC_SPRITES, (1<<2) +.equ PDRAW_PARSE_SPRITES, (1<<2) .equ PDRAW_DIRTY_SPRITES, (1<<4) .equ PDRAW_PLANE_HI_PRIO, (1<<6) .equ PDRAW_SHHI_DONE, (1<<7) diff --git a/pico/pico.h b/pico/pico.h index 6f231cf08..7914cfc0f 100644 --- a/pico/pico.h +++ b/pico/pico.h @@ -198,10 +198,11 @@ void vidConvCpyRGB565(void *to, void *from, int pixels); #endif void PicoDoHighPal555(int sh, int line, struct PicoEState *est); // internals -#define PDRAW_SPRITES_MOVED (1<<0) // (asm) +#define PDRAW_SPRITES_MOVED (1<<0) // SAT address modified #define PDRAW_WND_DIFF_PRIO (1<<1) // not all window 
tiles use same priority +#define PDRAW_PARSE_SPRITES (1<<2) // SAT needs parsing #define PDRAW_INTERLACE (1<<3) -#define PDRAW_DIRTY_SPRITES (1<<4) // (asm) +#define PDRAW_DIRTY_SPRITES (1<<4) // SAT modified #define PDRAW_SONIC_MODE (1<<5) // mid-frame palette changes for 8bit renderer #define PDRAW_PLANE_HI_PRIO (1<<6) // have layer with all hi prio tiles (mk3) #define PDRAW_SHHI_DONE (1<<7) // layer sh/hi already processed From 2bbc05300ef28c0c31cfc7d21b44b1ed53707b36 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 7 Apr 2020 22:07:38 +0200 Subject: [PATCH 139/174] ym2612 ARM optimisations --- pico/sound/ym2612_arm.S | 198 +++++++++++++++++----------------------- 1 file changed, 82 insertions(+), 116 deletions(-) diff --git a/pico/sound/ym2612_arm.S b/pico/sound/ym2612_arm.S index 59abb74e2..1370e6cf8 100644 --- a/pico/sound/ym2612_arm.S +++ b/pico/sound/ym2612_arm.S @@ -15,8 +15,8 @@ #include "../arm_features.h" -@ very simple adaption YM2612 output rate to sample rate (~1M cycles @44100) -//#define INTERPOL +@ very simple YM2612 output rate to sample rate adaption (~500k cycles @44100) +#define INTERPOL .equiv SLOT1, 0 .equiv SLOT2, 2 @@ -44,7 +44,7 @@ @ r5=slot, r1=eg_cnt, trashes: r0,r2,r3 @ writes output to routp, but only if vol_out changes -.macro update_eg_phase_slot slot +.macro update_eg_phase_slot #if defined(INTERPOL) ldrh r0, [r5,#0x34] @ vol_out #endif @@ -190,21 +190,6 @@ ldrh r3, [r5,#0x18] @ tl add r0, r0, r3 @ volume += tl strh r0, [r5,#0x34] @ vol_out -.if \slot == SLOT1 - mov r6, r6, lsr #16 - orr r6, r0, r6, lsl #16 -.elseif \slot == SLOT2 - mov r6, r6, lsl #16 - mov r0, r0, lsl #16 - orr r6, r0, r6, lsr #16 -.elseif \slot == SLOT3 - mov r7, r7, lsr #16 - orr r7, r0, r7, lsl #16 -.elseif \slot == SLOT4 - mov r7, r7, lsl #16 - mov r0, r0, lsl #16 - orr r7, r0, r7, lsr #16 -.endif 0: @ EG_OFF .endm @@ -672,24 +657,16 @@ chan_render_loop: mov r11, r1 and r0, r0, #7 orr r4, r4, r0 @ (length<<8)|algo - add r0, lr, #0x44 - ldmia r0, {r8,r9} @ 
eg_timer, eg_timer_add + ldr r8, [lr, #0x44] @ eg_timer + ldr r9, [lr, #0x48] @ eg_timer_add ldr r10, [lr, #0x54] @ op1_out -@ ldmia lr, {r6,r7} @ load volumes - ldr r5, [lr, #0x40] @ CH - ldrh r6, [r5, #0x34] @ vol_out values for all slots - ldrh r2, [r5, #0x34+SLOT_STRUCT_SIZE*2] - ldrh r7, [r5, #0x34+SLOT_STRUCT_SIZE] - ldrh r3, [r5, #0x34+SLOT_STRUCT_SIZE*3] - orr r6, r6, r2, lsl #16 - orr r7, r7, r3, lsl #16 tst r12, #8 @ lfo? beq crl_loop crl_loop_lfo: - add r0, lr, #0x30 - ldmia r0, {r1,r2} @ lfo_cnt, lfo_inc + ldr r1, [lr, #0x30] @ lfo_cnt + ldr r2, [lr, #0x34] @ lfo_inc subs r4, r4, #0x100 bmi crl_loop_end @@ -707,37 +684,48 @@ crl_loop: bmi crl_loop_end @ -- SSG -- - add r0, lr, #0x3c - ldmia r0, {r1,r5} @ eg_cnt, CH + ldr r5, [lr, #0x40] @ CH @ r5=slot, trashes: r0,r2,r3 + mov r6, #4 +ssg_upd_loop: update_ssg_eg - add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT2 (2) - update_ssg_eg - sub r5, r5, #SLOT_STRUCT_SIZE @ SLOT3 (1) - update_ssg_eg - add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT4 (3) +#if 0 + subs r6, r6, #1 + addne r5, r5, #SLOT_STRUCT_SIZE +#else + add r5, r5, #SLOT_STRUCT_SIZE*2 update_ssg_eg + subs r6, r6, #2 + subne r5, r5, #SLOT_STRUCT_SIZE +#endif + bne ssg_upd_loop sub r5, r5, #SLOT_STRUCT_SIZE*3 @ -- EG -- add r8, r8, r9 cmp r8, #EG_TIMER_OVERFLOW bcc eg_done + ldr r1, [lr, #0x3c] @ eg_cnt eg_loop: sub r8, r8, #EG_TIMER_OVERFLOW add r1, r1, #1 cmp r1, #4096 movge r1, #1 - @ SLOT1 (0) - @ r5=slot, r1=eg_cnt, trashes: r0,r2,r3 - update_eg_phase_slot SLOT1 - add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT2 (2) - update_eg_phase_slot SLOT2 - sub r5, r5, #SLOT_STRUCT_SIZE @ SLOT3 (1) - update_eg_phase_slot SLOT3 - add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT4 (3) - update_eg_phase_slot SLOT4 + + mov r6, #4 +eg_upd_loop: + update_eg_phase_slot +#if 1 + subs r6, r6, #1 + addne r5, r5, #SLOT_STRUCT_SIZE +#else + add r5, r5, #SLOT_STRUCT_SIZE*2 + update_eg_phase_slot + subs r6, r6, #2 + subne r5, r5, #SLOT_STRUCT_SIZE +#endif + bne eg_upd_loop cmp r8, #EG_TIMER_OVERFLOW sub 
r5, r5, #SLOT_STRUCT_SIZE*3 @@ -754,64 +742,49 @@ eg_done: beq crl_loop @ output interpolation -#if 0 // too expensive on slow platforms +#if defined(INTERPOL) +#if 1 // possibly too expensive for slow platforms? @ basic interpolator, interpolate in middle region, else use closer value mov r3, r8, lsr #EG_SH @ eg_timer, [0..3<>EG_SH)/2 - bgt 0f @ mix is vol_out - - ldrh r0, [r5,#0x36] @ SLOT1 vol_ipol - lsleq r2, r6, #16 - addeq r0, r0, r2, lsr #16 - lsreq r0, r0, #1 - mov r6, r6, lsr #16 - orr r6, r0, r6, lsl #16 - - ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*2] @ SLOT2 vol_ipol - addeq r0, r0, r6, lsr #16 - lsreq r0, r0, #1 - mov r6, r6, lsl #16 - orr r6, r6, r0 - ror r6, r6, #16 - - ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE] @ SLOT3 vol_ipol - lsleq r2, r7, #16 - addeq r0, r0, r2, lsr #16 - lsreq r0, r0, #1 - mov r7, r7, lsr #16 - orr r7, r0, r7, lsl #16 - - ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*3] @ SLOT4 vol_ipol - addeq r0, r0, r7, lsr #16 - lsreq r0, r0, #1 - mov r7, r7, lsl #16 - orr r7, r7, r0 - ror r7, r7, #16 -#elif defined(INTERPOL) + bne 0f @ mix is vol_out + + ldr r6, [r5, #0x34] @ vol_out, vol_ipol for all slots + ldr r2, [r5, #0x34+SLOT_STRUCT_SIZE*2] + ldr r7, [r5, #0x34+SLOT_STRUCT_SIZE] + ldr r3, [r5, #0x34+SLOT_STRUCT_SIZE*3] + add r6, r6, r6, lsl #16 + lsr r6, r6, #17 + add r2, r2, r2, lsl #16 + lsr r2, r2, #17 + add r7, r7, r7, lsl #16 + lsr r7, r7, #17 + add r3, r3, r3, lsl #16 + lsr r3, r3, #17 + b 1f +#else @ super-basic... 
just take value closest to sample point mov r3, r8, lsr #EG_SH-1 @ eg_timer, [0..3<>EG_SH) - bge 0f @ mix is vol_out - - ldrh r0, [r5,#0x36] @ SLOT1 vol_ipol - mov r6, r6, lsr #16 - orr r6, r0, r6, lsl #16 - - ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*2] @ SLOT2 vol_ipol - mov r6, r6, lsl #16 - orr r6, r6, r0 - ror r6, r6, #16 +#endif - ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE] @ SLOT3 vol_ipol - mov r7, r7, lsr #16 - orr r7, r0, r7, lsl #16 +0: ldrgeh r6, [r5, #0x34] @ vol_out values for all slots + ldrlth r6, [r5, #0x36] @ vol_ipol values for all slots + ldrgeh r2, [r5, #0x34+SLOT_STRUCT_SIZE*2] + ldrlth r2, [r5, #0x36+SLOT_STRUCT_SIZE*2] + ldrgeh r7, [r5, #0x34+SLOT_STRUCT_SIZE] + ldrlth r7, [r5, #0x36+SLOT_STRUCT_SIZE] + ldrgeh r3, [r5, #0x34+SLOT_STRUCT_SIZE*3] + ldrlth r3, [r5, #0x36+SLOT_STRUCT_SIZE*3] - ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*3] @ SLOT4 vol_ipol - mov r7, r7, lsl #16 - orr r7, r7, r0 - ror r7, r7, #16 +#else + ldrh r6, [r5, #0x34] @ vol_out values for all slots + ldrh r2, [r5, #0x34+SLOT_STRUCT_SIZE*2] + ldrh r7, [r5, #0x34+SLOT_STRUCT_SIZE] + ldrh r3, [r5, #0x34+SLOT_STRUCT_SIZE*3] #endif -0: +1: orr r6, r6, r2, lsl #16 + orr r7, r7, r3, lsl #16 @ -- SLOT1 -- PIC_LDR(r3, r2, ym_tl_tab) @@ -893,34 +866,28 @@ crl_algo_done: strne r1, [r11], #4 b crl_do_phase -ctl_sample_skip: - and r1, r12, #1 - add r1, r1, #1 - add r11,r11, r1, lsl #2 - b crl_do_phase - ctl_sample_mono: ldr r1, [r11] add r1, r0, r1 str r1, [r11], #4 + b crl_do_phase + +ctl_sample_skip: + and r1, r12, #1 + add r1, r1, #1 + add r11,r11, r1, lsl #2 crl_do_phase: @ -- PHASE UPDATE -- add r5, lr, #0x10 - ldmia r5, {r0-r1} - add r5, lr, #0x20 - ldmia r5, {r2-r3} - add r5, lr, #0x10 - add r0, r0, r2 - add r1, r1, r3 - stmia r5!,{r0-r1} - ldmia r5, {r0-r1} - add r5, lr, #0x28 - ldmia r5, {r2-r3} - add r5, lr, #0x18 - add r0, r0, r2 - add r1, r1, r3 - stmia r5, {r0-r1} + ldmia r5, {r0-r3,r6-r7} + add r0, r0, r6 + add r1, r1, r7 + ldr r6, [r5, #0x18] + ldr r7, [r5, #0x1c] + add r2, r2, r6 + add r3, 
r3, r7 + stmia r5, {r0-r3} tst r12, #8 bne crl_loop_lfo @@ -928,7 +895,6 @@ crl_do_phase: crl_loop_end: -@ stmia lr, {r6,r7} @ save volumes (for debug) str r8, [lr, #0x44] @ eg_timer str r12, [lr, #0x4c] @ pack (for lfo_ampm) str r4, [lr, #0x50] @ was_update From fe42399ab9d68becfda9cfb064cc1c705779ad29 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 7 Apr 2020 22:23:52 +0200 Subject: [PATCH 140/174] menu background fix for pal mode --- pico/pico.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pico/pico.c b/pico/pico.c index 87e22e59d..579cdd0dc 100644 --- a/pico/pico.c +++ b/pico/pico.c @@ -282,7 +282,7 @@ void PicoFrameDrawOnly(void) { if (!(PicoIn.AHW & PAHW_SMS)) { PicoFrameStart(); - PicoDrawSync(223, 0); + PicoDrawSync(Pico.m.pal?239:223, 0); } else { PicoFrameDrawOnlyMS(); } From 15a59e359b4e03e653668294f35d3fb8672836f1 Mon Sep 17 00:00:00 2001 From: kub Date: Mon, 13 Apr 2020 22:20:13 +0200 Subject: [PATCH 141/174] sh2 timer optimization --- cpu/sh2/sh2.h | 1 + pico/32x/32x.c | 10 ++++++++-- pico/32x/memory.c | 14 +++++++------- pico/32x/sh2soc.c | 46 +++++++++++++++++++++++----------------------- pico/pico_int.h | 3 ++- 5 files changed, 41 insertions(+), 33 deletions(-) diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index 2f2dfd922..aabe45bea 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -48,6 +48,7 @@ typedef struct SH2_ #define SH2_STATE_CPOLL (1 << 2) // polling comm regs #define SH2_STATE_VPOLL (1 << 3) // polling VDP #define SH2_STATE_RPOLL (1 << 4) // polling address in SDRAM +#define SH2_TIMER_RUN (1 << 8) // SOC WDT timer is running unsigned int state; uint32_t poll_addr; int poll_cycles; diff --git a/pico/32x/32x.c b/pico/32x/32x.c index 0f0cc4f5c..ddd03fa85 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -508,12 +508,18 @@ void sync_sh2s_normal(unsigned int m68k_target) now = ssh2.m68krcycles_done; } if (CYCLES_GT(now, timer_cycles+STEP_N)) { - p32x_timers_do(now - timer_cycles); + if (msh2.state & SH2_TIMER_RUN) + 
p32x_timer_do(&msh2, now - timer_cycles); + if (ssh2.state & SH2_TIMER_RUN) + p32x_timer_do(&ssh2, now - timer_cycles); timer_cycles = now; } } - p32x_timers_do(now - timer_cycles); + if (msh2.state & SH2_TIMER_RUN) + p32x_timer_do(&msh2, now - timer_cycles); + if (ssh2.state & SH2_TIMER_RUN) + p32x_timer_do(&ssh2, now - timer_cycles); timer_cycles = now; } pprof_end_sub(m68k); diff --git a/pico/32x/memory.c b/pico/32x/memory.c index f4f0a18b9..3f5972880 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -111,7 +111,7 @@ void p32x_m68k_poll_event(u32 flags) m68k_poll.addr1 = m68k_poll.addr2 = m68k_poll.cnt = 0; } -static void NOINLINE sh2_poll_detect(u32 a, SH2 *sh2, u32 flags, int maxcnt) +void NOINLINE p32x_sh2_poll_detect(u32 a, SH2 *sh2, u32 flags, int maxcnt) { u32 cycles_done = sh2_cycles_done_t(sh2); @@ -275,7 +275,7 @@ u32 REGPARM(3) p32x_sh2_poll_memory16(u32 a, u32 d, SH2 *sh2) d = (s16)sh2_poll_read(a, d, cycles, sh2); } - sh2_poll_detect(a, sh2, SH2_STATE_RPOLL, 5); + p32x_sh2_poll_detect(a, sh2, SH2_STATE_RPOLL, 5); DRC_RESTORE_SR(sh2); return d; @@ -296,7 +296,7 @@ u32 REGPARM(3) p32x_sh2_poll_memory32(u32 a, u32 d, SH2 *sh2) ((u16)sh2_poll_read(a+2, d, cycles, sh2)); } - sh2_poll_detect(a, sh2, SH2_STATE_RPOLL, 5); + p32x_sh2_poll_detect(a, sh2, SH2_STATE_RPOLL, 5); DRC_RESTORE_SR(sh2); return d; @@ -735,7 +735,7 @@ static u32 p32x_sh2reg_read16(u32 a, SH2 *sh2) return (r[0] & P32XS_FM) | Pico32x.sh2_regs[0] | Pico32x.sh2irq_mask[sh2->is_slave]; case 0x04/2: // H count (often as comm too) - sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 9); + p32x_sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 9); cycles = sh2_cycles_done_m68k(sh2); sh2s_sync_on_read(sh2, cycles); return sh2_poll_read(a, Pico32x.sh2_regs[4 / 2], cycles, sh2); @@ -769,7 +769,7 @@ static u32 p32x_sh2reg_read16(u32 a, SH2 *sh2) case 0x2a/2: case 0x2c/2: case 0x2e/2: - sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 9); + p32x_sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 9); cycles = 
sh2_cycles_done_m68k(sh2); sh2s_sync_on_read(sh2, cycles); return sh2_poll_read(a, r[a / 2], cycles, sh2); @@ -1456,7 +1456,7 @@ static u32 REGPARM(2) sh2_read8_cs0(u32 a, SH2 *sh2) if ((a & 0x3fff0) == 0x4100) { d = p32x_vdp_read16(a); - sh2_poll_detect(a, sh2, SH2_STATE_VPOLL, 9); + p32x_sh2_poll_detect(a, sh2, SH2_STATE_VPOLL, 9); goto out_16to8; } @@ -1519,7 +1519,7 @@ static u32 REGPARM(2) sh2_read16_cs0(u32 a, SH2 *sh2) if ((a & 0x3fff0) == 0x4100) { d = p32x_vdp_read16(a); - sh2_poll_detect(a, sh2, SH2_STATE_VPOLL, 9); + p32x_sh2_poll_detect(a, sh2, SH2_STATE_VPOLL, 9); goto out; } diff --git a/pico/32x/sh2soc.c b/pico/32x/sh2soc.c index cf11666dd..8895d49b9 100644 --- a/pico/32x/sh2soc.c +++ b/pico/32x/sh2soc.c @@ -209,6 +209,9 @@ void p32x_timers_recalc(void) // SH2 timer step for (i = 0; i < 2; i++) { + sh2s[i].state &= ~SH2_TIMER_RUN; + if (PREG8(sh2s[i].peri_regs, 0x80) & 0x20) // TME + sh2s[i].state |= SH2_TIMER_RUN; tmp = PREG8(sh2s[i].peri_regs, 0x80) & 7; // Sclk cycles per timer tick if (tmp) @@ -222,32 +225,29 @@ void p32x_timers_recalc(void) } } -void p32x_timers_do(unsigned int m68k_slice) +NOINLINE void p32x_timer_do(SH2 *sh2, unsigned int m68k_slice) { unsigned int cycles = m68k_slice * 3; - int cnt, i; - - // WDT timers - for (i = 0; i < 2; i++) { - void *pregs = sh2s[i].peri_regs; - if (PREG8(pregs, 0x80) & 0x20) { // TME - timer_cycles[i] += cycles; - // cnt = timer_cycles[i] / timer_tick_cycles[i]; - cnt = (1ULL * timer_cycles[i] * timer_tick_factor[i]) >> 32; - timer_cycles[i] -= timer_tick_cycles[i] * cnt; - if (timer_cycles[i] > timer_tick_cycles[i]) - timer_cycles[i] -= timer_tick_cycles[i], cnt++; - cnt += PREG8(pregs, 0x81); - if (cnt >= 0x100) { - int level = PREG8(pregs, 0xe3) >> 4; - int vector = PREG8(pregs, 0xe4) & 0x7f; - elprintf(EL_32XP, "%csh2 WDT irq (%d, %d)", - i ? 
's' : 'm', level, vector); - sh2_internal_irq(&sh2s[i], level, vector); - cnt &= 0xff; - } - PREG8(pregs, 0x81) = cnt; + void *pregs = sh2->peri_regs; + int cnt; int i = sh2->is_slave; + + // WDT timer + timer_cycles[i] += cycles; + if (timer_cycles[i] > timer_tick_cycles[i]) { + // cnt = timer_cycles[i] / timer_tick_cycles[i]; + cnt = (1ULL * timer_cycles[i] * timer_tick_factor[i]) >> 32; + timer_cycles[i] -= timer_tick_cycles[i] * cnt; + + cnt += PREG8(pregs, 0x81); + if (cnt >= 0x100) { + int level = PREG8(pregs, 0xe3) >> 4; + int vector = PREG8(pregs, 0xe4) & 0x7f; + elprintf(EL_32XP, "%csh2 WDT irq (%d, %d)", + i ? 's' : 'm', level, vector); + sh2_internal_irq(sh2, level, vector); + cnt &= 0xff; } + PREG8(pregs, 0x81) = cnt; } } diff --git a/pico/pico_int.h b/pico/pico_int.h index 5fed483dc..e4bd4c1ef 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -977,6 +977,7 @@ unsigned int REGPARM(3) p32x_sh2_poll_memory8(unsigned int a, unsigned int d, SH unsigned int REGPARM(3) p32x_sh2_poll_memory16(unsigned int a, unsigned int d, SH2 *sh2); unsigned int REGPARM(3) p32x_sh2_poll_memory32(unsigned int a, unsigned int d, SH2 *sh2); void *p32x_sh2_get_mem_ptr(unsigned int a, unsigned int *mask, SH2 *sh2); +void p32x_sh2_poll_detect(unsigned int a, SH2 *sh2, unsigned int flags, int maxcnt); void p32x_sh2_poll_event(SH2 *sh2, unsigned int flags, unsigned int m68k_cycles); int p32x_sh2_memcpy(unsigned int dst, unsigned int src, int count, int size, SH2 *sh2); @@ -1012,7 +1013,7 @@ void p32x_pwm_state_loaded(void); void p32x_dreq0_trigger(void); void p32x_dreq1_trigger(void); void p32x_timers_recalc(void); -void p32x_timers_do(unsigned int m68k_slice); +void p32x_timer_do(SH2 *sh2, unsigned int m68k_slice); void sh2_peripheral_reset(SH2 *sh2); unsigned int REGPARM(2) sh2_peripheral_read8(unsigned int a, SH2 *sh2); unsigned int REGPARM(2) sh2_peripheral_read16(unsigned int a, SH2 *sh2); From 76cbd988b7400605c4cfad5a91f72ae60a87ede5 Mon Sep 17 00:00:00 2001 From: kub 
Date: Mon, 13 Apr 2020 22:22:33 +0200 Subject: [PATCH 142/174] 32x pwm, tiny optimization --- pico/32x/pwm.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pico/32x/pwm.c b/pico/32x/pwm.c index 3e5ce0ae0..ec4bdb3e7 100644 --- a/pico/32x/pwm.c +++ b/pico/32x/pwm.c @@ -9,7 +9,7 @@ static struct { int cycles; - int mult; + unsigned mult; int ptr; int irq_reload; int doing_fifo; @@ -58,11 +58,11 @@ static void do_pwm_irq(SH2 *sh2, unsigned int m68k_cycles) static int convert_sample(unsigned int v) { - if (v == 0) - return 0; if (v > pwm.cycles) v = pwm.cycles; - return (v * 2 - pwm.cycles) / 2 * pwm.mult; + if (v == 0) + return 0; + return v * pwm.mult - 0x10000/2; } #define consume_fifo(sh2, m68k_cycles) { \ @@ -89,19 +89,21 @@ static void consume_fifo_do(SH2 *sh2, unsigned int m68k_cycles, // this is for recursion from dreq1 writes pwm.doing_fifo = 1; - for (; sh2_cycles_diff >= pwm.cycles; sh2_cycles_diff -= pwm.cycles) + while (sh2_cycles_diff >= pwm.cycles) { + sh2_cycles_diff -= pwm.cycles; + if (Pico32x.pwm_p[0] > 0) { mem->pwm_index[0] = (mem->pwm_index[0]+1) % 4; Pico32x.pwm_p[0]--; pwm.current[0] = convert_sample(fifo_l[mem->pwm_index[0]]); - sum |= pwm.current[0]; + sum |= (u16)pwm.current[0]; } if (Pico32x.pwm_p[1] > 0) { mem->pwm_index[1] = (mem->pwm_index[1]+1) % 4; Pico32x.pwm_p[1]--; pwm.current[1] = convert_sample(fifo_r[mem->pwm_index[1]]); - sum |= pwm.current[1]; + sum |= (u16)pwm.current[1]; } mem->pwm[pwm.ptr * 2 ] = pwm.current[0]; @@ -234,9 +236,7 @@ void p32x_pwm_write16(unsigned int a, unsigned int d, fifo = Pico32xMem->pwm_fifo[1]; idx = Pico32xMem->pwm_index[1]; if (Pico32x.pwm_p[1] < 3) { - if (pwm.irq_state == PWM_IRQ_STOPPED) - pwm.irq_state = PWM_IRQ_LOW; - if (Pico32x.pwm_p[1] == 2 && pwm.irq_state >= PWM_IRQ_LOW) { + if (Pico32x.pwm_p[1] == 2 && pwm.irq_state >= PWM_IRQ_STOPPED) { // buffer full. 
If there was no buffer underrun after last fill, // try increasing reload rate to reduce IRQs if (pwm.irq_reload < 3 && pwm.irq_state == PWM_IRQ_HIGH) @@ -250,7 +250,7 @@ void p32x_pwm_write16(unsigned int a, unsigned int d, pwm.irq_reload = pwm.irq_timer; pwm.irq_state = PWM_IRQ_LOCKED; idx = (idx+1) % 4; - Pico32xMem->pwm_index[0] = idx; + Pico32xMem->pwm_index[1] = idx; } fifo[(idx+Pico32x.pwm_p[1]) % 4] = (d - 1) & 0x0fff; if (a != 8) break; // fallthrough if MONO From 7371b14ede219e0929dab4086ed7577fbfab8f6e Mon Sep 17 00:00:00 2001 From: kub Date: Mon, 13 Apr 2020 22:26:15 +0200 Subject: [PATCH 143/174] add sh2 ubc area to poll detection --- pico/32x/sh2soc.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/pico/32x/sh2soc.c b/pico/32x/sh2soc.c index 8895d49b9..369fc0de8 100644 --- a/pico/32x/sh2soc.c +++ b/pico/32x/sh2soc.c @@ -273,9 +273,14 @@ u32 REGPARM(2) sh2_peripheral_read8(u32 a, SH2 *sh2) a &= 0x1ff; d = PREG8(r, a); - sh2->poll_cnt = 0; elprintf_sh2(sh2, EL_32XP, "peri r8 [%08x] %02x @%06x", a | ~0x1ff, d, sh2_pc(sh2)); + if ((a & 0x1c0) == 0x140) { + // abused as comm area + DRC_SAVE_SR(sh2); + p32x_sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 3); + DRC_RESTORE_SR(sh2); + } return d; } @@ -287,9 +292,14 @@ u32 REGPARM(2) sh2_peripheral_read16(u32 a, SH2 *sh2) a &= 0x1fe; d = r[(a / 2) ^ 1]; - sh2->poll_cnt = 0; elprintf_sh2(sh2, EL_32XP, "peri r16 [%08x] %04x @%06x", a | ~0x1ff, d, sh2_pc(sh2)); + if ((a & 0x1c0) == 0x140) { + // abused as comm area + DRC_SAVE_SR(sh2); + p32x_sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 3); + DRC_RESTORE_SR(sh2); + } return d; } @@ -300,9 +310,14 @@ u32 REGPARM(2) sh2_peripheral_read32(u32 a, SH2 *sh2) a &= 0x1fc; d = sh2->peri_regs[a / 4]; - sh2->poll_cnt = 0; elprintf_sh2(sh2, EL_32XP, "peri r32 [%08x] %08x @%06x", a | ~0x1ff, d, sh2_pc(sh2)); + if ((a & 0x1c0) == 0x140) { + // abused as comm area + DRC_SAVE_SR(sh2); + p32x_sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 3); + 
DRC_RESTORE_SR(sh2); + } return d; } @@ -378,6 +393,9 @@ void REGPARM(3) sh2_peripheral_write8(u32 a, u32 d, SH2 *sh2) break; } PREG8(r, a) = d; + + if ((a & 0x1c0) == 0x140) + p32x_sh2_poll_event(sh2, SH2_STATE_CPOLL, SekCyclesDone()); } void REGPARM(3) sh2_peripheral_write16(u32 a, u32 d, SH2 *sh2) @@ -400,6 +418,8 @@ void REGPARM(3) sh2_peripheral_write16(u32 a, u32 d, SH2 *sh2) } r[(a / 2) ^ 1] = d; + if ((a & 0x1c0) == 0x140) + p32x_sh2_poll_event(sh2, SH2_STATE_CPOLL, SekCyclesDone()); } void REGPARM(3) sh2_peripheral_write32(u32 a, u32 d, SH2 *sh2) @@ -457,14 +477,15 @@ void REGPARM(3) sh2_peripheral_write32(u32 a, u32 d, SH2 *sh2) if (!(dmac->dmaor & DMA_DME)) return; - DRC_SAVE_SR(sh2); if ((dmac->chan[0].chcr & (DMA_TE|DMA_DE)) == DMA_DE) dmac_trigger(sh2, &dmac->chan[0]); if ((dmac->chan[1].chcr & (DMA_TE|DMA_DE)) == DMA_DE) dmac_trigger(sh2, &dmac->chan[1]); - DRC_RESTORE_SR(sh2); break; } + + if ((a & 0x1c0) == 0x140) + p32x_sh2_poll_event(sh2, SH2_STATE_CPOLL, SekCyclesDone()); } /* 32X specific */ From 02a29cc8f46c217b31dd838875ff6a2ab159c19e Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 22 Apr 2020 20:29:53 +0200 Subject: [PATCH 144/174] sh2, optimizations to innermost run loop --- cpu/sh2/sh2.h | 17 +++++++++-------- pico/32x/32x.c | 13 ++++++++----- pico/pico_int.h | 9 ++++----- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index aabe45bea..b0054c05c 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -75,6 +75,7 @@ typedef struct SH2_ unsigned int cycles_timeslice; struct SH2_ *other_sh2; + int (*run)(struct SH2_ *, int); // we use 68k reference cycles for easier sync unsigned int m68krcycles_done; @@ -82,7 +83,7 @@ typedef struct SH2_ unsigned int mult_sh2_to_m68k; uint8_t data_array[0x1000]; // cache (can be used as RAM) - uint32_t peri_regs[0x200/4]; // periphereal regs + uint32_t peri_regs[0x200/4]; // peripheral regs } SH2; #define CYCLE_MULT_SHIFT 10 @@ -103,17 +104,17 @@ void sh2_unpack(SH2 
*sh2, const unsigned char *buff); int sh2_execute_drc(SH2 *sh2c, int cycles); int sh2_execute_interpreter(SH2 *sh2c, int cycles); -static __inline int sh2_execute(SH2 *sh2, int cycles, int use_drc) +static __inline void sh2_execute_prepare(SH2 *sh2, int use_drc) +{ + sh2->run = use_drc ? sh2_execute_drc : sh2_execute_interpreter; +} + +static __inline int sh2_execute(SH2 *sh2, int cycles) { int ret; sh2->cycles_timeslice = cycles; -#ifdef DRC_SH2 - if (use_drc) - ret = sh2_execute_drc(sh2, cycles); - else -#endif - ret = sh2_execute_interpreter(sh2, cycles); + ret = sh2->run(sh2, cycles); return sh2->cycles_timeslice - ret; } diff --git a/pico/32x/32x.c b/pico/32x/32x.c index ddd03fa85..3b8896483 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -383,7 +383,7 @@ static void run_sh2(SH2 *sh2, unsigned int m68k_cycles) elprintf_sh2(sh2, EL_32X, "+run %u %d @%08x", sh2->m68krcycles_done, cycles, sh2->pc); - done = sh2_execute(sh2, cycles, PicoIn.opt & POPT_EN_DRC); + done = sh2_execute(sh2, cycles); sh2->m68krcycles_done += C_SH2_TO_M68K(sh2, done); sh2->state &= ~SH2_STATE_RUN; @@ -499,12 +499,12 @@ void sync_sh2s_normal(unsigned int m68k_target) pprof_end(msh2); now = next; - if (!(msh2.state & SH2_IDLE_STATES)) { - if (CYCLES_GT(now, msh2.m68krcycles_done)) + if (CYCLES_GT(now, msh2.m68krcycles_done)) { + if (!(msh2.state & SH2_IDLE_STATES)) now = msh2.m68krcycles_done; } - if (!(ssh2.state & SH2_IDLE_STATES)) { - if (CYCLES_GT(now, ssh2.m68krcycles_done)) + if (CYCLES_GT(now, ssh2.m68krcycles_done)) { + if (!(ssh2.state & SH2_IDLE_STATES)) now = ssh2.m68krcycles_done; } if (CYCLES_GT(now, timer_cycles+STEP_N)) { @@ -571,6 +571,9 @@ void sync_sh2s_lockstep(unsigned int m68k_target) void PicoFrame32x(void) { + sh2_execute_prepare(&msh2, PicoIn.opt & POPT_EN_DRC); + sh2_execute_prepare(&ssh2, PicoIn.opt & POPT_EN_DRC); + Pico.m.scanline = 0; Pico32x.vdp_regs[0x0a/2] &= ~P32XV_VBLK; // get out of vblank diff --git a/pico/pico_int.h b/pico/pico_int.h index 
e4bd4c1ef..8a4aa309f 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -235,11 +235,10 @@ extern SH2 sh2s[2]; # define sh2_pc(sh2) (sh2)->ppc #else # define sh2_end_run(sh2, after_) do { \ - int left_ = (signed int)(sh2)->sr >> 12; \ - if (left_ > (after_)) { \ - (sh2)->cycles_timeslice -= left_ - (after_); \ - (sh2)->sr &= 0xfff; \ - (sh2)->sr |= (after_) << 12; \ + int left_ = ((signed int)(sh2)->sr >> 12) - (after_); \ + if (left_ > 0) { \ + (sh2)->cycles_timeslice -= left_; \ + (sh2)->sr -= (left_ << 12); \ } \ } while (0) # define sh2_cycles_left(sh2) ((signed int)(sh2)->sr >> 12) From 36694b6067adcfea7d2e878e62d44cd0f907bec9 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 22 Apr 2020 20:34:20 +0200 Subject: [PATCH 145/174] 32x, small improvement for poll detection --- pico/32x/memory.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 3f5972880..69f703183 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -114,12 +114,13 @@ void p32x_m68k_poll_event(u32 flags) void NOINLINE p32x_sh2_poll_detect(u32 a, SH2 *sh2, u32 flags, int maxcnt) { u32 cycles_done = sh2_cycles_done_t(sh2); + u32 cycles_diff = cycles_done - sh2->poll_cycles; // reading 2 consecutive 16bit values is probably a 32bit access. detect this // by checking address (max 2 bytes away) and cycles (max 2 cycles later). // no polling if more than 20 cycles have passed since last detect call. 
- if (a - sh2->poll_addr <= 2 && CYCLES_GE(sh2->poll_cycles+20, cycles_done)) { - if (CYCLES_GT(cycles_done,sh2->poll_cycles+2) && ++sh2->poll_cnt >= maxcnt) { + if (a - sh2->poll_addr <= 2 && CYCLES_GE(20, cycles_diff)) { + if (CYCLES_GT(cycles_diff, 2) && ++sh2->poll_cnt >= maxcnt) { if (!(sh2->state & flags)) elprintf_sh2(sh2, EL_32X, "state: %02x->%02x", sh2->state, sh2->state | flags); From a274146a7d84f761bf1678a592a9dceb9e0da0e4 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 22 Apr 2020 20:41:51 +0200 Subject: [PATCH 146/174] audio: improve cycle accuracy of SN76496+YM2612 --- pico/memory.c | 7 +++-- pico/pico_int.h | 12 ++++----- pico/sms.c | 2 +- pico/sound/sound.c | 67 +++++++++++++++------------------------------- 4 files changed, 32 insertions(+), 56 deletions(-) diff --git a/pico/memory.c b/pico/memory.c index 0fa7b8de0..e1afb4dbe 100644 --- a/pico/memory.c +++ b/pico/memory.c @@ -391,7 +391,7 @@ static int get_scanline(int is_from_z80); static void psg_write_68k(u32 d) { // look for volume write and update if needed - if ((d & 0x90) == 0x90 && Pico.snd.psg_line < Pico.m.scanline) + if ((d & 0x90) == 0x90) PsndDoPSG(Pico.m.scanline); SN76496Write(d); @@ -401,8 +401,7 @@ static void psg_write_z80(u32 d) { if ((d & 0x90) == 0x90) { int scanline = get_scanline(1); - if (Pico.snd.psg_line < scanline) - PsndDoPSG(scanline); + PsndDoPSG(scanline); } SN76496Write(d); @@ -1061,7 +1060,7 @@ static int ym2612_write_local(u32 a, u32 d, int is_from_z80) if (PicoIn.opt & POPT_EXT_FM) return YM2612Write_940(a, d, get_scanline(is_from_z80)); #endif - PsndDoFM(get_scanline(is_from_z80)); + PsndDoFM(is_from_z80 ? 
z80_cyclesDone() : z80_cycles_from_68k()); return YM2612Write_(a, d); } diff --git a/pico/pico_int.h b/pico/pico_int.h index 8a4aa309f..7539379aa 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -436,12 +436,12 @@ struct PicoSound short len_use; // adjusted int len_e_add; // for non-int samples/frame int len_e_cnt; - int dac_val, dac_val2; // last DAC sample - unsigned int dac_mult; // z80 clocks per line in Q16 - unsigned int dac_pos; // last DAC position in Q16 - short psg_line; - unsigned int fm_mult; // samples per line in Q16 - unsigned int fm_pos; // last FM position in Q16 + unsigned int clkl_mult; // samples per z80 clock in Q20 + unsigned int smpl_mult; // samples per line in Q16 + short dac_val, dac_val2; // last DAC sample + unsigned int dac_pos; // last DAC position in Q20 + unsigned int fm_pos; // last FM position in Q20 + unsigned int psg_pos; // last PSG position in Q16 }; // run tools/mkoffsets pico/pico_int_offs.h if you change these diff --git a/pico/sms.c b/pico/sms.c index b016f197b..901f2f55e 100644 --- a/pico/sms.c +++ b/pico/sms.c @@ -152,7 +152,7 @@ static void z80_sms_out(unsigned short a, unsigned char d) case 0x40: case 0x41: - if ((d & 0x90) == 0x90 && Pico.snd.psg_line < Pico.m.scanline) + if ((d & 0x90) == 0x90) PsndDoPSG(Pico.m.scanline); SN76496Write(d); break; diff --git a/pico/sound/sound.c b/pico/sound/sound.c index f84309470..befdf4b45 100644 --- a/pico/sound/sound.c +++ b/pico/sound/sound.c @@ -19,9 +19,6 @@ void (*PsndMix_32_to_16l)(short *dest, int *src, int count) = mix_32_to_16l_ster // master int buffer to mix to static int PsndBuffer[2*(44100+100)/50]; -// dac, psg -static unsigned short dac_info[312+4]; // pos in sample buffer - // cdda output buffer short cdda_out_buffer[2*1152]; @@ -95,23 +92,6 @@ static void low_pass_filter_mono(int *buf32, int length) void (*low_pass_filter)(int *buf32, int length) = low_pass_filter_stereo; -static void dac_recalculate(void) -{ - int lines = Pico.m.pal ?
313 : 262; - int i, pos; - - pos = 0; // Q16 - - for(i = 0; i <= lines; i++) - { - dac_info[i] = ((pos+0x8000) >> 16); // round to nearest - pos += Pico.snd.fm_mult; - } - for (i = lines+1; i < sizeof(dac_info) / sizeof(dac_info[0]); i++) - dac_info[i] = dac_info[i-1]; -} - - PICO_INTERNAL void PsndReset(void) { // PsndRerate calls YM2612Init, which also resets @@ -159,12 +139,9 @@ void PsndRerate(int preserve_state) Pico.snd.len_e_cnt = 0; // Q16 // samples per line (Q16) - Pico.snd.fm_mult = 65536LL * PicoIn.sndRate / (target_fps*target_lines); + Pico.snd.smpl_mult = 65536LL * PicoIn.sndRate / (target_fps*target_lines); // samples per z80 clock (Q20) - Pico.snd.dac_mult = 16 * Pico.snd.fm_mult * 15/7 / 488; - - // recalculate dac info - dac_recalculate(); + Pico.snd.clkl_mult = 16 * Pico.snd.smpl_mult * 15/7 / 488; // clear all buffers memset32(PsndBuffer, 0, sizeof(PsndBuffer)/4); @@ -192,8 +169,6 @@ PICO_INTERNAL void PsndStartFrame(void) Pico.snd.len_e_cnt -= 0x10000; Pico.snd.len_use++; } - - Pico.snd.psg_line = 0; } PICO_INTERNAL void PsndDoDAC(int cyc_to) @@ -202,7 +177,7 @@ PICO_INTERNAL void PsndDoDAC(int cyc_to) int dout = ym2612.dacout; // number of samples to fill in buffer (Q20) - len = (cyc_to * Pico.snd.dac_mult) - Pico.snd.dac_pos; + len = (cyc_to * Pico.snd.clkl_mult) - Pico.snd.dac_pos; // update position and calculate buffer offset and length pos = (Pico.snd.dac_pos+0x80000) >> 20; @@ -237,17 +212,18 @@ PICO_INTERNAL void PsndDoDAC(int cyc_to) PICO_INTERNAL void PsndDoPSG(int line_to) { - int line_from = Pico.snd.psg_line; - int pos, pos1, len; + int pos, len; int stereo = 0; - pos = dac_info[line_from]; - pos1 = dac_info[line_to + 1]; - len = pos1 - pos; + // Q16, number of samples since last call + len = ((line_to+1) * Pico.snd.smpl_mult) - Pico.snd.psg_pos; if (len <= 0) return; - Pico.snd.psg_line = line_to + 1; + // update position and calculate buffer offset and length + pos = (Pico.snd.psg_pos+0x8000) >> 16; + Pico.snd.psg_pos += len; + 
len = ((Pico.snd.psg_pos+0x8000) >> 16) - pos; if (!PicoIn.sndOut || !(PicoIn.opt & POPT_EN_PSG)) return; @@ -259,22 +235,22 @@ PICO_INTERNAL void PsndDoPSG(int line_to) SN76496Update(PicoIn.sndOut + pos, len, stereo); } -PICO_INTERNAL void PsndDoFM(int line_to) +PICO_INTERNAL void PsndDoFM(int cyc_to) { int pos, len; int stereo = 0; // Q16, number of samples since last call - len = ((line_to-1) * Pico.snd.fm_mult) - Pico.snd.fm_pos; + len = (cyc_to * Pico.snd.clkl_mult) - Pico.snd.fm_pos; // don't do this too often (about every 4th scanline) - if (len >> 16 <= PicoIn.sndRate >> 12) + if (len >> 20 <= PicoIn.sndRate >> 12) return; // update position and calculate buffer offset and length - pos = (Pico.snd.fm_pos+0x8000) >> 16; + pos = (Pico.snd.fm_pos+0x80000) >> 20; Pico.snd.fm_pos += len; - len = ((Pico.snd.fm_pos+0x8000) >> 16) - pos; + len = ((Pico.snd.fm_pos+0x80000) >> 20) - pos; // fill buffer if (PicoIn.opt & POPT_EN_STEREO) { @@ -347,7 +323,7 @@ PICO_INTERNAL void PsndClear(void) if (!(PicoIn.opt & POPT_EN_FM)) memset32(PsndBuffer, 0, PicoIn.opt & POPT_EN_STEREO ? 
len*2 : len); // drop pos remainder to avoid rounding errors (not entirely correct though) - Pico.snd.dac_pos = Pico.snd.fm_pos = 0; + Pico.snd.dac_pos = Pico.snd.fm_pos = Pico.snd.psg_pos = 0; } @@ -355,7 +331,7 @@ static int PsndRender(int offset, int length) { int *buf32; int stereo = (PicoIn.opt & 8) >> 3; - int fmlen = ((Pico.snd.fm_pos+0x8000) >> 16); + int fmlen = ((Pico.snd.fm_pos+0x80000) >> 20); int daclen = ((Pico.snd.dac_pos+0x80000) >> 20); buf32 = PsndBuffer+(offset< 0) { short *dacbuf = PicoIn.sndOut + (daclen << stereo); Pico.snd.dac_pos += (length-daclen) << 20; - for (; length-daclen > 0; daclen++) { + *dacbuf++ += Pico.snd.dac_val2; + if (stereo) dacbuf++; + for (daclen++; length-daclen > 0; daclen++) { *dacbuf++ += Pico.snd.dac_val; if (stereo) dacbuf++; } + Pico.snd.dac_val2 = Pico.snd.dac_val; } // Add in parts of the FM buffer not yet done if (length-fmlen > 0) { int *fmbuf = buf32 + ((fmlen-offset) << stereo); - Pico.snd.fm_pos += (length-fmlen) << 16; + Pico.snd.fm_pos += (length-fmlen) << 20; if (PicoIn.opt & POPT_EN_FM) YM2612UpdateOne(fmbuf, length-fmlen, stereo, 1); } @@ -423,8 +402,6 @@ PICO_INTERNAL void PsndGetSamples(int y) { static int curr_pos = 0; - if (ym2612.dacen) - PsndDoDAC(cycles_68k_to_z80(Pico.t.m68c_aim - Pico.t.m68c_frame_start)); PsndDoPSG(y - 1); curr_pos = PsndRender(0, Pico.snd.len_use); From e4d1637146cf1a40af6741ee65d2a7d4299c30e6 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 22 Apr 2020 20:48:03 +0200 Subject: [PATCH 147/174] audio: fixes and optimizations for SSG-EG --- pico/sound/ym2612.c | 104 +++++++++++++++++++++++++++------------- pico/sound/ym2612.h | 1 + pico/sound/ym2612_arm.S | 94 +++++++++++++++++++++++++++++++----- 3 files changed, 153 insertions(+), 46 deletions(-) diff --git a/pico/sound/ym2612.c b/pico/sound/ym2612.c index af381fb08..cb4f8c7d1 100644 --- a/pico/sound/ym2612.c +++ b/pico/sound/ym2612.c @@ -128,7 +128,7 @@ extern YM2612 *ym2612_940; #endif -void memset32(int *dest, int c, int count); 
+void memset32(void *dest, int c, int count); #ifndef __GNUC__ @@ -511,7 +511,7 @@ static INT32 lfo_pm_table[128*8*32]; /* 128 combinations of 7 bits meaningful (o but LFO works with one more bit of a precision so we really need 4096 elements */ static UINT32 fn_table[4096]; /* fnumber->increment counter */ -static int g_lfo_ampm = 0; +static int g_lfo_ampm; /* register number to channel number , slot offset */ #define OPN_CHAN(N) (N&3) @@ -569,7 +569,7 @@ INLINE void FM_KEYON(int c , int s ) } else { SLOT->volume = MIN_ATT_INDEX; } - recalc_volout(SLOT); +// recalc_volout(SLOT); ym2612.slot_mask |= (1<tl = (v&0x7f)<<(ENV_BITS-7); /* 7bit TL */ - if (SLOT->state > EG_REL) - recalc_volout(SLOT); +// if (SLOT->state > EG_REL) +// recalc_volout(SLOT); } /* set attack rate & key scale */ @@ -761,7 +761,7 @@ INLINE int advance_lfo(int lfo_ampm, UINT32 lfo_cnt_old, UINT32 lfo_cnt) return lfo_ampm; } -INLINE void update_eg_phase(FM_SLOT *SLOT, UINT32 eg_cnt) +INLINE void update_eg_phase(FM_SLOT *SLOT, UINT32 eg_cnt, UINT32 ssg_en) { INT32 volume = SLOT->volume; UINT32 pack = SLOT->eg_pack[SLOT->state - 1]; @@ -774,7 +774,7 @@ INLINE void update_eg_phase(FM_SLOT *SLOT, UINT32 eg_cnt) eg_inc_val = pack >> ((eg_cnt >> shift) & 7) * 3; eg_inc_val = (1 << (eg_inc_val & 7)) >> 1; - if (SLOT->ssg&0x08) { + if ((SLOT->ssg&0x08) && ssg_en) { switch (SLOT->state) { case EG_ATT: /* attack phase */ @@ -854,7 +854,7 @@ INLINE void update_eg_phase(FM_SLOT *SLOT, UINT32 eg_cnt) SLOT->volume = volume; } -INLINE void update_ssg_eg_phase(FM_SLOT *SLOT) +INLINE UINT32 update_ssg_eg_phase(FM_SLOT *SLOT, UINT32 phase) { if (SLOT->ssg&0x01) { if (SLOT->ssg&0x02) { @@ -869,7 +869,7 @@ INLINE void update_ssg_eg_phase(FM_SLOT *SLOT) SLOT->ssg ^= 4; SLOT->ssgn ^= 4; } else - SLOT->phase = 0; + phase = 0; if (SLOT->state != EG_ATT) { SLOT->state = (SLOT->sl == MIN_ATT_INDEX) ? 
EG_SUS : EG_DEC; @@ -880,7 +880,8 @@ INLINE void update_ssg_eg_phase(FM_SLOT *SLOT) } } } - recalc_volout(SLOT); +// recalc_volout(SLOT); + return phase; } #endif @@ -927,15 +928,23 @@ static void chan_render_loop(chan_rend_context *ct, int *buffer, int length) int smp = 0; /* produced sample */ unsigned int eg_out, eg_out2, eg_out4; FM_SLOT *SLOT; + UINT32 cnt = ct->eg_timer_add+(ct->eg_timer & ((1<CH->SLOT[SLOT1]; - if ((SLOT->ssg&0x08) && SLOT->state > EG_REL && SLOT->volume >= 0x200) update_ssg_eg_phase(SLOT); - SLOT = &ct->CH->SLOT[SLOT2]; - if ((SLOT->ssg&0x08) && SLOT->state > EG_REL && SLOT->volume >= 0x200) update_ssg_eg_phase(SLOT); - SLOT = &ct->CH->SLOT[SLOT3]; - if ((SLOT->ssg&0x08) && SLOT->state > EG_REL && SLOT->volume >= 0x200) update_ssg_eg_phase(SLOT); - SLOT = &ct->CH->SLOT[SLOT4]; - if ((SLOT->ssg&0x08) && SLOT->state > EG_REL && SLOT->volume >= 0x200) update_ssg_eg_phase(SLOT); + if (ct->pack & 2) while (cnt >= 1<CH->SLOT[SLOT1]; + if ((SLOT->ssg&0x08) && SLOT->state > EG_REL && SLOT->volume >= 0x200) + ct->phase1 = update_ssg_eg_phase(SLOT, ct->phase1); + SLOT = &ct->CH->SLOT[SLOT2]; + if ((SLOT->ssg&0x08) && SLOT->state > EG_REL && SLOT->volume >= 0x200) + ct->phase2 = update_ssg_eg_phase(SLOT, ct->phase2); + SLOT = &ct->CH->SLOT[SLOT3]; + if ((SLOT->ssg&0x08) && SLOT->state > EG_REL && SLOT->volume >= 0x200) + ct->phase3 = update_ssg_eg_phase(SLOT, ct->phase3); + SLOT = &ct->CH->SLOT[SLOT4]; + if ((SLOT->ssg&0x08) && SLOT->state > EG_REL && SLOT->volume >= 0x200) + ct->phase4 = update_ssg_eg_phase(SLOT, ct->phase4); + } if (ct->pack & 8) { /* LFO enabled ? 
(test Earthworm Jim in between demo 1 and 2) */ ct->pack = (ct->pack&0xffff) | (advance_lfo(ct->pack >> 16, ct->lfo_cnt, ct->lfo_cnt + ct->lfo_inc) << 16); @@ -943,7 +952,21 @@ static void chan_render_loop(chan_rend_context *ct, int *buffer, int length) } ct->eg_timer += ct->eg_timer_add; - while (ct->eg_timer >= EG_TIMER_OVERFLOW) + if (ct->eg_timer < EG_TIMER_OVERFLOW) { + SLOT = &ct->CH->SLOT[SLOT1]; + SLOT->vol_ipol = SLOT->vol_out; + if (SLOT->state > EG_REL) recalc_volout(SLOT); + SLOT = &ct->CH->SLOT[SLOT2]; + SLOT->vol_ipol = SLOT->vol_out; + if (SLOT->state > EG_REL) recalc_volout(SLOT); + SLOT = &ct->CH->SLOT[SLOT3]; + SLOT->vol_ipol = SLOT->vol_out; + if (SLOT->state > EG_REL) recalc_volout(SLOT); + SLOT = &ct->CH->SLOT[SLOT4]; + SLOT->vol_ipol = SLOT->vol_out; + if (SLOT->state > EG_REL) recalc_volout(SLOT); + } + else while (ct->eg_timer >= EG_TIMER_OVERFLOW) { ct->eg_timer -= EG_TIMER_OVERFLOW; ct->eg_cnt++; @@ -951,17 +974,18 @@ static void chan_render_loop(chan_rend_context *ct, int *buffer, int length) SLOT = &ct->CH->SLOT[SLOT1]; SLOT->vol_ipol = SLOT->vol_out; - if (SLOT->state != EG_OFF) update_eg_phase(SLOT, ct->eg_cnt); + if (SLOT->state != EG_OFF) update_eg_phase(SLOT, ct->eg_cnt, ct->pack & 2); SLOT = &ct->CH->SLOT[SLOT2]; SLOT->vol_ipol = SLOT->vol_out; - if (SLOT->state != EG_OFF) update_eg_phase(SLOT, ct->eg_cnt); + if (SLOT->state != EG_OFF) update_eg_phase(SLOT, ct->eg_cnt, ct->pack & 2); SLOT = &ct->CH->SLOT[SLOT3]; SLOT->vol_ipol = SLOT->vol_out; - if (SLOT->state != EG_OFF) update_eg_phase(SLOT, ct->eg_cnt); + if (SLOT->state != EG_OFF) update_eg_phase(SLOT, ct->eg_cnt, ct->pack & 2); SLOT = &ct->CH->SLOT[SLOT4]; SLOT->vol_ipol = SLOT->vol_out; - if (SLOT->state != EG_OFF) update_eg_phase(SLOT, ct->eg_cnt); + if (SLOT->state != EG_OFF) update_eg_phase(SLOT, ct->eg_cnt, ct->pack & 2); } + #if 0 UINT32 ifrac0 = ct->eg_timer / (EG_TIMER_OVERFLOW>>EG_SH); UINT32 ifrac1 = (1<CH->SLOT[SLOT3].vol_out) >> 1; ct->vol_out4 = 
(ct->CH->SLOT[SLOT4].vol_ipol + ct->CH->SLOT[SLOT4].vol_out) >> 1; + break; } #elif 0 if (ct->eg_timer >> (EG_SH-1) < EG_TIMER_OVERFLOW >> EG_SH) { @@ -1272,7 +1297,7 @@ static int chan_render(int *buffer, int length, int c, UINT32 flags) // flags: s crct.mem = crct.CH->mem_value; /* one sample delay memory */ crct.lfo_cnt = ym2612.OPN.lfo_cnt; - flags &= 0x35; + flags &= 0x37; if (crct.lfo_inc) { flags |= 8; @@ -1453,6 +1478,7 @@ static void reset_channels(FM_CH *CH) CH[c].mem_value = CH[c].op1_out = 0; } ym2612.slot_mask = 0; + ym2612.ssg_mask = 0; } /* initialize generic tables */ @@ -1655,8 +1681,10 @@ static int OPNWriteReg(int r, int v) case 0x90: /* SSG-EG */ SLOT->ssg = v&0x0f; SLOT->ssg ^= SLOT->ssgn; - if (SLOT->state > EG_REL) - recalc_volout(SLOT); + if (v&0x08) ym2612.ssg_mask |= 1<<(OPN_SLOT(r) + c*4); + else ym2612.ssg_mask &= ~(1<<(OPN_SLOT(r) + c*4)); +// if (SLOT->state > EG_REL) +// recalc_volout(SLOT); break; case 0xa0: @@ -1751,6 +1779,7 @@ int YM2612UpdateOne_(int *buffer, int length, int stereo, int is_buf_empty) { int pan; int active_chs = 0; + int flags = stereo ? 
1:0; // if !is_buf_empty, it means it has valid samples to mix with, else it may contain trash if (is_buf_empty) memset32(buffer, 0, length<>2)) << 3; - if (ym2612.slot_mask & 0x0f0000) active_chs |= chan_render(buffer, length, 4, stereo|((pan&0x300)>>4)) << 4; - if (ym2612.slot_mask & 0xf00000) active_chs |= chan_render(buffer, length, 5, stereo|((pan&0xc00)>>6)|(ym2612.dacen<<2)) << 5; +#define BIT_IF(v,b,c) { v &= ~(1<<(b)); if (c) v |= 1<<(b); } + BIT_IF(flags, 1, (ym2612.ssg_mask & 0x00000f)); + if (ym2612.slot_mask & 0x00000f) active_chs |= chan_render(buffer, length, 0, flags|((pan&0x003)<<4)) << 0; + BIT_IF(flags, 1, (ym2612.ssg_mask & 0x0000f0)); + if (ym2612.slot_mask & 0x0000f0) active_chs |= chan_render(buffer, length, 1, flags|((pan&0x00c)<<2)) << 1; + BIT_IF(flags, 1, (ym2612.ssg_mask & 0x000f00)); + if (ym2612.slot_mask & 0x000f00) active_chs |= chan_render(buffer, length, 2, flags|((pan&0x030) )) << 2; + BIT_IF(flags, 1, (ym2612.ssg_mask & 0x00f000)); + if (ym2612.slot_mask & 0x00f000) active_chs |= chan_render(buffer, length, 3, flags|((pan&0x0c0)>>2)) << 3; + BIT_IF(flags, 1, (ym2612.ssg_mask & 0x0f0000)); + if (ym2612.slot_mask & 0x0f0000) active_chs |= chan_render(buffer, length, 4, flags|((pan&0x300)>>4)) << 4; + BIT_IF(flags, 1, (ym2612.ssg_mask & 0xf00000)); + if (ym2612.slot_mask & 0xf00000) active_chs |= chan_render(buffer, length, 5, flags|((pan&0xc00)>>6)|(!!ym2612.dacen<<2)) << 5; +#undef BIT_IF chan_render_finish(); return active_chs; // 1 if buffer updated diff --git a/pico/sound/ym2612.h b/pico/sound/ym2612.h index 73e693f92..b614790cf 100644 --- a/pico/sound/ym2612.h +++ b/pico/sound/ym2612.h @@ -153,6 +153,7 @@ typedef struct FM_OPN OPN; /* OPN state */ UINT32 slot_mask; /* active slot mask (performance hack) */ + UINT32 ssg_mask; /* active ssg mask (performance hack) */ } YM2612; #endif diff --git a/pico/sound/ym2612_arm.S b/pico/sound/ym2612_arm.S index 1370e6cf8..0334d1cfe 100644 --- a/pico/sound/ym2612_arm.S +++ 
b/pico/sound/ym2612_arm.S @@ -17,6 +17,7 @@ @ very simple YM2612 output rate to sample rate adaption (~500k cycles @44100) #define INTERPOL +#define SSG_EG .equiv SLOT1, 0 .equiv SLOT2, 2 @@ -73,8 +74,11 @@ and r3, r3, #7 @ eg_inc_val shift, may be 0 ldrb r2, [r5,#0x17] @ state +#if defined(SSG_EG) tst r0, #0x08 @ ssg enabled? + tstne r12, #0x02 bne 9f +#endif @ non-SSG-EG mode cmp r2, #4 @ EG_ATT @@ -127,7 +131,9 @@ strgeb r3, [r5,#0x17] @ state 10: @ finish + ldrh r3, [r5,#0x18] @ tl strh r0, [r5,#0x1a] @ volume +#if defined(SSG_EG) b 11f 9: @ SSG-EG mode @@ -140,7 +146,7 @@ movlt r3, r0, lsl r3 ldrlth r0, [r5,#0x1a] @ volume, unsigned (0-1023) movlt r3, r3, lsr #1 @ eg_inc_val - addlt r0, r0, r3, lsr #2 + addlt r0, r0, r3, lsl #2 cmp r2, #2 blt 1f @ EG_REL @@ -182,18 +188,20 @@ strh r0, [r5,#0x1a] @ volume cmp r2, #0x0c @ if ( ssg&0x04 && state > EG_REL ) cmpge r3, #EG_REL+1 + ldrh r3, [r5,#0x18] @ tl rsbge r0, r0, #0x200 @ volume = (0x200-volume) & MAX_ATT - lslge r0, r0, #10 - lsrge r0, r0, #10 + lslge r0, r0, #22 + lsrge r0, r0, #22 11: - ldrh r3, [r5,#0x18] @ tl +#endif add r0, r0, r3 @ volume += tl strh r0, [r5,#0x34] @ vol_out 0: @ EG_OFF .endm +#if defined(SSG_EG) @ r5=slot, trashes: r0,r2,r3 .macro update_ssg_eg ldrh r0, [r5,#0x30] @ ssg+ssgn @@ -204,6 +212,7 @@ cmp r2, #EG_REL+1 @ state > EG_REL && cmpge r3, #0x200 @ volume >= 0x200? 
blt 9f + orr r4, r4, #0x10 @ ssg_update tst r0, #0x01 beq 1f @@ -249,6 +258,33 @@ 9: .endm +@ r5=slot, trashes: r0,r2,r3 +.macro recalc_volout +#if defined(INTERPOL) + ldrh r0, [r5,#0x34] @ vol_out +#endif + ldrb r2, [r5,#0x30] @ ssg + ldrb r3, [r5,#0x17] @ state +#if defined(INTERPOL) + strh r0, [r5,#0x36] @ vol_ipol +#endif + ldrh r0, [r5,#0x1a] @ volume + +@ and r2, r2, #0x0c + cmp r2, #0x0c @ if ( ~ssg&0x0c && state > EG_REL ) + cmpge r3, #EG_REL+1 + ldrh r3, [r5,#0x18] @ tl + rsbge r0, r0, #0x200 @ volume = (0x200-volume) & MAX_ATT + lslge r0, r0, #22 + lsrge r0, r0, #22 + ldrh r0, [r5,#0x1a] @ volume + ldrh r3, [r5,#0x18] @ tl + + add r0, r0, r3 @ volume += tl + strh r0, [r5,#0x34] @ vol_out +.endm +#endif + @ r12=lfo_ampm[31:16], r1=lfo_cnt_old, r2=lfo_cnt, r3=scratch .macro advance_lfo_m mov r2, r2, lsr #LFO_SH @@ -305,7 +341,7 @@ .endm -@ lr=context, r12=pack (stereo, lastchan, disabled, lfo_enabled | pan_r, pan_l, ams[2] | AMmasks[4] | FB[4] | lfo_ampm[16]) +@ lr=context, r12=pack (stereo, ssg_enabled, disabled, lfo_enabled | pan_r, pan_l, ams[2] | AMmasks[4] | FB[4] | lfo_ampm[16]) @ r0-r2=scratch, r3=sin_tab, r5=scratch, r6-r7=vol_out[4], r10=op1_out .macro upd_algo0_m @@ -643,8 +679,8 @@ .endm -@ lr=context, r12=pack (stereo, lastchan, disabled, lfo_enabled | pan_r, pan_l, ams[2] | AMmasks[4] | FB[4] | lfo_ampm[16]) -@ r0-r2=scratch, r3=sin_tab/scratch, r4=(length<<8)|unused[4],was_update,algo[3], r5=tl_tab/slot, +@ lr=context, r12=pack (stereo, ssg_enabled, disabled, lfo_enabled | pan_r, pan_l, ams[2] | AMmasks[4] | FB[4] | lfo_ampm[16]) +@ r0-r2=scratch, r3=sin_tab/scratch, r4=(length<<8)|unused[3],ssg_update,was_update,algo[3], r5=tl_tab/slot, @ r6-r7=vol_out[4], r8=eg_timer, r9=eg_timer_add[31:16], r10=op1_out, r11=buffer .global chan_render_loop @ chan_rend_context *ct, int *buffer, int length @@ -683,10 +719,17 @@ crl_loop: subs r4, r4, #0x100 bmi crl_loop_end - @ -- SSG -- ldr r5, [lr, #0x40] @ CH +#if defined(SSG_EG) + tst r12, #0x02 @ 
ssg_enabled? + beq ssg_done + @ -- SSG -- + lsl r7, r8, #EG_SH + add r7, r9, r7, lsr #EG_SH + subs r7, r7, #1< Date: Wed, 22 Apr 2020 21:40:05 +0200 Subject: [PATCH 148/174] audio: add option to switch off SSG-EG --- pico/pico.h | 1 + pico/sound/sound.c | 2 +- pico/sound/ym2612.c | 15 ++++++++------- pico/sound/ym2612.h | 10 +++++----- platform/common/menu_pico.c | 1 + platform/common/menu_pico.h | 1 + platform/gp2x/940ctl.c | 4 ++-- platform/gp2x/940ctl.h | 2 +- platform/gp2x/code940/940.c | 2 +- 9 files changed, 21 insertions(+), 17 deletions(-) diff --git a/pico/pico.h b/pico/pico.h index 7914cfc0f..2c73f383e 100644 --- a/pico/pico.h +++ b/pico/pico.h @@ -73,6 +73,7 @@ extern void *p32x_bios_g, *p32x_bios_m, *p32x_bios_s; #define POPT_EN_32X (1<<20) // x0 0000 #define POPT_EN_PWM (1<<21) #define POPT_PWM_IRQ_OPT (1<<22) +#define POPT_DIS_FM_SSGEG (1<<23) #define PAHW_MCD (1<<0) #define PAHW_32X (1<<1) diff --git a/pico/sound/sound.c b/pico/sound/sound.c index befdf4b45..f0c91841e 100644 --- a/pico/sound/sound.c +++ b/pico/sound/sound.c @@ -119,7 +119,7 @@ void PsndRerate(int preserve_state) ym2612_pack_state(); memcpy(state, YM2612GetRegs(), 0x204); } - YM2612Init(Pico.m.pal ? OSC_PAL/7 : OSC_NTSC/7, PicoIn.sndRate); + YM2612Init(Pico.m.pal ? 
OSC_PAL/7 : OSC_NTSC/7, PicoIn.sndRate, !(PicoIn.opt&POPT_DIS_FM_SSGEG)); if (preserve_state) { // feed it back it's own registers, just like after loading state memcpy(YM2612GetRegs(), state, 0x204); diff --git a/pico/sound/ym2612.c b/pico/sound/ym2612.c index cb4f8c7d1..622fff0b3 100644 --- a/pico/sound/ym2612.c +++ b/pico/sound/ym2612.c @@ -1820,17 +1820,17 @@ int YM2612UpdateOne_(int *buffer, int length, int stereo, int is_buf_empty) // flags: stereo, ssg_enabled, disabled, _, pan_r, pan_l chan_render_prep(); #define BIT_IF(v,b,c) { v &= ~(1<<(b)); if (c) v |= 1<<(b); } - BIT_IF(flags, 1, (ym2612.ssg_mask & 0x00000f)); + BIT_IF(flags, 1, (ym2612.ssg_mask & 0x00000f) && (ym2612.OPN.ST.flags & 1)); if (ym2612.slot_mask & 0x00000f) active_chs |= chan_render(buffer, length, 0, flags|((pan&0x003)<<4)) << 0; - BIT_IF(flags, 1, (ym2612.ssg_mask & 0x0000f0)); + BIT_IF(flags, 1, (ym2612.ssg_mask & 0x0000f0) && (ym2612.OPN.ST.flags & 1)); if (ym2612.slot_mask & 0x0000f0) active_chs |= chan_render(buffer, length, 1, flags|((pan&0x00c)<<2)) << 1; - BIT_IF(flags, 1, (ym2612.ssg_mask & 0x000f00)); + BIT_IF(flags, 1, (ym2612.ssg_mask & 0x000f00) && (ym2612.OPN.ST.flags & 1)); if (ym2612.slot_mask & 0x000f00) active_chs |= chan_render(buffer, length, 2, flags|((pan&0x030) )) << 2; - BIT_IF(flags, 1, (ym2612.ssg_mask & 0x00f000)); + BIT_IF(flags, 1, (ym2612.ssg_mask & 0x00f000) && (ym2612.OPN.ST.flags & 1)); if (ym2612.slot_mask & 0x00f000) active_chs |= chan_render(buffer, length, 3, flags|((pan&0x0c0)>>2)) << 3; - BIT_IF(flags, 1, (ym2612.ssg_mask & 0x0f0000)); + BIT_IF(flags, 1, (ym2612.ssg_mask & 0x0f0000) && (ym2612.OPN.ST.flags & 1)); if (ym2612.slot_mask & 0x0f0000) active_chs |= chan_render(buffer, length, 4, flags|((pan&0x300)>>4)) << 4; - BIT_IF(flags, 1, (ym2612.ssg_mask & 0xf00000)); + BIT_IF(flags, 1, (ym2612.ssg_mask & 0xf00000) && (ym2612.OPN.ST.flags & 1)); if (ym2612.slot_mask & 0xf00000) active_chs |= chan_render(buffer, length, 5, 
flags|((pan&0xc00)>>6)|(!!ym2612.dacen<<2)) << 5; #undef BIT_IF chan_render_finish(); @@ -1840,13 +1840,14 @@ int YM2612UpdateOne_(int *buffer, int length, int stereo, int is_buf_empty) /* initialize YM2612 emulator */ -void YM2612Init_(int clock, int rate) +void YM2612Init_(int clock, int rate, int ssg) { memset(&ym2612, 0, sizeof(ym2612)); init_tables(); ym2612.OPN.ST.clock = clock; ym2612.OPN.ST.rate = rate; + ym2612.OPN.ST.flags = (ssg ? 1:0); OPNSetPres( 6*24 ); diff --git a/pico/sound/ym2612.h b/pico/sound/ym2612.h index b614790cf..e73c97321 100644 --- a/pico/sound/ym2612.h +++ b/pico/sound/ym2612.h @@ -95,7 +95,7 @@ typedef struct UINT8 address; /* 10 address register | need_save */ UINT8 status; /* 11 status flag | need_save */ UINT8 mode; /* mode CSM / 3SLOT */ - UINT8 pad; + UINT8 flags; /* operational flags */ int TA; /* timer a */ int TAC; /* timer a maxval */ int TAT; /* timer a ticker | need_save */ @@ -161,7 +161,7 @@ typedef struct extern YM2612 ym2612; #endif -void YM2612Init_(int baseclock, int rate); +void YM2612Init_(int baseclock, int rate, int ssg); void YM2612ResetChip_(void); int YM2612UpdateOne_(int *buffer, int length, int stereo, int is_buf_empty); @@ -183,9 +183,9 @@ int YM2612PicoStateLoad2(int *tat, int *tbt); #else /* GP2X specific */ #include "../../platform/gp2x/940ctl.h" -#define YM2612Init(baseclock,rate) do { \ - if (PicoIn.opt&POPT_EXT_FM) YM2612Init_940(baseclock, rate); \ - else YM2612Init_(baseclock, rate); \ +#define YM2612Init(baseclock,rate,ssg) do { \ + if (PicoIn.opt&POPT_EXT_FM) YM2612Init_940(baseclock, rate, ssg); \ + else YM2612Init_(baseclock, rate, ssg); \ } while (0) #define YM2612ResetChip() do { \ if (PicoIn.opt&POPT_EXT_FM) YM2612ResetChip_940(); \ diff --git a/platform/common/menu_pico.c b/platform/common/menu_pico.c index 327190a55..882aef924 100644 --- a/platform/common/menu_pico.c +++ b/platform/common/menu_pico.c @@ -499,6 +499,7 @@ static menu_entry e_menu_adv_options[] = mee_range_h ("Overclock M68k 
(%)", MA_OPT2_OVERCLOCK_M68K,currentConfig.overclock_68k, 0, 1000, h_ovrclk), mee_onoff ("Emulate Z80", MA_OPT2_ENABLE_Z80, PicoIn.opt, POPT_EN_Z80), mee_onoff ("Emulate YM2612 (FM)", MA_OPT2_ENABLE_YM2612, PicoIn.opt, POPT_EN_FM), + mee_onoff ("Disable YM2612 SSG-EG", MA_OPT2_DISABLE_YM_SSG,PicoIn.opt, POPT_DIS_FM_SSGEG), mee_onoff ("Emulate SN76496 (PSG)", MA_OPT2_ENABLE_SN76496,PicoIn.opt, POPT_EN_PSG), mee_onoff ("gzip savestates", MA_OPT2_GZIP_STATES, currentConfig.EmuOpt, EOPT_GZIP_SAVES), mee_onoff ("Don't save last used ROM", MA_OPT2_NO_LAST_ROM, currentConfig.EmuOpt, EOPT_NO_AUTOSVCFG), diff --git a/platform/common/menu_pico.h b/platform/common/menu_pico.h index 4c0bbdd1d..d15113fc1 100644 --- a/platform/common/menu_pico.h +++ b/platform/common/menu_pico.h @@ -48,6 +48,7 @@ typedef enum MA_OPT2_VSYNC, MA_OPT2_ENABLE_Z80, MA_OPT2_ENABLE_YM2612, + MA_OPT2_DISABLE_YM_SSG, MA_OPT2_ENABLE_SN76496, MA_OPT2_GZIP_STATES, MA_OPT2_NO_LAST_ROM, diff --git a/platform/gp2x/940ctl.c b/platform/gp2x/940ctl.c index 2afba0d91..cd3fcdc33 100644 --- a/platform/gp2x/940ctl.c +++ b/platform/gp2x/940ctl.c @@ -282,7 +282,7 @@ void sharedmem940_finish(void) } -void YM2612Init_940(int baseclock, int rate) +void YM2612Init_940(int baseclock, int rate, int ssg) { static int oldrate; @@ -339,7 +339,7 @@ void YM2612Init_940(int baseclock, int rate) memset(shared_ctl, 0, sizeof(*shared_ctl)); /* cause local ym2612 to init REGS */ - YM2612Init_(baseclock, rate); + YM2612Init_(baseclock, rate, ssg); internal_reset(); diff --git a/platform/gp2x/940ctl.h b/platform/gp2x/940ctl.h index 5b789dad3..dba6cc70d 100644 --- a/platform/gp2x/940ctl.h +++ b/platform/gp2x/940ctl.h @@ -1,7 +1,7 @@ void sharedmem940_init(void); void sharedmem940_finish(void); -void YM2612Init_940(int baseclock, int rate); +void YM2612Init_940(int baseclock, int rate, int ssg); void YM2612ResetChip_940(void); int YM2612UpdateOne_940(int *buffer, int length, int stereo, int is_buf_empty); diff --git 
a/platform/gp2x/code940/940.c b/platform/gp2x/code940/940.c index f79db1e50..db51fdc9c 100644 --- a/platform/gp2x/code940/940.c +++ b/platform/gp2x/code940/940.c @@ -167,7 +167,7 @@ void Main940(void) case JOB940_INITALL: /* ym2612 */ shared_ctl->writebuff0[0] = shared_ctl->writebuff1[0] = 0xffff; - YM2612Init_(shared_ctl->baseclock, shared_ctl->rate); + YM2612Init_(shared_ctl->baseclock, shared_ctl->rate, 0); /* Helix mp3 decoder */ __malloc_init(); shared_data->mp3dec = MP3InitDecoder(); From 03bc2d8599489e3beab18636654e7dba198f4b4d Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 22 Apr 2020 21:51:35 +0200 Subject: [PATCH 149/174] vdp fifo, bugfix --- pico/videoport.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pico/videoport.c b/pico/videoport.c index dac74dc3a..f9fd6ece4 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -91,8 +91,10 @@ static __inline int AdvanceFIFOEntry(struct VdpFIFO *vf, struct PicoVideo *pv, i // if entry has been processed... if (cnt == 0) { // remove entry from FIFO - if (vf->fifo_ql) + if (vf->fifo_ql) { + vf->fifo_queue[vf->fifo_qx] = 0; vf->fifo_qx = (vf->fifo_qx+1) & 7, vf->fifo_ql --; + } // start processing for next entry if there is one if (vf->fifo_ql) { b = vf->fifo_queue[vf->fifo_qx] & FQ_BYTE; @@ -230,7 +232,7 @@ int PicoVideoFIFOWrite(int count, int flags, unsigned sr_mask,unsigned sr_flags) if (count && vf->fifo_ql < 8) { // determine queue position for entry int x = (vf->fifo_qx + vf->fifo_ql - 1) & 7; - if (unlikely(vf->fifo_ql && (vf->fifo_queue[x] & FQ_BGDMA))) { + if (unlikely(vf->fifo_queue[x] & FQ_BGDMA)) { // CPU FIFO writes have priority over a background DMA Fill/Copy // XXX if interrupting a DMA fill, fill data changes if (x == vf->fifo_qx) { // overtaking to queue head? 
From 70236118d5a0063e63aceb49a16b9733db29e121 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 24 Apr 2020 19:00:41 +0200 Subject: [PATCH 150/174] 32x poll detection fix --- pico/32x/sh2soc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pico/32x/sh2soc.c b/pico/32x/sh2soc.c index 369fc0de8..9da3f296e 100644 --- a/pico/32x/sh2soc.c +++ b/pico/32x/sh2soc.c @@ -312,7 +312,10 @@ u32 REGPARM(2) sh2_peripheral_read32(u32 a, SH2 *sh2) elprintf_sh2(sh2, EL_32XP, "peri r32 [%08x] %08x @%06x", a | ~0x1ff, d, sh2_pc(sh2)); - if ((a & 0x1c0) == 0x140) { + if (a == 0x18c) + // kludge for polling COMM while polling for end of DMA + sh2->poll_cnt = 0; + else if ((a & 0x1c0) == 0x140) { // abused as comm area DRC_SAVE_SR(sh2); p32x_sh2_poll_detect(a, sh2, SH2_STATE_CPOLL, 3); From 4581c3368c183d8e425008aeb39fdbd07f2c2021 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 24 Apr 2020 19:05:27 +0200 Subject: [PATCH 151/174] audio: SN76496 fixes --- pico/memory.c | 2 +- pico/pico_int.h | 2 +- pico/sms.c | 2 +- pico/sound/sn76496.c | 8 +++++++- pico/sound/sound.c | 39 +++++++++++++++++++++++++++++++++------ 5 files changed, 43 insertions(+), 10 deletions(-) diff --git a/pico/memory.c b/pico/memory.c index e1afb4dbe..c0ba9ffe7 100644 --- a/pico/memory.c +++ b/pico/memory.c @@ -883,7 +883,7 @@ static void m68k_mem_setup(void) static int get_scanline(int is_from_z80) { if (is_from_z80) { - int mclk_z80 = z80_cyclesDone() * 15; + int mclk_z80 = (z80_cyclesLeft<0 ? 
Pico.t.z80c_aim : z80_cyclesDone()) * 15; int mclk_line = Pico.t.z80_scanline * 488 * 7; while (mclk_z80 - mclk_line >= 488 * 7) Pico.t.z80_scanline++, mclk_line += 488 * 7; diff --git a/pico/pico_int.h b/pico/pico_int.h index 7539379aa..088c7aa53 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -213,7 +213,7 @@ extern struct DrZ80 drZ80; #define z80_cyclesDone() \ (Pico.t.z80c_aim - z80_cyclesLeft) -#define cycles_68k_to_z80(x) ((x) * 3823 >> 13) +#define cycles_68k_to_z80(x) ((x) * 3822 >> 13) // ----------------------- SH2 CPU ----------------------- diff --git a/pico/sms.c b/pico/sms.c index 901f2f55e..0f4a48ad4 100644 --- a/pico/sms.c +++ b/pico/sms.c @@ -152,7 +152,7 @@ static void z80_sms_out(unsigned short a, unsigned char d) case 0x40: case 0x41: - if ((d & 0x90) == 0x90); + if ((d & 0x90) == 0x90) PsndDoPSG(Pico.m.scanline); SN76496Write(d); break; diff --git a/pico/sound/sn76496.c b/pico/sound/sn76496.c index b21275941..4507507c4 100644 --- a/pico/sound/sn76496.c +++ b/pico/sound/sn76496.c @@ -173,9 +173,12 @@ void SN76496Update(short *buffer, int length, int stereo) /* If we exit the loop in the middle, Output[i] has to be inverted */ /* and vol[i] incremented only if the exit status of the square */ /* wave is 1. */ + left = 0; while (R->Count[i] <= 0) { - R->Count[i] += R->Period[i]; + if (R->Count[i] + R->Period[i]*4 < R->Period[i]) + left+= 4, R->Count[i] += R->Period[i]*4; + else left++, R->Count[i] += R->Period[i]; if (R->Count[i] > 0) { R->Output[i] ^= 1; @@ -186,6 +189,9 @@ void SN76496Update(short *buffer, int length, int stereo) vol[i] += R->Period[i]; } if (R->Output[i]) vol[i] -= R->Count[i]; + /* Cut of anything above the sample freqency. It will only create */ + /* aliasing and hearable distortions anyway. 
*/ + if (left > 1) vol[i] = STEP/2; } left = STEP; diff --git a/pico/sound/sound.c b/pico/sound/sound.c index f0c91841e..2b18446c3 100644 --- a/pico/sound/sound.c +++ b/pico/sound/sound.c @@ -333,6 +333,7 @@ static int PsndRender(int offset, int length) int stereo = (PicoIn.opt & 8) >> 3; int fmlen = ((Pico.snd.fm_pos+0x80000) >> 20); int daclen = ((Pico.snd.dac_pos+0x80000) >> 20); + int psglen = ((Pico.snd.psg_pos+0x8000) >> 16); buf32 = PsndBuffer+(offset< 0) { + short *psgbuf = PicoIn.sndOut + (psglen << stereo); + Pico.snd.psg_pos += (length-psglen) << 16; + if (PicoIn.opt & POPT_EN_PSG) + SN76496Update(psgbuf, length-psglen, stereo); + } + // Add in parts of the FM buffer not yet done if (length-fmlen > 0) { int *fmbuf = buf32 + ((fmlen-offset) << stereo); @@ -402,8 +411,6 @@ PICO_INTERNAL void PsndGetSamples(int y) { static int curr_pos = 0; - PsndDoPSG(y - 1); - curr_pos = PsndRender(0, Pico.snd.len_use); if (PicoIn.writeSound) @@ -412,11 +419,20 @@ PICO_INTERNAL void PsndGetSamples(int y) PsndClear(); } -PICO_INTERNAL void PsndGetSamplesMS(int y) +static int PsndRenderMS(int offset, int length) { - int length = Pico.snd.len_use; + int stereo = (PicoIn.opt & 8) >> 3; + int psglen = ((Pico.snd.psg_pos+0x8000) >> 16); - PsndDoPSG(y - 1); + pprof_start(sound); + + // Add in parts of the PSG output not yet done + if (length-psglen > 0) { + short *psgbuf = PicoIn.sndOut + (psglen << stereo); + Pico.snd.psg_pos += (length-psglen) << 16; + if (PicoIn.opt & POPT_EN_PSG) + SN76496Update(psgbuf, length-psglen, stereo); + } // upmix to "stereo" if needed if (PicoIn.opt & POPT_EN_STEREO) { @@ -425,8 +441,19 @@ PICO_INTERNAL void PsndGetSamplesMS(int y) *p |= *p << 16; } + pprof_end(sound); + + return length; +} + +PICO_INTERNAL void PsndGetSamplesMS(int y) +{ + static int curr_pos = 0; + + curr_pos = PsndRenderMS(0, Pico.snd.len_use); + if (PicoIn.writeSound != NULL) - PicoIn.writeSound(length * ((PicoIn.opt & POPT_EN_STEREO) ? 
4 : 2)); + PicoIn.writeSound(curr_pos * ((PicoIn.opt & POPT_EN_STEREO) ? 4 : 2)); PsndClear(); } From fc2614f00f52464cbe73952e5c20fe3e1e4cc30a Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 25 Apr 2020 21:51:47 +0200 Subject: [PATCH 152/174] sh2: bugfix in drc --- cpu/sh2/compiler.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index 043204241..ad1983bf1 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -160,7 +160,7 @@ enum op_types { & BITMASK1(op)) #define OP_ISBRAUC(op) (BITMASK4(OP_BRANCH, OP_BRANCH_R, OP_BRANCH_RF, OP_RTE) \ & BITMASK1(op)) -#define OP_ISBRACND(op) (BITMASK3(OP_BRANCH_CT, OP_BRANCH_CF, OP_BRANCH_N) \ +#define OP_ISBRACND(op) (BITMASK2(OP_BRANCH_CT, OP_BRANCH_CF) \ & BITMASK1(op)) #define OP_ISBRAIMM(op) (BITMASK3(OP_BRANCH, OP_BRANCH_CT, OP_BRANCH_CF) \ & BITMASK1(op)) @@ -3501,7 +3501,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_set_usage_now(opd[0].source); // current insn rcache_set_usage_soon(soon); // insns 1-4 rcache_set_usage_late(late & ~soon); // insns 5-9 - rcache_set_usage_discard(write & ~(late|soon)); + rcache_set_usage_discard(write & ~(late|soon|opd[0].source)); if (v <= 9) // upcoming rcache_flush, start writing back unused dirty stuff rcache_clean_masked(rcache_dirty_mask() & ~(write|opd[0].dest)); From d61f4f71b26592a99109c3d64470540a40c043b5 Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 6 May 2020 22:58:39 +0200 Subject: [PATCH 153/174] audio: fix for save/load --- pico/state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pico/state.c b/pico/state.c index 3deb5aa72..5160ce95a 100644 --- a/pico/state.c +++ b/pico/state.c @@ -569,7 +569,7 @@ static int state_load(void *file) z80_unpack(buff_z80); // due to dep from 68k cycles.. 
- Pico.t.m68c_aim = Pico.t.m68c_cnt; + Pico.t.m68c_frame_start = Pico.t.m68c_aim = Pico.t.m68c_cnt; if (PicoIn.AHW & PAHW_32X) Pico32xStateLoaded(0); if (PicoIn.AHW & PAHW_MCD) From 35e6ff97c830ed01068d050261649c866087d0af Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 6 May 2020 23:06:10 +0200 Subject: [PATCH 154/174] sh2: optimisations in drc --- cpu/drc/cmn.h | 24 +++++------ cpu/sh2/compiler.c | 99 ++++++++++++++++++++++++++++------------------ 2 files changed, 72 insertions(+), 51 deletions(-) diff --git a/cpu/drc/cmn.h b/cpu/drc/cmn.h index 2eb52aada..9c041e704 100644 --- a/cpu/drc/cmn.h +++ b/cpu/drc/cmn.h @@ -17,18 +17,18 @@ void drc_cmn_cleanup(void); // binary search approach, since we don't have CLZ on ARM920T #define FOR_ALL_BITS_SET_DO(mask, bit, code) { \ u32 __mask = mask; \ - for (bit = 31; bit >= 0 && mask; bit--, __mask <<= 1) { \ - if (!(__mask & (0xffff << 16))) \ - bit -= 16, __mask <<= 16; \ - if (!(__mask & (0xff << 24))) \ - bit -= 8, __mask <<= 8; \ - if (!(__mask & (0xf << 28))) \ - bit -= 4, __mask <<= 4; \ - if (!(__mask & (0x3 << 30))) \ - bit -= 2, __mask <<= 2; \ - if (!(__mask & (0x1 << 31))) \ - bit -= 1, __mask <<= 1; \ - if (__mask & (0x1 << 31)) { \ + for (bit = 0; bit < 32 && mask; bit++, __mask >>= 1) { \ + if (!(__mask & 0xffff)) \ + bit += 16,__mask >>= 16; \ + if (!(__mask & 0xff)) \ + bit += 8, __mask >>= 8; \ + if (!(__mask & 0xf)) \ + bit += 4, __mask >>= 4; \ + if (!(__mask & 0x3)) \ + bit += 2, __mask >>= 2; \ + if (!(__mask & 0x1)) \ + bit += 1, __mask >>= 1; \ + if (__mask & 0x1) { \ code; \ } \ } \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index ad1983bf1..bfd9ec061 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -1549,22 +1549,31 @@ static u32 rcache_regs_clean; // regs needing cleaning static void rcache_lock_vreg(int x) { if (x >= 0) { + cache_regs[x].locked ++; +#if DRC_DEBUG & 64 if (cache_regs[x].type == HR_FREE) { printf("locking free vreg %x, aborting\n", x); exit(1); } - 
cache_regs[x].locked ++; + if (!cache_regs[x].locked) { + printf("locking overflow vreg %x, aborting\n", x); + exit(1); + } +#endif } } static void rcache_unlock_vreg(int x) { if (x >= 0) { +#if DRC_DEBUG & 64 if (cache_regs[x].type == HR_FREE) { printf("unlocking free vreg %x, aborting\n", x); exit(1); } - cache_regs[x].locked --; +#endif + if (cache_regs[x].locked) + cache_regs[x].locked --; } } @@ -1582,7 +1591,7 @@ static void rcache_unmap_vreg(int x) FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, i, if (guest_regs[i].flags & GRF_DIRTY) { // if a dirty reg is unmapped save its value to context - if (~rcache_regs_discard & (1 << i)) + if ((~rcache_regs_discard | rcache_regs_now) & (1 << i)) emith_ctx_write(cache_regs[x].hreg, i * 4); guest_regs[i].flags &= ~GRF_DIRTY; } @@ -1700,26 +1709,28 @@ static int rcache_allocate(int what, int minprio) continue; if (cache_regs[i].type == HR_FREE || cache_regs[i].type == HR_TEMP) { // REG is free - prio = 6; + prio = 10; oldest = i; break; } if (cache_regs[i].type == HR_CACHED) { if (rcache_regs_now & cache_regs[i].gregs) // REGs needed for the current insn - i_prio = 1; + i_prio = 0; else if (rcache_regs_soon & cache_regs[i].gregs) // REGs needed in the next insns i_prio = 2; else if (rcache_regs_late & cache_regs[i].gregs) // REGs needed in some future insn - i_prio = 3; - else if (!(~rcache_regs_discard & cache_regs[i].gregs)) - // REGs not needed in the foreseeable future i_prio = 4; + else if (~rcache_regs_discard & cache_regs[i].gregs) + // REGs not needed in the foreseeable future + i_prio = 6; else // REGs soon overwritten anyway - i_prio = 5; + i_prio = 8; + if (!(cache_regs[i].flags & HRF_DIRTY)) i_prio ++; + if (prio < i_prio || (prio == i_prio && cache_regs[i].stamp < min_stamp)) { min_stamp = cache_regs[i].stamp; oldest = i; @@ -1744,21 +1755,21 @@ static int rcache_allocate_vreg(int needed) { int x; - x = rcache_allocate(1, needed ? 0 : 3); + x = rcache_allocate(1, needed ? 
0 : 4); if (x < 0) - x = rcache_allocate(-1, 1); + x = rcache_allocate(-1, 0); return x; } static int rcache_allocate_nontemp(void) { - int x = rcache_allocate(0, 3); + int x = rcache_allocate(0, 4); return x; } static int rcache_allocate_temp(void) { - int x = rcache_allocate(-1, 1); + int x = rcache_allocate(-1, 0); if (x < 0) x = rcache_allocate(0, 0); return x; @@ -1821,20 +1832,25 @@ static void rcache_remap_vreg(int x) int d; // x must be a cached vreg - if (cache_regs[x].type != HR_CACHED) + if (cache_regs[x].type != HR_CACHED || cache_regs[x].locked) return; - // don't do it if x is already a REG or isn't used or to be cleaned anyway - if ((cache_regs[x].htype & HRT_REG) || - !(rsl_d & cache_regs[x].gregs)) { + // don't do it if x isn't used + if (!(rsl_d & cache_regs[x].gregs)) { // clean here to avoid data loss on invalidation rcache_clean_vreg(x); return; } - if (cache_regs[x].locked) { - printf("remap vreg %d is locked\n", x); - exit(1); - } + FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, d, + if ((guest_regs[d].flags & (GRF_STATIC|GRF_PINNED)) && + !cache_regs[guest_regs[d].sreg].locked && + !((rsl_d|rcache_regs_now) & cache_regs[guest_regs[d].sreg].gregs)) { + // STATIC not in its sreg and sreg is available + rcache_evict_vreg(guest_regs[d].sreg); + rcache_move_vreg(guest_regs[d].sreg, x); + return; + } + ) // allocate a non-TEMP vreg rcache_lock_vreg(x); // lock to avoid evicting x @@ -1891,8 +1907,8 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr { int src, dst, ali; cache_reg_t *tr; - u32 rsp_d = (rcache_regs_now | rcache_regs_soon | - rcache_regs_static | rcache_regs_pinned) & ~rcache_regs_discard; + u32 rsp_d = (rcache_regs_soon | rcache_regs_static | rcache_regs_pinned) & + ~rcache_regs_discard; dst = src = guest_regs[r].vreg; @@ -1901,7 +1917,7 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr if ((guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) && src != guest_regs[r].sreg && (src 
< 0 || mode != RC_GR_READ) && !cache_regs[guest_regs[r].sreg].locked && - !(rsp_d & cache_regs[guest_regs[r].sreg].gregs)) { + !((rsp_d|rcache_regs_now) & cache_regs[guest_regs[r].sreg].gregs)) { dst = guest_regs[r].sreg; rcache_evict_vreg(dst); } else if (dst < 0) { @@ -1926,7 +1942,7 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr ali = tr->gregs & ~(1 << r); if (mode != RC_GR_READ && src == dst && ali) { int x = -1; - if (rsp_d & ali) { + if ((rsp_d|rcache_regs_now) & ali) { if ((guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) && guest_regs[r].sreg == dst && !tr->locked) { // split aliases if r is STATIC in sreg and dst isn't already locked @@ -1935,7 +1951,7 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr if ((guest_regs[t].flags & (GRF_STATIC|GRF_PINNED)) && !(ali & ~(1 << t)) && !cache_regs[guest_regs[t].sreg].locked && - !(rsp_d & cache_regs[guest_regs[t].sreg].gregs)) { + !((rsp_d|rcache_regs_now) & cache_regs[guest_regs[t].sreg].gregs)) { // alias is a single STATIC and its sreg is available x = guest_regs[t].sreg; rcache_evict_vreg(x); @@ -1947,8 +1963,9 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr break; ) if (x >= 0) { - src = x; - rcache_move_vreg(src, dst); + rcache_remove_vreg_alias(src, r); + src = dst; + rcache_move_vreg(x, dst); } } else { // split r @@ -1956,6 +1973,7 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr x = rcache_allocate_vreg(rsp_d & (1 << r)); rcache_unlock_vreg(src); if (x >= 0) { + rcache_remove_vreg_alias(src, r); dst = x; tr = &cache_regs[dst]; tr->stamp = rcache_counter; @@ -1965,8 +1983,6 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr if (x < 0) // aliases not needed or no vreg available, remove them rcache_evict_vreg_aliases(dst, r); - else if (src != dst) - rcache_remove_vreg_alias(src, r); } // assign r to dst @@ -2342,13 +2358,16 @@ static 
void rcache_clean_tmp(void) static void rcache_clean_masked(u32 mask) { int i, r, hr; + u32 m; rcache_regs_clean |= mask; mask = rcache_regs_clean; - // clean constants where all aliases are covered by the mask + // clean constants where all aliases are covered by the mask, exempt statics + // to avoid flushing them to context if sreg isn't available + m = mask & ~(rcache_regs_static | rcache_regs_pinned); for (i = 0; i < ARRAY_SIZE(gconsts); i++) - if ((gconsts[i].gregs & mask) && !(gconsts[i].gregs & ~mask)) { + if ((gconsts[i].gregs & m) && !(gconsts[i].gregs & ~mask)) { FOR_ALL_BITS_SET_DO(gconsts[i].gregs, r, if (guest_regs[r].flags & GRF_CDIRTY) { hr = rcache_get_reg_(r, RC_GR_READ, 0, NULL); @@ -2479,6 +2498,9 @@ static void rcache_create(void) } // create static host register mapping for SH2 regs + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { + guest_regs[i] = (guest_reg_t){.sreg = -1}; + } for (i = 0; i < ARRAY_SIZE(regs_static); i += 2) { for (x = ARRAY_SIZE(cache_regs)-1; x >= 0; x--) if (cache_regs[x].hreg == regs_static[i+1]) break; @@ -2486,8 +2508,7 @@ static void rcache_create(void) guest_regs[regs_static[i]] = (guest_reg_t){.flags = GRF_STATIC,.sreg = x}; rcache_regs_static |= (1 << regs_static[i]); rcache_vregs_reg &= ~(1 << x); - } else - guest_regs[regs_static[i]] = (guest_reg_t){.sreg = -1}; + } } printf("DRC registers created, %ld host regs (%d REG, %d STATIC, 1 CTX)\n", @@ -3501,7 +3522,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_set_usage_now(opd[0].source); // current insn rcache_set_usage_soon(soon); // insns 1-4 rcache_set_usage_late(late & ~soon); // insns 5-9 - rcache_set_usage_discard(write & ~(late|soon|opd[0].source)); + rcache_set_usage_discard(write & ~(late|soon)); if (v <= 9) // upcoming rcache_flush, start writing back unused dirty stuff rcache_clean_masked(rcache_dirty_mask() & ~(write|opd[0].dest)); @@ -4717,7 +4738,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) // branch 
not taken, correct cycle count if (ctaken) - emith_add_r_imm(sr, ctaken << 12); + cycles -= ctaken; // set T bit to reflect branch not taken for OP_BRANCH_CT/CF if (emith_get_t_cond() >= 0) // T is synced for all other cases emith_set_t(sr, opd_b->op == OP_BRANCH_CF); @@ -5263,11 +5284,11 @@ static void sh2_smc_rm_blocks(u32 a, int len, int tcache_id, u32 shift) start_lit = block->addr_lit & wtmask; end_lit = start_lit + block->size_lit; // disable/delete block if it covers the modified address - if ((start_addr <= a+len && a < end_addr) || - (start_lit <= a+len && a < end_lit)) + if ((start_addr < a+len && a < end_addr) || + (start_lit < a+len && a < end_lit)) { dbg(2, "smc remove @%08x", a); - end_addr = (start_lit <= a+len && block->size_lit ? a : 0); + end_addr = (start_lit < a+len && block->size_lit ? a : 0); dr_rm_block_entry(block, tcache_id, end_addr, 0); #if (DRC_DEBUG & 2) removed = 1; From 6578890c2cdeae07b382a5687085aba40964d32c Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 9 May 2020 10:45:56 +0200 Subject: [PATCH 155/174] 32x: libretro bugfix --- platform/libretro/libretro.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/platform/libretro/libretro.c b/platform/libretro/libretro.c index 77e2c50ce..de727606c 100644 --- a/platform/libretro/libretro.c +++ b/platform/libretro/libretro.c @@ -116,6 +116,8 @@ static short ALIGNED(4) sndBuffer[2*INITIAL_SND_RATE/50]; static void snd_write(int len); +char **g_argv; + #ifdef _WIN32 #define SLASH '\\' #else @@ -565,6 +567,8 @@ void emu_video_mode_change(int start_line, int line_count, int is_32cols) void emu_32x_startup(void) { + PicoDrawSetOutFormat(PDF_RGB555, 0); + PicoDrawSetOutBuf(vout_buf, vout_width * 2); } void lprintf(const char *fmt, ...) 
From af026c008d38dd9bbf6dbf73a955077310c65a5b Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 15 May 2020 21:46:28 +0200 Subject: [PATCH 156/174] sh2 drc: revised ARM A32 backend optimizer --- cpu/drc/emit_arm.c | 141 ++++++++++++++++++++------------------------- 1 file changed, 61 insertions(+), 80 deletions(-) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index af9491f13..e27054a30 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -36,8 +36,7 @@ #define SR 16 // CPSR, status register #define MEM 17 // memory access (src=LDR, dst=STR) #define CYC1 20 // 1 cycle interlock (LDR, reg-cntrld shift) -#define CYC2 21 // 2+ cycles interlock (LDR[BH], MUL/MLA etc) -#define SWAP 31 // swapped +#define CYC2 (CYC1+1)// 2+ cycles interlock (LDR[BH], MUL/MLA etc) #define NO 32 // token for "no register" // bitmask builders @@ -46,6 +45,7 @@ #define M3(x,y,z) (M2(x,y)|M1(z)) #define M4(x,y,z,a) (M3(x,y,z)|M1(a)) #define M5(x,y,z,a,b) (M4(x,y,z,a)|M1(b)) +#define M6(x,y,z,a,b,c) (M5(x,y,z,a,b)|M1(c)) #define M10(a,b,c,d,e,f,g,h,i,j) (M5(a,b,c,d,e)|M5(f,g,h,i,j)) // sys_cacheflush always flushes whole pages, and it's rather expensive on ARMs @@ -90,94 +90,81 @@ static inline void emith_update_add(void *base, void *end) } // peephole optimizer. 
ATM only tries to reduce interlock -#define EMIT_CACHE_SIZE 3 +#define EMIT_CACHE_SIZE 6 struct emit_op { u32 op; u32 src, dst; }; -// peephole cache, last commited insn + cache + next insn + empty insn = size+3 -static struct emit_op emit_cache[EMIT_CACHE_SIZE+3]; +// peephole cache, last commited insn + cache + next insn = size+2 +static struct emit_op emit_cache[EMIT_CACHE_SIZE+2]; static int emit_index; #define emith_insn_ptr() (u8 *)((u32 *)tcache_ptr-emit_index) -static inline int emith_pool_index(int tcache_offs); -static inline void emith_pool_adjust(int pool_index, int move_offs); +static inline void emith_pool_adjust(int tcache_offs, int move_offs); static NOINLINE void EMIT(u32 op, u32 dst, u32 src) { - void *emit_ptr = (u32 *)tcache_ptr - emit_index; - int i; + void * emit_ptr = (u32 *)tcache_ptr - emit_index; + struct emit_op *const ptr = emit_cache; + const int n = emit_index+1; + int i, bi, bd = 0; - EMIT_PTR(tcache_ptr, op); // emit to keep tcache_ptr current + // account for new insn in tcache + tcache_ptr = (void *)((u32 *)tcache_ptr + 1); COUNT_OP; // for conditional execution SR is always source if (op < 0xe0000000 /*A_COND_AL << 28*/) src |= M1(SR); - // put insn on back of queue - emit_cache[emit_index+1].op = op; - emit_cache[emit_index+1].src = src & ~M1(NO); // mask away the NO token - emit_cache[emit_index+1].dst = dst & ~M1(NO); - // move insn down in the queue as long as permitted by dependencies - for (i = emit_index-1; i > 0; i--) { - struct emit_op *ptr = &emit_cache[i]; + // put insn on back of queue // mask away the NO token + emit_cache[n] = (struct emit_op) + { .op=op, .src=src & ~M1(NO), .dst=dst & ~M1(NO) }; + // check insns down the queue as long as permitted by dependencies + for (bd = bi = 0, i = emit_index; i > 1 && !(dst & M1(PC)); i--) { int deps = 0; - // never swap branch insns (changes semantics) - if ((ptr[0].dst | ptr[1].dst) & M1(PC)) - continue; - // dst deps between 0 and 1 must not be swapped, since any deps - // 
but [0].src & [1].src lead to changed semantics if swapped. - if ((ptr[0].dst & ptr[1].src) || (ptr[1].dst & ptr[0].src) || - (ptr[0].dst & ptr[1].dst)) - continue; -#if 1 - // just move loads as far up as possible - deps -= !!(ptr[1].src & M1(MEM)); - deps += !!(ptr[0].src & M1(MEM)); -#elif 0 - // treat all dest->src deps as a potential interlock -#define DEP_INSN(x,y) !!(ptr[x].dst & ptr[y].src) - // insn sequence: -1, 0, 1, 2 - deps -= DEP_INSN(1,2) + DEP_INSN(-1,0); - deps -= !!(ptr[1].src & M1(MEM)); // favour moving LDR's down - // insn sequence: -1, 1, 0, 2 - deps += DEP_INSN(0,2) + DEP_INSN(-1,1); - deps += !!(ptr[0].src & M1(SWAP)); // penalise if swapped -#else - // calculate ARM920T interlock cycles -#define DEP_CYC1(x,y) ((ptr[x].dst & ptr[y].src)&&(ptr[x].src & M1(CYC1))) -#define DEP_CYC2(x,y) ((ptr[x].dst & ptr[y].src)&&(ptr[x].src & M1(CYC2))) -#define DEP_INSN(x,y,z) DEP_CYC1(x,y)+DEP_CYC1(y,z)+2*DEP_CYC2(x,y)+DEP_CYC2(x,z) - // insn sequence: -1, 0, 1, 2 - deps -= DEP_INSN(0,1,2) + DEP_INSN(-1,0,1); - deps -= !!(ptr[1].src & M1(MEM)); // favour moving LDR's down - // insn sequence: -1, 1, 0, 2 - deps += DEP_INSN(0,2,1) + DEP_INSN(-1,1,0); - deps += !!(ptr[0].src & M1(SWAP)); // penalise multiple swaps -#endif - // swap if fewer depencies - if (deps < 0) { - // swap insn reading PC only if uncomitted pool load - struct emit_op tmp; - int i0 = -1, i1 = -1; - if ((!(ptr[0].src & M1(PC)) || - (i0 = emith_pool_index(emit_index+2 - i)) >= 0) && - (!(ptr[1].src & M1(PC)) || - (i1 = emith_pool_index(emit_index+1 - i)) >= 0)) { - // not using PC, or pool load - emith_pool_adjust(i0, 1); - emith_pool_adjust(i1, -1); - tmp = ptr[0], ptr[0] = ptr[1], ptr[1] = tmp; - ptr[0].src |= M1(SWAP); - } + // dst deps between i and n must not be swapped, since any deps + // but [i].src & [n].src lead to changed semantics if swapped. 
+ if ((ptr[i].dst & ptr[n].src) || (ptr[n].dst & ptr[i].src) || + (ptr[i].dst & ptr[n].dst)) + break; + // don't swap insns reading PC if it's not a word pool load + // (ptr[i].op&0xf700000) != EOP_C_AM2_IMM(0,0,0,1,0,0,0)) + if ((ptr[i].src & M1(PC)) && (ptr[i].op&0xf700000) != 0x5100000) + break; + + // calculate ARM920T interlock cycles (differences only) +#define D2(x,y) ((ptr[x].dst & ptr[y].src)?((ptr[x].src >> CYC2) & 1):0) +#define D1(x,y) ((ptr[x].dst & ptr[y].src)?((ptr[x].src >> CYC1) & 3):0) + // insn sequence: [..., i-2, i-1, i, i+1, ..., n-2, n-1, n] + deps -= D2(i-2,i)+D2(i-1,i+1)+D2(n-2,n ) + D1(i-1,i)+D1(n-1,n); + deps -= !!(ptr[n].src & M2(CYC1,CYC2));// favour moving LDR down + // insn sequence: [..., i-2, i-1, n, i, i+1, ..., n-2, n-1] + deps += D2(i-2,n)+D2(i-1,i )+D2(n ,i+1) + D1(i-1,n)+D1(n ,i); + deps += !!(ptr[i].src & M2(CYC1,CYC2));// penalize moving LDR up + // remember best match found + if (bd > deps) + bd = deps, bi = i; + } + // swap if fewer dependencies + if (bd < 0) { + // make room for new insn at bi + struct emit_op tmp = ptr[n]; + for (i = n-1; i >= bi; i--) { + ptr[i+1] = ptr[i]; + if (ptr[i].src & M1(PC)) + emith_pool_adjust(n-i+1, 1); } + // insert new insn at bi + ptr[bi] = tmp; + if (ptr[bi].src & M1(PC)) + emith_pool_adjust(1, bi-n); } if (dst & M1(PC)) { // commit everything if a branch insn is emitted for (i = 1; i <= emit_index+1; i++) EMIT_PTR(emit_ptr, emit_cache[i].op); emit_index = 0; - } else if (emit_index <= EMIT_CACHE_SIZE) { + } else if (emit_index < EMIT_CACHE_SIZE) { // queue not yet full emit_index++; } else { @@ -412,13 +399,13 @@ static void emith_flush(void) EMIT(((cond)<<28) | ((s)<<20) | ((rd)<<16) | ((rs)<<8) | 0x90 | (rm), M2(rd,s?SR:NO), M3(rs,rm,CYC2)) #define EOP_C_UMULL(cond,s,rdhi,rdlo,rs,rm) \ - EMIT(((cond)<<28) | 0x00800000 | ((s)<<20) | ((rdhi)<<16) | ((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm), M3(rdhi,rdlo,s?SR:NO), M3(rs,rm,CYC2)) + EMIT(((cond)<<28) | 0x00800000 | ((s)<<20) | ((rdhi)<<16) |
((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm), M3(rdhi,rdlo,s?SR:NO), M4(rs,rm,CYC1,CYC2)) #define EOP_C_SMULL(cond,s,rdhi,rdlo,rs,rm) \ - EMIT(((cond)<<28) | 0x00c00000 | ((s)<<20) | ((rdhi)<<16) | ((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm), M3(rdhi,rdlo,s?SR:NO), M3(rs,rm,CYC2)) + EMIT(((cond)<<28) | 0x00c00000 | ((s)<<20) | ((rdhi)<<16) | ((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm), M3(rdhi,rdlo,s?SR:NO), M4(rs,rm,CYC1,CYC2)) #define EOP_C_SMLAL(cond,s,rdhi,rdlo,rs,rm) \ - EMIT(((cond)<<28) | 0x00e00000 | ((s)<<20) | ((rdhi)<<16) | ((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm), M3(rdhi,rdlo,s?SR:NO), M5(rs,rm,rdlo,rdhi,CYC2)) + EMIT(((cond)<<28) | 0x00e00000 | ((s)<<20) | ((rdhi)<<16) | ((rdlo)<<12) | ((rs)<<8) | 0x90 | (rm), M3(rdhi,rdlo,s?SR:NO), M6(rs,rm,rdlo,rdhi,CYC1,CYC2)) #define EOP_MUL(rd,rm,rs) EOP_C_MUL(A_COND_AL,0,rd,rs,rm) // note: rd != rm @@ -502,10 +489,10 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int return; } #else - for (i = 3, u = v; i > 0; i--, u >>= 8) + for (i = 2, u = v; i > 0; i--, u >>= 8) while (u > 0xff && !(u & 3)) u >>= 2; - if (u) { // 4 insns needed... + if (u) { // 3+ insns needed... 
if (op == A_OP_MVN) imm = ~imm; // ...emit literal load @@ -660,21 +647,14 @@ static inline void emith_pool_check(void) emith_pool_commit(1); } -static inline int emith_pool_index(int tcache_offs) +static inline void emith_pool_adjust(int tcache_offs, int move_offs) { u32 *ptr = (u32 *)tcache_ptr - tcache_offs; int i; for (i = literal_iindex-1; i >= 0 && literal_insn[i] >= ptr; i--) if (literal_insn[i] == ptr) - return i; - return -1; -} - -static inline void emith_pool_adjust(int pool_index, int move_offs) -{ - if (pool_index >= 0) - literal_insn[pool_index] += move_offs; + literal_insn[i] += move_offs; } #define EMITH_HINT_COND(cond) /**/ @@ -938,6 +918,7 @@ static inline void emith_pool_adjust(int pool_index, int move_offs) emith_top_imm(cond, A_OP_TST, r, imm) #define emith_move_r_imm_s8_patchable(r, imm) do { \ + emith_flush(); /* pin insn at current tcache_ptr for patching */ \ if ((s8)(imm) < 0) \ EOP_MVN_IMM(r, 0, (u8)~(imm)); \ else \ From f003b9965e41d61483abb0f3993f0488df9e85a8 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 16 May 2020 21:16:27 +0200 Subject: [PATCH 157/174] add copyright stuff to substantially changed files --- pico/32x/draw.c | 1 + pico/draw.c | 1 + pico/draw_arm.S | 1 + pico/misc.c | 1 + platform/common/host_dasm.c | 4 ++++ platform/common/menu_pico.c | 3 ++- 6 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pico/32x/draw.c b/pico/32x/draw.c index 4da70650d..ffcb5c924 100644 --- a/pico/32x/draw.c +++ b/pico/32x/draw.c @@ -1,6 +1,7 @@ /* * PicoDrive * (C) notaz, 2009,2010 + * (C) kub, 2019 * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. diff --git a/pico/draw.c b/pico/draw.c index 3f1857d97..624a2a31e 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -2,6 +2,7 @@ * line renderer * (c) Copyright Dave, 2004 * (C) notaz, 2006-2010 + * (C) kub, 2019-2020 * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. 
diff --git a/pico/draw_arm.S b/pico/draw_arm.S index 9b5a4e322..0579006cd 100644 --- a/pico/draw_arm.S +++ b/pico/draw_arm.S @@ -1,6 +1,7 @@ /* * assembly optimized versions of most funtions from draw.c * (C) notaz, 2006-2010,2017 + * (C) kub, 2020 * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. diff --git a/pico/misc.c b/pico/misc.c index 74d4d8a8e..cf09688ed 100644 --- a/pico/misc.c +++ b/pico/misc.c @@ -1,6 +1,7 @@ /* * rarely used EEPROM code * (C) notaz, 2006-2008 + * (C) kub, 2020 * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. diff --git a/platform/common/host_dasm.c b/platform/common/host_dasm.c index 2084aa91d..b3b504e8e 100644 --- a/platform/common/host_dasm.c +++ b/platform/common/host_dasm.c @@ -1,3 +1,7 @@ +/* + * DRC host disassembler interface for MIPS/ARM32 for use without binutils + * (C) kub, 2018,2019 + */ #include #include #include diff --git a/platform/common/menu_pico.c b/platform/common/menu_pico.c index 882aef924..1d46e634b 100644 --- a/platform/common/menu_pico.c +++ b/platform/common/menu_pico.c @@ -923,7 +923,8 @@ static void draw_frame_credits(void) } static const char credits[] = - "PicoDrive v" VERSION " (c) notaz, 2006-2013\n\n\n" + "PicoDrive v" VERSION "\n" + "(c) notaz, 2006-2013; irixxxx, 2018-2020\n\n" "Credits:\n" "fDave: initial code\n" #ifdef EMU_C68K From ab94bbce3ee1ee144bb01aa3eb5d5bc9b116bc97 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 16 May 2020 21:17:28 +0200 Subject: [PATCH 158/174] release 1.96 --- platform/common/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platform/common/version.h b/platform/common/version.h index a8c3034b0..cd811a665 100644 --- a/platform/common/version.h +++ b/platform/common/version.h @@ -1 +1 @@ -#define VERSION "1.95" +#define VERSION "1.96" From c9763486640af426971eccf9816eee38f9321288 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 22 May 2020 
23:14:52 +0200 Subject: [PATCH 159/174] vdp rendering, bugfix for overlapping high prio sprites --- pico/draw.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pico/draw.c b/pico/draw.c index 624a2a31e..bdd450e0b 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -1011,12 +1011,12 @@ static void DrawSpritesHiAS(unsigned char *sprited, int sh) delta<<=4; // Delta of address if (entry+1 == cnt) width = p[entry+1]; // last sprite width limited? + while (sx <= 0 && width) width--, sx+=8, tile+=delta; // Offscreen mp = mb+(sx>>3); - for (m = *mp; width; width--, sx+=8, *mp++ = m, m >>= 8, tile+=delta) + for (m = *mp; width; width--, sx+=8, tile+=delta, *mp++ = m, m >>= 8) { unsigned int pack; - if(sx<=0) continue; if(sx>=328) break; // Offscreen pack = *(unsigned int *)(PicoMem.vram + (tile & 0x7fff)); @@ -1244,12 +1244,12 @@ static void DrawSpritesForced(unsigned char *sprited) delta<<=4; // Delta of address if (entry+1 == cnt) width = p[entry+1]; // last sprite width limited? 
+ while (sx <= 0 && width) width--, sx+=8, tile+=delta; // Offscreen mp = mb+(sx>>3); - for (m = *mp; width; width--, sx+=8, *mp++ = m, m >>= 8, tile+=delta) + for (m = *mp; width; width--, sx+=8, tile+=delta, *mp++ = m, m >>= 8) { unsigned int pack; - if(sx<=0) continue; if(sx>=328) break; // Offscreen pack = *(unsigned int *)(PicoMem.vram + (tile & 0x7fff)); From b3f7eccfc8c81655bab9746025852a3fcc901fd3 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 16 Jun 2020 18:43:45 +0200 Subject: [PATCH 160/174] sh2 drc, preparations for powerpc support --- cpu/drc/emit_ppc.c | 1797 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1797 insertions(+) create mode 100644 cpu/drc/emit_ppc.c diff --git a/cpu/drc/emit_ppc.c b/cpu/drc/emit_ppc.c new file mode 100644 index 000000000..fb2ca44bf --- /dev/null +++ b/cpu/drc/emit_ppc.c @@ -0,0 +1,1797 @@ +/* + * Basic macros to emit PowerISA 2.03 64 bit instructions and some utils + * Copyright (C) 2020 kub + * + * This work is licensed under the terms of MAME license. + * See COPYING file in the top-level directory. + */ + +// WARNING: unfinished, neither thoroughly tested nor optimized. little endian only! + +// NB bit numbers are reversed in PPC (MSB is bit 0). The emith_* functions and +// macros must take this into account. + +// NB PPC was a 64 bit architecture from the onset, so basically all operations +// are operating on 64 bits. 32 bit arch was only added later on, and there are +// very few 32 bit operations (cmp*, shift/rotate, extract/insert, load/store). +// For most operations the upper bits don't spill into the lower word, for the +// others there is an appropriate 32 bit operation available. + +// NB PowerPC isn't a clean RISC design. Several insns use microcode, which is +// AFAIK notably slower than using some 2-3 non-microcode insns. So, using +// such insns should be avoided if possible. Listed in Cell handbook, App.
A: +// - shift/rotate having the amount in a register +// - arithmetic/logical having the RC flag set (except cmp*) +// - load/store algebraic (l?a*), multiple (lmw/stmw), string (ls*/sts*) +// - mtcrf (and some more SPR related, not used here) +// moreover, misaligned load/store crossing a cacheline boundary are microcoded. +// Note also that load/store string isn't available in little endian mode. + +// NB flag handling in PPC differs grossly from the ARM/X86 model. There are 8 +// fields in the condition register, each having 4 condition bits. However, only +// the EQ bit is similar to the Z flag. The CA and OV bits in the XER register +// are similar to the C and V bits, but shifts don't use CA, and cmp* doesn't +// use CA and OV. +// Moreover, there's no easy possibility to get CA and OV for 32 bit arithmetic +// since all arithmetic/logical insns use 64 bit. +// For now, use the "no flags" code from the RISCV backend. + +#define HOST_REGS 32 + +// PPC64: params: r3-r10, return: r3, temp: r0,r11-r12, saved: r14-r31 +// reserved: r0(zero), r1(stack), r2(TOC), r13(TID) +#define RET_REG 3 +#define PARAM_REGS { 3, 4, 5, 6, 7, 8, 9, 10 } +#define PRESERVED_REGS { 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,31 } +#define TEMPORARY_REGS { 11, 12 } + +#define CONTEXT_REG 31 +#define STATIC_SH2_REGS { SHR_SR,30 , SHR_R(0),29 , SHR_R(1),28 } + +// if RA is 0 in non-update memory insns, ADDI/ADDIS, ISEL, it aliases with zero +#define Z0 0 // zero register +#define SP 1 // stack pointer +// SPR registers +#define XER -1 // exception register +#define LR -8 // link register +#define CTR -9 // counter register +// internally used by code emitter: +#define AT 0 // emitter temporary (can't be fully used anyway) +#define FNZ 14 // emulated processor flags: N (bit 31) ,Z (all bits) +#define FC 15 // emulated processor flags: C (bit 0), others 0 +#define FV 16 // emulated processor flags: Nt^Ns (bit 31). 
others x + + +// PPC conditions, BO0-BO4:BI2-BI4 since we only need CR0 +#define PPC_LT 0x60 +#define PPC_GE 0x20 +#define PPC_GT 0x61 +#define PPC_LE 0x21 +#define PPC_EQ 0x62 +#define PPC_NE 0x22 +#define PPC_AL 0xa0 + +// unified conditions; virtual, not corresponding to anything real on PPC +#define DCOND_EQ 0x0 +#define DCOND_NE 0x1 +#define DCOND_HS 0x2 +#define DCOND_LO 0x3 +#define DCOND_MI 0x4 +#define DCOND_PL 0x5 +#define DCOND_VS 0x6 +#define DCOND_VC 0x7 +#define DCOND_HI 0x8 +#define DCOND_LS 0x9 +#define DCOND_GE 0xa +#define DCOND_LT 0xb +#define DCOND_GT 0xc +#define DCOND_LE 0xd + +#define DCOND_CS DCOND_LO +#define DCOND_CC DCOND_HS + +// unified insn; use right-aligned bit offsets for the bitfields +#define PPC_INSN(op, b10, b15, b20, b31) \ + (((op)<<26)|((b10)<<21)|((b15)<<16)|((b20)<<11)|((b31)<<0)) + +#define _ 0 // marker for "field unused" +#define __(n) o##n // enum marker for "undefined" +#define _CB(v,l,s,d) ((((v)>>(s))&((1<<(l))-1))<<(d)) // copy l bits + +// NB everything privileged or unneeded at 1st sight is left out +// opcode field (encoded in OPCD, bits 0-5) +enum { OP__LMA=004, OP_MULLI=007, + OP_SUBFIC, __(11), OP_CMPLI, OP_CMPI, OP_ADDIC, OP_ADDICF, OP_ADDI, OP_ADDIS, + OP_BC, __(21), OP_B, OP__CR, OP_RLWIMI, OP_RLWINM, __(26), OP_RLWNM, + OP_ORI, OP_ORIS, OP_XORI, OP_XORIS, OP_ANDI, OP_ANDIS, OP__RLD, OP__EXT, + OP_LWZ, OP_LWZU, OP_LBZ, OP_LBZU, OP_STW, OP_STWU, OP_STB, OP_STBU, + OP_LHZ, OP_LHZU, OP_LHA, OP_LHAU, OP_STH, OP_STHU, OP_LMW, OP_STMW, + /*OP_LQ=070,*/ OP__LD=072, OP__ST=076 }; +// CR subops (encoded in bits 21-31) +enum { OPC_MCRF=0, OPC_BCLR=32, OPC_BCCTR=1056 }; +// RLD subops (encoded in XO bits 27-31) +enum { OPR_RLDICL=0, OPR_RLDICR=4, OPR_RLDIC=8, OPR_RLDIMI=12, OPR_RLDCL=16, OPR_RLDCR=18 }; +// EXT subops (encoded in XO bits 21-31) +enum { + // arith/logical + OPE_CMP=0, OPE_SUBFC=16, OPE_ADDC=20, OPE_AND=56, + OPE_CMPL=64, OPE_SUBF=80, OPE_ANDC=120, OPE_NEG=208, OPE_NOR=248, + OPE_SUBFE=272, 
OPE_ADDE=276, OPE_SUBFZE=400, OPE_ADDZE=404, OPE_SUBFME=464, OPE_ADDME=468, + OPE_ADD=532, OPE_EQV=568, OPE_XOR=632, OPE_ORC=824, OPE_OR=888, OPE_NAND=952, + // shift + OPE_SLW=48, OPE_SLD=54, OPE_SRW=1072, OPE_SRD=1078, OPE_SRAW=1584, OPE_SRAD=1588, OPE_SRAWI=1648, OPE_SRADI=1652, + // extend, bitcount + OPE_CNTLZW=52, OPE_CNTLZD=116, OPE_EXTSH=1844, OPE_EXTSB=1908, OPE_EXTSW=1972, + // mult/div + OPE_MULHDU=18, OPE_MULHWU=22, OPE_MULHD=146, OPE_MULHW=150, OPE_MULLD=466, OPE_MULLW=470, + OPE_DIVDU=914, OPE_DIVWU=918, OPE_DIVD=978, OPE_DIVW=982, + // load/store indexed + OPE_LDX=42, OPE_LDUX=106, OPE_STDX=298, OPE_STDUX=362, + OPE_LWZX=46, OPE_LWZUX=110, OPE_LWAX=682, OPE_LWAUX=746, OPE_STWX=302, OPE_STWUX=366, + OPE_LBZX=174, OPE_LBZUX=238, /* no LBAX/LBAUX... */ OPE_STBX=430, OPE_STBUX=494, + OPE_LHZX=558, OPE_LHZUX=622, OPE_LHAX=686, OPE_LHAUX=750, OPE_STHX=814, OPE_STHUX=878, + // SPR, CR related + OPE_ISEL=15, OPE_MFCR=38, OPE_MTCRF=288, OPE_MFSPR=678, OPE_MTSPR=934, OPE_MCRXR=1024, +}; +// LD subops (encoded in XO bits 30-31) +enum { OPL_LD, OPL_LDU, OPL_LWA }; +// ST subops (encoded in XO bits 30-31) +enum { OPS_STD, OPS_STDU /*,OPS_STQ*/ }; + +// X*,M*-forms insns often have overflow detect in b21 and CR0 update in b31 +#define XOE (1<<10) // (31-21) +#define XRC (1<<0) // (31-31) +#define XF (XOE|XRC) +// MB and ME in M*-forms rotate left +#define MM(b,e) (((b)<<6)|((e)<<1)) +#define MD(b,s) (_CB(b,5,0,6)|_CB(b,1,5,5)|_CB(s,5,0,11)|_CB(s,1,5,1)) +// AA and LK in I,B-forms branches +#define BAA (1<<1) +#define BLK (1<<0) + +#define PPC_NOP \ + PPC_INSN(OP_ORI, 0, 0, _, 0) // ori r0, r0, 0 + +// arithmetic/logical + +#define PPC_OP_REG(op, xop, rt, ra, rb) /* X*,M*-form */ \ + PPC_INSN((unsigned)op, rt, ra, rb, xop) +#define PPC_OP_IMM(op, rt, ra, imm) /* D,B,I-form */ \ + PPC_INSN((unsigned)op, rt, ra, _, imm) + +// rt = ra OP rb +#define PPC_ADD_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_ADD,rt,ra,rb) +#define PPC_ADDC_REG(rt, ra, rb) \ + 
PPC_OP_REG(OP__EXT,OPE_ADD|XOE,rt,ra,rb) +#define PPC_SUB_REG(rt, rb, ra) /* NB reversed args (rb-ra) */ \ + PPC_OP_REG(OP__EXT,OPE_SUBF,rt,ra,rb) +#define PPC_SUBC_REG(rt, rb, ra) \ + PPC_OP_REG(OP__EXT,OPE_SUBF|XOE,rt,ra,rb) +#define PPC_NEG_REG(rt, ra) \ + PPC_OP_REG(OP__EXT,OPE_NEG,rt,ra,_) +#define PPC_NEGC_REG(rt, ra) \ + PPC_OP_REG(OP__EXT,OPE_NEG|XOE,rt,ra,_) + +#define PPC_CMP_REG(ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_CMP,1,ra,rb) +#define PPC_CMPL_REG(ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_CMPL,1,ra,rb) + +#define PPC_CMPW_REG(ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_CMP,0,ra,rb) +#define PPC_CMPLW_REG(ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_CMPL,0,ra,rb) + +#define PPC_XOR_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_XOR,rt,ra,rb) +#define PPC_OR_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_OR,rt,ra,rb) +#define PPC_ORN_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_ORC,rt,ra,rb) +#define PPC_NOR_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_NOR,rt,ra,rb) +#define PPC_AND_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_AND,rt,ra,rb) +#define PPC_BIC_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_ANDC,rt,ra,rb) + +#define PPC_MOV_REG(rt, ra) \ + PPC_OR_REG(rt, ra, ra) +#define PPC_MVN_REG(rt, ra) \ + PPC_NOR_REG(rt, ra, ra) + +// rt = ra OP rb OP carry +#define PPC_ADC_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_ADDE,rt,ra,rb) +#define PPC_SBC_REG(rt, rb, ra) \ + PPC_OP_REG(OP__EXT,OPE_SUBFE,rt,ra,rb) +#define PPC_NGC_REG(rt, ra) \ + PPC_OP_REG(OP__EXT,OPE_SUBFZE,rt,ra,_) + +// rt = ra SHIFT rb +#define PPC_LSL_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_SLD,rt,ra,rb) +#define PPC_LSR_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_SRD,rt,ra,rb) +#define PPC_ASR_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_SRAD,rt,ra,rb) +#define PPC_ROL_REG(ra, rt, rb) \ + PPC_OP_REG(OP__RLD,OPR_RLDCL,rt,ra,rb,0) + +#define PPC_LSLW_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_SLW,rt,ra,rb) +#define PPC_LSRW_REG(ra, rt, rb) \ + PPC_OP_REG(OP__EXT,OPE_SRW,rt,ra,rb) +#define PPC_ASRW_REG(ra, rt, rb) \ + 
PPC_OP_REG(OP__EXT,OPE_SRAW,rt,ra,rb) +#define PPC_ROLW_REG(ra, rt, rb) \ + PPC_OP_REG(OP_RLWNM,MM(0,31),rt,ra,rb) + +// rt = ra OP (imm16 << (0|16)) +#define PPC_ADD_IMM(rt, ra, imm16) \ + PPC_OP_IMM(OP_ADDI, rt, ra, imm16) +#define PPC_ADDT_IMM(rt, ra, imm16) \ + PPC_OP_IMM(OP_ADDIS, rt, ra, imm16) + +#define PPC_XOR_IMM(ra, rt, imm16) \ + PPC_OP_IMM(OP_XORI, rt, ra, imm16) +#define PPC_XORT_IMM(ra, rt, imm16) \ + PPC_OP_IMM(OP_XORIS, rt, ra, imm16) +#define PPC_OR_IMM(ra, rt, imm16) \ + PPC_OP_IMM(OP_ORI, rt, ra, imm16) +#define PPC_ORT_IMM(ra, rt, imm16) \ + PPC_OP_IMM(OP_ORIS, rt, ra, imm16) + +#define PPC_ANDS_IMM(rt, ra, imm16) \ + PPC_OP_IMM(OP_ANDI, rt, ra, imm16) +#define PPC_ANDTS_IMM(rt, ra, imm16) \ + PPC_OP_IMM(OP_ANDIS, rt, ra, imm16) +#define PPC_CMP_IMM(ra, imm16) \ + PPC_OP_IMM(OP_CMPI, 1, ra, imm16) +#define PPC_CMPL_IMM(ra, imm16) \ + PPC_OP_IMM(OP_CMPLI, 1, ra, imm16) + +#define PPC_CMPW_IMM(ra, imm16) \ + PPC_OP_IMM(OP_CMPI, 0, ra, imm16) +#define PPC_CMPLW_IMM(ra, imm16) \ + PPC_OP_IMM(OP_CMPLI, 0, ra, imm16) + +#define PPC_TST_IMM(rt, imm16) \ + PPC_ANDS_IMM(Z0,ra,imm16) + +#define PPC_MOV_IMM(rt, ra, imm16) \ + PPC_ADD_IMM(rt,ra,imm16) +#define PPC_MOVT_IMM(rt, ra, imm16) \ + PPC_ADDT_IMM(rt,ra,imm16) + +// rt = EXTEND ra +#define PPC_EXTSW_REG(ra, rt) \ + PPC_OP_REG(OP__EXT,OPE_EXTSW,rt,ra,_) +#define PPC_EXTSH_REG(ra, rt) \ + PPC_OP_REG(OP__EXT,OPE_EXTSH,rt,ra,_) +#define PPC_EXTSB_REG(ra, rt) \ + PPC_OP_REG(OP__EXT,OPE_EXTSB,rt,ra,_) +#define PPC_EXTUW_REG(ra, rt) \ + PPC_OP_REG(OP__RLD,OPR_RLDICL|MD(32,0),rt,ra,_) +#define PPC_EXTUH_REG(ra, rt) \ + PPC_OP_REG(OP__RLD,OPR_RLDICL|MD(48,0),rt,ra,_) +#define PPC_EXTUB_REG(ra, rt) \ + PPC_OP_REG(OP__RLD,OPR_RLDICL|MD(56,0),rt,ra,_) + +// rt = ra SHIFT imm5/imm6 +#define PPC_LSL_IMM(ra, rt, bits) \ + PPC_OP_REG(OP__RLD,OPR_RLDICR|MD(63-(bits),bits),rt,ra,_) +#define PPC_LSR_IMM(ra, rt, bits) \ + PPC_OP_REG(OP__RLD,OPR_RLDICL|MD(bits,64-(bits)),rt,ra,_) +#define PPC_ASR_IMM(ra, rt, bits) \ + 
PPC_OP_REG(OP__EXT,OPE_SRADI|MD(_,bits),rt,ra,_) +#define PPC_ROL_IMM(ra, rt, bits) \ + PPC_OP_REG(OP__RLD,OPR_RLDICL|MD(0,bits),rt,ra,_) + +#define PPC_LSLW_IMM(ra, rt, bits) \ + PPC_OP_REG(OP_RLWINM,MM(0,31-(bits)),rt,ra,bits) +#define PPC_LSRW_IMM(ra, rt, bits) \ + PPC_OP_REG(OP_RLWINM,MM(bits,31),rt,ra,32-(bits)) +#define PPC_ASRW_IMM(ra, rt, bits) \ + PPC_OP_REG(OP__EXT,OPE_SRAWI,rt,ra,bits) +#define PPC_ROLW_IMM(ra, rt, bits) \ + PPC_OP_REG(OP_RLWINM,MM(0,31),rt,ra,bits) + +// rt = EXTRACT/INSERT ra +#define PPC_BFX_IMM(ra, rt, lsb, bits) \ + PPC_OP_REG(OP__RLD,OPR_RLDICL|MD(64-(bits),63&(lsb+bits)),rt,ra,_) +#define PPC_BFXD_IMM(ra, rt, lsb, bits) /* extract to high bits, 64 bit */ \ + PPC_OP_REG(OP__RLD,OPR_RLDICR|MD(bits-1,lsb),rt,ra,_) +#define PPC_BFI_IMM(ra, rt, lsb, bits) \ + PPC_OP_REG(OP__RLD,OPR_RLDIMI|MD(lsb,64-(lsb+bits)),rt,ra,_) + +#define PPC_BFXW_IMM(ra, rt, lsb, bits) \ + PPC_OP_REG(OP_RLWINM,MM(32-(bits),31),rt,ra,31&(lsb+bits)) +#define PPC_BFXT_IMM(ra, rt, lsb, bits) /* extract to high bits, 32 bit */ \ + PPC_OP_REG(OP_RLWINM,MM(0,bits-1),rt,ra,lsb) +#define PPC_BFIW_IMM(ra, rt, lsb, bits) \ + PPC_OP_REG(OP_RLWIMI,MM(lsb,lsb+bits-1),rt,ra,32-(lsb+bits)) + +// multiplication; NB in 32 bit results the topmost 32 bits are undefined +#define PPC_MULL(rt, ra, rb) /* 64 bit */ \ + PPC_OP_REG(OP__EXT,OPE_MULLD,rt,ra,rb) +#define PPC_MUL(rt, ra, rb) /* low 32 bit */ \ + PPC_OP_REG(OP__EXT,OPE_MULLW,rt,ra,rb) +#define PPC_MULHS(rt, ra, rb) /* high 32 bit, signed */ \ + PPC_OP_REG(OP__EXT,OPE_MULHW,rt,ra,rb) +#define PPC_MULHU(rt, ra, rb) /* high 32 bit, unsigned */ \ + PPC_OP_REG(OP__EXT,OPE_MULHWU,rt,ra,rb) +// XXX use MAC* insns from the LMA group? 
+ +// branching (only PC-relative) + +#define PPC_B(offs26) \ + PPC_OP_IMM(OP_B,_,_,(offs26)&~3) +#define PPC_BL(offs26) \ + PPC_OP_IMM(OP_B,_,_,((offs26)&~3)|BLK) +#define PPC_RET() \ + PPC_OP_REG(OP__CR,OPC_BCLR,PPC_AL>>3,_,_) +#define PPC_RETCOND(cond) \ + PPC_OP_REG(OP__CR,OPC_BCLR,(cond)>>3,(cond)&0x7,_) +#define PPC_BCTRCOND(cond) \ + PPC_OP_REG(OP__CR,OPC_BCCTR,(cond)>>3,(cond)&0x7,_) +#define PPC_BLCTRCOND(cond) \ + PPC_OP_REG(OP__CR,OPC_BCCTR|BLK,(cond)>>3,(cond)&0x7,_) +#define PPC_BCOND(cond, offs19) \ + PPC_OP_IMM(OP_BC,(cond)>>3,(cond)&0x7,(offs19)&~3) + +// load/store, offset + +#define PPC_LDX_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP__LD,rt,ra,((u16)(offs16)&~3)|OPL_LD) +#define PPC_LDW_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP_LWZ,rt,ra,(u16)(offs16)) +#define PPC_LDH_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP_LHZ,rt,ra,(u16)(offs16)) +#define PPC_LDB_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP_LBZ,rt,ra,(u16)(offs16)) + +#define PPC_LDSH_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP_LHA,rt,ra,(u16)(offs16)) + +#define PPC_STX_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP__ST,rt,ra,((u16)(offs16)&~3)|OPS_STD) +#define PPC_STW_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP_STW,rt,ra,(u16)(offs16)) +#define PPC_STH_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP_STH,rt,ra,(u16)(offs16)) +#define PPC_STB_IMM(rt, ra, offs16) \ + PPC_OP_IMM(OP_STB,rt,ra,(u16)(offs16)) + +// load/store, indexed + +#define PPC_LDX_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_LDX,rt,ra,rb) +#define PPC_LDW_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_LWZX,rt,ra,rb) +#define PPC_LDH_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_LHZX,rt,ra,rb) +#define PPC_LDB_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_LBZX,rt,ra,rb) + +#define PPC_LDSH_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_LHAX,rt,ra,rb) + +#define PPC_STX_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_STX,rt,ra,rb) +#define PPC_STW_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_STWX,rt,ra,rb) +#define PPC_STH_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_STHX,rt,ra,rb) +#define 
PPC_STB_REG(rt, ra, rb) \ + PPC_OP_REG(OP__EXT,OPE_STBX,rt,ra,rb) + +// special regs: LR, CTR, XER, CR + +#define PPC_MFSP_REG(rt, spr) \ + PPC_OP_REG(OP__EXT,OPE_MFSPR,rt,_,_CB(-(spr),5,0,5)|_CB(-(spr),5,5,0)) +#define PPC_MTSP_REG(rs, spr) \ + PPC_OP_REG(OP__EXT,OPE_MTSPR,rs,_,_CB(-(spr),5,0,5)|_CB(-(spr),5,5,0)) + +#define PPC_MFCR_REG(rt) \ + PPC_OP_REG(OP__EXT,OPE_MFCR,rt,_,_) +#define PPC_MTCRF_REG(rs, fm) \ + PPC_OP_REG(OP__EXT,OPE_MTCRF,rs,_,(fm)<<1) +#define PPC_MCRXR_REG(crt) \ + PPC_OP_REG(OP__EXT,OPE_MCRXR,(crt)<<2,_,_) +#define PPC_MCRCR_REG(crt, crf) \ + PPC_OP_REG(OP__CR,OPC_MCRF,(crt)<<2,(crf)<<1,_) + +#ifdef __powerpc64__ +#define PTR_SCALE 3 +#define PPC_LDP_IMM PPC_LDX_IMM +#define PPC_LDP_REG PPC_LDX_REG +#define PPC_STP_IMM PPC_STX_IMM +#define PPC_STP_REG PPC_STX_REG +#define PPC_BFXP_IMM PPC_BFX_IMM + +// "long" multiplication, 32x32 bit = 64 bit +#define EMIT_PPC_MULLU_REG(dlo, dhi, s1, s2) do { \ + EMIT(PPC_EXTUW_REG(s1, s1)); \ + EMIT(PPC_EXTUW_REG(s2, s2)); \ + EMIT(PPC_MULL(dlo, s1, s2)); \ + EMIT(PPC_ASR_IMM(dhi, dlo, 32)); \ +} while (0) + +#define EMIT_PPC_MULLS_REG(dlo, dhi, s1, s2) do { \ + EMIT(PPC_EXTSW_REG(s1, s1)); \ + EMIT(PPC_EXTSW_REG(s2, s2)); \ + EMIT(PPC_MULL(dlo, s1, s2)); \ + EMIT(PPC_ASR_IMM(dhi, dlo, 32)); \ +} while (0) + +#define EMIT_PPC_MACLS_REG(dlo, dhi, s1, s2) do { \ + EMIT(PPC_EXTSW_REG(s1, s1)); \ + EMIT(PPC_EXTSW_REG(s2, s2)); \ + EMIT(PPC_MULL(AT, s1, s2)); \ + EMIT(PPC_BFI_IMM(dlo, dhi, 0, 32)); \ + emith_add_r_r(dlo, AT); \ + EMIT(PPC_ASR_IMM(dhi, dlo, 32)); \ +} while (0) +#else +#define PTR_SCALE 2 +#define PPC_LDP_IMM PPC_LDW_IMM +#define PPC_LDP_REG PPC_LDW_REG +#define PPC_STP_IMM PPC_STW_IMM +#define PPC_STP_REG PPC_STW_REG +#define PPC_BFXP_IMM PPC_BFXW_IMM + +// "long" multiplication, 32x32 bit = 64 bit +#define EMIT_PPC_MULLU_REG(dlo, dhi, s1, s2) do { \ + int at = (dlo == s1 || dlo == s2 ? 
AT : dlo); \ + EMIT(PPC_MUL(at, s1, s2)); \ + EMIT(PPC_MULHU(dhi, s1, s2)); \ + if (at != dlo) emith_move_r_r(dlo, at); \ +} while (0) + +#define EMIT_PPC_MULLS_REG(dlo, dhi, s1, s2) do { \ + int at = (dlo == s1 || dlo == s2 ? AT : dlo); \ + EMIT(PPC_MUL(at, s1, s2)); \ + EMIT(PPC_MULHS(dhi, s1, s2)); \ + if (at != dlo) emith_move_r_r(dlo, at); \ +} while (0) + +#define EMIT_PPC_MACLS_REG(dlo, dhi, s1, s2) do { \ + int t_ = rcache_get_tmp(); \ + EMIT_PPC_MULLS_REG(t_, AT, s1, s2); \ + EMIT(PPC_ADDC_REG(dlo, dlo, t_)); \ + EMIT(PPC_ADC_REG(dhi, dhi, AT)); \ + rcache_free_tmp(t_); \ +} while (0) +#endif +#define PTR_SIZE (1<>1 since the lowest bit inverts the cond */ \ + unsigned _mv = BITMASK3(DCOND_VS>>1,DCOND_GE>>1,DCOND_GT>>1); \ + unsigned _mc = _mv | BITMASK2(DCOND_HS>>1,DCOND_HI>>1); \ + emith_flg_hint = (_mv & BITMASK1(cond >> 1) ? _FHV : 0); \ + emith_flg_hint |= (_mc & BITMASK1(cond >> 1) ? _FHC : 0); \ +} while (0) + +// store minimal cc information: rt, rb^ra, carry +// NB: the result *must* first go to FNZ, in case rt == ra or rt == rb. +// NB: for adcf and sbcf, carry-in must be dealt with separately (see there) +static void emith_set_arith_flags(int rt, int ra, int rb, s32 imm, int sub) +{ + if (emith_flg_hint & _FHC) { + if (sub) // C = sub:rb= 0) // Nt^Ns in FV, bit 31 + EMIT(PPC_XOR_REG(FV, ra, rb)); + else if (imm == 0) + emith_flg_noV = 1; // imm #0 can't overflow + else if ((imm < 0) == !sub) + EMIT(PPC_MVN_REG(FV, ra)); + else if ((imm > 0) == !sub) + EMIT(PPC_MOV_REG(FV, ra)); + } + // full V = Nd^Nt^Ns^C calculation is deferred until really needed + + if (rt && rt != FNZ) + EMIT(PPC_MOV_REG(rt, FNZ)); // N,Z via result value in FNZ + emith_cmp_ra = emith_cmp_rb = -1; +} + +// since R5 has less-than and compare-branch insns, handle cmp separately by +// storing the involved regs for later use in one of those R5 insns. +// This works for all conditions but VC/VS, but this is fortunately never used. 
+// Record the operands of a compare instead of emitting code for it. The
+// actual compare insn is generated later by emith_cond_check(), once the
+// condition to be tested is known.
+//   ra   first operand register
+//   rb   second operand register, or -1 to compare ra against imm
+//   imm  immediate operand, only meaningful if rb is -1
+static void emith_set_compare_flags(int ra, int rb, s32 imm)
+{
+	emith_cmp_rb = rb;
+	emith_cmp_ra = ra;
+	emith_cmp_imm = imm;
+}
+
+
+// data processing, register
+//
+// Naming scheme used below:
+//   *_ptr  variant used for pointer sized values
+//   *_c    takes a condition argument which is ignored; the macro expands to
+//          the unconditional operation
+//   *f     flag setting variant: the result is computed into FNZ, and
+//          emith_set_arith_flags() then stores the flag info and copies the
+//          result to d
+
+#define emith_move_r_r_ptr(d, s) \
+	EMIT(PPC_MOV_REG(d, s))
+#define emith_move_r_r_ptr_c(cond, d, s) \
+	emith_move_r_r_ptr(d, s)
+
+#define emith_move_r_r(d, s) \
+	emith_move_r_r_ptr(d, s)
+#define emith_move_r_r_c(cond, d, s) \
+	emith_move_r_r(d, s)
+
+#define emith_mvn_r_r(d, s) \
+	EMIT(PPC_MVN_REG(d, s))
+
+// d = s1 OP (s2 SHIFT simm); the shifted operand goes through AT, hence AT
+// is clobbered whenever simm is not 0
+#define emith_add_r_r_r_lsl_ptr(d, s1, s2, simm) do { \
+	if (simm) { \
+		EMIT(PPC_LSLW_IMM(AT, s2, simm)); \
+		EMIT(PPC_ADD_REG(d, s1, AT)); \
+	} else EMIT(PPC_ADD_REG(d, s1, s2)); \
+} while (0)
+#define emith_add_r_r_r_lsl(d, s1, s2, simm) \
+	emith_add_r_r_r_lsl_ptr(d, s1, s2, simm)
+
+#define emith_add_r_r_r_lsr(d, s1, s2, simm) do { \
+	if (simm) { \
+		EMIT(PPC_LSRW_IMM(AT, s2, simm)); \
+		EMIT(PPC_ADD_REG(d, s1, AT)); \
+	} else EMIT(PPC_ADD_REG(d, s1, s2)); \
+} while (0)
+
+#define emith_addf_r_r_r_lsl_ptr(d, s1, s2, simm) do { \
+	if (simm) { \
+		EMIT(PPC_LSLW_IMM(AT, s2, simm)); \
+		EMIT(PPC_ADD_REG(FNZ, s1, AT)); \
+		emith_set_arith_flags(d, s1, AT, 0, 0); \
+	} else { \
+		EMIT(PPC_ADD_REG(FNZ, s1, s2)); \
+		emith_set_arith_flags(d, s1, s2, 0, 0); \
+	} \
+} while (0)
+#define emith_addf_r_r_r_lsl(d, s1, s2, simm) do { \
+	if (simm) { \
+		EMIT(PPC_LSLW_IMM(AT, s2, simm)); \
+		EMIT(PPC_ADD_REG(FNZ, s1, AT)); \
+		emith_set_arith_flags(d, s1, AT, 0, 0); \
+	} else { \
+		EMIT(PPC_ADD_REG(FNZ, s1, s2)); \
+		emith_set_arith_flags(d, s1, s2, 0, 0); \
+	} \
+} while (0)
+
+#define emith_addf_r_r_r_lsr(d, s1, s2, simm) do { \
+	if (simm) { \
+		EMIT(PPC_LSRW_IMM(AT, s2, simm)); \
+		EMIT(PPC_ADD_REG(FNZ, s1, AT)); \
+		emith_set_arith_flags(d, s1, AT, 0, 0); \
+	} else { \
+		EMIT(PPC_ADD_REG(FNZ, s1, s2)); \
+		emith_set_arith_flags(d, s1, s2, 0, 0); \
+	} \
+} while (0)
+
+#define emith_sub_r_r_r_lsl(d, s1, s2, simm) do { \
+	if (simm) { \
+		EMIT(PPC_LSLW_IMM(AT, s2, simm)); \
+		EMIT(PPC_SUB_REG(d, s1, AT)); \
+	} else EMIT(PPC_SUB_REG(d, s1, s2)); \
+} while (0)
+
+#define emith_subf_r_r_r_lsl(d, s1, s2, simm) do { \
+	if (simm) { \
+		EMIT(PPC_LSLW_IMM(AT, s2, simm)); \
+		EMIT(PPC_SUB_REG(FNZ, s1, AT)); \
+		emith_set_arith_flags(d, s1, AT, 0, 1); \
+	} else { \
+		EMIT(PPC_SUB_REG(FNZ, s1, s2)); \
+		emith_set_arith_flags(d, s1, s2, 0, 1); \
+	} \
+} while (0)
+
+#define emith_or_r_r_r_lsl(d, s1, s2, simm) do { \
+	if (simm) { \
+		EMIT(PPC_LSLW_IMM(AT, s2, simm)); \
+		EMIT(PPC_OR_REG(d, s1, AT)); \
+	} else EMIT(PPC_OR_REG(d, s1, s2)); \
+} while (0)
+
+#define emith_or_r_r_r_lsr(d, s1, s2, simm) do { \
+	if (simm) { \
+		EMIT(PPC_LSRW_IMM(AT, s2, simm)); \
+		EMIT(PPC_OR_REG(d, s1, AT)); \
+	} else EMIT(PPC_OR_REG(d, s1, s2)); \
+} while (0)
+
+#define emith_eor_r_r_r_lsl(d, s1, s2, simm) do { \
+	if (simm) { \
+		EMIT(PPC_LSLW_IMM(AT, s2, simm)); \
+		EMIT(PPC_XOR_REG(d, s1, AT)); \
+	} else EMIT(PPC_XOR_REG(d, s1, s2)); \
+} while (0)
+
+#define emith_eor_r_r_r_lsr(d, s1, s2, simm) do { \
+	if (simm) { \
+		EMIT(PPC_LSRW_IMM(AT, s2, simm)); \
+		EMIT(PPC_XOR_REG(d, s1, AT)); \
+	} else EMIT(PPC_XOR_REG(d, s1, s2)); \
+} while (0)
+
+#define emith_and_r_r_r_lsl(d, s1, s2, simm) do { \
+	if (simm) { \
+		EMIT(PPC_LSLW_IMM(AT, s2, simm)); \
+		EMIT(PPC_AND_REG(d, s1, AT)); \
+	} else EMIT(PPC_AND_REG(d, s1, s2)); \
+} while (0)
+
+// 2 operand forms: d = d OP (s SHIFT imm)
+#define emith_or_r_r_lsl(d, s, lslimm) \
+	emith_or_r_r_r_lsl(d, d, s, lslimm)
+#define emith_or_r_r_lsr(d, s, lsrimm) \
+	emith_or_r_r_r_lsr(d, d, s, lsrimm)
+
+#define emith_eor_r_r_lsl(d, s, lslimm) \
+	emith_eor_r_r_r_lsl(d, d, s, lslimm)
+#define emith_eor_r_r_lsr(d, s, lsrimm) \
+	emith_eor_r_r_r_lsr(d, d, s, lsrimm)
+
+// 3 operand forms without shift: d = s1 OP s2
+#define emith_add_r_r_r(d, s1, s2) \
+	emith_add_r_r_r_lsl(d, s1, s2, 0)
+
+#define emith_addf_r_r_r_ptr(d, s1, s2) \
+	emith_addf_r_r_r_lsl_ptr(d, s1, s2, 0)
+#define emith_addf_r_r_r(d, s1, s2) \
+	emith_addf_r_r_r_lsl(d, s1, s2, 0)
+
+#define emith_sub_r_r_r(d, s1, s2) \
+	emith_sub_r_r_r_lsl(d, s1, s2, 0)
+
+#define emith_subf_r_r_r(d, s1, s2) \
+	emith_subf_r_r_r_lsl(d, s1, s2, 0)
+
+#define emith_or_r_r_r(d, s1, s2) \
+	emith_or_r_r_r_lsl(d, s1, s2, 0)
+
+#define emith_eor_r_r_r(d, s1, s2) \
+	emith_eor_r_r_r_lsl(d, s1, s2, 0)
+
+#define emith_and_r_r_r(d, s1, s2) \
+	emith_and_r_r_r_lsl(d, s1, s2, 0)
+
+#define emith_add_r_r_ptr(d, s) \
+	emith_add_r_r_r_lsl_ptr(d, d, s, 0)
+#define emith_add_r_r(d, s) \
+	emith_add_r_r_r(d, d, s)
+
+#define emith_sub_r_r(d, s) \
+	emith_sub_r_r_r(d, d, s)
+
+#define emith_neg_r_r(d, s) \
+	EMIT(PPC_NEG_REG(d, s))
+
+// add/subtract with carry: the carry is taken from FC and is first added to
+// s2 in AT, so AT is clobbered
+#define emith_adc_r_r_r(d, s1, s2) do { \
+	emith_add_r_r_r(AT, s2, FC); \
+	emith_add_r_r_r(d, s1, AT); \
+} while (0)
+
+#define emith_sbc_r_r_r(d, s1, s2) do { \
+	emith_add_r_r_r(AT, s2, FC); \
+	emith_sub_r_r_r(d, s1, AT); \
+} while (0)
+
+#define emith_adc_r_r(d, s) \
+	emith_adc_r_r_r(d, d, s)
+
+#define emith_negc_r_r(d, s) do { \
+	emith_neg_r_r(d, s); \
+	emith_sub_r_r(d, FC); \
+} while (0)
+
+// NB: the incoming carry Cin can cause Cout if s2+Cin=0 (or s1+Cin=0 FWIW)
+// moreover, if s2+Cin=0 caused Cout, s1+s2+Cin=s1+0 can't cause another Cout
+// Hence the carry out of s2+Cin is kept in AT and or'ed into FC at the end.
+#define emith_adcf_r_r_r(d, s1, s2) do { \
+	emith_add_r_r_r(FNZ, s2, FC); \
+	EMIT_PPC_SLTWU_REG(AT, FNZ, FC); \
+	emith_add_r_r_r(FNZ, s1, FNZ); \
+	emith_set_arith_flags(d, s1, s2, 0, 0); \
+	emith_or_r_r(FC, AT); \
+} while (0)
+
+#define emith_sbcf_r_r_r(d, s1, s2) do { \
+	emith_add_r_r_r(FNZ, s2, FC); \
+	EMIT_PPC_SLTWU_REG(AT, FNZ, FC); \
+	emith_sub_r_r_r(FNZ, s1, FNZ); \
+	emith_set_arith_flags(d, s1, s2, 0, 1); \
+	emith_or_r_r(FC, AT); \
+} while (0)
+
+#define emith_and_r_r(d, s) \
+	emith_and_r_r_r(d, d, s)
+#define emith_and_r_r_c(cond, d, s) \
+	emith_and_r_r(d, s)
+
+#define emith_or_r_r(d, s) \
+	emith_or_r_r_r(d, d, s)
+
+#define emith_eor_r_r(d, s) \
+	emith_eor_r_r_r(d, d, s)
+
+// test: d & s goes to FNZ and any recorded compare is invalidated. Testing a
+// register against itself emits nothing; it is recorded as a compare of that
+// register with #0 instead.
+#define emith_tst_r_r_ptr(d, s) do { \
+	if ((d) != (s)) { \
+		emith_and_r_r_r(FNZ, d, s); \
+		emith_cmp_ra = emith_cmp_rb = -1; \
+	} else emith_cmp_ra = s, emith_cmp_rb = -1, emith_cmp_imm = 0; \
+} while (0)
+#define emith_tst_r_r(d, s) \
+	emith_tst_r_r_ptr(d, s)
+
+#define emith_teq_r_r(d, s) do { \
+	emith_eor_r_r_r(FNZ, d, s); \
+	emith_cmp_ra = emith_cmp_rb = -1; \
+} while (0)
+
+// compare emits nothing here, the operands are only recorded
+#define emith_cmp_r_r(d, s) \
+	emith_set_compare_flags(d, s, 0)
+//	emith_subf_r_r_r(FNZ, d, s)
+
+#define emith_addf_r_r(d, s) \
+	emith_addf_r_r_r(d, d, s)
+
+#define emith_subf_r_r(d, s) \
+	emith_subf_r_r_r(d, d, s)
+
+#define emith_adcf_r_r(d, s) \
+	emith_adcf_r_r_r(d, d, s)
+
+#define emith_sbcf_r_r(d, s) \
+	emith_sbcf_r_r_r(d, d, s)
+
+#define emith_negcf_r_r(d, s) do { \
+	emith_add_r_r_r(FNZ, s, FC); \
+	EMIT_PPC_SLTWU_REG(AT, FNZ, FC); \
+	emith_neg_r_r(FNZ, FNZ); \
+	emith_set_arith_flags(d, Z0, s, 0, 1); \
+	emith_or_r_r(FC, AT); \
+} while (0)
+
+// move immediate
+
+// Load the constant imm into register r.
+//   ptr  nonzero if imm is a pointer sized value, 0 for a 32 bit value
+// The value is built from 16 bit pieces; ADDI sign extends its operand, so
+// the upper piece is merged with XORIS instead of ORIS if the low piece is
+// negative as a signed 16 bit value.
+static void emith_move_imm(int r, int ptr, uintptr_t imm)
+{
+#ifdef __powerpc64__
+	if ((u32)imm != imm && ptr) {
+		// pointer not fitting in 32 bit: load the upper half, shift it
+		// into place, then or in the 2 lower 16 bit pieces
+		emith_move_imm(r, 0, imm >> 32);
+		if (imm >> 32)
+			EMIT(PPC_LSL_IMM(r, r, 32));
+		if (imm & 0x0000ffff)
+			EMIT(PPC_OR_IMM(r, r, imm & 0x0000ffff));
+		if (imm & 0xffff0000)
+			EMIT(PPC_ORT_IMM(r, r, (imm & 0xffff0000) >> 16));
+	} else
+#endif
+	{
+		// s = source reg of next insn, d = low part done, c = low part < 0
+		int s = Z0, d = 0, c = 0;
+		if ((u16)imm) {
+			EMIT(PPC_ADD_IMM(r, s, (u16)imm));
+			s = r, d = 1, c = (s16)imm < 0;
+		}
+		// adjust for sign extension in ADDI
+		if (!d) // low part == 0
+			EMIT(PPC_ADDT_IMM(r, s, (u16)(imm>>16)));
+		else if (c && (u16)(~imm>>16)) // low part < 0
+			EMIT(PPC_XORT_IMM(r, s, (u16)(~imm>>16)));
+		else if (!c && (u16)(imm>>16)) // low part > 0
+			EMIT(PPC_ORT_IMM(r, s, (u16)(imm>>16)));
+		// make sure to clear upper half if this is a ptr.
+		// An upper half only exists if uintptr_t is wider than 32 bit;
+		// shifting a 32 bit uintptr_t by 32 would be undefined behaviour,
+		// so check the width first and shift a 64 bit value.
+		if (sizeof(imm) > 4 && ptr && c &&
+		    !((unsigned long long)imm >> 32))
+			EMIT(PPC_EXTUW_REG(r, r));
+	}
+}
+
+#define emith_move_r_ptr_imm(r, imm) \
+	emith_move_imm(r, 1, (uintptr_t)(imm))
+
+#define emith_move_r_imm(r, imm) \
+	emith_move_imm(r, 0, (u32)(imm))
+#define emith_move_r_imm_c(cond, r, imm) \
+	emith_move_r_imm(r, imm)
+
+// load of a signed 8 bit constant as a single insn, whose 16 bit immediate
+// field can be rewritten later through emith_move_r_imm_s8_patch()
+#define emith_move_r_imm_s8_patchable(r, imm) \
+	EMIT(PPC_ADD_IMM(r, Z0, (s8)(imm)))
+#define emith_move_r_imm_s8_patch(ptr, imm) do { \
+	u32 *ptr_ = (u32 *)(ptr); \
+	EMIT_PTR(ptr_, (*ptr_ & 0xffff0000) | (u16)(s8)(imm)); \
+} while (0)
+
+// arithmetic, immediate - can only be ADDI, since SUBI doesn't exist
+
+// rt = ra + imm, using up to 2 insns (one per 16 bit half of imm).
+// If the low half of imm is 0, the insn for the upper half is always
+// emitted, since it is then the only one writing rt.
+static void emith_add_imm(int rt, int ra, u32 imm)
+{
+	int s = ra;
+	if ((u16)imm) {
+		EMIT(PPC_ADD_IMM(rt, s, (u16)imm));
+		s = rt;
+	}
+	// adjust for sign extension in ADDI
+	imm = (imm >> 16) + ((s16)imm < 0);
+	if ((u16)imm || rt != s)
+		EMIT(PPC_ADDT_IMM(rt, s, (u16)imm));
+}
+
+#define emith_add_r_imm(r, imm) \
+	emith_add_r_r_imm(r, r, imm)
+#define emith_add_r_imm_c(cond, r, imm) \
+	emith_add_r_imm(r, imm)
+
+// NB: emith_addf_r_r_imm takes (d, s, imm), so r must be passed twice
+#define emith_addf_r_imm(r, imm) \
+	emith_addf_r_r_imm(r, r, imm)
+
+#define emith_sub_r_imm(r, imm) \
+	emith_sub_r_r_imm(r, r, imm)
+#define emith_sub_r_imm_c(cond, r, imm) \
+	emith_sub_r_imm(r, imm)
+
+#define emith_subf_r_imm(r, imm) \
+	emith_subf_r_r_imm(r, r, imm)
+
+#define emith_adc_r_imm(r, imm) \
+	emith_adc_r_r_imm(r, r, imm)
+
+#define emith_adcf_r_imm(r, imm) \
+	emith_adcf_r_r_imm(r, r, imm)
+
+// compare with immediate emits nothing here; rb = -1 marks the recorded
+// compare as being against imm
+#define emith_cmp_r_imm(r, imm) \
+	emith_set_compare_flags(r, -1, imm)
+//	emith_subf_r_r_imm(FNZ, r, (s16)imm)
+
+#define emith_add_r_r_ptr_imm(d, s, imm) \
+	emith_add_imm(d, s, imm)
+
+#define emith_add_r_r_imm(d, s, imm) \
+	emith_add_r_r_ptr_imm(d, s, imm)
+
+#define emith_addf_r_r_imm(d, s, imm) do { \
+	emith_add_r_r_imm(FNZ, s, imm); \
+	emith_set_arith_flags(d, s, -1, imm, 0); \
+} while (0)
+
+#define emith_adc_r_r_imm(d, s, imm) do { \
+	emith_add_r_r_r(AT, s, FC); \
+	emith_add_r_r_imm(d, AT, imm); \
+} while (0)
+
+
+// NOTE(review): in the imm == 0 case an imm of 1 is handed to
+// emith_set_arith_flags - presumably so that the carry-in is treated as the
+// addend when deriving the flags; confirm against emith_set_arith_flags.
+#define emith_adcf_r_r_imm(d, s, imm) do { \
+	if ((imm) == 0) { \
+		emith_add_r_r_r(FNZ, s, FC); \
+		emith_set_arith_flags(d, s, -1, 1, 0); \
+	} else { \
+		emith_add_r_r_r(FNZ, s, FC); \
+		EMIT_PPC_SLTWU_REG(AT, FNZ, FC); \
+		emith_add_r_r_imm(FNZ, FNZ, imm); \
+		emith_set_arith_flags(d, s, -1, imm, 0); \
+		emith_or_r_r(FC, AT); \
+	} \
+} while (0)
+
+// NB: no SUBI, since ADDI takes a signed imm +#define emith_sub_r_r_imm(d, s, imm) \ + emith_add_r_r_imm(d, s, -(imm)) +#define emith_sub_r_r_imm_c(cond, d, s, imm) \ + emith_sub_r_r_imm(d, s, imm) + +#define emith_subf_r_r_imm(d, s, imm) do { \ + emith_sub_r_r_imm(FNZ, s, imm); \ + emith_set_arith_flags(d, s, -1, imm, 1); \ +} while (0) + +// logical, immediate + +#define emith_log_imm2(opi, opr, rt, ra, imm) do { \ + if ((imm) >> 16 || opi == OP_ANDI) { /* too big, or microcoded ANDI */ \ + emith_move_r_imm(AT, imm); \ + EMIT(PPC_OP_REG(OP__EXT, opr, ra, rt, AT)); \ + } else if (/*opi == OP_ANDI ||*/ imm || rt != ra) \ + EMIT(PPC_OP_IMM(opi, ra, rt, imm)); \ +} while (0) +#define emith_log_imm(op, rt, ra, imm) \ + emith_log_imm2(OP_##op##I, OPE_##op, rt, ra, imm) + +#define emith_and_r_imm(r, imm) \ + emith_log_imm(AND, r, r, imm) + +#define emith_or_r_imm(r, imm) \ + emith_log_imm(OR, r, r, imm) +#define emith_or_r_imm_c(cond, r, imm) \ + emith_or_r_imm(r, imm) + +#define emith_eor_r_imm_ptr(r, imm) \ + emith_log_imm(XOR, r, r, imm) +#define emith_eor_r_imm_ptr_c(cond, r, imm) \ + emith_eor_r_imm_ptr(r, imm) + +#define emith_eor_r_imm(r, imm) \ + emith_eor_r_imm_ptr(r, imm) +#define emith_eor_r_imm_c(cond, r, imm) \ + emith_eor_r_imm(r, imm) + +/* NB: BIC #imm not available; use AND #~imm instead */ +#define emith_bic_r_imm(r, imm) \ + emith_log_imm(AND, r, r, ~(imm)) +#define emith_bic_r_imm_c(cond, r, imm) \ + emith_bic_r_imm(r, imm) + +#define emith_tst_r_imm(r, imm) do { \ + emith_log_imm(AND, FNZ, r, imm); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) +#define emith_tst_r_imm_c(cond, r, imm) \ + emith_tst_r_imm(r, imm) + +#define emith_and_r_r_imm(d, s, imm) \ + emith_log_imm(AND, d, s, imm) + +#define emith_or_r_r_imm(d, s, imm) \ + emith_log_imm(OR, d, s, imm) + +#define emith_eor_r_r_imm(d, s, imm) \ + emith_log_imm(XOR, d, s, imm) + +// shift + +#define emith_lsl(d, s, cnt) \ + EMIT(PPC_LSLW_IMM(d, s, cnt)) + +#define emith_lsr(d, s, cnt) \ + 
EMIT(PPC_LSRW_IMM(d, s, cnt)) + +#define emith_asr(d, s, cnt) \ + EMIT(PPC_ASRW_IMM(d, s, cnt)) + +#define emith_ror(d, s, cnt) \ + EMIT(PPC_ROLW_IMM(d, s, 32-(cnt))) +#define emith_ror_c(cond, d, s, cnt) \ + emith_ror(d, s, cnt) + +#define emith_rol(d, s, cnt) \ + EMIT(PPC_ROLW_IMM(d, s, cnt)); \ + +#define emith_rorc(d) do { \ + emith_lsr(d, d, 1); \ + emith_lsl(AT, FC, 31); \ + emith_or_r_r(d, AT); \ +} while (0) + +#define emith_rolc(d) do { \ + emith_lsl(d, d, 1); \ + emith_or_r_r(d, FC); \ +} while (0) + +// NB: all flag setting shifts make V undefined +#define emith_lslf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_lsl(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_lsr(FC, _s, 31); \ + emith_lsl(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) + +#define emith_lsrf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_lsr(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_and_r_r_imm(FC, _s, 1); \ + emith_lsr(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) + +#define emith_asrf(d, s, cnt) do { \ + int _s = s; \ + if ((cnt) > 1) { \ + emith_asr(d, s, cnt-1); \ + _s = d; \ + } \ + if ((cnt) > 0) { \ + emith_and_r_r_imm(FC, _s, 1); \ + emith_asr(d, _s, 1); \ + } \ + emith_move_r_r(FNZ, d); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) + +#define emith_rolf(d, s, cnt) do { \ + emith_rol(d, s, cnt); \ + emith_and_r_r_imm(FC, d, 1); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) + +#define emith_rorf(d, s, cnt) do { \ + emith_ror(d, s, cnt); \ + emith_lsr(FC, d, 31); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) + +#define emith_rolcf(d) do { \ + emith_lsr(AT, d, 31); \ + emith_lsl(d, d, 1); \ + emith_or_r_r(d, FC); \ + emith_move_r_r(FC, AT); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) + +#define 
emith_rorcf(d) do { \ + emith_and_r_r_imm(AT, d, 1); \ + emith_lsr(d, d, 1); \ + emith_lsl(FC, FC, 31); \ + emith_or_r_r(d, FC); \ + emith_move_r_r(FC, AT); \ + emith_move_r_r(FNZ, d); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) + +// signed/unsigned extend + +#define emith_clear_msb(d, s, count) /* bits to clear */ \ + EMIT(PPC_BFXW_IMM(d, s, count, 32-(count))) + +#define emith_clear_msb_c(cond, d, s, count) \ + emith_clear_msb(d, s, count) + +#define emith_sext(d, s, count) /* bits to keep */ do { \ + if (count == 8) \ + EMIT(PPC_EXTSB_REG(d, s)); \ + else if (count == 16) \ + EMIT(PPC_EXTSH_REG(d, s)); \ + else { \ + emith_lsl(d, s, 32-(count)); \ + emith_asr(d, d, 32-(count)); \ + } \ +} while (0) + +#define emith_uext_ptr(r) \ + EMIT(PPC_EXTUW_REG(r, r)) + +// multiply Rd = Rn*Rm (+ Ra) + +#define emith_mul(d, s1, s2) \ + EMIT(PPC_MUL(d, s1, s2)) + +#define emith_mul_u64(dlo, dhi, s1, s2) \ + EMIT_PPC_MULLU_REG(dlo, dhi, s1, s2) + +#define emith_mul_s64(dlo, dhi, s1, s2) \ + EMIT_PPC_MULLS_REG(dlo, dhi, s1, s2) + +#define emith_mula_s64(dlo, dhi, s1, s2) \ + EMIT_PPC_MACLS_REG(dlo, dhi, s1, s2) +#define emith_mula_s64_c(cond, dlo, dhi, s1, s2) \ + emith_mula_s64(dlo, dhi, s1, s2) + +// load/store. 
offs has 16 bits signed, which is currently sufficient +#define emith_read_r_r_offs_ptr(r, ra, offs) \ + EMIT(PPC_LDP_IMM(r, ra, offs)) +#define emith_read_r_r_offs_ptr_c(cond, r, ra, offs) \ + emith_read_r_r_offs_ptr(r, ra, offs) + +#define emith_read_r_r_offs(r, ra, offs) \ + EMIT(PPC_LDW_IMM(r, ra, offs)) +#define emith_read_r_r_offs_c(cond, r, ra, offs) \ + emith_read_r_r_offs(r, ra, offs) + +#define emith_read_r_r_r_ptr(r, ra, rm) \ + EMIT(PPC_LDP_REG(r, ra, rm)) + +#define emith_read_r_r_r(r, ra, rm) \ + EMIT(PPC_LDW_REG(r, ra, rm)) +#define emith_read_r_r_r_c(cond, r, ra, rm) \ + emith_read_r_r_r(r, ra, rm) + +#define emith_read8_r_r_offs(r, ra, offs) \ + EMIT(PPC_LDB_IMM(r, ra, offs)) +#define emith_read8_r_r_offs_c(cond, r, ra, offs) \ + emith_read8_r_r_offs(r, ra, offs) + +#define emith_read8_r_r_r(r, ra, rm) \ + EMIT(PPC_LDB_REG(r, ra, rm)) +#define emith_read8_r_r_r_c(cond, r, ra, rm) \ + emith_read8_r_r_r(r, ra, rm) + +#define emith_read16_r_r_offs(r, ra, offs) \ + EMIT(PPC_LDH_IMM(r, ra, offs)) +#define emith_read16_r_r_offs_c(cond, r, ra, offs) \ + emith_read16_r_r_offs(r, ra, offs) + +#define emith_read16_r_r_r(r, ra, rm) \ + EMIT(PPC_LDH_REG(r, ra, rm)) +#define emith_read16_r_r_r_c(cond, r, ra, rm) \ + emith_read16_r_r_r(r, ra, rm) + +#define emith_read8s_r_r_offs(r, ra, offs) do { \ + EMIT(PPC_LDB_IMM(r, ra, offs)); \ + EMIT(PPC_EXTSB_REG(r, r)); \ +} while (0) +#define emith_read8s_r_r_offs_c(cond, r, ra, offs) \ + emith_read8s_r_r_offs(r, ra, offs) + +#define emith_read8s_r_r_r(r, ra, rm) do { \ + EMIT(PPC_LDB_REG(r, ra, rm)); \ + EMIT(PPC_EXTSB_REG(r, r)); \ +} while (0) +#define emith_read8s_r_r_r_c(cond, r, ra, rm) \ + emith_read8s_r_r_r(r, ra, rm) + +#define emith_read16s_r_r_offs(r, ra, offs) \ + EMIT(PPC_LDSH_IMM(r, ra, offs)) +#define emith_read16s_r_r_offs_c(cond, r, ra, offs) \ + emith_read16s_r_r_offs(r, ra, offs) + +#define emith_read16s_r_r_r(r, ra, rm) \ + EMIT(PPC_LDSH_REG(r, ra, rm)) +#define emith_read16s_r_r_r_c(cond, r, ra, 
rm) \ + emith_read16s_r_r_r(r, ra, rm) + + +#define emith_write_r_r_offs_ptr(r, ra, offs) \ + EMIT(PPC_STP_IMM(r, ra, offs)) +#define emith_write_r_r_offs_ptr_c(cond, r, ra, offs) \ + emith_write_r_r_offs_ptr(r, ra, offs) + +#define emith_write_r_r_r_ptr(r, ra, rm) \ + EMIT(PPC_STP_REG(r, ra, rm)) +#define emith_write_r_r_r_ptr_c(cond, r, ra, rm) \ + emith_write_r_r_r_ptr(r, ra, rm) + +#define emith_write_r_r_offs(r, ra, offs) \ + EMIT(PPC_STW_IMM(r, ra, offs)) +#define emith_write_r_r_offs_c(cond, r, ra, offs) \ + emith_write_r_r_offs(r, ra, offs) + +#define emith_write_r_r_r(r, ra, rm) \ + EMIT(PPC_STW_REG(r, ra, rm)) +#define emith_write_r_r_r_c(cond, r, ra, rm) \ + emith_write_r_r_r(r, ra, rm) + +#define emith_ctx_read_ptr(r, offs) \ + emith_read_r_r_offs_ptr(r, CONTEXT_REG, offs) + +#define emith_ctx_read(r, offs) \ + emith_read_r_r_offs(r, CONTEXT_REG, offs) +#define emith_ctx_read_c(cond, r, offs) \ + emith_ctx_read(r, offs) + +#define emith_ctx_write_ptr(r, offs) \ + emith_write_r_r_offs_ptr(r, CONTEXT_REG, offs) + +#define emith_ctx_write(r, offs) \ + emith_write_r_r_offs(r, CONTEXT_REG, offs) + +#define emith_ctx_read_multiple(r, offs, cnt, tmpr) do { \ + int r_ = r, offs_ = offs, cnt_ = cnt; \ + for (; cnt_ > 0; r_++, offs_ += 4, cnt_--) \ + emith_ctx_read(r_, offs_); \ +} while (0) + +#define emith_ctx_write_multiple(r, offs, cnt, tmpr) do { \ + int r_ = r, offs_ = offs, cnt_ = cnt; \ + for (; cnt_ > 0; r_++, offs_ += 4, cnt_--) \ + emith_ctx_write(r_, offs_); \ +} while (0) + +// function call handling +#define emith_save_caller_regs(mask) do { \ + int _c, _z = PTR_SIZE; u32 _m = mask & 0x1ff8; /* r3-r12 */ \ + if (__builtin_parity(_m) == 1) _m |= 0x1; /* ABI align */ \ + int _s = count_bits(_m) * _z, _o = _s; \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, -_s); \ + for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + if (_m & (1 << _c)) \ + { _o -= _z; if (_c) emith_write_r_r_offs_ptr(_c, SP, _o); } \ +} while (0) + +#define 
emith_restore_caller_regs(mask) do { \ + int _c, _z = PTR_SIZE; u32 _m = mask & 0x1ff8; \ + if (__builtin_parity(_m) == 1) _m |= 0x1; \ + int _s = count_bits(_m) * _z, _o = 0; \ + for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ + if (_m & (1 << _c)) \ + { if (_c) emith_read_r_r_offs_ptr(_c, SP, _o); _o += _z; } \ + if (_s) emith_add_r_r_ptr_imm(SP, SP, _s); \ +} while (0) + +#define host_arg2reg(rt, arg) \ + rt = (arg+3) + +#define emith_pass_arg_r(arg, reg) \ + emith_move_r_r(arg, reg) + +#define emith_pass_arg_imm(arg, imm) \ + emith_move_r_imm(arg, imm) + +// branching +#define emith_invert_branch(cond) /* inverted conditional branch */ \ + ((cond) ^ 0x40) + +// evaluate the emulated condition, returns a register/branch type pair +static int emith_cmpr_check(int rs, int rt, int cond, u32 *op) +{ + int b = -1; + + // condition check for comparing 2 registers + switch (cond) { + case DCOND_EQ: *op = PPC_CMPW_REG(rs, rt); b = PPC_EQ; break; + case DCOND_NE: *op = PPC_CMPW_REG(rs, rt); b = PPC_NE; break; + case DCOND_LO: *op = PPC_CMPLW_REG(rs, rt); b = PPC_LT; break; + case DCOND_HS: *op = PPC_CMPLW_REG(rs, rt); b = PPC_GE; break; + case DCOND_LS: *op = PPC_CMPLW_REG(rs, rt); b = PPC_LE; break; + case DCOND_HI: *op = PPC_CMPLW_REG(rs, rt); b = PPC_GT; break; + case DCOND_LT: *op = PPC_CMPW_REG(rs, rt); b = PPC_LT; break; + case DCOND_GE: *op = PPC_CMPW_REG(rs, rt); b = PPC_GE; break; + case DCOND_LE: *op = PPC_CMPW_REG(rs, rt); b = PPC_LE; break; + case DCOND_GT: *op = PPC_CMPW_REG(rs, rt); b = PPC_GT; break; + } + + return b; +} + +static int emith_cmpi_check(int rs, s32 imm, int cond, u32 *op) +{ + int b = -1; + + // condition check for comparing register with immediate + switch (cond) { + case DCOND_EQ: *op = PPC_CMPW_IMM(rs, (u16)imm), b = PPC_EQ; break; + case DCOND_NE: *op = PPC_CMPW_IMM(rs, (u16)imm), b = PPC_NE; break; + case DCOND_LO: *op = PPC_CMPLW_IMM(rs, (u16)imm), b = PPC_LT; break; + case DCOND_HS: *op = PPC_CMPLW_IMM(rs, (u16)imm), b = 
PPC_GE; break; + case DCOND_LS: *op = PPC_CMPLW_IMM(rs, (u16)imm), b = PPC_LE; break; + case DCOND_HI: *op = PPC_CMPLW_IMM(rs, (u16)imm), b = PPC_GT; break; + case DCOND_LT: *op = PPC_CMPW_IMM(rs, (u16)imm), b = PPC_LT; break; + case DCOND_GE: *op = PPC_CMPW_IMM(rs, (u16)imm), b = PPC_GE; break; + case DCOND_LE: *op = PPC_CMPW_IMM(rs, (u16)imm), b = PPC_LE; break; + case DCOND_GT: *op = PPC_CMPW_IMM(rs, (u16)imm), b = PPC_GT; break; + } + + return b; +} + +static int emith_cond_check(int cond) +{ + int b = -1; + u32 op = 0; + + if (emith_cmp_ra >= 0) { + if (emith_cmp_rb != -1) + b = emith_cmpr_check(emith_cmp_ra,emith_cmp_rb, cond,&op); + else b = emith_cmpi_check(emith_cmp_ra,emith_cmp_imm,cond,&op); + } + + // shortcut for V known to be 0 + if (b < 0 && emith_flg_noV) switch (cond) { + case DCOND_VS: /* no branch */ break; // never + case DCOND_VC: b = PPC_AL; break; // always + case DCOND_LT: op = PPC_CMPW_IMM(FNZ, 0); b = PPC_LT; break; // N + case DCOND_GE: op = PPC_CMPW_IMM(FNZ, 0); b = PPC_GE; break; // !N + case DCOND_LE: op = PPC_CMPW_IMM(FNZ, 0); b = PPC_LE; break; // N || Z + case DCOND_GT: op = PPC_CMPW_IMM(FNZ, 0); b = PPC_GT; break; // !N && !Z + } + + // the full monty if no shortcut + if (b < 0) switch (cond) { + // conditions using NZ + case DCOND_EQ: op = PPC_CMPW_IMM(FNZ, 0); b = PPC_EQ; break; // Z + case DCOND_NE: op = PPC_CMPW_IMM(FNZ, 0); b = PPC_NE; break; // !Z + case DCOND_MI: op = PPC_CMPW_IMM(FNZ, 0); b = PPC_LT; break; // N + case DCOND_PL: op = PPC_CMPW_IMM(FNZ, 0); b = PPC_GE; break; // !N + // conditions using C + case DCOND_LO: op = PPC_CMPW_IMM(FC , 0); b = PPC_NE; break; // C + case DCOND_HS: op = PPC_CMPW_IMM(FC , 0); b = PPC_EQ; break; // !C + // conditions using CZ + case DCOND_LS: // C || Z + case DCOND_HI: // !C && !Z + EMIT(PPC_ADD_IMM(AT, FC, -1)); // !C && !Z + EMIT(PPC_AND_REG(AT, FNZ, AT)); + op = PPC_CMPW_IMM(AT , 0); b = (cond == DCOND_HI ? 
PPC_NE : PPC_EQ); + break; + + // conditions using V + case DCOND_VS: // V + case DCOND_VC: // !V + EMIT(PPC_XOR_REG(AT, FV, FNZ)); // V = Nt^Ns^Nd^C + EMIT(PPC_LSRW_IMM(AT, AT, 31)); + EMIT(PPC_XOR_REG(AT, AT, FC)); + op = PPC_CMPW_IMM(AT , 0); b = (cond == DCOND_VS ? PPC_NE : PPC_EQ); + break; + // conditions using VNZ + case DCOND_LT: // N^V + case DCOND_GE: // !(N^V) + EMIT(PPC_LSRW_IMM(AT, FV, 31)); // Nd^V = Nt^Ns^C + EMIT(PPC_XOR_REG(AT, FC, AT)); + op = PPC_CMPW_IMM(AT , 0); b = (cond == DCOND_LT ? PPC_NE : PPC_EQ); + break; + case DCOND_LE: // (N^V) || Z + case DCOND_GT: // !(N^V) && !Z + EMIT(PPC_LSRW_IMM(AT, FV, 31)); // Nd^V = Nt^Ns^C + EMIT(PPC_XOR_REG(AT, FC, AT)); + EMIT(PPC_ADD_IMM(AT, AT, -1)); // !(Nd^V) && !Z + EMIT(PPC_AND_REG(AT, FNZ, AT)); + op = PPC_CMPW_IMM(AT , 0); b = (cond == DCOND_GT ? PPC_NE : PPC_EQ); + break; + } + + if (op) EMIT(op); + return b; +} + +#define emith_jump(target) do { \ + u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \ + EMIT(PPC_B((uintptr_t)disp_ & 0x03ffffff)); \ +} while (0) +#define emith_jump_patchable(target) \ + emith_jump(target) + +// NB: PPC conditional branches have only +/- 64KB range +#define emith_jump_cond(cond, target) do { \ + int mcond_ = emith_cond_check(cond); \ + u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \ + EMIT(PPC_BCOND(mcond_,disp_ & 0x0000ffff)); \ +} while (0) +#define emith_jump_cond_patchable(cond, target) \ + emith_jump_cond(cond, target) + +#define emith_jump_cond_inrange(target) \ + ((u8 *)target - (u8 *)tcache_ptr < 0x8000 && \ + (u8 *)target - (u8 *)tcache_ptr >= -0x8000+0x10) //mind cond_check + +// NB: returns position of patch for cache maintenance +#define emith_jump_patch(ptr, target, pos) do { \ + u32 *ptr_ = (u32 *)ptr; /* must skip condition check code */ \ + u32 disp_, mask_; \ + while (*ptr_>>26 != OP_BC && *ptr_>>26 != OP_B) ptr_ ++; \ + disp_ = (u8 *)target - (u8 *)ptr_; \ + mask_ = (*ptr_>>26 == OP_BC ? 
0xffff0003 : 0xfc000003); \ + EMIT_PTR(ptr_, (*ptr_ & mask_) | (disp_ & ~mask_)); \ + if ((void *)(pos) != NULL) *(u8 **)(pos) = (u8 *)(ptr_-1); \ +} while (0) + +#define emith_jump_patch_inrange(ptr, target) \ + ((u8 *)target - (u8 *)ptr < 0x8000 && \ + (u8 *)target - (u8 *)ptr >= -0x8000+0x10) // mind cond_check +#define emith_jump_patch_size() 4 + +#define emith_jump_at(ptr, target) do { \ + u32 disp_ = (u8 *)target - (u8 *)ptr; \ + u32 *ptr_ = (u32 *)ptr; \ + EMIT_PTR(ptr_, PPC_B((uintptr_t)disp_ & 0x03ffffff)); \ +} while (0) +#define emith_jump_at_size() 4 + +#define emith_jump_reg(r) do { \ + EMIT(PPC_MTSP_REG(r, CTR)); \ + EMIT(PPC_BCTRCOND(PPC_AL)); \ +} while(0) +#define emith_jump_reg_c(cond, r) \ + emith_jump_reg(r) + +#define emith_jump_ctx(offs) do { \ + emith_ctx_read_ptr(AT, offs); \ + emith_jump_reg(AT); \ +} while (0) +#define emith_jump_ctx_c(cond, offs) \ + emith_jump_ctx(offs) + +#define emith_call(target) do { \ + u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \ + EMIT(PPC_BL((uintptr_t)disp_ & 0x03ffffff)); \ +} while(0) +#define emith_call_cond(cond, target) \ + emith_call(target) + +#define emith_call_reg(r) do { \ + EMIT(PPC_MTSP_REG(r, CTR)); \ + EMIT(PPC_BLCTRCOND(PPC_AL)); \ +} while(0) + +#define emith_call_ctx(offs) do { \ + emith_ctx_read_ptr(AT, offs); \ + emith_call_reg(AT); \ +} while (0) + +#define emith_call_cleanup() /**/ + +#define emith_ret() \ + EMIT(PPC_RET()) +#define emith_ret_c(cond) \ + emith_ret() + +#define emith_ret_to_ctx(offs) do { \ + EMIT(PPC_MFSP_REG(AT, LR)); \ + emith_ctx_write_ptr(AT, offs); \ +} while (0) + +#define emith_add_r_ret(r) do { \ + EMIT(PPC_MFSP_REG(AT, LR)); \ + emith_add_r_r_ptr(r, AT); \ +} while (0) + +// NB: ABI SP alignment is 16 in 64 bit mode +#define emith_push_ret(r) do { \ + int offs_ = 16 - 2*PTR_SIZE; \ + emith_add_r_r_ptr_imm(SP, SP, -16); \ + EMIT(PPC_MFSP_REG(AT, LR)); \ + emith_write_r_r_offs_ptr(AT, SP, offs_ + PTR_SIZE); \ + if ((r) > 0) emith_write_r_r_offs(r, SP, offs_); \ +} 
while (0) + +#define emith_pop_and_ret(r) do { \ + int offs_ = 16 - 2*PTR_SIZE; \ + if ((r) > 0) emith_read_r_r_offs(r, SP, offs_); \ + emith_read_r_r_offs_ptr(AT, SP, offs_ + PTR_SIZE); \ + EMIT(PPC_MTSP_REG(AT, LR)); \ + emith_add_r_r_ptr_imm(SP, SP, 16); \ + emith_ret(); \ +} while (0) + + +// emitter ABI stuff +#define emith_pool_check() /**/ +#define emith_pool_commit(j) /**/ +#define emith_insn_ptr() ((u8 *)tcache_ptr) +#define emith_flush() /**/ +#define host_instructions_updated(base, end) __builtin___clear_cache(base, end) +#define emith_update_cache() /**/ +#define emith_rw_offs_max() 0x7fff + +// SH2 drc specific +#define STACK_EXTRA (64+48) // Param, ABI (LR,CR,FP etc) save areas +#define emith_sh2_drc_entry() do { \ + int _c, _z = PTR_SIZE; u32 _m = 0xffffc000; /* r14-r30 */ \ + if (__builtin_parity(_m) == 1) _m |= 0x1; /* ABI align for SP is 16 */ \ + int _s = count_bits(_m) * _z, _o = 0; \ + for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ + if (_m & (1 << _c)) \ + { _o -= _z; if (_c) emith_write_r_r_offs_ptr(_c, SP, _o); } \ + EMIT(PPC_MFSP_REG(10, LR)); \ + emith_write_r_r_offs_ptr(10, SP, 16); \ + emith_write_r_r_offs_ptr(SP, SP, -_s-STACK_EXTRA); /* XXX stdu */ \ + emith_add_r_r_ptr_imm(SP, SP, -_s-STACK_EXTRA); \ +} while (0) +#define emith_sh2_drc_exit() do { \ + int _c, _z = PTR_SIZE; u32 _m = 0xffffc000; \ + if (__builtin_parity(_m) == 1) _m |= 0x1; \ + int _s = count_bits(_m) * _z, _o = STACK_EXTRA; \ + for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ + if (_m & (1 << _c)) \ + { if (_c) emith_read_r_r_offs_ptr(_c, SP, _o); _o += _z; } \ + emith_add_r_r_ptr_imm(SP, SP, _s+STACK_EXTRA); \ + emith_read_r_r_offs_ptr(10, SP, 16); \ + EMIT(PPC_MTSP_REG(10, LR)); \ + emith_ret(); \ +} while (0) + +// NB: assumes a is in arg0, tab, func and mask are temp +#define emith_sh2_rcall(a, tab, func, mask) do { \ + emith_lsr(mask, a, SH2_READ_SHIFT); \ + emith_add_r_r_r_lsl_ptr(tab, tab, mask, PTR_SCALE+1); \ + 
emith_read_r_r_offs_ptr(func, tab, 0); \ + emith_read_r_r_offs(mask, tab, PTR_SIZE); \ + EMIT(PPC_BFXP_IMM(FC, func, 0, 1)); \ + emith_add_r_r_ptr(func, func); \ + emith_cmp_ra = emith_cmp_rb = -1; \ +} while (0) + +// NB: assumes a, val are in arg0 and arg1, tab and func are temp +#define emith_sh2_wcall(a, val, tab, func) do { \ + emith_lsr(func, a, SH2_WRITE_SHIFT); \ + emith_lsl(func, func, PTR_SCALE); \ + emith_read_r_r_r_ptr(func, tab, func); \ + emith_move_r_r_ptr(5, CONTEXT_REG); /* arg2 */ \ + emith_jump_reg(func); \ +} while (0) + +#define emith_sh2_delay_loop(cycles, reg) do { \ + int sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); \ + int t1 = rcache_get_tmp(); \ + int t2 = rcache_get_tmp(); \ + int t3 = rcache_get_tmp(); \ + /* if (sr < 0) return */ \ + emith_cmp_r_imm(sr, 0); \ + EMITH_JMP_START(DCOND_LE); \ + /* turns = sr.cycles / cycles */ \ + emith_asr(t2, sr, 12); \ + emith_move_r_imm(t3, (u32)((1ULL<<32) / (cycles)) + 1); \ + emith_mul_u64(t1, t2, t2, t3); /* multiply by 1/x */ \ + rcache_free_tmp(t3); \ + if (reg >= 0) { \ + /* if (reg <= turns) turns = reg-1 */ \ + t3 = rcache_get_reg(reg, RC_GR_RMW, NULL); \ + emith_cmp_r_r(t3, t2); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_sub_r_r_imm_c(DCOND_LS, t2, t3, 1); \ + EMITH_SJMP_END(DCOND_HI); \ + /* if (reg <= 1) turns = 0 */ \ + emith_cmp_r_imm(t3, 1); \ + EMITH_SJMP_START(DCOND_HI); \ + emith_move_r_imm_c(DCOND_LS, t2, 0); \ + EMITH_SJMP_END(DCOND_HI); \ + /* reg -= turns */ \ + emith_sub_r_r(t3, t2); \ + } \ + /* sr.cycles -= turns * cycles; */ \ + emith_move_r_imm(t1, cycles); \ + emith_mul(t1, t2, t1); \ + emith_sub_r_r_r_lsl(sr, sr, t1, 12); \ + EMITH_JMP_END(DCOND_LE); \ + rcache_free_tmp(t1); \ + rcache_free_tmp(t2); \ +} while (0) + +/* + * T = !carry(Rn = (Rn << 1) | T) + * if Q + * C = carry(Rn += Rm) + * else + * C = carry(Rn -= Rm) + * T ^= C + */ +#define emith_sh2_div1_step(rn, rm, sr) do { \ + int t_ = rcache_get_tmp(); \ + emith_and_r_r_imm(AT, sr, T); \ + emith_lsr(FC, rn, 31); 
/*Rn = (Rn<<1)+T*/ \ + emith_lsl(t_, rn, 1); \ + emith_or_r_r(t_, AT); \ + emith_or_r_imm(sr, T); /* T = !carry */ \ + emith_eor_r_r(sr, FC); \ + emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ + EMITH_JMP3_START(DCOND_EQ); \ + emith_add_r_r_r(rn, t_, rm); \ + EMIT_PPC_SLTWU_REG(FC, rn, t_); \ + EMITH_JMP3_MID(DCOND_EQ); \ + emith_sub_r_r_r(rn, t_, rm); \ + EMIT_PPC_SLTWU_REG(FC, t_, rn); \ + EMITH_JMP3_END(); \ + emith_eor_r_r(sr, FC); /* T ^= carry */ \ + rcache_free_tmp(t_); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macl(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* MACH top 16 bits unused if saturated. sign ext for overfl detect */ \ + emith_sext(mh, mh, 16); \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 17 bits of MACH aren't all 1 or 0 */ \ + /* to check: add MACH >> 31 to MACH >> 15. this is 0 if no overflow */ \ + emith_asr(rn, mh, 15); \ + emith_add_r_r_r_lsr(rn, rn, mh, 31); /* sum = (MACH>>31)+(MACH>>15) */ \ + emith_tst_r_r(rn, rn); /* (need only N and Z flags) */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> ov */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x0000); /* -overflow */ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x8000); \ + EMITH_SJMP_START(DCOND_PL); /* sum > 0 -> +ovl */ \ + emith_sub_r_imm_c(DCOND_MI, ml, 1); /* 0xffffffff */ \ + emith_sub_r_imm_c(DCOND_MI, mh, 1); /* 0x00007fff */ \ + EMITH_SJMP_END(DCOND_PL); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +/* mh:ml += rn*rm, does saturation if required by S bit. rn, rm must be TEMP */ +#define emith_sh2_macw(ml, mh, rn, rm, sr) do { \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* XXX: MACH should be untouched when S is set? 
*/ \ + emith_asr(mh, ml, 31); /* sign ext MACL to MACH for ovrfl check */ \ + EMITH_SJMP_END(DCOND_EQ); \ + emith_mula_s64(ml, mh, rn, rm); \ + emith_tst_r_imm(sr, S); \ + EMITH_SJMP_START(DCOND_EQ); \ + /* overflow if top 33 bits of MACH:MACL aren't all 1 or 0 */ \ + /* to check: add MACL[31] to MACH. this is 0 if no overflow */ \ + emith_lsr(rn, ml, 31); \ + emith_add_r_r(rn, mh); /* sum = MACH + ((MACL>>31)&1) */ \ + emith_tst_r_r(rn, rn); /* (need only N and Z flags) */ \ + EMITH_SJMP_START(DCOND_EQ); /* sum != 0 -> overflow */ \ + /* XXX: LSB signalling only in SH1, or in SH2 too? */ \ + emith_move_r_imm_c(DCOND_NE, mh, 0x00000001); /* LSB of MACH */ \ + emith_move_r_imm_c(DCOND_NE, ml, 0x80000000); /* negative ovrfl */ \ + EMITH_SJMP_START(DCOND_PL); /* sum > 0 -> positive ovrfl */ \ + emith_sub_r_imm_c(DCOND_MI, ml, 1); /* 0x7fffffff */ \ + EMITH_SJMP_END(DCOND_PL); \ + EMITH_SJMP_END(DCOND_EQ); \ + EMITH_SJMP_END(DCOND_EQ); \ +} while (0) + +#define emith_write_sr(sr, srcr) \ + EMIT(PPC_BFIW_IMM(sr, srcr, 22, 10)) + +#define emith_carry_to_t(sr, is_sub) \ + EMIT(PPC_BFIW_IMM(sr, FC, 31, 1)) + +#define emith_t_to_carry(sr, is_sub) \ + emith_and_r_r_imm(FC, sr, 1) + +#define emith_tpop_carry(sr, is_sub) do { \ + emith_and_r_r_imm(FC, sr, 1); \ + emith_eor_r_r(sr, FC); \ +} while (0) + +#define emith_tpush_carry(sr, is_sub) \ + emith_or_r_r(sr, FC) + +#ifdef T +#define emith_invert_cond(cond) \ + ((cond) ^ 1) + +// T bit handling +static void emith_clr_t_cond(int sr) +{ + emith_bic_r_imm(sr, T); +} + +static void emith_set_t_cond(int sr, int cond) +{ + int b; + u8 *ptr; + u32 val = 0; + + // XXX optimization + b = emith_invert_branch(emith_cond_check(cond)); + ptr = tcache_ptr; + EMIT(PPC_BCOND(b, 0)); + emith_or_r_imm(sr, T); + val = (u8 *)tcache_ptr - (u8 *)(ptr); + EMIT_PTR(ptr, PPC_BCOND(b, val & 0x00001fff)); +} + +#define emith_get_t_cond() -1 + +#define emith_sync_t(sr) ((void)sr) + +#define emith_invalidate_t() + +static void emith_set_t(int sr, int 
val) +{ + if (val) + emith_or_r_imm(sr, T); + else + emith_bic_r_imm(sr, T); +} + +static int emith_tst_t(int sr, int tf) +{ + emith_tst_r_imm(sr, T); + return tf ? DCOND_NE: DCOND_EQ; +} +#endif From 9760c3cdbffa42f91cacec7ec9b13798c117e18f Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 19 Jun 2020 00:14:28 +0200 Subject: [PATCH 161/174] sh2 drc, add powerpc64le backend --- Makefile | 6 +- cpu/drc/emit_arm.c | 2 + cpu/drc/emit_arm64.c | 1 + cpu/drc/emit_mips.c | 1 + cpu/drc/emit_ppc.c | 234 +++++++++++++++++++------------------------ cpu/drc/emit_riscv.c | 1 + cpu/drc/emit_x86.c | 2 + cpu/sh2/compiler.c | 9 +- cpu/sh2/compiler.h | 3 + platform/linux/emu.c | 2 +- 10 files changed, 129 insertions(+), 132 deletions(-) diff --git a/Makefile b/Makefile index b38b54c02..903f93696 100644 --- a/Makefile +++ b/Makefile @@ -79,6 +79,10 @@ else ifneq (,$(findstring riscv,$(ARCH))) use_fame ?= 1 use_cz80 ?= 1 use_sh2drc ?= 1 +else ifneq (,$(findstring powerpc,$(ARCH))) +use_fame ?= 1 +use_cz80 ?= 1 +use_sh2drc ?= 1 endif -include Makefile.local @@ -292,7 +296,7 @@ pico/carthw_cfg.c: pico/carthw.cfg # random deps pico/carthw/svp/compiler.o : cpu/drc/emit_arm.c -cpu/sh2/compiler.o : cpu/drc/emit_arm.c cpu/drc/emit_arm64.c +cpu/sh2/compiler.o : cpu/drc/emit_arm.c cpu/drc/emit_arm64.c cpu/drc/emit_ppc.c cpu/sh2/compiler.o : cpu/drc/emit_x86.c cpu/drc/emit_mips.c cpu/drc/emit_riscv.c cpu/sh2/mame/sh2pico.o : cpu/sh2/mame/sh2.c pico/pico.o pico/cd/mcd.o pico/32x/32x.o : pico/pico_cmn.c pico/pico_int.h diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index e27054a30..3f373435b 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -1138,6 +1138,8 @@ static inline void emith_pool_adjust(int tcache_offs, int move_offs) EOP_MOV_REG_ASR(d,d,32 - (bits)); \ } while (0) +#define emith_uext_ptr(r) /**/ + #define emith_do_caller_regs(mask, func) do { \ u32 _reg_mask = (mask) & 0x500f; \ if (_reg_mask) { \ diff --git a/cpu/drc/emit_arm64.c b/cpu/drc/emit_arm64.c index 
f4645bc15..ae7077a01 100644 --- a/cpu/drc/emit_arm64.c +++ b/cpu/drc/emit_arm64.c @@ -1176,6 +1176,7 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode) #define host_instructions_updated(base, end) __builtin___clear_cache(base, end) #define emith_update_cache() /**/ #define emith_rw_offs_max() 0xff +#define emith_uext_ptr(r) /**/ // SH2 drc specific diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index 8cb094deb..8eddd2196 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -1563,6 +1563,7 @@ static int emith_cond_check(int cond, int *r) #define host_instructions_updated(base, end) __builtin___clear_cache(base, end) #define emith_update_cache() /**/ #define emith_rw_offs_max() 0x7fff +#define emith_uext_ptr(r) /**/ // SH2 drc specific #define emith_sh2_drc_entry() do { \ diff --git a/cpu/drc/emit_ppc.c b/cpu/drc/emit_ppc.c index fb2ca44bf..286d41668 100644 --- a/cpu/drc/emit_ppc.c +++ b/cpu/drc/emit_ppc.c @@ -6,8 +6,6 @@ * See COPYING file in the top-level directory. */ -// WARNING: unfinished, neither thoroughly tested nor optimized. little endian only! - // NB bit numbers are reversed in PPC (MSB is bit 0). The emith_* functions and // macros must take this into account. @@ -34,7 +32,7 @@ // use CA and OV. // Moreover, there's no easy possibility to get CA and OV for 32 bit arithmetic // since all arithmetic/logical insns use 64 bit. -// For now, use the "no flags" code from the RISCV backend. +// For now, use the "no flags" code from the RISC-V backend. 
#define HOST_REGS 32 @@ -42,7 +40,7 @@ // reserved: r0(zero), r1(stack), r2(TOC), r13(TID) #define RET_REG 3 #define PARAM_REGS { 3, 4, 5, 6, 7, 8, 9, 10 } -#define PRESERVED_REGS { 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,31 } +#define PRESERVED_REGS { 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 } #define TEMPORARY_REGS { 11, 12 } #define CONTEXT_REG 31 @@ -50,27 +48,18 @@ // if RA is 0 in non-update memory insns, ADDI/ADDIS, ISEL, it aliases with zero #define Z0 0 // zero register -#define SP 1 // stack pointer +#define SP 1 // stack pointer // SPR registers -#define XER -1 // exception register -#define LR -8 // link register -#define CTR -9 // counter register +#define XER -1 // exception register +#define LR -8 // link register +#define CTR -9 // counter register // internally used by code emitter: -#define AT 0 // emitter temporary (can't be fully used anyway) +#define AT 0 // emitter temporary (can't be fully used anyway) #define FNZ 14 // emulated processor flags: N (bit 31) ,Z (all bits) #define FC 15 // emulated processor flags: C (bit 0), others 0 #define FV 16 // emulated processor flags: Nt^Ns (bit 31). 
others x -// PPC conditions, BO0-BO4:BI2-BI4 since we only need CR0 -#define PPC_LT 0x60 -#define PPC_GE 0x20 -#define PPC_GT 0x61 -#define PPC_LE 0x21 -#define PPC_EQ 0x62 -#define PPC_NE 0x22 -#define PPC_AL 0xa0 - // unified conditions; virtual, not corresponding to anything real on PPC #define DCOND_EQ 0x0 #define DCOND_NE 0x1 @@ -94,8 +83,8 @@ #define PPC_INSN(op, b10, b15, b20, b31) \ (((op)<<26)|((b10)<<21)|((b15)<<16)|((b20)<<11)|((b31)<<0)) -#define _ 0 // marker for "field unused" -#define __(n) o##n // enum marker for "undefined" +#define _ 0 // marker for "field unused" +#define __(n) o##n // enum marker for "undefined" #define _CB(v,l,s,d) ((((v)>>(s))&((1<<(l))-1))<<(d)) // copy l bits // NB everything privileged or unneeded at 1st sight is left out @@ -148,8 +137,16 @@ enum { OPS_STD, OPS_STDU /*,OPS_STQ*/ }; // AA and LK in I,B-forms branches #define BAA (1<<1) #define BLK (1<<0) - -#define PPC_NOP \ +// BO and BI condition codes in B-form, BO0-BO4:BI2-BI4 since we only need CR0 +#define BLT 0x60 +#define BGE 0x20 +#define BGT 0x61 +#define BLE 0x21 +#define BEQ 0x62 +#define BNE 0x22 +#define BXX 0xa0 // unconditional, aka always + +#define PPC_NOP \ PPC_INSN(OP_ORI, 0, 0, _, 0) // ori r0, r0, 0 // arithmetic/logical @@ -331,7 +328,7 @@ enum { OPS_STD, OPS_STDU /*,OPS_STQ*/ }; #define PPC_BL(offs26) \ PPC_OP_IMM(OP_B,_,_,((offs26)&~3)|BLK) #define PPC_RET() \ - PPC_OP_REG(OP__CR,OPC_BCLR,PPC_AL>>3,_,_) + PPC_OP_REG(OP__CR,OPC_BCLR,BXX>>3,_,_) #define PPC_RETCOND(cond) \ PPC_OP_REG(OP__CR,OPC_BCLR,(cond)>>3,(cond)&0x7,_) #define PPC_BCTRCOND(cond) \ @@ -411,6 +408,8 @@ enum { OPS_STD, OPS_STDU /*,OPS_STQ*/ }; #define PPC_STP_REG PPC_STX_REG #define PPC_BFXP_IMM PPC_BFX_IMM +#define emith_uext_ptr(r) EMIT(PPC_EXTUW_REG(r, r)) + // "long" multiplication, 32x32 bit = 64 bit #define EMIT_PPC_MULLU_REG(dlo, dhi, s1, s2) do { \ EMIT(PPC_EXTUW_REG(s1, s1)); \ @@ -442,6 +441,8 @@ enum { OPS_STD, OPS_STDU /*,OPS_STQ*/ }; #define PPC_STP_REG PPC_STW_REG 
#define PPC_BFXP_IMM PPC_BFXW_IMM +#define emith_uext_ptr(r) /**/ + // "long" multiplication, 32x32 bit = 64 bit #define EMIT_PPC_MULLU_REG(dlo, dhi, s1, s2) do { \ int at = (dlo == s1 || dlo == s2 ? AT : dlo); \ @@ -467,23 +468,7 @@ enum { OPS_STD, OPS_STDU /*,OPS_STQ*/ }; #endif #define PTR_SIZE (1<> 32); if (imm >> 32) EMIT(PPC_LSL_IMM(r, r, 32)); @@ -883,23 +867,11 @@ static void emith_move_imm(int r, int ptr, uintptr_t imm) EMIT(PPC_ORT_IMM(r, r, (imm & 0xffff0000) >> 16)); } else #endif - { - int s = Z0, d = 0, c = 0; - if ((u16)imm) { - EMIT(PPC_ADD_IMM(r, s, (u16)imm)); - s = r, d = 1, c = (s16)imm < 0; - } - // adjust for sign extension in ADDI - if (!d) // low part == 0 - EMIT(PPC_ADDT_IMM(r, s, (u16)(imm>>16))); - else if (c && (u16)(~imm>>16)) // low part < 0 - EMIT(PPC_XORT_IMM(r, s, (u16)(~imm>>16))); - else if (!c && (u16)(imm>>16)) // low part > 0 - EMIT(PPC_ORT_IMM(r, s, (u16)(imm>>16))); - // make sure to clear upper half if this is a ptr - if (ptr && !(imm >> 32) && c) - EMIT(PPC_EXTUW_REG(r, r)); - } + if ((s16)imm != (s32)imm) { + EMIT(PPC_ADDT_IMM(r, Z0, (u16)(imm>>16))); + if ((s16)imm) + EMIT(PPC_OR_IMM(r, r, (u16)(imm))); + } else EMIT(PPC_ADD_IMM(r, Z0, (u16)imm)); } #define emith_move_r_ptr_imm(r, imm) \ @@ -1176,9 +1148,6 @@ static void emith_add_imm(int rt, int ra, u32 imm) } \ } while (0) -#define emith_uext_ptr(r) \ - EMIT(PPC_EXTUW_REG(r, r)) - // multiply Rd = Rn*Rm (+ Ra) #define emith_mul(d, s1, s2) \ @@ -1248,13 +1217,17 @@ static void emith_add_imm(int rt, int ra, u32 imm) #define emith_read8s_r_r_r_c(cond, r, ra, rm) \ emith_read8s_r_r_r(r, ra, rm) -#define emith_read16s_r_r_offs(r, ra, offs) \ - EMIT(PPC_LDSH_IMM(r, ra, offs)) +#define emith_read16s_r_r_offs(r, ra, offs) do { \ + EMIT(PPC_LDH_IMM(r, ra, offs)); \ + EMIT(PPC_EXTSH_REG(r, r)); \ +} while (0) #define emith_read16s_r_r_offs_c(cond, r, ra, offs) \ emith_read16s_r_r_offs(r, ra, offs) -#define emith_read16s_r_r_r(r, ra, rm) \ - EMIT(PPC_LDSH_REG(r, ra, rm)) +#define 
emith_read16s_r_r_r(r, ra, rm) do { \ + EMIT(PPC_LDH_REG(r, ra, rm)); \ + EMIT(PPC_EXTSH_REG(r, r)); \ +} while (0) #define emith_read16s_r_r_r_c(cond, r, ra, rm) \ emith_read16s_r_r_r(r, ra, rm) @@ -1346,16 +1319,16 @@ static int emith_cmpr_check(int rs, int rt, int cond, u32 *op) // condition check for comparing 2 registers switch (cond) { - case DCOND_EQ: *op = PPC_CMPW_REG(rs, rt); b = PPC_EQ; break; - case DCOND_NE: *op = PPC_CMPW_REG(rs, rt); b = PPC_NE; break; - case DCOND_LO: *op = PPC_CMPLW_REG(rs, rt); b = PPC_LT; break; - case DCOND_HS: *op = PPC_CMPLW_REG(rs, rt); b = PPC_GE; break; - case DCOND_LS: *op = PPC_CMPLW_REG(rs, rt); b = PPC_LE; break; - case DCOND_HI: *op = PPC_CMPLW_REG(rs, rt); b = PPC_GT; break; - case DCOND_LT: *op = PPC_CMPW_REG(rs, rt); b = PPC_LT; break; - case DCOND_GE: *op = PPC_CMPW_REG(rs, rt); b = PPC_GE; break; - case DCOND_LE: *op = PPC_CMPW_REG(rs, rt); b = PPC_LE; break; - case DCOND_GT: *op = PPC_CMPW_REG(rs, rt); b = PPC_GT; break; + case DCOND_EQ: *op = PPC_CMPW_REG(rs, rt); b = BEQ; break; + case DCOND_NE: *op = PPC_CMPW_REG(rs, rt); b = BNE; break; + case DCOND_LO: *op = PPC_CMPLW_REG(rs, rt); b = BLT; break; + case DCOND_HS: *op = PPC_CMPLW_REG(rs, rt); b = BGE; break; + case DCOND_LS: *op = PPC_CMPLW_REG(rs, rt); b = BLE; break; + case DCOND_HI: *op = PPC_CMPLW_REG(rs, rt); b = BGT; break; + case DCOND_LT: *op = PPC_CMPW_REG(rs, rt); b = BLT; break; + case DCOND_GE: *op = PPC_CMPW_REG(rs, rt); b = BGE; break; + case DCOND_LE: *op = PPC_CMPW_REG(rs, rt); b = BLE; break; + case DCOND_GT: *op = PPC_CMPW_REG(rs, rt); b = BGT; break; } return b; @@ -1367,16 +1340,16 @@ static int emith_cmpi_check(int rs, s32 imm, int cond, u32 *op) // condition check for comparing register with immediate switch (cond) { - case DCOND_EQ: *op = PPC_CMPW_IMM(rs, (u16)imm), b = PPC_EQ; break; - case DCOND_NE: *op = PPC_CMPW_IMM(rs, (u16)imm), b = PPC_NE; break; - case DCOND_LO: *op = PPC_CMPLW_IMM(rs, (u16)imm), b = PPC_LT; break; - case 
DCOND_HS: *op = PPC_CMPLW_IMM(rs, (u16)imm), b = PPC_GE; break; - case DCOND_LS: *op = PPC_CMPLW_IMM(rs, (u16)imm), b = PPC_LE; break; - case DCOND_HI: *op = PPC_CMPLW_IMM(rs, (u16)imm), b = PPC_GT; break; - case DCOND_LT: *op = PPC_CMPW_IMM(rs, (u16)imm), b = PPC_LT; break; - case DCOND_GE: *op = PPC_CMPW_IMM(rs, (u16)imm), b = PPC_GE; break; - case DCOND_LE: *op = PPC_CMPW_IMM(rs, (u16)imm), b = PPC_LE; break; - case DCOND_GT: *op = PPC_CMPW_IMM(rs, (u16)imm), b = PPC_GT; break; + case DCOND_EQ: *op = PPC_CMPW_IMM(rs, (u16)imm), b = BEQ; break; + case DCOND_NE: *op = PPC_CMPW_IMM(rs, (u16)imm), b = BNE; break; + case DCOND_LO: *op = PPC_CMPLW_IMM(rs, (u16)imm), b = BLT; break; + case DCOND_HS: *op = PPC_CMPLW_IMM(rs, (u16)imm), b = BGE; break; + case DCOND_LS: *op = PPC_CMPLW_IMM(rs, (u16)imm), b = BLE; break; + case DCOND_HI: *op = PPC_CMPLW_IMM(rs, (u16)imm), b = BGT; break; + case DCOND_LT: *op = PPC_CMPW_IMM(rs, (u16)imm), b = BLT; break; + case DCOND_GE: *op = PPC_CMPW_IMM(rs, (u16)imm), b = BGE; break; + case DCOND_LE: *op = PPC_CMPW_IMM(rs, (u16)imm), b = BLE; break; + case DCOND_GT: *op = PPC_CMPW_IMM(rs, (u16)imm), b = BGT; break; } return b; @@ -1396,29 +1369,29 @@ static int emith_cond_check(int cond) // shortcut for V known to be 0 if (b < 0 && emith_flg_noV) switch (cond) { case DCOND_VS: /* no branch */ break; // never - case DCOND_VC: b = PPC_AL; break; // always - case DCOND_LT: op = PPC_CMPW_IMM(FNZ, 0); b = PPC_LT; break; // N - case DCOND_GE: op = PPC_CMPW_IMM(FNZ, 0); b = PPC_GE; break; // !N - case DCOND_LE: op = PPC_CMPW_IMM(FNZ, 0); b = PPC_LE; break; // N || Z - case DCOND_GT: op = PPC_CMPW_IMM(FNZ, 0); b = PPC_GT; break; // !N && !Z + case DCOND_VC: b = BXX; break; // always + case DCOND_LT: op = PPC_CMPW_IMM(FNZ, 0); b = BLT; break; // N + case DCOND_GE: op = PPC_CMPW_IMM(FNZ, 0); b = BGE; break; // !N + case DCOND_LE: op = PPC_CMPW_IMM(FNZ, 0); b = BLE; break; // N || Z + case DCOND_GT: op = PPC_CMPW_IMM(FNZ, 0); b = BGT; break; // !N 
&& !Z } // the full monty if no shortcut if (b < 0) switch (cond) { // conditions using NZ - case DCOND_EQ: op = PPC_CMPW_IMM(FNZ, 0); b = PPC_EQ; break; // Z - case DCOND_NE: op = PPC_CMPW_IMM(FNZ, 0); b = PPC_NE; break; // !Z - case DCOND_MI: op = PPC_CMPW_IMM(FNZ, 0); b = PPC_LT; break; // N - case DCOND_PL: op = PPC_CMPW_IMM(FNZ, 0); b = PPC_GE; break; // !N + case DCOND_EQ: op = PPC_CMPW_IMM(FNZ, 0); b = BEQ; break; // Z + case DCOND_NE: op = PPC_CMPW_IMM(FNZ, 0); b = BNE; break; // !Z + case DCOND_MI: op = PPC_CMPW_IMM(FNZ, 0); b = BLT; break; // N + case DCOND_PL: op = PPC_CMPW_IMM(FNZ, 0); b = BGE; break; // !N // conditions using C - case DCOND_LO: op = PPC_CMPW_IMM(FC , 0); b = PPC_NE; break; // C - case DCOND_HS: op = PPC_CMPW_IMM(FC , 0); b = PPC_EQ; break; // !C + case DCOND_LO: op = PPC_CMPW_IMM(FC , 0); b = BNE; break; // C + case DCOND_HS: op = PPC_CMPW_IMM(FC , 0); b = BEQ; break; // !C // conditions using CZ case DCOND_LS: // C || Z case DCOND_HI: // !C && !Z EMIT(PPC_ADD_IMM(AT, FC, -1)); // !C && !Z EMIT(PPC_AND_REG(AT, FNZ, AT)); - op = PPC_CMPW_IMM(AT , 0); b = (cond == DCOND_HI ? PPC_NE : PPC_EQ); + op = PPC_CMPW_IMM(AT , 0); b = (cond == DCOND_HI ? BNE : BEQ); break; // conditions using V @@ -1427,14 +1400,14 @@ static int emith_cond_check(int cond) EMIT(PPC_XOR_REG(AT, FV, FNZ)); // V = Nt^Ns^Nd^C EMIT(PPC_LSRW_IMM(AT, AT, 31)); EMIT(PPC_XOR_REG(AT, AT, FC)); - op = PPC_CMPW_IMM(AT , 0); b = (cond == DCOND_VS ? PPC_NE : PPC_EQ); + op = PPC_CMPW_IMM(AT , 0); b = (cond == DCOND_VS ? BNE : BEQ); break; // conditions using VNZ case DCOND_LT: // N^V case DCOND_GE: // !(N^V) EMIT(PPC_LSRW_IMM(AT, FV, 31)); // Nd^V = Nt^Ns^C EMIT(PPC_XOR_REG(AT, FC, AT)); - op = PPC_CMPW_IMM(AT , 0); b = (cond == DCOND_LT ? PPC_NE : PPC_EQ); + op = PPC_CMPW_IMM(AT , 0); b = (cond == DCOND_LT ? 
BNE : BEQ); break; case DCOND_LE: // (N^V) || Z case DCOND_GT: // !(N^V) && !Z @@ -1442,7 +1415,7 @@ static int emith_cond_check(int cond) EMIT(PPC_XOR_REG(AT, FC, AT)); EMIT(PPC_ADD_IMM(AT, AT, -1)); // !(Nd^V) && !Z EMIT(PPC_AND_REG(AT, FNZ, AT)); - op = PPC_CMPW_IMM(AT , 0); b = (cond == DCOND_GT ? PPC_NE : PPC_EQ); + op = PPC_CMPW_IMM(AT , 0); b = (cond == DCOND_GT ? BNE : BEQ); break; } @@ -1461,7 +1434,7 @@ static int emith_cond_check(int cond) #define emith_jump_cond(cond, target) do { \ int mcond_ = emith_cond_check(cond); \ u32 disp_ = (u8 *)target - (u8 *)tcache_ptr; \ - EMIT(PPC_BCOND(mcond_,disp_ & 0x0000ffff)); \ + if (mcond_ >= 0) EMIT(PPC_BCOND(mcond_,disp_ & 0x0000ffff)); \ } while (0) #define emith_jump_cond_patchable(cond, target) \ emith_jump_cond(cond, target) @@ -1495,7 +1468,7 @@ static int emith_cond_check(int cond) #define emith_jump_reg(r) do { \ EMIT(PPC_MTSP_REG(r, CTR)); \ - EMIT(PPC_BCTRCOND(PPC_AL)); \ + EMIT(PPC_BCTRCOND(BXX)); \ } while(0) #define emith_jump_reg_c(cond, r) \ emith_jump_reg(r) @@ -1516,7 +1489,7 @@ static int emith_cond_check(int cond) #define emith_call_reg(r) do { \ EMIT(PPC_MTSP_REG(r, CTR)); \ - EMIT(PPC_BLCTRCOND(PPC_AL)); \ + EMIT(PPC_BLCTRCOND(BXX)); \ } while(0) #define emith_call_ctx(offs) do { \ @@ -1564,13 +1537,13 @@ static int emith_cond_check(int cond) #define emith_pool_check() /**/ #define emith_pool_commit(j) /**/ #define emith_insn_ptr() ((u8 *)tcache_ptr) -#define emith_flush() /**/ +#define emith_flush() /**/ #define host_instructions_updated(base, end) __builtin___clear_cache(base, end) -#define emith_update_cache() /**/ +#define emith_update_cache() /**/ #define emith_rw_offs_max() 0x7fff // SH2 drc specific -#define STACK_EXTRA (64+48) // Param, ABI (LR,CR,FP etc) save areas +#define STACK_EXTRA ((8+6)*PTR_SIZE) // Param, ABI (LR,CR,FP etc) save areas #define emith_sh2_drc_entry() do { \ int _c, _z = PTR_SIZE; u32 _m = 0xffffc000; /* r14-r30 */ \ if (__builtin_parity(_m) == 1) _m |= 0x1; /* ABI 
align for SP is 16 */ \ @@ -1579,7 +1552,7 @@ static int emith_cond_check(int cond) if (_m & (1 << _c)) \ { _o -= _z; if (_c) emith_write_r_r_offs_ptr(_c, SP, _o); } \ EMIT(PPC_MFSP_REG(10, LR)); \ - emith_write_r_r_offs_ptr(10, SP, 16); \ + emith_write_r_r_offs_ptr(10, SP, 2*PTR_SIZE); \ emith_write_r_r_offs_ptr(SP, SP, -_s-STACK_EXTRA); /* XXX stdu */ \ emith_add_r_r_ptr_imm(SP, SP, -_s-STACK_EXTRA); \ } while (0) @@ -1591,7 +1564,7 @@ static int emith_cond_check(int cond) if (_m & (1 << _c)) \ { if (_c) emith_read_r_r_offs_ptr(_c, SP, _o); _o += _z; } \ emith_add_r_r_ptr_imm(SP, SP, _s+STACK_EXTRA); \ - emith_read_r_r_offs_ptr(10, SP, 16); \ + emith_read_r_r_offs_ptr(10, SP, 2*PTR_SIZE); \ EMIT(PPC_MTSP_REG(10, LR)); \ emith_ret(); \ } while (0) @@ -1672,11 +1645,13 @@ static int emith_cond_check(int cond) emith_tst_r_imm(sr, Q); /* if (Q ^ M) */ \ EMITH_JMP3_START(DCOND_EQ); \ emith_add_r_r_r(rn, t_, rm); \ - EMIT_PPC_SLTWU_REG(FC, rn, t_); \ + EMIT(PPC_CMPLW_REG(rn, t_)); \ EMITH_JMP3_MID(DCOND_EQ); \ emith_sub_r_r_r(rn, t_, rm); \ - EMIT_PPC_SLTWU_REG(FC, t_, rn); \ + EMIT(PPC_CMPLW_REG(t_, rn)); \ EMITH_JMP3_END(); \ + EMIT(PPC_MFCR_REG(FC)); \ + EMIT(PPC_BFXW_IMM(FC, FC, 0, 1)); \ emith_eor_r_r(sr, FC); /* T ^= carry */ \ rcache_free_tmp(t_); \ } while (0) @@ -1737,7 +1712,7 @@ static int emith_cond_check(int cond) EMIT(PPC_BFIW_IMM(sr, srcr, 22, 10)) #define emith_carry_to_t(sr, is_sub) \ - EMIT(PPC_BFIW_IMM(sr, FC, 31, 1)) + EMIT(PPC_BFIW_IMM(sr, FC, 32-__builtin_ffs(T), 1)) #define emith_t_to_carry(sr, is_sub) \ emith_and_r_r_imm(FC, sr, 1) @@ -1755,26 +1730,27 @@ static int emith_cond_check(int cond) ((cond) ^ 1) // T bit handling -static void emith_clr_t_cond(int sr) -{ - emith_bic_r_imm(sr, T); -} - static void emith_set_t_cond(int sr, int cond) { int b; - u8 *ptr; - u32 val = 0; - - // XXX optimization - b = emith_invert_branch(emith_cond_check(cond)); - ptr = tcache_ptr; - EMIT(PPC_BCOND(b, 0)); - emith_or_r_imm(sr, T); - val = (u8 *)tcache_ptr - 
(u8 *)(ptr); - EMIT_PTR(ptr, PPC_BCOND(b, val & 0x00001fff)); + + // catch never and always cases + if ((b = emith_cond_check(cond)) < 0) + return; + else if (b == BXX) { + emith_or_r_imm(sr, T); + return; + } + + // extract bit from CR and insert into T + EMIT(PPC_MFCR_REG(AT)); + EMIT(PPC_BFXW_IMM(AT, AT, (b&7), 1)); + if (!(b & 0x40)) EMIT(PPC_XOR_IMM(AT, AT, 1)); + EMIT(PPC_BFIW_IMM(sr, AT, 32-__builtin_ffs(T), 1)); } +#define emith_clr_t_cond(sr) ((void)sr) + #define emith_get_t_cond() -1 #define emith_sync_t(sr) ((void)sr) diff --git a/cpu/drc/emit_riscv.c b/cpu/drc/emit_riscv.c index 69ed530ea..954d14a77 100644 --- a/cpu/drc/emit_riscv.c +++ b/cpu/drc/emit_riscv.c @@ -1400,6 +1400,7 @@ static int emith_cond_check(int cond, int *r, int *s) #define host_instructions_updated(base, end) __builtin___clear_cache(base, end) #define emith_update_cache() /**/ #define emith_rw_offs_max() 0x7ff +#define emith_uext_ptr(r) /**/ // SH2 drc specific #define emith_sh2_drc_entry() do { \ diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 80ec04445..c836e1595 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -622,6 +622,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common emith_asr(d, d, 32 - (bits)); \ } while (0) +#define emith_uext_ptr(r) /**/ + #define emith_setc(r) do { \ assert(is_abcdx(r)); \ EMIT_REX_IF(0, 0, r); \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index bfd9ec061..d1a971c64 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -189,7 +189,7 @@ static char sh2dasm_buff[64]; (sh2)->r[8], (sh2)->r[9], (sh2)->r[10], (sh2)->r[11], \ (sh2)->r[12], (sh2)->r[13], (sh2)->r[14], (sh2)->r[15]); \ printf("%csh2 pc-ml %08x %08x %08x %08x %08x %08x %08x %08x\n", ms, \ - (sh2)->pc, (sh2)->ppc, (sh2)->pr, (sh2)->sr&0x3ff, \ + (sh2)->pc, (sh2)->ppc, (sh2)->pr, (sh2)->sr&0xfff, \ (sh2)->gbr, (sh2)->vbr, (sh2)->mach, (sh2)->macl); \ printf("%csh2 tmp-p %08x %08x %08x %08x %08x %08x %08x %08x\n", ms, \ (sh2)->drc_tmp, 
(sh2)->irq_cycles, \ @@ -246,6 +246,10 @@ static void REGPARM(3) *sh2_drc_log_entry(void *block, SH2 *sh2, u32 sr) SH2_DUMP(&fsh2, "file"); SH2_DUMP(sh2, "current"); SH2_DUMP(&csh2[idx][0], "previous"); + char *ps = (char *)sh2, *pf = (char *)&fsh2; + for (idx = 0; idx < offsetof(SH2, read8_map); idx += sizeof(u32)) + if (*(u32 *)(ps+idx) != *(u32 *)(pf+idx)) + printf("diff reg %ld\n",idx/sizeof(u32)); exit(1); } csh2[idx][0] = fsh2; @@ -455,6 +459,8 @@ static void rcache_free_tmp(int hr); #include "../drc/emit_mips.c" #elif defined(__riscv__) || defined(__riscv) #include "../drc/emit_riscv.c" +#elif defined(__powerpc__) +#include "../drc/emit_ppc.c" #elif defined(__i386__) #include "../drc/emit_x86.c" #elif defined(__x86_64__) @@ -2572,6 +2578,7 @@ static int emit_get_rbase_and_offs(SH2 *sh2, sh2_reg_e r, int rmode, u32 *offs) u32 odd = a & 1; // need to fix odd address for correct byte addressing la -= (s32)((a & ~mask) - *offs - odd); // diff between reg and memory hr = hr2 = rcache_get_reg(r, rmode, NULL); + if ((s32)a < 0) emith_uext_ptr(hr2); if ((la & ~omask) - odd) { hr = rcache_get_tmp(); emith_add_r_r_ptr_imm(hr, hr2, (la & ~omask) - odd); diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 00a8707b9..b1b7487fb 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -48,6 +48,9 @@ unsigned short scan_block(uint32_t base_pc, int is_slave, #elif defined(__riscv__) || defined(__riscv) #define DRC_SR_REG "s11" #define DRC_REG_LL 0 // no ABI for (__ILP32__ && __riscv_xlen != 32) +#elif defined(__powerpc__) +#define DRC_SR_REG "r30" +#define DRC_REG_LL 0 // no ABI for __ILP32__ #elif defined(__i386__) #define DRC_SR_REG "edi" #define DRC_REG_LL 0 // 32 bit diff --git a/platform/linux/emu.c b/platform/linux/emu.c index 5e4dd72a2..005f82a36 100644 --- a/platform/linux/emu.c +++ b/platform/linux/emu.c @@ -29,7 +29,7 @@ void pemu_prep_defconfig(void) void pemu_validate_config(void) { -#if !defined(__arm__) && !defined(__aarch64__) && !defined(__mips__) 
&& !defined(__riscv__) && !defined(__riscv) && !defined(__i386__) && !defined(__x86_64__) +#if !defined(__arm__) && !defined(__aarch64__) && !defined(__mips__) && !defined(__riscv__) && !defined(__riscv) && !defined(__powerpc__) && !defined(__i386__) && !defined(__x86_64__) PicoIn.opt &= ~POPT_EN_DRC; #endif } From 39036644ea17dd2fa8aceabe9cba7e902532a9c4 Mon Sep 17 00:00:00 2001 From: kub Date: Sun, 21 Jun 2020 22:32:37 +0200 Subject: [PATCH 162/174] vdp fifo, DMA bugfix --- pico/videoport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pico/videoport.c b/pico/videoport.c index f9fd6ece4..f324f704f 100644 --- a/pico/videoport.c +++ b/pico/videoport.c @@ -164,7 +164,7 @@ static int PicoVideoFIFODrain(int level, int cycles, int bgdma) //int osl = fifo_slot; // process FIFO entries until low level is reached - while (vf->fifo_slot < vf->fifo_maxslot && cycles < 488 && + while (vf->fifo_slot <= vf->fifo_maxslot && cycles < 488 && ((vf->fifo_total > level) | (vf->fifo_queue[vf->fifo_qx] & bgdma))) { int b = vf->fifo_queue[vf->fifo_qx] & FQ_BYTE; int cnt = bgdma ? 
pv->fifo_cnt : ((vf->fifo_total-level)<fifo_cnt&b); From 241743af052dd97b3e1d0f736073e3a820ad18fb Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 23 Jun 2020 23:34:07 +0200 Subject: [PATCH 163/174] sh2 drc, backend 32/64 bit compatibility fixes for Mips/RiscV --- cpu/drc/emit_mips.c | 27 +++++++++++++++------------ cpu/drc/emit_ppc.c | 2 +- cpu/drc/emit_riscv.c | 33 ++++++++++++++++++--------------- 3 files changed, 34 insertions(+), 28 deletions(-) diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index 8eddd2196..1c98ac043 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -286,6 +286,7 @@ enum { RB_SRL=0, RB_ROTR=1 }; #define FN_PSUBU FN_SUBU #define PTR_SCALE 2 #endif +#define PTR_SIZE (1< 0) emith_write_r_r_offs(r, SP, 0+16); \ + int offs_ = 8+16 - 2*PTR_SIZE; \ + emith_add_r_r_ptr_imm(SP, SP, -8-16); \ + emith_write_r_r_offs_ptr(LR, SP, offs_ + PTR_SIZE); \ + if ((r) > 0) emith_write_r_r_offs(r, SP, offs_); \ } while (0) #define emith_pop_and_ret(r) do { \ - if ((r) > 0) emith_read_r_r_offs(r, SP, 0+16); \ - emith_read_r_r_offs(LR, SP, 4+16); \ + int offs_ = 8+16 - 2*PTR_SIZE; \ + if ((r) > 0) emith_read_r_r_offs(r, SP, offs_); \ + emith_read_r_r_offs_ptr(LR, SP, offs_ + PTR_SIZE); \ emith_add_r_r_ptr_imm(SP, SP, 8+16); \ emith_ret(); \ } while (0) @@ -1567,21 +1570,21 @@ static int emith_cond_check(int cond, int *r) // SH2 drc specific #define emith_sh2_drc_entry() do { \ - int _c; u32 _m = 0xd0ff0000; \ + int _c, _z = PTR_SIZE; u32 _m = 0xd0ff0000; \ if (__builtin_parity(_m) == 1) _m |= 0x1; /* ABI align for SP is 8 */ \ - int _s = count_bits(_m) * 4 + 16, _o = _s; /* 16 byte arg save area */ \ + int _s = count_bits(_m) * _z + 16, _o = _s; /* 16 O32 arg save area */ \ if (_s) emith_add_r_r_ptr_imm(SP, SP, -_s); \ for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ if (_m & (1 << _c)) \ - { _o -= 4; if (_c) emith_write_r_r_offs(_c, SP, _o); } \ + { _o -= _z; if (_c) emith_write_r_r_offs_ptr(_c, SP, _o); } \ } while (0) #define 
emith_sh2_drc_exit() do { \ - int _c; u32 _m = 0xd0ff0000; \ + int _c, _z = PTR_SIZE; u32 _m = 0xd0ff0000; \ if (__builtin_parity(_m) == 1) _m |= 0x1; \ - int _s = count_bits(_m) * 4 + 16, _o = 16; \ + int _s = count_bits(_m) * _z + 16, _o = 16; \ for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ if (_m & (1 << _c)) \ - { if (_c) emith_read_r_r_offs(_c, SP, _o); _o += 4; } \ + { if (_c) emith_read_r_r_offs_ptr(_c, SP, _o); _o += _z; } \ if (_s) emith_add_r_r_ptr_imm(SP, SP, _s); \ emith_ret(); \ } while (0) diff --git a/cpu/drc/emit_ppc.c b/cpu/drc/emit_ppc.c index 286d41668..54050bad2 100644 --- a/cpu/drc/emit_ppc.c +++ b/cpu/drc/emit_ppc.c @@ -1545,7 +1545,7 @@ static int emith_cond_check(int cond) // SH2 drc specific #define STACK_EXTRA ((8+6)*PTR_SIZE) // Param, ABI (LR,CR,FP etc) save areas #define emith_sh2_drc_entry() do { \ - int _c, _z = PTR_SIZE; u32 _m = 0xffffc000; /* r14-r30 */ \ + int _c, _z = PTR_SIZE; u32 _m = 0xffffc000; /* r14-r31 */ \ if (__builtin_parity(_m) == 1) _m |= 0x1; /* ABI align for SP is 16 */ \ int _s = count_bits(_m) * _z, _o = 0; \ for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ diff --git a/cpu/drc/emit_riscv.c b/cpu/drc/emit_riscv.c index 954d14a77..ab6c4fd00 100644 --- a/cpu/drc/emit_riscv.c +++ b/cpu/drc/emit_riscv.c @@ -244,6 +244,8 @@ enum { F2_ALT=0x20, F2_MULDIV=0x01 }; } while (0) #endif +#define PTR_SIZE (1<= 0; _m &= ~(1 << _c), _c--) \ if (_m & (1 << _c)) \ - { _o -= 4; if (_c) emith_write_r_r_offs(_c, SP, _o); } \ + { _o -= _z; if (_c) emith_write_r_r_offs_ptr(_c, SP, _o); } \ } while (0) #define emith_restore_caller_regs(mask) do { \ - int _c; u32 _m = mask & 0x3fce0; \ + int _c, _z = PTR_SIZE; u32 _m = mask & 0x3fce0; \ _c = count_bits(_m)&3; _m |= (1<<((4-_c)&3))-1; /* ABI align */ \ - int _s = count_bits(_m) * 4, _o = 0; \ + int _s = count_bits(_m) * _z, _o = 0; \ for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ if (_m & (1 << _c)) \ - { if (_c) emith_read_r_r_offs(_c, SP, _o); 
_o += 4; } \ + { if (_c) emith_read_r_r_offs_ptr(_c, SP, _o); _o += _z; } \ if (_s) emith_add_r_r_ptr_imm(SP, SP, _s); \ } while (0) @@ -1312,6 +1314,7 @@ static int emith_cond_check(int cond, int *r, int *s) // NB: returns position of patch for cache maintenance #define emith_jump_patch(ptr, target, pos) do { \ u32 *ptr_ = (u32 *)ptr; /* must skip condition check code */ \ + while ((*ptr_&0x77) != OP_BCOND && (*ptr_&0x77) != OP_LUI) ptr_ ++; \ if ((*ptr_&0x77) == OP_BCOND) { \ u32 *p_ = ptr_, disp_ = (u8 *)target - (u8 *)ptr_; \ u32 f1_ = _CB(*ptr_,3,12,0); \ @@ -1382,13 +1385,13 @@ static int emith_cond_check(int cond, int *r, int *s) #define emith_push_ret(r) do { \ emith_add_r_r_ptr_imm(SP, SP, -16); /* ABI requires 16 byte aligment */\ - emith_write_r_r_offs(LR, SP, 4); \ + emith_write_r_r_offs_ptr(LR, SP, 8); \ if ((r) > 0) emith_write_r_r_offs(r, SP, 0); \ } while (0) #define emith_pop_and_ret(r) do { \ if ((r) > 0) emith_read_r_r_offs(r, SP, 0); \ - emith_read_r_r_offs(LR, SP, 4); \ + emith_read_r_r_offs_ptr(LR, SP, 8); \ emith_add_r_r_ptr_imm(SP, SP, 16); \ emith_ret(); \ } while (0) @@ -1404,21 +1407,21 @@ static int emith_cond_check(int cond, int *r, int *s) // SH2 drc specific #define emith_sh2_drc_entry() do { \ - int _c; u32 _m = 0x0ffc0202; /* x1,x9,x18-x27 */ \ + int _c, _z = PTR_SIZE; u32 _m = 0x0ffc0202; /* x1,x9,x18-x27 */ \ _c = count_bits(_m)&3; _m |= (1<<((4-_c)&3))-1; /* ABI align */ \ - int _s = count_bits(_m) * 4, _o = _s; \ + int _s = count_bits(_m) * _z, _o = _s; \ if (_s) emith_add_r_r_ptr_imm(SP, SP, -_s); \ for (_c = HOST_REGS-1; _m && _c >= 0; _m &= ~(1 << _c), _c--) \ if (_m & (1 << _c)) \ - { _o -= 4; if (_c) emith_write_r_r_offs(_c, SP, _o); } \ + { _o -= _z; if (_c) emith_write_r_r_offs_ptr(_c, SP, _o); } \ } while (0) #define emith_sh2_drc_exit() do { \ - int _c; u32 _m = 0x0ffc0202; \ + int _c, _z = PTR_SIZE; u32 _m = 0x0ffc0202; \ _c = count_bits(_m)&3; _m |= (1<<((4-_c)&3))-1; /* ABI align */ \ - int _s = count_bits(_m) * 4, 
_o = 0; \ + int _s = count_bits(_m) * _z, _o = 0; \ for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \ if (_m & (1 << _c)) \ - { if (_c) emith_read_r_r_offs(_c, SP, _o); _o += 4; } \ + { if (_c) emith_read_r_r_offs_ptr(_c, SP, _o); _o += _z; } \ if (_s) emith_add_r_r_ptr_imm(SP, SP, _s); \ emith_ret(); \ } while (0) @@ -1428,7 +1431,7 @@ static int emith_cond_check(int cond, int *r, int *s) emith_lsr(mask, a, SH2_READ_SHIFT); \ emith_add_r_r_r_lsl_ptr(tab, tab, mask, PTR_SCALE+1); \ emith_read_r_r_offs_ptr(func, tab, 0); \ - emith_read_r_r_offs(mask, tab, 1 << PTR_SCALE); \ + emith_read_r_r_offs(mask, tab, PTR_SIZE); \ emith_addf_r_r_r_ptr(func, func, func); \ } while (0) From a09f88adce40d9f90086de79df232cc4727b2ca0 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 23 Jun 2020 23:36:38 +0200 Subject: [PATCH 164/174] SDL UI, fix for CD LED display --- platform/common/plat_sdl.c | 2 +- platform/linux/emu.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/platform/common/plat_sdl.c b/platform/common/plat_sdl.c index bce4b0841..bb1ce6121 100644 --- a/platform/common/plat_sdl.c +++ b/platform/common/plat_sdl.c @@ -270,7 +270,7 @@ void plat_init(void) if (shadow_size < 320 * 480 * 2) shadow_size = 320 * 480 * 2; - shadow_fb = malloc(shadow_size); + shadow_fb = calloc(1, shadow_size); g_menubg_ptr = calloc(1, shadow_size); if (shadow_fb == NULL || g_menubg_ptr == NULL) { fprintf(stderr, "OOM\n"); diff --git a/platform/linux/emu.c b/platform/linux/emu.c index 005f82a36..597c13086 100644 --- a/platform/linux/emu.c +++ b/platform/linux/emu.c @@ -39,10 +39,11 @@ static void draw_cd_leds(void) int led_reg, pitch, scr_offs, led_offs; led_reg = Pico_mcd->s68k_regs[0]; - pitch = 320; + pitch = g_screen_ppitch; led_offs = 4; scr_offs = pitch * 2 + 4; +#if 0 if (currentConfig.renderer != RT_16BIT) { #define p(x) px[(x) >> 2] // 8-bit modes @@ -52,7 +53,9 @@ static void draw_cd_leds(void) p(pitch*0) = p(pitch*1) = p(pitch*2) = col_g; p(pitch*0 + 
led_offs) = p(pitch*1 + led_offs) = p(pitch*2 + led_offs) = col_r; #undef p - } else { + } else +#endif + { #define p(x) px[(x)*2 >> 2] = px[((x)*2 >> 2) + 1] // 16-bit modes unsigned int *px = (unsigned int *)((short *)g_screen_ptr + scr_offs); From 817c9eb8f170319715c20c6c704c316694bd0093 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 23 Jun 2020 23:43:53 +0200 Subject: [PATCH 165/174] sh2 drc, optimisation for SH2 16x16 multiplication --- cpu/sh2/compiler.c | 117 +++++++++++++++++++++++++++++++-------------- 1 file changed, 82 insertions(+), 35 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index d1a971c64..cba97e2b5 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -1,7 +1,7 @@ /* * SH2 recompiler * (C) notaz, 2009,2010,2013 - * (C) kub, 2018,2019 + * (C) kub, 2018,2019,2020 * * This work is licensed under the terms of MAME license. * See COPYING file in the top-level directory. @@ -398,12 +398,13 @@ int rchit, rcmiss; enum cache_reg_htype { HRT_TEMP = 1, // is for temps and args HRT_REG = 2, // is for sh2 regs - HRT_STATIC = 2, // is for static mappings (same as HRT_REG) }; enum cache_reg_flags { HRF_DIRTY = 1 << 0, // has "dirty" value to be written to ctx HRF_PINNED = 1 << 1, // has a pinned mapping + HRF_S16 = 1 << 2, // has a sign extended 16 bit value + HRF_U16 = 1 << 3, // has a zero extended 16 bit value }; enum cache_reg_type { @@ -413,9 +414,9 @@ enum cache_reg_type { }; typedef struct { - u8 hreg; // "host" reg + u8 hreg:6; // "host" reg u8 htype:2; // TEMP or REG? - u8 flags:2; // DIRTY, PINNED? + u8 flags:4; // DIRTY, PINNED? u8 type:2; // CACHED or TEMP? 
u8 locked:2; // LOCKED reference counter u16 stamp; // kind of a timestamp @@ -1334,6 +1335,37 @@ static void rcache_remove_vreg_alias(int x, sh2_reg_e r); static void rcache_evict_vreg(int x); static void rcache_remap_vreg(int x); +static void rcache_set_x16(int hr, int s16_, int u16_) +{ + int x = reg_map_host[hr]; + if (x >= 0) { + cache_regs[x].flags &= ~(HRF_S16|HRF_U16); + if (s16_) cache_regs[x].flags |= HRF_S16; + if (u16_) cache_regs[x].flags |= HRF_U16; + } +} + +static void rcache_copy_x16(int hr, int hr2) +{ + int x = reg_map_host[hr], y = reg_map_host[hr2]; + if (x >= 0 && y >= 0) { + cache_regs[x].flags = (cache_regs[x].flags & ~(HRF_S16|HRF_U16)) | + (cache_regs[y].flags & (HRF_S16|HRF_U16)); + } +} + +static int rcache_is_s16(int hr) +{ + int x = reg_map_host[hr]; + return (x >= 0 ? cache_regs[x].flags & HRF_S16 : 0); +} + +static int rcache_is_u16(int hr) +{ + int x = reg_map_host[hr]; + return (x >= 0 ? cache_regs[x].flags & HRF_U16 : 0); +} + #define RCACHE_DUMP(msg) { \ cache_reg_t *cp; \ guest_reg_t *gp; \ @@ -1467,10 +1499,13 @@ static int gconst_check(sh2_reg_e r) static int gconst_try_read(int vreg, sh2_reg_e r) { int i, x; + u32 v; if (guest_regs[r].flags & GRF_CDIRTY) { x = guest_regs[r].cnst; - emith_move_r_imm(cache_regs[vreg].hreg, gconsts[x].val); + v = gconsts[x].val; + emith_move_r_imm(cache_regs[vreg].hreg, v); + rcache_set_x16(cache_regs[vreg].hreg, v == (s16)v, v == (u16)v); FOR_ALL_BITS_SET_DO(gconsts[x].gregs, i, { if (guest_regs[i].vreg >= 0 && guest_regs[i].vreg != vreg) @@ -1641,6 +1676,8 @@ static void rcache_clean_vreg(int x) rcache_evict_vreg(guest_regs[r].sreg); emith_move_r_r(cache_regs[guest_regs[r].sreg].hreg, cache_regs[guest_regs[r].vreg].hreg); + rcache_copy_x16(cache_regs[guest_regs[r].sreg].hreg, + cache_regs[guest_regs[r].vreg].hreg); rcache_remove_vreg_alias(x, r); rcache_add_vreg_alias(guest_regs[r].sreg, r); cache_regs[guest_regs[r].sreg].flags |= HRF_DIRTY; @@ -1783,9 +1820,9 @@ static int 
rcache_allocate_temp(void) #if REMAP_REGISTER // maps a host register to a REG -static int rcache_map_reg(sh2_reg_e r, int hr, int mode) +static int rcache_map_reg(sh2_reg_e r, int hr) { - int x, i; + int i; gconst_kill(r); @@ -1797,19 +1834,6 @@ static int rcache_map_reg(sh2_reg_e r, int hr, int mode) exit(1); } - // deal with statically mapped regs - if (mode == RC_GR_RMW && (guest_regs[r].flags & (GRF_STATIC|GRF_PINNED))) { - x = guest_regs[r].sreg; - if (guest_regs[r].vreg == x) { - // STATIC in its sreg with no aliases, and some processing pending - if (cache_regs[x].gregs == 1 << r) - return cache_regs[x].hreg; - } else if (cache_regs[x].type == HR_FREE || - (cache_regs[x].type == HR_TEMP && !cache_regs[x].locked)) - // STATIC not in its sreg, with sreg available -> move it - i = guest_regs[r].sreg; - } - // remove old mappings of r and i if one exists if (guest_regs[r].vreg >= 0) rcache_remove_vreg_alias(guest_regs[r].vreg, r); @@ -1818,7 +1842,6 @@ static int rcache_map_reg(sh2_reg_e r, int hr, int mode) // set new mappping cache_regs[i].type = HR_CACHED; cache_regs[i].gregs = 1 << r; - cache_regs[i].flags &= HRF_PINNED; cache_regs[i].locked = 0; cache_regs[i].stamp = ++rcache_counter; cache_regs[i].flags |= HRF_DIRTY; @@ -2010,7 +2033,9 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr tr->flags |= HRF_DIRTY; guest_regs[r].flags |= GRF_DIRTY; gconst_kill(r); - } + rcache_set_x16(tr->hreg, 0, 0); + } else if (src >= 0 && cache_regs[src].hreg != tr->hreg) + rcache_copy_x16(tr->hreg, cache_regs[src].hreg); #if DRC_DEBUG & 64 RCACHE_CHECK("after getreg"); #endif @@ -2410,6 +2435,8 @@ static void rcache_clean(void) else { emith_move_r_r(cache_regs[guest_regs[i].sreg].hreg, cache_regs[guest_regs[i].vreg].hreg); + rcache_copy_x16(cache_regs[guest_regs[i].sreg].hreg, + cache_regs[guest_regs[i].vreg].hreg); rcache_remove_vreg_alias(guest_regs[i].vreg, i); } cache_regs[guest_regs[i].sreg].gregs = 1 << i; @@ -2689,6 +2716,8 @@ static 
void emit_sync_t_to_sr(void) // rd = @(arg0) static int emit_memhandler_read(int size) { + int hr; + emit_sync_t_to_sr(); rcache_clean_tmp(); #ifndef DRC_SR_REG @@ -2711,7 +2740,9 @@ static int emit_memhandler_read(int size) case 2: emith_call(sh2_drc_read32); break; // 32 } - return rcache_get_tmp_ret(); + hr = rcache_get_tmp_ret(); + rcache_set_x16(hr, (size & MF_SIZEMASK) < 2, 0); + return hr; } // @(arg0) = arg1 @@ -2747,6 +2778,7 @@ static int emit_memhandler_read_rr(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rs, u32 off emit_move_r_imm32(rd, val); hr2 = rcache_get_reg(rd, RC_GR_RMW, NULL); } + rcache_set_x16(hr2, val == (s16)val, val == (u16)val); if (size & MF_POSTINCR) emit_add_r_imm(rs, 1 << (size & MF_SIZEMASK)); return hr2; @@ -2790,12 +2822,11 @@ static int emit_memhandler_read_rr(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rs, u32 off } hr = emit_memhandler_read(size); - size &= MF_SIZEMASK; if (rd == SHR_TMP) hr2 = hr; else #if REMAP_REGISTER - hr2 = rcache_map_reg(rd, hr, RC_GR_WRITE); + hr2 = rcache_map_reg(rd, hr); #else hr2 = rcache_get_reg(rd, RC_GR_WRITE, NULL); #endif @@ -2865,12 +2896,11 @@ static int emit_indirect_indexed_read(SH2 *sh2, sh2_reg_e rd, sh2_reg_e rx, sh2_ emith_add_r_r_r(hr, tx, ty); hr = emit_memhandler_read(size); - size &= MF_SIZEMASK; if (rd == SHR_TMP) hr2 = hr; else #if REMAP_REGISTER - hr2 = rcache_map_reg(rd, hr, RC_GR_WRITE); + hr2 = rcache_map_reg(rd, hr); #else hr2 = rcache_get_reg(rd, RC_GR_WRITE, NULL); #endif @@ -3644,7 +3674,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } tmp2 = emit_memhandler_read(opd->size); #if REMAP_REGISTER - tmp3 = rcache_map_reg(GET_Rn(), tmp2, RC_GR_WRITE); + tmp3 = rcache_map_reg(GET_Rn(), tmp2); #else tmp3 = rcache_get_reg(GET_Rn(), RC_GR_WRITE, NULL); #endif @@ -3886,16 +3916,29 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) tmp2 = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); tmp3 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); tmp = rcache_get_reg(SHR_MACL, 
RC_GR_WRITE, NULL); - tmp4 = rcache_get_tmp(); + tmp4 = tmp3; if (op & 1) { - emith_sext(tmp, tmp2, 16); - emith_sext(tmp4, tmp3, 16); + if (! rcache_is_s16(tmp2)) { + emith_sext(tmp, tmp2, 16); + tmp2 = tmp; + } + if (! rcache_is_s16(tmp3)) { + tmp4 = rcache_get_tmp(); + emith_sext(tmp4, tmp3, 16); + } } else { - emith_clear_msb(tmp, tmp2, 16); - emith_clear_msb(tmp4, tmp3, 16); + if (! rcache_is_u16(tmp2)) { + emith_clear_msb(tmp, tmp2, 16); + tmp2 = tmp; + } + if (! rcache_is_u16(tmp3)) { + tmp4 = rcache_get_tmp(); + emith_clear_msb(tmp4, tmp3, 16); + } } - emith_mul(tmp, tmp, tmp4); - rcache_free_tmp(tmp4); + emith_mul(tmp, tmp2, tmp4); + if (tmp4 != tmp3) + rcache_free_tmp(tmp4); goto end_op; } goto default_; @@ -4415,15 +4458,19 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) break; case 0x0c: // EXTU.B Rm,Rn 0110nnnnmmmm1100 emith_clear_msb(tmp2, tmp, 24); + rcache_set_x16(tmp2, 1, 1); break; case 0x0d: // EXTU.W Rm,Rn 0110nnnnmmmm1101 emith_clear_msb(tmp2, tmp, 16); + rcache_set_x16(tmp2, 0, 1); break; case 0x0e: // EXTS.B Rm,Rn 0110nnnnmmmm1110 emith_sext(tmp2, tmp, 8); + rcache_set_x16(tmp2, 1, 0); break; case 0x0f: // EXTS.W Rm,Rn 0110nnnnmmmm1111 emith_sext(tmp2, tmp, 16); + rcache_set_x16(tmp2, 1, 0); break; } goto end_op; From 1fef85f06eae654db0db5bc5f80d11f08966efcf Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 25 Jun 2020 16:46:35 +0200 Subject: [PATCH 166/174] SDL UI, preparation for 2x mode, for improved color resolution --- platform/common/plat_sdl.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/platform/common/plat_sdl.c b/platform/common/plat_sdl.c index bb1ce6121..276a0c619 100644 --- a/platform/common/plat_sdl.c +++ b/platform/common/plat_sdl.c @@ -136,9 +136,19 @@ void bgr_to_uyvy_init(void) void rgb565_to_uyvy(void *d, const void *s, int pixels) { - unsigned int *dst = d; - const unsigned short *src = s; + uint32_t *dst = d; + const uint16_t *src = s; + if (plat_sdl_overlay->w > 
2*plat_sdl_overlay->h) + for (; pixels > 0; src += 4, dst += 4, pixels -= 4) + { + struct uyvy *uyvy0 = yuv_uyvy + src[0], *uyvy1 = yuv_uyvy + src[1]; + struct uyvy *uyvy2 = yuv_uyvy + src[2], *uyvy3 = yuv_uyvy + src[3]; + dst[0] = (uyvy0->y << 24) | uyvy0->vyu; + dst[1] = (uyvy1->y << 24) | uyvy1->vyu; + dst[2] = (uyvy2->y << 24) | uyvy2->vyu; + dst[3] = (uyvy3->y << 24) | uyvy3->vyu; + } else for (; pixels > 0; src += 4, dst += 2, pixels -= 4) { struct uyvy *uyvy0 = yuv_uyvy + src[0], *uyvy1 = yuv_uyvy + src[1]; From fbe827950373c3ab4fb76a6460fea5d258ae12f8 Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 25 Jun 2020 16:49:17 +0200 Subject: [PATCH 167/174] sh2 drc, fix for SH2 T handling in Mips/RiscV --- cpu/drc/emit_mips.c | 2 +- cpu/drc/emit_riscv.c | 25 +++++++++---------------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/cpu/drc/emit_mips.c b/cpu/drc/emit_mips.c index 1c98ac043..fb7de3661 100644 --- a/cpu/drc/emit_mips.c +++ b/cpu/drc/emit_mips.c @@ -1801,7 +1801,7 @@ static void emith_set_t_cond(int sr, int cond) EMIT(MIPS_SLTU_IMM(AT,AT, 1)); r=AT; val++; break; } else if ((b>>5) == OP_BNE) { EMIT(MIPS_XOR_REG(AT, r, b&0x1f)); - EMIT(MIPS_SLTU_IMM(AT,Z0,AT)); r=AT; val++; break; + EMIT(MIPS_SLTU_REG(AT,Z0,AT)); r=AT; val++; break; } } if (val) { diff --git a/cpu/drc/emit_riscv.c b/cpu/drc/emit_riscv.c index ab6c4fd00..de99d4fd0 100644 --- a/cpu/drc/emit_riscv.c +++ b/cpu/drc/emit_riscv.c @@ -1601,28 +1601,21 @@ static void emith_set_t_cond(int sr, int cond) u32 val = 0, inv = 0; // try to avoid jumping around if possible - if (emith_cmp_rs >= 0) { - if (emith_cmp_rt >= 0) - b = emith_cmpr_check(emith_cmp_rs, emith_cmp_rt, cond, &r, &s); - else - b = emith_cmpi_check(emith_cmp_rs, emith_cmp_imm, cond, &r, &s); - } else { - b = emith_cond_check(cond, &r, &s); - if (r == Z0) { - if (b == F1_BEQ || b == F1_BGE || b == F1_BGEU) - emith_or_r_imm(sr, T); - return; - } else if (r == FC) - val++, inv = (b == F1_BEQ); - } + b = 
emith_cond_check(cond, &r, &s); + if (r == Z0) { + if (b == F1_BEQ || b == F1_BGE || b == F1_BGEU) + emith_or_r_imm(sr, T); + return; + } else if (r == FC) + val++, inv = (b == F1_BEQ); if (!val) switch (b) { case F1_BEQ: if (s == Z0) { EMIT(R5_SLTU_IMM(AT,r ,1)); r=AT; val++; break; } EMIT(R5_XOR_REG(AT, r, s)); EMIT(R5_SLTU_IMM(AT,AT, 1)); r=AT; val++; break; - case F1_BNE: if (s == Z0) { EMIT(R5_SLTU_IMM(AT,Z0,r)); r=AT; val++; break; } + case F1_BNE: if (s == Z0) { EMIT(R5_SLTU_REG(AT,Z0,r)); r=AT; val++; break; } EMIT(R5_XOR_REG(AT, r, s)); - EMIT(R5_SLTU_IMM(AT,Z0,AT)); r=AT; val++; break; + EMIT(R5_SLTU_REG(AT,Z0,AT)); r=AT; val++; break; case F1_BLTU: EMIT(R5_SLTU_REG(AT, r, s)); r=AT; val++; break; case F1_BGEU: EMIT(R5_SLTU_REG(AT, r, s)); r=AT; val++; inv++; break; case F1_BLT: EMIT(R5_SLT_REG(AT, r, s)); r=AT; val++; break; From db5b05727691cdcfc5dd84d303760ac09a441815 Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 2 Jul 2020 16:14:16 +0200 Subject: [PATCH 168/174] libretro make fix for non-arm architectures --- Makefile.libretro | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile.libretro b/Makefile.libretro index 0cc97695d..0d18b4ba8 100644 --- a/Makefile.libretro +++ b/Makefile.libretro @@ -42,7 +42,7 @@ STATIC_LINKING:= 0 TARGET_NAME := picodrive LIBM := -lm GIT_VERSION ?= $(shell git rev-parse --short HEAD || echo unknown) -ifneq ($(GIT_VERSION)," unknown") +ifneq ($(GIT_VERSION),"unknown") CFLAGS += -DGIT_VERSION=\"$(GIT_VERSION)\" endif @@ -756,6 +756,9 @@ endif SHARED ?= -shared LDFLAGS += $(SHARED) $(fpic) +ifneq ($(ARCH), arm) +ARCH = $(shell $(CC) -dumpmachine | awk -F '-' '{print $$1}') +endif PLATFORM = libretro NO_CONFIG_MAK = yes From 2eaba39cae82defffa007f4d6ab7ba399aca7fc6 Mon Sep 17 00:00:00 2001 From: kub Date: Fri, 3 Jul 2020 00:46:40 +0200 Subject: [PATCH 169/174] SDL UI, 2x overlay mode, for improved color resolution --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 
903f93696..b7cc26d8e 100644 --- a/Makefile +++ b/Makefile @@ -121,6 +121,7 @@ OBJS += platform/libpicofe/gl_platform.o USE_FRONTEND = 1 endif ifeq "$(PLATFORM)" "generic" +CFLAGS += -DSDL_OVERLAY_2X OBJS += platform/linux/emu.o platform/linux/blit.o # FIXME OBJS += platform/common/plat_sdl.o OBJS += platform/libpicofe/plat_sdl.o platform/libpicofe/in_sdl.o From 9e1e3bd5bc9a9356c1efa8dcac3c04c7a1942009 Mon Sep 17 00:00:00 2001 From: kub Date: Sun, 5 Jul 2020 13:16:35 +0200 Subject: [PATCH 170/174] libretro, changes to allow for both standalone and libretro build --- Makefile | 44 +++++++++++++++++++++++++------------------- platform/libpicofe | 2 +- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/Makefile b/Makefile index b7cc26d8e..95a1da4a7 100644 --- a/Makefile +++ b/Makefile @@ -1,28 +1,9 @@ $(LD) ?= $(CC) TARGET ?= PicoDrive -CFLAGS += -Wall -ggdb -ffunction-sections -fdata-sections CFLAGS += -I. CYCLONE_CC ?= gcc CYCLONE_CXX ?= g++ -ifneq ("$(PLATFORM)", "libretro") - CFLAGS += -Wall -g - ifndef DEBUG - CFLAGS += -O3 -DNDEBUG - endif -endif - -# This is actually needed, believe me. -# If you really have to disable this, set NO_ALIGN_FUNCTIONS elsewhere. -ifndef NO_ALIGN_FUNCTIONS -CFLAGS += -falign-functions=2 -endif -LDFLAGS += -Wl,--gc-sections - -# profiling -pprof ?= 0 -gperf ?= 0 - all: config.mak target_ ifndef NO_CONFIG_MAK @@ -40,6 +21,31 @@ else # NO_CONFIG_MAK config.mak: endif +# This is actually needed, believe me. +# If you really have to disable this, set NO_ALIGN_FUNCTIONS elsewhere. 
+ifndef NO_ALIGN_FUNCTIONS +CFLAGS += -falign-functions=2 +endif + +# profiling +pprof ?= 0 +gperf ?= 0 + +ifneq ("$(PLATFORM)", "libretro") + CFLAGS += -Wall -g +ifneq ($(findstring gcc,$(CC)),) + CFLAGS += -ffunction-sections -fdata-sections + LDFLAGS += -Wl,--gc-sections +endif + ifndef DEBUG + CFLAGS += -O3 -DNDEBUG + endif + + LD = $(CC) + OBJOUT ?= -o + LINKOUT ?= -o +endif + ifeq ("$(PLATFORM)",$(filter "$(PLATFORM)","gp2x" "opendingux" "rpi1")) # very small caches, avoid optimization options making the binary much bigger CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp -ffast-math diff --git a/platform/libpicofe b/platform/libpicofe index 795b71c57..811cef4d9 160000 --- a/platform/libpicofe +++ b/platform/libpicofe @@ -1 +1 @@ -Subproject commit 795b71c571518b310a22138141bb6d1cd08d85f6 +Subproject commit 811cef4d9f3772d0bbf6c1f0434e5860c9550abc From ca9d270d1374aa67fa6ce79daeef2215ede33422 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 7 Jul 2020 10:19:23 +0200 Subject: [PATCH 171/174] libretro, fix for windows and osx --- tools/mkoffsets.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index 8a1092e0a..d890fc0f6 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ -46,14 +46,22 @@ get_define () # prefix struct member member... line=$(printf "#define %-20s 0x%04x" $prefix$name $rodata) } +fn="${1:-.}/pico_int_offs.h" if echo $CFLAGS | grep -qe -flto; then CFLAGS="$CFLAGS -fno-lto"; fi + +# don't do this if readelf isn't available. it doesn't matter since offsets are +# only needed for the asm parts (currently mips/arm32) and those have readelf +if ! 
command -v readelf >/dev/null; then + echo "/* mkoffset.sh: readelf not found, offset table not created */" >$fn + echo "WARNING: readelf not found, offset table not created" + exit +fi # determine endianess echo '#include ' >/tmp/getoffs.c echo "const int32_t val = 1;" >>/tmp/getoffs.c compile_rodata ENDIAN=$(if [ "$rodata" -eq 1 ]; then echo be; else echo le; fi) # output header -fn="${1:-.}/pico_int_offs.h" echo "/* autogenerated by mkoffset.sh, do not edit */" >$fn echo "/* target endianess: $ENDIAN, compiled with: $CC $CFLAGS */" >>$fn # output offsets From f6f9a47b83b7e54105d2e42b0958fa55c560423d Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 7 Jul 2020 19:03:08 +0200 Subject: [PATCH 172/174] libretro, more fixes and cleanups for windows and osx --- Makefile | 1 + Makefile.libretro | 128 +++++++++++++-------------------------------- cpu/drc/emit_x86.c | 4 +- cpu/sh2/compiler.c | 35 ++++++------- cpu/sh2/compiler.h | 2 +- cpu/sh2/sh2.h | 4 ++ tools/mkoffsets.sh | 22 +++++--- 7 files changed, 76 insertions(+), 120 deletions(-) diff --git a/Makefile b/Makefile index 95a1da4a7..6fb90da29 100644 --- a/Makefile +++ b/Makefile @@ -174,6 +174,7 @@ OBJS += platform/libretro/libretro-common/streams/file_stream.o OBJS += platform/libretro/libretro-common/streams/file_stream_transforms.o OBJS += platform/libretro/libretro-common/vfs/vfs_implementation.o endif +PLATFORM_ZLIB = 1 endif ifeq "$(USE_FRONTEND)" "1" diff --git a/Makefile.libretro b/Makefile.libretro index 0d18b4ba8..4add450c3 100644 --- a/Makefile.libretro +++ b/Makefile.libretro @@ -52,6 +52,8 @@ asm_ym2612 = 0 asm_misc = 0 asm_cdmemory = 0 asm_mix = 0 +asm_32xdraw = 0 +asm_32xmemory = 0 fpic := @@ -97,8 +99,8 @@ else ifeq ($(platform), osx) endif ifeq ($(arch),ppc) CFLAGS += -DBLARGG_BIG_ENDIAN=1 -D__ppc__ -DFAMEC_NO_GOTOS - else - use_sh2drc = 1 + else + use_sh2drc = 1 endif OSXVER = `sw_vers -productVersion | cut -d. 
-f 2` OSX_LT_MAVERICKS = `(( $(OSXVER) <= 9)) && echo "YES"` @@ -120,14 +122,14 @@ else ifeq ($(platform), staticios) CXX += -miphoneos-version-min=8.0 CC_AS += -miphoneos-version-min=8.0 CFLAGS += -miphoneos-version-min=8.0 - ARCH := arm + ARCH := aarch64 STATIC_LINKING = 1 use_cyclone = 0 use_fame = 1 use_drz80 = 0 use_cz80 = 1 - use_sh2drc = 0 + use_sh2drc = 1 use_svpdrc = 0 # iOS @@ -142,7 +144,7 @@ else ifneq (,$(findstring ios,$(platform))) ifeq ($(platform),ios-arm64) CC = clang -arch arm64 -isysroot $(IOSSDK) CXX = clang++ -arch arm64 -isysroot $(IOSSDK) - CFLAGS += -marm -DARM -D__aarch64__=1 + CFLAGS += -marm -DARM -D__aarch64__=1 else CC = clang -arch armv7 -isysroot $(IOSSDK) CXX = clang++ -arch armv7 -isysroot $(IOSSDK) @@ -157,35 +159,32 @@ ifeq ($(platform),$(filter $(platform),ios9 ios-arm64)) CXX += -miphoneos-version-min=8.0 CC_AS += -miphoneos-version-min=8.0 CFLAGS += -miphoneos-version-min=8.0 + ARCH := aarch64 else CC += -miphoneos-version-min=5.0 CXX += -miphoneos-version-min=5.0 CC_AS += -miphoneos-version-min=5.0 CFLAGS += -miphoneos-version-min=5.0 -endif ARCH := arm +endif use_cyclone = 0 use_fame = 1 use_drz80 = 0 use_cz80 = 1 - ifeq ($(platform),ios-arm64) - use_sh2drc = 0 - use_svpdrc = 0 - else - use_sh2drc = 1 - use_svpdrc = 1 - endif + use_sh2drc = 1 + use_svpdrc = 0 # tvOS else ifeq ($(platform), tvos-arm64) - ARCH := arm + ARCH := aarch64 use_cyclone = 0 use_fame = 1 use_drz80 = 0 use_cz80 = 1 - use_sh2drc = 0 + use_sh2drc = 1 use_svpdrc = 0 + TARGET := $(TARGET_NAME)_libretro_tvos.dylib SHARED := -dynamiclib fpic := -fPIC @@ -194,6 +193,9 @@ else ifeq ($(platform), tvos-arm64) IOSSDK := $(shell xcodebuild -version -sdk appletvos Path) endif CC_AS = perl ./tools/gas-preprocessor.pl $(CC) + CC = clang -arch arm64 -isysroot $(IOSSDK) + CXX = clang++ -arch arm64 -isysroot $(IOSSDK) + CFLAGS += -marm -DARM -D__aarch64__=1 CFLAGS += -DIOS # PS3 @@ -206,17 +208,11 @@ else ifeq ($(platform), ps3) NO_MMAP = 1 DONT_COMPILE_IN_ZLIB = 1 - 
asm_memory = 0 - asm_render = 0 - asm_ym2612 = 0 - asm_misc = 0 - asm_cdpico = 0 - asm_cdmemory = 0 - asm_mix = 0 use_cyclone = 0 use_fame = 1 use_drz80 = 0 use_cz80 = 1 + use_sh2drc = 1 # sncps3 else ifeq ($(platform), sncps3) @@ -228,17 +224,11 @@ else ifeq ($(platform), sncps3) NO_MMAP = 1 DONT_COMPILE_IN_ZLIB = 1 - asm_memory = 0 - asm_render = 0 - asm_ym2612 = 0 - asm_misc = 0 - asm_cdpico = 0 - asm_cdmemory = 0 - asm_mix = 0 use_cyclone = 0 use_fame = 1 use_drz80 = 0 use_cz80 = 1 + use_sh2drc = 1 # Lightweight PS3 Homebrew SDK else ifeq ($(platform), psl1ght) @@ -250,17 +240,11 @@ else ifeq ($(platform), psl1ght) NO_MMAP = 1 DONT_COMPILE_IN_ZLIB = 1 - asm_memory = 0 - asm_render = 0 - asm_ym2612 = 0 - asm_misc = 0 - asm_cdpico = 0 - asm_cdmemory = 0 - asm_mix = 0 use_cyclone = 0 use_fame = 1 use_drz80 = 0 use_cz80 = 1 + use_sh2drc = 1 # PSP else ifeq ($(platform), psp1) @@ -273,17 +257,11 @@ else ifeq ($(platform), psp1) NO_MMAP = 1 DONT_COMPILE_IN_ZLIB = 1 - asm_memory = 0 asm_render = 1 - asm_ym2612 = 0 - asm_misc = 0 - asm_cdpico = 0 - asm_cdmemory = 0 - asm_mix = 0 - use_cyclone = 0 use_fame = 1 use_drz80 = 0 use_cz80 = 1 + use_sh2drc = 1 # PS2 else ifeq ($(platform), ps2) @@ -297,17 +275,12 @@ else ifeq ($(platform), ps2) NO_MMAP = 1 DONT_COMPILE_IN_ZLIB = 1 - asm_memory = 0 asm_render = 1 - asm_ym2612 = 0 - asm_misc = 0 - asm_cdpico = 0 - asm_cdmemory = 0 - asm_mix = 0 use_cyclone = 0 use_fame = 1 use_drz80 = 0 use_cz80 = 1 + use_sh2drc = 1 OBJS += platform/ps2/asm.o @@ -327,14 +300,6 @@ else ifeq ($(platform), ctr) ARCH = arm ARM_ASM = 1 - asm_memory = 1 - asm_render = 1 - asm_ym2612 = 1 - asm_misc = 1 - asm_cdpico = 1 - asm_cdmemory = 1 - asm_mix = 1 - use_cyclone = 1 use_fame = 0 use_drz80 = 1 @@ -357,14 +322,6 @@ else ifeq ($(platform), raspberrypi) fpic := -fPIC DONT_COMPILE_IN_ZLIB = 1 - asm_memory = 1 - asm_render = 1 - asm_ym2612 = 1 - asm_misc = 1 - asm_cdpico = 1 - asm_cdmemory = 1 - asm_mix = 1 - use_cyclone = 1 use_fame = 0 use_drz80 = 1 @@ 
-386,14 +343,8 @@ else ifeq ($(platform), vita) NO_MMAP = 1 DONT_COMPILE_IN_ZLIB = 1 ARCH = arm + ARM_ASM=1 - asm_memory = 1 - asm_render = 1 - asm_ym2612 = 1 - asm_misc = 1 - asm_cdpico = 1 - asm_cdmemory = 1 - asm_mix = 1 use_cyclone = 1 use_fame = 0 use_drz80 = 1 @@ -443,13 +394,14 @@ else ifeq ($(platform), switch) else ifeq ($(platform), libnx) include $(DEVKITPRO)/libnx/switch_rules TARGET := $(TARGET_NAME)_libretro_$(platform).a - ARCH := arm64 + ARCH := aarch64 CFLAGS += -O3 -fomit-frame-pointer -ffast-math -I$(DEVKITPRO)/libnx/include/ -fPIE -Wl,--allow-multiple-definition CFLAGS += -specs=$(DEVKITPRO)/libnx/switch.specs CFLAGS += -D__SWITCH__ -DHAVE_LIBNX CFLAGS += -DARM -D__aarch64__=1 -march=armv8-a -mtune=cortex-a57 -mtp=soft -ffast-math -mcpu=cortex-a57+crc+fp+simd -ffunction-sections CFLAGS += -Ifrontend/switch -ftree-vectorize STATIC_LINKING=1 + use_sh2drc = 1 # QNX else ifeq ($(platform), qnx) @@ -489,7 +441,6 @@ else ifeq ($(platform), classic_armv7_a7) HAVE_NEON = 1 ARCH = arm BUILTIN_GPU = neon - USE_DYNAREC = 1 ifeq ($(shell echo `$(CC) -dumpversion` "< 4.9" | bc -l), 1) CFLAGS += -march=armv7-a else @@ -499,6 +450,9 @@ else ifeq ($(platform), classic_armv7_a7) LDFLAGS += -static-libgcc -static-libstdc++ endif endif + + use_sh2drc = 1 + use_svpdrc = 1 # (armv8 a35, hard point, neon based) ### # Playstation Classic @@ -518,19 +472,14 @@ else ifeq ($(platform), classic_armv8_a35) CPPFLAGS += $(CFLAGS) ASFLAGS += $(CFLAGS) HAVE_NEON = 1 - ARCH = arm + ARCH = aarch64 BUILTIN_GPU = neon CFLAGS += -march=armv8-a LDFLAGS += -static-libgcc -static-libstdc++ - asm_memory = 0 - asm_render = 0 - asm_ym2612 = 0 - asm_misc = 0 - asm_cdpico = 0 - asm_cdmemory = 0 - asm_mix = 0 + + use_sh2drc = 1 use_cyclone = 0 - use_fame = 1 + use_fame = 1 use_drz80 = 0 use_cz80 = 1 ####################################### @@ -597,21 +546,14 @@ else ifeq ($(platform), emscripten) # GCW0 else ifeq ($(platform), gcw0) TARGET := $(TARGET_NAME)_libretro.so - CC = 
/opt/gcw0-toolchain/usr/bin/mipsel-linux-gcc - AR = /opt/gcw0-toolchain/usr/bin/mipsel-linux-ar + CC = mipsel-linux-gcc + AR = mipsel-linux-ar SHARED := -shared -nostdlib fpic := -fPIC LIBM := DONT_COMPILE_IN_ZLIB = 1 CFLAGS += -ffast-math -march=mips32 -mtune=mips32r2 -mhard-float - asm_memory = 0 - asm_render = 0 - asm_ym2612 = 0 - asm_misc = 0 - asm_cdpico = 0 - asm_cdmemory = 0 - asm_mix = 0 use_cyclone = 0 use_fame = 1 use_drz80 = 0 @@ -744,6 +686,8 @@ asm_ym2612 = 1 asm_misc = 1 asm_cdmemory = 1 asm_mix = 1 +asm_32xdraw = 1 +asm_32xmemory = 1 endif CFLAGS += $(fpic) diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index c836e1595..7006beff5 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -733,7 +733,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common /* mov r <-> [ebp+#offs] */ \ if ((offs) == 0) { \ emith_deref_modrm(op, 0, r, rs); \ - } else if (abs(offs) >= 0x80) { \ + } else if ((s32)(offs) < -0x80 || (s32)(offs) >= 0x80) { \ emith_deref_modrm(op, 2, r, rs); \ EMIT(offs, u32); \ } else { \ @@ -1075,7 +1075,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define PARAM_REGS { xCX, xDX, xR8, xR9 } #define PRESERVED_REGS { xSI, xDI, xR12, xR13, xR14, xR15, xBX, xBP } #define TEMPORARY_REGS { xAX, xR10, xR11 } -#define STATIC_SH2_REGS { SHR_SR,xBX , SHR_R(0),xR15 , SH2_R(1),xR14 } +#define STATIC_SH2_REGS { SHR_SR,xBX , SHR_R(0),xR15 , SHR_R(1),xR14 } #define host_arg2reg(rd, arg) \ switch (arg) { \ diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index cba97e2b5..c8940432d 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -1273,24 +1273,23 @@ static void dr_flush_tcache(int tcid) blink_free[tcid] = NULL; memset(unresolved_links[tcid], 0, sizeof(*unresolved_links[0]) * HASH_TABLE_SIZE(tcid)); memset(hash_tables[tcid], 0, sizeof(*hash_tables[0]) * HASH_TABLE_SIZE(tcid)); - if (Pico32xMem->sdram != NULL) { - if (tcid == 0) { // ROM, RAM - memset(Pico32xMem->drcblk_ram, 0, 
sizeof(Pico32xMem->drcblk_ram)); - memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); - memset(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)); - memset(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)); - memset(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)); - memset(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)); - sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0; - } else { - memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram)); - memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); - memset(Pico32xMem->drcblk_da[tcid - 1], 0, sizeof(Pico32xMem->drcblk_da[tcid - 1])); - memset(Pico32xMem->drclit_da[tcid - 1], 0, sizeof(Pico32xMem->drclit_da[tcid - 1])); - memset(sh2s[tcid - 1].branch_cache, -1, sizeof(sh2s[0].branch_cache)); - memset(sh2s[tcid - 1].rts_cache, -1, sizeof(sh2s[0].rts_cache)); - sh2s[tcid - 1].rts_cache_idx = 0; - } + + if (tcid == 0) { // ROM, RAM + memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram)); + memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); + memset(sh2s[0].branch_cache, -1, sizeof(sh2s[0].branch_cache)); + memset(sh2s[1].branch_cache, -1, sizeof(sh2s[1].branch_cache)); + memset(sh2s[0].rts_cache, -1, sizeof(sh2s[0].rts_cache)); + memset(sh2s[1].rts_cache, -1, sizeof(sh2s[1].rts_cache)); + sh2s[0].rts_cache_idx = sh2s[1].rts_cache_idx = 0; + } else { + memset(Pico32xMem->drcblk_ram, 0, sizeof(Pico32xMem->drcblk_ram)); + memset(Pico32xMem->drclit_ram, 0, sizeof(Pico32xMem->drclit_ram)); + memset(Pico32xMem->drcblk_da[tcid - 1], 0, sizeof(Pico32xMem->drcblk_da[tcid - 1])); + memset(Pico32xMem->drclit_da[tcid - 1], 0, sizeof(Pico32xMem->drclit_da[tcid - 1])); + memset(sh2s[tcid - 1].branch_cache, -1, sizeof(sh2s[0].branch_cache)); + memset(sh2s[tcid - 1].rts_cache, -1, sizeof(sh2s[0].rts_cache)); + sh2s[tcid - 1].rts_cache_idx = 0; } #if (DRC_DEBUG & 4) tcache_dsm_ptrs[tcid] = tcache_ring[tcid].base; diff --git a/cpu/sh2/compiler.h 
b/cpu/sh2/compiler.h index b1b7487fb..76b445520 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -31,7 +31,7 @@ unsigned short scan_block(uint32_t base_pc, int is_slave, unsigned char *op_flags, uint32_t *end_pc, uint32_t *base_literals, uint32_t *end_literals); -#if defined(DRC_SH2) && defined(__GNUC__) +#if defined(DRC_SH2) && defined(__GNUC__) && !defined(__clang__) // direct access to some host CPU registers used by the DRC if gcc is used. // XXX MUST match SHR_SR definitions in cpu/drc/emit_*.c; should be moved there // XXX yuck, there's no portable way to determine register size. Use long long diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index b0054c05c..4b0b33841 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -106,7 +106,11 @@ int sh2_execute_interpreter(SH2 *sh2c, int cycles); static __inline void sh2_execute_prepare(SH2 *sh2, int use_drc) { +#ifdef DRC_SH2 sh2->run = use_drc ? sh2_execute_drc : sh2_execute_interpreter; +#else + sh2->run = sh2_execute_interpreter; +#endif } static __inline int sh2_execute(SH2 *sh2, int cycles) diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index d890fc0f6..6b086a939 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ -8,6 +8,20 @@ CC=${CC:-gcc} # endianess of target (automagically determined below) ENDIAN= +# don't do this if ELF format isn't used. it doesn't matter since offsets are +# only needed for the asm parts (currently mips/arm32) and those have ELF +check_elf () +{ + echo '#include ' >/tmp/getoffs.c + echo "const int32_t val = 1;" >>/tmp/getoffs.c + $CC $CFLAGS -I .. -c /tmp/getoffs.c -o /tmp/getoffs.o || exit 1 + if ! command -v readelf >/dev/null || ! 
file /tmp/getoffs.o | grep -q ELF; then + echo "/* mkoffset.sh: no readelf or not ELF, offset table not created */" >$fn + echo "WARNING: no readelf or not ELF, offset table not created" + exit + fi +} + # compile with target C compiler and extract value from .rodata section compile_rodata () { @@ -49,13 +63,7 @@ get_define () # prefix struct member member... fn="${1:-.}/pico_int_offs.h" if echo $CFLAGS | grep -qe -flto; then CFLAGS="$CFLAGS -fno-lto"; fi -# don't do this if readelf isn't available. it doesn't matter since offsets are -# only needed for the asm parts (currently mips/arm32) and those have readelf -if ! command -v readelf >/dev/null; then - echo "/* mkoffset.sh: readelf not found, offset table not created */" >$fn - echo "WARNING: readelf not found, offset table not created" - exit -fi +check_elf # determine endianess echo '#include ' >/tmp/getoffs.c echo "const int32_t val = 1;" >>/tmp/getoffs.c From 68f83baff36a63fbd4057faa2be198e70e5ab57b Mon Sep 17 00:00:00 2001 From: kub Date: Wed, 8 Jul 2020 19:50:41 +0200 Subject: [PATCH 173/174] sh2 drc, fix for x86_64 backend --- cpu/drc/emit_x86.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cpu/drc/emit_x86.c b/cpu/drc/emit_x86.c index 7006beff5..60b2b6a2c 100644 --- a/cpu/drc/emit_x86.c +++ b/cpu/drc/emit_x86.c @@ -915,8 +915,10 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common #define emith_call_cond(cond, ptr) \ emith_call(ptr) -#define emith_call_reg(r) \ - EMIT_OP_MODRM(0xff, 3, 2, r) +#define emith_call_reg(r) do { \ + EMIT_REX_IF(0, 0, r); \ + EMIT_OP_MODRM(0xff, 3, 2, (r)&7); \ +} while (0) #define emith_call_ctx(offs) do { \ EMIT_OP_MODRM(0xff, 2, 2, CONTEXT_REG); \ @@ -934,8 +936,10 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common emith_deref_modrm(0x03, 0, r, xSP); /* add r, [xsp] */ \ } while (0) -#define emith_jump_reg(r) \ - EMIT_OP_MODRM(0xff, 3, 4, r) +#define emith_jump_reg(r) do { \ + EMIT_REX_IF(0, 0, 
r); \ + EMIT_OP_MODRM(0xff, 3, 4, (r)&7); \ +} while (0) #define emith_jump_ctx(offs) do { \ EMIT_OP_MODRM(0xff, 2, 4, CONTEXT_REG); \ From b718a54e58a3ba419af67b5c84fd1dd363a90432 Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 9 Jul 2020 08:42:33 +0200 Subject: [PATCH 174/174] sh2, fix for interpreter crash if drc is compiled in too --- cpu/sh2/compiler.c | 3 +++ cpu/sh2/compiler.h | 4 ++-- cpu/sh2/sh2.h | 3 ++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index c8940432d..085a61793 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -5392,7 +5392,10 @@ int sh2_execute_drc(SH2 *sh2c, int cycles) // others are usual SH2 flags sh2c->sr &= 0x3f3; sh2c->sr |= cycles << 12; + + sh2c->state |= SH2_IN_DRC; sh2_drc_entry(sh2c); + sh2c->state &= ~SH2_IN_DRC; // TODO: irq cycles ret_cycles = (int32_t)sh2c->sr >> 12; diff --git a/cpu/sh2/compiler.h b/cpu/sh2/compiler.h index 76b445520..9642492db 100644 --- a/cpu/sh2/compiler.h +++ b/cpu/sh2/compiler.h @@ -72,11 +72,11 @@ extern void REGPARM(1) (*sh2_drc_restore_sr)(SH2 *sh2); #define DRC_DECLARE_SR register long _sh2_sr asm(DRC_SR_REG) #endif #define DRC_SAVE_SR(sh2) \ - if (likely((sh2->state&(SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN)) \ + if (likely(sh2->state & SH2_IN_DRC)) \ sh2->sr = (s32)_sh2_sr // sh2_drc_save_sr(sh2) #define DRC_RESTORE_SR(sh2) \ - if (likely((sh2->state&(SH2_STATE_RUN|SH2_STATE_SLEEP)) == SH2_STATE_RUN)) \ + if (likely(sh2->state & SH2_IN_DRC)) \ _sh2_sr = (s32)sh2->sr // sh2_drc_restore_sr(sh2) #else diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index 4b0b33841..b9267d740 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -48,7 +48,8 @@ typedef struct SH2_ #define SH2_STATE_CPOLL (1 << 2) // polling comm regs #define SH2_STATE_VPOLL (1 << 3) // polling VDP #define SH2_STATE_RPOLL (1 << 4) // polling address in SDRAM -#define SH2_TIMER_RUN (1 << 8) // SOC WDT timer is running +#define SH2_TIMER_RUN (1 << 7) // SOC WDT timer is 
running +#define SH2_IN_DRC (1 << 8) // DRC in use unsigned int state; uint32_t poll_addr; int poll_cycles;