Skip to content

Commit

Permalink
First step towards parallel QPU programs
Browse files Browse the repository at this point in the history
Fixed buffer locking, which caused the crashes described in #1
Still stalling even at 480p@30 after a couple of seconds
However, the camera-emulation mode works fine even at 480p@250, so this bug is likely to be in the camera code
However, increasing the QPU program count from 5 to 10 by enabling split columns makes it stall faster, so there might still be more to it
  • Loading branch information
Seneral committed Oct 5, 2020
1 parent e354c83 commit 07212c1
Show file tree
Hide file tree
Showing 4 changed files with 256 additions and 123 deletions.
55 changes: 35 additions & 20 deletions main_qpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#include "user-vcsm.h" // for vcsm_vc_hdl_from_ptr

#define DEFAULT 0 // source and target buffer uniforms only
#define FULL_FRAME 1 // uniforms for full frame processing, e.g. blit
#define FULL_FRAME 1 // uniforms for full frame processing, e.g. blit
#define TILED 2 // uniforms and setup for tiled frame processing, e.g. blob detection
#define BITMSK 3 // uniforms and full frame processing, with bit mask target, e.g. blob detection

Expand All @@ -29,7 +29,7 @@ int main(int argc, char **argv)
// ---- Read arguments ----

GCS_CameraParams params = {
.mmalEnc = MMAL_ENCODING_I420,
.mmalEnc = MMAL_ENCODING_I420,
.width = (uint16_t)camWidth,
.height = (uint16_t)camHeight,
.fps = (uint16_t)camFPS,
Expand Down Expand Up @@ -103,6 +103,9 @@ int main(int argc, char **argv)
QPU_UserProgramInfo upInfo;
// MMAL Camera
GCS *gcs;
// Camera emulation buffers
const int emulBufCnt = 4;
QPU_BUFFER camEmulBuf[emulBufCnt];
// Frame Counter
auto startTime = std::chrono::high_resolution_clock::now();
auto lastTime = startTime;
Expand Down Expand Up @@ -282,21 +285,21 @@ int main(int argc, char **argv)
gcs_start(gcs);
printf("-- Camera Stream started --\n");
#else
QPU_BUFFER camEmulBuf;
qpu_allocBuffer(&camEmulBuf, &base, camWidth*camHeight*3, 4096); // Emulating full YUV frame
qpu_lockBuffer(&camEmulBuf);
for (int i = 0; i < emulBufCnt; i++)
{
uint8_t *YUVFrameData = (uint8_t*)camEmulBuf.ptr.arm.vptr;
qpu_allocBuffer(&camEmulBuf[i], &base, camWidth*camHeight*3, 4096); // Emulating full YUV frame
qpu_lockBuffer(&camEmulBuf[i]);
uint8_t *YUVFrameData = (uint8_t*)camEmulBuf[i].ptr.arm.vptr;
for (int x = 0; x < camWidth; x++)
{
for (int y = 0; y < camHeight; y++)
{
// Write test data in Y component (UV are after this, but are not used)
YUVFrameData[y*camWidth + x] = x/8 + y%8;
YUVFrameData[y*camWidth + x] = ((x+camWidth/(i+1))*255/camWidth)%256 + ((y+camHeight/(i+1))*255/camHeight)%256;
}
}
qpu_unlockBuffer(&camEmulBuf[i]);
}
qpu_unlockBuffer(&camEmulBuf);
#endif

// ---- Start Loop ----
Expand Down Expand Up @@ -327,11 +330,9 @@ int main(int argc, char **argv)
uint32_t cameraBufferHandle = vcsm_vc_hdl_from_ptr(cameraBuffer);
// Lock VCSM buffer to get VC-space address
uint32_t cameraBufferPtr = mem_lock(base.mb, cameraBufferHandle);
// Unlock VCSM buffer (no need to keep locked, VC-space address won't change)
mem_unlock(base.mb, cameraBufferHandle);
#else
qpu_lockBuffer(&camEmulBuf);
uint32_t cameraBufferPtr = camEmulBuf.ptr.vc;
qpu_lockBuffer(&camEmulBuf[numFrames%emulBufCnt]);
uint32_t cameraBufferPtr = camEmulBuf[numFrames%emulBufCnt].ptr.vc;
#endif

// ---- Uniform preparation ----
Expand All @@ -352,30 +353,43 @@ int main(int argc, char **argv)

// ---- Program execution ----

/* if (mode == BITMSK)
qpu_lockBuffer(&bitmskBuffer);
else if (!drawToFrameBuffer)
qpu_lockBuffer(&targetBuffer);
*/
// Execute programs
int result;
if (mode == TILED)
{ // Execute numInstances programs each with their own set of uniforms
qpu_executeProgramDirect(&program, &base, numInstances, 6, 6, &perfState);
// Uncomment to execute only one program each frame, one after another
// program.progmem.uniforms.vc += 6*4*(numFrames%numInstances);
// qpu_executeProgramDirect(&program, &base, 1, 6, 6, &perfState);
// program.progmem.uniforms.vc -= 6*4*(numFrames%numInstances);
result = qpu_executeProgramDirect(&program, &base, numInstances, 6, 6, &perfState);
}
else
{ // Execute single program handling full frame
qpu_executeProgramDirect(&program, &base, 1, program.progmem.uniformsSize, 0, &perfState);
result = qpu_executeProgramDirect(&program, &base, 1, program.progmem.uniformsSize, 0, &perfState);
}

// Log errors occurred during execution
qpu_logErrors(&base);

#ifdef CAMERA
// Unlock VCSM buffer (no need to keep locked, VC-space address won't change)
mem_unlock(base.mb, cameraBufferHandle);
// Return camera buffer to camera
gcs_returnFrameBuffer(gcs);
#else
qpu_unlockBuffer(&camEmulBuf);
qpu_unlockBuffer(&camEmulBuf[numFrames%emulBufCnt]);
#endif

/* // Unlock target buffers
if (mode == BITMSK)
qpu_unlockBuffer(&bitmskBuffer);
else if (!drawToFrameBuffer)
qpu_unlockBuffer(&targetBuffer);
*/
if (result != 0)
break;


// ---- Debugging and Statistics ----

Expand Down Expand Up @@ -459,7 +473,8 @@ int main(int argc, char **argv)
gcs_destroy(gcs);
printf("-- Camera Stream stopped --\n");
#else
qpu_releaseBuffer(&camEmulBuf);
for (int i = 0; i < emulBufCnt; i++)
qpu_releaseBuffer(&camEmulBuf[i]);
#endif


Expand Down
12 changes: 8 additions & 4 deletions qpu/qpu_program.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,19 @@ int qpu_executeProgramDirect (QPU_PROGRAM *program, QPU_BASE *base, int numInst,
base->peripherals[V3D_DBQITE] = 0; // Disable IRQ
base->peripherals[V3D_DBQITC] = -1; // Resets IRQ flags

// Clear caches - L2, TMU, uniforms, instructions
base->peripherals[V3D_L2CACTL] = (1<<2); // Clear L2 cache
base->peripherals[V3D_SLCACTL] = -1; // Clear other caches
base->peripherals[V3D_SLCACTL] = 0b1111<<24 | 0b1111<<16 | 0b1111<<8 | 0b1111<<0;

// Note QPU user program numbers to determine when all our instances finished
int qpuQueued = (base->peripherals[V3D_SRQCS] & 0b111111);
int qpuFinished = (base->peripherals[V3D_SRQCS] >> 16) & 0xFF;
int qpuWaitCount = (qpuQueued + qpuFinished + numInst) % 256;
//base->peripheral[V3D_SRQCS] = (1<<7) | (1<<8) | (1<<16); // Reset error bit and counts

if (qpuWaitCount < qpuFinished)
printf("QPU executing %d programs; waiting for %d with %d queued and %d already finished! \n", numInst, qpuWaitCount, qpuQueued, qpuFinished);

if (perfState != NULL)
perfState->qpusUsed = numInst;

Expand All @@ -84,14 +88,14 @@ int qpu_executeProgramDirect (QPU_PROGRAM *program, QPU_BASE *base, int numInst,
while((base->peripherals[V3D_SRQCS] & 0b111111) == 16)
{
cnt++;
if (cnt % 100000 == 0)
if (cnt % 10000 == 0)
{
qpu_logErrors(base);
// qpu_logStalls(base);
}
if (cnt % 100000 == 0 && perfState != NULL)
if (cnt % 10000 == 0 && perfState != NULL)
qpu_updatePerformance(base, perfState);
if (cnt % 1000000 == 0)
if (cnt % 100000 == 0)
{
printf("QPU stalled - queued %d / %d! \n", q, numInst);
if (perfState != NULL) qpu_logPerformance(perfState);
Expand Down
104 changes: 5 additions & 99 deletions qpu_programs/qpu_blit_tiled.asm
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@
.set lineCount, ra3
mov srcAddr, unif;
mov tgtAddr, unif;
mov srcStride, unif;
mov tgtStride, unif;
mov lineWidth, unif;
mov srcStride, unif;
mov tgtStride, unif;
mov lineWidth, unif;
mov lineCount, unif;

# Variables
.set y, ra4 # Iterator over all lines
.set y, ra4 # Iterator over all lines
.set srcPtr, ra5
.set tgtPtr, ra6
.set vpmSetup, rb2
Expand All @@ -32,9 +32,6 @@ ldi num32, 32;

# TODO: Generate vector mask to allow for any multiple of 8-wide columns (not just 16x8)

# ------- Block 0 Start
#or.setf nop, mutex, nop;

# Create VPM Setup
ldi r0, vpm_setup(0, 1, h32(0));
ldi r1, 4;
Expand Down Expand Up @@ -63,94 +60,13 @@ max y, lineCount, 1;

:y # Loop over lines

# ------- Block 1 Start -----
# or.setf nop, mutex, nop;

.rep px, 2

# Initiate VPM write and make sure last VDW finished
read vw_wait;
mov vw_setup, vpmSetup;

.if 1 # --- Code 1
# Normal debug code. Always works, without mutex or whatever configuration
# So VPM access should not be problematic

# Constant Alpha
mov ra17.8dsi, 255;
# Element number (1-16) in Red
mul24 ra17.8csi, elem_num, num16;
# Constant 0 green
mov ra17.8bsi, 0;
# QPU number in Blue
ldi r0, 21;
mul24 ra17.8asi, qpu_num, r0;
nop;

# Write to VPM
mov vpm, ra17;
mov vpm, ra17;
mov vpm, ra17;
mov vpm, ra17;

.endif

.if 0 # --- Code 2
# Simple TMU Test code

mov t0s, srcPtr;
ldtmu0
mov ra18, r4;

.endif

.if 0 # --- Code 3
# TMU Test code with mutex

read mutex;

mov t0s, srcPtr;
ldtmu0
mov ra18, r4;

mov mutex, 0;

.endif

.if 0 # --- Code 4
# TMU read to r0
mov t0s, srcPtr;
ldtmu0

mov r0, r4;

# Write to VPM
fmul vpm.8888, r0, 1.0; # using mul encoding
fmul vpm.8888, r0, 1.0; # using mul encoding
fmul vpm.8888, r0, 1.0; # using mul encoding
fmul vpm.8888, r0, 1.0; # using mul encoding

.endif

.if 0 # --- Code 5
# TMU read to r0 with nop; afterwards
mov t0s, srcPtr;
ldtmu0

mov r0, r4;
nop;

# Write to VPM
fmul vpm.8888, r0, 1.0; # using mul encoding
fmul vpm.8888, r0, 1.0; # using mul encoding
fmul vpm.8888, r0, 1.0; # using mul encoding
fmul vpm.8888, r0, 1.0; # using mul encoding

.endif

.if 0 # --- Code 6
# Normal TMU camera write (works if executed one after another)

# Read TMU
mov t0s, srcPtr;
ldtmu0

Expand All @@ -166,27 +82,20 @@ max y, lineCount, 1;
fmul vpm.8888, ra22, 1.0; # using mul encoding
fmul vpm.8888, ra23, 1.0; # using mul encoding

.endif

# Initiate VDW from VPM to memory
mov vw_setup, vdwSetup;
mov vw_setup, vdwStride;
mov vw_addr, tgtPtr;

# Increase address
add srcPtr, srcPtr, 4;
# nop;
add tgtPtr, tgtPtr, num16;
# nop;

# Make sure to finish VDW
# read vw_wait;

.endr

# ------- Block 1 End
# or.setf mutex, nop, nop;

# Increase addresses to next line
add srcPtr, srcPtr, srcStride;
add tgtPtr, tgtPtr, tgtStride;
Expand All @@ -198,9 +107,6 @@ max y, lineCount, 1;
nop
nop

# ------- Block 0 End
#or.setf mutex, nop, nop;

mov.setf irq, nop;

nop; thrend
Expand Down
Loading

0 comments on commit 07212c1

Please sign in to comment.