GX2+TCL: Reimplement command buffer submission

- GX2 utilizes TCL(.rpl) API for command submission instead of directly writing to an internal GPU fifo
- Submission & retire timestamps are correctly implemented as incremental counters
- Command buffering behaviour matches console
- Fixes race conditions on aarch64
This commit is contained in:
Exzap 2025-05-14 18:59:50 +02:00
parent 96765e4ac6
commit 28ea70b6d8
21 changed files with 761 additions and 472 deletions

View file

@ -59,7 +59,7 @@ void gx2Export_GX2SwapScanBuffers(PPCInterpreter_t* hCPU)
if (isPokken)
GX2::GX2DrawDone();
GX2ReserveCmdSpace(5+2);
GX2::GX2ReserveCmdSpace(5+2);
uint64 tick64 = PPCInterpreter_getMainCoreCycleCounter() / 20ULL;
lastSwapTime = tick64;
@ -86,24 +86,16 @@ void gx2Export_GX2SwapScanBuffers(PPCInterpreter_t* hCPU)
GX2::GX2WaitForFlip();
}
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
osLib_returnFromFunction(hCPU, 0);
}
void gx2Export_GX2CopyColorBufferToScanBuffer(PPCInterpreter_t* hCPU)
{
cemuLog_log(LogType::GX2, "GX2CopyColorBufferToScanBuffer(0x{:08x},{})", hCPU->gpr[3], hCPU->gpr[4]);
GX2ReserveCmdSpace(5);
GX2::GX2ReserveCmdSpace(10);
// todo: proper implementation
// hack: Avoid running to far ahead of GPU. Normally this would be guaranteed by the circular buffer model, which we currently dont fully emulate
if(GX2::GX2WriteGather_getReadWriteDistance() > 32*1024*1024 )
{
debug_printf("Waiting for GPU to catch up...\n");
PPCInterpreter_relinquishTimeslice(); // release current thread
return;
}
GX2ColorBuffer* colorBuffer = (GX2ColorBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER, 9));
@ -309,81 +301,6 @@ void gx2Export_GX2SetSemaphore(PPCInterpreter_t* hCPU)
osLib_returnFromFunction(hCPU, 0);
}
// HLE export handler for the guest-visible GX2Flush().
// Logs the call, forwards to _GX2SubmitToTCL() and returns 0 to the guest.
void gx2Export_GX2Flush(PPCInterpreter_t* hCPU)
{
cemuLog_log(LogType::GX2, "GX2Flush()");
// _GX2SubmitToTCL() does nothing when called off the main GX2 core or while a display list is active
_GX2SubmitToTCL();
osLib_returnFromFunction(hCPU, 0);
}
uint8* _GX2LastFlushPtr[PPC_CORE_COUNT] = {NULL};
uint64 _prevReturnedGPUTime = 0;
// Returns a GPU timestamp derived from coreinit::OSGetSystemTime() scaled by 20000.
// The returned value is strictly greater than the value returned by the previous call
// (tracked in _prevReturnedGPUTime), so no two calls ever yield identical timestamps.
uint64 Latte_GetTime()
{
	uint64 candidate = coreinit::OSGetSystemTime();
	candidate *= 20000ULL;
	// fall back to "previous + 1" whenever the scaled clock did not advance
	const uint64 result = (candidate > _prevReturnedGPUTime) ? candidate : (_prevReturnedGPUTime + 1);
	_prevReturnedGPUTime = result;
	return result;
}
void _GX2SubmitToTCL()
{
uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
// do nothing if called from non-main GX2 core
if (GX2::sGX2MainCoreIndex != coreIndex)
{
cemuLog_logDebug(LogType::Force, "_GX2SubmitToTCL() called on non-main GX2 core");
return;
}
if( gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL )
return; // quit if in display list
_GX2LastFlushPtr[coreIndex] = (gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex]);
// update last submitted CB timestamp
uint64 commandBufferTimestamp = Latte_GetTime();
LatteGPUState.lastSubmittedCommandBufferTimestamp.store(commandBufferTimestamp);
cemuLog_log(LogType::GX2, "Submitting GX2 command buffer with timestamp {:016x}", commandBufferTimestamp);
// submit HLE packet to write retirement timestamp
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_SET_CB_RETIREMENT_TIMESTAMP, 2));
gx2WriteGather_submitU32AsBE((uint32)(commandBufferTimestamp>>32ULL));
gx2WriteGather_submitU32AsBE((uint32)(commandBufferTimestamp&0xFFFFFFFFULL));
}
// Returns the number of bytes written to the given core's ring buffer since the last flush.
// Without a recorded flush position the distance is measured from the start of the ring buffer.
uint32 _GX2GetUnflushedBytes(uint32 coreIndex)
{
	uint8* const writePos = gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex];
	uint8* const lastFlushPos = _GX2LastFlushPtr[coreIndex];
	// never flushed: everything from the ring buffer start counts as unflushed
	if (lastFlushPos == nullptr)
		return (uint32)(writePos - gx2WriteGatherPipe.gxRingBuffer);
	// write position is behind the last flush position, i.e. the writer wrapped around.
	// this isn't 100% correct since the bytes between the last flush address and the wrap around point are ignored
	if (lastFlushPos > writePos)
		return (uint32)(writePos - gx2WriteGatherPipe.gxRingBuffer + 4);
	// common case: plain distance between last flush and current write position
	return (uint32)(writePos - lastFlushPos);
}
/*
 * Called before commands are written to the command stream.
 * Submits the pending commands via _GX2SubmitToTCL() once 0x1000 or more unflushed bytes have accumulated.
 * Does nothing while the calling core is recording a display list.
 * Note: reservedFreeSpaceInU32 is not consulted by this implementation.
 */
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32)
{
	const uint32 currentCore = coreinit::OSGetCoreId();
	// display list writes are never submitted from here
	const bool insideDisplayList = (gx2WriteGatherPipe.displayListStart[currentCore] != MPTR_NULL);
	if (insideDisplayList)
		return;
	if (_GX2GetUnflushedBytes(currentCore) >= 0x1000)
		_GX2SubmitToTCL();
}
void gx2_load()
{
osLib_addFunction("gx2", "GX2GetContextStateDisplayList", gx2Export_GX2GetContextStateDisplayList);
@ -445,10 +362,6 @@ void gx2_load()
// semaphore
osLib_addFunction("gx2", "GX2SetSemaphore", gx2Export_GX2SetSemaphore);
// command buffer
osLib_addFunction("gx2", "GX2Flush", gx2Export_GX2Flush);
GX2::GX2Init_writeGather();
GX2::GX2MemInit();
GX2::GX2ResourceInit();
GX2::GX2CommandInit();

View file

@ -67,10 +67,4 @@ void gx2Export_GX2MarkScanBufferCopied(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetDefaultState(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetupContextStateEx(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetContextState(PPCInterpreter_t* hCPU);
// command buffer
uint32 _GX2GetUnflushedBytes(uint32 coreIndex);
void _GX2SubmitToTCL();
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32);
void gx2Export_GX2SetContextState(PPCInterpreter_t* hCPU);

View file

@ -132,7 +132,6 @@ namespace GX2
depthFirstSlice = _swapEndianU32(depthBuffer->viewFirstSlice);
depthNumSlices = _swapEndianU32(depthBuffer->viewNumSlices);
}
gx2WriteGather_submit(pm4HeaderType3(IT_HLE_CLEAR_COLOR_DEPTH_STENCIL, 23),
hleClearFlags,
colorPhysAddr,

View file

@ -4,178 +4,397 @@
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "Cafe/OS/libs/coreinit/coreinit.h"
#include "Cafe/OS/libs/coreinit/coreinit_Thread.h"
#include "Cafe/OS/libs/TCL/TCL.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "GX2.h"
#include "GX2_Command.h"
#include "GX2_Shader.h"
#include "GX2_Misc.h"
#include "OS/libs/coreinit/coreinit_MEM.h"
extern uint8* gxRingBufferReadPtr;
GX2WriteGatherPipeState gx2WriteGatherPipe = { 0 };
namespace GX2
{
GX2PerCoreCBState s_perCoreCBState[Espresso::CORE_COUNT];
}
void gx2WriteGather_submitU32AsBE(uint32 v)
{
uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == NULL)
if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr)
return;
*(uint32*)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) = _swapEndianU32(v);
(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) += 4;
*(uint32*)(GX2::s_perCoreCBState[coreIndex].currentWritePtr) = _swapEndianU32(v);
GX2::s_perCoreCBState[coreIndex].currentWritePtr++;
cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s));
}
void gx2WriteGather_submitU32AsLE(uint32 v)
{
uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == NULL)
if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr)
return;
*(uint32*)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) = v;
(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) += 4;
*(uint32*)(GX2::s_perCoreCBState[coreIndex].currentWritePtr) = v;
GX2::s_perCoreCBState[coreIndex].currentWritePtr++;
cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s));
}
void gx2WriteGather_submitU32AsLEArray(uint32* v, uint32 numValues)
{
uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == NULL)
if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr)
return;
memcpy_dwords((*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]), v, numValues);
(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) += 4 * numValues;
memcpy_dwords(GX2::s_perCoreCBState[coreIndex].currentWritePtr, v, numValues);
GX2::s_perCoreCBState[coreIndex].currentWritePtr += numValues;
cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s));
}
namespace GX2
{
sint32 gx2WriteGatherCurrentMainCoreIndex = -1;
bool gx2WriteGatherInited = false;
void GX2WriteGather_ResetToDefaultState()
struct GX2CommandState // mapped to PPC space since the GPU writes here
{
gx2WriteGatherCurrentMainCoreIndex = -1;
gx2WriteGatherInited = false;
}
// command pool
MEMPTR<uint32be> commandPoolBase;
uint32 commandPoolSizeInU32s;
MEMPTR<uint32be> gpuCommandReadPtr;
// timestamp
uint64be lastSubmissionTime;
};
void GX2Init_writeGather() // init write gather, make current core
SysAllocator<GX2CommandState> s_commandState;
GX2PerCoreCBState s_mainCoreLastCommandState;
bool s_cbBufferIsInternallyAllocated;
void GX2Command_StartNewCommandBuffer(uint32 numU32s);
// called from GX2Init. Allocates a 4MB memory chunk from which command buffers are suballocated from
void GX2Init_commandBufferPool(void* bufferBase, uint32 bufferSize)
{
if (gx2WriteGatherPipe.gxRingBuffer == NULL)
gx2WriteGatherPipe.gxRingBuffer = (uint8*)malloc(GX2_COMMAND_RING_BUFFER_SIZE);
if (gx2WriteGatherCurrentMainCoreIndex == sGX2MainCoreIndex)
return; // write gather already configured for same core
for (sint32 i = 0; i < PPC_CORE_COUNT; i++)
cemu_assert_debug(!s_commandState->commandPoolBase); // should not be allocated already
// setup command buffer pool. If not provided allocate a 4MB or custom size buffer
uint32 poolSize = bufferSize ? bufferSize : 0x400000; // 4MB (can be overwritten by custom GX2Init parameters?)
if (bufferBase)
{
if (i == sGX2MainCoreIndex)
s_commandState->commandPoolBase = (uint32be*)bufferBase;
s_cbBufferIsInternallyAllocated = false;
}
else
{
s_commandState->commandPoolBase = (uint32be*)coreinit::_weak_MEMAllocFromDefaultHeapEx(poolSize, 0x100);
s_cbBufferIsInternallyAllocated = true;
}
if (!s_commandState->commandPoolBase)
{
cemuLog_log(LogType::Force, "GX2: Failed to allocate command buffer pool");
}
s_commandState->commandPoolSizeInU32s = poolSize / sizeof(uint32be);
s_commandState->gpuCommandReadPtr = s_commandState->commandPoolBase;
// init per-core command buffer state
for (uint32 i = 0; i < Espresso::CORE_COUNT; i++)
{
s_perCoreCBState[i].bufferPtr = nullptr;
s_perCoreCBState[i].bufferSizeInU32s = 0;
s_perCoreCBState[i].currentWritePtr = nullptr;
}
// start first command buffer for main core
GX2Command_StartNewCommandBuffer(0x100);
}
// Releases the command buffer pool and clears the pool bookkeeping.
// The pool memory is only freed when it was allocated internally; a caller-provided buffer is left untouched.
// Safe to call when no pool is set up.
void GX2Shutdown_commandBufferPool()
{
	// nothing to do without a pool
	if (!s_commandState->commandPoolBase)
		return;
	// free before the base pointer is cleared below
	if (s_cbBufferIsInternallyAllocated)
	{
		coreinit::_weak_MEMFreeToDefaultHeap(s_commandState->commandPoolBase.GetPtr());
	}
	s_cbBufferIsInternallyAllocated = false;
	// reset pool description
	s_commandState->gpuCommandReadPtr = nullptr;
	s_commandState->commandPoolSizeInU32s = 0;
	s_commandState->commandPoolBase = nullptr;
}
// Returns the position the GPU is currently reading from, as an index in u32 units relative to the pool base.
// The underlying pointer is updated via a memory write command submitted to the GPU, so it is loaded atomically.
uint32 GX2Command_GetPoolGPUReadIndex()
{
	stdx::atomic_ref<MEMPTR<uint32be>> atomicReadPtr(s_commandState->gpuCommandReadPtr);
	MEMPTR<uint32be> gpuReadPos = atomicReadPtr.load();
	cemu_assert_debug(gpuReadPos);
	uint32be* poolStart = s_commandState->commandPoolBase.GetPtr();
	return (uint32)(gpuReadPos.GetPtr() - poolStart);
}
void GX2Command_WaitForNextBufferRetired()
{
uint64 retiredTimeStamp = GX2GetRetiredTimeStamp();
retiredTimeStamp += 1;
// but cant be higher than the submission timestamp
stdx::atomic_ref<uint64be> _lastSubmissionTime(s_commandState->lastSubmissionTime);
uint64 submissionTimeStamp = _lastSubmissionTime.load();
if (retiredTimeStamp > submissionTimeStamp)
retiredTimeStamp = submissionTimeStamp;
GX2WaitTimeStamp(retiredTimeStamp);
}
// Makes the given buffer the write target of the calling core.
// buffer:        start of the buffer; writing begins here
// sizeInU32s:    capacity of the buffer in 32-bit words
// isDisplayList: whether the buffer is a display list rather than a GPU command buffer
void GX2Command_SetupCoreCommandBuffer(uint32be* buffer, uint32 sizeInU32s, bool isDisplayList)
{
	const uint32 currentCore = coreinit::OSGetCoreId();
	GX2PerCoreCBState& target = s_perCoreCBState[currentCore];
	target.isDisplayList = isDisplayList;
	target.bufferSizeInU32s = sizeInU32s;
	target.bufferPtr = buffer;
	target.currentWritePtr = buffer; // nothing written yet
}
void GX2Command_StartNewCommandBuffer(uint32 numU32s)
{
uint32 coreIndex = coreinit::OSGetCoreId();
auto& coreCBState = s_perCoreCBState[coreIndex];
numU32s = std::max<uint32>(numU32s, 0x100);
// grab space from command buffer pool and if necessary wait for it
uint32be* bufferPtr = nullptr;
uint32 bufferSizeInU32s = 0;
uint32 readIndex;
while (true)
{
// try to grab buffer data from first available spot:
// 1. At the current write location up to the end of the buffer (avoiding an overlap with the read location)
// 2. From the start of the buffer up to the read location
readIndex = GX2Command_GetPoolGPUReadIndex();
uint32be* nextWritePos = coreCBState.bufferPtr ? coreCBState.bufferPtr + coreCBState.bufferSizeInU32s : s_commandState->commandPoolBase.GetPtr();
uint32 writeIndex = nextWritePos - s_commandState->commandPoolBase;
uint32 poolSizeInU32s = s_commandState->commandPoolSizeInU32s;
// readIndex == writeIndex can mean either buffer full or buffer empty
// we could use GX2GetRetiredTimeStamp() == GX2GetLastSubmittedTimeStamp() to determine if the buffer is truly empty
// but this can have false negatives since the last submission timestamp is updated independently of the read index
// so instead we just avoid ever filling the buffer completely
cemu_assert_debug(readIndex < poolSizeInU32s);
cemu_assert_debug(writeIndex < poolSizeInU32s);
if (writeIndex < readIndex)
{
gx2WriteGatherPipe.writeGatherPtrGxBuffer[i] = gx2WriteGatherPipe.gxRingBuffer;
gx2WriteGatherPipe.writeGatherPtrWrite[i] = &gx2WriteGatherPipe.writeGatherPtrGxBuffer[i];
// writeIndex has wrapped around
uint32 wordsAvailable = readIndex - writeIndex;
if (wordsAvailable > 0)
wordsAvailable--; // avoid writeIndex becoming equal to readIndex
if (wordsAvailable >= numU32s)
{
bufferPtr = s_commandState->commandPoolBase + writeIndex;
bufferSizeInU32s = wordsAvailable;
break;
}
}
else
{
gx2WriteGatherPipe.writeGatherPtrGxBuffer[i] = NULL;
gx2WriteGatherPipe.writeGatherPtrWrite[i] = NULL;
uint32 wordsAvailable = poolSizeInU32s - writeIndex;
if (wordsAvailable > 0)
wordsAvailable--; // avoid writeIndex becoming equal to readIndex
if (wordsAvailable >= numU32s)
{
bufferPtr = nextWritePos;
bufferSizeInU32s = wordsAvailable;
break;
}
// not enough space at end of buffer, try to grab from the beginning of the buffer
wordsAvailable = readIndex;
if (wordsAvailable > 0)
wordsAvailable--; // avoid writeIndex becoming equal to readIndex
if (wordsAvailable >= numU32s)
{
bufferPtr = s_commandState->commandPoolBase;
bufferSizeInU32s = wordsAvailable;
break;
}
}
gx2WriteGatherPipe.displayListStart[i] = MPTR_NULL;
gx2WriteGatherPipe.writeGatherPtrDisplayList[i] = NULL;
gx2WriteGatherPipe.displayListMaxSize[i] = 0;
GX2Command_WaitForNextBufferRetired();
}
cemu_assert_debug(bufferPtr);
bufferSizeInU32s = std::min<uint32>(numU32s, 0x20000); // size cap
#ifdef CEMU_DEBUG_ASSERT
uint32 newWriteIndex = ((bufferPtr - s_commandState->commandPoolBase) + bufferSizeInU32s) % s_commandState->commandPoolSizeInU32s;
cemu_assert_debug(newWriteIndex != readIndex);
#endif
// setup buffer and make it the current write gather target
cemu_assert_debug(bufferPtr >= s_commandState->commandPoolBase && (bufferPtr + bufferSizeInU32s) <= s_commandState->commandPoolBase + s_commandState->commandPoolSizeInU32s);
GX2Command_SetupCoreCommandBuffer(bufferPtr, bufferSizeInU32s, false);
}
// Submits a command buffer to the GPU through TCL::TCLSubmitToRing.
// buffer / sizeInU32s:      the command buffer and its length in 32-bit words
// completionGPUReadPointer: optional; when set, a memory write is appended that stores the address of the
//                           end of the buffer into this location after the GPU is done with the buffer
// triggerMarkerInterrupt:   when false, the NO_MARKER_INTERRUPT submission flag is set
void GX2Command_SubmitCommandBuffer(uint32be* buffer, uint32 sizeInU32s, MEMPTR<uint32be>* completionGPUReadPointer, bool triggerMarkerInterrupt)
{
	uint32be cmd[10];
	uint32 cmdLen = 0;
	// indirect buffer packet referencing the command buffer by physical address
	cmd[cmdLen++] = pm4HeaderType3(IT_INDIRECT_BUFFER_PRIV, 3);
	cmd[cmdLen++] = memory_virtualToPhysical(MEMPTR<void>(buffer).GetMPTR());
	cmd[cmdLen++] = 0x00000000; // address high bits
	cmd[cmdLen++] = sizeInU32s;
	if (completionGPUReadPointer)
	{
		// memory write packet which updates completionGPUReadPointer once the buffer has been processed
		// NOTE(review): the meaning of the "| 2" address bits and of the 0x40000 word is not visible here - confirm against the PM4 MEM_WRITE definition
		cmd[cmdLen++] = pm4HeaderType3(IT_MEM_WRITE, 4);
		cmd[cmdLen++] = memory_virtualToPhysical(MEMPTR<void>(completionGPUReadPointer).GetMPTR()) | 2;
		cmd[cmdLen++] = 0x40000;
		cmd[cmdLen++] = MEMPTR<void>(buffer + sizeInU32s).GetMPTR(); // value to write
		cmd[cmdLen++] = 0x00000000;
	}
	// assemble submission flags
	betype<TCL::TCLSubmissionFlag> submissionFlags{};
	if (!triggerMarkerInterrupt)
	{
		submissionFlags |= TCL::TCLSubmissionFlag::NO_MARKER_INTERRUPT;
	}
	submissionFlags |= TCL::TCLSubmissionFlag::USE_RETIRED_MARKER;
	TCL::TCLSubmitToRing(cmd, cmdLen, &submissionFlags, &s_commandState->lastSubmissionTime);
}
void GX2Command_PadCurrentBuffer()
{
uint32 coreIndex = coreinit::OSGetCoreId();
auto& coreCBState = s_perCoreCBState[coreIndex];
if (!coreCBState.currentWritePtr)
return;
uint32 writeDistance = (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr);
if ((writeDistance&7) != 0)
{
uint32 distanceToPad = 0x8 - (writeDistance & 0x7);
while (distanceToPad)
{
*coreCBState.currentWritePtr = pm4HeaderType2Filler();
coreCBState.currentWritePtr++;
distanceToPad--;
}
}
}
// Flushes the calling core's current command buffer.
// - If the core is recording a display list nothing is submitted (only a debug log is emitted).
// - If the command buffer contains data it is padded to 32 byte alignment, submitted to the GPU and
//   replaced with a new buffer of at least numU32sForNextBuffer words.
// - If the command buffer is empty it is not submitted. It is only replaced when it is too small to
//   hold numU32sForNextBuffer words, so that callers such as GX2ReserveCmdSpace get the space they asked for.
// numU32sForNextBuffer:   minimum capacity (in 32-bit words) required of the buffer that is current on return
// triggerMarkerInterrupt: forwarded to GX2Command_SubmitCommandBuffer
void GX2Command_Flush(uint32 numU32sForNextBuffer, bool triggerMarkerInterrupt)
{
	uint32 coreIndex = coreinit::OSGetCoreId();
	auto& coreCBState = s_perCoreCBState[coreIndex];
	if (coreCBState.isDisplayList)
	{
		// display list
		cemu_assert_debug((uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr) < coreCBState.bufferSizeInU32s);
		cemuLog_logDebugOnce(LogType::Force, "GX2 flush called on display list");
		return;
	}
	// command buffer
	if (coreCBState.currentWritePtr != coreCBState.bufferPtr)
	{
		// pad the command buffer to 32 byte alignment
		GX2Command_PadCurrentBuffer();
		// submit it to the GPU
		uint32 bufferLength = (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr);
		cemu_assert_debug(bufferLength <= coreCBState.bufferSizeInU32s);
		GX2Command_SubmitCommandBuffer(coreCBState.bufferPtr, bufferLength, &s_commandState->gpuCommandReadPtr, triggerMarkerInterrupt);
		GX2Command_StartNewCommandBuffer(numU32sForNextBuffer);
		return;
	}
	// current buffer is empty so we dont need to queue it
	// but if it is too small for the upcoming writes it has to be replaced by a larger one.
	// The comparison must be against the capacity of the current buffer (not the size of the whole pool),
	// otherwise an undersized empty buffer is kept and subsequent writes run past its end.
	// Cores without a command buffer (bufferPtr == nullptr) are left untouched.
	if (coreCBState.bufferPtr != nullptr && numU32sForNextBuffer > coreCBState.bufferSizeInU32s)
		GX2Command_StartNewCommandBuffer(numU32sForNextBuffer);
}
void GX2Flush()
{
GX2Command_Flush(256, true);
}
// Returns the timestamp of the most recent submission, read atomically from the shared command state.
uint64 GX2GetLastSubmittedTimeStamp()
{
	stdx::atomic_ref<uint64be> atomicSubmissionTime(s_commandState->lastSubmissionTime);
	const uint64 lastSubmitted = atomicSubmissionTime.load();
	return lastSubmitted;
}
// Returns the TIMESTAMP_LAST_BUFFER_RETIRED timestamp as reported by TCL.
uint64 GX2GetRetiredTimeStamp()
{
	uint64be retiredTimestamp = 0;
	TCL::TCLTimestamp(TCL::TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED, &retiredTimestamp);
	return retiredTimestamp;
}
// Waits until the TIMESTAMP_LAST_BUFFER_RETIRED timestamp reaches tsWait.
// The wait is given a timeout of 60 * Espresso::TIMER_CLOCK (60 seconds).
// The result of the wait is not inspected; this function always returns true.
bool GX2WaitTimeStamp(uint64 tsWait)
{
	// handle GPU timeout here? For now the wait simply times out after 60 seconds
	const auto timeoutTicks = Espresso::TIMER_CLOCK * 60;
	TCL::TCLWaitTimestamp(TCL::TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED, tsWait, timeoutTicks);
	return true;
}
/*
* Guarantees that the requested amount of space is available on the current command buffer
* If the space is not available, the current command buffer is pushed to the GPU and a new one is allocated
*/
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32)
{
uint32 coreIndex = coreinit::OSGetCoreId();
auto& coreCBState = s_perCoreCBState[coreIndex];
if (coreCBState.currentWritePtr == nullptr)
return;
uint32 writeDistance = (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr);
if (writeDistance + reservedFreeSpaceInU32 > coreCBState.bufferSizeInU32s)
{
GX2Command_Flush(reservedFreeSpaceInU32, true);
}
gx2WriteGatherCurrentMainCoreIndex = sGX2MainCoreIndex;
gx2WriteGatherInited = true;
}
void GX2WriteGather_beginDisplayList(PPCInterpreter_t* hCPU, MPTR buffer, uint32 maxSize)
{
uint32 coreIndex = PPCInterpreter_getCoreIndex(hCPU);
gx2WriteGatherPipe.displayListStart[coreIndex] = buffer;
gx2WriteGatherPipe.displayListMaxSize[coreIndex] = maxSize;
// set new write gather ptr
gx2WriteGatherPipe.writeGatherPtrDisplayList[coreIndex] = memory_getPointerFromVirtualOffset(gx2WriteGatherPipe.displayListStart[coreIndex]);
gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] = &gx2WriteGatherPipe.writeGatherPtrDisplayList[coreIndex];
if (coreIndex == sGX2MainCoreIndex)
{
GX2Command_PadCurrentBuffer();
cemu_assert_debug(!s_perCoreCBState[coreIndex].isDisplayList);
s_mainCoreLastCommandState = s_perCoreCBState[coreIndex];
}
GX2Command_SetupCoreCommandBuffer(MEMPTR<uint32be>(buffer), maxSize/4, true);
}
uint32 GX2WriteGather_getDisplayListWriteDistance(sint32 coreIndex)
{
return (uint32)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] - memory_getPointerFromVirtualOffset(gx2WriteGatherPipe.displayListStart[coreIndex]));
}
uint32 GX2WriteGather_getFifoWriteDistance(uint32 coreIndex)
{
uint32 writeDistance = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] - gx2WriteGatherPipe.gxRingBuffer);
return writeDistance;
auto& coreCBState = s_perCoreCBState[coreIndex];
cemu_assert_debug(coreCBState.isDisplayList);
if (coreCBState.currentWritePtr == nullptr)
return 0;
return (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr) * 4;
}
uint32 GX2WriteGather_endDisplayList(PPCInterpreter_t* hCPU, MPTR buffer)
{
uint32 coreIndex = PPCInterpreter_getCoreIndex(hCPU);
if (gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL)
uint32 coreIndex = coreinit::OSGetCoreId();
auto& coreCBState = s_perCoreCBState[coreIndex];
GX2Command_PadCurrentBuffer();
uint32 finalWriteIndex = coreCBState.currentWritePtr - coreCBState.bufferPtr;
cemu_assert_debug(finalWriteIndex <= coreCBState.bufferSizeInU32s);
// if we are on the main GX2 core then restore the GPU command buffer
if (coreIndex == sGX2MainCoreIndex)
{
uint32 currentWriteSize = GX2WriteGather_getDisplayListWriteDistance(coreIndex);
// pad to 32 byte
if (gx2WriteGatherPipe.displayListMaxSize[coreIndex] >= ((gx2WriteGatherPipe.displayListMaxSize[coreIndex] + 0x1F) & ~0x1F))
{
while ((currentWriteSize & 0x1F) != 0)
{
gx2WriteGather_submitU32AsBE(pm4HeaderType2Filler());
currentWriteSize += 4;
}
}
// get size of written data
currentWriteSize = GX2WriteGather_getDisplayListWriteDistance(coreIndex);
// disable current display list and restore write gather ptr
gx2WriteGatherPipe.displayListStart[coreIndex] = MPTR_NULL;
if (sGX2MainCoreIndex == coreIndex)
gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] = &gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex];
else
gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] = NULL;
// return size of (written) display list
return currentWriteSize;
coreCBState = s_mainCoreLastCommandState;
}
else
{
// no active display list
// return a size of 0
return 0;
coreCBState.bufferPtr = nullptr;
coreCBState.currentWritePtr = nullptr;
coreCBState.bufferSizeInU32s = 0;
coreCBState.isDisplayList = false;
}
return finalWriteIndex * 4;
}
bool GX2GetCurrentDisplayList(betype<MPTR>* displayListAddr, uint32be* displayListSize)
bool GX2GetCurrentDisplayList(MEMPTR<uint32be>* displayListAddr, uint32be* displayListSize)
{
uint32 coreIndex = coreinit::OSGetCoreId();
if (gx2WriteGatherPipe.displayListStart[coreIndex] == MPTR_NULL)
auto& coreCBState = s_perCoreCBState[coreIndex];
if (!coreCBState.isDisplayList)
return false;
if (displayListAddr)
*displayListAddr = gx2WriteGatherPipe.displayListStart[coreIndex];
*displayListAddr = coreCBState.bufferPtr;
if (displayListSize)
*displayListSize = gx2WriteGatherPipe.displayListMaxSize[coreIndex];
*displayListSize = coreCBState.bufferSizeInU32s * sizeof(uint32be);
return true;
}
// returns true if we are writing to a display list
bool GX2GetDisplayListWriteStatus()
{
// returns true if we are writing to a display list
uint32 coreIndex = coreinit::OSGetCoreId();
return gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL;
}
uint32 GX2WriteGather_getReadWriteDistance()
{
uint32 coreIndex = sGX2MainCoreIndex;
uint32 writeDistance = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] + GX2_COMMAND_RING_BUFFER_SIZE - gxRingBufferReadPtr);
writeDistance %= GX2_COMMAND_RING_BUFFER_SIZE;
return writeDistance;
}
void GX2WriteGather_checkAndInsertWrapAroundMark()
{
uint32 coreIndex = coreinit::OSGetCoreId();
if (coreIndex != sGX2MainCoreIndex) // only if main gx2 core
return;
if (gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL)
return;
uint32 writeDistance = GX2WriteGather_getFifoWriteDistance(coreIndex);
if (writeDistance >= (GX2_COMMAND_RING_BUFFER_SIZE * 3 / 5))
{
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_FIFO_WRAP_AROUND, 1));
gx2WriteGather_submitU32AsBE(0); // empty word since we can't send commands with zero data words
gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] = gx2WriteGatherPipe.gxRingBuffer;
}
return s_perCoreCBState[coreIndex].isDisplayList;
}
void GX2BeginDisplayList(MEMPTR<void> displayListAddr, uint32 size)
@ -204,28 +423,23 @@ namespace GX2
memory_virtualToPhysical(addr),
0, // high address bits
size / 4);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
void GX2DirectCallDisplayList(void* addr, uint32 size)
{
// this API submits to TCL directly and bypasses write-gatherer
// its basically a way to manually submit a command buffer to the GPU
// as such it also affects the submission and retire timestamps
uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
cemu_assert_debug(coreIndex == sGX2MainCoreIndex);
coreIndex = sGX2MainCoreIndex; // always submit to main queue which is owned by GX2 main core (TCLSubmitToRing does not need this workaround)
uint32be* cmdStream = (uint32be*)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex]);
cmdStream[0] = pm4HeaderType3(IT_INDIRECT_BUFFER_PRIV, 3);
cmdStream[1] = memory_virtualToPhysical(MEMPTR<void>(addr).GetMPTR());
cmdStream[2] = 0;
cmdStream[3] = size / 4;
gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] += 16;
// update submission timestamp and retired timestamp
_GX2SubmitToTCL();
uint32 coreIndex = coreinit::OSGetCoreId();
if (coreIndex != sGX2MainCoreIndex)
{
cemuLog_logDebugOnce(LogType::Force, "GX2DirectCallDisplayList() called on non-main GX2 core");
}
if (!s_perCoreCBState[coreIndex].isDisplayList)
{
// make sure any preceeding commands are submitted first
GX2Command_Flush(0x100, false);
}
GX2Command_SubmitCommandBuffer(static_cast<uint32be*>(addr), size / 4, nullptr, false);
}
void GX2CopyDisplayList(MEMPTR<uint32be*> addr, uint32 size)
@ -288,6 +502,12 @@ namespace GX2
void GX2CommandInit()
{
cafeExportRegister("gx2", GX2Flush, LogType::GX2);
cafeExportRegister("gx2", GX2GetLastSubmittedTimeStamp, LogType::GX2);
cafeExportRegister("gx2", GX2GetRetiredTimeStamp, LogType::GX2);
cafeExportRegister("gx2", GX2WaitTimeStamp, LogType::GX2);
cafeExportRegister("gx2", GX2BeginDisplayList, LogType::GX2);
cafeExportRegister("gx2", GX2BeginDisplayListEx, LogType::GX2);
cafeExportRegister("gx2", GX2EndDisplayList, LogType::GX2);
@ -295,7 +515,6 @@ namespace GX2
cafeExportRegister("gx2", GX2GetCurrentDisplayList, LogType::GX2);
cafeExportRegister("gx2", GX2GetDisplayListWriteStatus, LogType::GX2);
cafeExportRegister("gx2", GX2CallDisplayList, LogType::GX2);
cafeExportRegister("gx2", GX2DirectCallDisplayList, LogType::GX2);
cafeExportRegister("gx2", GX2CopyDisplayList, LogType::GX2);
@ -305,7 +524,10 @@ namespace GX2
void GX2CommandResetToDefaultState()
{
GX2WriteGather_ResetToDefaultState();
s_commandState->commandPoolBase = nullptr;
s_commandState->commandPoolSizeInU32s = 0;
s_commandState->gpuCommandReadPtr = nullptr;
s_cbBufferIsInternallyAllocated = false;
}
}

View file

@ -2,21 +2,19 @@
#include "Cafe/HW/Latte/ISA/LatteReg.h"
#include "Cafe/HW/Espresso/Const.h"
struct GX2WriteGatherPipeState
namespace GX2
{
uint8* gxRingBuffer;
// each core has it's own write gatherer and display list state (writing)
uint8* writeGatherPtrGxBuffer[Espresso::CORE_COUNT];
uint8** writeGatherPtrWrite[Espresso::CORE_COUNT];
uint8* writeGatherPtrDisplayList[Espresso::CORE_COUNT];
MPTR displayListStart[Espresso::CORE_COUNT];
uint32 displayListMaxSize[Espresso::CORE_COUNT];
struct GX2PerCoreCBState
{
uint32be* bufferPtr;
uint32 bufferSizeInU32s;
uint32be* currentWritePtr;
bool isDisplayList;
};
extern GX2PerCoreCBState s_perCoreCBState[Espresso::CORE_COUNT];
};
extern GX2WriteGatherPipeState gx2WriteGatherPipe;
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32); // move to GX2 namespace eventually
void gx2WriteGather_submitU32AsBE(uint32 v);
void gx2WriteGather_submitU32AsLE(uint32 v);
void gx2WriteGather_submitU32AsLEArray(uint32* v, uint32 numValues);
@ -27,7 +25,8 @@ uint32 PPCInterpreter_getCurrentCoreIndex();
template <typename ...Targs>
inline void gx2WriteGather_submit_(uint32 coreIndex, uint32be* writePtr)
{
(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) = (uint8*)writePtr;
GX2::s_perCoreCBState[coreIndex].currentWritePtr = writePtr;
cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s));
}
template <typename T, typename ...Targs>
@ -75,17 +74,23 @@ template <typename ...Targs>
inline void gx2WriteGather_submit(Targs... args)
{
uint32 coreIndex = PPCInterpreter_getCurrentCoreIndex();
if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == nullptr)
if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr)
{
cemu_assert_suspicious(); // writing to command buffer without valid write pointer?
return;
uint32be* writePtr = (uint32be*)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]);
}
uint32be* writePtr = GX2::s_perCoreCBState[coreIndex].currentWritePtr;
gx2WriteGather_submit_(coreIndex, writePtr, std::forward<Targs>(args)...);
}
namespace GX2
{
uint32 GX2WriteGather_getReadWriteDistance();
void GX2WriteGather_checkAndInsertWrapAroundMark();
void GX2Command_Flush(uint32 numU32sForNextBuffer, bool triggerMarkerInterrupt = true);
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32);
uint64 GX2GetLastSubmittedTimeStamp();
uint64 GX2GetRetiredTimeStamp();
bool GX2WaitTimeStamp(uint64 tsWait);
void GX2BeginDisplayList(MEMPTR<void> displayListAddr, uint32 size);
void GX2BeginDisplayListEx(MEMPTR<void> displayListAddr, uint32 size, bool profiling);
@ -96,7 +101,8 @@ namespace GX2
bool GX2GetDisplayListWriteStatus();
void GX2Init_writeGather();
void GX2CommandInit();
void GX2Init_commandBufferPool(void* bufferBase, uint32 bufferSize);
void GX2Shutdown_commandBufferPool();
void GX2CommandResetToDefaultState();
}

View file

@ -168,7 +168,7 @@ uint32 _GX2Context_CalcStateSize()
void _GX2Context_CreateLoadDL()
{
GX2ReserveCmdSpace(3);
GX2::GX2ReserveCmdSpace(3);
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_CONTEXT_CONTROL, 2));
gx2WriteGather_submitU32AsBE(0x80000077);
gx2WriteGather_submitU32AsBE(0x80000077);
@ -176,7 +176,7 @@ void _GX2Context_CreateLoadDL()
void _GX2Context_WriteCmdDisableStateShadowing()
{
GX2ReserveCmdSpace(3);
GX2::GX2ReserveCmdSpace(3);
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_CONTEXT_CONTROL, 2));
gx2WriteGather_submitU32AsBE(0x80000000);
gx2WriteGather_submitU32AsBE(0x80000000);
@ -184,7 +184,7 @@ void _GX2Context_WriteCmdDisableStateShadowing()
void _GX2Context_cmdLoad(void* gx2ukn, uint32 pm4Header, MPTR physAddrRegArea, uint32 waitForIdle, uint32 numRegOffsetEntries, GX2RegLoadPktEntry_t* regOffsetEntries)
{
GX2ReserveCmdSpace(3 + numRegOffsetEntries*2);
GX2::GX2ReserveCmdSpace(3 + numRegOffsetEntries*2);
gx2WriteGather_submitU32AsBE(pm4Header);
gx2WriteGather_submitU32AsBE(physAddrRegArea);
gx2WriteGather_submitU32AsBE(waitForIdle);
@ -199,7 +199,6 @@ void _GX2Context_cmdLoad(void* gx2ukn, uint32 pm4Header, MPTR physAddrRegArea, u
void _GX2Context_WriteCmdRestoreState(GX2ContextState_t* gx2ContextState, uint32 ukn)
{
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
MPTR physAddrContextState = memory_virtualToPhysical(memory_getVirtualOffsetFromPointer(gx2ContextState));
_GX2Context_CreateLoadDL();
__cmdStateLoad(NULL, IT_LOAD_CONFIG_REG, gx2ContextState->hwContext.areaConfigReg, 0x80000000, configReg_loadPktEntries);
@ -212,7 +211,7 @@ void _GX2Context_WriteCmdRestoreState(GX2ContextState_t* gx2ContextState, uint32
void GX2SetDefaultState()
{
GX2ReserveCmdSpace(0x100);
GX2::GX2ReserveCmdSpace(0x100);
Latte::LATTE_PA_CL_VTE_CNTL reg{};
reg.set_VPORT_X_OFFSET_ENA(true).set_VPORT_X_SCALE_ENA(true);
@ -376,7 +375,6 @@ void gx2Export_GX2SetContextState(PPCInterpreter_t* hCPU)
osLib_returnFromFunction(hCPU, 0);
}
void gx2Export_GX2GetContextStateDisplayList(PPCInterpreter_t* hCPU)
{
cemuLog_log(LogType::GX2, "GX2GetContextStateDisplayList(0x{:08x}, 0x{:08x}, 0x{:08x})", hCPU->gpr[3], hCPU->gpr[4], hCPU->gpr[5]);

View file

@ -52,7 +52,6 @@ namespace GX2
0,
count,
0);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
void GX2DrawIndexedEx2(GX2PrimitiveMode2 primitiveMode, uint32 count, GX2IndexType indexType, void* indexData, uint32 baseVertex, uint32 numInstances, uint32 baseInstance)
@ -85,7 +84,6 @@ namespace GX2
pm4HeaderType3(IT_SET_CTL_CONST, 2), 1,
0 // baseInstance
);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
void GX2DrawEx(GX2PrimitiveMode2 primitiveMode, uint32 count, uint32 baseVertex, uint32 numInstances)
@ -109,7 +107,6 @@ namespace GX2
count,
0 // DRAW_INITIATOR
);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
void GX2DrawIndexedImmediateEx(GX2PrimitiveMode2 primitiveMode, uint32 count, GX2IndexType indexType, void* indexData, uint32 baseVertex, uint32 numInstances)
@ -177,7 +174,6 @@ namespace GX2
}
}
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
struct GX2DispatchComputeParam

View file

@ -16,18 +16,6 @@ namespace GX2
SysAllocator<coreinit::OSThreadQueue> g_vsyncThreadQueue;
SysAllocator<coreinit::OSThreadQueue> g_flipThreadQueue;
SysAllocator<coreinit::OSEvent> s_updateRetirementEvent;
std::atomic<uint64> s_lastRetirementTimestamp = 0;
// Called from GPU code when a command buffer is retired.
// Stores tsRetire as the latest retirement timestamp and signals
// s_updateRetirementEvent so that threads blocked on it (see GX2WaitTimeStamp)
// re-check the timestamp.
void __GX2NotifyNewRetirementTimestamp(uint64 tsRetire)
{
// both the timestamp update and the event signal happen while the scheduler lock is held
__OSLockScheduler();
s_lastRetirementTimestamp = tsRetire;
coreinit::OSSignalEventAllInternal(s_updateRetirementEvent.GetPtr());
__OSUnlockScheduler();
}
void GX2SetGPUFence(uint32be* fencePtr, uint32 mask, uint32 compareOp, uint32 compareValue)
{
GX2ReserveCmdSpace(7);
@ -210,16 +198,6 @@ namespace GX2
osLib_returnFromFunction(hCPU, 0);
}
// Returns the last submitted command buffer timestamp as recorded in LatteGPUState.
uint64 GX2GetLastSubmittedTimeStamp()
{
	const uint64 lastSubmitted = LatteGPUState.lastSubmittedCommandBufferTimestamp.load();
	return lastSubmitted;
}
// Returns the most recent retirement timestamp stored by __GX2NotifyNewRetirementTimestamp.
uint64 GX2GetRetiredTimeStamp()
{
	const uint64 lastRetired = s_lastRetirementTimestamp.load();
	return lastRetired;
}
void GX2WaitForVsync()
{
__OSLockScheduler();
@ -236,19 +214,6 @@ namespace GX2
__OSUnlockScheduler();
}
// Blocks until the retirement timestamp is at least tsWait.
// There is no timeout path, so the result is always true.
bool GX2WaitTimeStamp(uint64 tsWait)
{
	__OSLockScheduler();
	for (;;)
	{
		// stop waiting once the retired timestamp has reached the requested one
		if (s_lastRetirementTimestamp.load() >= tsWait)
			break;
		// not retired yet: wait on the retirement event, then re-check
		coreinit::OSWaitEventInternal(s_updateRetirementEvent.GetPtr());
	}
	__OSUnlockScheduler();
	return true;
}
void GX2DrawDone()
{
// optional force full sync (texture readback and occlusion queries)
@ -263,13 +228,10 @@ namespace GX2
gx2WriteGather_submitU32AsBE(0x00000000); // unused
}
// flush pipeline
if (_GX2GetUnflushedBytes(coreinit::OSGetCoreId()) > 0)
_GX2SubmitToTCL();
GX2Command_Flush(0x100, true);
uint64 ts = GX2GetLastSubmittedTimeStamp();
GX2WaitTimeStamp(ts);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
void GX2Init_event()
@ -294,25 +256,19 @@ namespace GX2
cafeExportRegister("gx2", GX2SetEventCallback, LogType::GX2);
cafeExportRegister("gx2", GX2GetEventCallback, LogType::GX2);
cafeExportRegister("gx2", GX2GetLastSubmittedTimeStamp, LogType::GX2);
cafeExportRegister("gx2", GX2GetRetiredTimeStamp, LogType::GX2);
cafeExportRegister("gx2", GX2WaitForVsync, LogType::GX2);
cafeExportRegister("gx2", GX2WaitForFlip, LogType::GX2);
cafeExportRegister("gx2", GX2WaitTimeStamp, LogType::GX2);
cafeExportRegister("gx2", GX2DrawDone, LogType::GX2);
coreinit::OSInitThreadQueue(g_vsyncThreadQueue.GetPtr());
coreinit::OSInitThreadQueue(g_flipThreadQueue.GetPtr());
coreinit::OSInitEvent(s_updateRetirementEvent, coreinit::OSEvent::EVENT_STATE::STATE_NOT_SIGNALED, coreinit::OSEvent::EVENT_MODE::MODE_AUTO);
coreinit::OSInitSemaphore(s_eventCbQueueSemaphore, 0);
}
void GX2EventResetToDefaultState()
{
s_callbackThreadLaunched = false;
s_lastRetirementTimestamp = 0;
for(auto& it : s_eventCallback)
{
it.callbackFuncPtr = nullptr;

View file

@ -81,19 +81,68 @@ namespace GX2
void _test_AddrLib();
void GX2Init(void* initSettings)
// Element type of the argument stream passed to GX2Init (read as big-endian via betype<GX2InitArg>).
using GX2InitArg = uint32;
// Argument ids understood by the GX2Init stream parser.
// Every id except EndOfArgs is followed by one value word in the stream.
enum class GX2InitArgId : GX2InitArg
{
EndOfArgs = 0, // terminates the argument stream
CommandPoolBase = 1, // value: address of the command buffer pool
CommandPoolSize = 2, // value: size of the command buffer pool
UknArg7 = 7, // unknown meaning; GX2Init skips its value
UknArg8 = 8, // unknown meaning; GX2Init skips its value
UknArg9 = 9, // unknown meaning; GX2Init skips its value
UknArg11 = 11, // unknown meaning; GX2Init skips its value
};
void GX2Init(betype<GX2InitArg>* initArgStream)
{
if (LatteGPUState.gx2InitCalled)
{
cemuLog_logDebug(LogType::Force, "GX2Init() called while already initialized");
return;
}
// parse init params from the stream
MEMPTR<void> commandPoolBase = nullptr;
uint32 commandPoolSize = 0;
if (initArgStream)
{
while (true)
{
GX2InitArgId paramId = static_cast<GX2InitArgId>((GX2InitArg)*initArgStream);
initArgStream++;
if (paramId == GX2InitArgId::EndOfArgs)
{
break;
}
else if (paramId == GX2InitArgId::CommandPoolBase)
{
commandPoolBase = MEMPTR<void>(*initArgStream);
initArgStream++;
}
else if (paramId == GX2InitArgId::CommandPoolSize)
{
commandPoolSize = *initArgStream;
initArgStream++;
}
else if (paramId == GX2InitArgId::UknArg7 ||
paramId == GX2InitArgId::UknArg8 ||
paramId == GX2InitArgId::UknArg9 ||
paramId == GX2InitArgId::UknArg11)
{
initArgStream++;
}
else
{
cemuLog_log(LogType::Force, "GX2Init: Unsupported init arg {}", (uint32)paramId);
}
}
}
// init main core
uint32 coreIndex = coreinit::OSGetCoreId();
cemuLog_log(LogType::GX2, "GX2Init() on core {} by thread 0x{:08x}", coreIndex, MEMPTR<OSThread_t>(coreinit::OSGetCurrentThread()).GetMPTR());
sGX2MainCoreIndex = coreIndex;
// init submodules
GX2::GX2Init_event();
GX2::GX2Init_writeGather();
GX2::GX2Init_commandBufferPool(commandPoolBase, commandPoolSize);
// init shared area
if (LatteGPUState.sharedAreaAddr == MPTR_NULL)
{
@ -112,6 +161,21 @@ namespace GX2
_test_AddrLib();
}
// Releases one GX2 init reference.
// If GX2 is not initialized this only logs a debug message. Otherwise the
// init counter is decremented, and once it reaches zero GX2DrawDone() and
// GX2Shutdown_commandBufferPool() are called.
void GX2Shutdown()
{
	const bool isInitialized = LatteGPUState.gx2InitCalled != 0;
	if (!isInitialized)
	{
		cemuLog_logDebug(LogType::Force, "GX2Shutdown() called while not initialized");
		return;
	}
	// drop one reference; only the final one performs the actual teardown
	LatteGPUState.gx2InitCalled--;
	if (LatteGPUState.gx2InitCalled == 0)
	{
		GX2DrawDone();
		GX2Shutdown_commandBufferPool();
		cemuLog_log(LogType::Force, "GX2 shutdown");
	}
}
void _GX2DriverReset()
{
LatteGPUState.gx2InitCalled = 0;
@ -237,6 +301,7 @@ namespace GX2
void GX2MiscInit()
{
cafeExportRegister("gx2", GX2Init, LogType::GX2);
cafeExportRegister("gx2", GX2Shutdown, LogType::GX2);
cafeExportRegister("gx2", GX2GetMainCoreId, LogType::GX2);
cafeExportRegister("gx2", GX2ResetGPU, LogType::GX2);

View file

@ -135,7 +135,7 @@ void gx2Export_GX2InitDepthBufferRegs(PPCInterpreter_t* hCPU)
void gx2Export_GX2SetColorBuffer(PPCInterpreter_t* hCPU)
{
cemuLog_log(LogType::GX2, "GX2SetColorBuffer(0x{:08x}, {})", hCPU->gpr[3], hCPU->gpr[4]);
GX2ReserveCmdSpace(20);
GX2::GX2ReserveCmdSpace(20);
GX2ColorBuffer* colorBufferBE = (GX2ColorBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
@ -198,15 +198,13 @@ void gx2Export_GX2SetColorBuffer(PPCInterpreter_t* hCPU)
mmCB_COLOR0_INFO - 0xA000 + hCPU->gpr[4],
colorBufferBE->reg_info);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
osLib_returnFromFunction(hCPU, 0);
}
void gx2Export_GX2SetDepthBuffer(PPCInterpreter_t* hCPU)
{
cemuLog_log(LogType::GX2, "GX2SetDepthBuffer(0x{:08x})", hCPU->gpr[3]);
GX2ReserveCmdSpace(20);
GX2::GX2ReserveCmdSpace(20);
GX2DepthBuffer* depthBufferBE = (GX2DepthBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
@ -264,8 +262,6 @@ void gx2Export_GX2SetDepthBuffer(PPCInterpreter_t* hCPU)
gx2WriteGather_submitU32AsBE(mmDB_DEPTH_VIEW - 0xA000);
gx2WriteGather_submitU32AsBE(db_view);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
osLib_returnFromFunction(hCPU, 0);
}
@ -281,7 +277,7 @@ void gx2Export_GX2MarkScanBufferCopied(PPCInterpreter_t* hCPU)
uint32 scanTarget = hCPU->gpr[3];
if( scanTarget == GX2_SCAN_TARGET_TV )
{
GX2ReserveCmdSpace(10);
GX2::GX2ReserveCmdSpace(10);
uint32 physAddr = (MEMORY_TILINGAPERTURE_AREA_ADDR+0x200000);

View file

@ -303,7 +303,27 @@ namespace GX2
void GX2SetVertexShader(GX2VertexShader* vertexShader)
{
GX2ReserveCmdSpace(100);
uint32 numOutputIds = vertexShader->regs.vsOutIdTableSize;
numOutputIds = std::min<uint32>(numOutputIds, 0xA);
uint32 vsSemanticTableSize = vertexShader->regs.semanticTableSize;
uint32 reserveSize = 31;
if (vertexShader->shaderMode == GX2_SHADER_MODE::GEOMETRY_SHADER)
{
reserveSize += 7;
}
else
{
reserveSize += 18;
reserveSize += numOutputIds;
if (vertexShader->usesStreamOut != 0)
reserveSize += 2+12;
}
if (vsSemanticTableSize > 0)
{
reserveSize += 5 + vsSemanticTableSize;
}
GX2ReserveCmdSpace(reserveSize);
MPTR shaderProgramAddr;
uint32 shaderProgramSize;
@ -361,8 +381,6 @@ namespace GX2
cemu_assert_debug(vertexShader->regs.SPI_VS_OUT_CONFIG.value().get_VS_PER_COMPONENT() == false); // not handled on the GPU side
uint32 numOutputIds = vertexShader->regs.vsOutIdTableSize;
numOutputIds = std::min<uint32>(numOutputIds, 0xA);
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numOutputIds));
gx2WriteGather_submitU32AsBE(Latte::REGADDR::SPI_VS_OUT_ID_0-0xA000);
for(uint32 i=0; i<numOutputIds; i++)
@ -392,7 +410,6 @@ namespace GX2
}
}
// update semantic table
uint32 vsSemanticTableSize = vertexShader->regs.semanticTableSize;
if (vsSemanticTableSize > 0)
{
gx2WriteGather_submit(

View file

@ -213,7 +213,6 @@ namespace GX2
void GX2SetViewportReg(GX2ViewportReg* viewportReg)
{
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
GX2ReserveCmdSpace(2 + 6);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 6),

View file

@ -264,7 +264,7 @@ void gx2Surface_GX2CopySurface(GX2Surface* srcSurface, uint32 srcMip, uint32 src
// send copy command to GPU
if( srcHwTileMode > 0 && srcHwTileMode < 16 && dstHwTileMode > 0 && dstHwTileMode < 16 || requestGPURAMCopy )
{
GX2ReserveCmdSpace(1+13*2);
GX2::GX2ReserveCmdSpace(1+13*2);
gx2WriteGather_submit(pm4HeaderType3(IT_HLE_COPY_SURFACE_NEW, 13*2),
// src
@ -540,7 +540,7 @@ void gx2Export_GX2ResolveAAColorBuffer(PPCInterpreter_t* hCPU)
uint32 dstDepth = std::max<uint32>(surfOutDst.depth, 1);
// send copy command to GPU
GX2ReserveCmdSpace(1 + 13 * 2);
GX2::GX2ReserveCmdSpace(1 + 13 * 2);
gx2WriteGather_submit(pm4HeaderType3(IT_HLE_COPY_SURFACE_NEW, 13 * 2),
// src
(uint32)srcSurface->imagePtr,
@ -619,7 +619,7 @@ void gx2Export_GX2ConvertDepthBufferToTextureSurface(PPCInterpreter_t* hCPU)
sint32 srcMip = 0;
uint32 numSlices = std::max<uint32>(_swapEndianU32(depthBuffer->viewNumSlices), 1);
GX2ReserveCmdSpace((1 + 13 * 2) * numSlices);
GX2::GX2ReserveCmdSpace((1 + 13 * 2) * numSlices);
for (uint32 subSliceIndex = 0; subSliceIndex < numSlices; subSliceIndex++)
{
// send copy command to GPU

View file

@ -11,9 +11,14 @@
void gx2Export_GX2SetPixelShader(PPCInterpreter_t* hCPU)
{
cemuLog_log(LogType::GX2, "GX2SetPixelShader(0x{:08x})", hCPU->gpr[3]);
GX2ReserveCmdSpace(100);
GX2PixelShader_t* pixelShader = (GX2PixelShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
uint32 numInputs = _swapEndianU32(pixelShader->regs[4]);
if( numInputs > 0x20 )
numInputs = 0x20;
GX2::GX2ReserveCmdSpace(26 + numInputs);
MPTR shaderProgramAddr;
uint32 shaderProgramSize;
@ -44,9 +49,6 @@ void gx2Export_GX2SetPixelShader(PPCInterpreter_t* hCPU)
_swapEndianU32(pixelShader->regs[2]),
_swapEndianU32(pixelShader->regs[3]));
// setup pixel shader extended inputs control
uint32 numInputs = _swapEndianU32(pixelShader->regs[4]);
if( numInputs > 0x20 )
numInputs = 0x20;
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numInputs));
gx2WriteGather_submitU32AsBE(mmSPI_PS_INPUT_CNTL_0-0xA000);
for(uint32 i=0; i<numInputs; i++)
@ -79,9 +81,17 @@ void gx2Export_GX2SetPixelShader(PPCInterpreter_t* hCPU)
void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU)
{
cemuLog_log(LogType::GX2, "GX2SetGeometryShader(0x{:08x})", hCPU->gpr[3]);
GX2ReserveCmdSpace(100);
GX2GeometryShader_t* geometryShader = (GX2GeometryShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
uint32 numOutputIds = _swapEndianU32(geometryShader->regs[7]);
numOutputIds = std::min<uint32>(numOutputIds, 0xA);
uint32 reserveSize = 38; // 38 fixed parameters
if (numOutputIds != 0)
reserveSize += 2 + numOutputIds;
if( _swapEndianU32(geometryShader->useStreamout) != 0 )
reserveSize += 2 + 12;
GX2::GX2ReserveCmdSpace(reserveSize);
MPTR shaderProgramAddr;
uint32 shaderProgramSize;
@ -128,6 +138,7 @@ void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU)
if( _swapEndianU32(geometryShader->useStreamout) != 0 )
{
// todo - IT_EVENT_WRITE packet here
// stride 0
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
gx2WriteGather_submitU32AsBE(mmVGT_STRMOUT_VTX_STRIDE_0-0xA000);
@ -180,8 +191,6 @@ void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU)
gx2WriteGather_submitU32AsBE(_swapEndianU32(geometryShader->regs[3]));
// GS outputs
uint32 numOutputIds = _swapEndianU32(geometryShader->regs[7]);
numOutputIds = std::min<uint32>(numOutputIds, 0xA);
if( numOutputIds != 0 )
{
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numOutputIds));
@ -254,8 +263,7 @@ void gx2Export_GX2SetComputeShader(PPCInterpreter_t* hCPU)
shaderPtr = computeShader->rBuffer.GetVirtualAddr();
shaderSize = computeShader->rBuffer.GetSize();
}
GX2ReserveCmdSpace(0x11);
GX2::GX2ReserveCmdSpace(0x11);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 6),
mmSQ_PGM_START_ES-0xA000,
@ -272,7 +280,7 @@ void gx2Export_GX2SetComputeShader(PPCInterpreter_t* hCPU)
void _GX2SubmitUniformBlock(uint32 registerBase, uint32 index, MPTR virtualAddress, uint32 size)
{
GX2ReserveCmdSpace(9);
GX2::GX2ReserveCmdSpace(9);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_RESOURCE, 8),
registerBase + index * 7,
memory_virtualToPhysical(virtualAddress),
@ -307,7 +315,7 @@ void gx2Export_GX2SetGeometryUniformBlock(PPCInterpreter_t* hCPU)
void gx2Export_GX2RSetVertexUniformBlock(PPCInterpreter_t* hCPU)
{
GX2ReserveCmdSpace(9);
GX2::GX2ReserveCmdSpace(9);
GX2RBuffer* bufferPtr = (GX2RBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
uint32 index = hCPU->gpr[4];
@ -320,7 +328,7 @@ void gx2Export_GX2RSetVertexUniformBlock(PPCInterpreter_t* hCPU)
void gx2Export_GX2SetShaderModeEx(PPCInterpreter_t* hCPU)
{
GX2ReserveCmdSpace(8+4);
GX2::GX2ReserveCmdSpace(8+4);
uint32 mode = hCPU->gpr[3];
uint32 sqConfig = hCPU->gpr[3] == 0 ? 4 : 0;