Add all the files

This commit is contained in:
Exzap 2022-08-22 22:21:23 +02:00
parent e3db07a16a
commit d60742f52b
1445 changed files with 430238 additions and 0 deletions

View file

@ -0,0 +1,472 @@
#include "Cafe/OS/common/OSCommon.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "Cafe/HW/Espresso/PPCCallback.h"
#include "GX2.h"
#include "Cafe/HW/Latte/Core/Latte.h"
#include "Cafe/OS/libs/coreinit/coreinit_Time.h"
#include "Cafe/CafeSystem.h"
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "GX2_Command.h"
#include "GX2_State.h"
#include "GX2_Memory.h"
#include "GX2_Event.h"
#include "GX2_Shader.h"
#include "GX2_Blit.h"
#include "GX2_Draw.h"
#include "GX2_Query.h"
#include "GX2_Misc.h"
#include "GX2_Surface.h"
#include "GX2_Surface_Copy.h"
#include "GX2_Texture.h"
// TV render mode identifiers; each value is used as an index into tvScanBufferResolutions below
#define GX2_TV_RENDER_NONE 0
#define GX2_TV_RENDER_480 1
#define GX2_TV_RENDER_480_WIDE 2
#define GX2_TV_RENDER_720 3
#define GX2_TV_RENDER_720I 4
#define GX2_TV_RENDER_1080 5
// number of render modes (size of the resolution table)
#define GX2_TV_RENDER_COUNT 6
// TV scan buffer width/height for every GX2_TV_RENDER_* mode, indexed by the mode value
// note: GX2_TV_RENDER_720I has the same 1280x720 entry as GX2_TV_RENDER_720
struct
{
sint32 width;
sint32 height;
}tvScanBufferResolutions[GX2_TV_RENDER_COUNT] = {
0,0, // GX2_TV_RENDER_NONE
640,480, // GX2_TV_RENDER_480
854,480, // GX2_TV_RENDER_480_WIDE
1280,720, // GX2_TV_RENDER_720
1280,720, // GX2_TV_RENDER_720I
1920,1080 // GX2_TV_RENDER_1080
};
// main core cycle counter divided by 20, stored by the most recent GX2SwapScanBuffers call
uint64 lastSwapTime = 0;
// HLE implementation of GX2SwapScanBuffers
// Queues the swap request / scanbuffer swap packets and stalls the caller while more than 5 flips are pending
void gx2Export_GX2SwapScanBuffers(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2SwapScanBuffers()");

	// the Pokken titles get a GX2DrawDone() before the swap is queued
	const uint64 foregroundTitleId = CafeSystem::GetForegroundTitleId();
	const bool isPokken =
		foregroundTitleId == 0x00050000101DF500ull ||
		foregroundTitleId == 0x00050000101C5800ull ||
		foregroundTitleId == 0x00050000101DF400ull;
	if (isPokken)
		GX2::GX2DrawDone();

	GX2ReserveCmdSpace(5+2);

	// remember when the last swap was requested
	lastSwapTime = PPCInterpreter_getMainCoreCycleCounter() / 20ULL;

	// count flip request
	// is this updated via a PM4 MEM_WRITE operation?

	// Orochi Warriors seems to call GX2SwapScanBuffers on arbitrary threads/cores. The PM4 commands should go through to the GPU as long as
	// there is no active display list and no other core is submitting commands simultaneously
	// current workaround: the request counter is only incremented when called on the main GX2 core, which is meant to keep the wait loop below from spinning forever
	const uint32 coreIndex = PPCInterpreter_getCoreIndex(ppcInterpreterCurrentInstance);
	if (GX2::sGX2MainCoreIndex == coreIndex)
	{
		const uint32 flipRequestCount = _swapEndianU32(LatteGPUState.sharedArea->flipRequestCountBE);
		LatteGPUState.sharedArea->flipRequestCountBE = _swapEndianU32(flipRequestCount + 1);
	}

	// request the swap
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_REQUEST_SWAP_BUFFERS, 1));
	gx2WriteGather_submitU32AsBE(0); // reserved

	// swap frames
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_TRIGGER_SCANBUFFER_SWAP, 1));
	gx2WriteGather_submitU32AsBE(0); // reserved

	// wait for flip if the CPU is too far ahead
	// doing it after swap request is how the actual console does it, but that still causes issues in Pokken
	for (;;)
	{
		const uint32 requestedFlips = _swapEndianU32(LatteGPUState.sharedArea->flipRequestCountBE);
		const uint32 executedFlips = _swapEndianU32(LatteGPUState.sharedArea->flipExecuteCountBE);
		if ((sint32)(requestedFlips - executedFlips) <= 5)
			break;
		GX2::GX2WaitForFlip();
	}

	GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
	osLib_returnFromFunction(hCPU, 0);
}
// HLE implementation of GX2CopyColorBufferToScanBuffer
// r3 = guest pointer to the source GX2ColorBuffer, r4 = value forwarded as the last packet word (logged as the second argument)
void gx2Export_GX2CopyColorBufferToScanBuffer(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2CopyColorBufferToScanBuffer(0x%08x,%d)\n", hCPU->gpr[3], hCPU->gpr[4]);
	GX2ReserveCmdSpace(5);

	// todo: proper implementation

	// hack: Avoid running too far ahead of the GPU. Normally this would be guaranteed by the circular buffer model, which we currently don't fully emulate
	const bool cpuTooFarAhead = GX2::GX2WriteGather_getReadWriteDistance() > 32*1024*1024;
	if (cpuTooFarAhead)
	{
		debug_printf("Waiting for GPU to catch up...\n");
		PPCInterpreter_relinquishTimeslice(); // release current thread
		// note: this path leaves without calling osLib_returnFromFunction
		return;
	}

	GX2ColorBuffer* sourceBuffer = (GX2ColorBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
	auto& sourceSurface = sourceBuffer->surface;

	// packet header followed by 9 data words describing the source surface
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER, 9));
	gx2WriteGather_submitU32AsBE(memory_virtualToPhysical(sourceSurface.imagePtr));
	gx2WriteGather_submitU32AsBE((uint32)sourceSurface.width);
	gx2WriteGather_submitU32AsBE((uint32)sourceSurface.height);
	gx2WriteGather_submitU32AsBE((uint32)sourceSurface.pitch);
	gx2WriteGather_submitU32AsBE((uint32)sourceSurface.tileMode.value());
	gx2WriteGather_submitU32AsBE((uint32)sourceSurface.swizzle);
	gx2WriteGather_submitU32AsBE(_swapEndianU32(sourceBuffer->viewFirstSlice));
	gx2WriteGather_submitU32AsBE((uint32)sourceSurface.format.value());
	gx2WriteGather_submitU32AsBE(hCPU->gpr[4]);

	osLib_returnFromFunction(hCPU, 0);
}
// HLE stub for GX2WaitForFreeScanBuffer
// Does not wait for anything: it only logs that the function is unimplemented and returns 0 to the caller
void gx2Export_GX2WaitForFreeScanBuffer(PPCInterpreter_t* hCPU)
{
// todo: proper implementation
debug_printf("GX2WaitForFreeScanBuffer(): Unimplemented\n");
osLib_returnFromFunction(hCPU, 0);
}
// HLE implementation of GX2GetCurrentScanBuffer
// r3 = scan target, r4 = guest pointer to the GX2ColorBuffer that receives the result
// The output is zeroed, given a placeholder 100x100 size and an image pointer inside the tiling aperture area
void gx2Export_GX2GetCurrentScanBuffer(PPCInterpreter_t* hCPU)
{
	// todo: proper implementation
	const uint32 scanTarget = hCPU->gpr[3];
	GX2ColorBuffer* outColorBuffer = (GX2ColorBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[4]);

	memset(outColorBuffer, 0x00, sizeof(GX2ColorBuffer));
	outColorBuffer->surface.width = 100;
	outColorBuffer->surface.height = 100;

	// note: For now we abuse the tiling aperture memory area as framebuffer pointers
	switch (scanTarget)
	{
	case GX2_SCAN_TARGET_TV:
		outColorBuffer->surface.imagePtr = MEMORY_TILINGAPERTURE_AREA_ADDR+0x200000;
		break;
	case GX2_SCAN_TARGET_DRC_FIRST:
		outColorBuffer->surface.imagePtr = MEMORY_TILINGAPERTURE_AREA_ADDR+0x40000;
		break;
	default:
		// any other target keeps the zeroed image pointer
		break;
	}

	osLib_returnFromFunction(hCPU, 0);
}
// HLE implementation of GX2GetSystemTVScanMode; always reports the same fixed scan mode
void coreinitExport_GX2GetSystemTVScanMode(PPCInterpreter_t* hCPU)
{
	// 1080p = 7
	constexpr uint32 scanMode1080p = 7;
	osLib_returnFromFunction(hCPU, scanMode1080p);
}
// HLE implementation of GX2GetSystemTVAspectRatio; always reports the same fixed aspect ratio
void coreinitExport_GX2GetSystemTVAspectRatio(PPCInterpreter_t* hCPU)
{
	constexpr uint32 aspectRatio16by9 = 1; // 16:9
	osLib_returnFromFunction(hCPU, aspectRatio16by9);
}
// HLE implementation of GX2TempGetGPUVersion; always returns the constant 2
void gx2Export_GX2TempGetGPUVersion(PPCInterpreter_t* hCPU)
{
osLib_returnFromFunction(hCPU, 2);
}
// Fills in a GX2ColorBuffer describing a scan buffer with the given size and format
// The surface is a single-level, single-slice 2D surface with linear (general) tiling and no AA
// Size and alignment are filled in by GX2CalcSurfaceSizeAndAlignment
void _GX2InitScanBuffer(GX2ColorBuffer* colorBuffer, sint32 width, sint32 height, Latte::E_GX2SURFFMT format)
{
	auto& surface = colorBuffer->surface;

	// surface layout
	// resFlag is set without the scan buffer usage bit while size/alignment are calculated
	surface.resFlag = GX2_RESFLAG_USAGE_TEXTURE | GX2_RESFLAG_USAGE_COLOR_BUFFER;
	surface.dim = Latte::E_DIM::DIM_2D;
	surface.width = width;
	surface.height = height;
	surface.depth = 1;
	surface.numLevels = 1;
	surface.format = format;
	surface.aa = 0;
	surface.tileMode = Latte::E_GX2TILEMODE::TM_LINEAR_GENERAL;
	surface.swizzle = 0;
	surface.mipPtr = MPTR_NULL;

	// view covers the first (and only) slice and mip
	colorBuffer->viewMip = _swapEndianU32(0);
	colorBuffer->viewFirstSlice = _swapEndianU32(0);
	colorBuffer->viewNumSlices = _swapEndianU32(1);

	GX2::GX2CalcSurfaceSizeAndAlignment(&surface);

	// after the calculation the surface is additionally tagged as scan buffer
	surface.resFlag = GX2_RESFLAG_USAGE_TEXTURE | GX2_RESFLAG_USAGE_COLOR_BUFFER | GX2_RESFLAG_USAGE_SCAN_BUFFER;
}
// HLE implementation of GX2CalcTVSize
// r3 = TV render mode, r4 = surface format, r5 = buffering mode,
// r6 = guest address receiving the total buffer size, r7 = guest address receiving the "scale needed" flag (always written as 0)
void gx2Export_GX2CalcTVSize(PPCInterpreter_t* hCPU)
{
	const uint32 tvRenderMode = hCPU->gpr[3];
	const Latte::E_GX2SURFFMT format = (Latte::E_GX2SURFFMT)hCPU->gpr[4];
	const uint32 bufferingMode = hCPU->gpr[5];
	const uint32 outputSizeMPTR = hCPU->gpr[6];
	const uint32 outputScaleNeededMPTR = hCPU->gpr[7];

	cemu_assert(tvRenderMode < GX2_TV_RENDER_COUNT);
	const auto& resolution = tvScanBufferResolutions[tvRenderMode];

	// let the surface code calculate size and alignment of a single scan buffer
	GX2ColorBuffer scanBuffer;
	memset(&scanBuffer, 0, sizeof(GX2ColorBuffer));
	_GX2InitScanBuffer(&scanBuffer, resolution.width, resolution.height, format);

	const uint32 imageSize = scanBuffer.surface.imageSize;
	const uint32 alignment = scanBuffer.surface.alignment;
	// bytes needed to round imageSize up to the next multiple of alignment
	const uint32 alignmentPaddingSize = (alignment - (imageSize%alignment)) % alignment;

	// 720I doubles the total size (probably for interlaced?)
	const uint32 uknMult = (tvRenderMode == GX2_TV_RENDER_720I) ? 2 : 1;
	// modes below 720 always use a buffer count of 4 instead of the requested buffering mode
	const uint32 adjustedBufferingMode = (tvRenderMode < GX2_TV_RENDER_720) ? 4 : bufferingMode;

	// every buffer is padded to the alignment; the padding is subtracted once from the total
	const uint32 bufferedImageSize = (imageSize + alignmentPaddingSize) * adjustedBufferingMode * uknMult - alignmentPaddingSize;

	memory_writeU32(outputSizeMPTR, bufferedImageSize);
	memory_writeU32(outputScaleNeededMPTR, 0); // todo
	osLib_returnFromFunction(hCPU, 0);
}
// HLE implementation of GX2CalcDRCSize
// Parameters: drcMode, surface format, buffering mode, guest address receiving the total size,
// guest address receiving the "scale needed" flag (always written as 0)
void gx2Export_GX2CalcDRCSize(PPCInterpreter_t* hCPU)
{
	ppcDefineParamS32(drcMode, 0);
	ppcDefineParamU32(format, 1);
	ppcDefineParamU32(bufferingMode, 2);
	ppcDefineParamMPTR(sizeMPTR, 3);
	ppcDefineParamMPTR(scaleNeededMPTR, 4);

	// any mode above zero uses 854x480, otherwise the size stays 0x0
	const bool hasResolution = drcMode > 0;
	const uint32 width = hasResolution ? 854 : 0;
	const uint32 height = hasResolution ? 480 : 0;

	// let the surface code calculate size and alignment of a single scan buffer
	GX2ColorBuffer scanBuffer;
	memset(&scanBuffer, 0, sizeof(scanBuffer));
	_GX2InitScanBuffer(&scanBuffer, width, height, (Latte::E_GX2SURFFMT)format);

	const uint32 imageSize = scanBuffer.surface.imageSize;
	const uint32 alignment = scanBuffer.surface.alignment;
	// bytes needed to round imageSize up to the next multiple of alignment
	const uint32 alignmentPaddingSize = (alignment - (imageSize%alignment)) % alignment;

	// every buffer is padded to the alignment; the padding is subtracted once from the total
	const uint32 bufferedImageSize = (imageSize + alignmentPaddingSize) * bufferingMode - alignmentPaddingSize;

	memory_writeU32(sizeMPTR, bufferedImageSize);
	memory_writeU32(scaleNeededMPTR, 0);
	osLib_returnFromFunction(hCPU, 0);
}
// HLE stub for GX2SetDRCScale
// Only logs the two arguments (r3, r4) and returns 0; no state is stored
void gx2Export_GX2SetDRCScale(PPCInterpreter_t* hCPU)
{
gx2Log_printf("GX2SetDRCScale(%d,%d)", hCPU->gpr[3], hCPU->gpr[4]);
osLib_returnFromFunction(hCPU, 0);
}
// HLE implementation of GX2SetDRCConnectCallback
// The callback is not stored; if it is non-null it is invoked right away with (channel, TRUE)
void gx2Export_GX2SetDRCConnectCallback(PPCInterpreter_t* hCPU)
{
	ppcDefineParamS32(channel, 0);
	ppcDefineParamMEMPTR(callback, void, 1);
	gx2Log_printf("GX2SetDRCConnectCallback(%d, 0x%08x)", channel, callback.GetMPTR());

	const bool hasCallback = callback.GetPtr() != nullptr;
	if (hasCallback)
		PPCCoreCallback(callback, channel, TRUE);

	osLib_returnFromFunction(hCPU, 0);
}
// HLE implementation of GX2SetSemaphore
// Parameters: guest address of the semaphore, mode (0 = wait, 1 = signal)
// Emits an IT_MEM_SEMAPHORE packet; any other mode asserts in debug builds and returns without emitting a packet
void gx2Export_GX2SetSemaphore(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2SetSemaphore(0x%08x,%d)", hCPU->gpr[3], hCPU->gpr[4]);
	ppcDefineParamMPTR(semaphoreMPTR, 0);
	ppcDefineParamS32(mode, 1);

	// translate the mode into the SEM_SEL field value
	uint32 semaphoreSelect;
	switch (mode)
	{
	case 0: // wait
		semaphoreSelect = 7;
		break;
	case 1: // signal
		semaphoreSelect = 6;
		break;
	default:
		cemu_assert_debug(false);
		osLib_returnFromFunction(hCPU, 0);
		return;
	}

	// SEM_SEL occupies the top three bits; 0x1000 = WAIT_ON_SIGNAL
	const uint32 semaphoreControl = (semaphoreSelect << 29) | 0x1000;

	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_MEM_SEMAPHORE, 2));
	gx2WriteGather_submitU32AsBE(memory_virtualToPhysical(semaphoreMPTR)); // semaphore physical address
	gx2WriteGather_submitU32AsBE(semaphoreControl); // control

	osLib_returnFromFunction(hCPU, 0);
}
// HLE implementation of GX2Flush
// Logs the call, hands the pending commands to _GX2SubmitToTCL() and returns 0
void gx2Export_GX2Flush(PPCInterpreter_t* hCPU)
{
gx2Log_printf("GX2Flush()");
_GX2SubmitToTCL();
osLib_returnFromFunction(hCPU, 0);
}
// per-core write gather pointer at the time of the last _GX2SubmitToTCL() (NULL until the first submit)
uint8* _GX2LastFlushPtr[PPC_CORE_COUNT] = {NULL};
// last value handed out by Latte_GetTime()
uint64 _prevReturnedGPUTime = 0;

// Returns a GPU timestamp derived from the coreinit timer tick (tick * 20000)
// Each returned value is one past the previous one when the timer-based value has not advanced
uint64 Latte_GetTime()
{
	const uint64 tickBasedTime = coreinit::coreinit_getTimerTick() * 20000ULL;
	// avoid ever returning identical timestamps
	const uint64 gpuTime = (tickBasedTime <= _prevReturnedGPUTime) ? (_prevReturnedGPUTime + 1) : tickBasedTime;
	_prevReturnedGPUTime = gpuTime;
	return gpuTime;
}
void _GX2SubmitToTCL()
{
uint32 coreIndex = PPCInterpreter_getCoreIndex(ppcInterpreterCurrentInstance);
// do nothing if called from non-main GX2 core
if (GX2::sGX2MainCoreIndex != coreIndex)
{
forceLogDebug_printf("_GX2SubmitToTCL() called on non-main GX2 core");
return;
}
if( gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL )
return; // quit if in display list
_GX2LastFlushPtr[coreIndex] = (gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex]);
// update last submitted CB timestamp
uint64 commandBufferTimestamp = Latte_GetTime();
LatteGPUState.lastSubmittedCommandBufferTimestamp.store(commandBufferTimestamp);
gx2Log_printf("Submitting GX2 command buffer with timestamp %016I64x", commandBufferTimestamp);
// submit HLE packet to write retirement timestamp
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_SET_CB_RETIREMENT_TIMESTAMP, 2));
gx2WriteGather_submitU32AsBE((uint32)(commandBufferTimestamp>>32ULL));
gx2WriteGather_submitU32AsBE((uint32)(commandBufferTimestamp&0xFFFFFFFFULL));
}
// Returns how many bytes the given core has written to the command ring buffer since its last flush
uint32 _GX2GetUnflushedBytes(uint32 coreIndex)
{
	const auto writePtr = gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex];
	const auto lastFlushPtr = _GX2LastFlushPtr[coreIndex];

	// never flushed so far: everything since the start of the ring buffer is pending
	if (lastFlushPtr == NULL)
		return (uint32)(writePtr - gx2WriteGatherPipe.gxRingBuffer);

	// write pointer is behind the last flush, i.e. it wrapped around
	// this isn't 100% correct since we ignore the bytes between the last flush address and the start of the wrap around
	if (lastFlushPtr > writePtr)
		return (uint32)(writePtr - gx2WriteGatherPipe.gxRingBuffer + 4);

	// regular case: distance between last flush and the current write position
	return (uint32)(writePtr - lastFlushPtr);
}
/*
 * Intended to guarantee that the requested amount of space is available on the current command buffer
 * If the space is not available, the current command buffer is pushed to the GPU and a new one is allocated
 *
 * Current behavior: reservedFreeSpaceInU32 is not consulted. Outside of display lists the pending
 * commands are submitted whenever at least 0x1000 bytes are unflushed
 */
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32)
{
	const uint32 coreIndex = PPCInterpreter_getCoreIndex(ppcInterpreterCurrentInstance);

	// if we are in a display list then do nothing
	const bool insideDisplayList = (gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL);
	if (insideDisplayList)
		return;

	if (_GX2GetUnflushedBytes(coreIndex) >= 0x1000)
		_GX2SubmitToTCL();
}
void gx2_load()
{
osLib_addFunction("gx2", "GX2GetContextStateDisplayList", gx2Export_GX2GetContextStateDisplayList);
// swap, vsync & timing
osLib_addFunction("gx2", "GX2SwapScanBuffers", gx2Export_GX2SwapScanBuffers);
osLib_addFunction("gx2", "GX2GetSwapStatus", gx2Export_GX2GetSwapStatus);
osLib_addFunction("gx2", "GX2CopyColorBufferToScanBuffer", gx2Export_GX2CopyColorBufferToScanBuffer);
osLib_addFunction("gx2", "GX2WaitForFreeScanBuffer", gx2Export_GX2WaitForFreeScanBuffer);
osLib_addFunction("gx2", "GX2GetCurrentScanBuffer", gx2Export_GX2GetCurrentScanBuffer);
// shader stuff
osLib_addFunction("gx2", "GX2GetVertexShaderGPRs", gx2Export_GX2GetVertexShaderGPRs);
osLib_addFunction("gx2", "GX2GetVertexShaderStackEntries", gx2Export_GX2GetVertexShaderStackEntries);
osLib_addFunction("gx2", "GX2GetPixelShaderGPRs", gx2Export_GX2GetPixelShaderGPRs);
osLib_addFunction("gx2", "GX2GetPixelShaderStackEntries", gx2Export_GX2GetPixelShaderStackEntries);
osLib_addFunction("gx2", "GX2SetFetchShader", gx2Export_GX2SetFetchShader);
osLib_addFunction("gx2", "GX2SetVertexShader", gx2Export_GX2SetVertexShader);
osLib_addFunction("gx2", "GX2SetPixelShader", gx2Export_GX2SetPixelShader);
osLib_addFunction("gx2", "GX2SetGeometryShader", gx2Export_GX2SetGeometryShader);
osLib_addFunction("gx2", "GX2SetComputeShader", gx2Export_GX2SetComputeShader);
osLib_addFunction("gx2", "GX2SetVertexUniformReg", gx2Export_GX2SetVertexUniformReg);
osLib_addFunction("gx2", "GX2SetVertexUniformBlock", gx2Export_GX2SetVertexUniformBlock);
osLib_addFunction("gx2", "GX2RSetVertexUniformBlock", gx2Export_GX2RSetVertexUniformBlock);
osLib_addFunction("gx2", "GX2SetPixelUniformBlock", gx2Export_GX2SetPixelUniformBlock);
osLib_addFunction("gx2", "GX2SetPixelUniformReg", gx2Export_GX2SetPixelUniformReg);
osLib_addFunction("gx2", "GX2SetGeometryUniformBlock", gx2Export_GX2SetGeometryUniformBlock);
osLib_addFunction("gx2", "GX2SetShaderModeEx", gx2Export_GX2SetShaderModeEx);
osLib_addFunction("gx2", "GX2CalcGeometryShaderInputRingBufferSize", gx2Export_GX2CalcGeometryShaderInputRingBufferSize);
osLib_addFunction("gx2", "GX2CalcGeometryShaderOutputRingBufferSize", gx2Export_GX2CalcGeometryShaderOutputRingBufferSize);
// color/depth buffers
osLib_addFunction("gx2", "GX2InitColorBufferRegs", gx2Export_GX2InitColorBufferRegs);
osLib_addFunction("gx2", "GX2InitDepthBufferRegs", gx2Export_GX2InitDepthBufferRegs);
osLib_addFunction("gx2", "GX2SetColorBuffer", gx2Export_GX2SetColorBuffer);
osLib_addFunction("gx2", "GX2SetDepthBuffer", gx2Export_GX2SetDepthBuffer);
osLib_addFunction("gx2", "GX2SetDRCBuffer", gx2Export_GX2SetDRCBuffer);
osLib_addFunction("gx2", "GX2MarkScanBufferCopied", gx2Export_GX2MarkScanBufferCopied);
// misc
osLib_addFunction("gx2", "GX2TempGetGPUVersion", gx2Export_GX2TempGetGPUVersion);
osLib_addFunction("gx2", "GX2CalcTVSize", gx2Export_GX2CalcTVSize);
osLib_addFunction("gx2", "GX2CalcDRCSize", gx2Export_GX2CalcDRCSize);
osLib_addFunction("gx2", "GX2SetDRCScale", gx2Export_GX2SetDRCScale);
osLib_addFunction("gx2", "GX2SetDRCConnectCallback", gx2Export_GX2SetDRCConnectCallback);
osLib_addFunction("gx2", "GX2GetSystemTVScanMode", coreinitExport_GX2GetSystemTVScanMode);
osLib_addFunction("gx2", "GX2GetSystemTVAspectRatio", coreinitExport_GX2GetSystemTVAspectRatio);
osLib_addFunction("gx2", "GX2SetSwapInterval", gx2Export_GX2SetSwapInterval);
osLib_addFunction("gx2", "GX2GetSwapInterval", gx2Export_GX2GetSwapInterval);
osLib_addFunction("gx2", "GX2GetGPUTimeout", gx2Export_GX2GetGPUTimeout);
osLib_addFunction("gx2", "GX2SampleTopGPUCycle", gx2Export_GX2SampleTopGPUCycle);
osLib_addFunction("gx2", "GX2SampleBottomGPUCycle", gx2Export_GX2SampleBottomGPUCycle);
osLib_addFunction("gx2", "GX2AllocateTilingApertureEx", gx2Export_GX2AllocateTilingApertureEx);
osLib_addFunction("gx2", "GX2FreeTilingAperture", gx2Export_GX2FreeTilingAperture);
// context state
osLib_addFunction("gx2", "GX2SetDefaultState", gx2Export_GX2SetDefaultState);
osLib_addFunction("gx2", "GX2SetupContextStateEx", gx2Export_GX2SetupContextStateEx);
osLib_addFunction("gx2", "GX2SetContextState", gx2Export_GX2SetContextState);
// semaphore
osLib_addFunction("gx2", "GX2SetSemaphore", gx2Export_GX2SetSemaphore);
// command buffer
osLib_addFunction("gx2", "GX2Flush", gx2Export_GX2Flush);
GX2::GX2Init_writeGather();
GX2::GX2MemInit();
GX2::GX2ResourceInit();
GX2::GX2CommandInit();
GX2::GX2SurfaceInit();
GX2::GX2SurfaceCopyInit();
GX2::GX2TextureInit();
GX2::GX2StateInit();
GX2::GX2ShaderInit();
GX2::GX2EventInit();
GX2::GX2BlitInit();
GX2::GX2DrawInit();
GX2::GX2StreamoutInit();
GX2::GX2QueryInit();
GX2::GX2MiscInit();
}

View file

@ -0,0 +1,89 @@
#pragma once
#include "Cafe/HW/Latte/Core/LatteConst.h"
// base defines for GX2
#define GX2_TRUE 1
#define GX2_FALSE 0
#define GX2_ENABLE 1
#define GX2_DISABLE 0
// tex unit base for render backends
#define CEMU_PS_TEX_UNIT_BASE 0
#define CEMU_VS_TEX_UNIT_BASE 32
#define CEMU_GS_TEX_UNIT_BASE 64
#include "GX2_Surface.h"
// general
void gx2_load();
// shader
void gx2Export_GX2SetFetchShader(PPCInterpreter_t* hCPU);
void gx2Export_GX2GetVertexShaderGPRs(PPCInterpreter_t* hCPU);
void gx2Export_GX2GetVertexShaderStackEntries(PPCInterpreter_t* hCPU);
void gx2Export_GX2GetPixelShaderGPRs(PPCInterpreter_t* hCPU);
void gx2Export_GX2GetPixelShaderStackEntries(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetVertexShader(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetPixelShader(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetComputeShader(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetVertexUniformReg(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetVertexUniformBlock(PPCInterpreter_t* hCPU);
void gx2Export_GX2RSetVertexUniformBlock(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetPixelUniformBlock(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetPixelUniformReg(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetGeometryUniformBlock(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetShaderModeEx(PPCInterpreter_t* hCPU);
void gx2Export_GX2CalcGeometryShaderInputRingBufferSize(PPCInterpreter_t* hCPU);
void gx2Export_GX2CalcGeometryShaderOutputRingBufferSize(PPCInterpreter_t* hCPU);
// write gather / command queue
#define GX2_COMMAND_RING_BUFFER_SIZE (64*1024*1024) // 64MB
void gx2Export_GX2GetContextStateDisplayList(PPCInterpreter_t* hCPU);
#include "GX2_Command.h"
// misc
void gx2Export_GX2AllocateTilingApertureEx(PPCInterpreter_t* hCPU);
void gx2Export_GX2FreeTilingAperture(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetSwapInterval(PPCInterpreter_t* hCPU);
void gx2Export_GX2GetSwapInterval(PPCInterpreter_t* hCPU);
void gx2Export_GX2GetSwapStatus(PPCInterpreter_t* hCPU);
void gx2Export_GX2GetGPUTimeout(PPCInterpreter_t* hCPU);
void gx2Export_GX2SampleTopGPUCycle(PPCInterpreter_t* hCPU);
void gx2Export_GX2SampleBottomGPUCycle(PPCInterpreter_t* hCPU);
// color/depth buffers
// scan target identifiers; the values are distinct powers of two
#define GX2_SCAN_TARGET_TV 1
// NOTE(review): the name looks truncated (presumably "RIGHT"); left unchanged since other files may reference it
#define GX2_SCAN_TARGET_TV_RIGH 2
#define GX2_SCAN_TARGET_DRC_FIRST 4
#define GX2_SCAN_TARGET_DRC_SECOND 8
void gx2Export_GX2InitColorBufferRegs(PPCInterpreter_t* hCPU);
void gx2Export_GX2InitDepthBufferRegs(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetColorBuffer(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetDepthBuffer(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetDRCBuffer(PPCInterpreter_t* hCPU);
void gx2Export_GX2MarkScanBufferCopied(PPCInterpreter_t* hCPU);
// special state
#define GX2_SPECIAL_STATE_COUNT 9
// context state
void gx2Export_GX2SetDefaultState(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetupContextStateEx(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetContextState(PPCInterpreter_t* hCPU);
// command buffer
// returns the number of bytes the given core has written to the command ring buffer since its last flush
uint32 _GX2GetUnflushedBytes(uint32 coreIndex);
// marks the pending commands of the calling core as flushed and queues the retirement timestamp packet (no-op on non-main GX2 cores and inside display lists)
void _GX2SubmitToTCL();
// submits pending commands once enough unflushed data has accumulated; the parameter is currently not used by the implementation
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32);

View file

@ -0,0 +1,422 @@
#include "Cafe/OS/libs/coreinit/coreinit_DynLoad.h"
#include "Cafe/HW/Espresso/PPCCallback.h"
#include "Cafe/HW/Espresso/PPCState.h"
#include "Cafe/OS/common/OSCommon.h"
#include "Cafe/HW/Latte/ISA/LatteReg.h"
#include "Cafe/HW/Latte/Core/Latte.h"
#include "Cafe/HW/Latte/LatteAddrLib/LatteAddrLib.h"
#include "util/highresolutiontimer/HighResolutionTimer.h"
namespace GX2
{
// Input structure handed to the AddrCreate export of tcl_addr_test.rpl
// All fields are big-endian; the offset of each field is noted next to it
// Fields named ukn* have no known meaning yet
struct AddrCreate_INPUT
{
/* +0x00 */ uint32be structSize;
/* +0x04 */ uint32be ukn04_maybeGen;
/* +0x08 */ uint32be ukn08;
/* +0x0C */ uint32be revision;
/* +0x10 */ uint32be func_Alloc;
/* +0x14 */ uint32be func_Free;
/* +0x18 */ uint32be func_Debug;
/* +0x1C */ uint32be ukn1C;
/* +0x20 */ uint32be reg263C;
/* +0x24 */ uint32be ukn24;
/* +0x28 */ uint32be ukn28;
/* +0x2C */ uint32be ukn2C;
/* +0x30 */ uint32be ukn30;
/* +0x34 */ uint32be ukn34;
/* +0x38 */ uint32be ukn38;
/* +0x3C */ uint32be ukn3C;
/* +0x40 */ uint32be ukn40;
};
// Output structure filled by AddrCreate; addrLibPtr is the handle later passed to AddrComputeSurfaceInfo
struct AddrCreate_OUTPUT
{
uint32be structSize;
MEMPTR<void> addrLibPtr;
};
// the layouts must match what the guest library expects
static_assert(sizeof(AddrCreate_INPUT) == 0x44);
static_assert(sizeof(AddrCreate_OUTPUT) == 8);
// Parameter block received by the allocation callback (_cb_alloc); only the size field is read there
struct ADDRAllocParam
{
uint32be ukn00; // alignment?
uint32be ukn04;
uint32be size;
};
// Input structure for the AddrComputeSurfaceInfo export of tcl_addr_test.rpl (big-endian fields, 0x3C bytes)
struct ADDRComputeSurfaceInfo_INPUT
{
uint32be structSize;
betype<Latte::E_HWTILEMODE> tileMode;
betype<Latte::E_HWSURFFMT> format;
uint32be bpp;
uint32be numSamples;
uint32be width;
uint32be height;
uint32be numSlices;
uint32be slice;
uint32be mipLevel;
// bit field, modified through the SetFlag* helpers below
uint32be _flags;
uint32be numFrags;
MEMPTR<void> tileInfo;
uint32be tileType;
uint32be tileIndex;
// bit masks within _flags
enum FLAG_BITS
{
FLAG_BIT_CUBE = (1 << 27),
FLAG_BIT_VOLUME = (1 << 26),
FLAG_BIT_OPT4SPACE = (1 << 19),
};
// sets or clears FLAG_BIT_CUBE
void SetFlagCube(bool f)
{
if (f) _flags |= FLAG_BIT_CUBE;
else _flags &= ~FLAG_BIT_CUBE;
}
// sets or clears FLAG_BIT_VOLUME
void SetFlagVolume(bool f)
{
if (f) _flags |= FLAG_BIT_VOLUME;
else _flags &= ~FLAG_BIT_VOLUME;
}
// sets or clears FLAG_BIT_OPT4SPACE
void SetFlagOpt4Space(bool f)
{
if (f) _flags |= FLAG_BIT_OPT4SPACE;
else _flags &= ~FLAG_BIT_OPT4SPACE;
}
};
static_assert(sizeof(ADDRComputeSurfaceInfo_INPUT) == 0x3C);
// Output structure of AddrComputeSurfaceInfo (big-endian fields, 0x60 bytes); offsets are noted per field
// The test compares pitchAlign, tileMode, baseAlign, surfSize, depthAlign, pitch and sliceSize against our own implementation
struct ADDRComputeSurfaceInfo_OUTPUT
{
/* 0x00 */ uint32be structSize;
/* 0x04 */ uint32be pitch;
/* 0x08 */ uint32be height;
/* 0x0C */ uint32be depth;
/* 0x10 */ uint64be surfSize;
/* 0x18 */ uint32be tileMode;
/* 0x1C */ uint32be baseAlign;
/* 0x20 */ uint32be pitchAlign;
/* 0x24 */ uint32be heightAlign;
/* 0x28 */ uint32be depthAlign;
/* 0x2C */ uint32be bpp;
/* 0x30 */ uint32be pixelPitch;
/* 0x34 */ uint32be pixelHeight;
/* 0x38 */ uint32be pixelBits;
/* 0x3C */ uint32be sliceSize;
/* 0x40 */ uint32be pitchTileMax;
/* 0x44 */ uint32be heightTileMax;
/* 0x48 */ uint32be sliceTileMax;
/* 0x4C */ MEMPTR<void> tileInfo;
/* 0x50 */ uint32be tileType;
/* 0x54 */ uint32be tileIndex;
/* 0x58 */ MEMPTR<void> stereoInfo;
/* 0x5C */ uint32be _padding;
};
static_assert(sizeof(ADDRComputeSurfaceInfo_OUTPUT) == 0x60);
// Allocation callback handed to AddrCreate
// Allocates param->size bytes from the system area (second argument 0x10) and returns the result to the guest caller
static void _cb_alloc(PPCInterpreter_t* hCPU)
{
	ppcDefineParamStructPtr(param, ADDRAllocParam, 0);
	const uint32 allocatedAddr = coreinit_allocFromSysArea(param->size, 0x10);
	osLib_returnFromFunction(hCPU, allocatedAddr);
}
// Free callback handed to AddrCreate; not implemented, asserts if the library ever calls it
static void _cb_free(PPCInterpreter_t* hCPU)
{
cemu_assert_unimplemented();
}
// Debug callback handed to AddrCreate; not implemented, asserts if the library ever calls it
static void _cb_debug(PPCInterpreter_t* hCPU)
{
cemu_assert_unimplemented();
}
static void* sAddrLib{};
static uint32be tclFunc_AddrCreate = 0;
static uint32be tclFunc_AddrComputeSurfaceInfo = 0;
// Loads tcl_addr_test.rpl, resolves the AddrCreate / AddrComputeSurfaceInfo exports and creates the
// AddrLib instance (stored in sAddrLib) that the comparison test uses as reference implementation
void _TestAddrLib_Init()
{
	// load tcl_addr_test.rpl (from /cafelibs/)
	uint32be tclHandle;
	uint32 r = coreinit::OSDynLoad_Acquire("tcl_addr_test.rpl", &tclHandle);
	cemu_assert_debug(r == 0);
	// get imports
	r = coreinit::OSDynLoad_FindExport(tclHandle, 0, "AddrCreate", &tclFunc_AddrCreate);
	cemu_assert_debug(r == 0);
	r = coreinit::OSDynLoad_FindExport(tclHandle, 0, "AddrComputeSurfaceInfo", &tclFunc_AddrComputeSurfaceInfo);
	cemu_assert_debug(r == 0);
	// call AddrCreate
	// note: sizes are taken from the guest structs themselves. sizeof() of the StackAllocator object
	// is the size of the host-side wrapper, not of the structure the library reads
	StackAllocator<AddrCreate_INPUT> addrCreateIn;
	memset(addrCreateIn.GetPointer(), 0, sizeof(AddrCreate_INPUT));
	addrCreateIn->structSize = sizeof(AddrCreate_INPUT);
	addrCreateIn->ukn04_maybeGen = 6; // R600?
	addrCreateIn->ukn08 = 0x51;
	addrCreateIn->revision = 71;
	addrCreateIn->reg263C = 0x44902;
	addrCreateIn->ukn24 = 0; // ukn
	// callbacks the library uses for memory management and debug output
	addrCreateIn->func_Alloc = PPCInterpreter_makeCallableExportDepr(_cb_alloc);
	addrCreateIn->func_Free = PPCInterpreter_makeCallableExportDepr(_cb_free);
	addrCreateIn->func_Debug = PPCInterpreter_makeCallableExportDepr(_cb_debug);

	StackAllocator<AddrCreate_OUTPUT> addrCreateOut;
	memset(addrCreateOut.GetPointer(), 0, sizeof(AddrCreate_OUTPUT));
	addrCreateOut->structSize = sizeof(AddrCreate_OUTPUT);

	r = PPCCoreCallback((uint32)tclFunc_AddrCreate, addrCreateIn.GetPointer(), addrCreateOut.GetPointer());
	// keep the created instance for later AddrComputeSurfaceInfo calls
	sAddrLib = addrCreateOut->addrLibPtr;
	cemu_assert_debug(r == 0 && sAddrLib != nullptr);
}
// Produces the reference surface info for the given surface parameters and mip level and writes it to paramOut
// - TM_LINEAR_SPECIAL is calculated manually right here, without calling into the guest library
// - every other tile mode is forwarded to the AddrComputeSurfaceInfo export of the loaded tcl library
// paramOut is fully cleared before it is filled
void _TestAddrLib_CalculateSurfaceInfo(Latte::E_GX2SURFFMT surfaceFormat, uint32 surfaceWidth, uint32 surfaceHeight, uint32 surfaceDepth, Latte::E_DIM surfaceDim, Latte::E_GX2TILEMODE surfaceTileMode, uint32 surfaceAA, uint32 level, ADDRComputeSurfaceInfo_OUTPUT* paramOut)
{
StackAllocator<ADDRComputeSurfaceInfo_INPUT> _paramIn;
ADDRComputeSurfaceInfo_INPUT& paramIn = *_paramIn.GetPointer();
memset(&paramIn, 0, sizeof(ADDRComputeSurfaceInfo_INPUT));
memset(paramOut, 0, sizeof(ADDRComputeSurfaceInfo_OUTPUT));
Latte::E_HWSURFFMT hwFormat = GetHWFormat(surfaceFormat);
if (surfaceTileMode == Latte::E_GX2TILEMODE::TM_LINEAR_SPECIAL)
{
// manual path: tightly packed layout with all alignments set to 1
uint32 numSamples = 1 << surfaceAA;
// compressed formats are handled in units of 4x4 blocks
uint32 blockSize = IsCompressedFormat(surfaceFormat) ? 4 : 1;
// mip width rounded up to a multiple of the block size
uint32 width = ((surfaceWidth >> level) + blockSize - 1) & ~(blockSize - 1);
paramOut->bpp = GetFormatBits(hwFormat);
paramOut->structSize = sizeof(ADDRComputeSurfaceInfo_OUTPUT);
paramOut->pitch = width / blockSize;
paramOut->pixelBits = paramOut->bpp;
paramOut->baseAlign = 1;
paramOut->pitchAlign = 1;
paramOut->heightAlign = 1;
paramOut->depthAlign = 1;
// height/depth of this mip level depend on the surface dimension
switch (surfaceDim)
{
case Latte::E_DIM::DIM_1D:
paramOut->height = 1;
paramOut->depth = 1;
break;
case Latte::E_DIM::DIM_2D:
paramOut->height = std::max<uint32>(surfaceHeight >> level, 1);
paramOut->depth = 1;
break;
case Latte::E_DIM::DIM_3D:
paramOut->height = surfaceHeight >> level;
paramOut->height = std::max<uint32>(paramOut->height, 1);
paramOut->depth = std::max<uint32>(surfaceDepth >> level, 1);
break;
case Latte::E_DIM::DIM_CUBEMAP:
// cubemaps have at least 6 slices
paramOut->height = std::max<uint32>(surfaceHeight >> level, 1);
paramOut->depth = std::max<uint32>(surfaceDepth, 6);
break;
case Latte::E_DIM::DIM_1D_ARRAY:
paramOut->height = 1;
paramOut->depth = surfaceDepth;
break;
case Latte::E_DIM::DIM_2D_ARRAY:
paramOut->height = std::max<uint32>(surfaceHeight >> level, 1);
paramOut->depth = surfaceDepth;
break;
default:
// other dimensions (e.g. the MSAA ones) leave height/depth at zero here
break;
}
// convert the height to block units (rounded up)
paramOut->height = ((paramOut->height + blockSize - 1) & ~(blockSize - 1)) / (uint64)blockSize;
// pixelPitch/pixelHeight are the block-aligned mip size in pixels, at least one block
paramOut->pixelPitch = ((surfaceWidth >> level) + blockSize - 1) & ~(blockSize - 1);
paramOut->pixelPitch = std::max<uint32>(paramOut->pixelPitch, blockSize);
paramOut->pixelHeight = ((surfaceHeight >> level) + blockSize - 1) & ~(blockSize - 1);
paramOut->pixelHeight = std::max<uint32>(paramOut->pixelHeight, blockSize);;
paramOut->pitch = std::max<uint32>(paramOut->pitch, 1);
paramOut->height = std::max<uint32>(paramOut->height, 1);
// total size in bytes (bits per element * samples * depth * height * pitch, divided by 8)
paramOut->surfSize = paramOut->bpp * numSamples * paramOut->depth * paramOut->height * paramOut->pitch >> 3;
if (surfaceDim == Latte::E_DIM::DIM_3D)
paramOut->sliceSize = (uint32)(paramOut->surfSize);
else
{
if (paramOut->surfSize == 0 && paramOut->depth == 0)
paramOut->sliceSize = 0; // edge case for (1D)_ARRAY textures with 0/0/0 res
else
paramOut->sliceSize = ((uint32)paramOut->surfSize.value() / paramOut->depth);
}
paramOut->pitchTileMax = (paramOut->pitch >> 3) - 1;
paramOut->heightTileMax = (paramOut->height >> 3) - 1;
paramOut->sliceTileMax = (paramOut->height * paramOut->pitch >> 6) - 1;
}
else
{
// library path: describe the mip level in paramIn and let AddrComputeSurfaceInfo fill paramOut
paramIn.structSize = sizeof(paramIn);
paramIn.tileMode = Latte::MakeHWTileMode(surfaceTileMode);
paramIn.format = hwFormat;
paramIn.bpp = GetFormatBits(hwFormat);
paramIn.numSamples = 1 << surfaceAA;
paramIn.numFrags = paramIn.numSamples;
paramIn.width = std::max<uint32>(surfaceWidth >> level, 1);
// height and slice count of this mip level depend on the surface dimension
switch (surfaceDim)
{
case Latte::E_DIM::DIM_1D:
paramIn.height = 1;
paramIn.numSlices = 1;
break;
case Latte::E_DIM::DIM_2D:
paramIn.height = std::max<uint32>(surfaceHeight >> level, 1);
paramIn.numSlices = 1;
break;
case Latte::E_DIM::DIM_3D:
paramIn.height = std::max<uint32>(surfaceHeight >> level, 1);
paramIn.numSlices = std::max<uint32>(surfaceDepth >> level, 1);
break;
case Latte::E_DIM::DIM_CUBEMAP:
// cubemaps have at least 6 slices and set the cube flag
paramIn.height = std::max<uint32>(surfaceHeight >> level, 1);
paramIn.numSlices = std::max<uint32>(surfaceDepth, 6);
paramIn.SetFlagCube(true);
break;
case Latte::E_DIM::DIM_1D_ARRAY:
paramIn.height = 1;
paramIn.numSlices = surfaceDepth;
break;
case Latte::E_DIM::DIM_2D_ARRAY:
paramIn.height = std::max<uint32>(surfaceHeight >> level, 1);
paramIn.numSlices = surfaceDepth;
break;
case Latte::E_DIM::DIM_2D_MSAA:
paramIn.height = std::max<uint32>(surfaceHeight >> level, 1);
paramIn.numSlices = 1;
break;
case Latte::E_DIM::DIM_2D_ARRAY_MSAA:
paramIn.height = std::max<uint32>(surfaceHeight >> level, 1);
paramIn.numSlices = surfaceDepth;
break;
default:
break;
}
paramIn.slice = 0;
paramIn.mipLevel = level;
if (surfaceDim == Latte::E_DIM::DIM_3D)
paramIn.SetFlagVolume(true);
// the opt4space flag is only set for the base level
paramIn.SetFlagOpt4Space(level == 0);
paramOut->structSize = sizeof(ADDRComputeSurfaceInfo_OUTPUT);
// call AddrComputeSurfaceInfo(addrLib, in, out) inside the guest library
PPCCoreCallback((uint32)tclFunc_AddrComputeSurfaceInfo, sAddrLib, _paramIn.GetPointer(), paramOut);
}
}
// Calculates the surface info for one parameter combination with both the reference path
// (_TestAddrLib_CalculateSurfaceInfo) and our own LatteAddrLib implementation and asserts that they agree
void _TestAddrLib_Compare(uint32 surfaceWidth, uint32 surfaceHeight, uint32 surfaceDepth, Latte::E_DIM surfaceDim, Latte::E_GX2SURFFMT surfaceFormat, Latte::E_GX2TILEMODE surfaceTileMode, uint32 surfaceAA, uint32 level)
{
	// reference result (tcl.rpl)
	StackAllocator<ADDRComputeSurfaceInfo_OUTPUT> referenceStorage;
	ADDRComputeSurfaceInfo_OUTPUT* reference = referenceStorage.GetPointer();
	_TestAddrLib_CalculateSurfaceInfo(surfaceFormat, surfaceWidth, surfaceHeight, surfaceDepth, surfaceDim, surfaceTileMode, surfaceAA, level, reference);

	// result of our implementation
	LatteAddrLib::AddrSurfaceInfo_OUT ours;
	LatteAddrLib::GX2CalculateSurfaceInfo(surfaceFormat, surfaceWidth, surfaceHeight, surfaceDepth, surfaceDim, surfaceTileMode, surfaceAA, level, &ours);

	// both must agree on every compared field
	cemu_assert(reference->pitchAlign == ours.pitchAlign);
	cemu_assert((Latte::E_HWTILEMODE)reference->tileMode.value() == ours.hwTileMode);
	cemu_assert(reference->baseAlign == ours.baseAlign);
	cemu_assert(reference->surfSize == ours.surfSize);
	cemu_assert(reference->depthAlign == ours.depthAlign);
	cemu_assert(reference->pitch == ours.pitch);
	cemu_assert(reference->sliceSize == ours.sliceSize);
}
void _TestAddrLib_Run()
{
uint32 surfaceAA = 0;
std::vector<Latte::E_DIM> dimList = {
Latte::E_DIM::DIM_1D,
Latte::E_DIM::DIM_2D,
Latte::E_DIM::DIM_3D,
Latte::E_DIM::DIM_CUBEMAP,
Latte::E_DIM::DIM_1D_ARRAY,
Latte::E_DIM::DIM_2D_ARRAY,
Latte::E_DIM::DIM_2D_MSAA,
Latte::E_DIM::DIM_2D_ARRAY_MSAA
};
std::vector<Latte::E_GX2TILEMODE> tilemodeList = {
// linear
Latte::E_GX2TILEMODE::TM_LINEAR_GENERAL,
Latte::E_GX2TILEMODE::TM_LINEAR_ALIGNED,
// micro tiled
Latte::E_GX2TILEMODE::TM_1D_TILED_THIN1,
Latte::E_GX2TILEMODE::TM_1D_TILED_THICK,
// macro tiled
Latte::E_GX2TILEMODE::TM_2D_TILED_THIN1,
Latte::E_GX2TILEMODE::TM_2D_TILED_THIN4,
Latte::E_GX2TILEMODE::TM_2D_TILED_THIN2,
Latte::E_GX2TILEMODE::TM_2D_TILED_THICK,
Latte::E_GX2TILEMODE::TM_2B_TILED_THIN1,
Latte::E_GX2TILEMODE::TM_2B_TILED_THIN2,
Latte::E_GX2TILEMODE::TM_2B_TILED_THIN4,
Latte::E_GX2TILEMODE::TM_2B_TILED_THICK,
Latte::E_GX2TILEMODE::TM_3D_TILED_THIN1,
Latte::E_GX2TILEMODE::TM_3D_TILED_THICK,
Latte::E_GX2TILEMODE::TM_3B_TILED_THIN1,
Latte::E_GX2TILEMODE::TM_3B_TILED_THICK,
// special
Latte::E_GX2TILEMODE::TM_LINEAR_SPECIAL,
Latte::E_GX2TILEMODE::TM_32_SPECIAL, // note: Specific to GX2CalcSurfaceSizeAndAlignment, for AddrLib this should just be interpreted as (tm&0xF)
};
std::vector<Latte::E_GX2SURFFMT> formatList = {
Latte::E_GX2SURFFMT::HWFMT_8, Latte::E_GX2SURFFMT::HWFMT_8_8, Latte::E_GX2SURFFMT::HWFMT_8_8_8_8, // 8, 16, 32
Latte::E_GX2SURFFMT::R32_UINT, Latte::E_GX2SURFFMT::R32_G32_UINT, Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT, // 32, 64, 128
Latte::E_GX2SURFFMT::HWFMT_BC1, Latte::E_GX2SURFFMT::HWFMT_BC2, Latte::E_GX2SURFFMT::HWFMT_BC3, Latte::E_GX2SURFFMT::HWFMT_BC4, Latte::E_GX2SURFFMT::HWFMT_BC5
};
std::vector<uint32> resXYList = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17,
31, 32, 33, 50, 63, 64, 65, 127, 128, 129, 200, 253, 254, 255, 256, 257,
511, 512, 513, 1023, 1024, 1025, 2047, 2048, 2049, 4095, 4096, 4097
};
debug_printf("Running AddrLib test...\n");
BenchmarkTimer timer;
timer.Start();
size_t index = 0;
for (auto dim : dimList)
{
debug_printf("%d/%d\n", (int)index, (int)dimList.size());
index++;
for (auto tileMode : tilemodeList)
{
for (auto format : formatList)
{
for (uint32 level = 0; level < 16; level++)
{
for (auto depth : resXYList)
{
for (auto height : resXYList)
{
for (auto width : resXYList)
{
_TestAddrLib_Compare(width, height, depth, dim, format, tileMode, surfaceAA, level);
}
}
}
}
}
}
}
timer.Stop();
debug_printf("Test complete (in %d seconds)\n", (int)(timer.GetElapsedMilliseconds() * 0.001));
assert_dbg();
}
// Entry point of the AddrLib self test.
// The unconditional return at the top disables the test: the init and run calls below
// are currently unreachable and only execute if that return is removed.
void _test_AddrLib()
{
return; // test disabled
_TestAddrLib_Init();
_TestAddrLib_Run();
}
}

View file

@ -0,0 +1,225 @@
#include "Common/precompiled.h"
#include "GX2_Blit.h"
#include "GX2_Command.h"
#include "GX2_Surface.h"
#include "Cafe/HW/Latte/ISA/LatteReg.h"
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "Cafe/OS/common/OSCommon.h"
#include "GX2_Resource.h"
namespace GX2
{
// Updates the clear values stored in the DepthBuffer struct (byte-swapped via _swapEndianU32)
// and emits a single SET_CONTEXT_REG packet that starts at DB_STENCIL_CLEAR and carries the
// stencil clear register followed by the depth clear register.
void GX2SetClearDepthStencil(GX2DepthBuffer* depthBuffer, float depthClearValue, uint8 stencilClearValue)
{
	GX2ReserveCmdSpace(4);

	// store the new clear values in the struct; the float is written as its raw 32 bit pattern
	const uint32 depthBits = *(uint32*)&depthClearValue;
	*(uint32*)&depthBuffer->clearDepth = _swapEndianU32(depthBits);
	depthBuffer->clearStencil = _swapEndianU32(stencilClearValue);

	// build both register values
	Latte::LATTE_DB_DEPTH_CLEAR dbDepthClear;
	dbDepthClear.set_clearValue(depthClearValue);
	Latte::LATTE_DB_STENCIL_CLEAR dbStencilClear;
	dbStencilClear.set_clearValue(stencilClearValue);

	// one packet, two consecutive register values (stencil first, then depth)
	gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 2),
		Latte::REGADDR::DB_STENCIL_CLEAR - 0xA000,
		dbStencilClear, dbDepthClear);
}
// Depth-only variant of GX2SetClearDepthStencil: stores the (byte-swapped) depth clear value
// in the DepthBuffer struct and writes DB_DEPTH_CLEAR through a SET_CONTEXT_REG packet.
void GX2SetClearDepth(GX2DepthBuffer* depthBuffer, float depthClearValue)
{
	GX2ReserveCmdSpace(3);

	// the float is stored as its raw 32 bit pattern
	const uint32 depthBits = *(uint32*)&depthClearValue;
	*(uint32*)&depthBuffer->clearDepth = _swapEndianU32(depthBits);

	Latte::LATTE_DB_DEPTH_CLEAR dbDepthClear;
	dbDepthClear.set_clearValue(depthClearValue);

	gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
		Latte::REGADDR::DB_DEPTH_CLEAR - 0xA000,
		dbDepthClear);
}
// Stencil-only variant of GX2SetClearDepthStencil: stores the (byte-swapped) stencil clear
// value in the DepthBuffer struct and writes DB_STENCIL_CLEAR through a SET_CONTEXT_REG packet.
void GX2SetClearStencil(GX2DepthBuffer* depthBuffer, uint8 stencilClearValue)
{
	GX2ReserveCmdSpace(3);

	depthBuffer->clearStencil = _swapEndianU32(stencilClearValue);

	Latte::LATTE_DB_STENCIL_CLEAR dbStencilClear;
	dbStencilClear.set_clearValue(stencilClearValue);

	gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
		Latte::REGADDR::DB_STENCIL_CLEAR - 0xA000,
		dbStencilClear);
}
// update DB_STENCIL_CLEAR and DB_STENCIL_CLEAR based on clear flags
void _updateDepthStencilClearRegs(float depthClearValue, uint8 stencilClearValue, GX2ClearFlags clearFlags)
{
if ((clearFlags & GX2ClearFlags::SET_DEPTH_REG) != 0 && (clearFlags & GX2ClearFlags::SET_STENCIL_REG) != 0)
{
GX2ReserveCmdSpace(4);
Latte::LATTE_DB_STENCIL_CLEAR stencilClearReg;
stencilClearReg.set_clearValue(stencilClearValue);
Latte::LATTE_DB_DEPTH_CLEAR depthClearReg;
depthClearReg.set_clearValue(depthClearValue);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 2),
Latte::REGADDR::DB_STENCIL_CLEAR - 0xA000,
stencilClearReg, depthClearReg);
}
else if ((clearFlags & GX2ClearFlags::SET_DEPTH_REG) != 0)
{
GX2ReserveCmdSpace(3);
Latte::LATTE_DB_DEPTH_CLEAR depthClearReg;
depthClearReg.set_clearValue(depthClearValue);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::DB_DEPTH_CLEAR - 0xA000,
depthClearReg);
}
else if ((clearFlags & GX2ClearFlags::SET_STENCIL_REG) != 0)
{
GX2ReserveCmdSpace(3);
Latte::LATTE_DB_STENCIL_CLEAR stencilClearReg;
stencilClearReg.set_clearValue(stencilClearValue);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::DB_STENCIL_CLEAR - 0xA000,
stencilClearReg);
}
}
// Emits an IT_HLE_CLEAR_COLOR_DEPTH_STENCIL packet (23 data words) that clears only the
// color buffer. The depth buffer section of the packet is filled with zeros.
// r/g/b/a are scaled by 255 and truncated to integers before being sent.
// Surfaces without the GX2_RESFLAG_USAGE_COLOR_BUFFER flag are not cleared; only a debug
// message is printed for them.
void GX2ClearColor(GX2ColorBuffer* colorBuffer, float r, float g, float b, float a)
{
	GX2ReserveCmdSpace(50);

	auto& surface = colorBuffer->surface;
	if ((surface.resFlag & GX2_RESFLAG_USAGE_COLOR_BUFFER) == 0)
	{
		debug_printf("GX2ClearColor() - unsupported surface flags\n");
		return;
	}

	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_CLEAR_COLOR_DEPTH_STENCIL, 23));
	gx2WriteGather_submitU32AsBE(1); // color (1)

	// color buffer description
	gx2WriteGather_submitU32AsBE(memory_virtualToPhysical(surface.imagePtr));
	gx2WriteGather_submitU32AsBE((uint32)surface.format.value());
	gx2WriteGather_submitU32AsBE((uint32)surface.tileMode.value());
	gx2WriteGather_submitU32AsBE(surface.width);
	gx2WriteGather_submitU32AsBE(surface.height);
	gx2WriteGather_submitU32AsBE(surface.pitch);
	gx2WriteGather_submitU32AsBE(_swapEndianU32(colorBuffer->viewFirstSlice));
	gx2WriteGather_submitU32AsBE(_swapEndianU32(colorBuffer->viewNumSlices));

	// depth buffer description (unused for a color-only clear)
	gx2WriteGather_submitU32AsBE(MPTR_NULL);
	gx2WriteGather_submitU32AsBE(0); // depth buffer format
	gx2WriteGather_submitU32AsBE(0); // tilemode for depth buffer
	for (uint32 i = 0; i < 5; i++)
		gx2WriteGather_submitU32AsBE(0); // remaining five depth buffer words

	// clear values
	gx2WriteGather_submitU32AsBE((uint32)(r * 255.0f));
	gx2WriteGather_submitU32AsBE((uint32)(g * 255.0f));
	gx2WriteGather_submitU32AsBE((uint32)(b * 255.0f));
	gx2WriteGather_submitU32AsBE((uint32)(a * 255.0f));
	gx2WriteGather_submitU32AsBE(0); // clear depth
	gx2WriteGather_submitU32AsBE(0); // clear stencil
}
// Clears the color buffer and, depending on clearFlags, the depth and/or stencil part of the
// depth buffer with a single IT_HLE_CLEAR_COLOR_DEPTH_STENCIL packet (23 data words).
// The depth/stencil clear registers are updated first according to the SET_*_REG flags.
// Color is always cleared; r/g/b/a are scaled by 255 and truncated to integers.
void GX2ClearBuffersEx(GX2ColorBuffer* colorBuffer, GX2DepthBuffer* depthBuffer, float r, float g, float b, float a, float depthClearValue, uint8 stencilClearValue, GX2ClearFlags clearFlags)
{
	GX2ReserveCmdSpace(50);
	_updateDepthStencilClearRegs(depthClearValue, stencilClearValue, clearFlags);

	// color (1), depth (2), stencil (4)
	uint32 hleClearFlags = 1;
	if ((clearFlags & GX2ClearFlags::CLEAR_DEPTH) != 0)
		hleClearFlags |= 2;
	if ((clearFlags & GX2ClearFlags::CLEAR_STENCIL) != 0)
		hleClearFlags |= 4;

	// a non-zero first slice triggers a debug breakpoint
	if (_swapEndianU32(colorBuffer->viewFirstSlice) != 0)
		debugBreakpoint();

	auto& colorSurface = colorBuffer->surface;
	auto& depthSurface = depthBuffer->surface;

	// send command to clear color, depth and stencil
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_CLEAR_COLOR_DEPTH_STENCIL, 23));
	gx2WriteGather_submitU32AsBE(hleClearFlags);

	// color buffer description
	gx2WriteGather_submitU32AsBE(memory_virtualToPhysical(colorSurface.imagePtr));
	gx2WriteGather_submitU32AsBE((uint32)colorSurface.format.value());
	gx2WriteGather_submitU32AsBE((uint32)colorSurface.tileMode.value());
	gx2WriteGather_submitU32AsBE((uint32)colorSurface.width);
	gx2WriteGather_submitU32AsBE((uint32)colorSurface.height);
	gx2WriteGather_submitU32AsBE((uint32)colorSurface.pitch);
	gx2WriteGather_submitU32AsBE(_swapEndianU32(colorBuffer->viewFirstSlice));
	gx2WriteGather_submitU32AsBE(_swapEndianU32(colorBuffer->viewNumSlices));

	// depth buffer description
	gx2WriteGather_submitU32AsBE(memory_virtualToPhysical(depthSurface.imagePtr));
	gx2WriteGather_submitU32AsBE((uint32)depthSurface.format.value());
	gx2WriteGather_submitU32AsBE((uint32)depthSurface.tileMode.value());
	gx2WriteGather_submitU32AsBE((uint32)depthSurface.width);
	gx2WriteGather_submitU32AsBE((uint32)depthSurface.height);
	gx2WriteGather_submitU32AsBE((uint32)depthSurface.pitch);
	gx2WriteGather_submitU32AsBE(_swapEndianU32(depthBuffer->viewFirstSlice));
	gx2WriteGather_submitU32AsBE(_swapEndianU32(depthBuffer->viewNumSlices));

	// clear values
	gx2WriteGather_submitU32AsBE((uint32)(r * 255.0f));
	gx2WriteGather_submitU32AsBE((uint32)(g * 255.0f));
	gx2WriteGather_submitU32AsBE((uint32)(b * 255.0f));
	gx2WriteGather_submitU32AsBE((uint32)(a * 255.0f));
	gx2WriteGather_submitU32AsBE(*(uint32*)&depthClearValue); // clear depth (raw float bits)
	gx2WriteGather_submitU32AsBE(stencilClearValue&0xFF); // clear stencil
}
// Clears the depth and/or stencil part of a depth buffer.
// Always uses the passed depthClearValue/stencilClearValue for clearing, even if the clear
// flags don't specify value updates.
//
// depthBuffer       - depth buffer to clear; a null pointer or a surface with zero width or
//                     height is rejected with a debug message and nothing is emitted
// depthClearValue   - depth value to clear to (sent as raw float bits)
// stencilClearValue - stencil value to clear to
// clearFlags        - CLEAR_DEPTH / CLEAR_STENCIL select what is cleared,
//                     SET_DEPTH_REG / SET_STENCIL_REG additionally update the clear registers
void GX2ClearDepthStencilEx(GX2DepthBuffer* depthBuffer, float depthClearValue, uint8 stencilClearValue, GX2ClearFlags clearFlags)
{
	GX2ReserveCmdSpace(50);

	// The checks must be combined with ||: with && a null depthBuffer would be dereferenced
	// by the size checks and a zero-sized (uninitialized) surface would never be rejected.
	if (!depthBuffer || depthBuffer->surface.width == 0 || depthBuffer->surface.height == 0)
	{
		// Super Smash Bros tries to clear an uninitialized depth surface?
		debug_printf("GX2ClearDepthStencilEx(): Attempting to clear invalid depthbuffer\n");
		return;
	}

	_updateDepthStencilClearRegs(depthClearValue, stencilClearValue, clearFlags);

	// color (1), depth (2), stencil (4) - color is never cleared by this function
	uint32 hleClearFlags = 0;
	if ((clearFlags & GX2ClearFlags::CLEAR_DEPTH) != 0)
		hleClearFlags |= 2;
	if ((clearFlags & GX2ClearFlags::CLEAR_STENCIL) != 0)
		hleClearFlags |= 4;
	if (hleClearFlags == 0)
		return; // nothing to clear, no packet is emitted

	// send command to clear depth and/or stencil
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_CLEAR_COLOR_DEPTH_STENCIL, 23));
	gx2WriteGather_submitU32AsBE(hleClearFlags);

	// color buffer description (unused)
	gx2WriteGather_submitU32AsBE(MPTR_NULL);
	gx2WriteGather_submitU32AsBE(0); // format for color buffer
	gx2WriteGather_submitU32AsBE(0); // tilemode for color buffer
	gx2WriteGather_submitU32AsBE(0);
	gx2WriteGather_submitU32AsBE(0);
	gx2WriteGather_submitU32AsBE(0);
	gx2WriteGather_submitU32AsBE(0);
	gx2WriteGather_submitU32AsBE(0);

	// depth buffer description
	gx2WriteGather_submitU32AsBE(memory_virtualToPhysical(depthBuffer->surface.imagePtr));
	gx2WriteGather_submitU32AsBE((uint32)depthBuffer->surface.format.value());
	gx2WriteGather_submitU32AsBE((uint32)depthBuffer->surface.tileMode.value());
	gx2WriteGather_submitU32AsBE((uint32)depthBuffer->surface.width);
	gx2WriteGather_submitU32AsBE((uint32)depthBuffer->surface.height);
	gx2WriteGather_submitU32AsBE((uint32)depthBuffer->surface.pitch);
	gx2WriteGather_submitU32AsBE(_swapEndianU32(depthBuffer->viewFirstSlice));
	gx2WriteGather_submitU32AsBE(_swapEndianU32(depthBuffer->viewNumSlices));

	// color clear values (unused)
	gx2WriteGather_submitU32AsBE(0);
	gx2WriteGather_submitU32AsBE(0);
	gx2WriteGather_submitU32AsBE(0);
	gx2WriteGather_submitU32AsBE(0);

	gx2WriteGather_submitU32AsBE(*(uint32*)&depthClearValue); // clear depth (raw float bits)
	gx2WriteGather_submitU32AsBE(stencilClearValue & 0xFF); // clear stencil
}
// Registers the clear functions defined in this file as exports of the "gx2" library,
// each with the GX2 log type.
void GX2BlitInit()
{
cafeExportRegister("gx2", GX2SetClearDepthStencil, LogType::GX2);
cafeExportRegister("gx2", GX2SetClearDepth, LogType::GX2);
cafeExportRegister("gx2", GX2SetClearStencil, LogType::GX2);
cafeExportRegister("gx2", GX2ClearColor, LogType::GX2);
cafeExportRegister("gx2", GX2ClearBuffersEx, LogType::GX2);
cafeExportRegister("gx2", GX2ClearDepthStencilEx, LogType::GX2);
}
}

View file

@ -0,0 +1,15 @@
#pragma once
namespace GX2
{
// Bit flags accepted by the GX2Clear*Ex functions. The values can be combined
// (bitmask operators are enabled for this enum below the namespace).
enum class GX2ClearFlags : uint32
{
CLEAR_DEPTH = 0x01, // clear depth to given clear value
CLEAR_STENCIL = 0x02, // clear stencil to given stencil clear value
SET_DEPTH_REG = 0x04, // also write the depth clear value to the DB_DEPTH_CLEAR register
SET_STENCIL_REG = 0x08, // also write the stencil clear value to the DB_STENCIL_CLEAR register
};
// registers the GX2 clear functions as "gx2" exports
void GX2BlitInit();
}
ENABLE_BITMASK_OPERATORS(GX2::GX2ClearFlags);

View file

@ -0,0 +1,308 @@
#include "Cafe/HW/Latte/Core/Latte.h"
#include "Cafe/HW/Latte/Core/LatteDraw.h"
#include "Cafe/OS/common/OSCommon.h"
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "Cafe/OS/libs/coreinit/coreinit.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "GX2.h"
#include "GX2_Command.h"
#include "GX2_Shader.h"
#include "GX2_Misc.h"
// defined in another translation unit; used below to compute the distance between the
// ring buffer write position and the read position (presumably the consumer's read pointer)
extern uint8* gxRingBufferReadPtr;
// global write gather state, zero-initialized until GX2Init_writeGather() sets it up
GX2WriteGatherPipeState gx2WriteGatherPipe = { 0 };
// Appends one 32 bit word, byte-swapped via _swapEndianU32, at the active write position of
// the calling core and advances that position by 4 bytes.
// Does nothing when the calling core has no active write pointer.
void gx2WriteGather_submitU32AsBE(uint32 v)
{
	const uint32 core = PPCInterpreter_getCoreIndex(ppcInterpreterCurrentInstance);
	uint8** cursor = gx2WriteGatherPipe.writeGatherPtrWrite[core];
	if (cursor == NULL)
		return;
	*(uint32*)(*cursor) = _swapEndianU32(v);
	*cursor += 4;
}
// Appends one 32 bit word without any byte swapping at the active write position of the
// calling core and advances that position by 4 bytes.
// Does nothing when the calling core has no active write pointer.
void gx2WriteGather_submitU32AsLE(uint32 v)
{
	const uint32 core = PPCInterpreter_getCoreIndex(ppcInterpreterCurrentInstance);
	uint8** cursor = gx2WriteGatherPipe.writeGatherPtrWrite[core];
	if (cursor == NULL)
		return;
	*(uint32*)(*cursor) = v;
	*cursor += 4;
}
// Copies numValues 32 bit words unchanged (via memcpy_dwords) to the active write position of
// the calling core and advances that position by 4 * numValues bytes.
// Does nothing when the calling core has no active write pointer.
void gx2WriteGather_submitU32AsLEArray(uint32* v, uint32 numValues)
{
	const uint32 core = PPCInterpreter_getCoreIndex(ppcInterpreterCurrentInstance);
	uint8** cursor = gx2WriteGatherPipe.writeGatherPtrWrite[core];
	if (cursor == NULL)
		return;
	memcpy_dwords(*cursor, v, numValues);
	*cursor += 4 * numValues;
}
namespace GX2
{
// core index the write gather was last configured for by GX2Init_writeGather(), -1 before the first call
sint32 gx2WriteGatherCurrentMainCoreIndex = -1;
// set to true by GX2Init_writeGather() once the per-core state has been set up
bool gx2WriteGatherInited = false;
void GX2Init_writeGather() // init write gather, make current core
{
if (gx2WriteGatherPipe.gxRingBuffer == NULL)
gx2WriteGatherPipe.gxRingBuffer = (uint8*)malloc(GX2_COMMAND_RING_BUFFER_SIZE);
if (gx2WriteGatherCurrentMainCoreIndex == sGX2MainCoreIndex)
return; // write gather already configured for same core
for (sint32 i = 0; i < PPC_CORE_COUNT; i++)
{
if (i == sGX2MainCoreIndex)
{
gx2WriteGatherPipe.writeGatherPtrGxBuffer[i] = gx2WriteGatherPipe.gxRingBuffer;
gx2WriteGatherPipe.writeGatherPtrWrite[i] = &gx2WriteGatherPipe.writeGatherPtrGxBuffer[i];
}
else
{
gx2WriteGatherPipe.writeGatherPtrGxBuffer[i] = NULL;
gx2WriteGatherPipe.writeGatherPtrWrite[i] = NULL;
}
gx2WriteGatherPipe.displayListStart[i] = MPTR_NULL;
gx2WriteGatherPipe.writeGatherPtrDisplayList[i] = NULL;
gx2WriteGatherPipe.displayListMaxSize[i] = 0;
}
gx2WriteGatherCurrentMainCoreIndex = sGX2MainCoreIndex;
gx2WriteGatherInited = true;
}
// Starts recording a display list on the core of hCPU: remembers the buffer address and its
// maximum size and redirects the core's active write pointer to the start of that buffer.
void GX2WriteGather_beginDisplayList(PPCInterpreter_t* hCPU, MPTR buffer, uint32 maxSize)
{
	const uint32 core = PPCInterpreter_getCoreIndex(hCPU);
	auto& pipe = gx2WriteGatherPipe;

	pipe.displayListStart[core] = buffer;
	pipe.displayListMaxSize[core] = maxSize;

	// set new write gather ptr
	pipe.writeGatherPtrDisplayList[core] = memory_getPointerFromVirtualOffset(buffer);
	pipe.writeGatherPtrWrite[core] = &pipe.writeGatherPtrDisplayList[core];
}
// Returns the number of bytes between the start of the core's display list buffer and the
// core's active write position.
uint32 GX2WriteGather_getDisplayListWriteDistance(sint32 coreIndex)
{
	uint8* writePos = *gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex];
	uint8* listBegin = memory_getPointerFromVirtualOffset(gx2WriteGatherPipe.displayListStart[coreIndex]);
	return (uint32)(writePos - listBegin);
}
// Returns the byte offset of the core's ring buffer write position relative to the start of
// the ring buffer.
uint32 GX2WriteGather_getFifoWriteDistance(uint32 coreIndex)
{
	return (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] - gx2WriteGatherPipe.gxRingBuffer);
}
// Finishes the display list that is being recorded on the core of hCPU.
// Optionally pads the written data with type 2 filler packets up to a 32 byte boundary,
// deactivates the display list and restores the core's write pointer (ring buffer for the
// GX2 main core, null for every other core).
// Returns the number of bytes written to the display list, or 0 if none was active.
// The buffer parameter is not used.
uint32 GX2WriteGather_endDisplayList(PPCInterpreter_t* hCPU, MPTR buffer)
{
	uint32 coreIndex = PPCInterpreter_getCoreIndex(hCPU);
	if (gx2WriteGatherPipe.displayListStart[coreIndex] == MPTR_NULL)
		return 0; // no active display list

	uint32 writtenBytes = GX2WriteGather_getDisplayListWriteDistance(coreIndex);

	// pad to 32 byte
	// NOTE(review): the max size is compared against itself rounded up, so padding only
	// happens when the max size is a multiple of 32 - confirm this is the intended condition
	const uint32 maxSize = gx2WriteGatherPipe.displayListMaxSize[coreIndex];
	const uint32 maxSizeAligned = (maxSize + 0x1F) & ~0x1F;
	if (maxSize >= maxSizeAligned)
	{
		for (; (writtenBytes & 0x1F) != 0; writtenBytes += 4)
			gx2WriteGather_submitU32AsBE(pm4HeaderType2Filler());
	}

	// get size of written data (re-read after padding)
	writtenBytes = GX2WriteGather_getDisplayListWriteDistance(coreIndex);

	// disable current display list and restore write gather ptr
	gx2WriteGatherPipe.displayListStart[coreIndex] = MPTR_NULL;
	if (sGX2MainCoreIndex == coreIndex)
		gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] = &gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex];
	else
		gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] = NULL;

	// return size of (written) display list
	return writtenBytes;
}
// Reports the display list that is being recorded on the calling core.
// Returns false (outputs untouched) when no display list is active. Otherwise the start
// address and the maximum size are stored through the non-null output pointers and true
// is returned.
bool GX2GetCurrentDisplayList(betype<MPTR>* displayListAddr, uint32be* displayListSize)
{
	const uint32 core = PPCInterpreter_getCoreIndex(ppcInterpreterCurrentInstance);
	const MPTR listStart = gx2WriteGatherPipe.displayListStart[core];
	if (listStart == MPTR_NULL)
		return false;

	if (displayListAddr)
		*displayListAddr = listStart;
	if (displayListSize)
		*displayListSize = gx2WriteGatherPipe.displayListMaxSize[core];
	return true;
}
bool GX2GetDisplayListWriteStatus()
{
// returns true if we are writing to a display list
uint32 coreIndex = PPCInterpreter_getCoreIndex(ppcInterpreterCurrentInstance);
return gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL;
}
bool GX2WriteGather_isDisplayListActive()
{
uint32 coreIndex = PPCInterpreter_getCoreIndex(ppcInterpreterCurrentInstance);
if (gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL)
return true;
return false;
}
// Returns the distance in bytes from gxRingBufferReadPtr to the main core's ring buffer
// write position, wrapped to the size of the ring buffer.
uint32 GX2WriteGather_getReadWriteDistance()
{
	const uint32 mainCore = sGX2MainCoreIndex;
	uint8* writePos = gx2WriteGatherPipe.writeGatherPtrGxBuffer[mainCore];
	// adding the ring size before subtracting keeps the intermediate value non-negative
	uint32 distance = (uint32)(writePos + GX2_COMMAND_RING_BUFFER_SIZE - gxRingBufferReadPtr);
	distance %= GX2_COMMAND_RING_BUFFER_SIZE;
	return distance;
}
void GX2WriteGather_checkAndInsertWrapAroundMark()
{
uint32 coreIndex = PPCInterpreter_getCoreIndex(ppcInterpreterCurrentInstance);
if (coreIndex != sGX2MainCoreIndex) // only if main gx2 core
return;
if (gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL)
return;
uint32 writeDistance = GX2WriteGather_getFifoWriteDistance(coreIndex);
if (writeDistance >= (GX2_COMMAND_RING_BUFFER_SIZE * 3 / 5))
{
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_FIFO_WRAP_AROUND, 1));
gx2WriteGather_submitU32AsBE(0); // empty word since we can't send commands with zero data words
gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] = gx2WriteGatherPipe.gxRingBuffer;
}
}
// Starts recording a display list into the given buffer on the calling core.
void GX2BeginDisplayList(MEMPTR<void> displayListAddr, uint32 size)
{
	const MPTR bufferAddr = displayListAddr.GetMPTR();
	GX2WriteGather_beginDisplayList(ppcInterpreterCurrentInstance, bufferAddr, size);
}
// Same as GX2BeginDisplayList; the profiling parameter is accepted but not used.
void GX2BeginDisplayListEx(MEMPTR<void> displayListAddr, uint32 size, bool profiling)
{
	const MPTR bufferAddr = displayListAddr.GetMPTR();
	GX2WriteGather_beginDisplayList(ppcInterpreterCurrentInstance, bufferAddr, size);
}
// Ends the display list recorded on the calling core and returns the size of the written
// data in bytes (0 if no display list was active). A null address trips a debug assert.
uint32 GX2EndDisplayList(MEMPTR<void> displayListAddr)
{
	cemu_assert_debug(displayListAddr != nullptr);
	return GX2WriteGather_endDisplayList(ppcInterpreterCurrentInstance, displayListAddr.GetMPTR());
}
// Emits an IT_INDIRECT_BUFFER_PRIV packet that references the display list at addr
// (size in bytes, expected to be a multiple of 4) and then checks whether a ring buffer
// wrap-around mark has to be inserted.
void GX2CallDisplayList(MPTR addr, uint32 size)
{
	cemu_assert_debug((size&3) == 0);

	const uint32 dwordCount = size / 4;

	// write PM4 command
	GX2ReserveCmdSpace(4);
	gx2WriteGather_submit(pm4HeaderType3(IT_INDIRECT_BUFFER_PRIV, 3),
		memory_virtualToPhysical(addr),
		0, // high address bits
		dwordCount);

	GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
void GX2DirectCallDisplayList(void* addr, uint32 size)
{
// this API submits to TCL directly and bypasses write-gatherer
// its basically a way to manually submit a command buffer to the GPU
// as such it also affects the submission and retire timestamps
uint32 coreIndex = PPCInterpreter_getCoreIndex(ppcInterpreterCurrentInstance);
cemu_assert_debug(coreIndex == sGX2MainCoreIndex);
coreIndex = sGX2MainCoreIndex; // always submit to main queue which is owned by GX2 main core (TCLSubmitToRing does not need this workaround)
uint32be* cmdStream = (uint32be*)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex]);
cmdStream[0] = pm4HeaderType3(IT_INDIRECT_BUFFER_PRIV, 3);
cmdStream[1] = memory_virtualToPhysical(MEMPTR<void>(addr).GetMPTR());
cmdStream[2] = 0;
cmdStream[3] = size / 4;
gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] += 16;
// update submission timestamp and retired timestamp
_GX2SubmitToTCL();
}
// Copies the contents of a display list (size in bytes) word by word, without byte swapping,
// to the active write position of the calling core. Sizes below 4 bytes copy nothing.
void GX2CopyDisplayList(MEMPTR<uint32be*> addr, uint32 size)
{
	const uint32 wordCount = size / 4;
	if (wordCount == 0)
		return;

	// copy display list to write gather
	uint32* sourceWords = (uint32*)addr.GetPtr();
	GX2ReserveCmdSpace(wordCount);
	gx2WriteGather_submitU32AsLEArray(sourceWords, wordCount);
}
// Selects which kind of shader object GX2PatchDisplayList() receives in its obj parameter.
// COMPUTE_SHADER is declared but not handled by GX2PatchDisplayList().
enum class GX2_PATCH_TYPE : uint32
{
FETCH_SHADER = 1,
VERTEX_SHADER = 2,
GEOMETRY_COPY_SHADER = 3,
GEOMETRY_SHADER = 4,
PIXEL_SHADER = 5,
COMPUTE_SHADER = 6
};
// Patches the program address of a shader into an already recorded display list.
// displayData - start of the display list data
// patchType   - kind of shader object passed in obj
// patchOffset - byte offset into displayData (must be a multiple of 4); the word that is
//               overwritten is located two words after this offset
// obj         - pointer to the shader object matching patchType
// The stored value is the physical program address shifted right by 8 bits.
// Unsupported patch types are logged and trip a debug assert; nothing is written for them.
void GX2PatchDisplayList(uint32be* displayData, GX2_PATCH_TYPE patchType, uint32 patchOffset, void* obj)
{
	cemu_assert_debug((patchOffset & 3) == 0);
	const uint32 patchIndex = patchOffset / 4 + 2;

	switch (patchType)
	{
	case GX2_PATCH_TYPE::VERTEX_SHADER:
	{
		GX2VertexShader_t* vertexShader = (GX2VertexShader_t*)obj;
		displayData[patchIndex] = memory_virtualToPhysical(vertexShader->GetProgramAddr()) >> 8;
		break;
	}
	case GX2_PATCH_TYPE::PIXEL_SHADER:
	{
		GX2PixelShader_t* pixelShader = (GX2PixelShader_t*)obj;
		displayData[patchIndex] = memory_virtualToPhysical(pixelShader->GetProgramAddr()) >> 8;
		break;
	}
	case GX2_PATCH_TYPE::FETCH_SHADER:
	{
		GX2FetchShader_t* fetchShader = (GX2FetchShader_t*)obj;
		displayData[patchIndex] = memory_virtualToPhysical(fetchShader->GetProgramAddr()) >> 8;
		break;
	}
	case GX2_PATCH_TYPE::GEOMETRY_COPY_SHADER:
	{
		GX2GeometryShader_t* geometryShader = (GX2GeometryShader_t*)obj;
		displayData[patchIndex] = memory_virtualToPhysical(geometryShader->GetCopyProgramAddr()) >> 8;
		break;
	}
	case GX2_PATCH_TYPE::GEOMETRY_SHADER:
	{
		GX2GeometryShader_t* geometryShader = (GX2GeometryShader_t*)obj;
		displayData[patchIndex] = memory_virtualToPhysical(geometryShader->GetGeometryProgramAddr()) >> 8;
		break;
	}
	default:
		forceLog_printf("GX2PatchDisplayList(): unsupported patchType %d", (uint32)patchType);
		cemu_assert_debug(false);
		break;
	}
}
// Registers the display list functions defined in this file as exports of the "gx2" library,
// each with the GX2 log type.
void GX2CommandInit()
{
cafeExportRegister("gx2", GX2BeginDisplayList, LogType::GX2);
cafeExportRegister("gx2", GX2BeginDisplayListEx, LogType::GX2);
cafeExportRegister("gx2", GX2EndDisplayList, LogType::GX2);
cafeExportRegister("gx2", GX2GetCurrentDisplayList, LogType::GX2);
cafeExportRegister("gx2", GX2GetDisplayListWriteStatus, LogType::GX2);
cafeExportRegister("gx2", GX2CallDisplayList, LogType::GX2);
cafeExportRegister("gx2", GX2DirectCallDisplayList, LogType::GX2);
cafeExportRegister("gx2", GX2CopyDisplayList, LogType::GX2);
cafeExportRegister("gx2", GX2PatchDisplayList, LogType::GX2);
}
}

View file

@ -0,0 +1,101 @@
#pragma once
#include "Cafe/HW/Latte/ISA/LatteReg.h"
#include "Cafe/HW/Espresso/Const.h"
// State of the write gatherer: the shared command ring buffer plus, per core, the current
// write positions and the display list that is being recorded (if any).
struct GX2WriteGatherPipeState
{
uint8* gxRingBuffer; // start of the command ring buffer
// each core has its own write gatherer and display list state (writing)
uint8* writeGatherPtrGxBuffer[Espresso::CORE_COUNT]; // write position inside the ring buffer
uint8** writeGatherPtrWrite[Espresso::CORE_COUNT]; // points to the write position currently in use (ring buffer or display list); the submit functions do nothing while it is null
uint8* writeGatherPtrDisplayList[Espresso::CORE_COUNT]; // write position inside the display list buffer
MPTR displayListStart[Espresso::CORE_COUNT]; // start address of the active display list, MPTR_NULL if none is active
uint32 displayListMaxSize[Espresso::CORE_COUNT]; // maximum size passed when the display list was started
};
// global write gather state shared by all submit functions
extern GX2WriteGatherPipeState gx2WriteGatherPipe;
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32); // move to GX2 namespace eventually
// submit a single 32 bit word at the calling core's write position, byte-swapped (BE) or as-is (LE)
void gx2WriteGather_submitU32AsBE(uint32 v);
void gx2WriteGather_submitU32AsLE(uint32 v);
// submit numValues 32 bit words as-is
void gx2WriteGather_submitU32AsLEArray(uint32* v, uint32 numValues);
// used by gx2WriteGather_submit() below to select the per-core write pointer
uint32 PPCInterpreter_getCurrentCoreIndex();
// gx2WriteGather_submit functions
// The gx2WriteGather_submit_ overloads below form a recursive overload set: each overload
// writes its first argument as one 32 bit word at writePtr, advances writePtr and recurses
// with the remaining arguments. Which overload handles an argument depends on its type.

// end of recursion: no arguments left, store the advanced write position back into the
// write pointer that is active for the core
template <typename ...Targs>
inline void gx2WriteGather_submit_(uint32 coreIndex, uint32be* writePtr)
{
(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) = (uint8*)writePtr;
}
// betype<T> arguments: copied as-is (must be exactly 32 bits wide)
template <typename T, typename ...Targs>
inline void gx2WriteGather_submit_(uint32 coreIndex, uint32be* writePtr, const betype<T>& arg, Targs... args)
{
static_assert(sizeof(betype<T>) == sizeof(uint32be));
*(betype<T>*)writePtr = arg;
writePtr++;
gx2WriteGather_submit_(coreIndex, writePtr, args...);
}
// floating point arguments: the raw 32 bit pattern of the value is written
template <typename T, typename ...Targs>
inline
typename std::enable_if< std::is_floating_point<T>::value, void>::type
gx2WriteGather_submit_(uint32 coreIndex, uint32be* writePtr, const T& arg, Targs... args)
{
static_assert(sizeof(T) == sizeof(uint32));
*writePtr = *(uint32*)&arg;
writePtr++;
gx2WriteGather_submit_(coreIndex, writePtr, args...);
}
// Latte register types (derived from Latte::LATTEREG): the raw register value is written
template <typename T, typename ...Targs>
inline
typename std::enable_if< std::is_base_of<Latte::LATTEREG, T>::value, void>::type
gx2WriteGather_submit_(uint32 coreIndex, uint32be* writePtr, const T& arg, Targs... args)
{
static_assert(sizeof(Latte::LATTEREG) == sizeof(uint32be));
*writePtr = arg.getRawValue();
writePtr++;
gx2WriteGather_submit_(coreIndex, writePtr, args...);
}
// all remaining types: the value is assigned to a uint32be directly
template <typename T, typename ...Targs>
inline
typename std::enable_if< !std::is_base_of<Latte::LATTEREG, T>::value && !std::is_floating_point<T>::value, void>::type
gx2WriteGather_submit_(uint32 coreIndex, uint32be* writePtr, const T& arg, Targs... args)
{
*writePtr = arg;
writePtr++;
gx2WriteGather_submit_(coreIndex, writePtr, args...);
}
template <typename ...Targs>
inline void gx2WriteGather_submit(Targs... args)
{
uint32 coreIndex = PPCInterpreter_getCurrentCoreIndex();
if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == nullptr)
return;
uint32be* writePtr = (uint32be*)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]);
gx2WriteGather_submit_(coreIndex, writePtr, std::forward<Targs>(args)...);
}
namespace GX2
{
// true if a display list is active on the calling core
bool GX2WriteGather_isDisplayListActive();
// distance in bytes between the ring buffer read pointer and the main core's write position
uint32 GX2WriteGather_getReadWriteDistance();
// inserts a wrap-around mark and resets the ring buffer write position when needed
void GX2WriteGather_checkAndInsertWrapAroundMark();
// start recording a display list into the given buffer
void GX2BeginDisplayList(MEMPTR<void> displayListAddr, uint32 size);
void GX2BeginDisplayListEx(MEMPTR<void> displayListAddr, uint32 size, bool profiling);
// stop recording, returns the number of bytes written
uint32 GX2EndDisplayList(MEMPTR<void> displayListAddr);
// reference a display list from the command stream (size in bytes)
void GX2CallDisplayList(MPTR addr, uint32 size);
// same, but written directly to the ring buffer and submitted immediately
void GX2DirectCallDisplayList(void* addr, uint32 size);
// set up the write gather state
void GX2Init_writeGather();
// register the display list exports
void GX2CommandInit();
}

View file

@ -0,0 +1,392 @@
#include "Cafe/OS/common/OSCommon.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "GX2.h"
#include "Cafe/HW/Latte/Core/Latte.h"
#include "Cafe/HW/Latte/Core/LatteDraw.h"
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "GX2_Command.h"
#include "GX2_State.h"
#include "Cafe/CafeSystem.h"
// number of registers in each shadowed register area
#define GPU7_REG_AREA_SIZE_CONFIG_REG 0xB00
#define GPU7_REG_AREA_SIZE_CONTEXT_REG 0x400
#define GPU7_REG_AREA_SIZE_ALU_CONST 0x800
#define GPU7_REG_AREA_SIZE_LOOP_CONST 0x60
#define GPU7_REG_AREA_SIZE_RESOURCE 0xD9E
#define GPU7_REG_AREA_SIZE_SAMPLER 0xA2 // (guessed)
// number of uint32 entries needed for __v registers after the byte size (__v*4) has been
// rounded up to a multiple of 0x100
#define _GX2_CALC_SHADOWMEM_NUM_U32(__v) (((((__v)*4)+0xFF)&~0xFF)/4)
// presumably the guest address of the current context state (not referenced in the code
// visible here); MPTR_NULL initially
MPTR gx2CurrentContextStateMPTR = MPTR_NULL;
// One (offset, count) pair of a register load packet; _GX2Context_cmdLoad() sends both
// values as consecutive words.
typedef struct
{
uint32 regOffset; // register offset, relative to the base of the register area
uint32 regCount; // number of registers starting at regOffset
}GX2RegLoadPktEntry_t;
// {regOffset, regCount} ranges sent with IT_LOAD_ALU_CONST: a single range covering 0x800 registers
GX2RegLoadPktEntry_t aluConst_loadPktEntries[1] = // base: 0xC000
{
{0, 0x800},
};
// {regOffset, regCount} ranges sent with IT_LOAD_LOOP_CONST: a single range covering 0x60 registers
GX2RegLoadPktEntry_t loopConst_loadPktEntries[1] = // base: 0xF880
{
{0, 0x60},
};
// {regOffset, regCount} ranges sent with IT_LOAD_SAMPLER: three consecutive ranges of 0x36 registers each
GX2RegLoadPktEntry_t samplerReg_loadPktEntries[3] = // base: 0xF000
{
{0, 0x36},
{0x36, 0x36},
{0x6C, 0x36},
};
// {regOffset, regCount} ranges sent with IT_LOAD_CONFIG_REG; offsets are relative to the base noted below
GX2RegLoadPktEntry_t configReg_loadPktEntries[0xF] = // base: 0x2000
{
{0x300, 0x6},
{0x900, 0x48},
{0x980, 0x48},
{0xA00, 0x48},
{0x310, 0xC},
{0x542, 0x1},
{0x235, 0x1},
{0x232, 0x2},
{0x23A, 0x1},
{0x256, 0x1},
{0x60C, 0x1},
{0x5C5, 0x1},
{0x2C8, 0x1},
{0x363, 0x1},
{0x404, 0x2}
};
// {regOffset, regCount} ranges sent with IT_LOAD_CONTEXT_REG; offsets are relative to the base noted below.
// Note that the entries are not sorted by offset.
GX2RegLoadPktEntry_t contextReg_loadPktEntries[0x2D] = // base: 0xA000
{
{0x0, 0x2},
{0x3, 0x3},
{0xA, 0x4},
{0x10, 0x38},
{0x50, 0x34},
{0x8E, 0x4},
{0x94, 0x40},
{0x100, 0x9},
{0x10C, 0x3},
{0x10F, 0x60},
{0x185, 0xA},
{0x191, 0x27},
{0x1E0, 0x9},
{0x200, 0x1},
{0x202, 0x7},
{0xE0, 0x20},
{0x210, 0x29},
{0x250, 0x34},
{0x290, 0x1},
{0x292, 0x2},
{0x2A1, 0x1},
{0x2A5, 0x1},
{0x2A8, 0x2},
{0x2AC, 0x3},
{0x2CA, 0x1},
{0x2CC, 0x1},
{0x2CE, 0x1},
{0x300, 0x9},
{0x30C, 0x1},
{0x312, 0x1},
{0x316, 0x2},
{0x343, 0x2},
{0x349, 0x3},
{0x34C, 0x2},
{0x351, 0x1},
{0x37E, 0x6},
{0x2B4, 0x3},
{0x2B8, 0x3},
{0x2BC, 0x3},
{0x2C0, 0x3},
{0x2C8, 0x1},
{0x29B, 0x1},
{0x8C, 0x1},
{0xD5, 0x1},
{0x284, 0xC}
};
// {regOffset, regCount} ranges sent with IT_LOAD_RESOURCE; offsets are relative to the base noted below
GX2RegLoadPktEntry_t resourceReg_loadPktEntries[9] = // base: 0xE000
{
{0, 0x70}, // ps tex
{0x380, 0x70},
{0x460, 0x70}, // vs tex
{0x7E0, 0x70},
{0x8B9, 0x7},
{0x8C0, 0x70},
{0x930, 0x70}, // gs tex
{0xCB0, 0x70},
{0xD89, 0x7}
};
// Shadow memory for all register areas. Each array is sized with
// _GX2_CALC_SHADOWMEM_NUM_U32, i.e. the byte size of every area is a multiple of 0x100.
typedef struct
{
// Hardware view of context state (register areas)
uint32 areaConfigReg[_GX2_CALC_SHADOWMEM_NUM_U32(GPU7_REG_AREA_SIZE_CONFIG_REG)];
uint32 areaContextReg[_GX2_CALC_SHADOWMEM_NUM_U32(GPU7_REG_AREA_SIZE_CONTEXT_REG)];
uint32 areaALUConst[_GX2_CALC_SHADOWMEM_NUM_U32(GPU7_REG_AREA_SIZE_ALU_CONST)];
uint32 areaLoopConst[_GX2_CALC_SHADOWMEM_NUM_U32(GPU7_REG_AREA_SIZE_LOOP_CONST)];
uint32 areaResource[_GX2_CALC_SHADOWMEM_NUM_U32(GPU7_REG_AREA_SIZE_RESOURCE)];
uint32 areaSampler[_GX2_CALC_SHADOWMEM_NUM_U32(GPU7_REG_AREA_SIZE_SAMPLER)];
}GX2HwContextState_t;
// Complete context state object: the register shadow areas followed by bookkeeping fields
// and the cached load display list. The layout is fixed; the static_asserts below verify
// the offset of loadDL_size and the total size.
typedef struct
{
GX2HwContextState_t hwContext; // register shadow areas
uint32 enableProfling; // NOTE(review): field name is misspelled ("Profling"); left as is since other code may reference it
/* + 0x9804 */ uint32be loadDL_size;
uint8 ukn9808[0x3FC-4]; // unknown
uint8 ukn9C00[0x200]; // unknown
/* +0x9E00 */ uint8 loadDL_buffer[0x300]; // this displaylist caches the IT_LOAD_* commands for swapping out context
}GX2ContextState_t;
static_assert(offsetof(GX2ContextState_t, loadDL_size) == 0x9804);
static_assert(sizeof(GX2ContextState_t) == 0xA100);
// Returns the shadow memory size in bytes for regCount 32 bit registers, rounded up to the
// next multiple of 0x100.
uint32 _GX2Context_CalcShadowMemSize(uint32 regCount)
{
	const uint32 sizeInBytes = regCount * 4;
	return (sizeInBytes + 0xFF) & ~0xFF;
}
// Returns the total shadow memory size in bytes of all six register areas
// (config, context, ALU const, loop const, resource, sampler).
uint32 _GX2Context_CalcStateSize()
{
	const uint32 areaRegCounts[] = {
		GPU7_REG_AREA_SIZE_CONFIG_REG,
		GPU7_REG_AREA_SIZE_CONTEXT_REG,
		GPU7_REG_AREA_SIZE_ALU_CONST,
		GPU7_REG_AREA_SIZE_LOOP_CONST,
		GPU7_REG_AREA_SIZE_RESOURCE,
		GPU7_REG_AREA_SIZE_SAMPLER
	};
	uint32 totalSize = 0;
	for (uint32 regCount : areaRegCounts)
		totalSize += _GX2Context_CalcShadowMemSize(regCount);
	return totalSize;
}
void _GX2Context_CreateLoadDL()
{
GX2ReserveCmdSpace(3);
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_CONTEXT_CONTROL, 2));
gx2WriteGather_submitU32AsBE(0x80000077);
gx2WriteGather_submitU32AsBE(0x80000077);
}
void _GX2Context_WriteCmdDisableStateShadowing()
{
GX2ReserveCmdSpace(3);
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_CONTEXT_CONTROL, 2));
gx2WriteGather_submitU32AsBE(0x80000000);
gx2WriteGather_submitU32AsBE(0x80000000);
}
void _GX2Context_cmdLoad(void* gx2ukn, uint32 pm4Header, MPTR physAddrRegArea, uint32 waitForIdle, uint32 numRegOffsetEntries, GX2RegLoadPktEntry_t* regOffsetEntries)
{
GX2ReserveCmdSpace(3 + numRegOffsetEntries*2);
gx2WriteGather_submitU32AsBE(pm4Header);
gx2WriteGather_submitU32AsBE(physAddrRegArea);
gx2WriteGather_submitU32AsBE(waitForIdle);
for(uint32 i=0; i<numRegOffsetEntries; i++)
{
gx2WriteGather_submitU32AsBE(regOffsetEntries[i].regOffset); // regOffset
gx2WriteGather_submitU32AsBE(regOffsetEntries[i].regCount); // regCount
}
}
// Convenience wrapper around _GX2Context_cmdLoad(): builds the PM4 header for __pm4Command with
// a data word count of 2 + 2 * (number of table entries), translates the host pointer __regArea
// to a physical address and passes the table together with its element count.
// __regLoadPktEntries must be an array (not a pointer) because its size is taken with sizeof.
// The __gx2State argument is not used by the expansion.
#define __cmdStateLoad(__gx2State, __pm4Command, __regArea, __waitForIdle, __regLoadPktEntries) _GX2Context_cmdLoad(NULL, pm4HeaderType3(__pm4Command, 2+sizeof(__regLoadPktEntries)/sizeof(__regLoadPktEntries[0])*2), memory_virtualToPhysical(memory_getVirtualOffsetFromPointer(__regArea)), __waitForIdle, sizeof(__regLoadPktEntries)/sizeof(__regLoadPktEntries[0]), __regLoadPktEntries)
// Emits the commands that reload all register areas from the given context state:
// a wrap-around check, an IT_CONTEXT_CONTROL packet and one IT_LOAD_* packet per register
// area (config, context, ALU const, loop const, resource, sampler). Only the config register
// load passes 0x80000000 as the waitForIdle word, all others pass 0.
// The ukn parameter is not used.
void _GX2Context_WriteCmdRestoreState(GX2ContextState_t* gx2ContextState, uint32 ukn)
{
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
// NOTE(review): physAddrContextState is computed but never used in this function
MPTR physAddrContextState = memory_virtualToPhysical(memory_getVirtualOffsetFromPointer(gx2ContextState));
_GX2Context_CreateLoadDL();
__cmdStateLoad(NULL, IT_LOAD_CONFIG_REG, gx2ContextState->hwContext.areaConfigReg, 0x80000000, configReg_loadPktEntries);
__cmdStateLoad(NULL, IT_LOAD_CONTEXT_REG, gx2ContextState->hwContext.areaContextReg, 0, contextReg_loadPktEntries);
__cmdStateLoad(NULL, IT_LOAD_ALU_CONST, gx2ContextState->hwContext.areaALUConst, 0, aluConst_loadPktEntries);
__cmdStateLoad(NULL, IT_LOAD_LOOP_CONST, gx2ContextState->hwContext.areaLoopConst, 0, loopConst_loadPktEntries);
__cmdStateLoad(NULL, IT_LOAD_RESOURCE, gx2ContextState->hwContext.areaResource, 0, resourceReg_loadPktEntries);
__cmdStateLoad(NULL, IT_LOAD_SAMPLER, gx2ContextState->hwContext.areaSampler, 0, samplerReg_loadPktEntries);
}
void GX2SetDefaultState()
{
GX2ReserveCmdSpace(0x100);
Latte::LATTE_PA_CL_VTE_CNTL reg{};
reg.set_VPORT_X_OFFSET_ENA(true).set_VPORT_X_SCALE_ENA(true);
reg.set_VPORT_Y_OFFSET_ENA(true).set_VPORT_Y_SCALE_ENA(true);
reg.set_VPORT_Z_OFFSET_ENA(true).set_VPORT_Z_SCALE_ENA(true);
reg.set_VTX_W0_FMT(true);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::PA_CL_VTE_CNTL - 0xA000,
reg);
uint32 stencilTestEnable = GX2_FALSE;
uint32 backStencilEnable = GX2_FALSE;
uint32 frontStencilFunc = 0;
uint32 frontStencilZPass = 0;
uint32 frontStencilZFail = 0;
uint32 frontStencilFail = 0;
uint32 backStencilFunc = 0;
uint32 backStencilZPass = 0;
uint32 backStencilZFail = 0;
uint32 backStencilFail = 0;
uint32 depthControlReg = 0;
// depth stuff
depthControlReg |= (1<<1);
depthControlReg |= (1<<2);
depthControlReg |= ((1&7)<<4);
// stencil stuff
depthControlReg |= ((stencilTestEnable&1)<<0);
depthControlReg |= ((backStencilEnable&1)<<7);
depthControlReg |= ((frontStencilFunc&7)<<8);
depthControlReg |= ((frontStencilZPass&7)<<14);
depthControlReg |= ((frontStencilZFail&7)<<17);
depthControlReg |= ((frontStencilFail&7)<<11);
depthControlReg |= ((backStencilFunc&7)<<20);
depthControlReg |= ((backStencilZPass&7)<<26);
depthControlReg |= ((backStencilZFail&3)<<29);
depthControlReg |= ((backStencilFail&7)<<23);
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+1));
gx2WriteGather_submitU32AsBE(Latte::REGADDR::DB_DEPTH_CONTROL-0xA000);
gx2WriteGather_submitU32AsBE(depthControlReg);
GX2::GX2SetAlphaTest(GX2_DISABLE, GX2::GX2_ALPHAFUNC::LESS, 0.0f);
GX2::GX2SetPolygonControl(Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE::CCW, GX2_DISABLE, GX2_DISABLE, Latte::LATTE_PA_SU_SC_MODE_CNTL::E_POLYGONMODE::UKN0, Latte::LATTE_PA_SU_SC_MODE_CNTL::E_PTYPE::TRIANGLES, Latte::LATTE_PA_SU_SC_MODE_CNTL::E_PTYPE::TRIANGLES, GX2_DISABLE, GX2_DISABLE, GX2_DISABLE);
GX2::GX2SetPolygonOffset(0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
GX2::GX2SetPrimitiveRestartIndex(0xffffffff);
GX2::GX2SetTargetChannelMasks(0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF);
GX2::GX2SetBlendConstantColor(0.0f, 0.0f, 0.0f, 0.0f);
GX2::GX2SetPointSize(1.0f, 1.0f);
GX2::GX2SetPointLimits(1.0f, 1.0f);
GX2::GX2SetColorControl(GX2::GX2_LOGICOP::COPY, GX2_DISABLE, GX2_DISABLE, GX2_ENABLE);
GX2::GX2SetRasterizerClipControlEx(true, true, false);
// Set clear depth to 1.0 (workaround for Darksiders 2. Investigate whether actual GX2 driver also sets this)
float depth = 1.0;
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+1));
gx2WriteGather_submitU32AsBE(mmDB_DEPTH_CLEAR - 0xA000);
gx2WriteGather_submitU32AsBE(*(uint32*)&depth); // depth (as float)
// reset HLE special states
for (sint32 i = 0; i <= GX2_SPECIAL_STATE_COUNT; i++)
{
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_SPECIAL_STATE, 2));
gx2WriteGather_submitU32AsBE(i); // state id
gx2WriteGather_submitU32AsBE(0); // disable
}
}
// Export handler for GX2SetDefaultState: logs the call, emits the default state
// commands and returns 0 to the guest.
void gx2Export_GX2SetDefaultState(PPCInterpreter_t* hCPU)
{
gx2Log_printf("GX2SetDefaultState()");
GX2SetDefaultState();
osLib_returnFromFunction(hCPU, 0);
}
// Records the restore-state commands for gx2ContextState into the display list buffer
// embedded in the context state and stores the resulting size in loadDL_size.
void _GX2ContextCreateRestoreStateDL(GX2ContextState_t* gx2ContextState)
{
	// recording must not start while another display list is being written
	if (GX2::GX2WriteGather_isDisplayListActive())
		assert_dbg();

	void* const restoreDLBuffer = (void*)gx2ContextState->loadDL_buffer;
	GX2::GX2BeginDisplayList(restoreDLBuffer, sizeof(gx2ContextState->loadDL_buffer));
	_GX2Context_WriteCmdRestoreState(gx2ContextState, 0);
	const uint32 recordedSize = GX2::GX2EndDisplayList(restoreDLBuffer);

	gx2ContextState->loadDL_size = recordedSize;
}
// Export handler for GX2SetupContextStateEx(contextState, profilingFlag):
// zero-initializes the guest context state, makes it the current one, applies the
// default state and pre-records its restore display list. Returns 0 to the guest.
void gx2Export_GX2SetupContextStateEx(PPCInterpreter_t* hCPU)
{
	const auto contextStateMPTR = hCPU->gpr[3];
	gx2Log_printf("GX2SetupContextStateEx(0x%08x)\n", contextStateMPTR);
	cemu_assert_debug(hCPU->gpr[4] == 0 || hCPU->gpr[4] == 1);

	GX2ContextState_t* gx2ContextState = (GX2ContextState_t*)memory_getPointerFromVirtualOffset(contextStateMPTR);

	// layout sanity checks
	const uint32 computedHwContextSize = _GX2Context_CalcStateSize();
	if (computedHwContextSize != sizeof(GX2HwContextState_t))
		assert_dbg();
	if (sizeof(GX2HwContextState_t) != 0x9800)
		assert_dbg(); // GX2 HW context size mismatch
	if (sizeof(GX2ContextState_t) != 0xA100)
		assert_dbg(); // GX2 context size mismatch

	// start from a fully cleared state; only the profiling flag is carried over
	memset(gx2ContextState, 0x00, sizeof(GX2ContextState_t));
	gx2ContextState->enableProfling = _swapEndianU32(hCPU->gpr[4]&1);

	_GX2Context_WriteCmdRestoreState(gx2ContextState, 1);
	gx2CurrentContextStateMPTR = contextStateMPTR;

	_GX2Context_CreateLoadDL();
	GX2SetDefaultState();
	_GX2ContextCreateRestoreStateDL(gx2ContextState);

	osLib_returnFromFunction(hCPU, 0);
}
// Export handler for GX2SetContextState(contextState).
// A null context disables state shadowing. Otherwise the given context becomes current,
// either by replaying its pre-recorded restore display list or, if none was recorded,
// by emitting the restore commands directly.
void gx2Export_GX2SetContextState(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2SetContextState(0x%08x)\n", hCPU->gpr[3]);
	const auto requestedStateMPTR = hCPU->gpr[3];

	if (requestedStateMPTR == MPTR_NULL)
	{
		// disable state shadowing
		_GX2Context_WriteCmdDisableStateShadowing();
		osLib_returnFromFunction(hCPU, 0);
		return;
	}

	// workaround for a bug in Binding of Isaac to avoid flicker
	const bool boiWorkaround = (CafeSystem::GetRPXHashBase() == 0x6BCD618E);
	if (!boiWorkaround)
	{
		GX2ContextState_t* contextState = (GX2ContextState_t*)memory_getPointerFromVirtualOffset(requestedStateMPTR);
		const bool hasRecordedRestoreDL = !(contextState->loadDL_size == 0);
		_GX2Context_CreateLoadDL();
		if (hasRecordedRestoreDL)
			GX2::GX2CallDisplayList(memory_getVirtualOffsetFromPointer(contextState->loadDL_buffer), contextState->loadDL_size);
		else
			_GX2Context_WriteCmdRestoreState(contextState, 0);
		// set new context state
		gx2CurrentContextStateMPTR = requestedStateMPTR;
	}
	else
	{
		// dont reload same state
		const bool stateChanged = (requestedStateMPTR != gx2CurrentContextStateMPTR);
		if (stateChanged)
		{
			GX2ContextState_t* contextState = (GX2ContextState_t*)memory_getPointerFromVirtualOffset(requestedStateMPTR);
			_GX2Context_WriteCmdRestoreState(contextState, 0);
		}
		// even if it's the same context, make sure state shadowing is enabled.
		_GX2Context_CreateLoadDL();
		if (stateChanged)
			gx2CurrentContextStateMPTR = requestedStateMPTR;
	}

	// todo: Save/restore GX2 special state as well -> handle this by correctly emulating the state
	osLib_returnFromFunction(hCPU, 0);
}
// Export handler for GX2GetContextStateDisplayList(contextState, ptrOut, sizeOut):
// writes the guest address of the context state's restore display list buffer and its
// recorded size to the two big-endian out parameters, then returns 0.
void gx2Export_GX2GetContextStateDisplayList(PPCInterpreter_t* hCPU)
{
gx2Log_printf("GX2GetContextStateDisplayList(0x%08x, 0x%08x, 0x%08x)\n", hCPU->gpr[3], hCPU->gpr[4], hCPU->gpr[5]);
// the ppcDefineParam* macros declare the named locals from guest arguments 0..2
ppcDefineParamStructPtr(gx2ContextState, GX2ContextState_t, 0);
ppcDefineParamU32BEPtr(displayListPtrOut, 1);
ppcDefineParamU32BEPtr(displayListSizeOut, 2);
*displayListPtrOut = memory_getVirtualOffsetFromPointer(gx2ContextState->loadDL_buffer);
*displayListSizeOut = gx2ContextState->loadDL_size;
osLib_returnFromFunction(hCPU, 0);
}

View file

@ -0,0 +1,231 @@
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "GX2.h"
#include "Cafe/HW/Latte/Core/Latte.h"
#include "Cafe/HW/Latte/Core/LatteDraw.h"
#include "Cafe/HW/Latte/ISA/LatteReg.h"
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "Cafe/OS/common/OSCommon.h"
#include "GX2_Command.h"
#include "GX2_Draw.h"
namespace GX2
{
// Binds a vertex attribute buffer by emitting a single IT_SET_RESOURCE packet
// (header + 8 payload words = the 9 reserved words).
// bufferIndex selects the resource slot, sizeInBytes/stride describe the buffer and
// data is the host pointer to guest memory, which is converted to a physical address.
void GX2SetAttribBuffer(uint32 bufferIndex, uint32 sizeInBytes, uint32 stride, void* data)
{
GX2ReserveCmdSpace(9);
MPTR physicalAddress = memory_virtualToPhysical(memory_getVirtualOffsetFromPointer(data));
// write PM4 command
gx2WriteGather_submit(
pm4HeaderType3(IT_SET_RESOURCE, 8),
// resource offset: base 0x8C0, advancing 7 words per buffer index
0x8C0 + bufferIndex * 7,
physicalAddress,
sizeInBytes - 1, // size
(stride & 0xFFFF) << 11, // stride
0, // ukn
0, // ukn
0, // ukn
0xC0000000); // ukn
}
// Emits an indexed draw whose indices are fetched from memory at indexData.
// Five packets are written in one submit (3 + 3 + 2 + 2 + 6 words): base vertex,
// primitive type, index type, instance count and the IT_DRAW_INDEX_2 draw itself.
void GX2DrawIndexedEx(GX2PrimitiveMode2 primitiveMode, uint32 count, GX2IndexType indexType, void* indexData, uint32 baseVertex, uint32 numInstances)
{
GX2ReserveCmdSpace(3 + 3 + 2 + 2 + 6);
gx2WriteGather_submit(
// IT_SET_CTL_CONST
pm4HeaderType3(IT_SET_CTL_CONST, 2), 0,
baseVertex,
// IT_SET_CONFIG_REG
pm4HeaderType3(IT_SET_CONFIG_REG, 2), Latte::REGADDR::VGT_PRIMITIVE_TYPE - 0x2000,
(uint32)primitiveMode,
// IT_INDEX_TYPE
pm4HeaderType3(IT_INDEX_TYPE, 1),
(uint32)indexType,
// IT_NUM_INSTANCES
pm4HeaderType3(IT_NUM_INSTANCES, 1),
numInstances,
// IT_DRAW_INDEX_2
pm4HeaderType3(IT_DRAW_INDEX_2, 5) | 0x00000001,
-1,
// physical address of the index data
memory_virtualToPhysical(memory_getVirtualOffsetFromPointer(indexData)),
0,
count,
0);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
// Same as GX2DrawIndexedEx but with an explicit base instance.
// The base instance is set through control constant 1 before the draw and reset to 0
// afterwards, so seven packets are written in total.
void GX2DrawIndexedEx2(GX2PrimitiveMode2 primitiveMode, uint32 count, GX2IndexType indexType, void* indexData, uint32 baseVertex, uint32 numInstances, uint32 baseInstance)
{
	// base vertex (3) + base instance (3) + primitive type (3) + index type (2)
	// + instance count (2) + draw (6) + base instance reset (3) = 22 words.
	// The previous reservation (16 words) did not cover the two base instance packets.
	GX2ReserveCmdSpace(3 + 3 + 3 + 2 + 2 + 6 + 3);
	gx2WriteGather_submit(
		// IT_SET_CTL_CONST
		pm4HeaderType3(IT_SET_CTL_CONST, 2), 0,
		baseVertex,
		// set base instance
		pm4HeaderType3(IT_SET_CTL_CONST, 2), 1,
		baseInstance,
		// IT_SET_CONFIG_REG
		pm4HeaderType3(IT_SET_CONFIG_REG, 2), Latte::REGADDR::VGT_PRIMITIVE_TYPE - 0x2000,
		(uint32)primitiveMode,
		// IT_INDEX_TYPE
		pm4HeaderType3(IT_INDEX_TYPE, 1),
		(uint32)indexType,
		// IT_NUM_INSTANCES
		pm4HeaderType3(IT_NUM_INSTANCES, 1),
		numInstances,
		// IT_DRAW_INDEX_2
		pm4HeaderType3(IT_DRAW_INDEX_2, 5) | 0x00000001,
		-1,
		memory_virtualToPhysical(memory_getVirtualOffsetFromPointer(indexData)),
		0,
		count,
		0,
		// reset base instance
		pm4HeaderType3(IT_SET_CTL_CONST, 2), 1,
		0 // baseInstance
	);
	GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
// Emits a non-indexed draw of count vertices. The packets mirror GX2DrawIndexedEx,
// except that the index type is fixed to U32_BE and the draw uses IT_DRAW_INDEX_AUTO.
// 13 words are written; the reservation of 16 is larger than needed.
void GX2DrawEx(GX2PrimitiveMode2 primitiveMode, uint32 count, uint32 baseVertex, uint32 numInstances)
{
GX2ReserveCmdSpace(3 + 3 + 2 + 2 + 6);
gx2WriteGather_submit(
// IT_SET_CTL_CONST
pm4HeaderType3(IT_SET_CTL_CONST, 2), 0,
baseVertex,
// IT_SET_CONFIG_REG
pm4HeaderType3(IT_SET_CONFIG_REG, 2), Latte::REGADDR::VGT_PRIMITIVE_TYPE - 0x2000,
(uint32)primitiveMode,
// IT_INDEX_TYPE
pm4HeaderType3(IT_INDEX_TYPE, 1),
(uint32)GX2IndexType::U32_BE,
// IT_NUM_INSTANCES
pm4HeaderType3(IT_NUM_INSTANCES, 1),
numInstances,
// IT_DRAW_INDEX_AUTO
pm4HeaderType3(IT_DRAW_INDEX_AUTO, 2) | 0x00000001,
count,
0 // DRAW_INITIATOR
);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
// Emits an indexed draw with the indices embedded directly in the command stream
// (IT_DRAW_INDEX_IMMD). 16-bit indices are packed two per command word with the halves
// of each word swapped; 32-bit indices are copied one per word.
// Draws with an unknown index type or with more index words than a PM4 packet can hold
// are rejected without writing anything.
void GX2DrawIndexedImmediateEx(GX2PrimitiveMode2 primitiveMode, uint32 count, GX2IndexType indexType, void* indexData, uint32 baseVertex, uint32 numInstances)
{
	uint32* indexDataU32 = (uint32*)indexData;
	uint32 numIndexU32s = 0;
	bool use32BitIndices = false;
	if (indexType == GX2IndexType::U16_BE || indexType == GX2IndexType::U16_LE)
	{
		// 16bit indices: two per word, rounded up (written so count == 0xFFFFFFFF cannot wrap to 0)
		numIndexU32s = count / 2 + (count & 1);
	}
	else if (indexType == GX2IndexType::U32_BE || indexType == GX2IndexType::U32_LE)
	{
		// 32bit indices
		numIndexU32s = count;
		use32BitIndices = true;
	}
	else
	{
		cemu_assert_unimplemented();
		// the word count is unknown for this index type, so no valid packet can be built
		return;
	}
	// validate before reserving so an oversized draw never requests command space
	if (numIndexU32s > 0x4000 - 2)
	{
		cemuLog_log(LogType::Force, "GX2DrawIndexedImmediateEx(): Draw exceeds maximum PM4 command size. Keep index size below 16KiB minus 8 byte");
		return;
	}
	GX2ReserveCmdSpace(3 + 3 + 3 + 2 + 2 + 6 + 3 + numIndexU32s);
	// set base vertex
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CTL_CONST, 2));
	gx2WriteGather_submitU32AsBE(0);
	gx2WriteGather_submitU32AsBE(baseVertex);
	// set primitive mode
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONFIG_REG, 2));
	gx2WriteGather_submitU32AsBE(Latte::REGADDR::VGT_PRIMITIVE_TYPE - 0x2000);
	gx2WriteGather_submitU32AsBE((uint32)primitiveMode);
	// set index type
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_INDEX_TYPE, 1));
	gx2WriteGather_submitU32AsBE((uint32)indexType);
	// set number of instances
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_NUM_INSTANCES, 1));
	gx2WriteGather_submitU32AsBE((uint32)numInstances);
	// request indexed draw with indices embedded into command buffer
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_DRAW_INDEX_IMMD, 2 + numIndexU32s) | 0x00000001);
	gx2WriteGather_submitU32AsBE(count);
	gx2WriteGather_submitU32AsBE(0); // ukn
	if (use32BitIndices)
	{
		for (uint32 i = 0; i < numIndexU32s; i++)
		{
			gx2WriteGather_submitU32AsLE(indexDataU32[i]);
		}
	}
	else
	{
		// NOTE(review): for an odd count the last word also reads the 16 bits following the final index
		for (uint32 i = 0; i < numIndexU32s; i++)
		{
			uint32 indexPair = indexDataU32[i];
			// swap index pair
			indexPair = (indexPair >> 16) | (indexPair << 16);
			gx2WriteGather_submitU32AsLE(indexPair);
		}
	}
	GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
// Guest-memory parameter block for GX2DispatchCompute: three big-endian work sizes.
// GX2DispatchCompute multiplies them to obtain the total work count.
struct GX2DispatchComputeParam
{
/* +0x00 */ uint32be worksizeX;
/* +0x04 */ uint32be worksizeY;
/* +0x08 */ uint32be worksizeZ;
};
// Dispatches a compute workload. The dispatch parameters are bound as a resource
// (IT_SET_RESOURCE pointing at dispatchParam) and the work is then issued as an
// IT_DRAW_INDEX_AUTO with worksizeX * worksizeY * worksizeZ as its count.
// 17 words are written; 19 are reserved.
void GX2DispatchCompute(GX2DispatchComputeParam* dispatchParam)
{
GX2ReserveCmdSpace(9 + 10);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_RESOURCE, 8),
(mmSQ_CS_DISPATCH_PARAMS - mmSQ_TEX_RESOURCE_WORD0),
// physical address of the parameter block
memory_virtualToPhysical(MEMPTR<GX2DispatchComputeParam>(dispatchParam).GetMPTR()),
0xF,
0x862000,
1,
0xABCD1234,
0xABCD1234,
0xC0000000);
// IT_EVENT_WRITE with RST_VTX_CNT?
// set primitive mode
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONFIG_REG, 2));
gx2WriteGather_submitU32AsBE(Latte::REGADDR::VGT_PRIMITIVE_TYPE - 0x2000);
gx2WriteGather_submitU32AsBE(1); // mode
// set number of instances
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_NUM_INSTANCES, 1));
gx2WriteGather_submitU32AsBE(1); // numInstances
// total number of work items (32-bit product of the three sizes)
uint32 workCount = (uint32)dispatchParam->worksizeX * (uint32)dispatchParam->worksizeY * (uint32)dispatchParam->worksizeZ;
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_DRAW_INDEX_AUTO, 2) | 0x00000001);
gx2WriteGather_submitU32AsBE(workCount);
gx2WriteGather_submitU32AsBE(0); // DRAW_INITIATOR (has source select for index generator + other unknown info)
}
// Registers the draw-related functions of this file as exports of the "gx2" library,
// logged under LogType::GX2.
void GX2DrawInit()
{
cafeExportRegister("gx2", GX2SetAttribBuffer, LogType::GX2);
cafeExportRegister("gx2", GX2DrawIndexedEx, LogType::GX2);
cafeExportRegister("gx2", GX2DrawIndexedEx2, LogType::GX2);
cafeExportRegister("gx2", GX2DrawEx, LogType::GX2);
cafeExportRegister("gx2", GX2DrawIndexedImmediateEx, LogType::GX2);
cafeExportRegister("gx2", GX2DispatchCompute, LogType::GX2);
}
}

View file

@ -0,0 +1,13 @@
#pragma once
#include "Cafe/HW/Latte/ISA/LatteReg.h"
// Public interface of the GX2 draw module.
namespace GX2
{
// index and primitive types are taken directly from the Latte register definitions
using GX2IndexType = Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE;
using GX2PrimitiveMode2 = Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE;
// binds a vertex attribute buffer to the given buffer index
void GX2SetAttribBuffer(uint32 bufferIndex, uint32 sizeInBytes, uint32 stride, void* data);
// issues an indexed draw with indices read from indexData
void GX2DrawIndexedEx(GX2PrimitiveMode2 primitiveMode, uint32 count, GX2IndexType indexType, void* indexData, uint32 baseVertex, uint32 numInstances);
// registers the draw exports
void GX2DrawInit();
}

View file

@ -0,0 +1,311 @@
#include "Cafe/OS/common/OSCommon.h"
#include "GX2_Command.h"
#include "GX2_Event.h"
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "Cafe/HW/MMU/MMU.h"
#include "Cafe/OS/libs/coreinit/coreinit_Thread.h"
#include "Cafe/HW/Latte/Core/Latte.h"
#include "GX2.h"
#include "Cafe/HW/Latte/Renderer/Renderer.h"
#include "config/ActiveSettings.h"
#include "util/helpers/ConcurrentQueue.h"
namespace GX2
{
// threads blocked in GX2WaitForVsync
SysAllocator<coreinit::OSThreadQueue> g_vsyncThreadQueue;
// threads blocked in GX2WaitForFlip
SysAllocator<coreinit::OSThreadQueue> g_flipThreadQueue;
// signaled whenever a new retirement timestamp is published
SysAllocator<coreinit::OSEvent> s_updateRetirementEvent;
// most recent retirement timestamp reported via __GX2NotifyNewRetirementTimestamp
std::atomic<uint64> s_lastRetirementTimestamp = 0;
// called from GPU code when a command buffer is retired
// Stores the new retirement timestamp and signals the retirement event while holding
// the scheduler lock (GX2WaitTimeStamp waits on that event under the same lock).
void __GX2NotifyNewRetirementTimestamp(uint64 tsRetire)
{
__OSLockScheduler();
s_lastRetirementTimestamp = tsRetire;
coreinit::OSSignalEventAllInternal(s_updateRetirementEvent.GetPtr());
__OSUnlockScheduler();
}
// Inserts a GPU-side wait (IT_WAIT_REG_MEM) on the memory word at fencePtr.
// compareOp (low 3 bits) is translated through a lookup table to the packet's compare
// function; mask and compareValue are passed through unchanged.
void GX2SetGPUFence(uint32be* fencePtr, uint32 mask, uint32 compareOp, uint32 compareValue)
{
	// GX2 compare op index -> compare function written into the packet
	static const uint8 kCompareFuncByOp[8] = { 0x7,0x1,0x3,0x2,0x6,0x4,0x5,0x0 };

	GX2ReserveCmdSpace(7);

	// compare operand + memory select (0x10)
	const uint32 compareWord = (uint32)(kCompareFuncByOp[compareOp & 7]) | 0x10;
	// physical address + type size flag(?)
	const uint32 fenceAddrWord = memory_virtualToPhysical(memory_getVirtualOffsetFromPointer(fencePtr)) | 2;

	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_WAIT_REG_MEM, 6));
	gx2WriteGather_submitU32AsBE(compareWord);
	gx2WriteGather_submitU32AsBE(fenceAddrWord);
	gx2WriteGather_submitU32AsBE(0); // ukn, always set to 0
	gx2WriteGather_submitU32AsBE(compareValue); // fence value
	gx2WriteGather_submitU32AsBE(mask); // fence mask
	gx2WriteGather_submitU32AsBE(0xA); // unknown purpose
}
// Pipeline position selector for GX2SubmitUserTimeStamp (when the timestamp is written).
enum class GX2PipeEventType : uint32
{
TOP = 0,
BOTTOM = 1,
BOTTOM_AFTER_FLUSH = 2
};
// Queues GPU commands that write the 64-bit value to *timestampOut at the pipeline
// position selected by eventType and, if triggerInterrupt is non-zero, additionally
// raise the matching callback.
void GX2SubmitUserTimeStamp(uint64* timestampOut, uint64 value, GX2PipeEventType eventType, uint32 triggerInterrupt)
{
	// Worst case is BOTTOM_AFTER_FLUSH with interrupt:
	// 5 (IT_MEM_WRITE) + 2 (raw event words) + 4 (IT_HLE_BOTTOM_OF_PIPE_CB) = 11 words.
	// The previous reservation of 7 only covered the TOP path.
	GX2ReserveCmdSpace(5 + 2 + 4);
	MPTR physTimestampAddr = memory_virtualToPhysical(memory_getVirtualOffsetFromPointer(timestampOut));
	uint32 valHigh = (uint32)(value >> 32);
	uint32 valLow = (uint32)(value & 0xffffffff);
	const bool wantInterrupt = (triggerInterrupt != 0);

	// plain memory write of the value (no callback)
	auto emitMemWrite = [&]()
	{
		gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_MEM_WRITE, 4));
		gx2WriteGather_submitU32AsBE(physTimestampAddr | 0x2);
		gx2WriteGather_submitU32AsBE(0); // 0x40000 -> 32bit write, 0x0 -> 64bit write?
		gx2WriteGather_submitU32AsBE(valLow); // low
		gx2WriteGather_submitU32AsBE(valHigh); // high
	};
	// HLE packet: write value and trigger the bottom of pipe callback
	auto emitBottomOfPipeCallback = [&]()
	{
		gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_BOTTOM_OF_PIPE_CB, 3));
		gx2WriteGather_submitU32AsBE(physTimestampAddr);
		gx2WriteGather_submitU32AsBE(valLow); // low
		gx2WriteGather_submitU32AsBE(valHigh); // high
	};

	if (eventType == GX2PipeEventType::TOP)
	{
		// write when on top of pipe
		emitMemWrite();
		if (wantInterrupt)
		{
			// top callback
			gx2WriteGather_submitU32AsBE(0x0000304A);
			gx2WriteGather_submitU32AsBE(0x40000000);
		}
	}
	else if (eventType == GX2PipeEventType::BOTTOM_AFTER_FLUSH)
	{
		// write when on bottom of pipe
		emitMemWrite();
		if (wantInterrupt)
		{
			// bottom callback
			// todo -> Fix this
			gx2WriteGather_submitU32AsBE(0x0000304B); // hax -> This event is handled differently and uses a different packet?
			gx2WriteGather_submitU32AsBE(0x40000000);
			// trigger bottom of pipe callback
			// used by Mario & Sonic Rio
			emitBottomOfPipeCallback();
		}
	}
	else if (eventType == GX2PipeEventType::BOTTOM)
	{
		// fix this
		// write timestamp when on bottom of pipe
		if (wantInterrupt)
		{
			// write value and trigger CB
			// todo: Use correct packet
			emitBottomOfPipeCallback();
		}
		else
		{
			// write value but don't trigger CB
			emitMemWrite();
		}
	}
	else
	{
		cemu_assert_debug(false);
	}
}
// Registered guest callback (function pointer + user data) for one event type.
// s_eventCallback holds one slot per GX2CallbackEventType value, initially empty.
struct GX2EventFunc
{
MEMPTR<void> callbackFuncPtr;
MEMPTR<void> userData;
}s_eventCallback[GX2CallbackEventTypeCount]{};
// Registers (or replaces) the guest callback and user data for eventType.
// Out-of-range event types are logged and ignored.
void GX2SetEventCallback(GX2CallbackEventType eventType, void* callbackFuncPtr, void* userData)
{
	const size_t slotIndex = (size_t)eventType;
	if (slotIndex < GX2CallbackEventTypeCount)
	{
		GX2EventFunc& slot = s_eventCallback[slotIndex];
		slot.callbackFuncPtr = callbackFuncPtr;
		slot.userData = userData;
		return;
	}
	forceLog_printf("GX2SetEventCallback(): Unknown eventType\n");
}
// Reads back the callback registered for eventType. Either out pointer may be null,
// in which case that value is not returned. Out-of-range event types are logged and
// leave both outputs untouched.
void GX2GetEventCallback(GX2CallbackEventType eventType, MEMPTR<void>* callbackFuncPtrOut, MEMPTR<void>* userDataOut)
{
	const size_t slotIndex = (size_t)eventType;
	if (slotIndex < GX2CallbackEventTypeCount)
	{
		GX2EventFunc& slot = s_eventCallback[slotIndex];
		if (callbackFuncPtrOut)
			*callbackFuncPtrOut = slot.callbackFuncPtr;
		if (userDataOut)
			*userDataOut = slot.userData;
		return;
	}
	forceLog_printf("GX2GetEventCallback(): Unknown eventType\n");
}
// event callback thread
// set once GX2Init_event has created the thread, so it is only launched a single time
bool s_callbackThreadLaunched{};
// thread object, 0x2000 byte stack and name buffer of the callback thread
SysAllocator<OSThread_t> s_eventCallbackThread;
SysAllocator<uint8, 0x2000> s_eventCallbackThreadStack;
SysAllocator<char, 64> s_eventCallbackThreadName;
// event callback queue
// One pending callback invocation. A default-constructed entry carries the
// invalid event type -1.
struct GX2EventQueueEntry
{
GX2EventQueueEntry() {};
GX2EventQueueEntry(GX2CallbackEventType eventType) : eventType(eventType) {};
GX2CallbackEventType eventType{(GX2CallbackEventType)-1};
};
// signaled once per queued entry; the callback thread waits on it
SysAllocator<coreinit::OSSemaphore> s_eventCbQueueSemaphore;
// pending callback invocations, filled by __GX2NotifyEvent and drained by __GX2CallbackThread
ConcurrentQueue<GX2EventQueueEntry> s_eventCbQueue;
// Notification entry point for GX2 events.
// Queues the guest callback for eventType (if one is registered) and wakes every thread
// waiting in GX2WaitForVsync / GX2WaitForFlip for the corresponding event.
void __GX2NotifyEvent(GX2CallbackEventType eventType)
{
	const size_t slotIndex = (size_t)eventType;
	if (slotIndex >= GX2CallbackEventTypeCount)
	{
		cemu_assert_debug(false);
		return;
	}

	// hand the event to the callback thread if a callback is registered
	if (s_eventCallback[slotIndex].callbackFuncPtr)
	{
		s_eventCbQueue.push(eventType);
		coreinit::OSSignalSemaphore(s_eventCbQueueSemaphore);
	}

	// wake up threads that are waiting for VSYNC or FLIP event
	switch (eventType)
	{
	case GX2CallbackEventType::VSYNC:
		__OSLockScheduler();
		g_vsyncThreadQueue->wakeupEntireWaitQueue(false);
		__OSUnlockScheduler();
		break;
	case GX2CallbackEventType::FLIP:
		__OSLockScheduler();
		g_flipThreadQueue->wakeupEntireWaitQueue(false);
		__OSUnlockScheduler();
		break;
	default:
		break;
	}
}
// Entry point of the event callback thread.
// Each semaphore signal corresponds to one queued event; the entry is taken from the
// queue and the guest callback registered for its event type is invoked with
// (eventType, userData). Entries without a registered callback are dropped.
void __GX2CallbackThread(PPCInterpreter_t* hCPU)
{
	while (coreinit::OSWaitSemaphore(s_eventCbQueueSemaphore))
	{
		GX2EventQueueEntry pendingEvent;
		const bool gotEntry = s_eventCbQueue.peek2(pendingEvent);
		if (gotEntry)
		{
			GX2EventFunc& registered = s_eventCallback[(size_t)pendingEvent.eventType];
			// the callback may have been cleared since the event was queued
			if (registered.callbackFuncPtr)
				PPCCoreCallback(registered.callbackFuncPtr, (sint32)pendingEvent.eventType, registered.userData);
		}
	}
	osLib_returnFromFunction(hCPU, 0);
}
// Returns the timestamp of the most recently submitted command buffer, as tracked in LatteGPUState.
uint64 GX2GetLastSubmittedTimeStamp()
{
return LatteGPUState.lastSubmittedCommandBufferTimestamp.load();
}
// Returns the latest retirement timestamp published by __GX2NotifyNewRetirementTimestamp.
uint64 GX2GetRetiredTimeStamp()
{
return s_lastRetirementTimestamp;
}
// Blocks the calling thread on the vsync wait queue; __GX2NotifyEvent wakes the queue
// on the next VSYNC event.
void GX2WaitForVsync()
{
__OSLockScheduler();
g_vsyncThreadQueue.GetPtr()->queueAndWait(coreinit::OSGetCurrentThread());
__OSUnlockScheduler();
}
// Blocks the calling thread until the next FLIP event, unless every requested flip has
// already been executed (request and execute counters are equal).
void GX2WaitForFlip()
{
	const auto flipsRequested = _swapEndianU32(LatteGPUState.sharedArea->flipRequestCountBE);
	const auto flipsExecuted = _swapEndianU32(LatteGPUState.sharedArea->flipExecuteCountBE);
	const bool noFlipPending = (flipsRequested == flipsExecuted);
	if (noFlipPending)
		return; // dont wait if no flip is requested

	__OSLockScheduler();
	g_flipThreadQueue.GetPtr()->queueAndWait(coreinit::OSGetCurrentThread());
	__OSUnlockScheduler();
}
// Blocks until the retirement timestamp has reached tsWait.
// Always returns true (there is no timeout path).
bool GX2WaitTimeStamp(uint64 tsWait)
{
	__OSLockScheduler();
	// GPU hasn't caught up yet -> sleep until the next retirement notification
	while (s_lastRetirementTimestamp.load() < tsWait)
		coreinit::OSWaitEventInternal(s_updateRetirementEvent.GetPtr());
	__OSUnlockScheduler();

	// return true to indicate no timeout
	return true;
}
void GX2DrawDone()
{
// optional force full sync (texture readback and occlusion queries)
bool forceFullSync = false;
if (g_renderer && g_renderer->GetType() == RendererAPI::Vulkan)
forceFullSync = true;
if (forceFullSync || ActiveSettings::WaitForGX2DrawDoneEnabled())
{
GX2ReserveCmdSpace(2);
// write PM4 command
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_SYNC_ASYNC_OPERATIONS, 1));
gx2WriteGather_submitU32AsBE(0x00000000); // unused
}
// flush pipeline
if (_GX2GetUnflushedBytes(PPCInterpreter_getCoreIndex(ppcInterpreterCurrentInstance)) > 0)
_GX2SubmitToTCL();
uint64 ts = GX2GetLastSubmittedTimeStamp();
GX2WaitTimeStamp(ts);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
// Creates and starts the guest thread that executes event callbacks (__GX2CallbackThread).
// Guarded by s_callbackThreadLaunched so repeated calls launch the thread only once.
void GX2Init_event()
{
// clear queue
// launch event callback thread
if (s_callbackThreadLaunched)
return;
s_callbackThreadLaunched = true;
strcpy(s_eventCallbackThreadName.GetPtr(), "GX2 event callback");
// the stack argument is the end of the stack buffer (base + size); priority 16, detached
coreinit::OSCreateThreadType(s_eventCallbackThread, PPCInterpreter_makeCallableExportDepr(__GX2CallbackThread), 0, nullptr, (uint8*)s_eventCallbackThreadStack.GetPtr() + s_eventCallbackThreadStack.GetByteSize(), (sint32)s_eventCallbackThreadStack.GetByteSize(), 16, OSThread_t::ATTR_DETACHED, OSThread_t::THREAD_TYPE::TYPE_IO);
coreinit::OSSetThreadName(s_eventCallbackThread, s_eventCallbackThreadName);
coreinit::OSResumeThread(s_eventCallbackThread);
}
// Registers the event/synchronization exports of the "gx2" library and initializes the
// synchronization objects used by this file.
void GX2EventInit()
{
cafeExportRegister("gx2", GX2SetGPUFence, LogType::GX2);
cafeExportRegister("gx2", GX2SubmitUserTimeStamp, LogType::GX2);
cafeExportRegister("gx2", GX2SetEventCallback, LogType::GX2);
cafeExportRegister("gx2", GX2GetEventCallback, LogType::GX2);
cafeExportRegister("gx2", GX2GetLastSubmittedTimeStamp, LogType::GX2);
cafeExportRegister("gx2", GX2GetRetiredTimeStamp, LogType::GX2);
cafeExportRegister("gx2", GX2WaitForVsync, LogType::GX2);
cafeExportRegister("gx2", GX2WaitForFlip, LogType::GX2);
cafeExportRegister("gx2", GX2WaitTimeStamp, LogType::GX2);
cafeExportRegister("gx2", GX2DrawDone, LogType::GX2);
// wait queues for vsync / flip waiters
coreinit::OSInitThreadQueue(g_vsyncThreadQueue.GetPtr());
coreinit::OSInitThreadQueue(g_flipThreadQueue.GetPtr());
// retirement event starts unsignaled in auto mode; callback semaphore starts at 0
coreinit::OSInitEvent(s_updateRetirementEvent, coreinit::OSEvent::EVENT_STATE::STATE_NOT_SIGNALED, coreinit::OSEvent::EVENT_MODE::MODE_AUTO);
coreinit::OSInitSemaphore(s_eventCbQueueSemaphore, 0);
}
}

View file

@ -0,0 +1,26 @@
#pragma once
// Public interface of the GX2 event module.
namespace GX2
{
// registers exports and initializes synchronization objects
void GX2EventInit();
// launches the event callback thread (only once)
void GX2Init_event();
// block the calling thread until the next vsync / flip event
void GX2WaitForVsync();
void GX2WaitForFlip();
// flush pending commands and wait for the GPU to retire them
void GX2DrawDone();
// event types for which callbacks can be registered
enum class GX2CallbackEventType
{
TIMESTAMP_TOP = 0,
TIMESTAMP_BOTTOM = 1,
VSYNC = 2,
FLIP = 3,
// 4 is buffer overrun?
};
// number of callback slots; includes the unnamed type 4
inline constexpr size_t GX2CallbackEventTypeCount = 5;
// notification callbacks for GPU
void __GX2NotifyNewRetirementTimestamp(uint64 tsRetire);
void __GX2NotifyEvent(GX2CallbackEventType eventType);
}

View file

@ -0,0 +1,77 @@
#include "Cafe/OS/common/OSCommon.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "GX2.h"
#include "GX2_Resource.h"
#include "Cafe/HW/Latte/Core/Latte.h"
#include "Cafe/HW/Latte/Core/LatteDraw.h"
// default GX2 allocator (not the same as the GX2R allocator, but GX2R uses this allocator by default)
// Both hold guest function addresses; MPTR_NULL until GX2MEMAllocatorsInit or
// GX2SetDefaultAllocator assigns them.
MPTR gx2Mem_defaultAlloc = MPTR_NULL;
MPTR gx2Mem_defaultFree = MPTR_NULL;
// Replaces the default GX2 alloc/free functions with the given guest function addresses.
void gx2Memory_GX2SetDefaultAllocator(MPTR defaultAllocFunc, MPTR defaulFreeFunc)
{
gx2Mem_defaultAlloc = defaultAllocFunc;
gx2Mem_defaultFree = defaulFreeFunc;
}
// Built-in default allocator: drops the userParam argument by shifting size and
// alignment down into r3/r4 and then continues execution at MEMAllocFromDefaultHeapEx.
void _GX2DefaultAlloc_Alloc(PPCInterpreter_t* hCPU)
{
// parameters:
// r3 uint32 userParam
// r4 uint32 size
// r5 sint32 alignment
hCPU->gpr[3] = hCPU->gpr[4];
hCPU->gpr[4] = hCPU->gpr[5];
// jump into the coreinit function instead of returning
hCPU->instructionPointer = gCoreinitData->MEMAllocFromDefaultHeapEx.GetMPTR();
}
// Built-in default free: moves the second argument (r4) into r3 and continues
// execution at MEMFreeToDefaultHeap.
void _GX2DefaultAlloc_Free(PPCInterpreter_t* hCPU)
{
hCPU->gpr[3] = hCPU->gpr[4];
hCPU->instructionPointer = gCoreinitData->MEMFreeToDefaultHeap.GetMPTR();
}
// Export handler for GX2SetDefaultAllocator(allocFunc, freeFunc): installs the two
// guest function addresses as the default GX2 allocator and returns 0.
void gx2Export_GX2SetDefaultAllocator(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2SetDefaultAllocator(0x%08x, 0x%08x)\n", hCPU->gpr[3], hCPU->gpr[4]);
	// r3 = alloc function, r4 = free function
	gx2Memory_GX2SetDefaultAllocator(hCPU->gpr[3], hCPU->gpr[4]);
	osLib_returnFromFunction(hCPU, 0);
}
// Allocation trampoline installed as the GX2R allocator: leaves all argument registers
// untouched and continues execution at the current default GX2 alloc function.
void _GX2DefaultAllocR_Alloc(PPCInterpreter_t* hCPU)
{
gx2Log_printf("GX2DefaultAllocate(0x%08x, 0x%08x, 0x%08x)\n", hCPU->gpr[3], hCPU->gpr[4], hCPU->gpr[5]);
// parameters:
// r3 uint32 userParam
// r4 uint32 size
// r5 sint32 alignment
hCPU->instructionPointer = gx2Mem_defaultAlloc;
}
// Free trampoline installed as the GX2R allocator: leaves all argument registers
// untouched and continues execution at the current default GX2 free function.
void _GX2DefaultAllocR_Free(PPCInterpreter_t* hCPU)
{
gx2Log_printf("GX2DefaultFree(0x%08x, 0x%08x)\n", hCPU->gpr[3], hCPU->gpr[4]);
// parameters:
// r3 uint32 userParam
// r4 void* mem
hCPU->instructionPointer = gx2Mem_defaultFree;
}
namespace GX2
{
// Installs the built-in default allocator and points the GX2R allocator at trampolines
// that forward to whatever default allocator is currently set.
void GX2MEMAllocatorsInit()
{
// set default allocators (can be overwritten by GX2SetDefaultAllocator)
gx2Mem_defaultAlloc = PPCInterpreter_makeCallableExportDepr(_GX2DefaultAlloc_Alloc);
gx2Mem_defaultFree = PPCInterpreter_makeCallableExportDepr(_GX2DefaultAlloc_Free);
// set resource default allocator
GX2::GX2RSetAllocator(PPCInterpreter_makeCallableExportDepr(_GX2DefaultAllocR_Alloc), PPCInterpreter_makeCallableExportDepr(_GX2DefaultAllocR_Free));
}
// Registers the memory-related export (GX2SetDefaultAllocator) of the "gx2" library.
void GX2MemInit()
{
osLib_addFunction("gx2", "GX2SetDefaultAllocator", gx2Export_GX2SetDefaultAllocator);
}
};

View file

@ -0,0 +1,8 @@
#pragma once
// Public interface of the GX2 memory module.
namespace GX2
{
// installs the default GX2 and GX2R allocators
void GX2MEMAllocatorsInit();
// registers the memory exports
void GX2MemInit();
};

View file

@ -0,0 +1,255 @@
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "Cafe/OS/common/OSCommon.h"
#include "GX2.h"
#include "config/CemuConfig.h"
#include "Cafe/OS/libs/coreinit/coreinit_Time.h"
#include "config/ActiveSettings.h"
#include "Cafe/HW/Latte/Renderer/Renderer.h"
#include "Cafe/HW/Latte/Core/LatteBufferCache.h"
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "GX2_Command.h"
#include "GX2_Event.h"
#include "GX2_Misc.h"
#include "GX2_Memory.h"
#include "GX2_Texture.h"
// Export handler for GX2SetSwapInterval(interval): stores the interval in the GPU
// shared area. Values of 20 or more are rejected with a log message. Returns 0.
void gx2Export_GX2SetSwapInterval(PPCInterpreter_t* hCPU)
{
	const auto requestedInterval = hCPU->gpr[3];
	gx2Log_printf("GX2SetSwapInterval(%d)\n", requestedInterval);

	const bool inRange = (requestedInterval < 20);
	if (inRange)
		LatteGPUState.sharedArea->swapInterval = requestedInterval;
	else
		forceLog_printf("GX2SetSwapInterval() called with out of range value (%d)\n", requestedInterval);

	osLib_returnFromFunction(hCPU, 0);
}
// Export handler for GX2GetSwapInterval: returns the swap interval stored in the GPU shared area.
void gx2Export_GX2GetSwapInterval(PPCInterpreter_t* hCPU)
{
gx2Log_printf("GX2GetSwapInterval()\n");
osLib_returnFromFunction(hCPU, LatteGPUState.sharedArea->swapInterval);
}
// time of the last GX2SwapScanBuffers call, defined in another translation unit
extern uint64 lastSwapTime;
// Export handler for GX2GetSwapStatus(r3..r6 = four out pointers): writes the flip
// request count, the flip execute count and, to both 64-bit outputs, lastSwapTime.
void gx2Export_GX2GetSwapStatus(PPCInterpreter_t* hCPU)
{
memory_writeU32(hCPU->gpr[3], _swapEndianU32(LatteGPUState.sharedArea->flipRequestCountBE));
memory_writeU32(hCPU->gpr[4], _swapEndianU32(LatteGPUState.sharedArea->flipExecuteCountBE));
// both time outputs receive the same value
memory_writeU64Slow(hCPU->gpr[5], lastSwapTime);
memory_writeU64Slow(hCPU->gpr[6], lastSwapTime);
osLib_returnFromFunction(hCPU, 0);
}
// Export handler for GX2GetGPUTimeout: always returns the constant 0x3E8 (1000).
void gx2Export_GX2GetGPUTimeout(PPCInterpreter_t* hCPU)
{
gx2Log_printf("GX2GetGPUTimeout()\n");
osLib_returnFromFunction(hCPU, 0x3E8);
}
// all-ones value that GX2SampleBottomGPUCycle writes in place of a real GPU cycle sample
#define GX2_INVALID_COUNTER_VALUE_U64 0xFFFFFFFFFFFFFFFFULL
// Export handler for GX2SampleTopGPUCycle(out): writes the current coreinit timer tick
// to the 64-bit out pointer in r3 and returns 0.
void gx2Export_GX2SampleTopGPUCycle(PPCInterpreter_t* hCPU)
{
gx2Log_printf("GX2SampleTopGPUCycle(0x%08x)\n", hCPU->gpr[3]);
memory_writeU64Slow(hCPU->gpr[3], coreinit::coreinit_getTimerTick());
osLib_returnFromFunction(hCPU, 0);
}
// Export handler for GX2SampleBottomGPUCycle(out): deliberately writes the invalid
// counter value instead of a real sample and returns 0. The code after the early
// return is the intentionally disabled real implementation.
void gx2Export_GX2SampleBottomGPUCycle(PPCInterpreter_t* hCPU)
{
gx2Log_printf("GX2SampleBottomGPUCycle(0x%08x)\n", hCPU->gpr[3]);
memory_writeU64Slow(hCPU->gpr[3], GX2_INVALID_COUNTER_VALUE_U64);
osLib_returnFromFunction(hCPU, 0);
return;
// seems like implementing this correctly causes more harm than good as games will try to dynamically scale their resolution, which our texture cache and graphic packs cant handle well. If we just never return a valid timestamp, it seems like games stop dynamically scaling resolution
// Whats a good solution here? Should we implement it correctly and instead rely on graphic pack patches to patch out the dynamic scaling?
// some known affected games: Wind Waker HD, Super Mario 3D World
// unreachable: would ask the GPU to sample its timer into the out address
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_SAMPLE_TIMER, 1));
gx2WriteGather_submitU32AsBE(hCPU->gpr[3]);
osLib_returnFromFunction(hCPU, 0);
}
namespace GX2
{
// 640x480 buffer with 4 bytes per pixel that GX2GetLastFrame hands out as the image
SysAllocator<uint8, 640 * 480 * 4, 0x1000> _lastFrame;
// index of the core on which GX2Init was called
uint32 sGX2MainCoreIndex = 0;
// defined elsewhere; run at the end of GX2Init
void _test_AddrLib();
// Initializes GX2. A repeated call while already initialized is logged and ignored.
// Records the calling core as the GX2 main core, initializes the event and write-gather
// submodules, sets up the GPU shared area and the default allocators, and finally marks
// GX2 as initialized. initSettings is not read.
void GX2Init(void* initSettings)
{
if (LatteGPUState.gx2InitCalled)
{
cemuLog_logDebug(LogType::Force, "GX2Init() called while already initialized");
return;
}
uint32 coreIndex = coreinit::OSGetCoreId();
cemuLog_log(LogType::GX2, "GX2Init() on core {} by thread 0x{:08x}", coreIndex, MEMPTR<OSThread_t>(coreinit::OSGetCurrentThread()).GetMPTR());
sGX2MainCoreIndex = coreIndex;
// init submodules
GX2::GX2Init_event();
GX2::GX2Init_writeGather();
// init shared area
// allocated only once; later initializations reuse the existing area
if (LatteGPUState.sharedAreaAddr == MPTR_NULL)
{
LatteGPUState.sharedAreaAddr = coreinit_allocFromSysArea(sizeof(gx2GPUSharedArea_t), 0x20);
LatteGPUState.sharedArea = (gx2GPUSharedArea_t*)memory_getPointerFromVirtualOffset(LatteGPUState.sharedAreaAddr);
}
// init shared variables
LatteGPUState.sharedArea->flipRequestCountBE = _swapEndianU32(0);
LatteGPUState.sharedArea->flipExecuteCountBE = _swapEndianU32(0);
LatteGPUState.sharedArea->swapInterval = 1;
// init memory handling
GX2::GX2MEMAllocatorsInit();
// let GPU know that GX2 is initialized
LatteGPUState.gx2InitCalled++;
// run tests
_test_AddrLib();
}
// Clears the initialized marker so that a following GX2Init runs again.
void _GX2DriverReset()
{
LatteGPUState.gx2InitCalled = 0;
}
// Returns the index of the core that called GX2Init, or -1 if GX2 is not initialized.
sint32 GX2GetMainCoreId(PPCInterpreter_t* hCPU)
{
if (LatteGPUState.gx2InitCalled == 0)
return -1;
return sGX2MainCoreIndex;
}
// GX2ResetGPU is implemented as a logged GX2DrawDone; the argument is ignored.
void GX2ResetGPU(uint32 ukn)
{
cemuLog_log(LogType::Force, "GX2ResetGPU()"); // always log this
GX2::GX2DrawDone();
}
// Only records whether the TV buffer's surface format has the SRGB bit set.
// The buffer pointer, size, resolution and buffer mode are currently unused.
void GX2SetTVBuffer(void* imageBuffePtr, uint32 imageBufferSize, E_TVRES tvResolutionMode, uint32 _surfaceFormat, E_TVBUFFERMODE bufferMode)
{
Latte::E_GX2SURFFMT surfaceFormat = (Latte::E_GX2SURFFMT)_surfaceFormat;
LatteGPUState.tvBufferUsesSRGB = HAS_FLAG(surfaceFormat, Latte::E_GX2SURFFMT::FMT_BIT_SRGB);
// todo - actually allocate a scanbuffer
}
// TV gamma is not emulated. Any value that differs from 1.0 by more than 0.01 is
// reported in the debug log; the value itself is otherwise ignored.
void GX2SetTVGamma(float gamma)
{
	// Explicit two-sided comparison instead of abs(): an unqualified abs() on a float can
	// resolve to the integer overload, which truncates the difference to 0 and hides the warning.
	const float deviation = gamma - 1.0f;
	if (deviation > 0.01f || deviation < -0.01f)
		cemuLog_logDebug(LogType::Force, "TV gamma set to {} which is not supported", gamma);
}
// Fills textureOut with a description of a fixed 640x480 RGBA8 linear texture backed by
// the static _lastFrame buffer. deviceId is ignored and the function always returns true.
bool GX2GetLastFrame(uint32 deviceId, GX2Texture* textureOut)
{
// return a 480p image
textureOut->viewFirstMip = 0;
textureOut->viewFirstSlice = 0;
textureOut->viewNumMips = 1;
textureOut->viewNumSlices = 1;
textureOut->compSel = 0x00010203;
textureOut->surface.width = 640;
textureOut->surface.height = 480;
textureOut->surface.depth = 1;
textureOut->surface.dim = Latte::E_DIM::DIM_2D;
textureOut->surface.format = Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM;
textureOut->surface.tileMode = Latte::E_GX2TILEMODE::TM_LINEAR_ALIGNED;
textureOut->surface.pitch = 0;
textureOut->surface.resFlag = 0;
textureOut->surface.aa = 0;
// derive the remaining surface fields, then attach the image memory and build the registers
GX2CalcSurfaceSizeAndAlignment(&textureOut->surface);
textureOut->surface.imagePtr = _lastFrame.GetMPTR();
GX2InitTextureRegs(textureOut);
return true;
}
// Handler for GX2GetLastFrameGammaA. Always reports a gamma of 1.0 through
// the big-endian float output and returns true. deviceId is ignored.
bool GX2GetLastFrameGammaA(uint32 deviceId, float32be* gamma)
{
*gamma = 1.0f;
return true;
}
// Handler for GX2GetLastFrameGammaB. Always reports a gamma of 1.0 through
// the big-endian float output and returns true. deviceId is ignored.
bool GX2GetLastFrameGammaB(uint32 deviceId, float32be* gamma)
{
*gamma = 1.0f;
return true;
}
// Handler for GX2GPUTimeToCPUTime. No conversion is performed: the input is
// ignored and 0 is always returned (see the note referenced below).
uint64 GX2GPUTimeToCPUTime(uint64 gpuTime)
{
return 0; // hack, see note in GX2SampleBottomGPUCycle
}
// Handler for GX2GetSystemDRCMode. Always returns the constant 1.
// NOTE(review): the meaning of mode value 1 is not visible here - confirm.
uint32 GX2GetSystemDRCMode()
{
return 1;
}
// Handler for GX2IsVideoOutReady. Always returns 1 (video output is reported
// as ready unconditionally).
uint32 GX2IsVideoOutReady()
{
return 1;
}
// Handler for GX2Invalidate.
// invalidationFlags bits handled here:
//   0x01 (attribute data) -> surface sync flag 0x800000
//   0x04 (uniform block)  -> surface sync flags 0x8800000
//   0x40 (CPU cache)      -> notifies the buffer cache of a DC flush for the range
// If any surface sync flag was selected, an IT_SURFACE_SYNC PM4 packet covering
// [invalidationAddr, invalidationAddr + invalidationSize) is written.
void GX2Invalidate(uint32 invalidationFlags, MPTR invalidationAddr, uint32 invalidationSize)
{
	uint32 syncMask = 0;
	if ((invalidationFlags & 0x01) != 0)
		syncMask |= 0x800000; // attribute data
	if ((invalidationFlags & 0x04) != 0)
		syncMask |= 0x8800000; // uniform block
	if ((invalidationFlags & 0x40) != 0)
		LatteBufferCache_notifyDCFlush(invalidationAddr, invalidationSize); // CPU cache

	if (syncMask == 0)
		return; // nothing to submit to the GPU

	// write PM4 command: IT_SURFACE_SYNC + 4 data dwords
	GX2ReserveCmdSpace(5);
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SURFACE_SYNC, 4));
	gx2WriteGather_submitU32AsBE(syncMask);
	gx2WriteGather_submitU32AsBE((invalidationSize + 0xFF) >> 8); // size
	gx2WriteGather_submitU32AsBE(memory_virtualToPhysical(invalidationAddr) >> 8); // base address (divided by 0x100)
	gx2WriteGather_submitU32AsBE(0x00000004); // poll interval
}
// Registers the miscellaneous GX2 functions defined in this file as exports
// of the "gx2" library (logged under LogType::GX2) and resets
// sGX2MainCoreIndex to 0.
void GX2MiscInit()
{
cafeExportRegister("gx2", GX2Init, LogType::GX2);
cafeExportRegister("gx2", GX2GetMainCoreId, LogType::GX2);
cafeExportRegister("gx2", GX2ResetGPU, LogType::GX2);
cafeExportRegister("gx2", GX2SetTVBuffer, LogType::GX2);
cafeExportRegister("gx2", GX2SetTVGamma, LogType::GX2);
cafeExportRegister("gx2", GX2GetLastFrame, LogType::GX2);
cafeExportRegister("gx2", GX2GetLastFrameGammaA, LogType::GX2);
cafeExportRegister("gx2", GX2GetLastFrameGammaB, LogType::GX2);
cafeExportRegister("gx2", GX2GPUTimeToCPUTime, LogType::GX2);
cafeExportRegister("gx2", GX2GetSystemDRCMode, LogType::GX2);
cafeExportRegister("gx2", GX2IsVideoOutReady, LogType::GX2);
cafeExportRegister("gx2", GX2Invalidate, LogType::GX2);
sGX2MainCoreIndex = 0;
}
};

View file

@ -0,0 +1,23 @@
#pragma once
// Public declarations of the miscellaneous GX2 functions (GX2_Misc).
namespace GX2
{
// core index reported by GX2GetMainCoreId once GX2 is initialized
extern uint32 sGX2MainCoreIndex;
// TV resolution mode passed to GX2SetTVBuffer; values are not defined yet
enum class E_TVRES
{
TODO,
};
// TV buffering mode passed to GX2SetTVBuffer
enum class E_TVBUFFERMODE
{
DOUBLE_BUFFER = 2,
};
// clears the "GX2 initialized" state
void _GX2DriverReset();
// records the TV scan buffer configuration (currently only the sRGB bit of surfaceFormat is used)
void GX2SetTVBuffer(void* imageBuffePtr, uint32 imageBufferSize, E_TVRES tvResolutionMode, uint32 surfaceFormat, E_TVBUFFERMODE bufferMode);
// TV gamma is not supported; values other than ~1.0 are only logged
void GX2SetTVGamma(float gamma);
// registers the misc GX2 exports
void GX2MiscInit();
};

View file

@ -0,0 +1,153 @@
#include "Cafe/OS/common/OSCommon.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "GX2.h"
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "Cafe/CafeSystem.h"
#include "GX2_Query.h"
// NOTE(review): presumably the number of GPU render backends - confirm.
// Used below to locate the CPU-query marker (reg[LATTE_GC_NUM_RB * 4 + 0/1])
// and to select which result slots get their top bit set on query begin.
#define LATTE_GC_NUM_RB 2
// number of 64-bit result slots in a GX2Query (stored as pairs of uint32)
#define _QUERY_REG_COUNT 8 // each reg/result is 64bits, little endian
namespace GX2
{
// Guest-visible query object (0x40 bytes): _QUERY_REG_COUNT 64-bit results,
// each stored as two consecutive uint32 words.
struct GX2Query
{
// 4*2 sets of uint64 results
uint32 reg[_QUERY_REG_COUNT * 2];
};
// the layout is shared with the guest application, so the size must not change
static_assert(sizeof(GX2Query) == 0x40);
// Resets queryInfo at the start of an occlusion query.
//  - CPU query: clears the first 0x10 bytes and stores the marker
//    {0, 'OCPU'} at reg[LATTE_GC_NUM_RB * 4 + 0/1], which
//    GX2QueryGetOcclusionResult interprets as "result not ready".
//  - GPU query: emits one IT_MEM_WRITE packet per 64-bit result slot so the
//    slots are reset in command-stream order; slots with index
//    >= LATTE_GC_NUM_RB * 2 get the top bit of their high word set.
//    Xenoblade Chronicles X is special-cased (see below).
// todo: Set mmDB_RENDER_CONTROL
void _BeginOcclusionQuery(GX2Query* queryInfo, bool isGPUQuery)
{
	if (!isGPUQuery)
	{
		memset(queryInfo, 0, 0x10); // size maybe GPU7_GC_NUM_RB*2*4 ?
		queryInfo->reg[LATTE_GC_NUM_RB * 4 + 0] = 0;
		queryInfo->reg[LATTE_GC_NUM_RB * 4 + 1] = _swapEndianU32('OCPU');
		return;
	}

	const uint64 foregroundTitle = CafeSystem::GetForegroundTitleId();
	const bool isXCX = foregroundTitle == 0x00050000101c4c00ULL || foregroundTitle == 0x00050000101c4d00 || foregroundTitle == 0x0005000010116100; // XCX EU, US, JPN
	if (isXCX)
	{
		// in XCX queries are used to determine if certain objects are visible
		// if we are not setting the result fast enough and the query still holds a value of 0 (which is the default for GPU queries)
		// then XCX will not render affected objects, causing flicker
		// note: This is a very old workaround. It may no longer be necessary since the introduction of full sync. Investigate
		*(uint64*)(queryInfo->reg + 2) = 0x100000;
		return;
	}

	// generic GPU path: 5 dwords (header + 4 data) per result slot
	GX2ReserveCmdSpace(5 * _QUERY_REG_COUNT);
	MPTR queryInfoPhys = memory_virtualToPhysical(memory_getVirtualOffsetFromPointer(queryInfo));
	for (sint32 slot = 0; slot < _QUERY_REG_COUNT; slot++)
	{
		const uint32 highWord = (slot >= LATTE_GC_NUM_RB * 2) ? 0x80000000 : 0;
		gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_MEM_WRITE, 4));
		gx2WriteGather_submitU32AsBE((queryInfoPhys + slot * 8) | 0x2);
		gx2WriteGather_submitU32AsBE(0x20000); // 0x20000 -> ?
		gx2WriteGather_submitU32AsBE(0);
		gx2WriteGather_submitU32AsBE(highWord);
	}
}
// Handler for GX2QueryBegin. Resets the query object for CPU or GPU occlusion
// queries and then submits an IT_HLE_BEGIN_OCCLUSION_QUERY packet carrying the
// query's guest address. Any other query type is reported and ignored.
void GX2QueryBegin(uint32 queryType, GX2Query* query)
{
	switch (queryType)
	{
	case GX2_QUERY_TYPE_OCCLUSION_CPU:
	case GX2_QUERY_TYPE_OCCLUSION_GPU:
		_BeginOcclusionQuery(query, queryType == GX2_QUERY_TYPE_OCCLUSION_GPU);
		break;
	default:
		debug_printf("GX2QueryBegin(): Unsupported type %d\n", queryType);
		debugBreakpoint();
		return;
	}
	// HLE packet
	GX2ReserveCmdSpace(2);
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_BEGIN_OCCLUSION_QUERY, 1));
	gx2WriteGather_submitU32AsBE(MEMPTR<GX2Query>(query).GetMPTR());
}
// Handler for GX2QueryEnd. For CPU and GPU occlusion queries an
// IT_HLE_END_OCCLUSION_QUERY packet carrying the query's guest address is
// submitted. Any other query type is reported and ignored.
void GX2QueryEnd(uint32 queryType, GX2Query* query)
{
	GX2ReserveCmdSpace(2);
	if (queryType == GX2_QUERY_TYPE_OCCLUSION_CPU || queryType == GX2_QUERY_TYPE_OCCLUSION_GPU)
	{
		// HLE packet
		gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_END_OCCLUSION_QUERY, 1));
		gx2WriteGather_submitU32AsBE(MEMPTR<GX2Query>(query).GetMPTR());
	}
	else
	{
		// report under this function's own name (the message previously
		// claimed to come from GX2QueryBegin, which made logs misleading)
		debug_printf("GX2QueryEnd(): Unsupported type %d\n", queryType);
		debugBreakpoint();
		return;
	}
}
// Handler for GX2QueryGetOcclusionResult.
// Returns GX2_FALSE while the result is unavailable, i.e. when the CPU-query
// marker {0, 'OCPU'} written by _BeginOcclusionQuery is still in place, or
// when the top bit of either the first or second 64-bit result slot is set.
// Otherwise stores (second slot - first slot) in resultOut and returns GX2_TRUE.
uint32 GX2QueryGetOcclusionResult(GX2Query* query, uint64be* resultOut)
{
	const uint32 markerLow = query->reg[LATTE_GC_NUM_RB * 4 + 0];
	const uint32 markerHigh = query->reg[LATTE_GC_NUM_RB * 4 + 1];
	if (markerLow == 0 && markerHigh == _swapEndianU32('OCPU'))
		return GX2_FALSE; // CPU query result not ready

	const uint64 beginCount = *(uint64*)(query->reg + 0);
	const uint64 endCount = *(uint64*)(query->reg + 2);
	const uint64 topBit = 0x8000000000000000ULL;
	if (((beginCount | endCount) & topBit) != 0)
		return GX2_FALSE;

	*resultOut = endCount - beginCount;
	return GX2_TRUE;
}
// Handler for GX2QueryBeginConditionalRender. Submits an IT_SET_PREDICATION
// packet referencing the query's physical address.
// Flag word layout as built here:
//   bits 13+ : 1 for GPU occlusion queries, 2 for every other query type
//   bit 19   : set when dontWaitBool is non-zero
//   bit 31   : set when pixelsMustPassBool is non-zero
void GX2QueryBeginConditionalRender(uint32 queryType, GX2Query* query, uint32 dontWaitBool, uint32 pixelsMustPassBool)
{
	GX2ReserveCmdSpace(3);
	uint32 predicationFlags = (queryType == GX2_QUERY_TYPE_OCCLUSION_GPU) ? (1 << 13) : (2 << 13);
	if (dontWaitBool != 0)
		predicationFlags |= (1 << 19);
	if (pixelsMustPassBool)
		predicationFlags |= 0x80000000u;
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_PREDICATION, 2));
	gx2WriteGather_submitU32AsBE(memory_virtualToPhysical(MEMPTR<GX2Query>(query).GetMPTR()));
	gx2WriteGather_submitU32AsBE(predicationFlags);
}
// Handler for GX2QueryEndConditionalRender. Submits an IT_SET_PREDICATION
// packet with a null address and a zero flag word.
void GX2QueryEndConditionalRender()
{
GX2ReserveCmdSpace(3);
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_PREDICATION, 2));
gx2WriteGather_submitU32AsBE(MPTR_NULL);
gx2WriteGather_submitU32AsBE(0); // unknown / todo
}
// Registers the GX2 query functions defined in this file as exports of the
// "gx2" library (logged under LogType::GX2).
void GX2QueryInit()
{
cafeExportRegister("gx2", GX2QueryBegin, LogType::GX2);
cafeExportRegister("gx2", GX2QueryEnd, LogType::GX2);
cafeExportRegister("gx2", GX2QueryGetOcclusionResult, LogType::GX2);
cafeExportRegister("gx2", GX2QueryBeginConditionalRender, LogType::GX2);
cafeExportRegister("gx2", GX2QueryEndConditionalRender, LogType::GX2);
}
};

View file

@ -0,0 +1,10 @@
#pragma once
// Query types accepted by GX2QueryBegin / GX2QueryEnd.
// occlusion query whose result is polled by the CPU
#define GX2_QUERY_TYPE_OCCLUSION_CPU 0
// occlusion query whose result is consumed by the GPU
#define GX2_QUERY_TYPE_OCCLUSION_GPU 2
// 1 and 3 are streamout related?
namespace GX2
{
// registers the GX2 query exports
void GX2QueryInit();
};

View file

@ -0,0 +1,303 @@
#include "Cafe/OS/common/OSCommon.h"
#include "GX2.h"
#include "Cafe/HW/Latte/Core/Latte.h"
#include "Cafe/HW/Latte/Core/LatteDraw.h"
#include "Cafe/HW/Latte/LatteAddrLib/LatteAddrLib.h"
#include "Cafe/HW/Latte/ISA/LatteReg.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "GX2_Command.h"
// HLE implementation of GX2InitColorBufferRegs.
// gpr[3] points to a guest GX2ColorBuffer. The surface layout is computed via
// LatteAddrLib and the derived register values are stored back into the
// structure: reg_size (pitch / slice size), reg_info (format, tile mode,
// number type, blend/export control bits), reg_view (slice range) and
// reg_mask (always 0).
void gx2Export_GX2InitColorBufferRegs(PPCInterpreter_t* hCPU)
{
gx2Log_printf("GX2InitColorBufferRegs(0x%08x)\n", hCPU->gpr[3]);
ppcDefineParamStructPtr(colorBuffer, GX2ColorBuffer, 0);
LatteAddrLib::AddrSurfaceInfo_OUT surfaceInfo;
LatteAddrLib::GX2CalculateSurfaceInfo(colorBuffer->surface.format, colorBuffer->surface.width, colorBuffer->surface.height, colorBuffer->surface.depth, colorBuffer->surface.dim, colorBuffer->surface.tileMode, colorBuffer->surface.aa, _swapEndianU32(colorBuffer->viewMip), &surfaceInfo);
// (pitch * height) / 64, used below for the upper field of reg_size
uint32 pitchHeight = (surfaceInfo.height * surfaceInfo.pitch) >> 6;
#ifndef PUBLIC_RELEASE
// sanity checks (debug logging only, execution continues regardless)
if (colorBuffer->viewNumSlices != _swapEndianU32(1))
forceLogDebug_printf("GX2InitColorBufferRegs(): With unsupported slice count %d", _swapEndianU32(colorBuffer->viewNumSlices));
if (surfaceInfo.pitch < 7)
forceLogDebug_printf("GX2InitColorBufferRegs(): Pitch too small (pitch = %d)", surfaceInfo.pitch);
if ((surfaceInfo.pitch & 7) != 0)
forceLogDebug_printf("GX2InitColorBufferRegs(): Pitch has invalid alignment (pitch = %d)", surfaceInfo.pitch);
if (pitchHeight == 0)
forceLogDebug_printf("GX2InitColorBufferRegs(): Invalid value (pitchHeight = %d)", pitchHeight);
#endif
// reg size: bits 0-9 = (pitch / 8) - 1, bits 10-29 = ((pitch * height) / 64) - 1
uint32 cSize = ((surfaceInfo.pitch >> 3) - 1) & 0x3FF;
cSize |= (((pitchHeight - 1) & 0xFFFFF) << 10);
colorBuffer->reg_size = cSize;
colorBuffer->reg_mask = 0;
// reg color_info
Latte::E_GX2SURFFMT format = colorBuffer->surface.format;
Latte::E_HWSURFFMT hwFormat = Latte::GetHWFormat(format);
// bits 8-11 of the GX2 format select the number type (see chain below)
uint32 formatHighBits = (uint32)format & 0xF00;
uint32 regInfo = 0;
// low bits: swap mode, bits 2+: hardware format, bits 8+: hardware tile mode
regInfo = (uint32)GX2::GetSurfaceFormatSwapMode(colorBuffer->surface.format);
regInfo |= ((uint32)hwFormat<<2);
cemu_assert_debug(LatteAddrLib::IsValidHWTileMode(surfaceInfo.hwTileMode));
regInfo |= ((uint32)surfaceInfo.hwTileMode << 8);
// bits 12+: number type derived from formatHighBits; clampBlend is requested
// for the non-integer, non-float variants (0x000, signed, srgb)
bool clampBlend = false;
if (formatHighBits == 0x000)
{
regInfo |= (0 << 12);
clampBlend = true;
}
else if (formatHighBits == 0x100) // integer
{
regInfo |= (4 << 12);
}
else if (formatHighBits == 0x200) // signed
{
regInfo |= (1 << 12);
clampBlend = true;
}
else if (formatHighBits == 0x300) // integer + signed
{
regInfo |= (5 << 12);
}
else if (formatHighBits == 0x400) // srgb
{
clampBlend = true;
regInfo |= (6 << 12);
}
else if (formatHighBits == 0x800) // float
{
regInfo |= (7 << 12);
}
else
cemu_assert_debug(false);
// bits 16-17 (COMP_SWAP): value 2 for 5_5_5_1 and 10_10_10_2, otherwise 0
if (hwFormat == Latte::E_HWSURFFMT::HWFMT_5_5_5_1 || hwFormat == Latte::E_HWSURFFMT::HWFMT_10_10_10_2 )
regInfo |= (2 << 16);
else
regInfo &= ~(3 << 16); // COMP_SWAP_mask
if(colorBuffer->surface.aa != 0)
regInfo |= (2 << 18); // TILE_MODE
bool isIntegerFormat = (uint32)(format & Latte::E_GX2SURFFMT::FMT_BIT_INT) != 0;
// bit 27: export format, only set for non-integer formats
if (isIntegerFormat == false)
regInfo |= (GX2::GetSurfaceColorBufferExportFormat(colorBuffer->surface.format) << 27); // 0 -> full, 1 -> normalized
if (isIntegerFormat
|| format ==Latte::E_GX2SURFFMT::R24_X8_UNORM
|| format ==Latte::E_GX2SURFFMT::R24_X8_FLOAT
|| format ==Latte::E_GX2SURFFMT::R32_X8_FLOAT)
{
// set the blend bypass bit for formats which dont support blending
regInfo |= (1<<22);
clampBlend = false;
}
if (clampBlend)
regInfo |= (1<<20); // BLEND_CLAMP_bit
if ((uint32)(format & Latte::E_GX2SURFFMT::FMT_BIT_FLOAT) != 0)
regInfo |= (1<<25); // ROUND_MODE_bit
colorBuffer->reg_info = regInfo;
// reg color_view: bits 0-10 = first slice, bits 13-23 = last slice (first + count - 1)
// left at 0 for TM_LINEAR_SPECIAL
uint32 regView = 0;
if (colorBuffer->surface.tileMode != Latte::E_GX2TILEMODE::TM_LINEAR_SPECIAL)
{
regView |= (_swapEndianU32(colorBuffer->viewFirstSlice) & 0x7FF);
regView |= (((_swapEndianU32(colorBuffer->viewNumSlices) + _swapEndianU32(colorBuffer->viewFirstSlice) - 1) & 0x7FF) << 13);
}
colorBuffer->reg_view = regView;
// note: reg_mask was already cleared above; this second store is redundant
colorBuffer->reg_mask = 0;
// todo - aa stuff
osLib_returnFromFunction(hCPU, 0);
}
// HLE implementation of GX2InitDepthBufferRegs.
// gpr[3] points to a guest GX2DepthBuffer. Only reg_size is derived so far:
//   bits 0-9   = (pitch / 8) - 1
//   bits 10-29 = ((pitch * height) / 64) - 1
void gx2Export_GX2InitDepthBufferRegs(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2InitDepthBufferRegs(0x%08x)\n", hCPU->gpr[3]);
	ppcDefineParamStructPtr(depthBuffer, GX2DepthBuffer, 0);

	// compute the surface layout for the viewed mip level
	auto& depthSurface = depthBuffer->surface;
	LatteAddrLib::AddrSurfaceInfo_OUT surfaceInfo;
	LatteAddrLib::GX2CalculateSurfaceInfo(depthSurface.format, depthSurface.width, depthSurface.height, depthSurface.depth, depthSurface.dim, depthSurface.tileMode, depthSurface.aa, _swapEndianU32(depthBuffer->viewMip), &surfaceInfo);
	cemu_assert_debug(depthBuffer->viewNumSlices != 0);

	const uint32 pitchField = ((surfaceInfo.pitch >> 3) - 1) & 0x3FF;
	const uint32 sliceField = (((surfaceInfo.height * surfaceInfo.pitch) >> 6) - 1) & 0xFFFFF;
	depthBuffer->reg_size = pitchField | (sliceField << 10);
	// todo - other regs
	osLib_returnFromFunction(hCPU, 0);
}
// HLE implementation of GX2SetColorBuffer.
// gpr[3] = guest pointer to a GX2ColorBuffer, gpr[4] = render target index.
// Emits four IT_SET_CONTEXT_REG packets (base, size, view, info) for the
// selected render target, using the register values previously stored in the
// GX2ColorBuffer (reg_size / reg_view / reg_info).
void gx2Export_GX2SetColorBuffer(PPCInterpreter_t* hCPU)
{
gx2Log_printf("GX2SetColorBuffer(0x%08x, %d)", hCPU->gpr[3], hCPU->gpr[4]);
GX2ReserveCmdSpace(20);
GX2ColorBuffer* colorBufferBE = (GX2ColorBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
#ifndef PUBLIC_RELEASE
gx2Log_printf("ColorBuffer tileMode %01x PhysAddr %08x fmt %04x res %dx%d Mip %d Slice %d", (uint32)colorBufferBE->surface.tileMode.value(), (uint32)colorBufferBE->surface.imagePtr, (uint32)colorBufferBE->surface.format.value(), (uint32)colorBufferBE->surface.width, (uint32)colorBufferBE->surface.height, _swapEndianU32(colorBufferBE->viewMip), _swapEndianU32(colorBufferBE->viewFirstSlice));
#endif
// regs[0] = mmCB_COLOR0_SIZE
// regs[1] = mmCB_COLOR0_INFO
// regs[2] = mmCB_COLOR0_VIEW
// regs[3] = mmCB_COLOR0_MASK
// regs[4] = mmCB_COLOR0_TILE
// note: targetIndex is not referenced below; the register offsets use hCPU->gpr[4] directly
uint32 targetIndex = hCPU->gpr[4];
uint32 viewMip = _swapEndianU32(colorBufferBE->viewMip);
// base address: mip 0 uses imagePtr, mip 1 starts at mipPtr,
// higher mips are located via mipOffset[viewMip - 1] relative to mipPtr
uint32 colorBufferBase = memory_virtualToPhysical(colorBufferBE->surface.imagePtr);
if( viewMip != 0 )
{
uint32 baseImagePtr = colorBufferBE->surface.mipPtr;
if( viewMip == 1 )
colorBufferBase = memory_virtualToPhysical(baseImagePtr);
else
colorBufferBase = memory_virtualToPhysical(baseImagePtr+colorBufferBE->surface.mipOffset[viewMip-1]);
}
Latte::E_GX2TILEMODE tileMode = colorBufferBE->surface.tileMode;
uint32 viewMipIndex = _swapEndianU32(colorBufferBE->viewMip);
uint32 swizzle = colorBufferBE->surface.swizzle;
// bits 16-23 of swizzle are compared against the viewed mip index;
// when the condition holds the low 16 swizzle bits are XORed into the base
if (Latte::TM_IsMacroTiled(tileMode) && viewMipIndex < ((swizzle >> 16) & 0xFF))
{
// remove swizzle for small mips
colorBufferBase ^= (swizzle & 0xFFFF);
}
// set color buffer pointer for render target
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
gx2WriteGather_submitU32AsBE(mmCB_COLOR0_BASE - 0xA000 + hCPU->gpr[4]);
gx2WriteGather_submitU32AsBE(colorBufferBase);
// set color buffer size
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
gx2WriteGather_submitU32AsBE(mmCB_COLOR0_SIZE - 0xA000 + hCPU->gpr[4]);
gx2WriteGather_submitU32AsBE((uint32)colorBufferBE->reg_size);
cemu_assert_debug(tileMode != Latte::E_GX2TILEMODE::TM_LINEAR_SPECIAL);
// set mmCB_COLOR*_VIEW
gx2WriteGather_submit(
pm4HeaderType3(IT_SET_CONTEXT_REG, 2),
mmCB_COLOR0_VIEW - 0xA000 + hCPU->gpr[4],
colorBufferBE->reg_view);
// todo: mmCB_COLOR0_TILE and mmCB_COLOR0_FRAG
// set mmCB_COLOR*_INFO
gx2WriteGather_submit(
pm4HeaderType3(IT_SET_CONTEXT_REG, 2),
mmCB_COLOR0_INFO - 0xA000 + hCPU->gpr[4],
colorBufferBE->reg_info);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
osLib_returnFromFunction(hCPU, 0);
}
// HLE implementation of GX2SetDepthBuffer.
// gpr[3] = guest pointer to a GX2DepthBuffer. Emits IT_SET_CONTEXT_REG
// packets for the depth size, the depth base/info/htile-base group and the
// depth view. Non-zero mip levels are not supported and only logged.
void gx2Export_GX2SetDepthBuffer(PPCInterpreter_t* hCPU)
{
gx2Log_printf("GX2SetDepthBuffer(0x%08x)\n", hCPU->gpr[3]);
GX2ReserveCmdSpace(20);
GX2DepthBuffer* depthBufferBE = (GX2DepthBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
gx2Log_printf("DepthBuffer tileMode %01x PhysAddr %08x fmt %04x res %dx%d", (uint32)depthBufferBE->surface.tileMode.value(), (uint32)depthBufferBE->surface.imagePtr, (uint32)depthBufferBE->surface.format.value(), (uint32)depthBufferBE->surface.width, (uint32)depthBufferBE->surface.height);
uint32 viewMip = _swapEndianU32(depthBufferBE->viewMip);
// todo: current code for the PM4 packets is a hack, replace with proper implementation
// physical address of the depth image divided by 0x100; submitted below as DB_HTILE_DATA_BASE
uint32 regHTileDataBase = memory_virtualToPhysical(depthBufferBE->surface.imagePtr)>>8;
if( viewMip > 0 )
{
forceLogDebug_printf("GX2SetDepthBuffer: Unsupported non-zero mip (%d) Pointer: %08X Base: %08X", viewMip, regHTileDataBase, 0);
}
// setup depthbuffer info register
// bits 0-2: depth format code, bits 15-18: tile mode
uint32 regDepthBufferInfo = 0;
uint32 depthBufferTileMode = (uint32)depthBufferBE->surface.tileMode.value();
Latte::E_GX2SURFFMT depthBufferFormat = depthBufferBE->surface.format;
regDepthBufferInfo |= ((depthBufferTileMode&0xF)<<15);
if (depthBufferFormat == Latte::E_GX2SURFFMT::D16_UNORM)
regDepthBufferInfo |= (1 << 0);
else if (depthBufferFormat == Latte::E_GX2SURFFMT::D24_S8_UNORM)
regDepthBufferInfo |= (3 << 0);
else if (depthBufferFormat == Latte::E_GX2SURFFMT::D32_FLOAT)
regDepthBufferInfo |= (6 << 0);
else if (depthBufferFormat == Latte::E_GX2SURFFMT::D32_S8_FLOAT)
regDepthBufferInfo |= (7 << 0);
else if (depthBufferFormat == Latte::E_GX2SURFFMT::D24_S8_FLOAT)
regDepthBufferInfo |= (5 << 0);
else
{
// unknown format: the format field of the info register stays 0
debug_printf("Unsupported depth buffer format 0x%04x\n", depthBufferFormat);
}
// set depth buffer size (DB_DEPTH_SIZE) from the precomputed reg_size
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+1));
gx2WriteGather_submitU32AsBE(mmDB_DEPTH_SIZE - 0xA000);
gx2WriteGather_submitU32AsBE((uint32)depthBufferBE->reg_size); // hack
// set DB_DEPTH_BASE, DB_DEPTH_INFO and DB_HTILE_DATA_BASE with a single three-value packet
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+3));
gx2WriteGather_submitU32AsBE(mmDB_DEPTH_BASE - 0xA000);
gx2WriteGather_submitU32AsBE(0); // DB_DEPTH_BASE
gx2WriteGather_submitU32AsBE(regDepthBufferInfo); // DB_DEPTH_INFO
gx2WriteGather_submitU32AsBE(regHTileDataBase); // DB_HTILE_DATA_BASE
// set DB_DEPTH_VIEW: bits 0-10 = first slice, bits 13-23 = last slice (first + count - 1)
uint32 db_view = 0;
db_view |= (_swapEndianU32(depthBufferBE->viewFirstSlice)&0x7FF);
db_view |= (((_swapEndianU32(depthBufferBE->viewNumSlices)+_swapEndianU32(depthBufferBE->viewFirstSlice)-1)&0x7FF)<<13);
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
gx2WriteGather_submitU32AsBE(mmDB_DEPTH_VIEW - 0xA000);
gx2WriteGather_submitU32AsBE(db_view);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
osLib_returnFromFunction(hCPU, 0);
}
// HLE implementation of GX2SetDRCBuffer. Only the surface format argument
// (gpr[6]) is inspected: its sRGB bit is recorded in
// LatteGPUState.drcBufferUsesSRGB. All other arguments are ignored.
void gx2Export_GX2SetDRCBuffer(PPCInterpreter_t* hCPU)
{
	const Latte::E_GX2SURFFMT requestedFormat = static_cast<Latte::E_GX2SURFFMT>(hCPU->gpr[6]);
	LatteGPUState.drcBufferUsesSRGB = HAS_FLAG(requestedFormat, Latte::E_GX2SURFFMT::FMT_BIT_SRGB);
	osLib_returnFromFunction(hCPU, 0);
}
// HLE implementation of GX2MarkScanBufferCopied.
// gpr[3] = scan target. For the TV target an
// IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER packet is submitted that describes a
// fixed 1920x1080 R8_G8_B8_A8_UNORM source located in the tiling aperture
// area. Other scan targets are ignored.
void gx2Export_GX2MarkScanBufferCopied(PPCInterpreter_t* hCPU)
{
	const uint32 scanTarget = hCPU->gpr[3];
	if (scanTarget != GX2_SCAN_TARGET_TV)
	{
		osLib_returnFromFunction(hCPU, 0);
		return;
	}
	// 1 header dword + 9 parameter dwords
	GX2ReserveCmdSpace(10);
	const uint32 sourcePhysAddr = (MEMORY_TILINGAPERTURE_AREA_ADDR+0x200000);
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER, 9));
	gx2WriteGather_submitU32AsBE(sourcePhysAddr);
	gx2WriteGather_submitU32AsBE(1920); // width
	gx2WriteGather_submitU32AsBE(1080); // height
	gx2WriteGather_submitU32AsBE(1920); // pitch
	gx2WriteGather_submitU32AsBE(4); // tileMode
	gx2WriteGather_submitU32AsBE(0); // swizzle
	gx2WriteGather_submitU32AsBE(0);
	gx2WriteGather_submitU32AsBE((uint32)Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM);
	gx2WriteGather_submitU32AsBE(scanTarget);
	osLib_returnFromFunction(hCPU, 0);
}

View file

@ -0,0 +1,253 @@
#include "Cafe/HW/Latte/Core/LatteBufferCache.h"
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "Cafe/OS/common/OSCommon.h"
#include "GX2.h"
#include "GX2_Command.h"
#include "GX2_Resource.h"
#include "GX2_Streamout.h"
#include "GX2_Draw.h"
namespace GX2
{
// Guest (PPC) addresses of the application-provided GX2R allocation and free
// callbacks. MPTR_NULL until GX2RSetAllocator is called.
MPTR GX2RAllocateFunc = MPTR_NULL;
MPTR GX2RFreeFunc = MPTR_NULL;
// Handler for GX2RSetAllocator: stores the guest allocate/free callbacks that
// the GX2R create/destroy functions invoke via PPCCoreCallback.
void GX2RSetAllocator(MPTR funcAllocMPTR, MPTR funcFreeMPR)
{
GX2RAllocateFunc = funcAllocMPTR;
GX2RFreeFunc = funcFreeMPR;
}
// Returns the number of bytes to allocate for the buffer: its size
// (elementSize * elementCount) rounded up to a multiple of 64 bytes.
uint32 GX2RGetBufferAllocationSize(GX2RBuffer* buffer)
{
	const uint32 unpaddedSize = buffer->GetSize();
	return (unpaddedSize + 0x3F) & ~0x3Fu; // pad to 64 byte alignment
}
// Returns the required alignment for a GX2R buffer with the given resource flags.
// Stream output, uniform block, shader program and GS ring buffers need 0x100;
// otherwise vertex, index and display list buffers need 0x40.
// Buffers with none of these usages default to 0x100.
uint32 GX2RGetBufferAlignment(uint32 resFlags)
{
	// the 0x100 usages take priority over the 0x40 usages
	const uint32 align256Usages = GX2R_RESFLAG_USAGE_STREAM_OUTPUT | GX2R_RESFLAG_USAGE_UNIFORM_BLOCK | GX2R_RESFLAG_USAGE_SHADER_PROGRAM | GX2R_RESFLAG_USAGE_GS_RINGBUFFER;
	if ((resFlags & align256Usages) != 0)
		return 0x100;
	const uint32 align64Usages = GX2R_RESFLAG_USAGE_VERTEX_BUFFER | GX2R_RESFLAG_USAGE_INDEX_BUFFER | GX2R_RESFLAG_USAGE_DISPLAY_LIST;
	if ((resFlags & align64Usages) != 0)
		return 0x40;
	return 0x100;
}
// Handler for GX2RCreateBuffer. Allocates backing memory for the buffer by
// calling the guest allocator registered through GX2RSetAllocator (passing
// the current resource flags, the padded size and the required alignment),
// stores the result in buffer->ptr and marks the buffer as unlocked and
// GX2R-allocated. Returns false if the allocator returned MPTR_NULL.
bool GX2RCreateBuffer(GX2RBuffer* buffer)
{
	uint32 requiredAlignment = GX2RGetBufferAlignment(buffer->resFlags);
	uint32 requiredSize = GX2RGetBufferAllocationSize(buffer);
	// the allocator sees the flags as they were before they are updated below
	MPTR guestMemory = PPCCoreCallback(GX2RAllocateFunc, (uint32)buffer->resFlags, requiredSize, requiredAlignment);
	buffer->ptr = guestMemory;
	buffer->resFlags &= ~GX2R_RESFLAG_LOCKED;
	buffer->resFlags |= GX2R_RESFLAG_ALLOCATED_BY_GX2R;
	// todo: invalidation
	const bool allocationFailed = (guestMemory == MPTR_NULL);
	return !allocationFailed;
}
// Handler for GX2RCreateBufferUserMemory. Attaches caller-provided memory to
// the buffer and clears both the LOCKED and ALLOCATED_BY_GX2R flags, so the
// memory is never passed to the GX2R free callback. The size parameter is
// ignored. Always returns true.
bool GX2RCreateBufferUserMemory(GX2RBuffer* buffer, void* ptr, uint32 unusedSizeParameter)
{
	buffer->ptr = ptr;
	buffer->resFlags &= ~(GX2R_RESFLAG_LOCKED | GX2R_RESFLAG_ALLOCATED_BY_GX2R);
	// todo: invalidation
	return true;
}
// Handler for GX2RDestroyBufferEx. If the buffer memory was allocated by
// GX2R, it is released through the guest free callback (called with the
// buffer's resource flags and pointer). User-provided memory is left alone.
// In both cases the buffer pointer is cleared. The resFlags argument is ignored.
void GX2RDestroyBufferEx(GX2RBuffer* buffer, uint32 resFlags)
{
	const bool isUserAllocated = (buffer->resFlags & GX2R_RESFLAG_ALLOCATED_BY_GX2R) == 0;
	if (!isUserAllocated)
		PPCCoreCallback(GX2RFreeFunc, (uint32)buffer->resFlags, buffer->GetPtr());
	buffer->ptr = nullptr;
}
// Handler for GX2RBufferExists. A buffer exists when the descriptor pointer is
// non-null and it has memory attached.
bool GX2RBufferExists(GX2RBuffer* buffer)
{
	return buffer != nullptr && buffer->GetPtr() != nullptr;
}
// Handler for GX2RLockBufferEx. Returns the buffer's memory pointer; no lock
// state is tracked and resFlags is ignored.
void* GX2RLockBufferEx(GX2RBuffer* buffer, uint32 resFlags)
{
return buffer->GetPtr();
}
// Handler for GX2RUnlockBufferEx. Notifies the buffer cache that the whole
// buffer range may have been modified by the CPU. resFlags is ignored.
void GX2RUnlockBufferEx(GX2RBuffer* buffer, uint32 resFlags)
{
// todo - account for flags, not all buffer types need flushing
LatteBufferCache_notifyDCFlush(buffer->GetVirtualAddr(), buffer->GetSize());
}
// Handler for GX2RInvalidateBuffer. Notifies the buffer cache of a data cache
// flush covering the whole buffer range. resFlags is ignored.
void GX2RInvalidateBuffer(GX2RBuffer* buffer, uint32 resFlags)
{
// todo - account for flags, not all buffer types need flushing
LatteBufferCache_notifyDCFlush(buffer->GetVirtualAddr(), buffer->GetSize());
}
// Handler for GX2RSetAttributeBuffer. Binds the part of the GX2R buffer that
// starts at `offset` as attribute buffer `bufferIndex` with the given stride.
// NOTE(review): when offset exceeds the buffer size only a message is logged;
// the remaining size (size - offset) then wraps around as an unsigned value
// and is still forwarded. Confirm whether this is intended.
void GX2RSetAttributeBuffer(GX2RBuffer* buffer, uint32 bufferIndex, uint32 stride, uint32 offset)
{
	const uint32 totalSize = buffer->GetSize();
	if (offset > totalSize)
		cemuLog_log(LogType::Force, "GX2RSetAttributeBuffer(): Offset exceeds buffer size");
	uint8be* attributeData = ((uint8be*)buffer->GetPtr()) + offset;
	GX2SetAttribBuffer(bufferIndex, totalSize - offset, stride, attributeData);
}
// Handler for GX2RSetStreamOutBuffer. Forwards directly to GX2SetStreamOutBuffer.
void GX2RSetStreamOutBuffer(uint32 bufferIndex, GX2StreamOutBuffer* soBuffer)
{
// seen in CoD: Ghosts
GX2SetStreamOutBuffer(bufferIndex, soBuffer);
}
// Handler for GX2RCreateSurface (seen in Transformers Prime).
// Computes the surface's size and alignment, then allocates image + mip
// memory in one block through the guest allocator registered via
// GX2RSetAllocator. On success imagePtr points to the block and, if the
// surface has mip data, mipPtr points directly behind the image data.
// Returns false if the allocator returned MPTR_NULL.
bool GX2RCreateSurface(GX2Surface* surface, uint32 resFlags)
{
	surface->resFlag = resFlags;
	GX2CalcSurfaceSizeAndAlignment(surface);
	surface->resFlag &= ~GX2R_RESFLAG_LOCKED;
	surface->resFlag |= GX2R_RESFLAG_ALLOCATED_BY_GX2R;

	// single allocation covering the base image and all mip levels
	uint32 totalSize = (uint32)surface->imageSize + (uint32)surface->mipSize;
	MPTR guestMemory = PPCCoreCallback(GX2RAllocateFunc, (uint32)surface->resFlag, totalSize, (uint32)surface->alignment);
	surface->imagePtr = guestMemory;

	const bool hasMipMemory = surface->imagePtr != MPTR_NULL && surface->mipSize > 0;
	if (!hasMipMemory)
		surface->mipPtr = MPTR_NULL;
	else
		surface->mipPtr = (uint32)surface->imagePtr + surface->imageSize;
	// todo: Cache invalidation based on resourceFlags?
	return guestMemory != MPTR_NULL;
}
// Handler for GX2RCreateSurfaceUserMemory. Initializes the surface with the
// given resource flags (minus LOCKED and ALLOCATED_BY_GX2R), computes its size
// and alignment and attaches the caller-provided image and mip memory.
// Always returns true.
bool GX2RCreateSurfaceUserMemory(GX2Surface* surface, void* imagePtr, void* mipPtr, uint32 resFlags)
{
	surface->resFlag = resFlags & ~(GX2R_RESFLAG_LOCKED | GX2R_RESFLAG_ALLOCATED_BY_GX2R);
	GX2CalcSurfaceSizeAndAlignment(surface);
	surface->imagePtr = memory_getVirtualOffsetFromPointer(imagePtr);
	surface->mipPtr = memory_getVirtualOffsetFromPointer(mipPtr);
	// todo: memory invalidate when (surface->resFlag & 0x14000) is non-zero
	return true;
}
// Handler for GX2RDestroySurfaceEx. If the surface memory was allocated by
// GX2R, it is released through the guest free callback; the callback receives
// the surface's own flags combined with bits 19-23 of the resFlags argument.
// User-provided memory is left alone. In both cases imagePtr is cleared.
void GX2RDestroySurfaceEx(GX2Surface* surface, uint32 resFlags)
{
	const bool isUserAllocated = (surface->resFlag & GX2R_RESFLAG_ALLOCATED_BY_GX2R) == 0;
	if (!isUserAllocated)
	{
		// only the unknown bits 19-23 of the caller's flags are passed through
		const uint32 passThroughBits = GX2R_RESFLAG_UKN_BIT_19 | GX2R_RESFLAG_UKN_BIT_20 | GX2R_RESFLAG_UKN_BIT_21 | GX2R_RESFLAG_UKN_BIT_22 | GX2R_RESFLAG_UKN_BIT_23;
		resFlags &= passThroughBits;
		PPCCoreCallback(GX2RFreeFunc, (uint32)surface->resFlag | resFlags, (uint32)surface->imagePtr);
	}
	surface->imagePtr = MPTR_NULL;
}
// Handler for GX2RSurfaceExists. A surface exists when the descriptor pointer
// is non-null, it has image memory attached and at least one of the CPU/GPU
// read/write usage flags is set.
bool GX2RSurfaceExists(GX2Surface* surface)
{
	if (!surface || surface->imagePtr == MPTR_NULL)
		return false;
	const uint32 accessUsageBits = GX2R_RESFLAG_USAGE_CPU_READ | GX2R_RESFLAG_USAGE_CPU_WRITE | GX2R_RESFLAG_USAGE_GPU_READ | GX2R_RESFLAG_USAGE_GPU_WRITE;
	const bool hasNoAccessUsage = (surface->resFlag & accessUsageBits) == 0;
	return !hasNoAccessUsage;
}
// Handler for GX2RLockSurfaceEx. Sets the LOCKED flag on the surface and
// returns a host pointer to its base image data. mipLevel and resFlags are
// currently ignored (the base image pointer is returned for every mip level).
void* GX2RLockSurfaceEx(GX2Surface* surface, uint32 mipLevel, uint32 resFlags)
{
// todo: handle invalidation
surface->resFlag |= GX2R_RESFLAG_LOCKED;
return memory_getPointerFromVirtualOffset(surface->imagePtr);
}
// Handler for GX2RUnlockSurfaceEx. Clears the LOCKED flag on the surface.
// mipLevel and resFlags are currently ignored.
void GX2RUnlockSurfaceEx(GX2Surface* surface, uint32 mipLevel, uint32 resFlags)
{
// todo: handle invalidation
surface->resFlag &= ~GX2R_RESFLAG_LOCKED;
}
// Handler for GX2RBeginDisplayListEx. Starts display list recording into the
// GX2R buffer's memory, using the buffer's full size. ukn and resFlags are ignored.
void GX2RBeginDisplayListEx(GX2RBuffer* buffer, bool ukn, uint32 resFlags)
{
// todo: handle invalidation
GX2::GX2BeginDisplayList(buffer->GetPtr(), buffer->GetSize());
}
// Handler for GX2REndDisplayList. Ends display list recording for the GX2R
// buffer and returns the value reported by GX2EndDisplayList.
uint32 GX2REndDisplayList(GX2RBuffer* buffer)
{
return GX2::GX2EndDisplayList(buffer->GetPtr());
}
// Handler for GX2RCallDisplayList. Forwards to GX2CallDisplayList with the
// buffer's guest virtual address and the given size.
void GX2RCallDisplayList(GX2RBuffer* buffer, uint32 size)
{
GX2::GX2CallDisplayList(buffer->GetVirtualAddr(), size);
}
// Handler for GX2RDirectCallDisplayList. Forwards to GX2DirectCallDisplayList
// with the buffer's host pointer and the given size.
void GX2RDirectCallDisplayList(GX2RBuffer* buffer, uint32 size)
{
GX2::GX2DirectCallDisplayList(buffer->GetPtr(), size);
}
// Handler for GX2RDrawIndexed. Issues an indexed draw whose indices are read
// from the GX2R index buffer, starting baseIndex elements (of the buffer's
// elementSize) into the buffer.
void GX2RDrawIndexed(GX2PrimitiveMode2 primitiveMode, GX2RBuffer* indexBuffer, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, uint32 count, uint32 baseIndex, uint32 baseVertex, uint32 numInstances)
{
	const uint32 firstIndexByteOffset = baseIndex * (uint32)indexBuffer->elementSize;
	uint8be* firstIndex = (uint8be*)indexBuffer->GetPtr() + firstIndexByteOffset;
	GX2DrawIndexedEx(primitiveMode, count, indexType, firstIndex, baseVertex, numInstances);
}
// Registers the GX2R resource functions defined in this file as exports of
// the "gx2" library (logged under LogType::GX2) and resets the guest
// allocator callbacks to MPTR_NULL.
void GX2ResourceInit()
{
cafeExportRegister("gx2", GX2RSetAllocator, LogType::GX2);
cafeExportRegister("gx2", GX2RGetBufferAllocationSize, LogType::GX2);
cafeExportRegister("gx2", GX2RGetBufferAlignment, LogType::GX2);
cafeExportRegister("gx2", GX2RCreateBuffer, LogType::GX2);
cafeExportRegister("gx2", GX2RCreateBufferUserMemory, LogType::GX2);
cafeExportRegister("gx2", GX2RDestroyBufferEx, LogType::GX2);
cafeExportRegister("gx2", GX2RBufferExists, LogType::GX2);
cafeExportRegister("gx2", GX2RLockBufferEx, LogType::GX2);
cafeExportRegister("gx2", GX2RUnlockBufferEx, LogType::GX2);
cafeExportRegister("gx2", GX2RInvalidateBuffer, LogType::GX2);
cafeExportRegister("gx2", GX2RSetAttributeBuffer, LogType::GX2);
cafeExportRegister("gx2", GX2RSetStreamOutBuffer, LogType::GX2);
cafeExportRegister("gx2", GX2RCreateSurface, LogType::GX2);
cafeExportRegister("gx2", GX2RCreateSurfaceUserMemory, LogType::GX2);
cafeExportRegister("gx2", GX2RDestroySurfaceEx, LogType::GX2);
cafeExportRegister("gx2", GX2RSurfaceExists, LogType::GX2);
cafeExportRegister("gx2", GX2RLockSurfaceEx, LogType::GX2);
cafeExportRegister("gx2", GX2RUnlockSurfaceEx, LogType::GX2);
cafeExportRegister("gx2", GX2RBeginDisplayListEx, LogType::GX2);
cafeExportRegister("gx2", GX2REndDisplayList, LogType::GX2);
cafeExportRegister("gx2", GX2RCallDisplayList, LogType::GX2);
cafeExportRegister("gx2", GX2RDirectCallDisplayList, LogType::GX2);
cafeExportRegister("gx2", GX2RDrawIndexed, LogType::GX2);
// no guest allocator is registered until the application calls GX2RSetAllocator
GX2RAllocateFunc = MPTR_NULL;
GX2RFreeFunc = MPTR_NULL;
}
};

View file

@ -0,0 +1,64 @@
#pragma once
// Resource flag bits stored in GX2Surface::resFlag and GX2RBuffer::resFlags.
// Each flag occupies a single bit.
// basic resource flags
#define GX2_RESFLAG_USAGE_TEXTURE (1<<0)
#define GX2_RESFLAG_USAGE_COLOR_BUFFER (1<<1)
#define GX2_RESFLAG_USAGE_DEPTH_BUFFER (1<<2)
#define GX2_RESFLAG_USAGE_SCAN_BUFFER (1<<3)
// extended resource flags used by GX2R API
#define GX2R_RESFLAG_USAGE_VERTEX_BUFFER (1<<4)
#define GX2R_RESFLAG_USAGE_INDEX_BUFFER (1<<5)
#define GX2R_RESFLAG_USAGE_UNIFORM_BLOCK (1<<6)
#define GX2R_RESFLAG_USAGE_SHADER_PROGRAM (1<<7)
#define GX2R_RESFLAG_USAGE_STREAM_OUTPUT (1<<8)
#define GX2R_RESFLAG_USAGE_DISPLAY_LIST (1<<9)
#define GX2R_RESFLAG_USAGE_GS_RINGBUFFER (1<<10)
// access usage bits
#define GX2R_RESFLAG_USAGE_CPU_READ (1<<11)
#define GX2R_RESFLAG_USAGE_CPU_WRITE (1<<12)
#define GX2R_RESFLAG_USAGE_GPU_READ (1<<13)
#define GX2R_RESFLAG_USAGE_GPU_WRITE (1<<14)
#define GX2R_RESFLAG_USE_MEM1 (1<<17)
// bits of unknown meaning
#define GX2R_RESFLAG_UKN_BIT_19 (1<<19)
#define GX2R_RESFLAG_UKN_BIT_20 (1<<20)
#define GX2R_RESFLAG_UKN_BIT_21 (1<<21)
#define GX2R_RESFLAG_UKN_BIT_22 (1<<22)
#define GX2R_RESFLAG_UKN_BIT_23 (1<<23)
// state bits maintained by the GX2R implementation
#define GX2R_RESFLAG_ALLOCATED_BY_GX2R (1<<29)
#define GX2R_RESFLAG_LOCKED (1<<30)
// Guest-visible GX2R buffer descriptor (0x10 bytes, big-endian fields).
struct GX2RBuffer
{
/* +0x00 */ uint32be resFlags; // GX2R_RESFLAG_* bits
/* +0x04 */ uint32be elementSize; // size of one element in bytes
/* +0x08 */ uint32be elementCount; // number of elements
/* +0x0C */ MEMPTR<void> ptr; // backing memory, null when nothing is attached
// total buffer size in bytes (elementSize * elementCount, 32-bit arithmetic)
uint32 GetSize() const
{
return (uint32)elementSize * (uint32)elementCount;
}
// guest virtual address of the backing memory
MPTR GetVirtualAddr() const
{
return ptr.GetMPTR();
}
// host pointer to the backing memory
void* GetPtr() const
{
return ptr.GetPtr();
}
};
// the layout is shared with the guest application, so the size must not change
static_assert(sizeof(GX2RBuffer) == 0x10);
// Public declarations of the GX2R resource API (GX2_Resource).
namespace GX2
{
// registers the GX2R exports and resets the allocator callbacks
void GX2ResourceInit();
// stores the guest allocate/free callbacks used by the GX2R create/destroy functions
void GX2RSetAllocator(MPTR funcAllocMPTR, MPTR funcFreeMPR);
};

View file

@ -0,0 +1,264 @@
#include "Cafe/OS/common/OSCommon.h"
#include "GX2.h"
#include "GX2_Shader.h"
#include "Cafe/HW/Latte/Core/LatteConst.h"
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "Cafe/HW/Latte/ISA/LatteInstructions.h"
uint32 memory_getVirtualOffsetFromPointer(void* ptr); // remove once we updated everything to MEMPTR
namespace GX2
{
using namespace Latte;
// Returns the default endian swap mode for a vertex format index (0-19).
// Any other index triggers cemu_assert_suspicious() and yields SWAP_NONE.
LatteConst::VertexFetchEndianMode _getVtxFormatEndianSwapDefault(uint32 vertexFormat)
{
	using SwapMode = LatteConst::VertexFetchEndianMode;
	// default swap mode per vertex format, indexed by format
	static constexpr SwapMode defaultSwapByFormat[] =
	{
		/*  0 */ SwapMode::SWAP_NONE,
		/*  1 */ SwapMode::SWAP_NONE,
		/*  2 */ SwapMode::SWAP_U16,
		/*  3 */ SwapMode::SWAP_U16,
		/*  4 */ SwapMode::SWAP_NONE,
		/*  5 */ SwapMode::SWAP_U32,
		/*  6 */ SwapMode::SWAP_U32,
		/*  7 */ SwapMode::SWAP_U16,
		/*  8 */ SwapMode::SWAP_U16,
		/*  9 */ SwapMode::SWAP_U32,
		/* 10 */ SwapMode::SWAP_NONE,
		/* 11 */ SwapMode::SWAP_U32,
		/* 12 */ SwapMode::SWAP_U32,
		/* 13 */ SwapMode::SWAP_U32,
		/* 14 */ SwapMode::SWAP_U16,
		/* 15 */ SwapMode::SWAP_U16,
		/* 16 */ SwapMode::SWAP_U32,
		/* 17 */ SwapMode::SWAP_U32,
		/* 18 */ SwapMode::SWAP_U32,
		/* 19 */ SwapMode::SWAP_U32,
	};
	constexpr uint32 knownFormatCount = sizeof(defaultSwapByFormat) / sizeof(defaultSwapByFormat[0]);
	if (vertexFormat < knownFormatCount)
		return defaultSwapByFormat[vertexFormat];
	// unknown format
	cemu_assert_suspicious();
	return SwapMode::SWAP_NONE;
}
// Lookup table with 20 entries.
// NOTE(review): presumably maps the raw attribute format index (0-19) to the
// format value encoded into vertex fetch instructions - confirm against its
// use in _writeFetchShaderVTXCode.
uint32 rawFormatToFetchFormat[] =
{
1, 2, 5, 6,
7, 0xD, 0xE, 0xF,
0x10, 0x16, 0x1A, 0x19,
0x1D, 0x1E, 0x1F, 0x20,
0x2F, 0x30, 0x22, 0x23,
};
// Guest-visible description of one vertex attribute (0x20 bytes).
// Fields declared as plain uint32 hold big-endian data and are byte-swapped
// with _swapEndianU32 by the code that reads them (_writeFetchShaderVTXCode).
struct GX2AttribDescription
{
/* +0x00 */ uint32 location;
/* +0x04 */ uint32 buffer;
/* +0x08 */ uint32be offset;
/* +0x0C */ uint32 format;
/* +0x10 */ uint32 indexType; // 0 -> per-vertex index, 1 -> instance based index
/* +0x14 */ uint32 aluDivisor; // divisor used for instance based indexing
/* +0x18 */ uint32 destSel;
/* +0x1C */ betype<LatteConst::VertexFetchEndianMode> endianSwap; // SWAP_DEFAULT selects the per-format default
};
// the layout is shared with the guest application, so sizes must not change
static_assert(sizeof(GX2AttribDescription) == 0x20);
static_assert(sizeof(betype<LatteConst::VertexFetchEndianMode>) == 0x4);
// Returns the byte size of the CF (control flow) part of a fetch shader,
// including the padding that aligns the following clause instructions to
// 16 bytes. Only non-tessellation fetch shaders are supported.
size_t _calcFetchShaderCFCodeSize(uint32 attributeCount, GX2FetchShader_t::FetchShaderType fetchShaderType, uint32 tessellationMode)
{
	cemu_assert_debug(fetchShaderType == GX2FetchShader_t::FetchShaderType::NO_TESSELATION);
	cemu_assert_debug(tessellationMode == 0);
	// one VTX clause can have up to 16 instructions
	const uint32 vtxClauseCount = (attributeCount + 15) / 16;
	// + final CF instruction is RETURN
	const uint32 cfInstructionCount = vtxClauseCount + 1;
	// every CF instruction occupies 8 bytes
	const size_t unpaddedSize = cfInstructionCount * 8;
	return (unpaddedSize + 0xF) & ~static_cast<size_t>(0xF); // pad to 16 byte alignment
}
// Returns the byte size of the clause part of a fetch shader: one 16-byte
// instruction per attribute. Only non-tessellation fetch shaders are supported.
size_t _calcFetchShaderClauseCodeSize(uint32 attributeCount, GX2FetchShader_t::FetchShaderType fetchShaderType, uint32 tessellationMode)
{
	cemu_assert_debug(fetchShaderType == GX2FetchShader_t::FetchShaderType::NO_TESSELATION);
	cemu_assert_debug(tessellationMode == 0);
	// the product is computed in 32 bits before widening
	const uint32 clauseBytes = attributeCount * 16;
	return clauseBytes;
}
void _writeFetchShaderCFCode(void* programBufferOut, uint32 attributeCount, GX2FetchShader_t::FetchShaderType fetchShaderType, uint32 tessellationMode)
{
LatteCFInstruction* cfInstructionWriter = (LatteCFInstruction*)programBufferOut;
uint32 attributeIndex = 0;
uint32 cfSize = (uint32)_calcFetchShaderCFCodeSize(attributeCount, fetchShaderType, tessellationMode);
while (attributeIndex < attributeCount)
{
LatteCFInstruction_DEFAULT defaultInstr;
defaultInstr.setField_Opcode(LatteCFInstruction::INST_VTX_TC);
defaultInstr.setField_COUNT(std::min(attributeCount - attributeIndex, 16u));
defaultInstr.setField_ADDR(cfSize + attributeIndex*16);
memcpy(cfInstructionWriter, &defaultInstr, sizeof(LatteCFInstruction));
attributeIndex += 16;
cfInstructionWriter++;
}
// write RETURN instruction
LatteCFInstruction_DEFAULT returnInstr;
returnInstr.setField_Opcode(LatteCFInstruction::INST_RETURN);
returnInstr.setField_BARRIER(true);
memcpy(cfInstructionWriter, &returnInstr, sizeof(LatteCFInstruction));
}
// Writes one 16-byte VTX_INST_SEMANTIC fetch instruction per attribute to programOut.
// fetchShader:          header of the shader being built; its divisor table (divisors / divisorCount) is
//                       extended when an instanced attribute uses an ALU divisor that is not stored yet
// attributeDescription: array of attributeCount descriptions; fields are byte-swapped via _swapEndianU32 before use
// fetchShaderType / tessellationMode are not read here.
void _writeFetchShaderVTXCode(GX2FetchShader_t* fetchShader, void* programOut, uint32 attributeCount, GX2AttribDescription* attributeDescription, GX2FetchShader_t::FetchShaderType fetchShaderType, uint32 tessellationMode)
{
	uint8* out = (uint8*)programOut;
	// exactly one instruction per attribute (_writeFetchShaderCFCode relies on this)
	for (uint32 attrIdx = 0; attrIdx < attributeCount; attrIdx++)
	{
		GX2AttribDescription& desc = attributeDescription[attrIdx];
		const uint32 format = _swapEndianU32(desc.format);
		const uint32 destSel = _swapEndianU32(desc.destSel);
		const uint32 location = _swapEndianU32(desc.location);
		const uint32 bufferId = _swapEndianU32(desc.buffer);
		const uint32 indexType = _swapEndianU32(desc.indexType);
		const uint32 aluDivisor = _swapEndianU32(desc.aluDivisor);
		cemu_assert_debug(indexType <= 1);
		LatteConst::VertexFetchEndianMode endianMode = desc.endianSwap;
		if (endianMode == LatteConst::VertexFetchEndianMode::SWAP_DEFAULT)
		{
			// fall back to the default swap mode of the format
			endianMode = _getVtxFormatEndianSwapDefault(format & 0x3F);
		}
		// SRC_SEL_X carries the index mode:
		// 0 -> per-vertex index, 1 -> alu divisor 0, 2 -> alu divisor 1, 3 -> per-instance index
		uint32 indexModeSel = 0;
		switch (indexType)
		{
		case 0:
			// index advances per vertex
			indexModeSel = 0;
			break;
		case 1:
			// index advances per instance
			if (aluDivisor == 1)
			{
				// a divisor of 1 has its own encoding and needs no table entry
				indexModeSel = 3;
			}
			else
			{
				cemu_assert_debug(aluDivisor != 0); // divisor should not be zero if instance based index is selected?
				// look the divisor up in the divisor table of the fetch shader (up to two entries)
				uint32 divisorCount = _swapEndianU32(fetchShader->divisorCount);
				uint32 slot = 0;
				while (slot < divisorCount && _swapEndianU32(fetchShader->divisors[slot]) != aluDivisor)
					slot++;
				if (slot < divisorCount)
				{
					// divisor already stored
					indexModeSel = slot != 0 ? 2 : 1;
				}
				else if (divisorCount >= 2)
				{
					cemu_assert_debug(false); // not enough space for additional divisor
				}
				else
				{
					// append the divisor to the table
					indexModeSel = divisorCount != 0 ? 2 : 1;
					fetchShader->divisors[divisorCount] = _swapEndianU32(aluDivisor);
					divisorCount++;
					fetchShader->divisorCount = _swapEndianU32(divisorCount);
				}
			}
			break;
		default:
			cemu_assert_debug(false);
			break;
		}
		// translate the attribute format into fetch format + number format
		const uint32 fetchFormat = rawFormatToFetchFormat[format & 0x3F] & 0x3F;
		uint32 numFormatAll = 0;
		if ((format & 0x800) != 0)
			numFormatAll = 2;
		else if ((format & 0x100) != 0)
			numFormatAll = 1;
		LatteClauseInstruction_VTX fetchInstr;
		fetchInstr.setField_VTX_INST(LatteClauseInstruction_VTX::VTX_INST::_VTX_INST_SEMANTIC);
		fetchInstr.setFieldSEM_SEMANTIC_ID(location&0xFF);
		fetchInstr.setField_BUFFER_ID(bufferId + 0xA0);
		fetchInstr.setField_FETCH_TYPE((LatteConst::VertexFetchType2)indexType);
		fetchInstr.setField_SRC_SEL_X((LatteClauseInstruction_VTX::SRC_SEL)indexModeSel);
		fetchInstr.setField_DATA_FORMAT((LatteConst::VertexFetchFormat)fetchFormat);
		fetchInstr.setField_NUM_FORMAT_ALL((LatteClauseInstruction_VTX::NUM_FORMAT_ALL)numFormatAll);
		fetchInstr.setField_OFFSET(desc.offset);
		if ((format & 0x200) != 0)
			fetchInstr.setField_FORMAT_COMP_ALL(LatteClauseInstruction_VTX::FORMAT_COMP::COMP_SIGNED);
		fetchInstr.setField_ENDIAN_SWAP((LatteConst::VertexFetchEndianMode)endianMode);
		// destSel packs one selector per destination channel, channel 0 in the topmost byte
		fetchInstr.setField_DST_SEL(0, (LatteClauseInstruction_VTX::DST_SEL)((destSel >> 24) & 0x7));
		fetchInstr.setField_DST_SEL(1, (LatteClauseInstruction_VTX::DST_SEL)((destSel >> 16) & 0x7));
		fetchInstr.setField_DST_SEL(2, (LatteClauseInstruction_VTX::DST_SEL)((destSel >> 8) & 0x7));
		fetchInstr.setField_DST_SEL(3, (LatteClauseInstruction_VTX::DST_SEL)((destSel >> 0) & 0x7));
		memcpy(out, &fetchInstr, 16);
		out += 16;
	}
}
// Returns the number of bytes a fetch shader program with attributeCount attributes occupies:
// the padded CF section followed by the fetch clause section.
uint32 GX2CalcFetchShaderSizeEx(uint32 attributeCount, GX2FetchShader_t::FetchShaderType fetchShaderType, uint32 tessellationMode)
{
	cemu_assert_debug(fetchShaderType == GX2FetchShader_t::FetchShaderType::NO_TESSELATION); // other types are todo
	cemu_assert_debug(tessellationMode == 0); // other modes are todo
	const uint32 cfBytes = (uint32)_calcFetchShaderCFCodeSize(attributeCount, fetchShaderType, tessellationMode);
	const uint32 clauseBytes = (uint32)_calcFetchShaderClauseCodeSize(attributeCount, fetchShaderType, tessellationMode);
	return cfBytes + clauseBytes;
}
// Initializes the fetch shader header and generates the fetch shader program into programBufferOut.
// The header is zeroed first; attribCount, shaderPtr and shaderSize are then stored byte-swapped,
// and the divisor table is filled in while the fetch instructions are generated.
// programBufferOut must provide at least GX2CalcFetchShaderSizeEx(attributeCount, ...) bytes.
void GX2InitFetchShaderEx(GX2FetchShader_t* fetchShader, void* programBufferOut, uint32 attributeCount, GX2AttribDescription* attributeDescription, GX2FetchShader_t::FetchShaderType fetchShaderType, uint32 tessellationMode)
{
	cemu_assert_debug(fetchShaderType == GX2FetchShader_t::FetchShaderType::NO_TESSELATION);
	cemu_assert_debug(tessellationMode == 0);
	/*
		Layout of the generated program:
			[CF_PROGRAM]
			[Last CF instruction: 0x0 0x8A000000 (INST_RETURN)]
			[PAD_TO_16_ALIGNMENT]
			[CLAUSES]
	*/
	memset(fetchShader, 0x00, sizeof(GX2FetchShader_t));
	fetchShader->attribCount = _swapEndianU32(attributeCount);
	fetchShader->shaderPtr = (MPTR)_swapEndianU32(memory_getVirtualOffsetFromPointer(programBufferOut));
	uint8* const programBase = (uint8*)programBufferOut;
	const size_t cfBytes = _calcFetchShaderCFCodeSize(attributeCount, fetchShaderType, tessellationMode);
	const size_t clauseBytes = _calcFetchShaderClauseCodeSize(attributeCount, fetchShaderType, tessellationMode);
	// CF section first, fetch clauses directly behind it
	_writeFetchShaderCFCode(programBase, attributeCount, fetchShaderType, tessellationMode);
	_writeFetchShaderVTXCode(fetchShader, programBase + cfBytes, attributeCount, attributeDescription, fetchShaderType, tessellationMode);
	const uint32 programBytes = (uint32)(cfBytes + clauseBytes);
	cemu_assert_debug(programBytes == GX2CalcFetchShaderSizeEx(attributeCount, GX2FetchShader_t::FetchShaderType::NO_TESSELATION, tessellationMode));
	fetchShader->shaderSize = _swapEndianU32(programBytes);
}
// Registers the fetch shader functions of this file with cafeExportRegister under the library name "gx2",
// using the GX2 log type.
void GX2ShaderInit()
{
cafeExportRegister("gx2", GX2CalcFetchShaderSizeEx, LogType::GX2);
cafeExportRegister("gx2", GX2InitFetchShaderEx, LogType::GX2);
}
}

View file

@ -0,0 +1,172 @@
#pragma once
#include "Cafe/HW/Latte/ISA/LatteReg.h"
#include "GX2_Streamout.h"
// Fetch shader header. The /* +0xNN */ markers give the byte offset of each field;
// the total size is fixed at 0x20 by the static_assert below.
// NOTE(review): the plain uint32 / MPTR fields are written and read through _swapEndianU32 by the code
// that uses this struct, so they hold byte-swapped values rather than host-order values.
struct GX2FetchShader_t
{
// only the non-tessellation type is defined so far
enum class FetchShaderType : uint32
{
NO_TESSELATION = 0,
};
/* +0x00 */ betype<FetchShaderType> fetchShaderType;
/* +0x04 */ uint32 _regs[1];
/* +0x08 */ uint32 shaderSize;
/* +0x0C */ MPTR shaderPtr;
/* +0x10 */ uint32 attribCount;
/* +0x14 */ uint32 divisorCount;
/* +0x18 */ uint32 divisors[2];
// returns shaderPtr after byte-swapping it
MPTR GetProgramAddr() const
{
return _swapEndianU32(shaderPtr);
}
};
// layout checks: the header must be exactly 0x20 bytes and the wrapped enum must stay 4 bytes wide
static_assert(sizeof(GX2FetchShader_t) == 0x20);
static_assert(sizeof(betype<GX2FetchShader_t::FetchShaderType>) == 4);
namespace GX2
{
// declaration of the shader module's init function (defined in the matching source file)
void GX2ShaderInit();
}
// code below still needs to be modernized (use betype, enum classes)
// values of the shaderMode field of the shader structs below (see the GX2_SHADER_MODE_* note on GX2VertexShader_t::shaderMode)
#define GX2_SHADER_MODE_UNIFORM_REGISTER 0
#define GX2_SHADER_MODE_UNIFORM_BLOCK 1
#define GX2_SHADER_MODE_GEOMETRY_SHADER 2
#define GX2_SHADER_MODE_COMPUTE_SHADER 3
// Vertex shader header. The /* +0xNNN */ markers give the byte offset of each field;
// the total size is fixed at 0x134 by the static_assert below.
// Fields prefixed with "ukn" are not understood yet.
struct GX2VertexShader_t
{
/* +0x000 */ uint32 regs[52];
/* +0x0D0 */ uint32 shaderSize;
/* +0x0D4 */ MPTR shaderPtr;
/* +0x0D8 */ uint32 shaderMode; // GX2_SHADER_MODE_*
/* +0x0DC */ uint32 uniformBlockCount;
/* +0x0E0 */ MPTR uniformBlockInfo;
/* +0x0E4 */ uint32 uniformVarCount;
/* +0x0E8 */ MPTR uniformVarInfo;
/* +0x0EC */ uint32 uknEC;
/* +0x0F0 */ MPTR uknF0;
/* +0x0F4 */ uint32 uknF4;
/* +0x0F8 */ MPTR uknF8; // each entry has 8 byte?
/* +0x0FC */ uint32 samplerCount;
/* +0x100 */ MPTR samplerInfo;
/* +0x104 */ uint32 attribCount;
/* +0x108 */ MPTR attribInfo;
/* +0x10C */ uint32 ringItemsize; // for GS
/* +0x110 */ uint32 usesStreamOut;
/* +0x114 */ uint32 streamOutVertexStride[GX2_MAX_STREAMOUT_BUFFERS];
/* +0x124 */ GX2RBuffer rBuffer;
// returns the byte-swapped shaderPtr when it is not MPTR_NULL, otherwise the virtual address of rBuffer
MPTR GetProgramAddr() const
{
if (_swapEndianU32(this->shaderPtr) != MPTR_NULL)
return _swapEndianU32(this->shaderPtr);
return this->rBuffer.GetVirtualAddr();
}
};
// layout check: the header must be exactly 0x134 bytes
static_assert(sizeof(GX2VertexShader_t) == 0x134);
// Pixel shader header. The /* +0xNN */ markers give the byte offset of each field;
// the total size is fixed at 0xE8 by the static_assert below.
// Fields prefixed with "ukn" are not understood yet.
typedef struct _GX2PixelShader
{
uint32 regs[41];
// regs:
// 0 ? Used by GPR count API?
// 1 ?
// 2 mmSPI_PS_IN_CONTROL_0
// 3 mmSPI_PS_IN_CONTROL_1
// 4 numInputs
// 5 mmSPI_PS_INPUT_CNTL_0
// ...
// 36 mmSPI_PS_INPUT_CNTL_31
// 37 mmCB_SHADER_MASK
// 38 mmCB_SHADER_CONTROL
// 39 mmDB_SHADER_CONTROL
// 40 mmSPI_INPUT_Z
/* +0xA4 */ uint32 shaderSize;
/* +0xA8 */ MPTR shaderPtr;
/* +0xAC */ uint32 shaderMode;
/* +0xB0 */ uint32 uniformBlockCount;
/* +0xB4 */ MPTR uniformBlockInfo;
/* +0xB8 */ uint32 uniformVarCount;
/* +0xBC */ MPTR uniformVarInfo;
/* +0xC0 */ uint32 uknC0;
/* +0xC4 */ MPTR uknC4;
/* +0xC8 */ uint32 uknC8;
/* +0xCC */ MPTR uknCC;
/* +0xD0 */ uint32 samplerCount;
/* +0xD4 */ MPTR samplerInfo;
/* +0xD8 */ GX2RBuffer rBuffer;
// returns the byte-swapped shaderPtr when it is not MPTR_NULL, otherwise the virtual address of rBuffer
MPTR GetProgramAddr() const
{
if (_swapEndianU32(shaderPtr) != MPTR_NULL)
return _swapEndianU32(shaderPtr);
return rBuffer.GetVirtualAddr();
}
}GX2PixelShader_t;
// layout check: the header must be exactly 0xE8 bytes
static_assert(sizeof(GX2PixelShader_t) == 0xE8);
// Geometry shader header. The /* +0xNN */ markers give the byte offset of each field;
// the total size is fixed at 0xC0 by the static_assert below.
// A geometry shader carries two programs: the geometry program (shaderPtr / rBuffer) and
// a copy program (copyShaderPtr / rBufferCopyProgram).
struct GX2GeometryShader_t
{
// the register block can be accessed either as a raw array or through the named view;
// only the first eight registers have named members so far
union
{
/* +0x00 */ uint32 regs[19];
struct
{
uint32be reg0;
uint32be reg1;
uint32be VGT_GS_MODE;
uint32be reg3;
uint32be reg4;
uint32be reg5;
uint32be reg6;
uint32be reg7;
// todo
}reg;
};
/* +0x4C */ uint32 shaderSize;
/* +0x50 */ MPTR shaderPtr;
/* +0x54 */ uint32 copyShaderSize;
/* +0x58 */ MPTR copyShaderPtr;
/* +0x5C */ uint32 shaderMode;
/* +0x60 */ uint32 uniformBlockCount;
/* +0x64 */ MPTR uniformBlockInfo;
/* +0x68 */ uint32 uniformVarCount;
/* +0x6C */ MPTR uniformVarInfo;
/* +0x70 */ uint32 ukn70;
/* +0x74 */ MPTR ukn74;
/* +0x78 */ uint32 ukn78;
/* +0x7C */ MPTR ukn7C;
/* +0x80 */ uint32 samplerCount;
/* +0x84 */ MPTR samplerInfo;
/* +0x88 */ uint32 ringItemsize;
/* +0x8C */ uint32 useStreamout;
/* +0x90 */ uint32 streamoutStride[GX2_MAX_STREAMOUT_BUFFERS];
/* +0xA0 */ GX2RBuffer rBuffer;
/* +0xB0 */ GX2RBuffer rBufferCopyProgram;
// returns the byte-swapped shaderPtr when it is not MPTR_NULL, otherwise the virtual address of rBuffer
MPTR GetGeometryProgramAddr() const
{
if (_swapEndianU32(shaderPtr) != MPTR_NULL)
return _swapEndianU32(shaderPtr);
return rBuffer.GetVirtualAddr();
}
// returns the byte-swapped copyShaderPtr when it is not MPTR_NULL, otherwise the virtual address of rBufferCopyProgram
MPTR GetCopyProgramAddr() const
{
if (_swapEndianU32(copyShaderPtr) != MPTR_NULL)
return _swapEndianU32(copyShaderPtr);
return rBufferCopyProgram.GetVirtualAddr();
}
};
// layout check: the header must be exactly 0xC0 bytes
static_assert(sizeof(GX2GeometryShader_t) == 0xC0);

View file

@ -0,0 +1,726 @@
#include "Common/precompiled.h"
#include "GX2_State.h"
#include "GX2_Command.h"
#include "Cafe/HW/Latte/ISA/LatteReg.h"
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "Cafe/OS/common/OSCommon.h"
namespace GX2
{
using namespace Latte;
// Builds the alpha test register pair in `reg` without submitting anything.
// alphaTestEnable: any non-zero value enables the alpha test
// alphaFunc:       comparison function stored in SX_ALPHA_TEST_CONTROL
// alphaRef:        reference value stored in SX_ALPHA_REF
void GX2InitAlphaTestReg(GX2AlphaTestReg* reg, uint32 alphaTestEnable, GX2_ALPHAFUNC alphaFunc, float alphaRef)
{
	// value-initialize the temporaries (like the other register builders in this file)
	// so that every bit which is not set explicitly below is zero
	Latte::LATTE_SX_ALPHA_TEST_CONTROL tmpRegCtrl{};
	tmpRegCtrl.set_ALPHA_FUNC(alphaFunc);
	tmpRegCtrl.set_ALPHA_TEST_ENABLE(alphaTestEnable != 0);
	reg->regAlphaTestControl = tmpRegCtrl;
	Latte::LATTE_SX_ALPHA_REF tmpRegRef{};
	tmpRegRef.set_ALPHA_TEST_REF(alphaRef);
	reg->regAlphaTestRef = tmpRegRef;
}
// Submits the alpha test registers stored in `reg`.
// Two IT_SET_CONTEXT_REG packets of three values each (header, register index, value) are written,
// matching the 3 + 3 reservation: SX_ALPHA_TEST_CONTROL first, then SX_ALPHA_REF.
// Register indices are passed relative to 0xA000.
void GX2SetAlphaTestReg(GX2AlphaTestReg* reg)
{
GX2ReserveCmdSpace(3 + 3);
gx2WriteGather_submit(
pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::SX_ALPHA_TEST_CONTROL - 0xA000,
reg->regAlphaTestControl,
pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::SX_ALPHA_REF - 0xA000,
reg->regAlphaTestRef);
}
// Convenience wrapper: builds the alpha test registers in a temporary and submits them right away.
void GX2SetAlphaTest(uint32 alphaTestEnable, GX2_ALPHAFUNC alphaFunc, float alphaRef)
{
	GX2AlphaTestReg alphaTestReg;
	GX2InitAlphaTestReg(&alphaTestReg, alphaTestEnable, alphaFunc, alphaRef);
	GX2SetAlphaTestReg(&alphaTestReg);
}
// Builds the CB_COLOR_CONTROL value in `reg` without submitting anything.
// logicOp:           stored in the ROP field
// blendMask:         stored in the BLEND_MASK field
// multiwriteEnable:  any non-zero value enables multiwrite
// colorBufferEnable: zero selects special op DISABLE, any other value selects NORMAL
void GX2InitColorControlReg(GX2ColorControlReg* reg, GX2_LOGICOP logicOp, uint32 blendMask, uint32 multiwriteEnable, uint32 colorBufferEnable)
{
	// value-initialize the temporary (like the other register builders in this file)
	// so that every bit which is not set explicitly below is zero
	Latte::LATTE_CB_COLOR_CONTROL colorControlReg2{};
	colorControlReg2.set_MULTIWRITE_ENABLE(multiwriteEnable != 0);
	if (colorBufferEnable == 0)
		colorControlReg2.set_SPECIAL_OP(Latte::LATTE_CB_COLOR_CONTROL::E_SPECIALOP::DISABLE);
	else
		colorControlReg2.set_SPECIAL_OP(Latte::LATTE_CB_COLOR_CONTROL::E_SPECIALOP::NORMAL);
	colorControlReg2.set_BLEND_MASK(blendMask);
	colorControlReg2.set_ROP(logicOp);
	reg->reg = colorControlReg2;
}
// Submits the color control register stored in `reg` as a single IT_SET_CONTEXT_REG packet
// (header, CB_COLOR_CONTROL index relative to 0xA000, value).
void GX2SetColorControlReg(GX2ColorControlReg* reg)
{
GX2ReserveCmdSpace(3);
gx2WriteGather_submit(
pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::CB_COLOR_CONTROL - 0xA000,
reg->reg);
}
// Convenience wrapper: builds the color control register in a temporary and submits it right away.
void GX2SetColorControl(GX2_LOGICOP logicOp, uint32 blendMask, uint32 multiwriteEnable, uint32 colorBufferEnable)
{
	GX2ColorControlReg tmpReg;
	GX2InitColorControlReg(&tmpReg, logicOp, blendMask, multiwriteEnable, colorBufferEnable);
	GX2SetColorControlReg(&tmpReg);
}
// Builds the PA_SU_SC_MODE_CNTL value in `reg` without submitting anything.
// All uint32 flag parameters (cullFront, cullBack, the three offset enables) are evaluated by their lowest bit only.
void GX2InitPolygonControlReg(GX2PolygonControlReg* reg,
	Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE frontFace,
	uint32 cullFront,
	uint32 cullBack,
	Latte::LATTE_PA_SU_SC_MODE_CNTL::E_POLYGONMODE usePolygonMode,
	Latte::LATTE_PA_SU_SC_MODE_CNTL::E_PTYPE polyModeFront,
	Latte::LATTE_PA_SU_SC_MODE_CNTL::E_PTYPE polyModeBack,
	uint32 polygonOffsetFrontEnable,
	uint32 polygonOffsetBackEnable,
	uint32 paraOffsetEnable)
{
	// value-initialize the temporary (like the other register builders in this file)
	// so that every bit which is not set explicitly below is zero
	Latte::LATTE_PA_SU_SC_MODE_CNTL v{};
	v.set_FRONT_FACE(frontFace);
	v.set_CULL_FRONT((cullFront & 1) != 0);
	v.set_CULL_BACK((cullBack & 1) != 0);
	v.set_POLYGON_MODE(usePolygonMode);
	v.set_FRONT_POLY_MODE(polyModeFront);
	v.set_BACK_POLY_MODE(polyModeBack);
	v.set_OFFSET_PARA_ENABLED((paraOffsetEnable & 1) != 0);
	v.set_OFFSET_FRONT_ENABLED((polygonOffsetFrontEnable & 1) != 0);
	v.set_OFFSET_BACK_ENABLED((polygonOffsetBackEnable & 1) != 0);
	reg->reg = v;
}
// Submits the polygon control register stored in `reg` as a single IT_SET_CONTEXT_REG packet
// (header, PA_SU_SC_MODE_CNTL index relative to 0xA000, value).
void GX2SetPolygonControlReg(GX2PolygonControlReg* reg)
{
GX2ReserveCmdSpace(3);
gx2WriteGather_submit(
pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::PA_SU_SC_MODE_CNTL - 0xA000,
reg->reg);
}
// Convenience wrapper: builds the polygon control register in a temporary and submits it right away.
// Parameters are forwarded unchanged to GX2InitPolygonControlReg.
void GX2SetPolygonControl(Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE frontFace,
	uint32 cullFront,
	uint32 cullBack,
	Latte::LATTE_PA_SU_SC_MODE_CNTL::E_POLYGONMODE usePolygonMode,
	Latte::LATTE_PA_SU_SC_MODE_CNTL::E_PTYPE polyModeFront,
	Latte::LATTE_PA_SU_SC_MODE_CNTL::E_PTYPE polyModeBack,
	uint32 polygonOffsetFrontEnable,
	uint32 polygonOffsetBackEnable,
	uint32 paraOffsetEnable)
{
	GX2PolygonControlReg polygonControlReg{};
	GX2InitPolygonControlReg(&polygonControlReg,
		frontFace, cullFront, cullBack,
		usePolygonMode, polyModeFront, polyModeBack,
		polygonOffsetFrontEnable, polygonOffsetBackEnable, paraOffsetEnable);
	GX2SetPolygonControlReg(&polygonControlReg);
}
// Sets front face and culling only: polygon mode is UKN0, both poly types are POINTS
// and all polygon offset enables are off.
void GX2SetCullOnlyControl(Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE frontFace, uint32 cullFront, uint32 cullBack)
{
	using ModeCntl = Latte::LATTE_PA_SU_SC_MODE_CNTL;
	GX2PolygonControlReg cullOnlyReg{};
	GX2InitPolygonControlReg(&cullOnlyReg,
		frontFace, cullFront, cullBack,
		ModeCntl::E_POLYGONMODE::UKN0, ModeCntl::E_PTYPE::POINTS, ModeCntl::E_PTYPE::POINTS,
		0, 0, 0);
	GX2SetPolygonControlReg(&cullOnlyReg);
}
// Builds the five polygon offset registers in `reg` without submitting anything.
// Both scale values are multiplied by 16 before being stored; offsets and clamp are stored as passed.
void GX2InitPolygonOffsetReg(GX2PolygonOffsetReg* reg, float frontOffset, float frontScale, float backOffset, float backScale, float clampOffset)
{
	// scaling by a power of two is exact, so single precision gives the same result as the wider multiply
	const float frontScaleReg = frontScale * 16.0f;
	const float backScaleReg = backScale * 16.0f;
	reg->regFrontScale = Latte::LATTE_PA_SU_POLY_OFFSET_FRONT_SCALE().set_SCALE(frontScaleReg);
	reg->regFrontOffset = Latte::LATTE_PA_SU_POLY_OFFSET_FRONT_OFFSET().set_OFFSET(frontOffset);
	reg->regBackScale = Latte::LATTE_PA_SU_POLY_OFFSET_BACK_SCALE().set_SCALE(backScaleReg);
	reg->regBackOffset = Latte::LATTE_PA_SU_POLY_OFFSET_BACK_OFFSET().set_OFFSET(backOffset);
	reg->regClamp = Latte::LATTE_PA_SU_POLY_OFFSET_CLAMP().set_CLAMP(clampOffset);
}
// Submits the polygon offset registers stored in `reg`.
// The first IT_SET_CONTEXT_REG packet writes four consecutive values starting at PA_SU_POLY_OFFSET_FRONT_SCALE
// (front scale, front offset, back scale, back offset; six values including header and index),
// the second packet writes PA_SU_POLY_OFFSET_CLAMP (three values). This matches the 6 + 3 reservation.
void GX2SetPolygonOffsetReg(GX2PolygonOffsetReg* reg)
{
GX2ReserveCmdSpace(6 + 3);
gx2WriteGather_submit(
pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 4),
Latte::REGADDR::PA_SU_POLY_OFFSET_FRONT_SCALE - 0xA000,
reg->regFrontScale,
reg->regFrontOffset,
reg->regBackScale,
reg->regBackOffset,
pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::PA_SU_POLY_OFFSET_CLAMP - 0xA000,
reg->regClamp);
}
void GX2SetPolygonOffset(float frontOffset, float frontScale, float backOffset, float backScale, float clampOffset)
{
GX2PolygonOffsetReg tmpReg;
GX2InitPolygonOffsetReg(&tmpReg, frontOffset, frontScale, backOffset, backScale, clampOffset);
GX2SetPolygonOffsetReg(&tmpReg);
}
void GX2SetRasterizerClipControlEx(bool enableRasterizer, bool enableZClip, bool enableHalfZ)
{
GX2ReserveCmdSpace(3);
//if (enableHalfZ)
//{
// // Smash has a bug where it enables half space clipping during streamout drawcalls and shadowing and then doesn't turn it off until the next GX2SetRasterizerClipControl call
// // this leads to some stuff being rendered at the wrong z-plane (e.g. shields behind characters) if the game's default depth range -1 to 1 isn't supported (on OpenGL only Nvidia's glDepthRangedNV allows unclamped values)
// uint64 titleId = gameMeta_getTitleId();
// if (titleId == 0x0005000010144F00ULL ||
// titleId == 0x0005000010145000ULL ||
// titleId == 0x0005000010110E00ULL)
// {
// // force disable half space clipping
// if (g_renderer && g_renderer->GetType() == RendererAPI::OpenGL && LatteGPUState.glVendor != GLVENDOR_NVIDIA)
// enableHalfZ = false;
// }
//}
Latte::LATTE_PA_CL_CLIP_CNTL reg{};
reg.set_ZCLIP_NEAR_DISABLE(!enableZClip).set_ZCLIP_FAR_DISABLE(!enableZClip);
reg.set_DX_RASTERIZATION_KILL(!enableRasterizer);
reg.set_DX_CLIP_SPACE_DEF(enableHalfZ);
reg.set_DX_LINEAR_ATTR_CLIP_ENA(true);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::PA_CL_CLIP_CNTL - 0xA000,
reg);
}
// Same as GX2SetRasterizerClipControlEx with half-z clip space disabled.
void GX2SetRasterizerClipControl(bool enableRasterizer, bool enableZClip)
{
	const bool enableHalfZ = false;
	GX2SetRasterizerClipControlEx(enableRasterizer, enableZClip, enableHalfZ);
}
// Forwards all three parameters unchanged to GX2SetRasterizerClipControlEx.
void GX2SetRasterizerClipControlHalfZ(bool enableRasterizer, bool enableZClip, bool enableHalfZ)
{
GX2SetRasterizerClipControlEx(enableRasterizer, enableZClip, enableHalfZ);
}
// Builds the six viewport scale/offset registers in `viewportReg` without submitting anything.
// x/y scale are half the extent (y negated), x/y offset are the viewport center,
// z scale is half the depth range and z offset is the middle of the depth range.
void GX2InitViewportReg(GX2ViewportReg* viewportReg, float x, float y, float width, float height, float nearZ, float farZ)
{
	// todo: set clipping registers and zMin/zMax registers
	const float halfWidth = width * 0.5f;
	const float halfHeight = height * 0.5f;
	// horizontal
	viewportReg->xScale = Latte::LATTE_PA_CL_VPORT_XSCALE().set_SCALE(halfWidth);
	viewportReg->xOffset = Latte::LATTE_PA_CL_VPORT_XOFFSET().set_OFFSET(x + halfWidth);
	// vertical (scale is negative)
	viewportReg->yScale = Latte::LATTE_PA_CL_VPORT_YSCALE().set_SCALE(height * -0.5f);
	viewportReg->yOffset = Latte::LATTE_PA_CL_VPORT_YOFFSET().set_OFFSET(y + halfHeight);
	// depth
	viewportReg->zScale = Latte::LATTE_PA_CL_VPORT_ZSCALE().set_SCALE((farZ - nearZ) * 0.5f);
	viewportReg->zOffset = Latte::LATTE_PA_CL_VPORT_ZOFFSET().set_OFFSET((nearZ + farZ) * 0.5f);
}
// Submits the viewport registers stored in `viewportReg`.
// Calls GX2WriteGather_checkAndInsertWrapAroundMark first, then writes a single IT_SET_CONTEXT_REG packet
// with six consecutive values starting at PA_CL_VPORT_XSCALE (x scale/offset, y scale/offset, z scale/offset).
// Header + register index + six values match the 2 + 6 reservation.
void GX2SetViewportReg(GX2ViewportReg* viewportReg)
{
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
GX2ReserveCmdSpace(2 + 6);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 6),
Latte::REGADDR::PA_CL_VPORT_XSCALE - 0xA000,
viewportReg->xScale, viewportReg->xOffset,
viewportReg->yScale, viewportReg->yOffset,
viewportReg->zScale, viewportReg->zOffset);
}
void GX2SetViewport(float x, float y, float width, float height, float nearZ, float farZ)
{
GX2ViewportReg viewportReg;
GX2InitViewportReg(&viewportReg, x, y, width, height, nearZ, farZ);
GX2SetViewportReg(&viewportReg);
}
// Builds the generic scissor registers in `scissorReg` without submitting anything.
// The rectangle is stored as top-left (x, y) and bottom-right (x + width, y + height);
// every coordinate is clamped to 8192. WINDOW_OFFSET_DISABLE is always set.
void GX2InitScissorReg(GX2ScissorReg* scissorReg, uint32 x, uint32 y, uint32 width, uint32 height)
{
	const uint32 tlx = std::min(x, 8192u);
	const uint32 tly = std::min(y, 8192u);
	// widen before adding: in 32 bit arithmetic x + width (or y + height) could wrap around
	// and yield a small bottom-right coordinate instead of the clamped maximum
	const uint32 brx = (uint32)std::min<uint64>((uint64)x + (uint64)width, 8192);
	const uint32 bry = (uint32)std::min<uint64>((uint64)y + (uint64)height, 8192);
	scissorReg->scissorTL = Latte::LATTE_PA_SC_GENERIC_SCISSOR_TL().set_TL_X(tlx).set_TL_Y(tly).set_WINDOW_OFFSET_DISABLE(true);
	scissorReg->scissorBR = Latte::LATTE_PA_SC_GENERIC_SCISSOR_BR().set_BR_X(brx).set_BR_Y(bry);
}
// Submits the scissor registers stored in `scissorReg` as a single IT_SET_CONTEXT_REG packet
// writing two consecutive values starting at PA_SC_GENERIC_SCISSOR_TL (top-left, bottom-right).
void GX2SetScissorReg(GX2ScissorReg* scissorReg)
{
GX2ReserveCmdSpace(4);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 2),
Latte::REGADDR::PA_SC_GENERIC_SCISSOR_TL - 0xA000,
scissorReg->scissorTL, scissorReg->scissorBR);
}
// Decodes a scissor register pair back into origin and size.
// x / y receive the top-left coordinate, width / height the difference between bottom-right and top-left.
void GX2GetScissorReg(GX2ScissorReg* scissorReg, uint32be* x, uint32be* y, uint32be* width, uint32be* height)
{
	auto topLeft = scissorReg->scissorTL.value();
	auto bottomRight = scissorReg->scissorBR.value();
	*x = topLeft.get_TL_X();
	*y = topLeft.get_TL_Y();
	*width = bottomRight.get_BR_X() - topLeft.get_TL_X();
	*height = bottomRight.get_BR_Y() - topLeft.get_TL_Y();
}
// Convenience wrapper: builds the scissor registers in a temporary and submits them right away.
void GX2SetScissor(uint32 x, uint32 y, uint32 width, uint32 height)
{
	GX2ScissorReg tmpReg;
	GX2InitScissorReg(&tmpReg, x, y, width, height);
	GX2SetScissorReg(&tmpReg);
}
void GX2SetDepthOnlyControl(bool depthTestEnable, bool depthWriteEnable, LATTE_DB_DEPTH_CONTROL::E_ZFUNC depthFunction)
{
// disables any currently set stencil test
GX2ReserveCmdSpace(3);
Latte::LATTE_DB_DEPTH_CONTROL reg{};
reg.set_Z_ENABLE(depthTestEnable);
reg.set_Z_WRITE_ENABLE(depthWriteEnable);
reg.set_Z_FUNC(depthFunction);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::DB_DEPTH_CONTROL - 0xA000,
reg);
}
void GX2SetDepthStencilControl(
bool depthTestEnable, bool depthWriteEnable, LATTE_DB_DEPTH_CONTROL::E_ZFUNC depthFunction,
bool stencilTestEnable, bool backStencilTestEnable,
LATTE_DB_DEPTH_CONTROL::E_STENCILFUNC frontStencilFunction,
LATTE_DB_DEPTH_CONTROL::E_STENCILACTION frontStencilZPass, LATTE_DB_DEPTH_CONTROL::E_STENCILACTION frontStencilZFail, LATTE_DB_DEPTH_CONTROL::E_STENCILACTION frontStencilFail,
LATTE_DB_DEPTH_CONTROL::E_STENCILFUNC backStencilFunction,
LATTE_DB_DEPTH_CONTROL::E_STENCILACTION backStencilZPass, LATTE_DB_DEPTH_CONTROL::E_STENCILACTION backStencilZFail, LATTE_DB_DEPTH_CONTROL::E_STENCILACTION backStencilFail
)
{
GX2ReserveCmdSpace(3);
Latte::LATTE_DB_DEPTH_CONTROL reg{};
reg.set_Z_ENABLE(depthTestEnable).set_Z_WRITE_ENABLE(depthWriteEnable).set_Z_FUNC(depthFunction);
reg.set_STENCIL_ENABLE(stencilTestEnable).set_BACK_STENCIL_ENABLE(backStencilTestEnable);
reg.set_STENCIL_FUNC_F(frontStencilFunction).set_STENCIL_FUNC_B(backStencilFunction);
reg.set_STENCIL_ZPASS_F(frontStencilZPass).set_STENCIL_ZFAIL_F(frontStencilZFail).set_STENCIL_FAIL_F(frontStencilFail);
reg.set_STENCIL_ZPASS_B(backStencilZPass).set_STENCIL_ZFAIL_B(backStencilZFail).set_STENCIL_FAIL_B(backStencilFail);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::DB_DEPTH_CONTROL - 0xA000,
reg);
}
// Builds the DB_DEPTH_CONTROL value in `depthStencilControlReg` without submitting anything.
// Front-face parameters map to the *_F fields, back-face parameters to the *_B fields.
void GX2InitDepthStencilControlReg(
	GX2DepthStencilControlReg* depthStencilControlReg,
	bool depthTestEnable, bool depthWriteEnable, LATTE_DB_DEPTH_CONTROL::E_ZFUNC depthFunction,
	bool stencilTestEnable, bool backStencilTestEnable,
	LATTE_DB_DEPTH_CONTROL::E_STENCILFUNC frontStencilFunction,
	LATTE_DB_DEPTH_CONTROL::E_STENCILACTION frontStencilZPass, LATTE_DB_DEPTH_CONTROL::E_STENCILACTION frontStencilZFail, LATTE_DB_DEPTH_CONTROL::E_STENCILACTION frontStencilFail,
	LATTE_DB_DEPTH_CONTROL::E_STENCILFUNC backStencilFunction,
	LATTE_DB_DEPTH_CONTROL::E_STENCILACTION backStencilZPass, LATTE_DB_DEPTH_CONTROL::E_STENCILACTION backStencilZFail, LATTE_DB_DEPTH_CONTROL::E_STENCILACTION backStencilFail)
{
	Latte::LATTE_DB_DEPTH_CONTROL depthControl{};
	// depth
	depthControl.set_Z_ENABLE(depthTestEnable);
	depthControl.set_Z_WRITE_ENABLE(depthWriteEnable);
	depthControl.set_Z_FUNC(depthFunction);
	// stencil enables and compare functions
	depthControl.set_STENCIL_ENABLE(stencilTestEnable);
	depthControl.set_BACK_STENCIL_ENABLE(backStencilTestEnable);
	depthControl.set_STENCIL_FUNC_F(frontStencilFunction);
	depthControl.set_STENCIL_FUNC_B(backStencilFunction);
	// front face stencil actions
	depthControl.set_STENCIL_ZPASS_F(frontStencilZPass);
	depthControl.set_STENCIL_ZFAIL_F(frontStencilZFail);
	depthControl.set_STENCIL_FAIL_F(frontStencilFail);
	// back face stencil actions
	depthControl.set_STENCIL_ZPASS_B(backStencilZPass);
	depthControl.set_STENCIL_ZFAIL_B(backStencilZFail);
	depthControl.set_STENCIL_FAIL_B(backStencilFail);
	depthStencilControlReg->reg = depthControl;
}
// Submits the depth/stencil control register stored in `depthStencilControlReg` as a single
// IT_SET_CONTEXT_REG packet (header, DB_DEPTH_CONTROL index relative to 0xA000, value).
void GX2SetDepthStencilControlReg(GX2DepthStencilControlReg* depthStencilControlReg)
{
GX2ReserveCmdSpace(3);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::DB_DEPTH_CONTROL - 0xA000,
depthStencilControlReg->reg);
}
// Decodes a depth/stencil control register into its individual settings.
// Every output pointer is written unconditionally; enum fields are stored as their uint32 value.
void GX2GetDepthStencilControlReg(
	GX2DepthStencilControlReg* depthStencilControlReg,
	uint32be* depthTestEnable, uint32be* depthWriteEnable, uint32be* depthFunction,
	uint32be* stencilTestEnable, uint32be* backStencilTestEnable,
	uint32be* frontStencilFunction,
	uint32be* frontStencilZPass, uint32be* frontStencilZFail, uint32be* frontStencilFail,
	uint32be* backStencilFunction,
	uint32be* backStencilZPass, uint32be* backStencilZFail, uint32be* backStencilFail)
{
	// used by Hyrule Warriors
	auto depthControl = depthStencilControlReg->reg.value();
	// depth
	*depthTestEnable = depthControl.get_Z_ENABLE();
	*depthWriteEnable = depthControl.get_Z_WRITE_ENABLE();
	*depthFunction = (uint32)depthControl.get_Z_FUNC();
	// stencil enables and compare functions
	*stencilTestEnable = depthControl.get_STENCIL_ENABLE();
	*backStencilTestEnable = depthControl.get_BACK_STENCIL_ENABLE();
	*frontStencilFunction = (uint32)depthControl.get_STENCIL_FUNC_F();
	*backStencilFunction = (uint32)depthControl.get_STENCIL_FUNC_B();
	// front face stencil actions
	*frontStencilZPass = (uint32)depthControl.get_STENCIL_ZPASS_F();
	*frontStencilZFail = (uint32)depthControl.get_STENCIL_ZFAIL_F();
	*frontStencilFail = (uint32)depthControl.get_STENCIL_FAIL_F();
	// back face stencil actions
	*backStencilZPass = (uint32)depthControl.get_STENCIL_ZPASS_B();
	*backStencilZFail = (uint32)depthControl.get_STENCIL_ZFAIL_B();
	*backStencilFail = (uint32)depthControl.get_STENCIL_FAIL_B();
}
// Builds the front and back stencil reference/mask registers in `stencilMaskReg` without submitting anything.
// For each face the reference value, compare mask and write mask are stored.
void GX2InitStencilMaskReg(GX2StencilMaskReg* stencilMaskReg, uint8 compareMaskFront, uint8 writeMaskFront, uint8 refFront, uint8 compareMaskBack, uint8 writeMaskBack, uint8 refBack)
{
	// front face
	LATTE_DB_STENCILREFMASK frontReg{};
	frontReg.set_STENCILREF_F(refFront);
	frontReg.set_STENCILMASK_F(compareMaskFront);
	frontReg.set_STENCILWRITEMASK_F(writeMaskFront);
	stencilMaskReg->stencilRefMaskFrontReg = frontReg;
	// back face
	LATTE_DB_STENCILREFMASK_BF backReg{};
	backReg.set_STENCILREF_B(refBack);
	backReg.set_STENCILMASK_B(compareMaskBack);
	backReg.set_STENCILWRITEMASK_B(writeMaskBack);
	stencilMaskReg->stencilRefMaskBackReg = backReg;
}
// Builds and submits the front and back stencil reference/mask registers.
// Two IT_SET_CONTEXT_REG packets of three values each are written (matching the 3 + 3 reservation):
// DB_STENCILREFMASK first, then DB_STENCILREFMASK_BF.
void GX2SetStencilMask(uint8 compareMaskFront, uint8 writeMaskFront, uint8 refFront, uint8 compareMaskBack, uint8 writeMaskBack, uint8 refBack)
{
	GX2ReserveCmdSpace(3 + 3);
	// value-initialize the temporaries, mirroring the value-initialized temporaries used by GX2InitStencilMaskReg,
	// so that every bit which is not set explicitly below is zero
	LATTE_DB_STENCILREFMASK frontReg{};
	frontReg.set_STENCILREF_F(refFront).set_STENCILMASK_F(compareMaskFront).set_STENCILWRITEMASK_F(writeMaskFront);
	LATTE_DB_STENCILREFMASK_BF backReg{};
	backReg.set_STENCILREF_B(refBack).set_STENCILMASK_B(compareMaskBack).set_STENCILWRITEMASK_B(writeMaskBack);
	gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
		REGADDR::DB_STENCILREFMASK - 0xA000,
		frontReg,
		pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
		REGADDR::DB_STENCILREFMASK_BF - 0xA000,
		backReg);
}
// Submits the stencil reference/mask registers stored in `stencilMaskReg`.
// Two IT_SET_CONTEXT_REG packets of three values each are written (matching the 3 + 3 reservation):
// DB_STENCILREFMASK (front) first, then DB_STENCILREFMASK_BF (back).
void GX2SetStencilMaskReg(GX2StencilMaskReg* stencilMaskReg)
{
GX2ReserveCmdSpace(3 + 3);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
REGADDR::DB_STENCILREFMASK - 0xA000,
stencilMaskReg->stencilRefMaskFrontReg,
pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
REGADDR::DB_STENCILREFMASK_BF - 0xA000,
stencilMaskReg->stencilRefMaskBackReg);
}
// Builds and submits VGT_MULTI_PRIM_IB_RESET_INDX with the given restart index.
void GX2SetPrimitiveRestartIndex(uint32 restartIndex)
{
	GX2ReserveCmdSpace(3);
	Latte::LATTE_VGT_MULTI_PRIM_IB_RESET_INDX resetIndexReg{};
	resetIndexReg.set_RESTART_INDEX(restartIndex);
	gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
		Latte::REGADDR::VGT_MULTI_PRIM_IB_RESET_INDX - 0xA000,
		resetIndexReg);
}
// Builds the CB_TARGET_MASK value in `reg` without submitting anything.
// Each of the eight render targets contributes the low four bits of its channel mask;
// target N occupies bits [N*4, N*4+3] of the combined mask.
void GX2InitTargetChannelMasksReg(GX2TargetChannelMaskReg* reg, GX2_CHANNELMASK t0, GX2_CHANNELMASK t1, GX2_CHANNELMASK t2, GX2_CHANNELMASK t3, GX2_CHANNELMASK t4, GX2_CHANNELMASK t5, GX2_CHANNELMASK t6, GX2_CHANNELMASK t7)
{
	const GX2_CHANNELMASK targets[8] = { t0, t1, t2, t3, t4, t5, t6, t7 };
	uint32 targetMask = 0;
	for (uint32 targetIndex = 0; targetIndex < 8; targetIndex++)
		targetMask |= ((targets[targetIndex] & 0xF) << (targetIndex * 4));
	// value-initialize the temporary (like the other register builders in this file)
	// so that every bit which is not set explicitly below is zero
	Latte::LATTE_CB_TARGET_MASK r{};
	r.set_MASK(targetMask);
	reg->reg = r;
}
// Submits the target channel mask register stored in `reg` as a single IT_SET_CONTEXT_REG packet
// (header, CB_TARGET_MASK index relative to 0xA000, value).
void GX2SetTargetChannelMasksReg(GX2TargetChannelMaskReg* reg)
{
GX2ReserveCmdSpace(3);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::CB_TARGET_MASK - 0xA000,
reg->reg);
}
// Convenience wrapper: builds the target channel mask register in a temporary and submits it right away.
void GX2SetTargetChannelMasks(GX2_CHANNELMASK t0, GX2_CHANNELMASK t1, GX2_CHANNELMASK t2, GX2_CHANNELMASK t3, GX2_CHANNELMASK t4, GX2_CHANNELMASK t5, GX2_CHANNELMASK t6, GX2_CHANNELMASK t7)
{
	GX2TargetChannelMaskReg channelMaskReg;
	GX2InitTargetChannelMasksReg(&channelMaskReg, t0, t1, t2, t3, t4, t5, t6, t7);
	GX2SetTargetChannelMasksReg(&channelMaskReg);
}
// the getter below writes the masks through 4-byte wide output values
static_assert(sizeof(GX2_CHANNELMASK) == 4);
// Decodes a target channel mask register into the eight per-target masks.
// Output N receives bits [N*4, N*4+3] of the combined mask.
void GX2GetTargetChannelMasksReg(GX2TargetChannelMaskReg* reg, betype<GX2_CHANNELMASK>* t0, betype<GX2_CHANNELMASK>* t1, betype<GX2_CHANNELMASK>* t2, betype<GX2_CHANNELMASK>* t3,
	betype<GX2_CHANNELMASK>* t4, betype<GX2_CHANNELMASK>* t5, betype<GX2_CHANNELMASK>* t6, betype<GX2_CHANNELMASK>* t7)
{
	const uint32 combinedMask = reg->reg.value().get_MASK();
	betype<GX2_CHANNELMASK>* const outputs[8] = { t0, t1, t2, t3, t4, t5, t6, t7 };
	for (uint32 targetIndex = 0; targetIndex < 8; targetIndex++)
		*outputs[targetIndex] = (combinedMask >> (targetIndex * 4)) & 0xF;
}
// Builds the blend control value for one render target in `reg` without submitting anything.
// renderTargetIndex:  stored in reg->index; selects which CB_BLENDn_CONTROL register is written on submit
// separateAlphaBlend: any non-zero value enables separate alpha blending
// The color and alpha factors / combine functions are stored in the matching COLOR_* / ALPHA_* fields.
void GX2InitBlendControlReg(GX2BlendControlReg* reg, uint32 renderTargetIndex, GX2_BLENDFACTOR colorSrcFactor, GX2_BLENDFACTOR colorDstFactor, GX2_BLENDFUNC colorCombineFunc, uint32 separateAlphaBlend, GX2_BLENDFACTOR alphaSrcFactor, GX2_BLENDFACTOR alphaDstFactor, GX2_BLENDFUNC alphaCombineFunc)
{
	// value-initialize the temporary (like the other register builders in this file)
	// so that every bit which is not set explicitly below is zero
	Latte::LATTE_CB_BLENDN_CONTROL tmpReg{};
	tmpReg.set_COLOR_SRCBLEND(colorSrcFactor);
	tmpReg.set_COLOR_DSTBLEND(colorDstFactor);
	tmpReg.set_COLOR_COMB_FCN(colorCombineFunc);
	tmpReg.set_ALPHA_SRCBLEND(alphaSrcFactor);
	tmpReg.set_ALPHA_DSTBLEND(alphaDstFactor);
	tmpReg.set_ALPHA_COMB_FCN(alphaCombineFunc);
	tmpReg.set_SEPARATE_ALPHA_BLEND(separateAlphaBlend != 0);
	reg->index = renderTargetIndex;
	reg->reg = tmpReg;
}
// Submits the blend control register stored in `reg` as a single IT_SET_CONTEXT_REG packet.
// The target register is CB_BLEND0_CONTROL plus reg->index, passed relative to 0xA000.
void GX2SetBlendControlReg(GX2BlendControlReg* reg)
{
GX2ReserveCmdSpace(3);
gx2WriteGather_submit(
pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
(Latte::REGADDR::CB_BLEND0_CONTROL + (uint32)reg->index) - 0xA000,
reg->reg
);
}
// Convenience wrapper: builds the blend control register for one render target in a temporary and submits it right away.
void GX2SetBlendControl(uint32 renderTargetIndex, GX2_BLENDFACTOR colorSrcFactor, GX2_BLENDFACTOR colorDstFactor, GX2_BLENDFUNC colorCombineFunc, uint32 separateAlphaBlend, GX2_BLENDFACTOR alphaSrcFactor, GX2_BLENDFACTOR alphaDstFactor, GX2_BLENDFUNC alphaCombineFunc)
{
	GX2BlendControlReg blendControlReg;
	GX2InitBlendControlReg(&blendControlReg, renderTargetIndex,
		colorSrcFactor, colorDstFactor, colorCombineFunc,
		separateAlphaBlend,
		alphaSrcFactor, alphaDstFactor, alphaCombineFunc);
	GX2SetBlendControlReg(&blendControlReg);
}
// Builds the four blend constant color registers in `reg` without submitting anything.
// Each component is stored in its own register (CB_BLEND_RED / GREEN / BLUE / ALPHA).
void GX2InitBlendConstantColorReg(GX2BlendConstantColorReg* reg, float red, float green, float blue, float alpha)
{
reg->regRed = Latte::LATTE_CB_BLEND_RED().set_RED(red);
reg->regGreen = Latte::LATTE_CB_BLEND_GREEN().set_GREEN(green);
reg->regBlue = Latte::LATTE_CB_BLEND_BLUE().set_BLUE(blue);
reg->regAlpha = Latte::LATTE_CB_BLEND_ALPHA().set_ALPHA(alpha);
}
// Submits the blend constant color registers stored in `reg` as a single IT_SET_CONTEXT_REG packet
// writing four consecutive values starting at CB_BLEND_RED (red, green, blue, alpha).
// Header + register index + four values match the reservation of 6.
void GX2SetBlendConstantColorReg(GX2BlendConstantColorReg* reg)
{
GX2ReserveCmdSpace(6);
gx2WriteGather_submit(
pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 4),
Latte::REGADDR::CB_BLEND_RED - 0xA000,
reg->regRed,
reg->regGreen,
reg->regBlue,
reg->regAlpha
);
}
void GX2SetBlendConstantColor(float red, float green, float blue, float alpha)
{
GX2BlendConstantColorReg tmpReg;
GX2InitBlendConstantColorReg(&tmpReg, red, green, blue, alpha);
GX2SetBlendConstantColorReg(&tmpReg);
}
// Intentionally empty stub: `hiStencilInfo` is neither read nor written.
void GX2InitHiStencilInfoRegs(GX2HiStencilInfoReg* hiStencilInfo)
{
// seen in Color Splash
// but the game never calls GX2SetHiStencilInfo thus this has no effect
}
// Builds the PA_SU_POINT_SIZE value in `reg` without submitting anything.
// width / height are multiplied by 8, truncated to an integer and limited to 0xFFFF before being stored.
// Negative sizes are reported via cemu_assert_suspicious and stored as 0.
void GX2InitPointSizeReg(GX2PointSizeReg* reg, float width, float height)
{
	if (width < 0.0f || height < 0.0f)
	{
		cemu_assert_suspicious();
	}
	// Clamp in the floating point domain before converting: converting a float whose truncated value
	// does not fit into uint32 (negative or too large) is undefined behavior and gives platform dependent results.
	// The negated comparison also maps NaN to 0.
	auto toRegisterUnits = [](float size) -> uint32
	{
		const float scaled = size * 8.0f;
		if (!(scaled > 0.0f))
			return 0;
		if (scaled >= 65535.0f)
			return 0xFFFF;
		return (uint32)scaled;
	};
	const uint32 widthI = toRegisterUnits(width);
	const uint32 heightI = toRegisterUnits(height);
	reg->reg = Latte::LATTE_PA_SU_POINT_SIZE().set_WIDTH(widthI).set_HEIGHT(heightI);
}
// Submits the point size register stored in `reg` as a single IT_SET_CONTEXT_REG packet
// (header, PA_SU_POINT_SIZE index relative to 0xA000, value).
void GX2SetPointSizeReg(GX2PointSizeReg* reg)
{
GX2ReserveCmdSpace(3);
gx2WriteGather_submit(
pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::PA_SU_POINT_SIZE - 0xA000,
reg->reg
);
}
void GX2SetPointSize(float width, float height)
{
GX2PointSizeReg tmpReg;
GX2InitPointSizeReg(&tmpReg, width, height);
GX2SetPointSizeReg(&tmpReg);
}
// Builds the PA_SU_POINT_MINMAX value in `reg` without submitting anything.
// minSize / maxSize are multiplied by 8, truncated to an integer and limited to 0xFFFF before being stored.
// Negative sizes are reported via cemu_assert_suspicious and stored as 0.
void GX2InitPointLimitsReg(GX2PointLimitsReg* reg, float minSize, float maxSize)
{
	if (minSize < 0.0f || maxSize < 0.0f)
	{
		cemu_assert_suspicious();
	}
	// Clamp in the floating point domain before converting: converting a float whose truncated value
	// does not fit into uint32 (negative or too large) is undefined behavior and gives platform dependent results.
	// The negated comparison also maps NaN to 0.
	auto toRegisterUnits = [](float size) -> uint32
	{
		const float scaled = size * 8.0f;
		if (!(scaled > 0.0f))
			return 0;
		if (scaled >= 65535.0f)
			return 0xFFFF;
		return (uint32)scaled;
	};
	const uint32 minSizeI = toRegisterUnits(minSize);
	const uint32 maxSizeI = toRegisterUnits(maxSize);
	reg->reg = Latte::LATTE_PA_SU_POINT_MINMAX().set_MIN_SIZE(minSizeI).set_MAX_SIZE(maxSizeI);
}
// Submits the point size limits register stored in `reg` as a single IT_SET_CONTEXT_REG packet
// (header, PA_SU_POINT_MINMAX index relative to 0xA000, value).
void GX2SetPointLimitsReg(GX2PointLimitsReg* reg)
{
GX2ReserveCmdSpace(3);
gx2WriteGather_submit(
pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::PA_SU_POINT_MINMAX - 0xA000,
reg->reg
);
}
void GX2SetPointLimits(float minSize, float maxSize)
{
GX2PointLimitsReg tmpReg;
GX2InitPointLimitsReg(&tmpReg, minSize, maxSize);
GX2SetPointLimitsReg(&tmpReg);
}
// State ids accepted by GX2SetSpecialState().
// Ids other than the ones listed here are forwarded unchanged via IT_HLE_SPECIAL_STATE.
enum class GX2_SPECIAL_STATE : uint32
{
FAST_CLEAR = 0, // handled by _setSpecialState0()
FAST_CLEAR_HIZ = 1, // currently not implemented (no commands are emitted)
};
void _setSpecialState0(bool isEnabled)
{
GX2ReserveCmdSpace(6);
if (isEnabled)
{
// set PA_CL_VTE_CNTL to 0x300
Latte::LATTE_PA_CL_VTE_CNTL regVTE{};
regVTE.set_VTX_XY_FMT(true);
regVTE.set_VTX_Z_FMT(true);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::PA_CL_VTE_CNTL - 0xA000,
regVTE);
// set PA_CL_CLIP_CNTL to 0x490000
Latte::LATTE_PA_CL_CLIP_CNTL regClip{};
regClip.set_CLIP_DISABLE(true); // 0x10000
regClip.set_DX_CLIP_SPACE_DEF(true); // 0x80000
regClip.set_DX_RASTERIZATION_KILL(true); // 0x400000
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::PA_CL_CLIP_CNTL - 0xA000,
regClip);
}
else
{
// set PA_CL_VTE_CNTL to 0x43F
Latte::LATTE_PA_CL_VTE_CNTL reg{};
reg.set_VPORT_X_OFFSET_ENA(true).set_VPORT_X_SCALE_ENA(true);
reg.set_VPORT_Y_OFFSET_ENA(true).set_VPORT_Y_SCALE_ENA(true);
reg.set_VPORT_Z_OFFSET_ENA(true).set_VPORT_Z_SCALE_ENA(true);
reg.set_VTX_W0_FMT(true);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1),
Latte::REGADDR::PA_CL_VTE_CNTL - 0xA000,
reg);
// reset PA_CL_CLIP_CNTL
GX2SetRasterizerClipControl(true, true);
}
}
// Enables (isEnabled != 0) or disables a special render state.
// FAST_CLEAR is translated into register writes, FAST_CLEAR_HIZ is currently ignored and
// every other id is passed through as an IT_HLE_SPECIAL_STATE packet.
void GX2SetSpecialState(GX2_SPECIAL_STATE stateId, uint32 isEnabled)
{
	switch (stateId)
	{
	case GX2_SPECIAL_STATE::FAST_CLEAR:
		_setSpecialState0(isEnabled != 0);
		break;
	case GX2_SPECIAL_STATE::FAST_CLEAR_HIZ:
		// todo
		// enables additional flags for special state 0
		break;
	default:
		// legacy style: header, state id, enable/disable flag
		gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_SPECIAL_STATE, 2));
		gx2WriteGather_submitU32AsBE((uint32)stateId);
		gx2WriteGather_submitU32AsBE(isEnabled);
		break;
	}
}
// Registers the render state functions of this file as exports of the "gx2" library,
// all tagged with LogType::GX2.
void GX2StateInit()
{
// alpha test
cafeExportRegister("gx2", GX2InitAlphaTestReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetAlphaTestReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetAlphaTest, LogType::GX2);
// color control
cafeExportRegister("gx2", GX2InitColorControlReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetColorControl, LogType::GX2);
cafeExportRegister("gx2", GX2SetColorControlReg, LogType::GX2);
// polygon control
cafeExportRegister("gx2", GX2InitPolygonControlReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetPolygonControlReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetPolygonControl, LogType::GX2);
cafeExportRegister("gx2", GX2SetCullOnlyControl, LogType::GX2);
// polygon offset
cafeExportRegister("gx2", GX2InitPolygonOffsetReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetPolygonOffsetReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetPolygonOffset, LogType::GX2);
// rasterizer / clip control
cafeExportRegister("gx2", GX2SetRasterizerClipControl, LogType::GX2);
cafeExportRegister("gx2", GX2SetRasterizerClipControlHalfZ, LogType::GX2);
cafeExportRegister("gx2", GX2SetRasterizerClipControlEx, LogType::GX2);
// viewport
cafeExportRegister("gx2", GX2InitViewportReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetViewportReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetViewport, LogType::GX2);
// scissor
cafeExportRegister("gx2", GX2InitScissorReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetScissorReg, LogType::GX2);
cafeExportRegister("gx2", GX2GetScissorReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetScissor, LogType::GX2);
// depth / stencil
cafeExportRegister("gx2", GX2InitDepthStencilControlReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetDepthStencilControlReg, LogType::GX2);
cafeExportRegister("gx2", GX2GetDepthStencilControlReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetDepthOnlyControl, LogType::GX2);
cafeExportRegister("gx2", GX2SetDepthStencilControl, LogType::GX2);
cafeExportRegister("gx2", GX2InitStencilMaskReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetStencilMask, LogType::GX2);
cafeExportRegister("gx2", GX2SetStencilMaskReg, LogType::GX2);
// primitive restart
cafeExportRegister("gx2", GX2SetPrimitiveRestartIndex, LogType::GX2);
// target channel masks
cafeExportRegister("gx2", GX2InitTargetChannelMasksReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetTargetChannelMasksReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetTargetChannelMasks, LogType::GX2);
cafeExportRegister("gx2", GX2GetTargetChannelMasksReg, LogType::GX2);
// blending
cafeExportRegister("gx2", GX2InitBlendControlReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetBlendControlReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetBlendControl, LogType::GX2);
cafeExportRegister("gx2", GX2InitBlendConstantColorReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetBlendConstantColorReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetBlendConstantColor, LogType::GX2);
// hi-stencil (stub)
cafeExportRegister("gx2", GX2InitHiStencilInfoRegs, LogType::GX2);
// point size and limits
cafeExportRegister("gx2", GX2InitPointSizeReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetPointSizeReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetPointSize, LogType::GX2);
cafeExportRegister("gx2", GX2InitPointLimitsReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetPointLimitsReg, LogType::GX2);
cafeExportRegister("gx2", GX2SetPointLimits, LogType::GX2);
// special states
cafeExportRegister("gx2", GX2SetSpecialState, LogType::GX2);
}
}

View file

@ -0,0 +1,192 @@
#pragma once
#include "Cafe/HW/Latte/ISA/LatteReg.h"
// Render state register structures and the functions operating on them.
// Each GX2*Reg struct bundles one or more Latte register values wrapped in betype<>;
// the expected struct sizes are pinned with static_assert.
namespace GX2
{
// alpha test control and reference value
struct GX2AlphaTestReg
{
betype<Latte::LATTE_SX_ALPHA_TEST_CONTROL> regAlphaTestControl;
betype<Latte::LATTE_SX_ALPHA_REF> regAlphaTestRef;
};
static_assert(sizeof(GX2AlphaTestReg) == 8);
// CB_COLOR_CONTROL
struct GX2ColorControlReg
{
betype<Latte::LATTE_CB_COLOR_CONTROL> reg;
};
static_assert(sizeof(GX2ColorControlReg) == 4);
// PA_SU_SC_MODE_CNTL
struct GX2PolygonControlReg
{
betype<Latte::LATTE_PA_SU_SC_MODE_CNTL> reg;
};
static_assert(sizeof(GX2PolygonControlReg) == 4);
// polygon offset: front/back scale and offset plus clamp
struct GX2PolygonOffsetReg
{
betype<Latte::LATTE_PA_SU_POLY_OFFSET_FRONT_SCALE> regFrontScale;
betype<Latte::LATTE_PA_SU_POLY_OFFSET_FRONT_OFFSET> regFrontOffset;
betype<Latte::LATTE_PA_SU_POLY_OFFSET_BACK_SCALE> regBackScale;
betype<Latte::LATTE_PA_SU_POLY_OFFSET_BACK_OFFSET> regBackOffset;
betype<Latte::LATTE_PA_SU_POLY_OFFSET_CLAMP> regClamp;
};
static_assert(sizeof(GX2PolygonOffsetReg) == 0x14);
// DB_DEPTH_CONTROL
struct GX2DepthStencilControlReg
{
betype<Latte::LATTE_DB_DEPTH_CONTROL> reg;
};
static_assert(sizeof(GX2DepthStencilControlReg) == 4);
// stencil reference/mask registers for front and back faces
struct GX2StencilMaskReg
{
betype<Latte::LATTE_DB_STENCILREFMASK> stencilRefMaskFrontReg;
betype<Latte::LATTE_DB_STENCILREFMASK_BF> stencilRefMaskBackReg;
};
static_assert(sizeof(GX2StencilMaskReg) == 8);
// CB_TARGET_MASK
struct GX2TargetChannelMaskReg
{
betype<Latte::LATTE_CB_TARGET_MASK> reg;
};
static_assert(sizeof(GX2TargetChannelMaskReg) == 4);
// per-state hi-stencil data; most fields are not yet understood (ukn*)
struct GX2HIStencilInfoData
{
/* +0x00 */ uint32be ukn00;
/* +0x04 */ uint8be ukn04;
/* +0x05 */ uint8be ukn05;
/* +0x06 */ uint8be ukn06; // probably padding?
/* +0x07 */ uint8be ukn07; // probably padding?
/* +0x08 */ uint32be isEnable; // 0 or 1
};
static_assert(sizeof(GX2HIStencilInfoData) == 0xC);
// two hi-stencil states plus their register values
struct GX2HiStencilInfoReg
{
GX2HIStencilInfoData state[2];
uint32be reg[2]; // DB_SRESULTS_COMPARE_STATE0 and DB_SRESULTS_COMPARE_STATE1
};
static_assert(sizeof(GX2HiStencilInfoReg) == 0x20);
// blend control value together with an index
struct GX2BlendControlReg
{
uint32be index;
betype<Latte::LATTE_CB_BLENDN_CONTROL> reg;
};
static_assert(sizeof(GX2BlendControlReg) == 8);
// blend constant color, one register per channel
struct GX2BlendConstantColorReg
{
betype<Latte::LATTE_CB_BLEND_RED> regRed;
betype<Latte::LATTE_CB_BLEND_GREEN> regGreen;
betype<Latte::LATTE_CB_BLEND_BLUE> regBlue;
betype<Latte::LATTE_CB_BLEND_ALPHA> regAlpha;
};
static_assert(sizeof(GX2BlendConstantColorReg) == 16);
// PA_SU_POINT_SIZE
struct GX2PointSizeReg
{
betype<Latte::LATTE_PA_SU_POINT_SIZE> reg;
};
static_assert(sizeof(GX2PointSizeReg) == 4);
// PA_SU_POINT_MINMAX
struct GX2PointLimitsReg
{
betype<Latte::LATTE_PA_SU_POINT_MINMAX> reg;
};
static_assert(sizeof(GX2PointLimitsReg) == 4);
// viewport scale/offset registers for x, y and z followed by six unidentified words
struct GX2ViewportReg
{
betype<Latte::LATTE_PA_CL_VPORT_XSCALE> xScale;
betype<Latte::LATTE_PA_CL_VPORT_XOFFSET> xOffset;
betype<Latte::LATTE_PA_CL_VPORT_YSCALE> yScale;
betype<Latte::LATTE_PA_CL_VPORT_YOFFSET> yOffset;
betype<Latte::LATTE_PA_CL_VPORT_ZSCALE> zScale;
betype<Latte::LATTE_PA_CL_VPORT_ZOFFSET> zOffset;
uint32 ukn[6]; // clipping registers?
};
static_assert(sizeof(GX2ViewportReg) == 48);
// generic scissor rectangle: top-left and bottom-right registers
struct GX2ScissorReg
{
betype<Latte::LATTE_PA_SC_GENERIC_SCISSOR_TL> scissorTL;
betype<Latte::LATTE_PA_SC_GENERIC_SCISSOR_BR> scissorBR;
};
static_assert(sizeof(GX2ScissorReg) == 8);
// GX2 parameter types expressed through the matching Latte register enums
using GX2_ALPHAFUNC = Latte::LATTE_SX_ALPHA_TEST_CONTROL::E_ALPHA_FUNC; // alias Latte::E_COMPAREFUNC
using GX2_LOGICOP = Latte::LATTE_CB_COLOR_CONTROL::E_LOGICOP;
using GX2_CHANNELMASK = uint32;
using GX2_BLENDFACTOR = Latte::LATTE_CB_BLENDN_CONTROL::E_BLENDFACTOR;
using GX2_BLENDFUNC = Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC;
// Naming pattern used below:
//   GX2Init*Reg(reg, ...) fills a register struct from parameters
//   GX2Set*Reg(reg)       takes a previously initialized register struct
//   GX2Set*(...)          takes the parameters directly
// alpha test
void GX2InitAlphaTestReg(GX2AlphaTestReg* reg, uint32 alphaTestEnable, GX2_ALPHAFUNC alphaFunc, float alphaRef);
void GX2SetAlphaTestReg(GX2AlphaTestReg* reg);
void GX2SetAlphaTest(uint32 alphaTestEnable, GX2_ALPHAFUNC alphaFunc, float alphaRef);
// color control
void GX2InitColorControlReg(GX2ColorControlReg* reg, GX2_LOGICOP logicOp, uint32 blendMask, uint32 multiwriteEnable, uint32 colorBufferEnable);
void GX2SetColorControl(GX2_LOGICOP logicOp, uint32 blendMask, uint32 multiwriteEnable, uint32 colorBufferEnable);
void GX2SetColorControlReg(GX2ColorControlReg* reg);
// polygon control
void GX2InitPolygonControlReg(GX2PolygonControlReg* reg,
Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE frontFace, uint32 cullFront, uint32 cullBack,
Latte::LATTE_PA_SU_SC_MODE_CNTL::E_POLYGONMODE usePolygonMode,
Latte::LATTE_PA_SU_SC_MODE_CNTL::E_PTYPE polyModeFront,
Latte::LATTE_PA_SU_SC_MODE_CNTL::E_PTYPE polyModeBack,
uint32 polygonOffsetFrontEnable, uint32 polygonOffsetBackEnable, uint32 paraOffsetEnable);
void GX2SetPolygonControlReg(GX2PolygonControlReg* reg);
void GX2SetPolygonControl(Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE frontFace, uint32 cullFront, uint32 cullBack,
Latte::LATTE_PA_SU_SC_MODE_CNTL::E_POLYGONMODE usePolygonMode,
Latte::LATTE_PA_SU_SC_MODE_CNTL::E_PTYPE polyModeFront,
Latte::LATTE_PA_SU_SC_MODE_CNTL::E_PTYPE polyModeBack,
uint32 polygonOffsetFrontEnable, uint32 polygonOffsetBackEnable, uint32 paraOffsetEnable);
void GX2SetCullOnlyControl(Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE frontFace, uint32 cullFront, uint32 cullBack);
// polygon offset
void GX2InitPolygonOffsetReg(GX2PolygonOffsetReg* reg, float frontOffset, float frontScale, float backOffset, float backScale, float clampOffset);
void GX2SetPolygonOffsetReg(GX2PolygonOffsetReg* reg);
void GX2SetPolygonOffset(float frontOffset, float frontScale, float backOffset, float backScale, float clampOffset);
// point size and limits
void GX2InitPointSizeReg(GX2PointSizeReg* reg, float width, float height);
void GX2SetPointSizeReg(GX2PointSizeReg* reg);
void GX2SetPointSize(float width, float height);
void GX2InitPointLimitsReg(GX2PointLimitsReg* reg, float minSize, float maxSize);
void GX2SetPointLimitsReg(GX2PointLimitsReg* reg);
void GX2SetPointLimits(float minSize, float maxSize);
// rasterizer / clip control
void GX2SetRasterizerClipControl(bool enableRasterizer, bool enableZClip);
void GX2SetRasterizerClipControlHalfZ(bool enableRasterizer, bool enableZClip, bool enableHalfZ);
void GX2SetRasterizerClipControlEx(bool enableRasterizer, bool enableZClip, bool enableHalfZ);
// primitive restart
void GX2SetPrimitiveRestartIndex(uint32 restartIndex);
// target channel masks (t0..t7: one mask per render target)
void GX2InitTargetChannelMasksReg(GX2TargetChannelMaskReg* reg, GX2_CHANNELMASK t0, GX2_CHANNELMASK t1, GX2_CHANNELMASK t2, GX2_CHANNELMASK t3, GX2_CHANNELMASK t4, GX2_CHANNELMASK t5, GX2_CHANNELMASK t6, GX2_CHANNELMASK t7);
void GX2SetTargetChannelMasksReg(GX2TargetChannelMaskReg* reg);
void GX2SetTargetChannelMasks(GX2_CHANNELMASK t0, GX2_CHANNELMASK t1, GX2_CHANNELMASK t2, GX2_CHANNELMASK t3, GX2_CHANNELMASK t4, GX2_CHANNELMASK t5, GX2_CHANNELMASK t6, GX2_CHANNELMASK t7);
// blending
void GX2InitBlendControlReg(GX2BlendControlReg* reg, uint32 renderTargetIndex, GX2_BLENDFACTOR colorSrcFactor, GX2_BLENDFACTOR colorDstFactor, GX2_BLENDFUNC colorCombineFunc, uint32 separateAlphaBlend, GX2_BLENDFACTOR alphaSrcFactor, GX2_BLENDFACTOR alphaDstFactor, GX2_BLENDFUNC alphaCombineFunc);
void GX2SetBlendControlReg(GX2BlendControlReg* reg);
void GX2SetBlendControl(uint32 renderTargetIndex, GX2_BLENDFACTOR colorSrcFactor, GX2_BLENDFACTOR colorDstFactor, GX2_BLENDFUNC colorCombineFunc, uint32 separateAlphaBlend, GX2_BLENDFACTOR alphaSrcFactor, GX2_BLENDFACTOR alphaDstFactor, GX2_BLENDFUNC alphaCombineFunc);
void GX2InitBlendConstantColorReg(GX2BlendConstantColorReg* reg, float red, float green, float blue, float alpha);
void GX2SetBlendConstantColorReg(GX2BlendConstantColorReg* reg);
void GX2SetBlendConstantColor(float red, float green, float blue, float alpha);
// registers the gx2 exports implemented for the state functions
void GX2StateInit();
}

View file

@ -0,0 +1,113 @@
#include "GX2_Streamout.h"
#include "GX2_Command.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "Cafe/OS/common/OSCommon.h"
namespace GX2
{
// Programs size and base address of stream out buffer bufferIndex.
// The buffer location comes from streamOutBuffer->dataPtr/size when dataPtr is set,
// otherwise from the embedded rBuffer. Out-of-range indices are reported and ignored.
void GX2SetStreamOutBuffer(uint32 bufferIndex, GX2StreamOutBuffer* streamOutBuffer)
{
	if (bufferIndex >= GX2_MAX_STREAMOUT_BUFFERS)
	{
		cemu_assert_suspicious();
		debug_printf("GX2SetStreamOutBuffer(): Set out-of-bounds buffer\n");
		return;
	}
	// pick the memory range that backs the buffer
	MPTR targetAddr;
	uint32 targetSize;
	const bool hasExplicitData = !streamOutBuffer->dataPtr.IsNull();
	if (hasExplicitData)
	{
		targetAddr = streamOutBuffer->dataPtr.GetMPTR();
		targetSize = streamOutBuffer->size;
	}
	else
	{
		targetAddr = streamOutBuffer->rBuffer.GetVirtualAddr();
		targetSize = streamOutBuffer->rBuffer.GetSize();
	}
	// the per-buffer registers are spaced 4 register indices apart
	const uint32 registerStride = bufferIndex * 4;
	// two SET_CONTEXT_REG packets of 3 words each
	GX2ReserveCmdSpace(3 + 3);
	// buffer size, written as size >> 2
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1));
	gx2WriteGather_submitU32AsBE((mmVGT_STRMOUT_BUFFER_SIZE_0 + registerStride) - 0xA000);
	gx2WriteGather_submitU32AsBE(targetSize >> 2);
	// buffer base, written as physical address >> 8
	const uint32 physicalBase = memory_virtualToPhysical(targetAddr);
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1));
	gx2WriteGather_submitU32AsBE((mmVGT_STRMOUT_BUFFER_BASE_0 + registerStride) - 0xA000);
	gx2WriteGather_submitU32AsBE(physicalBase >> 8);
	// todo: Research and send IT_STRMOUT_BASE_UPDATE (0x72)
	// note: Other stream out registers maybe set in GX2SetVertexShader() or GX2SetGeometryShader()
}
// Writes the lowest bit of enable to VGT_STRMOUT_EN.
// Values other than 0 and 1 trigger a debug assert.
void GX2SetStreamOutEnable(uint32 enable)
{
	cemu_assert_debug(enable == 0 || enable == 1);
	const uint32 enableBit = enable & 1;
	// header + register offset + 1 register value
	GX2ReserveCmdSpace(3);
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 1));
	gx2WriteGather_submitU32AsBE(mmVGT_STRMOUT_EN - 0xA000);
	gx2WriteGather_submitU32AsBE(enableBit);
}
// Emits an IT_STRMOUT_BUFFER_UPDATE packet for buffer bufferIndex that references the
// stream out context (ctxPtr) of streamOutBuffer.
// Only mode 0 and mode 1 are implemented; they differ in the value placed at bit 1 of
// the control word (2 for mode 0, 0 for mode 1). Out-of-range indices are reported and ignored.
void GX2SetStreamOutContext(uint32 bufferIndex, GX2StreamOutBuffer* streamOutBuffer, uint32 mode)
{
	if (bufferIndex >= GX2_MAX_STREAMOUT_BUFFERS)
	{
		cemu_assert_suspicious();
		debug_printf("GX2SetStreamOutContext(): Set out-of-bounds buffer\n");
		return;
	}
	GX2ReserveCmdSpace(6);
	if (mode > 1)
	{
		cemu_assert_unimplemented();
		return;
	}
	const uint32 updateSelect = (mode == 0) ? 2 : 0;
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_STRMOUT_BUFFER_UPDATE, 5));
	gx2WriteGather_submitU32AsBE((updateSelect << 1) | (bufferIndex << 8));
	gx2WriteGather_submitU32AsBE(MPTR_NULL);
	gx2WriteGather_submitU32AsBE(0);
	gx2WriteGather_submitU32AsBE(memory_virtualToPhysical(streamOutBuffer->ctxPtr.GetMPTR()));
	gx2WriteGather_submitU32AsBE(0);
}
// Emits an IT_STRMOUT_BUFFER_UPDATE packet for buffer bufferIndex whose first address
// word is the physical address of the stream out context (ctxPtr) of streamOutBuffer.
// Out-of-range indices are reported and ignored.
void GX2SaveStreamOutContext(uint32 bufferIndex, GX2StreamOutBuffer* streamOutBuffer)
{
	if (bufferIndex >= GX2_MAX_STREAMOUT_BUFFERS)
	{
		cemu_assert_suspicious();
		debug_printf("GX2SaveStreamOutContext(): Set out-of-bounds buffer\n");
		return;
	}
	// control word: bit 0 set, value 3 at bit 1, buffer index at bit 8
	const uint32 controlWord = 1 | (3 << 1) | (bufferIndex << 8);
	GX2ReserveCmdSpace(6);
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_STRMOUT_BUFFER_UPDATE, 5));
	gx2WriteGather_submitU32AsBE(controlWord);
	gx2WriteGather_submitU32AsBE(memory_virtualToPhysical(streamOutBuffer->ctxPtr.GetMPTR()));
	gx2WriteGather_submitU32AsBE(0);
	gx2WriteGather_submitU32AsBE(MPTR_NULL);
	gx2WriteGather_submitU32AsBE(0);
}
// Registers the stream out functions of this file as exports of the "gx2" library,
// all tagged with LogType::GX2.
void GX2StreamoutInit()
{
cafeExportRegister("gx2", GX2SetStreamOutBuffer, LogType::GX2);
cafeExportRegister("gx2", GX2SetStreamOutEnable, LogType::GX2);
cafeExportRegister("gx2", GX2SetStreamOutContext, LogType::GX2);
cafeExportRegister("gx2", GX2SaveStreamOutContext, LogType::GX2);
}
}

View file

@ -0,0 +1,21 @@
#pragma once
#include "GX2_Resource.h"
#define GX2_MAX_STREAMOUT_BUFFERS 4
// Stream out buffer description and the stream out entry points used by other modules.
namespace GX2
{
// Describes one stream out buffer. The size of the struct is pinned to 0x20 bytes.
struct GX2StreamOutBuffer
{
/* +0x00 */ uint32be size; // size of buffer (if dataPtr is not NULL)
/* +0x04 */ MEMPTR<void> dataPtr;
/* +0x08 */ uint32be vertexStride;
/* +0x0C */ GX2RBuffer rBuffer; // if dataPtr is NULL, use this as the buffer and size
/* +0x1C */ MEMPTR<void> ctxPtr; // stream out context
};
static_assert(sizeof(GX2StreamOutBuffer) == 0x20, "GX2StreamOutBuffer_t has invalid size");
// programs size and base address of stream out buffer bufferIndex (must be below GX2_MAX_STREAMOUT_BUFFERS)
void GX2SetStreamOutBuffer(uint32 bufferIndex, GX2StreamOutBuffer* streamOutBuffer);
// registers the gx2 stream out exports
void GX2StreamoutInit();
}

View file

@ -0,0 +1,278 @@
#include "Cafe/OS/common/OSCommon.h"
#include "GX2.h"
#include "GX2_Surface.h"
#include "GX2_Resource.h"
#include "Cafe/HW/Latte/Core/Latte.h"
#include "Cafe/HW/Latte/Core/LatteDraw.h"
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "Cafe/HW/Latte/LatteAddrLib/LatteAddrLib.h"
namespace GX2
{
// Returns the pitch that GX2CalculateSurfaceInfo() reports for the given mip level of surface.
uint32 GX2GetSurfaceMipPitch(GX2Surface* surface, uint32 level)
{
	LatteAddrLib::AddrSurfaceInfo_OUT levelInfo;
	GX2::GX2CalculateSurfaceInfo(surface, level, &levelInfo);
	return levelInfo.pitch;
}
// Returns the bit count of surfaceFormat as reported by Latte::GetFormatBits().
// For compressed formats the value is divided by 16 (4 * 4).
uint32 GX2GetSurfaceFormatBits(Latte::E_GX2SURFFMT surfaceFormat)
{
	const uint32 formatBits = Latte::GetFormatBits(surfaceFormat);
	if (!Latte::IsCompressedFormat(surfaceFormat))
		return formatBits;
	// the division below is expected to be exact
	cemu_assert_debug((formatBits & 0xF) == 0);
	return formatBits / (4 * 4);
}
// Computes the size of one slice from surfaceInfo: (bpp / 8) * height * pitch,
// multiplied by 1 << surface->aa.
uint32 _GX2CalculateSliceSize(GX2Surface* surface, const LatteAddrLib::AddrSurfaceInfo_OUT* surfaceInfo)
{
	const uint32 sampleFactor = 1 << surface->aa;
	const auto bytesPerElement = surfaceInfo->bpp >> 3;
	return sampleFactor * bytesPerElement * surfaceInfo->height * surfaceInfo->pitch;
}
// Returns the slice size of the given mip level of surface, based on the layout
// reported by GX2CalculateSurfaceInfo().
uint32 GX2GetSurfaceMipSliceSize(GX2Surface* surface, uint32 level)
{
	LatteAddrLib::AddrSurfaceInfo_OUT levelInfo;
	GX2::GX2CalculateSurfaceInfo(surface, level, &levelInfo);
	const uint32 sliceSize = _GX2CalculateSliceSize(surface, &levelInfo);
	return sliceSize;
}
// Returns the low 16 bits of surface->swizzle when the surface is macro tiled and level is
// below the level stored in bits 16-23 of the swizzle value. Returns 0 otherwise.
uint32 GX2GetSurfaceSwizzleOffset(GX2Surface* surface, uint32 level)
{
	const uint32 swizzleValue = surface->swizzle;
	if (!Latte::TM_IsMacroTiled(surface->tileMode))
		return 0;
	const uint32 levelLimit = (swizzleValue >> 16) & 0xFF;
	if (level >= levelLimit)
		return 0;
	return swizzleValue & 0xFFFF;
}
// Returns bits 8-15 of surface->swizzle.
uint32 GX2GetSurfaceSwizzle(GX2Surface* surface)
{
	const uint32 swizzleValue = surface->swizzle;
	return (swizzleValue >> 8) & 0xFF;
}
// Returns GX2_TRUE if Latte::IsCompressedFormat() reports surfaceFormat as compressed,
// GX2_FALSE otherwise.
uint32 GX2SurfaceIsCompressed(Latte::E_GX2SURFFMT surfaceFormat)
{
	if (Latte::IsCompressedFormat(surfaceFormat))
		return GX2_TRUE;
	return GX2_FALSE;
}
// Placeholder implementation: reports a fixed size (0x1000) and alignment (0x100)
// regardless of depthBuffer.
// todo: implement
void GX2CalcDepthBufferHiZInfo(GX2DepthBuffer* depthBuffer, uint32be* sizeOut, uint32be* alignOut)
{
	constexpr uint32 placeholderSize = 0x1000;
	constexpr uint32 placeholderAlign = 0x100;
	*sizeOut = placeholderSize;
	*alignOut = placeholderAlign;
}
// Placeholder implementation: reports a fixed size (0x1000) and alignment (0x100)
// regardless of colorBuffer.
// todo: implement
void GX2CalcColorBufferAuxInfo(GX2ColorBuffer* colorBuffer, uint32be* sizeOut, uint32be* alignOut)
{
	constexpr uint32 placeholderSize = 0x1000;
	constexpr uint32 placeholderAlign = 0x100;
	*sizeOut = placeholderSize;
	*alignOut = placeholderAlign;
}
// Forwards the properties of surfacePtr to LatteAddrLib::GX2CalculateSurfaceInfo() for the
// requested mip level and stores the result in pSurfOut. The depth buffer and scan buffer
// usage bits of resFlag are passed along as optimization hints.
void GX2CalculateSurfaceInfo(GX2Surface* surfacePtr, uint32 level, LatteAddrLib::AddrSurfaceInfo_OUT* pSurfOut)
{
	const bool isDepthBuffer = (surfacePtr->resFlag & GX2_RESFLAG_USAGE_DEPTH_BUFFER) != 0;
	const bool isScanBuffer = (surfacePtr->resFlag & GX2_RESFLAG_USAGE_SCAN_BUFFER) != 0;
	LatteAddrLib::GX2CalculateSurfaceInfo(
		surfacePtr->format,
		surfacePtr->width, surfacePtr->height, surfacePtr->depth,
		surfacePtr->dim, surfacePtr->tileMode, surfacePtr->aa,
		level, pSurfOut,
		isDepthBuffer, isScanBuffer);
}
// Returns the number of bits needed to represent resolution, i.e. the position of the
// highest set bit plus one. Returns 0 for resolution == 0.
uint32 _CalculateLevels(uint32 resolution)
{
	uint32 levelCount = 32;
	// walk down from the top bit until the first set bit is found
	for (uint32 probeBit = 0x80000000; probeBit != 0 && (resolution & probeBit) == 0; probeBit >>= 1)
		levelCount--;
	return levelCount;
}
// Returns the level count derived from the surface dimensions: the largest _CalculateLevels()
// result of width and height (and depth for 3D surfaces).
// Surfaces with numLevels <= 1 always yield 1.
uint32 _GX2AdjustLevelCount(GX2Surface* surfacePtr)
{
	if (surfacePtr->numLevels <= 1)
		return 1;
	uint32 levelCount = _CalculateLevels(surfacePtr->width);
	levelCount = std::max(levelCount, _CalculateLevels(surfacePtr->height));
	if (surfacePtr->dim == Latte::E_DIM::DIM_3D)
		levelCount = std::max(levelCount, _CalculateLevels(surfacePtr->depth));
	return levelCount;
}
// Computes the derived layout fields of surface in place.
// Written fields: tileMode (only when TM_LINEAR_GENERAL or TM_32_SPECIAL was requested),
// numLevels, bits 16-23 of swizzle, imageSize, alignment, pitch, mipSize and mipOffset[].
// The layout of every level is obtained from GX2CalculateSurfaceInfo().
// NOTE(review): mipOffset has 13 entries and numLevels is only limited by the surface
// dimensions; presumably dimensions never get large enough to exceed it - confirm.
void GX2CalcSurfaceSizeAndAlignment(GX2Surface* surface)
{
LatteAddrLib::AddrSurfaceInfo_OUT surfOut = { 0 };
uint32 firstMipOffset = 0;
bool changeTilemode = false;
Latte::E_GX2TILEMODE lastTilemode = surface->tileMode;
bool hasTileMode32 = surface->tileMode == Latte::E_GX2TILEMODE::TM_32_SPECIAL;
// TM_LINEAR_GENERAL and TM_32_SPECIAL are replaced by a concrete tile mode chosen from dim, usage flags and aa
if (surface->tileMode == Latte::E_GX2TILEMODE::TM_LINEAR_GENERAL || hasTileMode32)
{
if (surface->dim != Latte::E_DIM::DIM_1D || (surface->resFlag & GX2_RESFLAG_USAGE_DEPTH_BUFFER) != 0 || surface->aa)
{
if (surface->dim != Latte::E_DIM::DIM_3D || (surface->resFlag & GX2_RESFLAG_USAGE_COLOR_BUFFER) != 0)
surface->tileMode = Latte::E_GX2TILEMODE::TM_2D_TILED_THIN1;
else
surface->tileMode = Latte::E_GX2TILEMODE::TM_2D_TILED_THICK;
// the tile mode picked here may still be refined while level 0 is processed below
changeTilemode = true;
}
else
{
surface->tileMode = Latte::E_GX2TILEMODE::TM_LINEAR_ALIGNED;
}
lastTilemode = surface->tileMode;
}
// at least one level, at most what the surface dimensions allow
if (surface->numLevels == 0)
surface->numLevels = 1;
surface->numLevels = std::min<uint32>(surface->numLevels, _GX2AdjustLevelCount(surface));
surface->mipOffset[0] = 0;
// bits 16-23 of swizzle hold a mip level index: 0xD for macro tiled surfaces, 0 otherwise.
// The value is replaced in the loop below when a level stops being macro tiled
if (Latte::TM_IsMacroTiled(surface->tileMode))
surface->swizzle = (surface->swizzle & 0xFF00FFFF) | 0xD0000;
else
surface->swizzle = surface->swizzle & 0xFF00FFFF;
// FIX 32
// fix32Mode is used as a shift on the pitch/height alignment in the small-surface check below
uint32 fix32Mode;
if (hasTileMode32)
{
if (Latte::IsCompressedFormat(surface->format))
fix32Mode = 2;
else
fix32Mode = 0;
}
else
{
fix32Mode = 0;
}
// setup levels
// prevSize always holds the surface size of the previously processed level
uint32 prevSize = 0;
for (uint32 level = 0; level < surface->numLevels; ++level)
{
GX2CalculateSurfaceInfo(surface, level, &surfOut);
if (level)
{
// mip levels: determine the offset of this level behind the previous one
uint32 pad = 0;
if (Latte::TM_IsMacroTiled(lastTilemode) && !Latte::TM_IsMacroTiled(surfOut.hwTileMode))
{
// first level that is no longer macro tiled: remember its index in swizzle bits 16-23
surface->swizzle = (surface->swizzle & 0xFF00FFFF) | (level << 16);
lastTilemode = (Latte::E_GX2TILEMODE)surfOut.hwTileMode;
if (level > 1)
pad = surface->swizzle & 0xFFFF;
}
// round the end of the previous level up to the alignment of this level
pad += (surfOut.baseAlign - prevSize % surfOut.baseAlign) % surfOut.baseAlign;
if (level == 1)
{
// stored into mipOffset[0] only after the loop, which keeps mipOffset[0] at 0 for the sums below
firstMipOffset = pad + prevSize;
}
else if (level > 1)
{
surface->mipOffset[level - 1] = pad + prevSize + surface->mipOffset[level - 2];
}
}
else
{
// level 0: finalize the tile mode and take over the base image properties
if (changeTilemode)
{
// adopt the tile mode reported for level 0 if it differs from the requested one
if (surface->tileMode != (Latte::E_GX2TILEMODE)surfOut.hwTileMode)
{
surface->tileMode = (Latte::E_GX2TILEMODE)surfOut.hwTileMode;
GX2CalculateSurfaceInfo(surface, 0, &surfOut);
if (!Latte::TM_IsMacroTiled(surface->tileMode))
surface->swizzle = surface->swizzle & 0xFF00FFFF;
lastTilemode = surface->tileMode;
}
// surfaces smaller than the (shifted) pitch and height alignment fall back to 1D tiling
if (surface->width < (surfOut.pitchAlign << fix32Mode)
&& surface->height < (surfOut.heightAlign << fix32Mode))
{
if (surface->tileMode == Latte::E_GX2TILEMODE::TM_2D_TILED_THICK)
surface->tileMode = Latte::E_GX2TILEMODE::TM_1D_TILED_THICK;
else
surface->tileMode = Latte::E_GX2TILEMODE::TM_1D_TILED_THIN1;
GX2CalculateSurfaceInfo(surface, 0, &surfOut);
surface->swizzle = surface->swizzle & 0xFF00FFFF;
lastTilemode = surface->tileMode;
}
}
surface->imageSize = (uint32)(surfOut.surfSize);
surface->alignment = surfOut.baseAlign;
surface->pitch = surfOut.pitch;
}
prevSize = (uint32)(surfOut.surfSize);
}
// mipSize: offset of the last level plus its size
if (surface->numLevels > 1)
surface->mipSize = prevSize + surface->mipOffset[surface->numLevels - 2];
else
surface->mipSize = 0;
surface->mipOffset[0] = firstMipOffset;
// NV12: imageSize grows by half of the original image size, placed after the image padded to
// its alignment; mipOffset[0] records where that extra region starts
if (surface->format == Latte::E_GX2SURFFMT::NV12_UNORM)
{
uint32 padding = (surface->alignment - surface->imageSize % surface->alignment) % surface->alignment;
surface->mipOffset[0] = padding + surface->imageSize;
surface->imageSize = surface->mipOffset[0] + ((uint32)surface->imageSize >> 1);
}
}
// Returns the endian swap mode for fmt. Every format uses SWAP_NONE (swap mode 0),
// so fmt is not inspected.
Latte::E_ENDIAN_SWAP GetSurfaceFormatSwapMode(Latte::E_GX2SURFFMT fmt)
{
	constexpr Latte::E_ENDIAN_SWAP swapModeForAllFormats = Latte::E_ENDIAN_SWAP::SWAP_NONE;
	return swapModeForAllFormats;
}
// Looks up the color buffer export format (0 or 1) for fmt.
// Only the low 6 bits of fmt are used as index into the table.
uint32 GetSurfaceColorBufferExportFormat(Latte::E_GX2SURFFMT fmt)
{
	static constexpr uint8 exportFormatByHwFormat[0x40] = {
		0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x01, 0x01,
		0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
		0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
		0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
		0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00,
		0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
		0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
	const uint32 hwFormatIndex = (uint32)fmt & 0x3F;
	return exportFormatByHwFormat[hwFormatIndex];
}
// Stub: logs that it was called and always returns 1, without looking at
// resFlags or surfaceFormat.
uint32 GX2CheckSurfaceUseVsFormat(uint32 resFlags, uint32 surfaceFormat)
{
	constexpr uint32 stubResult = 1;
	cemuLog_logDebug(LogType::Force, "GX2CheckSurfaceUseVsFormat - stub");
	return stubResult;
}
// Clears bits 8-15 of surface->swizzle and ORs in newSwizzle shifted left by 8.
// newSwizzle is deliberately not masked: some games pass values above 0xFF, which
// then also reach the bits above bit 15.
void GX2SetSurfaceSwizzle(GX2Surface* surface, uint32 newSwizzle)
{
	const uint32 previousSwizzle = surface->swizzle;
	const uint32 keptBits = previousSwizzle & ~0xFF00;
	surface->swizzle = keptBits | (newSwizzle << 8);
}
// Registers the surface functions of this file as exports of the "gx2" library,
// all tagged with LogType::GX2.
void GX2SurfaceInit()
{
// surface queries
cafeExportRegister("gx2", GX2GetSurfaceMipPitch, LogType::GX2);
cafeExportRegister("gx2", GX2GetSurfaceFormatBits, LogType::GX2);
cafeExportRegister("gx2", GX2GetSurfaceMipSliceSize, LogType::GX2);
cafeExportRegister("gx2", GX2GetSurfaceSwizzleOffset, LogType::GX2);
cafeExportRegister("gx2", GX2GetSurfaceSwizzle, LogType::GX2);
cafeExportRegister("gx2", GX2SurfaceIsCompressed, LogType::GX2);
// size / layout calculation
cafeExportRegister("gx2", GX2CalcDepthBufferHiZInfo, LogType::GX2);
cafeExportRegister("gx2", GX2CalcColorBufferAuxInfo, LogType::GX2);
cafeExportRegister("gx2", GX2CalcSurfaceSizeAndAlignment, LogType::GX2);
cafeExportRegister("gx2", GX2CheckSurfaceUseVsFormat, LogType::GX2);
// surface modification
cafeExportRegister("gx2", GX2SetSurfaceSwizzle, LogType::GX2);
}
};

View file

@ -0,0 +1,82 @@
#pragma once
#include "Cafe/HW/Latte/ISA/LatteReg.h"
#include "Cafe/HW/Latte/LatteAddrLib/LatteAddrLib.h"
// todo - move into GX2 namespace
// Surface description (0x74 bytes, pinned with static_assert below).
// imageSize, alignment, pitch, mipSize and mipOffset are derived fields which
// GX2::GX2CalcSurfaceSizeAndAlignment() fills in.
struct GX2Surface
{
/* +0x000 */ betype<Latte::E_DIM> dim;
/* +0x004 */ uint32be width;
/* +0x008 */ uint32be height;
/* +0x00C */ uint32be depth;
/* +0x010 */ uint32be numLevels; // number of mipmap levels including base image. Should be at least 1
/* +0x014 */ betype<Latte::E_GX2SURFFMT> format;
/* +0x018 */ uint32be aa; // anti-aliasing mode
/* +0x01C */ uint32be resFlag; // GX2_RESFLAG_* and GX2R_RESFLAG_*
/* +0x020 */ uint32be imageSize;
/* +0x024 */ uint32be imagePtr;
/* +0x028 */ uint32be mipSize;
/* +0x02C */ uint32be mipPtr;
/* +0x030 */ betype<Latte::E_GX2TILEMODE> tileMode;
/* +0x034 */ uint32be swizzle;
/* +0x038 */ uint32be alignment;
/* +0x03C */ uint32be pitch;
/* +0x040 */ uint32be mipOffset[13];
}; // size: 0x74
// the enum wrappers must occupy exactly one 32-bit word for the offsets above to hold
static_assert(sizeof(betype<Latte::E_DIM>) == 4);
static_assert(sizeof(betype<Latte::E_GX2TILEMODE>) == 4);
static_assert(sizeof(GX2Surface) == 0x74);
// color and depth buffer
// Color buffer: a surface followed by view parameters, aux data location and
// register values (0x9C bytes, pinned with static_assert below).
struct GX2ColorBuffer
{
/* +0x00 */ GX2Surface surface;
/* +0x74 */ uint32 viewMip;
/* +0x78 */ uint32 viewFirstSlice;
/* +0x7C */ uint32 viewNumSlices;
/* +0x80 */ MPTR auxData;
/* +0x84 */ uint32 auxSize;
/* +0x88 */ uint32be reg_size; // CB_COLOR*_SIZE
/* +0x8C */ uint32be reg_info; // CB_COLOR*_INFO
/* +0x90 */ uint32be reg_view; // CB_COLOR*_VIEW
/* +0x94 */ uint32be reg_mask; // CB_COLOR*_MASK
/* +0x98 */ uint32be reg4; // ?
};
static_assert(sizeof(GX2ColorBuffer) == 0x9C);
// Depth buffer: a surface followed by view parameters, HiZ data location, clear values
// and register values (0xAC bytes, pinned with static_assert below).
struct GX2DepthBuffer
{
/* +0x00 */ GX2Surface surface;
/* +0x74 */ uint32 viewMip;
/* +0x78 */ uint32 viewFirstSlice;
/* +0x7C */ uint32 viewNumSlices;
/* +0x80 */ MPTR hiZPtr;
/* +0x84 */ uint32 hiZSize;
/* +0x88 */ float clearDepth;
/* +0x8C */ uint32 clearStencil;
/* +0x90 */ uint32be reg_size;
/* +0x94 */ uint32be reg_view;
/* +0x98 */ uint32be reg_base;
/* +0x9C */ uint32be reg_htile_surface;
/* +0xA0 */ uint32be reg_prefetch_limit;
/* +0xA4 */ uint32be reg_preload_control;
/* +0xA8 */ uint32be reg_poly_offset_db_fmt_cntl;
};
static_assert(sizeof(GX2DepthBuffer) == 0xAC);
// Surface helpers that are shared with other GX2 modules.
namespace GX2
{
// queries the layout of mip level `level` of the surface via LatteAddrLib
void GX2CalculateSurfaceInfo(GX2Surface* surfacePtr, uint32 level, LatteAddrLib::AddrSurfaceInfo_OUT* pSurfOut);
// endian swap mode for a format
Latte::E_ENDIAN_SWAP GetSurfaceFormatSwapMode(Latte::E_GX2SURFFMT fmt);
// color buffer export format for a format
uint32 GetSurfaceColorBufferExportFormat(Latte::E_GX2SURFFMT fmt);
// fills in the derived size/alignment/mip fields of the surface
void GX2CalcSurfaceSizeAndAlignment(GX2Surface* surface);
// registers the gx2 surface exports
void GX2SurfaceInit();
};

View file

@ -0,0 +1,743 @@
#include "Cafe/OS/common/OSCommon.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "Cafe/HW/Latte/Core/Latte.h"
#include "Cafe/HW/Latte/Core/LatteDraw.h"
#include "Cafe/HW/Latte/Core/LatteAsyncCommands.h"
#include "Cafe/HW/Latte/LatteAddrLib/LatteAddrLib.h"
#include "util/highresolutiontimer/HighResolutionTimer.h"
#include "GX2.h"
#include "GX2_Resource.h"
// Generic software copy of a copyWidth x copyHeight region between two surfaces that may
// use different tile modes. The region is processed one element at a time; copyBpp is the
// element size in bits. Data is only copied for element sizes of 8, 16, 32, 64 and 128 bits.
// The element address is computed per surface with the LatteAddrLib function matching its
// hardware tile mode: 0/1 linear, 2/3 micro tiled, everything else macro tiled.
template<uint32 copyBpp>
void gx2SurfaceCopySoftware_specialized(
	uint8* inputData, sint32 surfSrcHeight, sint32 srcPitch, sint32 srcDepth, uint32 srcSlice, uint32 srcSwizzle, uint32 srcHwTileMode,
	uint8* outputData, sint32 surfDstHeight, sint32 dstPitch, sint32 dstDepth, uint32 dstSlice, uint32 dstSwizzle, uint32 dstHwTileMode,
	uint32 copyWidth, uint32 copyHeight)
{
	// pipe swizzle is bit 8, bank swizzle bits 9-10 of the swizzle value
	const uint32 srcPipeSwizzle = (srcSwizzle >> 8) & 1;
	const uint32 srcBankSwizzle = ((srcSwizzle >> 9) & 3);
	const uint32 dstPipeSwizzle = (dstSwizzle >> 8) & 1;
	const uint32 dstBankSwizzle = ((dstSwizzle >> 9) & 3);
	// byte offset of element (x, y) inside a surface with the given layout
	const auto computeElementOffset = [](uint32 x, uint32 y, uint32 slice, sint32 pitch, sint32 height, sint32 depth, uint32 hwTileMode, uint32 pipeSwizzle, uint32 bankSwizzle) -> uint32
	{
		if (hwTileMode == 0 || hwTileMode == 1)
			return LatteAddrLib::ComputeSurfaceAddrFromCoordLinear(x, y, slice, 0, copyBpp, pitch, height, depth);
		if (hwTileMode == 2 || hwTileMode == 3)
			return LatteAddrLib::ComputeSurfaceAddrFromCoordMicroTiled(x, y, slice, copyBpp, pitch, height, (Latte::E_HWTILEMODE)hwTileMode, false);
		return LatteAddrLib::ComputeSurfaceAddrFromCoordMacroTiled(x, y, slice, 0, copyBpp, pitch, height, 1 * 1, (Latte::E_HWTILEMODE)hwTileMode, false, pipeSwizzle, bankSwizzle);
	};
	for (uint32 y = 0; y < copyHeight; y++)
	{
		for (uint32 x = 0; x < copyWidth; x++)
		{
			const uint32 srcOffset = computeElementOffset(x, y, srcSlice, srcPitch, surfSrcHeight, srcDepth, srcHwTileMode, srcPipeSwizzle, srcBankSwizzle);
			const uint32 dstOffset = computeElementOffset(x, y, dstSlice, dstPitch, surfDstHeight, dstDepth, dstHwTileMode, dstPipeSwizzle, dstBankSwizzle);
			uint8* sourceElement = inputData + srcOffset;
			uint8* targetElement = outputData + dstOffset;
			// move one element; the width of the store is selected at compile time
			if constexpr (copyBpp == 8)
			{
				targetElement[0] = sourceElement[0];
			}
			else if constexpr (copyBpp == 16)
			{
				*(uint16*)targetElement = *(uint16*)sourceElement;
			}
			else if constexpr (copyBpp == 32)
			{
				*(uint32*)targetElement = *(uint32*)sourceElement;
			}
			else if constexpr (copyBpp == 64 || copyBpp == 128)
			{
				// wider elements are moved as consecutive 32-bit words
				for (uint32 word = 0; word < copyBpp / 32; word++)
					*(uint32*)(targetElement + word * 4) = *(uint32*)(sourceElement + word * 4);
			}
		}
	}
}
// Fast path for copies from tilemode 4 to tilemode 4.
// Assumes aa 1.
// Data is moved one 8x8 micro tile per memcpy, which is only valid while a whole micro tile
// fits within 256 bytes (group size).
// Copying whole macro blocks at once would be a further possible speedup.
// copyWidth and copyHeight are expected to be multiples of 8.
void gx2SurfaceCopySoftware_fastPath_tm4Copy(uint8* inputData, sint32 surfSrcHeight, sint32 srcPitch, sint32 srcDepth, uint32 srcSlice, uint32 srcSwizzle,
	uint8* outputData, sint32 surfDstHeight, sint32 dstPitch, sint32 dstDepth, uint32 dstSlice, uint32 dstSwizzle,
	uint32 copyWidth, uint32 copyHeight, uint32 copyBpp)
{
	cemu_assert_debug((copyWidth & 7) == 0);
	cemu_assert_debug((copyHeight & 7) == 0);
	// pipe swizzle is bit 8, bank swizzle bits 9-10 of the swizzle value
	const uint32 srcPipeSwizzle = (srcSwizzle >> 8) & 1;
	const uint32 srcBankSwizzle = ((srcSwizzle >> 9) & 3);
	const uint32 dstPipeSwizzle = (dstSwizzle >> 8) & 1;
	const uint32 dstBankSwizzle = ((dstSwizzle >> 9) & 3);
	// bytes occupied by one 8x8 micro tile
	const uint32 microTileBytes = (copyBpp / 8) * (8 * 8);
	const bool sharedLayout = srcSlice == dstSlice && srcSwizzle == dstSwizzle && surfSrcHeight == surfDstHeight && srcPitch == dstPitch;
	if (!sharedLayout)
	{
		// source and destination tiles sit at different offsets
		for (uint32 tileY = 0; tileY < copyHeight; tileY += 8)
		{
			for (uint32 tileX = 0; tileX < copyWidth; tileX += 8)
			{
				const uint32 srcOffset = LatteAddrLib::ComputeSurfaceAddrFromCoordMacroTiled(tileX, tileY, srcSlice, 0, copyBpp, srcPitch, surfSrcHeight, 1 * 1, Latte::E_HWTILEMODE::TM_2D_TILED_THIN1, false, srcPipeSwizzle, srcBankSwizzle);
				const uint32 dstOffset = LatteAddrLib::ComputeSurfaceAddrFromCoordMacroTiled(tileX, tileY, dstSlice, 0, copyBpp, dstPitch, surfDstHeight, 1 * 1, Latte::E_HWTILEMODE::TM_2D_TILED_THIN1, false, dstPipeSwizzle, dstBankSwizzle);
				memcpy(outputData + dstOffset, inputData + srcOffset, microTileBytes);
			}
		}
		return;
	}
	// identical layout on both sides: one offset computation serves source and destination
	for (uint32 tileY = 0; tileY < copyHeight; tileY += 8)
	{
		for (uint32 tileX = 0; tileX < copyWidth; tileX += 8)
		{
			const uint32 tileOffset = LatteAddrLib::ComputeSurfaceAddrFromCoordMacroTiled(tileX, tileY, srcSlice, 0, copyBpp, srcPitch, surfSrcHeight, 1 * 1, Latte::E_HWTILEMODE::TM_2D_TILED_THIN1, false, srcPipeSwizzle, srcBankSwizzle);
			memcpy(outputData + tileOffset, inputData + tileOffset, microTileBytes);
		}
	}
}
// CPU-side surface copy. Dispatches to the tile mode 4 fast path when possible and otherwise to the
// generic copy routine specialized for the given bits-per-pixel (8, 16, 32, 64 or 128).
// Any other copyBpp value triggers a debug assert and copies nothing.
void gx2SurfaceCopySoftware(
	uint8* inputData, sint32 surfSrcHeight, sint32 srcPitch, sint32 srcDepth, uint32 srcSlice, uint32 srcSwizzle, uint32 srcHwTileMode,
	uint8* outputData, sint32 surfDstHeight, sint32 dstPitch, sint32 dstDepth, uint32 dstSlice, uint32 dstSwizzle, uint32 dstHwTileMode,
	uint32 copyWidth, uint32 copyHeight, uint32 copyBpp)
{
	const bool bothTileMode4 = (srcHwTileMode == 4 && dstHwTileMode == 4);
	const bool microTileAligned = (((copyWidth | copyHeight) & 7) == 0); // width and height are both multiples of 8
	if (bothTileMode4 && microTileAligned && copyBpp <= 32) // todo - check sample == 1
	{
		gx2SurfaceCopySoftware_fastPath_tm4Copy(inputData, surfSrcHeight, srcPitch, srcDepth, srcSlice, srcSwizzle, outputData, surfDstHeight, dstPitch, dstDepth, dstSlice, dstSwizzle, copyWidth, copyHeight, copyBpp);
		return;
	}
	// generic path, bpp is a template parameter so the per-texel copy is resolved at compile time
	switch (copyBpp)
	{
	case 8:
		gx2SurfaceCopySoftware_specialized<8>(inputData, surfSrcHeight, srcPitch, srcDepth, srcSlice, srcSwizzle, srcHwTileMode, outputData, surfDstHeight, dstPitch, dstDepth, dstSlice, dstSwizzle, dstHwTileMode, copyWidth, copyHeight);
		break;
	case 16:
		gx2SurfaceCopySoftware_specialized<16>(inputData, surfSrcHeight, srcPitch, srcDepth, srcSlice, srcSwizzle, srcHwTileMode, outputData, surfDstHeight, dstPitch, dstDepth, dstSlice, dstSwizzle, dstHwTileMode, copyWidth, copyHeight);
		break;
	case 32:
		gx2SurfaceCopySoftware_specialized<32>(inputData, surfSrcHeight, srcPitch, srcDepth, srcSlice, srcSwizzle, srcHwTileMode, outputData, surfDstHeight, dstPitch, dstDepth, dstSlice, dstSwizzle, dstHwTileMode, copyWidth, copyHeight);
		break;
	case 64:
		gx2SurfaceCopySoftware_specialized<64>(inputData, surfSrcHeight, srcPitch, srcDepth, srcSlice, srcSwizzle, srcHwTileMode, outputData, surfDstHeight, dstPitch, dstDepth, dstSlice, dstSwizzle, dstHwTileMode, copyWidth, copyHeight);
		break;
	case 128:
		gx2SurfaceCopySoftware_specialized<128>(inputData, surfSrcHeight, srcPitch, srcDepth, srcSlice, srcSwizzle, srcHwTileMode, outputData, surfDstHeight, dstPitch, dstDepth, dstSlice, dstSwizzle, dstHwTileMode, copyWidth, copyHeight);
		break;
	default:
		cemu_assert_debug(false);
		break;
	}
}
// Implementation of GX2CopySurface: copies mip level srcMip / slice srcSlice of srcSurface to
// mip level dstMip / slice dstSlice of dstSurface.
// Order of operations as performed below:
// 1) bail out if the two mip levels differ in size, if the source has no mip levels or if the hardware formats differ
// 2) for tiled (TM_2D_TILED_THIN1) -> TM_LINEAR_SPECIAL copies, force a texture readback of the source and wait for it
// 3) if both hardware tile modes are in the range 1..15, queue an IT_HLE_COPY_SURFACE_NEW command for the GPU
// 4) unless the copy matches one of the hard-coded per-game patterns, additionally copy the texel data on the CPU
void gx2Surface_GX2CopySurface(GX2Surface* srcSurface, uint32 srcMip, uint32 srcSlice, GX2Surface* dstSurface, uint32 dstMip, uint32 dstSlice)
{
sint32 dstWidth = dstSurface->width;
sint32 dstHeight = dstSurface->height;
sint32 srcWidth = srcSurface->width;
sint32 srcHeight = srcSurface->height;
// size of the selected mip levels, never smaller than 1x1
sint32 dstMipWidth = std::max(dstWidth>>dstMip, 1);
sint32 dstMipHeight = std::max(dstHeight>>dstMip, 1);
sint32 srcMipWidth = std::max(srcWidth>>srcMip, 1);
sint32 srcMipHeight = std::max(srcHeight>>srcMip, 1);
// no scaling is supported, both mip levels must have identical dimensions
if( dstMipWidth != srcMipWidth || dstMipHeight != srcMipHeight )
{
cemu_assert_debug(false);
return;
}
// handle format
Latte::E_GX2SURFFMT srcFormat = srcSurface->format;
Latte::E_GX2SURFFMT dstFormat = dstSurface->format;
// note: srcBPP and dstBPP are not used further down in this function
uint32 srcBPP = Latte::GetFormatBits(srcFormat);
uint32 dstBPP = Latte::GetFormatBits(dstFormat);
auto srcHwFormat = Latte::GetHWFormat(srcFormat);
auto dstHwFormat = Latte::GetHWFormat(dstFormat);
// get texture info
LatteAddrLib::AddrSurfaceInfo_OUT surfOutSrc = {0};
GX2::GX2CalculateSurfaceInfo(srcSurface, srcMip, &surfOutSrc);
LatteAddrLib::AddrSurfaceInfo_OUT surfOutDst = {0};
GX2::GX2CalculateSurfaceInfo(dstSurface, dstMip, &surfOutDst);
// check parameters
if (srcSurface->numLevels == 0)
{
debug_printf("GX2CopySurface(): mip count is 0\n");
return;
}
// get input pointer
// mip 0 lives at imagePtr, mip 1 at mipPtr and higher mips at mipPtr + mipOffset[mip - 1]
uint8* inputData = NULL;
cemu_assert(srcMip < srcSurface->numLevels);
if( srcMip == 0 )
inputData = (uint8*)memory_getPointerFromVirtualOffset(srcSurface->imagePtr);
else if( srcMip == 1 )
inputData = (uint8*)memory_getPointerFromVirtualOffset(srcSurface->mipPtr);
else
{
inputData = (uint8*)memory_getPointerFromVirtualOffset(srcSurface->mipPtr + srcSurface->mipOffset[srcMip - 1]);
}
// get output pointer (same addressing scheme as for the input pointer)
uint8* outputData = NULL;
cemu_assert(dstMip < dstSurface->numLevels);
if( dstMip == 0 )
outputData = (uint8*)memory_getPointerFromVirtualOffset(dstSurface->imagePtr);
else if( dstMip == 1 )
outputData = (uint8*)memory_getPointerFromVirtualOffset(dstSurface->mipPtr);
else
{
outputData = (uint8*)memory_getPointerFromVirtualOffset(dstSurface->mipPtr + dstSurface->mipOffset[dstMip - 1]);
}
// format conversion is not supported, only the hardware formats are compared
if( srcHwFormat != dstHwFormat )
{
// mismatching format
forceLogDebug_printf("GX2CopySurface(): Format mismatch\n");
return;
}
// note: Do not trust values from the input GX2Surface* structs but rely on surfOutDst/surfOutSrc instead if possible.
// src
uint32 srcPitch = surfOutSrc.pitch;
uint32 srcSwizzle = srcSurface->swizzle;
uint32 srcHwTileMode = (uint32)surfOutSrc.hwTileMode;
uint32 srcDepth = std::max<uint32>(surfOutSrc.depth, 1);
if (srcHwTileMode == 0) // linear
{
// for linear sources the pitch is derived from the surface struct instead of the calculated surface info
srcPitch = srcSurface->pitch >> srcMip;
srcPitch = std::max<uint32>(srcPitch, 1);
}
// dst
uint32 dstPitch = surfOutDst.pitch;
uint32 dstSwizzle = dstSurface->swizzle;
uint32 dstHwTileMode = (uint32)surfOutDst.hwTileMode;
uint32 dstDepth = std::max<uint32>(surfOutDst.depth, 1);
uint32 dstBpp = surfOutDst.bpp;
//debug_printf("Src Tex: %08X %dx%d Swizzle: %08x tm: %d fmt: %04x use: %02x\n", _swapEndianU32(srcSurface->imagePtr), _swapEndianU32(srcSurface->width), _swapEndianU32(srcSurface->height), _swapEndianU32(srcSurface->swizzle), _swapEndianU32(srcSurface->tileMode), _swapEndianU32(srcSurface->format), (uint32)srcSurface->resFlag);
//debug_printf("Dst Tex: %08X %dx%d Swizzle: %08x tm: %d fmt: %04x use: %02x\n", _swapEndianU32(dstSurface->imagePtr), _swapEndianU32(dstSurface->width), _swapEndianU32(dstSurface->height), _swapEndianU32(dstSurface->swizzle), _swapEndianU32(dstSurface->tileMode), _swapEndianU32(dstSurface->format), (uint32)dstSurface->resFlag);
// requestGPURAMCopy is never set to true in this function, so the branches depending on it are currently inactive
bool requestGPURAMCopy = false;
// when set, the CPU copy at the end of the function is performed even if the per-game exclusion list matches
bool debugTestForceCPUCopy = false;
if (srcSurface->tileMode == Latte::E_GX2TILEMODE::TM_LINEAR_SPECIAL && dstSurface->tileMode == Latte::E_GX2TILEMODE::TM_2D_TILED_THIN1)
debugTestForceCPUCopy = true;
if (srcSurface->tileMode == Latte::E_GX2TILEMODE::TM_2D_TILED_THIN1 && dstSurface->tileMode == Latte::E_GX2TILEMODE::TM_LINEAR_SPECIAL )
{
// tiled -> linear special: queue a forced texture readback of the source and block until all queued async commands are processed
LatteAsyncCommands_queueForceTextureReadback(
srcSurface->imagePtr,
srcSurface->mipPtr,
srcSurface->swizzle,
(uint32)srcSurface->format.value(),
srcSurface->width,
srcSurface->height,
srcSurface->depth,
srcSurface->pitch,
srcSlice,
(uint32)srcSurface->dim.value(),
Latte::MakeHWTileMode(srcSurface->tileMode),
srcSurface->aa,
srcMip);
LatteAsyncCommands_waitUntilAllProcessed();
debugTestForceCPUCopy = true;
}
// send copy command to GPU
// condition reads as: (both hw tile modes within 1..15) || requestGPURAMCopy
if( srcHwTileMode > 0 && srcHwTileMode < 16 && dstHwTileMode > 0 && dstHwTileMode < 16 || requestGPURAMCopy )
{
// 1 header word + 13 parameters for each of the two surfaces
GX2ReserveCmdSpace(1+13*2);
gx2WriteGather_submit(pm4HeaderType3(IT_HLE_COPY_SURFACE_NEW, 13*2),
// src
(uint32)srcSurface->imagePtr,
(uint32)srcSurface->mipPtr,
(uint32)srcSurface->swizzle,
(uint32)srcSurface->format.value(),
(uint32)srcSurface->width,
(uint32)srcSurface->height,
(uint32)srcSurface->depth,
(uint32)srcSurface->pitch,
srcSlice,
(uint32)srcSurface->dim.value(),
(uint32)srcSurface->tileMode.value(),
(uint32)srcSurface->aa,
srcMip,
// dst
(uint32)dstSurface->imagePtr,
(uint32)dstSurface->mipPtr,
(uint32)dstSurface->swizzle,
(uint32)dstSurface->format.value(),
(uint32)dstSurface->width,
(uint32)dstSurface->height,
(uint32)dstSurface->depth,
(uint32)dstSurface->pitch,
dstSlice,
(uint32)dstSurface->dim.value(),
(uint32)dstSurface->tileMode.value(),
(uint32)dstSurface->aa,
dstMip);
}
if (requestGPURAMCopy)
return; // if RAM copy happens on the GPU side we skip it here
// manually exclude expensive CPU texture copies for some known game framebuffer textures
// todo - find a better way to solve this
// each line below matches a surface signature (address range / size / format / resource flags) of one game, the game is named in the trailing comment
bool isDynamicTexCopy = false;
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->imagePtr >= 0xF4000000 && srcSurface->width >= 800 && srcFormat == Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT); // SM3DW
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->imagePtr >= 0xF4000000 && srcSurface->width >= 800 && srcFormat == Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM); // Trine 2
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->imagePtr >= 0xF4000000 && srcSurface->width == 0xA0 && srcFormat == Latte::E_GX2SURFFMT::R32_FLOAT); // Little Inferno
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->imagePtr >= 0xF4000000 && srcSurface->width == 1280 && srcFormat == Latte::E_GX2SURFFMT::R32_FLOAT); // Donkey Kong Tropical Freeze
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->imagePtr >= 0xF4000000 && srcSurface->width == 640 && srcSurface->height == 320 && srcFormat == Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT); // SM3DW Switch Scramble Circus
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 1280 && srcSurface->height == 720 && srcFormat == Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM && srcSurface->tileMode != Latte::E_GX2TILEMODE::TM_LINEAR_ALIGNED ); // Affordable Space Adventures
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 854 && srcSurface->height == 480 && srcFormat == Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM); // Affordable Space Adventures
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->imagePtr >= 0xF4000000 && srcSurface->width == 1152 && srcSurface->height == 720 && srcFormat == Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT && (srcSurface->resFlag&GX2_RESFLAG_USAGE_COLOR_BUFFER) != 0 ); // Star Fox Zero
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->imagePtr >= 0xF4000000 && srcSurface->width == 680 && srcSurface->height == 480 && srcFormat == Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT && (srcSurface->resFlag&GX2_RESFLAG_USAGE_COLOR_BUFFER) != 0 ); // Star Fox Zero
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->imagePtr >= 0xF4000000 && srcSurface->width == 1280 && srcSurface->height == 720 && srcFormat == Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT ); // Qube
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 322 && srcSurface->height == 182 && srcFormat == Latte::E_GX2SURFFMT::R16_G16_B16_A16_UNORM ); // Qube
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 640 && srcSurface->height == 360 && srcFormat == Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT ); // Qube
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 1920 && srcSurface->height == 1080 && srcFormat == Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM && dstSurface->resFlag == 0x80000003); // Cosmophony
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 854 && srcSurface->height == 480 && srcFormat == Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM && dstSurface->resFlag == 0x3); // Cosmophony
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 1280 && srcSurface->height == 720 && srcFormat == Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB && dstSurface->resFlag == 0x3); // The Fall
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 854 && srcSurface->height == 480 && srcFormat == Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB && dstSurface->resFlag == 0x3); // The Fall
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 1280 && srcSurface->height == 720 && srcFormat == Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB && dstSurface->resFlag == 0x80000003); // The Fall
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 1280 && srcSurface->height == 720 && srcFormat == Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB && srcSurface->resFlag == 0x80000003); // Nano Assault Neo
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->imagePtr >= 0xF4000000 && srcSurface->width == 1280 && srcSurface->height == 720 && srcFormat == Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM); // Mario Party 10
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->imagePtr >= 0xF4000000 && srcSurface->width == 854 && srcSurface->height == 480 && srcFormat == Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM); // Mario Party 10
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 1920 && srcSurface->height == 1080 && srcFormat == Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM && dstSurface->resFlag == 0x3); // Hello Kitty Kruisers
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 1024 && srcSurface->height == 1024 && srcFormat == Latte::E_GX2SURFFMT::R32_FLOAT && dstSurface->resFlag == 0x5); // Art Academy
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 260 && srcSurface->height == 148 && srcFormat == Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT && dstSurface->resFlag == 0x3); // Transformers: Rise of the Dark Spark
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 1040 && srcSurface->height == 592 && srcFormat == Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT && dstSurface->resFlag == 0x3); // Transformers: Rise of the Dark Spark
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 854 && srcSurface->height == 480 && srcFormat == Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB && srcSurface->resFlag == 0x3); // Nano Assault Neo
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 1024 && srcSurface->height == 576 && srcFormat == Latte::E_GX2SURFFMT::D24_S8_UNORM && srcSurface->resFlag == 0x1); // Skylanders SuperChargers
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 1152 && srcSurface->height == 648 && (srcFormat == Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM || srcFormat == Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT) && srcSurface->resFlag == 0x1); // Watch Dogs
isDynamicTexCopy = isDynamicTexCopy || (srcSurface->width == 576 && srcSurface->height == 324 && (srcFormat == Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM || srcFormat == Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT) && srcSurface->resFlag == 0x1); // Watch Dogs
// the exclusion list is ignored for the tile mode combinations that set debugTestForceCPUCopy above
if( isDynamicTexCopy && debugTestForceCPUCopy == false)
{
debug_printf("Software tex copy blocked\n");
return;
}
sint32 copyWidth = dstMipWidth;
sint32 copyHeight = dstMipHeight;
if (Latte::IsCompressedFormat(dstHwFormat))
{
// compressed formats are copied in units of 4x4 blocks, round the size up to whole blocks
copyWidth = (copyWidth + 3) / 4;
copyHeight = (copyHeight + 3) / 4;
}
// CPU copy; uses the bpp reported for the destination surface (hardware formats were verified to match above)
gx2SurfaceCopySoftware(inputData, surfOutSrc.height, srcPitch, srcDepth, srcSlice, srcSwizzle, srcHwTileMode,
outputData, surfOutDst.height, dstPitch, dstDepth, dstSlice, dstSwizzle, dstHwTileMode,
copyWidth, copyHeight, dstBpp);
}
// PPC export: GX2CopySurface(srcSurface, srcMip, srcSlice, dstSurface, dstMip, dstSlice)
// The six arguments are taken from r3..r8, the surface pointers are translated to host pointers first.
void gx2Export_GX2CopySurface(PPCInterpreter_t* hCPU)
{
	// surfaces
	GX2Surface* sourceSurface = (GX2Surface*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
	GX2Surface* targetSurface = (GX2Surface*)memory_getPointerFromVirtualOffset(hCPU->gpr[6]);
	// mip and slice selection
	uint32 sourceMip = hCPU->gpr[4];
	uint32 sourceSlice = hCPU->gpr[5];
	uint32 targetMip = hCPU->gpr[7];
	uint32 targetSlice = hCPU->gpr[8];
	gx2Surface_GX2CopySurface(sourceSurface, sourceMip, sourceSlice, targetSurface, targetMip, targetSlice);
	osLib_returnFromFunction(hCPU, 0);
}
// Source rectangle as read from guest memory by gx2Export_GX2CopySurfaceEx.
// The fields are byte-swapped by the reader before use; width is right - left and height is bottom - top.
struct GX2Rect_t
{
	sint32 left;
	sint32 top;
	sint32 right;
	sint32 bottom;
};
// Destination point (x/y pair) of a GX2CopySurfaceEx rect, laid out as two 32-bit signed integers
struct GX2Point_t
{
	sint32 x;
	sint32 y;
};
// PPC export: GX2CopySurfaceEx(srcSurface, srcMip, srcSlice, dstSurface, dstMip, dstSlice, rectCount, rectSrcArray, pointDstArray)
// r3..r10 hold the first eight arguments, the ninth one (destination point array) is read from the stack at r1+0x8.
// Only rects that cover the whole source surface are implemented; those are forwarded to gx2Surface_GX2CopySurface().
// Any other rect is logged and skipped. The destination points are currently not evaluated.
void gx2Export_GX2CopySurfaceEx(PPCInterpreter_t* hCPU)
{
	forceLogDebug_printf("GX2CopySurfaceEx(0x%08x,%d,%d,0x%08x,%d,%d,%d,0x%08x,0x%08x)", hCPU->gpr[3], hCPU->gpr[4], hCPU->gpr[5], hCPU->gpr[6], hCPU->gpr[7], hCPU->gpr[8], hCPU->gpr[9], hCPU->gpr[10], memory_readU32(hCPU->gpr[1]+0x8));
	GX2Surface* srcSurface = (GX2Surface*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
	uint32 srcMip = hCPU->gpr[4];
	uint32 srcSlice = hCPU->gpr[5];
	GX2Surface* dstSurface = (GX2Surface*)memory_getPointerFromVirtualOffset(hCPU->gpr[6]);
	uint32 dstMip = hCPU->gpr[7];
	uint32 dstSlice = hCPU->gpr[8];
	sint32 rectCount = hCPU->gpr[9];
	MPTR rectSrcArrayMPTR = hCPU->gpr[10];
	MPTR pointDstArrayMPTR = memory_readU32(hCPU->gpr[1]+0x8);
	GX2Rect_t* rectSrc = (GX2Rect_t*)memory_getPointerFromVirtualOffset(rectSrcArrayMPTR);
	GX2Point_t* rectDst = (GX2Point_t*)memory_getPointerFromVirtualOffset(pointDstArrayMPTR); // not used yet, see function comment
	// log every rect of the array
	// (indexing with i is the fix here, the previous code printed the first rect rectCount times)
	for (sint32 i = 0; i < rectCount; i++)
	{
		forceLogDebug_printf("rect left-top: %d/%d size: %d/%d", _swapEndianU32(rectSrc[i].left), _swapEndianU32(rectSrc[i].top), _swapEndianU32(rectSrc[i].right) - _swapEndianU32(rectSrc[i].left), _swapEndianU32(rectSrc[i].bottom) - _swapEndianU32(rectSrc[i].top));
	}
	// in non-public builds, break on any parameter combination that has not been seen/tested yet
#ifndef PUBLIC_RELEASE
	if( rectCount != 1 )
		assert_dbg();
	if( srcMip != 0 )
		assert_dbg();
	if( srcSlice != 0 )
		assert_dbg();
	if( dstMip != 0 )
		assert_dbg();
	if( dstSlice != 0 )
		assert_dbg();
#endif
	for(sint32 i=0; i<rectCount; i++)
	{
		uint32 srcWidth = srcSurface->width;
		uint32 srcHeight = srcSurface->height;
		// calculate rect size (rect fields are byte-swapped before use)
		sint32 rectSrcX = (sint32)_swapEndianU32((uint32)rectSrc[i].left);
		sint32 rectSrcY = (sint32)_swapEndianU32((uint32)rectSrc[i].top);
		sint32 rectWidth = (sint32)_swapEndianU32((uint32)rectSrc[i].right) - rectSrcX;
		sint32 rectHeight = (sint32)_swapEndianU32((uint32)rectSrc[i].bottom) - rectSrcY;
		if( rectSrcX == 0 && rectSrcY == 0 && rectWidth == srcWidth && rectHeight == srcHeight )
		{
			// special case in which GX2CopySurfaceEx acts like GX2CopySurface()
			gx2Surface_GX2CopySurface(srcSurface, srcMip, srcSlice, dstSurface, dstMip, dstSlice);
		}
		else
		{
			// partial copies are not implemented, make the dropped request visible in the log
			forceLogDebug_printf("GX2CopySurfaceEx(): Partial rect copy is not supported (rect %d skipped)", i);
		}
	}
	osLib_returnFromFunction(hCPU, 0);
}
// PPC export: GX2ResolveAAColorBuffer(srcColorBuffer, dstSurface, dstMip, dstSlice)
// r3 = source color buffer, r4 = destination surface, r5 = destination mip, r6 = destination slice.
// Source mip and slice are taken from the color buffer view (viewMip / viewFirstSlice).
// After validating that sizes and hardware formats match, a single IT_HLE_COPY_SURFACE_NEW command is queued.
// If a check fails the function returns without queueing anything.
void gx2Export_GX2ResolveAAColorBuffer(PPCInterpreter_t* hCPU)
{
	debug_printf("GX2ResolveAAColorBuffer(0x%08x,0x%08x,%d,%d)\n", hCPU->gpr[3], hCPU->gpr[4], hCPU->gpr[5], hCPU->gpr[6]);
	GX2ColorBuffer* srcColorBuffer = (GX2ColorBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
	GX2Surface* srcSurface = &srcColorBuffer->surface;
	GX2Surface* dstSurface = (GX2Surface*)memory_getPointerFromVirtualOffset(hCPU->gpr[4]);
	uint32 srcMip = _swapEndianU32(srcColorBuffer->viewMip);
	uint32 dstMip = hCPU->gpr[5];
	uint32 srcSlice = _swapEndianU32(srcColorBuffer->viewFirstSlice);
	uint32 dstSlice = hCPU->gpr[6];
	// in non-public builds, break if the view selects anything but mip 0 / slice 0
#ifndef PUBLIC_RELEASE
	if( srcMip != 0 || srcSlice != 0 )
		assert_dbg();
#endif
	// size of the selected mip levels, never smaller than 1x1
	sint32 dstWidth = dstSurface->width;
	sint32 dstHeight = dstSurface->height;
	sint32 srcWidth = srcSurface->width;
	sint32 srcHeight = srcSurface->height;
	uint32 dstMipWidth = std::max(dstWidth>>dstMip, 1);
	uint32 dstMipHeight = std::max(dstHeight>>dstMip, 1);
	uint32 srcMipWidth = std::max(srcWidth>>srcMip, 1);
	uint32 srcMipHeight = std::max(srcHeight>>srcMip, 1);
	// check if surface properties match
	if( srcSurface->width != dstSurface->width || srcSurface->height != dstSurface->height )
	{
		osLib_returnFromFunction(hCPU, 0);
		return;
	}
	if( dstMipWidth != srcMipWidth || dstMipHeight != srcMipHeight )
	{
		cemu_assert_suspicious();
		osLib_returnFromFunction(hCPU, 0);
		return;
	}
	// handle format
	Latte::E_GX2SURFFMT srcFormat = srcSurface->format;
	Latte::E_GX2SURFFMT dstFormat = dstSurface->format;
	// NOTE(review): srcBPP/dstBPP, surfOutSrc/surfOutDst and inputData/outputData below are computed but not consumed.
	// The helper calls are kept because their side effects are not visible from this file - confirm before removing.
	uint32 srcBPP = Latte::GetFormatBits(srcFormat);
	uint32 dstBPP = Latte::GetFormatBits(dstFormat);
	auto srcHwFormat = Latte::GetHWFormat(srcFormat);
	auto dstHwFormat = Latte::GetHWFormat(dstFormat);
	// get texture info
	LatteAddrLib::AddrSurfaceInfo_OUT surfOutSrc = {0};
	GX2::GX2CalculateSurfaceInfo(srcSurface, srcMip, &surfOutSrc);
	LatteAddrLib::AddrSurfaceInfo_OUT surfOutDst = {0};
	GX2::GX2CalculateSurfaceInfo(dstSurface, dstMip, &surfOutDst);
	// get input pointer
	// mip 0 lives at imagePtr, mip 1 at mipPtr and higher mips at mipPtr + mipOffset[mip - 1]
	uint8* inputData = NULL;
	cemu_assert(srcMip < srcSurface->numLevels);
	if( srcMip == 0 )
		inputData = (uint8*)memory_getPointerFromVirtualOffset(srcSurface->imagePtr);
	else if( srcMip == 1 )
		inputData = (uint8*)memory_getPointerFromVirtualOffset(srcSurface->mipPtr);
	else
		inputData = (uint8*)memory_getPointerFromVirtualOffset(srcSurface->mipPtr+srcSurface->mipOffset[srcMip-1]);
	// get output pointer
	uint8* outputData = NULL;
	cemu_assert(dstMip < dstSurface->numLevels);
	if( dstMip == 0 )
		outputData = (uint8*)memory_getPointerFromVirtualOffset(dstSurface->imagePtr);
	else if( dstMip == 1 )
		outputData = (uint8*)memory_getPointerFromVirtualOffset(dstSurface->mipPtr);
	else
		outputData = (uint8*)memory_getPointerFromVirtualOffset(dstSurface->mipPtr+dstSurface->mipOffset[dstMip-1]);
	// compressed formats are addressed in 4x4 blocks, so a compressed/uncompressed pairing would use different step sizes
	if( Latte::IsCompressedFormat(srcHwFormat) != Latte::IsCompressedFormat(dstHwFormat) )
		assert_dbg();
	if( srcHwFormat != dstHwFormat )
	{
		// mismatching format
		debug_printf("GX2ResolveAAColorBuffer(): Format mismatch\n");
		osLib_returnFromFunction(hCPU, 0);
		return;
	}
	// send copy command to GPU (1 header word + 13 parameters for each of the two surfaces)
	GX2ReserveCmdSpace(1 + 13 * 2);
	gx2WriteGather_submit(pm4HeaderType3(IT_HLE_COPY_SURFACE_NEW, 13 * 2),
		// src
		(uint32)srcSurface->imagePtr,
		(uint32)srcSurface->mipPtr,
		(uint32)srcSurface->swizzle,
		(uint32)srcSurface->format.value(),
		(uint32)srcSurface->width,
		(uint32)srcSurface->height,
		(uint32)srcSurface->depth,
		(uint32)srcSurface->pitch,
		srcSlice,
		(uint32)srcSurface->dim.value(),
		(uint32)srcSurface->tileMode.value(),
		(uint32)srcSurface->aa,
		srcMip,
		// dst
		(uint32)dstSurface->imagePtr,
		(uint32)dstSurface->mipPtr,
		(uint32)dstSurface->swizzle,
		(uint32)dstSurface->format.value(),
		(uint32)dstSurface->width,
		(uint32)dstSurface->height,
		(uint32)dstSurface->depth,
		(uint32)dstSurface->pitch,
		dstSlice,
		(uint32)dstSurface->dim.value(),
		(uint32)dstSurface->tileMode.value(),
		(uint32)dstSurface->aa,
		dstMip);
	osLib_returnFromFunction(hCPU, 0);
}
// PPC export: GX2ConvertDepthBufferToTextureSurface(depthBuffer, dstSurface, dstMip, dstSlice)
// r3 = depth buffer, r4 = destination surface, r5 = destination mip, r6 = destination slice.
// If source and destination share the same image pointer nothing is copied (in-place re-tiling).
// Otherwise one IT_HLE_COPY_SURFACE_NEW command is queued per slice of the depth buffer view
// (viewNumSlices, at least 1), starting at viewFirstSlice on the source side and dstSlice on the destination side.
void gx2Export_GX2ConvertDepthBufferToTextureSurface(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2ConvertDepthBufferToTextureSurface(0x%x, 0x%x, %d, %d)\n", hCPU->gpr[3], hCPU->gpr[4], hCPU->gpr[5], hCPU->gpr[6]);
	GX2DepthBuffer* depthBuffer = (GX2DepthBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
	GX2Surface* dstSurface = (GX2Surface*)memory_getPointerFromVirtualOffset(hCPU->gpr[4]);
	uint32 dstMip = hCPU->gpr[5];
	uint32 dstSlice = hCPU->gpr[6];
	// non-zero destination mip/slice is an untested path
	if (dstMip != 0 || dstSlice != 0)
		debugBreakpoint();
	// get texture info
	// NOTE(review): the calculated surface infos are not consumed below. The calls are kept because their
	// side effects are not visible from this file - confirm before removing.
	LatteAddrLib::AddrSurfaceInfo_OUT surfOutSrc = { 0 };
	GX2::GX2CalculateSurfaceInfo(&depthBuffer->surface, 0, &surfOutSrc);
	LatteAddrLib::AddrSurfaceInfo_OUT surfOutDst = { 0 };
	GX2::GX2CalculateSurfaceInfo(dstSurface, 0, &surfOutDst);
	if (depthBuffer->surface.imagePtr == dstSurface->imagePtr)
	{
		// in-place re-tiling doesn't need any actual copy operation?
		if (dstMip != 0 || dstSlice != 0)
			debugBreakpoint();
		debug_printf("In-place re-tiling\n");
		osLib_returnFromFunction(hCPU, 0);
		return;
	}
	sint32 srcMip = 0;
	// The view fields of the depth buffer are byte-swapped before use. viewFirstSlice was previously used
	// without the swap, which only gave the right result for a first slice of 0.
	// NOTE(review): assumes viewFirstSlice has the same raw type as viewNumSlices - confirm against the GX2DepthBuffer declaration
	uint32 srcFirstSlice = _swapEndianU32(depthBuffer->viewFirstSlice);
	uint32 numSlices = std::max<uint32>(_swapEndianU32(depthBuffer->viewNumSlices), 1);
	// reserve space for all commands up front (1 header word + 13 parameters for each of the two surfaces, per slice)
	GX2ReserveCmdSpace((1 + 13 * 2) * numSlices);
	for (uint32 subSliceIndex = 0; subSliceIndex < numSlices; subSliceIndex++)
	{
		// send copy command to GPU
		gx2WriteGather_submit(pm4HeaderType3(IT_HLE_COPY_SURFACE_NEW, 13 * 2),
			// src
			(uint32)(depthBuffer->surface.imagePtr),
			(uint32)(depthBuffer->surface.mipPtr),
			(uint32)(depthBuffer->surface.swizzle),
			(uint32)(depthBuffer->surface.format.value()),
			(uint32)(depthBuffer->surface.width),
			(uint32)(depthBuffer->surface.height),
			(uint32)(depthBuffer->surface.depth),
			(uint32)(depthBuffer->surface.pitch),
			srcFirstSlice + subSliceIndex,
			(uint32)(depthBuffer->surface.dim.value()),
			(uint32)(depthBuffer->surface.tileMode.value()),
			(uint32)(depthBuffer->surface.aa),
			srcMip,
			// dst
			(uint32)(dstSurface->imagePtr),
			(uint32)(dstSurface->mipPtr),
			(uint32)(dstSurface->swizzle),
			(uint32)(dstSurface->format.value()),
			(uint32)(dstSurface->width),
			(uint32)(dstSurface->height),
			(uint32)(dstSurface->depth),
			(uint32)(dstSurface->pitch),
			dstSlice + subSliceIndex,
			(uint32)(dstSurface->dim.value()),
			(uint32)(dstSurface->tileMode.value()),
			(uint32)(dstSurface->aa),
			dstMip);
	}
	osLib_returnFromFunction(hCPU, 0);
}
namespace GX2
{
void GX2SurfaceCopyInit()
{
osLib_addFunction("gx2", "GX2CopySurface", gx2Export_GX2CopySurface);
osLib_addFunction("gx2", "GX2CopySurfaceEx", gx2Export_GX2CopySurfaceEx);
osLib_addFunction("gx2", "GX2ResolveAAColorBuffer", gx2Export_GX2ResolveAAColorBuffer);
osLib_addFunction("gx2", "GX2ConvertDepthBufferToTextureSurface", gx2Export_GX2ConvertDepthBufferToTextureSurface);
}
};
void gx2CopySurfaceTest()
{
return;
BenchmarkTimer bt;
// copy 0
bt.Start();
for(sint32 i=0; i<100; i++)
gx2SurfaceCopySoftware(
memory_base + 0x10000000, 256, 256, 1, 0, 0, 4,
memory_base + 0x20000000, 256, 256, 1, 0, 0, 4,
64, 64, 32
);
bt.Stop();
debug_printf("Copy 0 - %lfms\n", bt.GetElapsedMilliseconds());
// copy 1
bt.Start();
for (sint32 i = 0; i < 100; i++)
gx2SurfaceCopySoftware(
memory_base + 0x11000000, 256, 256, 1, 0, 0, 4,
memory_base + 0x21000000, 256, 256, 1, 0, 0, 2,
64, 64, 32
);
bt.Stop();
debug_printf("Copy 1 - %lfms\n", bt.GetElapsedMilliseconds());
// copy 2
bt.Start();
for (sint32 i = 0; i < 100; i++)
gx2SurfaceCopySoftware(
memory_base + 0x12000000, 256, 256, 1, 0, 0, 1,
memory_base + 0x22000000, 256, 256, 1, 0, 0, 4,
64, 64, 128
);
bt.Stop();
debug_printf("Copy 2 - %lfms\n", bt.GetElapsedMilliseconds());
// copy 3
bt.Start();
for (sint32 i = 0; i < 100; i++)
gx2SurfaceCopySoftware(
memory_base + 0x12000000, 256, 256, 1, 0, 0, 4,
memory_base + 0x22000000, 256, 256, 1, 0, 0, 4,
64, 512, 32
);
bt.Stop();
debug_printf("Copy 3 - %lfms\n", bt.GetElapsedMilliseconds());
cemu_assert_debug(false);
// with bpp switch optimized away:
// Copy 0 - 19.777100ms
// Copy 1 - 14.311300ms
// Copy 2 - 10.837700ms
// Copy 3 - 158.174400ms
// Copy 0 - 19.846800ms
// Copy 1 - 14.054000ms
// Copy 2 - 11.013500ms
// Copy 3 - 159.916000ms
// with fast path added:
// Copy 0 - 0.222400ms
// Copy 1 - 14.125700ms
// Copy 2 - 13.298100ms
// Copy 3 - 1.764500ms
// with shared offset:
// Copy 0 - 0.143300ms
// Copy 1 - 13.814200ms
// Copy 2 - 10.309500ms
// Copy 3 - 1.191900ms
}

View file

@ -0,0 +1,6 @@
#pragma once
namespace GX2
{
// Registers the gx2 surface copy exports (GX2CopySurface, GX2CopySurfaceEx, GX2ResolveAAColorBuffer,
// GX2ConvertDepthBufferToTextureSurface) via osLib_addFunction
void GX2SurfaceCopyInit();
};

View file

@ -0,0 +1,392 @@
#include "Cafe/OS/common/OSCommon.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "GX2.h"
#include "GX2_Texture.h"
#include "Cafe/HW/Latte/Core/Latte.h"
#include "Cafe/HW/Latte/Core/LatteDraw.h"
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "Cafe/HW/Latte/ISA/LatteReg.h"
namespace GX2
{
using namespace Latte;
/****** Texture functions ******/
void GX2InitTextureRegs(GX2Texture* texture)
{
uint32 _regs[5] = { 0 };
// some values may not be zero
if (texture->viewNumMips == 0)
texture->viewNumMips = 1;
if (texture->viewNumSlices == 0)
texture->viewNumSlices = 1;
if (texture->surface.height == 0)
texture->surface.height = 1;
if (texture->surface.depth == 0)
texture->surface.depth = 1;
if (texture->surface.numLevels == 0)
texture->surface.numLevels = 1;
// texture parameters
uint32 viewNumMips = texture->viewNumMips;
uint32 viewNumSlices = texture->viewNumSlices;
uint32 viewFirstMip = texture->viewFirstMip;
uint32 viewFirstSlice = texture->viewFirstSlice;
uint32 compSel = texture->compSel;
// surface parameters
uint32 width = texture->surface.width;
uint32 height = texture->surface.height;
uint32 depth = texture->surface.depth;
uint32 pitch = texture->surface.pitch;
uint32 numMips = texture->surface.numLevels;
Latte::E_GX2SURFFMT format = texture->surface.format;
Latte::E_DIM dim = texture->surface.dim;
uint32 tileMode = (uint32)texture->surface.tileMode.value();
uint32 surfaceFlags = texture->surface.resFlag;
uint32 surfaceAA = texture->surface.aa;
// calculate register word 0
Latte::E_HWSURFFMT formatHw = Latte::GetHWFormat(format);
Latte::LATTE_SQ_TEX_RESOURCE_WORD0_N newRegWord0;
newRegWord0.set_DIM(dim);
newRegWord0.set_TILE_MODE(Latte::MakeHWTileMode(texture->surface.tileMode));
newRegWord0.set_TILE_TYPE((surfaceFlags&4) != 0);
uint32 pixelPitch = pitch;
if (Latte::IsCompressedFormat(formatHw))
pixelPitch *= 4;
if(pixelPitch == 0)
newRegWord0.set_PITCH(0x7FF);
else
newRegWord0.set_PITCH((pixelPitch >> 3) - 1);
if (width == 0)
newRegWord0.set_WIDTH(0x1FFF);
else
newRegWord0.set_WIDTH(width - 1);
texture->regTexWord0 = newRegWord0;
// calculate register word 1
Latte::LATTE_SQ_TEX_RESOURCE_WORD1_N newRegWord1;
newRegWord1.set_HEIGHT(height - 1);
if (dim == Latte::E_DIM::DIM_CUBEMAP)
{
newRegWord1.set_DEPTH((depth / 6) - 1);
}
else if (dim == E_DIM::DIM_3D ||
dim == E_DIM::DIM_2D_ARRAY_MSAA ||
dim == E_DIM::DIM_2D_ARRAY ||
dim == E_DIM::DIM_1D_ARRAY)
{
newRegWord1.set_DEPTH(depth - 1);
}
else
{
newRegWord1.set_DEPTH(0);
}
newRegWord1.set_DATA_FORMAT(formatHw);
texture->regTexWord1 = newRegWord1;
// calculate register word 2
LATTE_SQ_TEX_RESOURCE_WORD4_N newRegWord4;
LATTE_SQ_TEX_RESOURCE_WORD4_N::E_FORMAT_COMP formatComp;
if (HAS_FLAG(format, Latte::E_GX2SURFFMT::FMT_BIT_SIGNED))
formatComp = LATTE_SQ_TEX_RESOURCE_WORD4_N::E_FORMAT_COMP::COMP_SIGNED;
else
formatComp = LATTE_SQ_TEX_RESOURCE_WORD4_N::E_FORMAT_COMP::COMP_UNSIGNED;
newRegWord4.set_FORMAT_COMP_X(formatComp);
newRegWord4.set_FORMAT_COMP_Y(formatComp);
newRegWord4.set_FORMAT_COMP_Z(formatComp);
newRegWord4.set_FORMAT_COMP_W(formatComp);
if (HAS_FLAG(format, Latte::E_GX2SURFFMT::FMT_BIT_FLOAT))
newRegWord4.set_NUM_FORM_ALL(LATTE_SQ_TEX_RESOURCE_WORD4_N::E_NUM_FORMAT_ALL::NUM_FORMAT_SCALED);
else if (HAS_FLAG(format, Latte::E_GX2SURFFMT::FMT_BIT_INT))
newRegWord4.set_NUM_FORM_ALL(LATTE_SQ_TEX_RESOURCE_WORD4_N::E_NUM_FORMAT_ALL::NUM_FORMAT_INT);
else
newRegWord4.set_NUM_FORM_ALL(LATTE_SQ_TEX_RESOURCE_WORD4_N::E_NUM_FORMAT_ALL::NUM_FORMAT_NORM);
if (HAS_FLAG(format, Latte::E_GX2SURFFMT::FMT_BIT_SRGB))
newRegWord4.set_FORCE_DEGAMMA(true);
newRegWord4.set_ENDIAN_SWAP(GX2::GetSurfaceFormatSwapMode((Latte::E_GX2SURFFMT)format));
newRegWord4.set_REQUEST_SIZE(2);
newRegWord4.set_DST_SEL_X((Latte::LATTE_SQ_TEX_RESOURCE_WORD4_N::E_SEL)((compSel >> 24) & 0x7));
newRegWord4.set_DST_SEL_Y((Latte::LATTE_SQ_TEX_RESOURCE_WORD4_N::E_SEL)((compSel >> 16) & 0x7));
newRegWord4.set_DST_SEL_Z((Latte::LATTE_SQ_TEX_RESOURCE_WORD4_N::E_SEL)((compSel >> 8) & 0x7));
newRegWord4.set_DST_SEL_W((Latte::LATTE_SQ_TEX_RESOURCE_WORD4_N::E_SEL)((compSel >> 0) & 0x7));
newRegWord4.set_BASE_LEVEL(viewFirstMip);
texture->regTexWord4 = newRegWord4;
// calculate register word 3
LATTE_SQ_TEX_RESOURCE_WORD5_N newRegWord5;
newRegWord5.set_LAST_LEVEL(viewFirstMip + viewNumMips - 1);
newRegWord5.set_BASE_ARRAY(viewFirstSlice);
newRegWord5.set_LAST_ARRAY(viewFirstSlice + viewNumSlices - 1);
if (dim == Latte::E_DIM::DIM_CUBEMAP && ((depth / 6) - 1) != 0)
newRegWord5.set_UKN_BIT_30(true);
if(surfaceAA >= 1 && surfaceAA <= 3)
newRegWord5.set_LAST_LEVEL(surfaceAA);
texture->regTexWord5 = newRegWord5;
// calculate register word 4 (stored in regTexWord6, i.e. hardware SQ_TEX_RESOURCE_WORD6)
LATTE_SQ_TEX_RESOURCE_WORD6_N newRegWord6;
newRegWord6.set_MAX_ANISO(4);
newRegWord6.set_PERF_MODULATION(7);
newRegWord6.set_TYPE(Latte::LATTE_SQ_TEX_RESOURCE_WORD6_N::E_TYPE::VTX_VALID_TEXTURE);
texture->regTexWord6 = newRegWord6;
}
// Submits the IT_SET_RESOURCE packet that binds a texture to one texture unit.
// tex:              texture providing the precomputed resource words 0, 1, 4, 5 and 6
// baseRegister:     SQ_TEX_RESOURCE_WORD0 register of the target shader stage
// textureUnitIndex: unit within that stage (each unit spans 7 resource registers)
// Words 2 and 3 (image / mip address) are derived here from the surface pointers and swizzle.
void _GX2SetTexture(GX2Texture* tex, Latte::REGADDR baseRegister, uint32 textureUnitIndex)
{
	GX2ReserveCmdSpace(2 + 7);

	const uint32 surfSwizzle = tex->surface.swizzle;
	cemu_assert_debug((surfSwizzle & 0xFF) == 0); // does the low byte in swizzle field have any meaning?

	MPTR baseAddr = tex->surface.imagePtr;
	MPTR mipAddr = tex->surface.mipPtr;
	// textures without a mip pointer reuse the image pointer for the mip address word
	if (mipAddr == MPTR_NULL)
		mipAddr = baseAddr;

	if (Latte::TM_IsMacroTiled(tex->surface.tileMode))
	{
		// bits 16-23 hold the first level that no longer gets the swizzle applied
		const uint32 stopLevel = (surfSwizzle >> 16) & 0xFF;
		const uint32 swizzleBits = surfSwizzle & 0xFFFF;
		if (stopLevel > 0)
			baseAddr ^= swizzleBits; // base level is macro tiled
		if (stopLevel > 1)
			mipAddr ^= swizzleBits; // first mip (level 1) is macro tiled
	}

	gx2WriteGather_submit(pm4HeaderType3(IT_SET_RESOURCE, 8),
		baseRegister + textureUnitIndex * 7 - mmSQ_TEX_RESOURCE_WORD0,
		tex->regTexWord0,
		tex->regTexWord1,
		memory_virtualToPhysical(baseAddr) >> 8,
		memory_virtualToPhysical(mipAddr) >> 8,
		tex->regTexWord4,
		tex->regTexWord5,
		tex->regTexWord6);
}
// Binds tex to texture unit texUnit of the pixel shader stage.
void GX2SetPixelTexture(GX2Texture* tex, uint32 texUnit)
{
	const auto stageBaseReg = Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_PS;
	cemu_assert_debug(texUnit < Latte::GPU_LIMITS::NUM_TEXTURES_PER_STAGE);
	_GX2SetTexture(tex, stageBaseReg, texUnit);
}
// Binds tex to texture unit texUnit of the vertex shader stage.
void GX2SetVertexTexture(GX2Texture* tex, uint32 texUnit)
{
	const auto stageBaseReg = Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_VS;
	cemu_assert_debug(texUnit < Latte::GPU_LIMITS::NUM_TEXTURES_PER_STAGE);
	_GX2SetTexture(tex, stageBaseReg, texUnit);
}
// Binds tex to texture unit texUnit of the geometry shader stage.
void GX2SetGeometryTexture(GX2Texture* tex, uint32 texUnit)
{
	const auto stageBaseReg = Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_GS;
	cemu_assert_debug(texUnit < Latte::GPU_LIMITS::NUM_TEXTURES_PER_STAGE);
	_GX2SetTexture(tex, stageBaseReg, texUnit);
}
// Binds tex for compute use; this simply forwards to the vertex shader stage binding.
void GX2SetComputeTexture(GX2Texture* tex, uint32 texUnit)
{
	return GX2SetVertexTexture(tex, texUnit);
}
/****** Sampler functions ******/
// Initializes all three sampler words from scratch.
// clampXYZ:     clamp mode applied to the X, Y and Z axes alike
// filterMinMag: filter applied to both magnification and minification
// Z and mip filters start as POINT, array override is enabled, max LOD is 0x3FF and the sampler type is UKN1.
void GX2InitSampler(GX2Sampler* sampler, LATTE_SQ_TEX_SAMPLER_WORD0_0::E_CLAMP clampXYZ, LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER filterMinMag)
{
	using SamplerWord0 = LATTE_SQ_TEX_SAMPLER_WORD0_0;

	SamplerWord0 w0{};
	w0.set_CLAMP_X(clampXYZ);
	w0.set_CLAMP_Y(clampXYZ);
	w0.set_CLAMP_Z(clampXYZ);
	w0.set_XY_MAG_FILTER(filterMinMag);
	w0.set_XY_MIN_FILTER(filterMinMag);
	w0.set_Z_FILTER(SamplerWord0::E_Z_FILTER::POINT);
	w0.set_MIP_FILTER(SamplerWord0::E_Z_FILTER::POINT);
	w0.set_TEX_ARRAY_OVERRIDE(true);
	sampler->word0 = w0;

	LATTE_SQ_TEX_SAMPLER_WORD1_0 w1{};
	w1.set_MAX_LOD(0x3FF);
	sampler->word1 = w1;

	LATTE_SQ_TEX_SAMPLER_WORD2_0 w2{};
	w2.set_TYPE(LATTE_SQ_TEX_SAMPLER_WORD2_0::E_SAMPLER_TYPE::UKN1);
	sampler->word2 = w2;
}
// Updates the mag/min filters and the anisotropy ratio in sampler word 0.
// With maxAnisoRatio == 0 the filters are stored as given. Otherwise POINT / BILINEAR are
// translated to their ANISO_* counterparts; any other filter value trips a debug assert
// and falls back to POINT.
void GX2InitSamplerXYFilter(GX2Sampler* sampler, LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER magFilter, LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER minFilter, uint32 maxAnisoRatio)
{
	using XYFilter = LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER;

	LATTE_SQ_TEX_SAMPLER_WORD0_0 w0 = sampler->word0;
	if (maxAnisoRatio != 0)
	{
		const auto toAnisoFilter = [](XYFilter plainFilter) -> XYFilter
		{
			switch (plainFilter)
			{
			case XYFilter::POINT:
				return XYFilter::ANISO_POINT;
			case XYFilter::BILINEAR:
				return XYFilter::ANISO_BILINEAR;
			default:
				break;
			}
			cemu_assert_debug(false);
			return XYFilter::POINT;
		};
		w0.set_XY_MAG_FILTER(toAnisoFilter(magFilter));
		w0.set_XY_MIN_FILTER(toAnisoFilter(minFilter));
		w0.set_MAX_ANISO_RATIO(maxAnisoRatio);
	}
	else
	{
		w0.set_XY_MAG_FILTER(magFilter);
		w0.set_XY_MIN_FILTER(minFilter);
		w0.set_MAX_ANISO_RATIO(0);
	}
	sampler->word0 = w0;
}
// Replaces the Z filter and the mip filter in sampler word 0; all other fields are kept.
void GX2InitSamplerZMFilter(GX2Sampler* sampler, LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER zFilter, LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER mipFilter)
{
	LATTE_SQ_TEX_SAMPLER_WORD0_0 updatedWord0 = sampler->word0;
	updatedWord0.set_Z_FILTER(zFilter);
	updatedWord0.set_MIP_FILTER(mipFilter);
	sampler->word0 = updatedWord0;
}
// Stores min LOD, max LOD and LOD bias in sampler word 1 as fixed point values with 6 fractional bits.
// minLod / maxLod: clamped to [0.0, 16.0], then encoded as 0..1023
// lodBias:         clamped to [-32.0, 32.0], then encoded as -2048..2047
// Known special case: Mario & Sonic Rio passes minimum and maximum float values for minLod/maxLod.
// All clamping is done on the float values before the integer conversion, because converting an
// out-of-range (or NaN) float to an integer is undefined behaviour and yields platform dependent
// results (previously a negative maxLod or a huge minLod/lodBias went through such a cast).
void GX2InitSamplerLOD(GX2Sampler* sampler, float minLod, float maxLod, float lodBias)
{
	// NaN-safe clamp: NaN maps to nanValue, everything else is limited to [lo, hi]
	const auto clampFloat = [](float v, float lo, float hi, float nanValue) -> float
	{
		if (v != v)
			return nanValue;
		if (v < lo)
			return lo;
		return (v > hi) ? hi : v;
	};
	minLod = clampFloat(minLod, 0.0f, 16.0f, 0.0f);
	maxLod = clampFloat(maxLod, 0.0f, 16.0f, 16.0f);
	lodBias = clampFloat(lodBias, -32.0f, 32.0f, 0.0f); // input range: -32.0 to 32.0

	uint32 iMinLod = (uint32)floorf(minLod * 64.0f);
	uint32 iMaxLod = (uint32)floorf(maxLod * 64.0f);
	sint32 iLodBias = (sint32)floorf(lodBias * 64.0f);
	// 16.0 and 32.0 encode to one past the largest representable field value
	iMinLod = std::clamp(iMinLod, 0u, 1023u);
	iMaxLod = std::clamp(iMaxLod, 0u, 1023u);
	iLodBias = std::clamp(iLodBias, -2048, 2047);

	LATTE_SQ_TEX_SAMPLER_WORD1_0 word1 = sampler->word1;
	word1.set_MIN_LOD(iMinLod);
	word1.set_MAX_LOD(iMaxLod);
	word1.set_LOD_BIAS(iLodBias);
	sampler->word1 = word1;
}
// Sets the per-axis clamp modes in sampler word 0; all other fields are kept.
void GX2InitSamplerClamping(GX2Sampler* sampler, LATTE_SQ_TEX_SAMPLER_WORD0_0::E_CLAMP clampX, LATTE_SQ_TEX_SAMPLER_WORD0_0::E_CLAMP clampY, LATTE_SQ_TEX_SAMPLER_WORD0_0::E_CLAMP clampZ)
{
	LATTE_SQ_TEX_SAMPLER_WORD0_0 updatedWord0 = sampler->word0;
	updatedWord0.set_CLAMP_X(clampX).set_CLAMP_Y(clampY).set_CLAMP_Z(clampZ);
	sampler->word0 = updatedWord0;
}
// Sets the border color type in sampler word 0; all other fields are kept.
void GX2InitSamplerBorderType(GX2Sampler* sampler, LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE borderColorType)
{
	LATTE_SQ_TEX_SAMPLER_WORD0_0 updatedWord0 = sampler->word0;
	updatedWord0.set_BORDER_COLOR_TYPE(borderColorType);
	sampler->word0 = updatedWord0;
}
// Sets the depth compare function in sampler word 0; all other fields are kept.
void GX2InitSamplerDepthCompare(GX2Sampler* sampler, LATTE_SQ_TEX_SAMPLER_WORD0_0::E_DEPTH_COMPARE depthCompareFunction)
{
	LATTE_SQ_TEX_SAMPLER_WORD0_0 updatedWord0 = sampler->word0;
	updatedWord0.set_DEPTH_COMPARE_FUNCTION(depthCompareFunction);
	sampler->word0 = updatedWord0;
}
// Submits the IT_SET_SAMPLER packet for one sampler slot.
// samplerIndex is the global slot index (stage base already added); each slot spans 3 words.
void _GX2SetSampler(GX2Sampler* sampler, uint32 samplerIndex)
{
	GX2ReserveCmdSpace(5);
	const uint32 samplerWordOffset = samplerIndex * 3;
	gx2WriteGather_submit(pm4HeaderType3(IT_SET_SAMPLER, 1 + 3),
		samplerWordOffset,
		sampler->word0,
		sampler->word1,
		sampler->word2);
}
// Sets a pixel shader sampler; samplerIndex is relative to the pixel stage.
void GX2SetPixelSampler(GX2Sampler* sampler, uint32 samplerIndex)
{
	const uint32 globalSamplerIndex = samplerIndex + SAMPLER_BASE_INDEX_PIXEL;
	_GX2SetSampler(sampler, globalSamplerIndex);
}
// Sets a vertex shader sampler; vertexSamplerIndex is relative to the vertex stage.
void GX2SetVertexSampler(GX2Sampler* sampler, uint32 vertexSamplerIndex)
{
	const uint32 globalSamplerIndex = vertexSamplerIndex + SAMPLER_BASE_INDEX_VERTEX;
	_GX2SetSampler(sampler, globalSamplerIndex);
}
// Sets a geometry shader sampler; geometrySamplerIndex is relative to the geometry stage.
void GX2SetGeometrySampler(GX2Sampler* sampler, uint32 geometrySamplerIndex)
{
	const uint32 globalSamplerIndex = geometrySamplerIndex + SAMPLER_BASE_INDEX_GEOMETRY;
	_GX2SetSampler(sampler, globalSamplerIndex);
}
// Sets a compute sampler. Compute uses the vertex shader stage, so the vertex base index applies.
void GX2SetComputeSampler(GX2Sampler* sampler, uint32 computeSamplerIndex)
{
	const uint32 globalSamplerIndex = computeSamplerIndex + SAMPLER_BASE_INDEX_VERTEX; // uses vertex shader stage
	_GX2SetSampler(sampler, globalSamplerIndex);
}
// Submits an IT_SET_CONFIG_REG packet writing the four border color floats of one sampler.
// registerBaseOffset: border color RED register of sampler 0 for the target stage
// samplerIndex:       sampler within the stage (4 registers per sampler: red, green, blue, alpha)
void GX2SetSamplerBorderColor(uint32 registerBaseOffset, uint32 samplerIndex, float red, float green, float blue, float alpha)
{
	GX2ReserveCmdSpace(6);
	const auto configRegOffset = registerBaseOffset + samplerIndex * 4 - LATTE_REG_BASE_CONFIG;
	gx2WriteGather_submit(
		pm4HeaderType3(IT_SET_CONFIG_REG, 1 + 4),
		configRegOffset,
		red, green, blue, alpha);
}
// Sets the border color of a pixel shader sampler.
void GX2SetPixelSamplerBorderColor(uint32 pixelSamplerIndex, float red, float green, float blue, float alpha)
{
	const uint32 stageBorderBase = REGADDR::TD_PS_SAMPLER0_BORDER_RED;
	GX2SetSamplerBorderColor(stageBorderBase, pixelSamplerIndex, red, green, blue, alpha);
}
// Sets the border color of a vertex shader sampler.
void GX2SetVertexSamplerBorderColor(uint32 vertexSamplerIndex, float red, float green, float blue, float alpha)
{
	const uint32 stageBorderBase = REGADDR::TD_VS_SAMPLER0_BORDER_RED;
	GX2SetSamplerBorderColor(stageBorderBase, vertexSamplerIndex, red, green, blue, alpha);
}
// Sets the border color of a geometry shader sampler.
void GX2SetGeometrySamplerBorderColor(uint32 geometrySamplerIndex, float red, float green, float blue, float alpha)
{
	const uint32 stageBorderBase = REGADDR::TD_GS_SAMPLER0_BORDER_RED;
	GX2SetSamplerBorderColor(stageBorderBase, geometrySamplerIndex, red, green, blue, alpha);
}
// Registers the texture and sampler functions with cafeExportRegister under the
// library name "gx2", each tagged with LogType::GX2.
void GX2TextureInit()
{
// texture
cafeExportRegister("gx2", GX2InitTextureRegs, LogType::GX2);
cafeExportRegister("gx2", GX2SetPixelTexture, LogType::GX2);
cafeExportRegister("gx2", GX2SetVertexTexture, LogType::GX2);
cafeExportRegister("gx2", GX2SetGeometryTexture, LogType::GX2);
cafeExportRegister("gx2", GX2SetComputeTexture, LogType::GX2);
// sampler
cafeExportRegister("gx2", GX2InitSampler, LogType::GX2);
cafeExportRegister("gx2", GX2InitSamplerXYFilter, LogType::GX2);
cafeExportRegister("gx2", GX2InitSamplerZMFilter, LogType::GX2);
cafeExportRegister("gx2", GX2InitSamplerLOD, LogType::GX2);
cafeExportRegister("gx2", GX2InitSamplerClamping, LogType::GX2);
cafeExportRegister("gx2", GX2InitSamplerBorderType, LogType::GX2);
cafeExportRegister("gx2", GX2InitSamplerDepthCompare, LogType::GX2);
// sampler binding (per shader stage)
cafeExportRegister("gx2", GX2SetPixelSampler, LogType::GX2);
cafeExportRegister("gx2", GX2SetVertexSampler, LogType::GX2);
cafeExportRegister("gx2", GX2SetGeometrySampler, LogType::GX2);
cafeExportRegister("gx2", GX2SetComputeSampler, LogType::GX2);
// sampler border color (per shader stage)
cafeExportRegister("gx2", GX2SetPixelSamplerBorderColor, LogType::GX2);
cafeExportRegister("gx2", GX2SetVertexSamplerBorderColor, LogType::GX2);
cafeExportRegister("gx2", GX2SetGeometrySamplerBorderColor, LogType::GX2);
}
};

View file

@ -0,0 +1,37 @@
#pragma once
#include "Cafe/HW/Latte/ISA/LatteReg.h"
#include "GX2_Surface.h"
// GX2 texture and sampler structures. Members use the big-endian wrapper types
// (uint32be / betype<>); the static_asserts pin the total struct sizes.
namespace GX2
{
// Texture descriptor: the underlying surface, the view (mip / slice range, component
// selection) and the precomputed SQ_TEX_RESOURCE register words.
struct GX2Texture
{
/* +0x00 */ GX2Surface surface;
// view parameters: first mip / mip count and first slice / slice count
/* +0x74 */ uint32be viewFirstMip;
/* +0x78 */ uint32be viewNumMips;
/* +0x7C */ uint32be viewFirstSlice;
/* +0x80 */ uint32be viewNumSlices;
// component selection, one selector per byte
/* +0x84 */ uint32be compSel;
/* +0x88 */ betype<Latte::LATTE_SQ_TEX_RESOURCE_WORD0_N> regTexWord0;
/* +0x8C */ betype<Latte::LATTE_SQ_TEX_RESOURCE_WORD1_N> regTexWord1;
// word2 and word3 are the base/mip address and are not stored
/* +0x90 */ betype<Latte::LATTE_SQ_TEX_RESOURCE_WORD4_N> regTexWord4;
/* +0x94 */ betype<Latte::LATTE_SQ_TEX_RESOURCE_WORD5_N> regTexWord5;
/* +0x98 */ betype<Latte::LATTE_SQ_TEX_RESOURCE_WORD6_N> regTexWord6;
};
static_assert(sizeof(GX2Texture) == 0x9C);
// Sampler state: the three SQ_TEX_SAMPLER register words, 12 bytes in total.
struct GX2Sampler
{
betype<Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0> word0;
betype<Latte::LATTE_SQ_TEX_SAMPLER_WORD1_0> word1;
betype<Latte::LATTE_SQ_TEX_SAMPLER_WORD2_0> word2;
};
static_assert(sizeof(GX2Sampler) == 12);
// Fills in the regTexWord* members of a texture.
// NOTE(review): inputs presumably are the surface and view fields - confirm against the definition.
void GX2InitTextureRegs(GX2Texture* texture);
// Registers the texture and sampler exports of the gx2 library.
void GX2TextureInit();
};

View file

@ -0,0 +1,460 @@
#include "Cafe/OS/common/OSCommon.h"
#include "GX2.h"
#include "Cafe/HW/Latte/LatteAddrLib/LatteAddrLib.h"
#include "Cafe/HW/Latte/Core/LatteTextureLoader.h"
// Maximum number of tiling apertures that can be allocated at the same time
#define GX2_MAX_ACTIVE_TILING_APERATURES (32)
// Bookkeeping for one allocated tiling aperture
struct ActiveTilingAperature
{
// virtual address of the aperture window (inside the MEMORY_TILINGAPERTURE_AREA_ADDR region)
uint32 addr;
// size of the aperture window in bytes
uint32 size;
// handle returned by the allocation call and later used to free the aperture
uint32 handle;
// endian mode argument passed at allocation; stored but not read in this file
uint32 endianMode;
// surface info
// copy of the surface taken at allocation time
GX2Surface surface;
uint32 sliceIndex;
uint32 mipLevel;
};
// Table of live apertures; only the first activeTilingAperatureCount entries are valid
ActiveTilingAperature activeTilingAperature[GX2_MAX_ACTIVE_TILING_APERATURES];
sint32 activeTilingAperatureCount = 0;
// Finds a free, 4KiB-aligned range of `size` bytes inside the tiling aperture area.
// Uses a first-fit scan over the active apertures: whenever the candidate range collides with
// an existing aperture, the scan continues behind the lowest colliding aperture end.
// Returns the virtual address of the range, or MPTR_NULL if no range is available.
MPTR GX2TilingAperature_allocateTilingMemory(uint32 size)
{
	const uint32 areaSize = MEMORY_TILINGAPERTURE_AREA_SIZE;
	// zero-sized requests still occupy one byte so they never alias an existing aperture
	const uint32 occupiedSize = std::max<uint32>(size, 1);
	uint32 currentOffset = 0;
	while (true)
	{
		// align offset
		currentOffset = (currentOffset + 0xFFF) & ~0xFFF;
		// check if out of range (written so that currentOffset+size cannot wrap around)
		if (size >= areaSize || currentOffset >= areaSize - size)
			break;
		// check if range intersects with any already allocated range
		bool isAvailable = true;
		uint32 nextOffset = 0xFFFFFFFF;
		const uint32 startA = currentOffset;
		const uint32 endA = startA + occupiedSize;
		for (sint32 i = 0; i < activeTilingAperatureCount; i++)
		{
			const uint32 startB = activeTilingAperature[i].addr - MEMORY_TILINGAPERTURE_AREA_ADDR;
			const uint32 endB = startB + activeTilingAperature[i].size;
			// half-open ranges [startA,endA) and [startB,endB): ranges that merely touch do not overlap
			if (startA < endB && startB < endA)
			{
				isAvailable = false;
				nextOffset = std::min(nextOffset, endB);
			}
		}
		if (isAvailable)
			return currentOffset + MEMORY_TILINGAPERTURE_AREA_ADDR;
		currentOffset = nextOffset;
	}
	return MPTR_NULL;
}
// Source of aperture handles; starts at 1 and only ever grows
std::atomic<uint32> sGenAperatureHandle{1};
// Returns the next unused aperture handle (atomic post-increment of the counter).
uint32 GX2TilingAperature_GenerateHandle()
{
	const uint32 newHandle = sGenAperatureHandle++;
	return newHandle;
}
// Copies up to four consecutive copyType elements between two blocks.
// isWrite == true:  inputBlockData -> outputBlockData
// isWrite == false: outputBlockData -> inputBlockData
// The first element is always copied; elements 1..3 only if count reaches them.
template<typename copyType, int count, bool isWrite>
void copyValue(uint8* outputBlockData, uint8* inputBlockData)
{
	copyType* dst = (copyType*)(isWrite ? outputBlockData : inputBlockData);
	copyType* src = (copyType*)(isWrite ? inputBlockData : outputBlockData);
	dst[0] = src[0];
	// at most four elements are supported
	constexpr int elementLimit = (count < 4) ? count : 4;
	for (int i = 1; i < elementLimit; i++)
		dst[i] = src[i];
}
// Copies a texelWidth x texelHeight region texel by texel between a linearly addressed buffer
// (inputData, row stride = surfacePitch texels) and a buffer addressed through the surface
// address functions (outputData).
// bpp:             bits per texel; 8, 16, 32, 64 and 128 are handled
// isWrite:         true copies inputData -> outputData, false copies outputData -> inputData
// surfaceTileMode: compile-time tile mode, selects the address function for outputData
// tilingAperture, surfaceSlice, surfaceHeight and surfaceDepth are not used by the body;
// slice, pitch, height and depth for the address calculation are taken from cachedInfo.
template<int bpp, bool isWrite, int surfaceTileMode>
void retileTexture(ActiveTilingAperature* tilingAperture, uint8* inputData, uint8* outputData, sint32 texelWidth, sint32 texelHeight, sint32 surfaceSlice, sint32 surfacePitch, sint32 surfaceHeight, sint32 surfaceDepth, LatteAddrLib::CachedSurfaceAddrInfo* cachedInfo)
{
for (sint32 y = 0; y < texelHeight; y++)
{
uint32 srcOffset;
uint8* inputBlockData;
// for everything except 8bpp the linear side is walked sequentially, starting at the row begin
if (bpp != 8)
{
srcOffset = (0 + y*surfacePitch)*(bpp / 8);
inputBlockData = inputData + srcOffset;
}
for (sint32 x = 0; x < texelWidth; x++)
{
// calculate address of input block
sint32 texelX = x;
sint32 texelY = y;
// 8bpp recomputes the linear address per texel with bit 3 of x and bit 1 of y flipped
// NOTE(review): the reason for this coordinate swizzle is not evident from this file
if (bpp == 8)
{
texelX ^= 8;
texelY ^= 2;
srcOffset = (texelX + texelY*surfacePitch)*(bpp / 8);
inputBlockData = inputData + srcOffset;
}
// calculate address of output block
uint32 dstBitPos = 0;
uint32 dstOffset = 0;
// tile mode 4 uses the specialized cached macro tiled path, 2/3 the micro tiled path,
// 0/1 the linear path and everything else the generic cached macro tiled path
if (surfaceTileMode == 4)
dstOffset = ComputeSurfaceAddrFromCoordMacroTiledCached_tm04_sample1(x, y, cachedInfo);
else if (surfaceTileMode == 2 || surfaceTileMode == 3)
{
dstOffset = LatteAddrLib::ComputeSurfaceAddrFromCoordMicroTiled(x, y, cachedInfo->slice, cachedInfo->bpp, cachedInfo->pitch, cachedInfo->height, (Latte::E_HWTILEMODE)cachedInfo->tileMode, false);
}
else if (surfaceTileMode == 1 || surfaceTileMode == 0)
{
dstOffset = LatteAddrLib::ComputeSurfaceAddrFromCoordLinear(x, y, cachedInfo->slice, 0, cachedInfo->bpp, cachedInfo->pitch, cachedInfo->height, cachedInfo->depth);
}
else
dstOffset = LatteAddrLib::ComputeSurfaceAddrFromCoordMacroTiledCached(x, y, cachedInfo);
uint8* outputBlockData = outputData + dstOffset;
// copy a single texel; 128bpp is copied as two 64-bit values
if (bpp == 32)
copyValue<uint32, 1, isWrite>(outputBlockData, inputBlockData);
else if (bpp == 16)
copyValue<uint16, 1, isWrite>(outputBlockData, inputBlockData);
else if (bpp == 8)
copyValue<uint8, 1, isWrite>(outputBlockData, inputBlockData);
else if (bpp == 64)
copyValue<uint64, 1, isWrite>(outputBlockData, inputBlockData);
else if (bpp == 128)
copyValue<uint64, 2, isWrite>(outputBlockData, inputBlockData);
else
{
cemu_assert_unimplemented();
}
// advance the sequential linear pointer to the next texel of the row
if (bpp != 8)
{
inputBlockData += (bpp / 8);
}
}
}
}
// Variant of retileTexture for tile mode 4 that processes rows in blocks of 8 texels: the
// macro tile address is computed once per block and the position inside the micro tile is
// looked up in cachedInfo->microTilePixelIndexTable. Texels behind the last full block of a
// row are copied one by one with a full address calculation.
// bpp:     bits per texel; 8, 16, 32, 64 and 128 are handled
// isWrite: true copies inputData -> outputData, false copies outputData -> inputData
// tilingAperture, surfaceSlice, surfaceHeight and surfaceDepth are not used by the body.
//
// Fix: the block loop only runs over full 8-texel blocks now. It used to also run for the
// trailing partial block, which copied texels beyond texelWidth and left the sequential input
// pointer past the end of the row, so the partial-block loop then copied the wrong texels.
template<int bpp, bool isWrite>
void retileTexture_tm04_sample1(ActiveTilingAperature* tilingAperture, uint8* inputData, uint8* outputData, sint32 texelWidth, sint32 texelHeight, sint32 surfaceSlice, sint32 surfacePitch, sint32 surfaceHeight, sint32 surfaceDepth, LatteAddrLib::CachedSurfaceAddrInfo* cachedInfo)
{
	// copies one texel in the direction selected by isWrite; 128bpp is copied as two 64-bit values
	const auto copyTexel = [](uint8* outputBlockData, uint8* inputBlockData)
	{
		if (bpp == 32)
			copyValue<uint32, 1, isWrite>(outputBlockData, inputBlockData);
		else if (bpp == 16)
			copyValue<uint16, 1, isWrite>(outputBlockData, inputBlockData);
		else if (bpp == 8)
			copyValue<uint8, 1, isWrite>(outputBlockData, inputBlockData);
		else if (bpp == 64)
			copyValue<uint64, 1, isWrite>(outputBlockData, inputBlockData);
		else if (bpp == 128)
			copyValue<uint64, 2, isWrite>(outputBlockData, inputBlockData);
		else
		{
			cemu_assert_unimplemented();
		}
	};

	uint16* tableBase = cachedInfo->microTilePixelIndexTable + ((cachedInfo->slice & 7) << 6);
	// width covered by complete 8-texel blocks
	const sint32 fullBlockWidth = texelWidth & ~7;
	for (sint32 y = 0; y < texelHeight; y++)
	{
		uint32 srcOffset;
		uint8* inputBlockData = nullptr;
		// for everything except 8bpp the linear side is walked sequentially, starting at the row begin
		if (bpp != 8)
		{
			srcOffset = (0 + y*surfacePitch)*(bpp / 8);
			inputBlockData = inputData + srcOffset;
		}
		// copy all full blocks of the row
		for (sint32 bx = 0; bx < fullBlockWidth; bx += 8)
		{
			uint16* pixelOffsets = tableBase + ((y&7) << 3);
			uint32 baseOffset = ComputeSurfaceAddrFromCoordMacroTiledCached_tm04_sample1(bx, y, cachedInfo);
			for (sint32 x = bx; x < bx+8; x++)
			{
				// calculate address of input block
				if (bpp == 8)
				{
					// 8bpp recomputes the linear address per texel with bit 3 of x and bit 1 of y flipped
					sint32 texelX = x;
					sint32 texelY = y;
					texelX ^= 8;
					texelY ^= 2;
					srcOffset = (texelX + texelY*surfacePitch)*(bpp / 8);
					inputBlockData = inputData + srcOffset;
				}
				// calculate address of output block
				uint32 pixelIndex = *pixelOffsets;
				pixelOffsets++;
				uint32 elemOffset = pixelIndex * (bpp/8);
				if ((bpp * 8) > 256)
				{
					// separate group bytes, for small formats this step is not necessary since elemOffset is never over 0xFF (maximum is 8*8*bpp)
					elemOffset = (elemOffset & 0xFF) | ((elemOffset&~0xFF) << 3);
				}
				sint32 offset = baseOffset + elemOffset;
				copyTexel(outputData + offset, inputBlockData);
				if (bpp != 8)
				{
					inputBlockData += (bpp / 8);
				}
			}
		}
		// copy remaining partial block
		for (sint32 x = fullBlockWidth; x < texelWidth; x++)
		{
			// calculate address of input block
			if (bpp == 8)
			{
				sint32 texelX = x;
				sint32 texelY = y;
				texelX ^= 8;
				texelY ^= 2;
				srcOffset = (texelX + texelY*surfacePitch)*(bpp / 8);
				inputBlockData = inputData + srcOffset;
			}
			// calculate address of output block
			uint32 dstOffset = ComputeSurfaceAddrFromCoordMacroTiledCached_tm04_sample1(x, y, cachedInfo);
			copyTexel(outputData + dstOffset, inputBlockData);
			if (bpp != 8)
			{
				inputBlockData += (bpp / 8);
			}
		}
	}
}
// Maps the runtime tile mode onto the matching compile-time retileTexture instantiation.
// Tile modes 0-4 and 7 are supported; any other mode hits cemu_assert_unimplemented().
template<int bpp, bool isWrite>
void retileTextureWrapper(ActiveTilingAperature* tilingAperture, uint8* inputData, uint8* outputData, sint32 texelWidth, sint32 texelHeight, sint32 surfaceSlice, sint32 surfaceTileMode, sint32 surfacePitch, sint32 surfaceHeight, sint32 surfaceDepth, LatteAddrLib::CachedSurfaceAddrInfo* cachedInfo)
{
	switch (surfaceTileMode)
	{
	case 0:
		retileTexture<bpp, isWrite, 0>(tilingAperture, inputData, outputData, texelWidth, texelHeight, surfaceSlice, surfacePitch, surfaceHeight, surfaceDepth, cachedInfo);
		break;
	case 1:
		retileTexture<bpp, isWrite, 1>(tilingAperture, inputData, outputData, texelWidth, texelHeight, surfaceSlice, surfacePitch, surfaceHeight, surfaceDepth, cachedInfo);
		break;
	case 2:
		retileTexture<bpp, isWrite, 2>(tilingAperture, inputData, outputData, texelWidth, texelHeight, surfaceSlice, surfacePitch, surfaceHeight, surfaceDepth, cachedInfo);
		break;
	case 3:
		retileTexture<bpp, isWrite, 3>(tilingAperture, inputData, outputData, texelWidth, texelHeight, surfaceSlice, surfacePitch, surfaceHeight, surfaceDepth, cachedInfo);
		break;
	case 4:
		retileTexture<bpp, isWrite, 4>(tilingAperture, inputData, outputData, texelWidth, texelHeight, surfaceSlice, surfacePitch, surfaceHeight, surfaceDepth, cachedInfo);
		break;
	case 7:
		retileTexture<bpp, isWrite, 7>(tilingAperture, inputData, outputData, texelWidth, texelHeight, surfaceSlice, surfacePitch, surfaceHeight, surfaceDepth, cachedInfo);
		break;
	default:
		cemu_assert_unimplemented();
		break;
	}
}
void LatteTextureLoader_begin(LatteTextureLoaderCtx* textureLoader, uint32 sliceIndex, uint32 mipIndex, MPTR physImagePtr, MPTR physMipPtr, Latte::E_GX2SURFFMT format, Latte::E_DIM dim, uint32 width, uint32 height, uint32 depth, uint32 mipLevels, uint32 pitch, Latte::E_HWTILEMODE tileMode, uint32 swizzle);

// Runs one retile pass in the direction given by isWrite.
// useOptimizedPath selects optimizedDecodeLoops (32bpp texels via the texture loader context);
// otherwise the pass is dispatched to retileTextureWrapper by bits per pixel.
template<bool isWrite>
static void retileApertureDispatch(ActiveTilingAperature* tilingAperture, LatteTextureLoaderCtx* loaderCtx, bool useOptimizedPath, uint32 bpp, uint8* inputData, uint8* outputData, sint32 texelWidth, sint32 texelHeight, sint32 surfaceSlice, sint32 surfaceTileMode, sint32 surfacePitch, sint32 surfaceHeight, sint32 surfaceDepth, LatteAddrLib::CachedSurfaceAddrInfo* addrInfo)
{
	if (useOptimizedPath)
	{
		optimizedDecodeLoops<uint32, 1, isWrite, false>(loaderCtx, inputData);
		return;
	}
	switch (bpp)
	{
	case 8:
		retileTextureWrapper<8, isWrite>(tilingAperture, inputData, outputData, texelWidth, texelHeight, surfaceSlice, surfaceTileMode, surfacePitch, surfaceHeight, surfaceDepth, addrInfo);
		break;
	case 16:
		retileTextureWrapper<16, isWrite>(tilingAperture, inputData, outputData, texelWidth, texelHeight, surfaceSlice, surfaceTileMode, surfacePitch, surfaceHeight, surfaceDepth, addrInfo);
		break;
	case 32:
		retileTextureWrapper<32, isWrite>(tilingAperture, inputData, outputData, texelWidth, texelHeight, surfaceSlice, surfaceTileMode, surfacePitch, surfaceHeight, surfaceDepth, addrInfo);
		break;
	case 64:
		retileTextureWrapper<64, isWrite>(tilingAperture, inputData, outputData, texelWidth, texelHeight, surfaceSlice, surfaceTileMode, surfacePitch, surfaceHeight, surfaceDepth, addrInfo);
		break;
	case 128:
		retileTextureWrapper<128, isWrite>(tilingAperture, inputData, outputData, texelWidth, texelHeight, surfaceSlice, surfaceTileMode, surfacePitch, surfaceHeight, surfaceDepth, addrInfo);
		break;
	default:
		cemu_assert_unimplemented();
		break;
	}
}

// Transfers texel data between an aperture window and the surface memory it maps.
// doWrite == true:  aperture window -> surface memory (flush)
// doWrite == false: surface memory -> aperture window (load)
// The aperture window is addressed linearly with the pitch of the selected mip level.
void GX2TilingAperature_RetileTexture(ActiveTilingAperature* tilingAperture, bool doWrite)
{
	//uint64 timerTilingStart = benchmarkTimer_start();
	const uint32 mipLevel = tilingAperture->mipLevel;
	Latte::E_GX2SURFFMT surfaceFormat = tilingAperture->surface.format;
	Latte::E_DIM surfaceDim = tilingAperture->surface.dim;
	uint32 surfaceSlice = tilingAperture->sliceIndex;
	uint32 mipLevels = tilingAperture->surface.numLevels;

	// surface layout of the selected mip level
	LatteAddrLib::AddrSurfaceInfo_OUT surfaceInfo = {0};
	GX2::GX2CalculateSurfaceInfo(&tilingAperture->surface, mipLevel, &surfaceInfo);
	uint32 surfacePitch = surfaceInfo.pitch;
	uint32 bpp = surfaceInfo.bpp;
	Latte::E_HWTILEMODE surfaceTileMode = surfaceInfo.hwTileMode;
	uint32 surfaceDepth = std::max<uint32>(surfaceInfo.depth, 1);

	// swizzle: bit 8 is the pipe swizzle, bits 9-10 the bank swizzle
	uint32 surfaceSwizzle = tilingAperture->surface.swizzle;
	uint32 surfacePipeSwizzle = (surfaceSwizzle>>8)&1;
	uint32 surfaceBankSwizzle = ((surfaceSwizzle>>9)&3);

	// dimensions of the selected mip level, never below 1
	sint32 width = std::max<uint32>((uint32)tilingAperture->surface.width >> mipLevel, 1);
	sint32 height = std::max<uint32>((uint32)tilingAperture->surface.height >> mipLevel, 1);

	// get texture info
	uint8* inputData = (uint8*)memory_getPointerFromVirtualOffset(tilingAperture->addr);
	uint8* outputData;
	if (mipLevel == 0)
		outputData = (uint8*)memory_getPointerFromVirtualOffset(tilingAperture->surface.imagePtr);
	else if (mipLevel == 1)
		outputData = (uint8*)memory_getPointerFromVirtualOffset(tilingAperture->surface.mipPtr);
	else
		outputData = (uint8*)memory_getPointerFromVirtualOffset(tilingAperture->surface.mipPtr + tilingAperture->surface.mipOffset[mipLevel-1]);

	// compressed formats are processed in units of 4x4 blocks
	const bool isCompressed = Latte::IsCompressedFormat(surfaceFormat);
	const sint32 stepX = isCompressed ? 4 : 1;
	const sint32 stepY = isCompressed ? 4 : 1;

	LatteAddrLib::CachedSurfaceAddrInfo computeAddrInfo = { 0 };
	SetupCachedSurfaceAddrInfo(&computeAddrInfo, surfaceSlice, 0, bpp, surfacePitch, surfaceInfo.height, surfaceInfo.depth, 1 * 1, surfaceTileMode, false, surfacePipeSwizzle, surfaceBankSwizzle);
	// init info for swizzle encoder/decoder
	LatteTextureLoaderCtx textureLoaderCtx{};
	LatteTextureLoader_begin(&textureLoaderCtx, surfaceSlice, 0, tilingAperture->surface.imagePtr, tilingAperture->surface.mipPtr, surfaceFormat, surfaceDim, width, height, surfaceDepth, mipLevels, surfacePitch, surfaceTileMode, surfaceSwizzle);
	textureLoaderCtx.decodedTexelCountX = surfacePitch;
	textureLoaderCtx.decodedTexelCountY = isCompressed ? (height + 3) / 4 : height;

	// uncompressed 32bpp surfaces in 2D thin1 tiling take the optimized loop
	const bool useOptimizedPath = (surfaceTileMode == Latte::E_HWTILEMODE::TM_2D_TILED_THIN1 && bpp == 32 && isCompressed == false);
	if (doWrite)
		retileApertureDispatch<true>(tilingAperture, &textureLoaderCtx, useOptimizedPath, bpp, inputData, outputData, width / stepX, height / stepY, surfaceSlice, (uint32)surfaceTileMode, surfacePitch, surfaceInfo.height, surfaceDepth, &computeAddrInfo);
	else
		retileApertureDispatch<false>(tilingAperture, &textureLoaderCtx, useOptimizedPath, bpp, inputData, outputData, width / stepX, height / stepY, surfaceSlice, (uint32)surfaceTileMode, surfacePitch, surfaceInfo.height, surfaceDepth, &computeAddrInfo);
	//double benchmarkTime = benchmarkTimer_stop(timerTilingStart);
	//forceLogDebug_printf("TilingAperture res %04dx%04d fmt %04x tm %02x mip %d isWrite %d", (uint32)tilingAperture->surface.width, (uint32)tilingAperture->surface.height, (uint32)tilingAperture->surface.format, (uint32)tilingAperture->surface.tileMode, tilingAperture->mipLevel, doWrite?1:0);
	//forceLogDebug_printf("Tiling took %.4lfms", benchmarkTime);
}
// HLE of GX2AllocateTilingApertureEx.
// r3: surface pointer, r4: mip level, r5: slice index, r6: endian mode,
// r7: out pointer receiving the aperture handle, r8: out pointer receiving the aperture address.
// On failure (aperture table full or no free aperture memory) handle 0 and MPTR_NULL are written.
// On success the surface contents are loaded into the new aperture window.
void gx2Export_GX2AllocateTilingApertureEx(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2AllocateTilingApertureEx(0x%08x, %d, %d, %d, 0x%08x, 0x%08x)", hCPU->gpr[3], hCPU->gpr[4], hCPU->gpr[5], hCPU->gpr[6], hCPU->gpr[7], hCPU->gpr[8]);
	GX2Surface* surface = (GX2Surface*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
	gx2Log_printf("Tiling Tex: %08X %dx%d Swizzle: %08x tm: %d fmt: %04x use: %02x", (uint32)surface->imagePtr, (uint32)surface->width, (uint32)surface->height, (uint32)surface->swizzle, (uint32)surface->tileMode.value(), (uint32)surface->format.value(), (uint32)surface->resFlag);

	// shared failure exit: report a null aperture to the caller
	const auto returnFailure = [hCPU]()
	{
		memory_writeU32(hCPU->gpr[8], MPTR_NULL);
		memory_writeU32(hCPU->gpr[7], 0);
		osLib_returnFromFunction(hCPU, 0);
	};

	if (activeTilingAperatureCount >= GX2_MAX_ACTIVE_TILING_APERATURES)
	{
		debugBreakpoint();
		returnFailure();
		return;
	}

	const uint32 mipLevel = hCPU->gpr[4];
	const uint32 sliceIndex = hCPU->gpr[5];

	// calculate size of texture
	Latte::E_GX2SURFFMT surfaceFormat = surface->format;
	uint32 bitsPerPixel = Latte::GetFormatBits(surfaceFormat);
	if (Latte::IsCompressedFormat(surfaceFormat))
		bitsPerPixel /= (4*4);
	// get surface pitch
	LatteAddrLib::AddrSurfaceInfo_OUT surfaceInfo = {0};
	GX2::GX2CalculateSurfaceInfo(surface, 0, &surfaceInfo);
	const uint32 surfacePitch = surfaceInfo.pitch;
	// mip height rounded up to a multiple of 4
	const uint32 mipHeight = std::max<uint32>((uint32)surface->height >> mipLevel, 1);
	const uint32 alignedHeight = (mipHeight+3)&~3;
	const uint32 tilingSize = (surfacePitch*alignedHeight*bitsPerPixel+7)/8;

	const uint32 taHandle = GX2TilingAperature_GenerateHandle();
	// allocate memory for tiling space
	const MPTR tilingAddress = GX2TilingAperature_allocateTilingMemory(tilingSize);
	if (tilingAddress == MPTR_NULL)
	{
		cemu_assert_suspicious();
		returnFailure();
		return;
	}

	// add tiling aperture entry
	ActiveTilingAperature& entry = activeTilingAperature[activeTilingAperatureCount];
	entry.addr = tilingAddress;
	entry.size = tilingSize;
	entry.handle = taHandle;
	entry.endianMode = hCPU->gpr[6];
	entry.sliceIndex = sliceIndex;
	entry.mipLevel = mipLevel;
	memcpy(&entry.surface, surface, sizeof(GX2Surface));
	activeTilingAperatureCount++;

	// return values
	memory_writeU32(hCPU->gpr[8], tilingAddress);
	memory_writeU32(hCPU->gpr[7], taHandle);
	// load texture data into tiling area
	GX2TilingAperature_RetileTexture(&entry, false);
	osLib_returnFromFunction(hCPU, 0);
}
// HLE of GX2FreeTilingAperture. r3: aperture handle.
// Flushes the aperture contents back into the surface and removes the table entry by moving
// the last entry into the freed slot. Unknown handles are ignored. Always returns 0.
void gx2Export_GX2FreeTilingAperture(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2FreeTilingAperture(0x%08x)\n", hCPU->gpr[3]);
	const uint32 handle = hCPU->gpr[3];

	// locate the first entry with a matching handle
	sint32 slot = -1;
	for (sint32 i = 0; i < activeTilingAperatureCount; i++)
	{
		if (activeTilingAperature[i].handle == handle)
		{
			slot = i;
			break;
		}
	}

	if (slot >= 0)
	{
		ActiveTilingAperature* entry = activeTilingAperature + slot;
		// flush texture
		GX2TilingAperature_RetileTexture(entry, true);
		// remove entry (the last entry fills the gap unless the freed one is the last)
		const sint32 lastIndex = activeTilingAperatureCount - 1;
		if (slot != lastIndex)
			memcpy(entry, activeTilingAperature + lastIndex, sizeof(ActiveTilingAperature));
		activeTilingAperatureCount--;
	}
	osLib_returnFromFunction(hCPU, 0);
}

View file

@ -0,0 +1,599 @@
#include "Cafe/OS/common/OSCommon.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "Cafe/HW/Latte/Core/Latte.h"
#include "Cafe/HW/Latte/Core/LatteDraw.h"
#include "Cafe/HW/Latte/ISA/LatteReg.h"
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "GX2.h"
#include "GX2_Shader.h"
// HLE of GX2SetFetchShader. r3: pointer to the fetch shader structure.
// Submits two IT_SET_CONTEXT_REG packets: the fetch shader program registers
// (starting at SQ_PGM_START_FS) and the two instance step rate registers.
void gx2Export_GX2SetFetchShader(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2SetFetchShader(0x%08x)\n", hCPU->gpr[3]);
	GX2ReserveCmdSpace(11);
	GX2FetchShader_t* fetchShaderPtr = (GX2FetchShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
	const auto programAddr = _swapEndianU32(fetchShaderPtr->shaderPtr);
	const auto programSize = _swapEndianU32(fetchShaderPtr->shaderSize);
	// the program address is submitted in units of 256 bytes and must be aligned accordingly
	cemu_assert_debug((programAddr & 0xFF) == 0);
	gx2WriteGather_submit(
		// setup fetch shader
		pm4HeaderType3(IT_SET_CONTEXT_REG, 1+5),
		mmSQ_PGM_START_FS-0xA000,
		programAddr>>8, // pointer divided by 256
		programSize>>3, // size divided by 8
		0x10000, // ukn (ring buffer size?)
		0x10000, // ukn (ring buffer size?)
		*(uint32be*)&(fetchShaderPtr->_regs[0]),
		// write instance step
		pm4HeaderType3(IT_SET_CONTEXT_REG, 1+2),
		mmVGT_INSTANCE_STEP_RATE_0-0xA000,
		*(uint32be*)&(fetchShaderPtr->divisors[0]),
		*(uint32be*)&(fetchShaderPtr->divisors[1]));
	osLib_returnFromFunction(hCPU, 0);
}
// HLE of GX2GetVertexShaderGPRs. r3: vertex shader pointer.
// Returns bits 0-7 of the shader's register word 0.
void gx2Export_GX2GetVertexShaderGPRs(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2GetVertexShaderGPRs(0x%08x)\n", hCPU->gpr[3]);
	GX2VertexShader_t* vertexShader = (GX2VertexShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
	const uint32 resourceReg = _swapEndianU32(vertexShader->regs[0]);
	const uint8 gprCount = resourceReg & 0xFF;
	osLib_returnFromFunction(hCPU, gprCount);
}
// HLE of GX2GetVertexShaderStackEntries. r3: vertex shader pointer.
// Returns bits 8-15 of the shader's register word 0.
void gx2Export_GX2GetVertexShaderStackEntries(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2GetVertexShaderStackEntries(0x%08x)\n", hCPU->gpr[3]);
	GX2VertexShader_t* vertexShader = (GX2VertexShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
	const uint32 resourceReg = _swapEndianU32(vertexShader->regs[0]);
	const uint8 stackEntryCount = (resourceReg >> 8) & 0xFF;
	osLib_returnFromFunction(hCPU, stackEntryCount);
}
// HLE of GX2GetPixelShaderGPRs. r3: pixel shader pointer.
// Returns bits 0-7 of the shader's register word 0.
void gx2Export_GX2GetPixelShaderGPRs(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2GetPixelShaderGPRs(0x%08x)\n", hCPU->gpr[3]);
	GX2PixelShader_t* pixelShader = (GX2PixelShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
	const uint32 resourceReg = _swapEndianU32(pixelShader->regs[0]);
	const uint8 gprCount = resourceReg & 0xFF;
	osLib_returnFromFunction(hCPU, gprCount);
}
// HLE of GX2GetPixelShaderStackEntries. r3: pixel shader pointer.
// Returns bits 8-15 of the shader's register word 0.
// Fix: the shift has to be applied after the endian swap (as GX2GetVertexShaderStackEntries
// does). Shifting the raw big-endian word first moved a zero byte into the position that
// ends up in the low 8 bits after the swap, so the function always returned 0.
void gx2Export_GX2GetPixelShaderStackEntries(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2GetPixelShaderStackEntries(0x%08x)\n", hCPU->gpr[3]);
	GX2PixelShader_t* pixelShader = (GX2PixelShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
	uint8 stackEntries = (_swapEndianU32(pixelShader->regs[0])>>8)&0xFF;
	osLib_returnFromFunction(hCPU, stackEntries);
}
/*
	GX2SetVertexShader(vertexShader)

	Submits the PM4 IT_SET_CONTEXT_REG packets that make the given vertex shader current.
	The program address/size come from shaderPtr/shaderSize when shaderPtr is non-null,
	otherwise from the embedded rBuffer.

	In geometry shader mode only the program registers are written (to the _ES slot).
	In every other mode the VS program, primitive id enable, output config, output ids
	and (if enabled) the stream out strides are written.
	In both modes the vertex semantic table is written if the shader provides one.

	Always returns 0 to the caller.
*/
void gx2Export_GX2SetVertexShader(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2SetVertexShader(0x%08x)\n", hCPU->gpr[3]);
	GX2ReserveCmdSpace(100);
	GX2VertexShader_t* vertexShader = (GX2VertexShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);

	// locate the shader program
	MPTR shaderProgramAddr;
	uint32 shaderProgramSize;
	if( _swapEndianU32(vertexShader->shaderPtr) != MPTR_NULL )
	{
		// without R API
		shaderProgramAddr = _swapEndianU32(vertexShader->shaderPtr);
		shaderProgramSize = _swapEndianU32(vertexShader->shaderSize);
	}
	else
	{
		shaderProgramAddr = vertexShader->rBuffer.GetVirtualAddr();
		shaderProgramSize = vertexShader->rBuffer.GetSize();
	}
	cemu_assert_debug(shaderProgramAddr != 0);
	cemu_assert_debug(shaderProgramSize != 0);

	if( _swapEndianU32(vertexShader->shaderMode) == GX2_SHADER_MODE_GEOMETRY_SHADER )
	{
		// in geometry shader mode the vertex shader is written to _ES register and almost all vs control registers are set by GX2SetGeometryShader
		gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 6));
		gx2WriteGather_submitU32AsBE(mmSQ_PGM_START_ES-0xA000);
		gx2WriteGather_submitU32AsBE(memory_virtualToPhysical(shaderProgramAddr)>>8);
		gx2WriteGather_submitU32AsBE(shaderProgramSize>>3);
		gx2WriteGather_submitU32AsBE(0x100000);
		gx2WriteGather_submitU32AsBE(0x100000);
		gx2WriteGather_submitU32AsBE(_swapEndianU32(vertexShader->regs[0])); // unknown
	}
	else
	{
		gx2WriteGather_submit(
			/* vertex shader program */
			pm4HeaderType3(IT_SET_CONTEXT_REG, 6),
			mmSQ_PGM_START_VS-0xA000,
			memory_virtualToPhysical(shaderProgramAddr)>>8, // physical address
			shaderProgramSize>>3, // size
			0x100000,
			0x100000,
			_swapEndianU32(vertexShader->regs[0]), // unknown
			/* primitive id enable */
			pm4HeaderType3(IT_SET_CONTEXT_REG, 2),
			mmVGT_PRIMITIVEID_EN-0xA000,
			_swapEndianU32(vertexShader->regs[1]),
			/* output config */
			pm4HeaderType3(IT_SET_CONTEXT_REG, 2),
			mmSPI_VS_OUT_CONFIG-0xA000,
			_swapEndianU32(vertexShader->regs[2]));
		if( (_swapEndianU32(vertexShader->regs[2]) & 1) != 0 )
			debugBreakpoint(); // per-component flag?
		// ukn
		gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
		gx2WriteGather_submitU32AsBE(mmPA_CL_VS_OUT_CNTL-0xA000);
		gx2WriteGather_submitU32AsBE(_swapEndianU32(vertexShader->regs[14]));
		// output ids (at most 10 entries are written)
		uint32 numOutputIds = _swapEndianU32(vertexShader->regs[3]);
		numOutputIds = std::min<uint32>(numOutputIds, 0xA);
		gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numOutputIds));
		gx2WriteGather_submitU32AsBE(mmSPI_VS_OUT_ID_0-0xA000);
		for(uint32 i=0; i<numOutputIds; i++)
		{
			gx2WriteGather_submitU32AsBE(_swapEndianU32(vertexShader->regs[4+i]));
		}
		/*
		VS _regs[]:
		0	?
		1	mmVGT_PRIMITIVEID_EN (?)
		2	mmSPI_VS_OUT_CONFIG
		3	Number of used SPI_VS_OUT_ID_* entries
		4 - 13	SPI_VS_OUT_ID_0 - SPI_VS_OUT_ID_9
		14	pa_cl_vs_out_cntl
		...
		17 - ??	semantic table entry (input)
		...
		50	vgt_vertex_reuse_block_cntl
		51	vgt_hos_reuse_depth
		*/
		// todo: mmSQ_PGM_CF_OFFSET_VS
		// todo: mmVGT_STRMOUT_BUFFER_EN
		// stream out
		if( _swapEndianU32(vertexShader->usesStreamOut) != 0 )
		{
			// writes one stride register; the stride is submitted divided by 4
			auto submitStreamOutStride = [](uint32 contextRegister, uint32 strideInBytes)
			{
				gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
				gx2WriteGather_submitU32AsBE(contextRegister);
				gx2WriteGather_submitU32AsBE(strideInBytes>>2);
			};
			submitStreamOutStride(mmVGT_STRMOUT_VTX_STRIDE_0-0xA000, _swapEndianU32(vertexShader->streamOutVertexStride[0]));
			submitStreamOutStride(mmVGT_STRMOUT_VTX_STRIDE_1-0xA000, _swapEndianU32(vertexShader->streamOutVertexStride[1]));
			submitStreamOutStride(mmVGT_STRMOUT_VTX_STRIDE_2-0xA000, _swapEndianU32(vertexShader->streamOutVertexStride[2]));
			submitStreamOutStride(mmVGT_STRMOUT_VTX_STRIDE_3-0xA000, _swapEndianU32(vertexShader->streamOutVertexStride[3]));
		}
	}

	// update semantic table
	// todo: Figure out how this is done on real SW/HW (some vertex shaders don't have a semantic table).
	//       Currently nothing is written for shaders with an empty table.
	uint32 vsSemanticTableSize = _swapEndianU32(vertexShader->regs[0x40/4]);
	if( vsSemanticTableSize > 0 )
	{
		// clear all semantics before writing the new table
		gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+1));
		gx2WriteGather_submitU32AsBE(mmSQ_VTX_SEMANTIC_CLEAR-0xA000);
		gx2WriteGather_submitU32AsBE(0xFFFFFFFF);
		// write the table (at most 0x20 entries), entries are passed through without byte-swapping
		uint32* vsSemanticTable = vertexShader->regs+(0x44/4);
		vsSemanticTableSize = std::min<uint32>(vsSemanticTableSize, 0x20);
		gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+vsSemanticTableSize));
		gx2WriteGather_submitU32AsBE(mmSQ_VTX_SEMANTIC_0-0xA000);
		for(uint32 i=0; i<vsSemanticTableSize; i++)
			gx2WriteGather_submitU32AsLE(vsSemanticTable[i]);
	}
	osLib_returnFromFunction(hCPU, 0);
}
/*
	GX2SetPixelShader(pixelShader)

	Submits the IT_SET_CONTEXT_REG packets for the given pixel shader: program
	address/size, input control, per-input control words (capped at 0x20 entries)
	and the CB/DB shader control registers plus SPI_INPUT_Z.
	Always returns 0 to the caller.
*/
void gx2Export_GX2SetPixelShader(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2SetPixelShader(0x%08x)\n", hCPU->gpr[3]);
	GX2ReserveCmdSpace(100);
	GX2PixelShader_t* pixelShader = (GX2PixelShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);

	// program location: explicit pointer (old format) if set, otherwise the rBuffer
	MPTR programAddr;
	uint32 programSize;
	if( _swapEndianU32(pixelShader->shaderPtr) == MPTR_NULL )
	{
		programAddr = pixelShader->rBuffer.GetVirtualAddr();
		programSize = pixelShader->rBuffer.GetSize();
	}
	else
	{
		programAddr = _swapEndianU32(pixelShader->shaderPtr);
		programSize = _swapEndianU32(pixelShader->shaderSize);
	}

	gx2WriteGather_submit(
		/* pixel shader program */
		pm4HeaderType3(IT_SET_CONTEXT_REG, 6),
		mmSQ_PGM_START_PS - 0xA000,
		memory_virtualToPhysical(programAddr)>>8, // address
		programSize>>3, // size
		0x100000,
		0x100000,
		_swapEndianU32(pixelShader->regs[0]), // ukn
		/* pixel shader input control */
		pm4HeaderType3(IT_SET_CONTEXT_REG, 3),
		mmSPI_PS_IN_CONTROL_0-0xA000,
		_swapEndianU32(pixelShader->regs[2]),
		_swapEndianU32(pixelShader->regs[3]));

	// extended input control words: regs[4] holds the count, regs[5..] the values
	uint32 inputCount = std::min<uint32>(_swapEndianU32(pixelShader->regs[4]), 0x20);
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+inputCount));
	gx2WriteGather_submitU32AsBE(mmSPI_PS_INPUT_CNTL_0-0xA000);
	uint32* inputCntlWords = pixelShader->regs + 5;
	for(uint32 inputIndex = 0; inputIndex != inputCount; ++inputIndex)
		gx2WriteGather_submitU32AsBE(_swapEndianU32(inputCntlWords[inputIndex]));

	gx2WriteGather_submit(
		/* mmCB_SHADER_MASK */
		pm4HeaderType3(IT_SET_CONTEXT_REG, 2),
		mmCB_SHADER_MASK-0xA000,
		_swapEndianU32(pixelShader->regs[37]),
		/* mmCB_SHADER_CONTROL */
		pm4HeaderType3(IT_SET_CONTEXT_REG, 2),
		mmCB_SHADER_CONTROL-0xA000,
		_swapEndianU32(pixelShader->regs[38]),
		/* mmDB_SHADER_CONTROL */
		pm4HeaderType3(IT_SET_CONTEXT_REG, 2),
		mmDB_SHADER_CONTROL-0xA000,
		_swapEndianU32(pixelShader->regs[39]),
		/* SPI_INPUT_Z */
		pm4HeaderType3(IT_SET_CONTEXT_REG, 2),
		mmSPI_INPUT_Z-0xA000,
		_swapEndianU32(pixelShader->regs[40]));
	osLib_returnFromFunction(hCPU, 0);
}
/*
	GX2SetGeometryShader(geometryShader)

	Submits the IT_SET_CONTEXT_REG packets for the given geometry shader:
	- GS program address/size and resources
	- GS output primitive type, VGT_GS_MODE, vertex item size
	- stream out strides (only if useStreamout is set) and stream out buffer enable
	- the copy shader, which is written to the VS program registers
	- VS output control / output ids / output config and the GSVS ring item size

	Program locations come from the explicit pointer fields when non-null,
	otherwise from the corresponding rBuffer. Always returns 0 to the caller.
*/
void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2SetGeometryShader(0x%08x)\n", hCPU->gpr[3]);
	GX2ReserveCmdSpace(100);
	GX2GeometryShader_t* geometryShader = (GX2GeometryShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);

	// locate the geometry shader program
	MPTR shaderProgramAddr;
	uint32 shaderProgramSize;
	if( _swapEndianU32(geometryShader->shaderPtr) != MPTR_NULL )
	{
		// old format
		shaderProgramAddr = _swapEndianU32(geometryShader->shaderPtr);
		shaderProgramSize = _swapEndianU32(geometryShader->shaderSize);
	}
	else
	{
		shaderProgramAddr = geometryShader->rBuffer.GetVirtualAddr();
		shaderProgramSize = geometryShader->rBuffer.GetSize();
	}

	// GS program
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 6));
	gx2WriteGather_submitU32AsBE(mmSQ_PGM_START_GS-0xA000);
	gx2WriteGather_submitU32AsBE(memory_virtualToPhysical(shaderProgramAddr)>>8);
	gx2WriteGather_submitU32AsBE(shaderProgramSize>>3);
	gx2WriteGather_submitU32AsBE(0x100000);
	gx2WriteGather_submitU32AsBE(0x100000);
	gx2WriteGather_submitU32AsBE(_swapEndianU32(geometryShader->regs[0])); // unknown content (SQ_PGM_RESOURCES_GS)

	// output primitive type (register word is passed through without byte-swapping)
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
	gx2WriteGather_submitU32AsBE(mmVGT_GS_OUT_PRIM_TYPE-0xA000);
	gx2WriteGather_submitU32AsLE(geometryShader->regs[1]);

	gx2WriteGather_submit(
		pm4HeaderType3(IT_SET_CONTEXT_REG, 2),
		Latte::REGADDR::VGT_GS_MODE - 0xA000,
		geometryShader->reg.VGT_GS_MODE
	);

	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
	gx2WriteGather_submitU32AsBE(mmSQ_PGM_RESOURCES_GS-0xA000);
	gx2WriteGather_submitU32AsLE(geometryShader->regs[0]);

	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
	gx2WriteGather_submitU32AsBE(mmSQ_GS_VERT_ITEMSIZE-0xA000);
	gx2WriteGather_submitU32AsLE(geometryShader->regs[5]);

	// stream out strides
	if( _swapEndianU32(geometryShader->useStreamout) != 0 )
	{
		// writes one stride register; the stride is submitted divided by 4
		auto submitStreamOutStride = [](uint32 contextRegister, uint32 strideInBytes)
		{
			gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
			gx2WriteGather_submitU32AsBE(contextRegister);
			gx2WriteGather_submitU32AsBE(strideInBytes>>2);
		};
		submitStreamOutStride(mmVGT_STRMOUT_VTX_STRIDE_0-0xA000, _swapEndianU32(geometryShader->streamoutStride[0]));
		submitStreamOutStride(mmVGT_STRMOUT_VTX_STRIDE_1-0xA000, _swapEndianU32(geometryShader->streamoutStride[1]));
		submitStreamOutStride(mmVGT_STRMOUT_VTX_STRIDE_2-0xA000, _swapEndianU32(geometryShader->streamoutStride[2]));
		submitStreamOutStride(mmVGT_STRMOUT_VTX_STRIDE_3-0xA000, _swapEndianU32(geometryShader->streamoutStride[3]));
	}
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
	gx2WriteGather_submitU32AsBE(mmVGT_STRMOUT_BUFFER_EN-0xA000);
	gx2WriteGather_submitU32AsBE(_swapEndianU32(geometryShader->regs[18]));

	// set copy shader (written to vertex shader registers, vs in turn is written to es registers)
	MPTR copyShaderProgramAddr;
	uint32 copyShaderProgramSize;
	if( _swapEndianU32(geometryShader->copyShaderPtr) != MPTR_NULL )
	{
		copyShaderProgramAddr = _swapEndianU32(geometryShader->copyShaderPtr);
		copyShaderProgramSize = _swapEndianU32(geometryShader->copyShaderSize);
	}
	else
	{
		copyShaderProgramAddr = geometryShader->rBufferCopyProgram.GetVirtualAddr();
		copyShaderProgramSize = geometryShader->rBufferCopyProgram.GetSize();
	}
	cemu_assert_debug((copyShaderProgramAddr>>8) != 0);
	cemu_assert_debug((copyShaderProgramSize>>3) != 0);
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 6));
	gx2WriteGather_submitU32AsBE(mmSQ_PGM_START_VS-0xA000);
	gx2WriteGather_submitU32AsBE(memory_virtualToPhysical(copyShaderProgramAddr)>>8);
	gx2WriteGather_submitU32AsBE(copyShaderProgramSize>>3);
	gx2WriteGather_submitU32AsBE(0x100000);
	gx2WriteGather_submitU32AsBE(0x100000);
	gx2WriteGather_submitU32AsBE(_swapEndianU32(geometryShader->regs[4])); // mmSQ_PGM_RESOURCES_VS

	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
	gx2WriteGather_submitU32AsBE(mmPA_CL_VS_OUT_CNTL-0xA000);
	gx2WriteGather_submitU32AsBE(_swapEndianU32(geometryShader->regs[3]));

	// GS outputs (at most 10 ids, packet is skipped entirely when there are none)
	uint32 numOutputIds = _swapEndianU32(geometryShader->regs[7]);
	numOutputIds = std::min<uint32>(numOutputIds, 0xA);
	if( numOutputIds != 0 )
	{
		gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numOutputIds));
		gx2WriteGather_submitU32AsBE(mmSPI_VS_OUT_ID_0-0xA000);
		for(uint32 i=0; i<numOutputIds; i++)
		{
			gx2WriteGather_submitU32AsBE(_swapEndianU32(geometryShader->regs[8+i]));
		}
	}

	// output config
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
	gx2WriteGather_submitU32AsBE(mmSPI_VS_OUT_CONFIG-0xA000);
	gx2WriteGather_submitU32AsBE(_swapEndianU32(geometryShader->regs[6]));

	// ring item size (only the low 15 bits are written)
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
	gx2WriteGather_submitU32AsBE(mmSQ_GSVS_RING_ITEMSIZE-0xA000);
	gx2WriteGather_submitU32AsBE(_swapEndianU32(geometryShader->ringItemsize)&0x7FFF);
	/*
	Geometry shader registers in regs[19]:
	0	SQ_PGM_RESOURCES_GS ?
	1	mmVGT_GS_OUT_PRIM_TYPE
	2	mmVGT_GS_MODE
	3	mmPA_CL_VS_OUT_CNTL
	4	mmSQ_PGM_RESOURCES_VS (set in combination with mmSQ_PGM_START_VS)
	5	mmSQ_GS_VERT_ITEMSIZE
	6	mmSPI_VS_OUT_CONFIG
	7	number of active mmSPI_VS_OUT_ID_* fields?
	8-17	mmSPI_VS_OUT_ID_*
	18	mmVGT_STRMOUT_BUFFER_EN
	*/
	osLib_returnFromFunction(hCPU, 0);
}
// Compute shader descriptor as read by gx2Export_GX2SetComputeShader.
// The offset annotations are the intended byte offsets of each field; three of
// them (programSize, workgroupSizeX, rBuffer) are enforced by the static_asserts below.
// Fields prefixed with "ukn" have no known meaning.
struct GX2ComputeShader_t
{
/* +0x00 */ uint32be regs[12]; // regs[0] is submitted together with the program address/size
/* +0x30 */ uint32be programSize; // used as program size when programPtr is non-zero
/* +0x34 */ uint32be programPtr; // if zero, the program is taken from rBuffer instead
/* +0x38 */ uint32 ukn38;
/* +0x3C */ uint32 ukn3C;
/* +0x40 */ uint32 ukn40[8];
/* +0x60 */ uint32be workgroupSizeX;
/* +0x64 */ uint32be workgroupSizeY;
/* +0x68 */ uint32be workgroupSizeZ;
/* +0x6C */ uint32be workgroupSizeSpecial;
/* +0x70 */ uint32be ukn70;
/* +0x74 */ GX2RBuffer rBuffer; // fallback source for program address and size
};
// guard the layout against accidental changes
static_assert(offsetof(GX2ComputeShader_t, programSize) == 0x30);
static_assert(offsetof(GX2ComputeShader_t, workgroupSizeX) == 0x60);
static_assert(offsetof(GX2ComputeShader_t, rBuffer) == 0x74);
// GX2SetComputeShader(computeShader)
// Writes the compute shader program address, size and regs[0] to the register
// range starting at mmSQ_PGM_START_ES. The program is taken from programPtr/programSize
// when programPtr is non-zero, otherwise from the rBuffer. Always returns 0.
void gx2Export_GX2SetComputeShader(PPCInterpreter_t* hCPU)
{
	ppcDefineParamTypePtr(computeShader, GX2ComputeShader_t, 0);
	gx2Log_printf("GX2SetComputeShader(0x%08x)", hCPU->gpr[3]);

	MPTR programAddr;
	uint32 programByteSize;
	if (computeShader->programPtr)
	{
		programAddr = computeShader->programPtr;
		programByteSize = computeShader->programSize;
	}
	else
	{
		programAddr = computeShader->rBuffer.GetVirtualAddr();
		programByteSize = computeShader->rBuffer.GetSize();
	}

	GX2ReserveCmdSpace(0x11);
	gx2WriteGather_submit(
		pm4HeaderType3(IT_SET_CONTEXT_REG, 6),
		mmSQ_PGM_START_ES-0xA000,
		memory_virtualToPhysical(programAddr) >> 8,
		programByteSize >> 3,
		0x100000,
		0x100000,
		computeShader->regs[0]);
	// todo: Other registers
	osLib_returnFromFunction(hCPU, 0);
}
// Submits 'count' 32-bit words located at guest address 'virtualAddress' as ALU constants,
// starting at register offset 'aluRegisterOffset'. The data is split into
// IT_SET_ALU_CONST packets of at most 0xFF values each; the words are passed
// through gx2WriteGather_submitU32AsLEArray.
void _GX2SubmitUniformReg(uint32 aluRegisterOffset, MPTR virtualAddress, uint32 count)
{
	uint32* sourceWords = (uint32*)memory_getPointerFromVirtualOffset(virtualAddress);
	// payload words plus two words for each packet header
	GX2ReserveCmdSpace(2 + (count / 0xFF) * 2 + count);
	// write PM4 command(s)
	uint32 registerOffset = aluRegisterOffset;
	for (uint32 remaining = count; remaining > 0; )
	{
		// a single command can write at most 0xFF values
		uint32 chunkSize = std::min(remaining, 0xFFu);
		gx2WriteGather_submit(pm4HeaderType3(IT_SET_ALU_CONST, 1 + chunkSize),
			registerOffset);
		gx2WriteGather_submitU32AsLEArray(sourceWords, chunkSize);
		sourceWords += chunkSize;
		registerOffset += chunkSize;
		remaining -= chunkSize;
	}
}
// GX2SetVertexUniformReg(offset, count, dataPtr)
// Submits uniform register data for the vertex shader. Vertex registers are
// placed 0x400 above the offset passed by the caller. Always returns 0.
void gx2Export_GX2SetVertexUniformReg(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2SetVertexUniformReg(0x%08x,0x%x,0x%08x)", hCPU->gpr[3], hCPU->gpr[4], hCPU->gpr[5]);
	uint32 vertexRegisterOffset = hCPU->gpr[3] + 0x400;
	uint32 wordCount = hCPU->gpr[4];
	MPTR dataAddr = hCPU->gpr[5];
	_GX2SubmitUniformReg(vertexRegisterOffset, dataAddr, wordCount);
	// debug-only sanity check: offset + count must not exceed 0x400
	cemu_assert_debug((hCPU->gpr[3] + hCPU->gpr[4]) <= 0x400);
	osLib_returnFromFunction(hCPU, 0);
}
// GX2SetPixelUniformReg(offset, count, dataPtr)
// Submits uniform register data for the pixel shader. The caller's offset is
// used as the register offset unchanged. Always returns 0.
void gx2Export_GX2SetPixelUniformReg(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2SetPixelUniformReg(0x%08x,0x%x,0x%08x)", hCPU->gpr[3], hCPU->gpr[4], hCPU->gpr[5]);
	uint32 pixelRegisterOffset = hCPU->gpr[3];
	uint32 wordCount = hCPU->gpr[4];
	MPTR dataAddr = hCPU->gpr[5];
	_GX2SubmitUniformReg(pixelRegisterOffset, dataAddr, wordCount);
	// debug-only sanity check: offset + count must not exceed 0x400
	cemu_assert_debug((hCPU->gpr[3] + hCPU->gpr[4]) <= 0x400);
	osLib_returnFromFunction(hCPU, 0);
}
// Submits a single IT_SET_RESOURCE packet describing a uniform block.
// The resource slot is registerBase + index * 7 (seven words per resource),
// the address is translated to a physical address and the size is written as size - 1.
void _GX2SubmitUniformBlock(uint32 registerBase, uint32 index, MPTR virtualAddress, uint32 size)
{
	GX2ReserveCmdSpace(9);
	gx2WriteGather_submit(
		pm4HeaderType3(IT_SET_RESOURCE, 8),
		registerBase + index * 7,                 // resource slot
		memory_virtualToPhysical(virtualAddress), // block address
		size - 1,                                 // size minus one
		0,
		1,
		0, // ukn
		0, // ukn
		0xC0000000);
}
// GX2SetVertexUniformBlock(index, size, dataPtr)
// Binds a uniform block in the vertex shader resource range. Always returns 0.
void gx2Export_GX2SetVertexUniformBlock(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2SetVertexUniformBlock(0x%08x,0x%x,0x%08x)", hCPU->gpr[3], hCPU->gpr[4], hCPU->gpr[5]);
	const uint32 blockIndex = hCPU->gpr[3];
	const uint32 blockSize = hCPU->gpr[4];
	const MPTR blockAddr = hCPU->gpr[5];
	_GX2SubmitUniformBlock(mmSQ_VTX_UNIFORM_BLOCK_START - mmSQ_TEX_RESOURCE_WORD0, blockIndex, blockAddr, blockSize);
	osLib_returnFromFunction(hCPU, 0);
}
// GX2SetPixelUniformBlock(index, size, dataPtr)
// Binds a uniform block in the pixel shader resource range. Always returns 0.
void gx2Export_GX2SetPixelUniformBlock(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2SetPixelUniformBlock(0x%08x,0x%x,0x%08x)", hCPU->gpr[3], hCPU->gpr[4], hCPU->gpr[5]);
	const uint32 blockIndex = hCPU->gpr[3];
	const uint32 blockSize = hCPU->gpr[4];
	const MPTR blockAddr = hCPU->gpr[5];
	_GX2SubmitUniformBlock(mmSQ_PS_UNIFORM_BLOCK_START - mmSQ_TEX_RESOURCE_WORD0, blockIndex, blockAddr, blockSize);
	osLib_returnFromFunction(hCPU, 0);
}
// GX2SetGeometryUniformBlock(index, size, dataPtr)
// Binds a uniform block in the geometry shader resource range. Always returns 0.
void gx2Export_GX2SetGeometryUniformBlock(PPCInterpreter_t* hCPU)
{
	gx2Log_printf("GX2SetGeometryUniformBlock(0x%08x,0x%x,0x%08x)", hCPU->gpr[3], hCPU->gpr[4], hCPU->gpr[5]);
	const uint32 blockIndex = hCPU->gpr[3];
	const uint32 blockSize = hCPU->gpr[4];
	const MPTR blockAddr = hCPU->gpr[5];
	_GX2SubmitUniformBlock(mmSQ_GS_UNIFORM_BLOCK_START - mmSQ_TEX_RESOURCE_WORD0, blockIndex, blockAddr, blockSize);
	osLib_returnFromFunction(hCPU, 0);
}
// GX2RSetVertexUniformBlock(buffer, index, offset)
// Binds the part of a GX2RBuffer that starts at 'offset' as a vertex shader
// uniform block; the block size is the buffer size minus the offset. Always returns 0.
void gx2Export_GX2RSetVertexUniformBlock(PPCInterpreter_t* hCPU)
{
	GX2ReserveCmdSpace(9);
	GX2RBuffer* rBuffer = (GX2RBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
	const uint32 blockIndex = hCPU->gpr[4];
	const uint32 byteOffset = hCPU->gpr[5];
	_GX2SubmitUniformBlock(
		mmSQ_VTX_UNIFORM_BLOCK_START - mmSQ_TEX_RESOURCE_WORD0,
		blockIndex,
		rBuffer->GetVirtualAddr() + byteOffset,
		rBuffer->GetSize() - byteOffset);
	osLib_returnFromFunction(hCPU, 0);
}
// GX2SetShaderModeEx(mode, ...)
// Writes SQ_CONFIG (plus five following config words, currently all zero) for
// the requested shader mode. For every mode except geometry shader mode it also
// writes VGT_GS_MODE. Only the first argument is read. Always returns 0.
void gx2Export_GX2SetShaderModeEx(PPCInterpreter_t* hCPU)
{
	GX2ReserveCmdSpace(8+4);
	const uint32 mode = hCPU->gpr[3];
	const bool isComputeMode = (mode == GX2_SHADER_MODE_COMPUTE_SHADER);

	uint32 sqConfig = 0;
	if (mode == 0)
		sqConfig = 4;
	if (isComputeMode)
		sqConfig |= 0xE4000000; // ES/GS/PS priority?
	// todo - other sqConfig bits
	gx2WriteGather_submit(
		static_cast<uint32>(pm4HeaderType3(IT_SET_CONFIG_REG, 7)),
		static_cast<uint32>(mmSQ_CONFIG - 0x2000),
		sqConfig,
		0, // ukn / todo
		0, // ukn / todo
		0, // ukn / todo
		0, // ukn / todo
		0 // ukn / todo
	);

	// in geometry shader mode VGT_GS_MODE is already set by GX2SetGeometryShader
	if (mode == GX2_SHADER_MODE_GEOMETRY_SHADER)
	{
		osLib_returnFromFunction(hCPU, 0);
		return;
	}

	// no geometry shader in use, update VGT_GS_MODE here
	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
	gx2WriteGather_submitU32AsBE(Latte::REGADDR::VGT_GS_MODE-0xA000);
	if (isComputeMode)
	{
		// NOTE(review): the value comes from getRawValueBE() and is then handed to an ...AsBE submit
		// - confirm this does not byte-swap the register value twice
		gx2WriteGather_submitU32AsBE(Latte::LATTE_VGT_GS_MODE().set_MODE(Latte::LATTE_VGT_GS_MODE::E_MODE::SCENARIO_G).set_COMPUTE_MODE(Latte::LATTE_VGT_GS_MODE::E_COMPUTE_MODE::ON).set_PARTIAL_THD_AT_EOI(true).getRawValueBE());
	}
	else
	{
		gx2WriteGather_submitU32AsBE(_swapEndianU32(0));
	}
	osLib_returnFromFunction(hCPU, 0);
}
// GX2CalcGeometryShaderInputRingBufferSize(value)
// Returns (value * 4) * 0x1000, computed in 32-bit unsigned arithmetic.
void gx2Export_GX2CalcGeometryShaderInputRingBufferSize(PPCInterpreter_t* hCPU)
{
	const uint32 scaledItemSize = hCPU->gpr[3] * 4;
	const uint32 ringBufferSize = scaledItemSize * 0x1000;
	osLib_returnFromFunction(hCPU, ringBufferSize);
}
// GX2CalcGeometryShaderOutputRingBufferSize(value)
// Returns (value * 4) * 0x1000, computed in 32-bit unsigned arithmetic.
void gx2Export_GX2CalcGeometryShaderOutputRingBufferSize(PPCInterpreter_t* hCPU)
{
	const uint32 scaledItemSize = hCPU->gpr[3] * 4;
	const uint32 ringBufferSize = scaledItemSize * 0x1000;
	osLib_returnFromFunction(hCPU, ringBufferSize);
}