mirror of
https://git.suyu.dev/suyu/suyu.git
synced 2024-11-26 05:16:24 -05:00
Videocore: Implement simple vertex caching
This gives a ~2/3 reduction in the number of vertices that need to be processed through the vertex loaders and the vertex shader, yielding a good speedup.
This commit is contained in:
parent
4d086a4db4
commit
a96502edd3
1 changed files with 90 additions and 63 deletions
|
@ -206,88 +206,115 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
|
||||||
std::map<u32, u32> ranges;
|
std::map<u32, u32> ranges;
|
||||||
} memory_accesses;
|
} memory_accesses;
|
||||||
|
|
||||||
|
// Simple circular-replacement vertex cache
|
||||||
|
// The size has been tuned for optimal balance between hit-rate and the cost of lookup
|
||||||
|
const size_t VERTEX_CACHE_SIZE = 32;
|
||||||
|
std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids;
|
||||||
|
std::array<VertexShader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache;
|
||||||
|
|
||||||
|
unsigned int vertex_cache_pos = 0;
|
||||||
|
vertex_cache_ids.fill(-1);
|
||||||
|
|
||||||
for (unsigned int index = 0; index < regs.num_vertices; ++index)
|
for (unsigned int index = 0; index < regs.num_vertices; ++index)
|
||||||
{
|
{
|
||||||
unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index;
|
unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index;
|
||||||
|
|
||||||
|
// -1 is a common special value used for primitive restart. Since it's unknown if
|
||||||
|
// the PICA supports it, and it would mess up the caching, guard against it here.
|
||||||
|
ASSERT(vertex != -1);
|
||||||
|
|
||||||
|
bool vertex_cache_hit = false;
|
||||||
|
VertexShader::OutputVertex output;
|
||||||
|
|
||||||
if (is_indexed) {
|
if (is_indexed) {
|
||||||
// TODO: Implement some sort of vertex cache!
|
|
||||||
if (g_debug_context && Pica::g_debug_context->recorder) {
|
if (g_debug_context && Pica::g_debug_context->recorder) {
|
||||||
int size = index_u16 ? 2 : 1;
|
int size = index_u16 ? 2 : 1;
|
||||||
memory_accesses.AddAccess(base_address + index_info.offset + size * index, size);
|
memory_accesses.AddAccess(base_address + index_info.offset + size * index, size);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// Initialize data for the current vertex
|
for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) {
|
||||||
VertexShader::InputVertex input;
|
if (vertex == vertex_cache_ids[i]) {
|
||||||
|
output = vertex_cache[i];
|
||||||
for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) {
|
vertex_cache_hit = true;
|
||||||
if (vertex_attribute_elements[i] != 0) {
|
break;
|
||||||
// Default attribute values set if array elements have < 4 components. This
|
|
||||||
// is *not* carried over from the default attribute settings even if they're
|
|
||||||
// enabled for this attribute.
|
|
||||||
static const float24 zero = float24::FromFloat32(0.0f);
|
|
||||||
static const float24 one = float24::FromFloat32(1.0f);
|
|
||||||
input.attr[i] = Math::Vec4<float24>(zero, zero, zero, one);
|
|
||||||
|
|
||||||
// Load per-vertex data from the loader arrays
|
|
||||||
for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
|
|
||||||
u32 source_addr = vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i];
|
|
||||||
const u8* srcdata = Memory::GetPhysicalPointer(source_addr);
|
|
||||||
|
|
||||||
if (g_debug_context && Pica::g_debug_context->recorder) {
|
|
||||||
memory_accesses.AddAccess(source_addr,
|
|
||||||
(vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4
|
|
||||||
: (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
const float srcval = (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *(s8*)srcdata :
|
|
||||||
(vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *(u8*)srcdata :
|
|
||||||
(vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? *(s16*)srcdata :
|
|
||||||
*(float*)srcdata;
|
|
||||||
|
|
||||||
input.attr[i][comp] = float24::FromFloat32(srcval);
|
|
||||||
LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08lx + 0x%04lx: %f",
|
|
||||||
comp, i, vertex, index,
|
|
||||||
attribute_config.GetPhysicalBaseAddress(),
|
|
||||||
vertex_attribute_sources[i] - base_address,
|
|
||||||
vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i],
|
|
||||||
input.attr[i][comp].ToFloat32());
|
|
||||||
}
|
}
|
||||||
} else if (attribute_config.IsDefaultAttribute(i)) {
|
|
||||||
// Load the default attribute if we're configured to do so
|
|
||||||
input.attr[i] = g_state.vs.default_attributes[i];
|
|
||||||
LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)",
|
|
||||||
i, vertex, index,
|
|
||||||
input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(),
|
|
||||||
input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
|
|
||||||
} else {
|
|
||||||
// TODO(yuriks): In this case, no data gets loaded and the vertex remains
|
|
||||||
// with the last value it had. This isn't currently maintained
|
|
||||||
// as global state, however, and so won't work in Cita yet.
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (g_debug_context)
|
if (!vertex_cache_hit) {
|
||||||
g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input);
|
// Initialize data for the current vertex
|
||||||
|
VertexShader::InputVertex input;
|
||||||
|
|
||||||
|
for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) {
|
||||||
|
if (vertex_attribute_elements[i] != 0) {
|
||||||
|
// Default attribute values set if array elements have < 4 components. This
|
||||||
|
// is *not* carried over from the default attribute settings even if they're
|
||||||
|
// enabled for this attribute.
|
||||||
|
static const float24 zero = float24::FromFloat32(0.0f);
|
||||||
|
static const float24 one = float24::FromFloat32(1.0f);
|
||||||
|
input.attr[i] = Math::Vec4<float24>(zero, zero, zero, one);
|
||||||
|
|
||||||
|
// Load per-vertex data from the loader arrays
|
||||||
|
for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
|
||||||
|
u32 source_addr = vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i];
|
||||||
|
const u8* srcdata = Memory::GetPhysicalPointer(source_addr);
|
||||||
|
|
||||||
|
if (g_debug_context && Pica::g_debug_context->recorder) {
|
||||||
|
memory_accesses.AddAccess(source_addr,
|
||||||
|
(vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4
|
||||||
|
: (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
const float srcval = (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *(s8*)srcdata :
|
||||||
|
(vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *(u8*)srcdata :
|
||||||
|
(vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? *(s16*)srcdata :
|
||||||
|
*(float*)srcdata;
|
||||||
|
|
||||||
|
input.attr[i][comp] = float24::FromFloat32(srcval);
|
||||||
|
LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08lx + 0x%04lx: %f",
|
||||||
|
comp, i, vertex, index,
|
||||||
|
attribute_config.GetPhysicalBaseAddress(),
|
||||||
|
vertex_attribute_sources[i] - base_address,
|
||||||
|
vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i],
|
||||||
|
input.attr[i][comp].ToFloat32());
|
||||||
|
}
|
||||||
|
} else if (attribute_config.IsDefaultAttribute(i)) {
|
||||||
|
// Load the default attribute if we're configured to do so
|
||||||
|
input.attr[i] = g_state.vs.default_attributes[i];
|
||||||
|
LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)",
|
||||||
|
i, vertex, index,
|
||||||
|
input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(),
|
||||||
|
input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
|
||||||
|
} else {
|
||||||
|
// TODO(yuriks): In this case, no data gets loaded and the vertex
|
||||||
|
// remains with the last value it had. This isn't currently maintained
|
||||||
|
// as global state, however, and so won't work in Citra yet.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (g_debug_context)
|
||||||
|
g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input);
|
||||||
|
|
||||||
#if PICA_DUMP_GEOMETRY
|
#if PICA_DUMP_GEOMETRY
|
||||||
// NOTE: When dumping geometry, we simply assume that the first input attribute
|
// NOTE: When dumping geometry, we simply assume that the first input attribute
|
||||||
// corresponds to the position for now.
|
// corresponds to the position for now.
|
||||||
DebugUtils::GeometryDumper::Vertex dumped_vertex = {
|
DebugUtils::GeometryDumper::Vertex dumped_vertex = {
|
||||||
input.attr[0][0].ToFloat32(), input.attr[0][1].ToFloat32(), input.attr[0][2].ToFloat32()
|
input.attr[0][0].ToFloat32(), input.attr[0][1].ToFloat32(), input.attr[0][2].ToFloat32()
|
||||||
};
|
};
|
||||||
using namespace std::placeholders;
|
using namespace std::placeholders;
|
||||||
dumping_primitive_assembler.SubmitVertex(dumped_vertex,
|
dumping_primitive_assembler.SubmitVertex(dumped_vertex,
|
||||||
std::bind(&DebugUtils::GeometryDumper::AddTriangle,
|
std::bind(&DebugUtils::GeometryDumper::AddTriangle,
|
||||||
&geometry_dumper, _1, _2, _3));
|
&geometry_dumper, _1, _2, _3));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Send to vertex shader
|
// Send to vertex shader
|
||||||
VertexShader::OutputVertex output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes(), g_state.regs.vs, g_state.vs);
|
output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes(), g_state.regs.vs, g_state.vs);
|
||||||
|
|
||||||
if (is_indexed) {
|
if (is_indexed) {
|
||||||
// TODO: Add processed vertex to vertex cache!
|
vertex_cache[vertex_cache_pos] = output;
|
||||||
|
vertex_cache_ids[vertex_cache_pos] = vertex;
|
||||||
|
vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (Settings::values.use_hw_renderer) {
|
if (Settings::values.use_hw_renderer) {
|
||||||
|
|
Loading…
Reference in a new issue