diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 36c3b9947..243abe842 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -206,88 +206,115 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                 std::map<u32, u32> ranges;
             } memory_accesses;
 
+            // Simple circular-replacement vertex cache
+            // The size has been tuned for optimal balance between hit-rate and the cost of lookup
+            const size_t VERTEX_CACHE_SIZE = 32;
+            std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids;
+            std::array<VertexShader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache;
+
+            unsigned int vertex_cache_pos = 0;
+            vertex_cache_ids.fill(-1);
+
             for (unsigned int index = 0; index < regs.num_vertices; ++index)
             {
                 unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index;
 
+                // -1 is a common special value used for primitive restart. Since it's unknown if
+                // the PICA supports it, and it would mess up the caching, guard against it here.
+                ASSERT(vertex != -1);
+
+                bool vertex_cache_hit = false;
+                VertexShader::OutputVertex output;
+
                 if (is_indexed) {
-                    // TODO: Implement some sort of vertex cache!
                     if (g_debug_context && Pica::g_debug_context->recorder) {
                         int size = index_u16 ? 2 : 1;
                         memory_accesses.AddAccess(base_address + index_info.offset + size * index, size);
                     }
-                }
 
-                // Initialize data for the current vertex
-                VertexShader::InputVertex input;
-
-                for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) {
-                    if (vertex_attribute_elements[i] != 0) {
-                        // Default attribute values set if array elements have < 4 components. This
-                        // is *not* carried over from the default attribute settings even if they're
-                        // enabled for this attribute.
-                        static const float24 zero = float24::FromFloat32(0.0f);
-                        static const float24 one = float24::FromFloat32(1.0f);
-                        input.attr[i] = Math::Vec4<float24>(zero, zero, zero, one);
-
-                        // Load per-vertex data from the loader arrays
-                        for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
-                            u32 source_addr = vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i];
-                            const u8* srcdata = Memory::GetPhysicalPointer(source_addr);
-
-                            if (g_debug_context && Pica::g_debug_context->recorder) {
-                                memory_accesses.AddAccess(source_addr,
-                                    (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4
-                                    : (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1);
-                            }
-
-                            const float srcval = (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *(s8*)srcdata :
-                                (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *(u8*)srcdata :
-                                (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? *(s16*)srcdata :
-                                *(float*)srcdata;
-
-                            input.attr[i][comp] = float24::FromFloat32(srcval);
-                            LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08lx + 0x%04lx: %f",
-                                comp, i, vertex, index,
-                                attribute_config.GetPhysicalBaseAddress(),
-                                vertex_attribute_sources[i] - base_address,
-                                vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i],
-                                input.attr[i][comp].ToFloat32());
+                    for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) {
+                        if (vertex == vertex_cache_ids[i]) {
+                            output = vertex_cache[i];
+                            vertex_cache_hit = true;
+                            break;
                         }
-                    } else if (attribute_config.IsDefaultAttribute(i)) {
-                        // Load the default attribute if we're configured to do so
-                        input.attr[i] = g_state.vs.default_attributes[i];
-                        LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)",
-                                  i, vertex, index,
-                                  input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(),
-                                  input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
-                    } else {
-                        // TODO(yuriks): In this case, no data gets loaded and the vertex remains
-                        //              with the last value it had. This isn't currently maintained
-                        //              as global state, however, and so won't work in Cita yet.
                     }
                 }
 
-                if (g_debug_context)
-                    g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input);
+                if (!vertex_cache_hit) {
+                    // Initialize data for the current vertex
+                    VertexShader::InputVertex input;
+
+                    for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) {
+                        if (vertex_attribute_elements[i] != 0) {
+                            // Default attribute values set if array elements have < 4 components. This
+                            // is *not* carried over from the default attribute settings even if they're
+                            // enabled for this attribute.
+                            static const float24 zero = float24::FromFloat32(0.0f);
+                            static const float24 one = float24::FromFloat32(1.0f);
+                            input.attr[i] = Math::Vec4<float24>(zero, zero, zero, one);
+
+                            // Load per-vertex data from the loader arrays
+                            for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
+                                u32 source_addr = vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i];
+                                const u8* srcdata = Memory::GetPhysicalPointer(source_addr);
+
+                                if (g_debug_context && Pica::g_debug_context->recorder) {
+                                    memory_accesses.AddAccess(source_addr,
+                                        (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4
+                                        : (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1);
+                                }
+
+                                const float srcval = (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *(s8*)srcdata :
+                                    (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *(u8*)srcdata :
+                                    (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? *(s16*)srcdata :
+                                    *(float*)srcdata;
+
+                                input.attr[i][comp] = float24::FromFloat32(srcval);
+                                LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08lx + 0x%04lx: %f",
+                                    comp, i, vertex, index,
+                                    attribute_config.GetPhysicalBaseAddress(),
+                                    vertex_attribute_sources[i] - base_address,
+                                    vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i],
+                                    input.attr[i][comp].ToFloat32());
+                            }
+                        } else if (attribute_config.IsDefaultAttribute(i)) {
+                            // Load the default attribute if we're configured to do so
+                            input.attr[i] = g_state.vs.default_attributes[i];
+                            LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)",
+                                      i, vertex, index,
+                                      input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(),
+                                      input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
+                        } else {
+                            // TODO(yuriks): In this case, no data gets loaded and the vertex
+                            // remains with the last value it had. This isn't currently maintained
+                            // as global state, however, and so won't work in Citra yet.
+                        }
+                    }
+
+                    if (g_debug_context)
+                        g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input);
 
 #if PICA_DUMP_GEOMETRY
-                // NOTE: When dumping geometry, we simply assume that the first input attribute
-                //       corresponds to the position for now.
-                DebugUtils::GeometryDumper::Vertex dumped_vertex = {
-                    input.attr[0][0].ToFloat32(), input.attr[0][1].ToFloat32(), input.attr[0][2].ToFloat32()
-                };
-                using namespace std::placeholders;
-                dumping_primitive_assembler.SubmitVertex(dumped_vertex,
-                                                         std::bind(&DebugUtils::GeometryDumper::AddTriangle,
-                                                                   &geometry_dumper, _1, _2, _3));
+                    // NOTE: When dumping geometry, we simply assume that the first input attribute
+                    //       corresponds to the position for now.
+                    DebugUtils::GeometryDumper::Vertex dumped_vertex = {
+                        input.attr[0][0].ToFloat32(), input.attr[0][1].ToFloat32(), input.attr[0][2].ToFloat32()
+                    };
+                    using namespace std::placeholders;
+                    dumping_primitive_assembler.SubmitVertex(dumped_vertex,
+                                                             std::bind(&DebugUtils::GeometryDumper::AddTriangle,
+                                                                       &geometry_dumper, _1, _2, _3));
 #endif
 
-                // Send to vertex shader
-                VertexShader::OutputVertex output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes(), g_state.regs.vs, g_state.vs);
+                    // Send to vertex shader
+                    output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes(), g_state.regs.vs, g_state.vs);
 
-                if (is_indexed) {
-                    // TODO: Add processed vertex to vertex cache!
+                    if (is_indexed) {
+                        vertex_cache[vertex_cache_pos] = output;
+                        vertex_cache_ids[vertex_cache_pos] = vertex;
+                        vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE;
+                    }
                 }
 
                 if (Settings::values.use_hw_renderer) {