
CRYENGINE: MergedMeshGeometry.cpp source code

  1. // Copyright 2001-2016 Crytek GmbH / Crytek Group. All rights reserved.
  2.  
  3. #include "StdAfx.h"
  4. #include <CryMath/Cry_Geo.h>
  5. #include <CryMath/Cry_GeoIntersect.h>
  6. #include "MergedMeshGeometry.h"
  7. #include "VMath.hpp"
  8.  
  9. #pragma warning(disable: 6001)
  10. #pragma warning(disable: 4101)
  11.  
  12. #if MMRM_ENABLE_PROFILER
  13.         #define MMRM_PROFILE_FUNCTION(x, y)  FUNCTION_PROFILER_3DENGINE(x, y)
  14.         #define MMRM_FRAME_PROFILER(x, y, z) FRAME_PROFILER(x, y, z)
  15. #else
  16.         #define MMRM_PROFILE_FUNCTION(x, y)  (void)0
  17.         #define MMRM_FRAME_PROFILER(x, y, z) (void)0
  18. #endif
  19.  
  20. namespace
  21. {
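// Reinterprets the current cursor pData as nCount elements of type T, returns
// that pointer and advances the cursor past them; a small helper for carving
// typed sub-streams out of a single raw buffer.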
  22. template<class T, class D>
  23. static inline T* AdvancePtr(D*& pData, size_t nCount = 1)
  24. {
  25.         T* Elems = (T*)pData;
  26.         pData = (D*)((T*)pData + nCount);
  27.         return Elems;
  28. }
  29. }
  30.  
  31. struct SMMRMContact
  32. {
  33.         Plane p;
  34.         int   i;
  35. };
  36.  
  37. ////////////////////////////////////////////////////////////////////////////////
  38. // TODO: cleanup
  39. #if MMRM_USE_VECTORIZED_SSE_INSTRUCTIONS
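// 16-byte aligned wrappers over Vec3/Matrix34/DualQuat so the SSE paths below
// can use aligned _mm_load_ps/_mm_store_ps on them directly.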
  40. struct CRY_ALIGN(16) Vec3A: public Vec3
  41. {
  42.         Vec3A()
  43.         {
  44.         }
  45.         Vec3A(const Vec3 &v) : Vec3(v)
  46.         {
  47.         }
  48.         Vec3A& operator=(const Vec3& other)
  49.         {
  50.                 x = other.x;
  51.                 y = other.y;
  52.                 z = other.z;
  53.                 return *this;
  54.         }
  55. };
  56.  
  57. struct CRY_ALIGN(16) Matrix34V: public Matrix34
  58. {
  59.         Matrix34V()
  60.         {
  61.         }
  62.  
  63.         Matrix34V(const Matrix34 &v)
  64.                 : Matrix34(v)
  65.         {
  66.         }
  67.  
  68.         Matrix34V& operator=(const Matrix34V& other)
  69.         {
  70.                 *static_cast<Matrix34*>(this) = other;
  71.                 return *this;
  72.         }
  73. };
  74.  
  75. struct CRY_ALIGN(16) DualQuatA: public DualQuat
  76. {
  77.         DualQuatA()
  78.         {
  79.         }
  80.  
  81.         DualQuatA(const DualQuat &v)
  82.                 : DualQuat(v)
  83.         {
  84.         }
  85.  
  86.         DualQuatA& operator=(const DualQuat& other)
  87.         {
  88.                 *static_cast<DualQuat*>(this) = other;
  89.                 return *this;
  90.         }
  91. };
  92. #else
  93. typedef Vec3     Vec3A;
  94. typedef Matrix34 Matrix34V;
  95. typedef DualQuat DualQuatA;
  96. #endif
  97.  
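// Robust matrix-to-quaternion conversion: branches on the largest of the trace
// and the diagonal elements so the square-root argument stays positive.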
  98. static inline Quat mat33_to_quat(const Matrix33& m)
  99. {
  100.         Quat q;
  101.         float s, p, tr = m.m00 + m.m11 + m.m22;
  102.         q.w = 1, q.v.x = 0, q.v.y = 0, q.v.z = 0;
  103.         if (tr > 0)
  104.                 s = sqrt_tpl(tr + 1.0f), p = 0.5f / s, q.w = s * 0.5f, q.v.x = (m.m21 - m.m12) * p, q.v.y = (m.m02 - m.m20) * p, q.v.z = (m.m10 - m.m01) * p;
  105.         else if ((m.m00 >= m.m11) && (m.m00 >= m.m22))
  106.                 s = sqrt_tpl(m.m00 - m.m11 - m.m22 + 1.0f), p = 0.5f / s, q.w = (m.m21 - m.m12) * p, q.v.x = s * 0.5f, q.v.y = (m.m10 + m.m01) * p, q.v.z = (m.m20 + m.m02) * p;
  107.         else if ((m.m11 >= m.m00) && (m.m11 >= m.m22))
  108.                 s = sqrt_tpl(m.m11 - m.m22 - m.m00 + 1.0f), p = 0.5f / s, q.w = (m.m02 - m.m20) * p, q.v.x = (m.m01 + m.m10) * p, q.v.y = s * 0.5f, q.v.z = (m.m21 + m.m12) * p;
  109.         else if ((m.m22 >= m.m00) && (m.m22 >= m.m11))
  110.                 s = sqrt_tpl(m.m22 - m.m00 - m.m11 + 1.0f), p = 0.5f / s, q.w = (m.m10 - m.m01) * p, q.v.x = (m.m02 + m.m20) * p, q.v.y = (m.m12 + m.m21) * p, q.v.z = s * 0.5f;
  111.         return q;
  112. }
  113.  
  114. #if MMRM_USE_VECTORIZED_SSE_INSTRUCTIONS
  115.         #if CRY_PLATFORM_SSE4
  116.         // _mm_dp_ps
  117.         //
  118.         // tmp0 : = (mask4 == 1) ? (a0 * b0) : +0.0
  119.         // tmp1 : = (mask5 == 1) ? (a1 * b1) : +0.0
  120.         // tmp2 : = (mask6 == 1) ? (a2 * b2) : +0.0
  121.         // tmp3 : = (mask7 == 1) ? (a3 * b3) : +0.0
  122.         //
  123.         // tmp4 : = tmp0 + tmp1 + tmp2 + tmp3
  124.         //
  125.         // r0 : = (mask0 == 1) ? tmp4 : +0.0
  126.         // r1 : = (mask1 == 1) ? tmp4 : +0.0
  127.         // r2 : = (mask2 == 1) ? tmp4 : +0.0
  128.         // r3 : = (mask3 == 1) ? tmp4 : +0.0
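//
// Masks used below: 0xff multiplies all four lanes and broadcasts the sum,
// 0x77 multiplies x, y, z and writes the sum to the lower three lanes only,
// 0x7f multiplies x, y, z and broadcasts the sum to all four lanes.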
  129.         static inline __m128 DotProduct4(__m128 a, __m128 b)
  130.         {
  131.                 return _mm_dp_ps(a, b, 0xff);
  132.         }
  133.         static inline __m128 DotProduct3(__m128 a, __m128 b)
  134.         {
  135.                 return _mm_dp_ps(a, b, 0x77);
  136.         }
  137.         static inline __m128 DotProduct3to4(__m128 a, __m128 b)
  138.         {
  139.                 return _mm_dp_ps(a, b, 0x7f);
  140.         }
  141.  
  142.         // _mm_blendv_ps
  143.         //
  144.         // r0 : = (mask0 & 0x80000000) ? b0 : a0
  145.         // r1 : = (mask1 & 0x80000000) ? b1 : a1
  146.         // r2 : = (mask2 & 0x80000000) ? b2 : a2
  147.         // r3 : = (mask3 & 0x80000000) ? b3 : a3
  148.         static inline __m128 SelectFromVector(__m128 a, __m128 b, __m128 c)
  149.         {
  150.                 return _mm_blendv_ps(a, b, c);
  151.         }
  152.  
  153.         // _mm_blend_ps
  154.         //
  155.         // r0 : = (mask0 == 0) ? b0 : a0
  156.         // r1 : = (mask1 == 0) ? b1 : a1
  157.         // r2 : = (mask2 == 0) ? b2 : a2
  158.         // r3 : = (mask3 == 0) ? b3 : a3
  159.         template<const int c>
  160.         static inline __m128 SelectFromImmediate(__m128 a, __m128 b)
  161.         {
  162.                 return _mm_blend_ps(a, b, c);
  163.         }
  164.         #elif CRY_PLATFORM_SSE3
  165.         static inline __m128 DotProduct4(__m128 a, __m128 b)
  166.         {
  167.                 __m128 res;
  168.  
  169.                 res = _mm_mul_ps(a, b);
  170.                 res = _mm_hadd_ps(res, res);
  171.                 res = _mm_hadd_ps(res, res);
  172.  
  173.                 return res;
  174.         }
  175.         static inline __m128 DotProduct3(__m128 a, __m128 b)
  176.         {
  177.                 __m128 res;
  178.  
  179.                 res = _mm_mul_ps(a, b);
  180.                 res = _mm_and_ps(res, _mm_castsi128_ps(_mm_setr_epi32(~0, ~0, ~0, 0)));
  181.                 res = _mm_hadd_ps(res, res);
  182.                 res = _mm_hadd_ps(res, res);
  183.                 res = _mm_and_ps(res, _mm_castsi128_ps(_mm_setr_epi32(~0, ~0, ~0, 0)));
  184.  
  185.                 return res;
  186.         }
  187.         static inline __m128 DotProduct3to4(__m128 a, __m128 b)
  188.         {
  189.                 __m128 res;
  190.  
  191.                 res = _mm_and_ps(_mm_mul_ps(a, b), _mm_castsi128_ps(_mm_setr_epi32(~0, ~0, ~0, 0)));
  192.                 res = _mm_hadd_ps(res, res);
  193.                 res = _mm_hadd_ps(res, res);
  194.  
  195.                 return res;
  196.         }
  197.  
  198.         static inline __m128 SelectFromVector(__m128 a, __m128 b, __m128 c)
  199.         {
  200.                 __m128 bs = _mm_and_ps(b, c);
  201.                 __m128 as = _mm_andnot_ps(c, a);
  202.  
  203.                 return _mm_or_ps(bs, as);
  204.         }
  205.         #else
  206.         static inline __m128 DotProduct4(__m128 a, __m128 b)
  207.         {
  208.                 __m128 res;
  209.  
  210.                 res = _mm_mul_ps(a, b);
  211.                 res = _mm_add_ps(res, _mm_shuffle_ps(res, res, _MM_SHUFFLE(1, 0, 3, 2)));
  212.                 res = _mm_add_ps(res, _mm_shuffle_ps(res, res, _MM_SHUFFLE(0, 1, 2, 3)));
  213.  
  214.                 return res;
  215.         }
  216.         static inline __m128 DotProduct3(__m128 a, __m128 b)
  217.         {
  218.                 __m128 res;
  219.  
  220.                 res = _mm_mul_ps(a, b);
  221.                 res = _mm_and_ps(res, _mm_castsi128_ps(_mm_setr_epi32(~0, ~0, ~0, 0)));
  222.                 res = _mm_add_ps(res, _mm_shuffle_ps(res, res, _MM_SHUFFLE(1, 0, 3, 2)));
  223.                 res = _mm_add_ps(res, _mm_shuffle_ps(res, res, _MM_SHUFFLE(0, 1, 2, 3)));
  224.                 res = _mm_and_ps(res, _mm_castsi128_ps(_mm_setr_epi32(~0, ~0, ~0, 0)));
  225.  
  226.                 return res;
  227.         }
  228.         static inline __m128 DotProduct3to4(__m128 a, __m128 b)
  229.         {
  230.                 __m128 res;
  231.  
  232.                 res = _mm_mul_ps(a, b);
  233.                 res = _mm_and_ps(res, _mm_castsi128_ps(_mm_setr_epi32(~0, ~0, ~0, 0)));
  234.                 res = _mm_add_ps(res, _mm_shuffle_ps(res, res, _MM_SHUFFLE(1, 0, 3, 2)));
  235.                 res = _mm_add_ps(res, _mm_shuffle_ps(res, res, _MM_SHUFFLE(0, 1, 2, 3)));
  236.  
  237.                 return res;
  238.         }
  239.  
  240.         static inline __m128 SelectFromVector(__m128 a, __m128 b, __m128 c)
  241.         {
  242.                 __m128 bs = _mm_and_ps(b, c);
  243.                 __m128 as = _mm_andnot_ps(c, a);
  244.  
  245.                 return _mm_or_ps(bs, as);
  246.         }
  247. #endif
  248.  
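        // Clears the IEEE-754 sign bit in every lane, i.e. a four-wide fabsf.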
  249.         static inline __m128 Absolute(__m128 a)
  250.         {
  251.                 return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
  252.         }
  253. #endif
  254.  
  255. #if MMRM_USE_VECTORIZED_SSE_INSTRUCTIONS
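        // Float-to-half packing: with F16C the hardware _mm_cvtps_ph conversion is
        // used, otherwise the SSE2 bit-manipulation fallbacks below do the job.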
  256.         #if CRY_PLATFORM_F16C
  257.         static inline void CvtToHalf(Vec3f16& v, __m128 value)
  258.         {
  259.                 static_assert(sizeof(Vec3f16) == sizeof(uint64_t), "64-bit store intended here for performance");
  260.                 const __m128i result = _mm_cvtps_ph(value, 0);
  261.                 _mm_storel_epi64(reinterpret_cast<__m128i*>(&v), result); // Store X, Y, Z, W(!)
  262.         }
  263.  
  264.         static inline void CvtToHalf(Vec3f16& v, const Vec3& value)
  265.         {
  266.                 CvtToHalf(v, _mm_set_ps(0.0f, value.z, value.y, value.x));
  267.         }
  268.  
  269.         static inline void CvtToHalf(Vec2f16& v, const Vec2& value)
  270.         {
  271.                 static_assert(sizeof(Vec2f16) == sizeof(uint32_t), "32-bit store intended here for performance");
  272.                 const __m128 halfreg = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&value))); // Load X, Y, 0, 0
  273.                 const __m128i result = _mm_cvtps_ph(halfreg, 0);
  274.                 *reinterpret_cast<int*>(&v) = _mm_cvtsi128_si32(result); // _mm_storeu_si32 not supported on all compilers, this should (hopefully) be MOVD m32, xmm
  275.         }
  276.         #else
  277.         static inline __m128i float_to_half_SSE2(__m128 f)
  278.         {
  279.                 #define DECL_CONST4(name, val) static const CRY_ALIGN(16) uint name[4] = { (val), (val), (val), (val) }
  280.                 #define GET_CONSTI(name)       *(const __m128i*)&name
  281.                 #define GET_CONSTF(name)       *(const __m128*)&name
  282.  
  283.                 DECL_CONST4(mask_sign, 0x80000000u);
  284.                 DECL_CONST4(mask_round, ~0xfffu);
  285.                 DECL_CONST4(c_f32infty, 255 << 23);
  286.                 DECL_CONST4(c_magic, 15 << 23);
  287.                 DECL_CONST4(c_nanbit, 0x200);
  288.                 DECL_CONST4(c_infty_as_fp16, 0x7c00);
  289.                 DECL_CONST4(c_clamp, (31 << 23) - 0x1000);
  290.  
  291.                 __m128 msign = GET_CONSTF(mask_sign);
  292.                 __m128 justsign = _mm_and_ps(msign, f);
  293.                 __m128i f32infty = GET_CONSTI(c_f32infty);
  294.                 __m128 absf = _mm_xor_ps(f, justsign);
  295.                 __m128 mround = GET_CONSTF(mask_round);
  296.                 __m128i absf_int = _mm_castps_si128(absf); // pseudo-op, but val needs to be copied once so count as mov
  297.                 __m128i b_isnan = _mm_cmpgt_epi32(absf_int, f32infty);
  298.                 __m128i b_isnormal = _mm_cmpgt_epi32(f32infty, _mm_castps_si128(absf));
  299.                 __m128i nanbit = _mm_and_si128(b_isnan, GET_CONSTI(c_nanbit));
  300.                 __m128i inf_or_nan = _mm_or_si128(nanbit, GET_CONSTI(c_infty_as_fp16));
  301.  
  302.                 __m128 fnosticky = _mm_and_ps(absf, mround);
  303.                 __m128 scaled = _mm_mul_ps(fnosticky, GET_CONSTF(c_magic));
  304.                 __m128 clamped = _mm_min_ps(scaled, GET_CONSTF(c_clamp)); // logically, we want PMINSD on "biased", but this should gen better code
  305.                 __m128i biased = _mm_sub_epi32(_mm_castps_si128(clamped), _mm_castps_si128(mround));
  306.                 __m128i shifted = _mm_srli_epi32(biased, 13);
  307.                 __m128i normal = _mm_and_si128(shifted, b_isnormal);
  308.                 __m128i not_normal = _mm_andnot_si128(b_isnormal, inf_or_nan);
  309.                 __m128i joined = _mm_or_si128(normal, not_normal);
  310.  
  311.                 __m128i sign_shift = _mm_srli_epi32(_mm_castps_si128(justsign), 16);
  312.                 __m128i final = _mm_or_si128(joined, sign_shift);
  313.  
  314.                 // ~20 SSE2 ops
  315.                 return final;
  316.  
  317.                 #undef DECL_CONST4
  318.                 #undef GET_CONSTI
  319.                 #undef GET_CONSTF
  320.         }
  321.         static inline __m128i approx_float_to_half_SSE2(__m128 f)
  322.         {
  323.  
  324.                 #if defined(__GNUC__)
  325.                         #define DECL_CONST4(name, val) static const uint __attribute__((aligned(16))) name[4] = { (val), (val), (val), (val) }
  326.                 #else
  327.                         #define DECL_CONST4(name, val) static const __declspec(align(16)) uint name[4] = { (val), (val), (val), (val) }
  328.                 #endif
  329.                 #define GET_CONSTF(name)         *(const __m128*)&name
  330.  
  331.                 DECL_CONST4(mask_fabs, 0x7fffffffu);
  332.                 DECL_CONST4(c_f32infty, (255 << 23));
  333.                 DECL_CONST4(c_expinf, (255 ^ 31) << 23);
  334.                 DECL_CONST4(c_f16max, (127 + 16) << 23);
  335.                 DECL_CONST4(c_magic, 15 << 23);
  336.  
  337.                 __m128 mabs = GET_CONSTF(mask_fabs);
  338.                 __m128 fabs = _mm_and_ps(mabs, f);
  339.                 __m128 justsign = _mm_xor_ps(f, fabs);
  340.  
  341.                 __m128 f16max = GET_CONSTF(c_f16max);
  342.                 __m128 expinf = GET_CONSTF(c_expinf);
  343.                 __m128 infnancase = _mm_xor_ps(expinf, fabs);
  344.                 __m128 clamped = _mm_min_ps(f16max, fabs);
  345.                 __m128 b_notnormal = _mm_cmpnlt_ps(fabs, GET_CONSTF(c_f32infty));
  346.                 __m128 scaled = _mm_mul_ps(clamped, GET_CONSTF(c_magic));
  347.  
  348.                 __m128 merge1 = _mm_and_ps(infnancase, b_notnormal);
  349.                 __m128 merge2 = _mm_andnot_ps(b_notnormal, scaled);
  350.                 __m128 merged = _mm_or_ps(merge1, merge2);
  351.  
  352.                 __m128i shifted = _mm_srli_epi32(_mm_castps_si128(merged), 13);
  353.                 __m128i signshifted = _mm_srli_epi32(_mm_castps_si128(justsign), 16);
  354.                 __m128i final = _mm_or_si128(shifted, signshifted);
  355.  
  356.                 // ~15 SSE2 ops
  357.                 return final;
  358.  
  359.                 #undef DECL_CONST4
  360.                 #undef GET_CONSTF
  361.         }
  362.  
  363.         static inline void CvtToHalf(Vec3f16& v, __m128 value)
  364.         {
  365.                 __m128i result = approx_float_to_half_SSE2(value);
  366.                 v.x = (reinterpret_cast<int*>(&result))[0];
  367.                 v.y = (reinterpret_cast<int*>(&result))[1];
  368.                 v.z = (reinterpret_cast<int*>(&result))[2];
  369.         }
  370.  
  371.         static inline void CvtToHalf(Vec3f16& v, const Vec3& value)
  372.         {
  373.                 __m128 val128 = _mm_set_ps(0.f, value.z, value.y, value.x);
  374.                 __m128i result = approx_float_to_half_SSE2(val128);
  375.                 v.x = (reinterpret_cast<int*>(&result))[0];
  376.                 v.y = (reinterpret_cast<int*>(&result))[1];
  377.                 v.z = (reinterpret_cast<int*>(&result))[2];
  378.         }
  379.  
  380.         static inline void CvtToHalf(Vec2f16& v, const Vec2& value)
  381.         {
  382.                 __m128 val128 = _mm_set_ps(0.f, 0.f, value.y, value.x);
  383.                 __m128i result = approx_float_to_half_SSE2(val128);
  384.                 v.x = (reinterpret_cast<int*>(&result))[0];
  385.                 v.y = (reinterpret_cast<int*>(&result))[1];
  386.         }
  387.         #endif
  388. #endif
  389.  
  390. #define MMRM_USE_OPTIMIZED_LINESEG_SPHERE 1
  391. #define MMRM_USE_OPTIMIZED_POINT_LENSEG   1
  392. //----------------------------------------------------------------------------------
  393. //--- 0x00 = no intersection                               --------------------------
  394. //--- 0x01 = one intersection, lineseg has just an ENTRY point but no EXIT point (ls.end is inside the sphere)  --
  395. //--- 0x02 = one intersection, lineseg has just an EXIT point but no ENTRY point (ls.start is inside the sphere)  --
  396. //--- 0x03 = two intersections, lineseg has both ENTRY and EXIT points  --
  397. //----------------------------------------------------------------------------------
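// The segment is parameterised as ls.start + t * dir; substituting into the
// sphere equation yields a*t^2 + b*t + c = 0 with a = dir|dir,
// b = 2*(dir|(start - center)) and c = |start - center|^2 - radius^2, whose
// roots are the entry/exit parameters tested below.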
  398. static ILINE int _Lineseg_Sphere(const Lineseg& ls, const Sphere& s, Vec3& i0, Vec3& i1)
  399. {
  400. #if MMRM_USE_OPTIMIZED_LINESEG_SPHERE
  401.         Vec3 dir = (ls.end - ls.start);
  402.         float a = dir | dir;
  403.         const Vec3 lssc = ls.start - s.center;
  404.         int intersection = 0;
  405.         float b = (dir | (lssc)) * 2.0f;
  406.         float c = ((lssc) | (lssc)) - (s.radius * s.radius);
  407.         float desc = (b * b) - (4 * a * c);
  408.  
  409.         if (desc >= 0.f)
  410.         {
  411.                 float sqrtdesc = sqrt_tpl(desc);
  412.                 float r2a = 1.f / (2.0f * a);
  413.                 float lamba0 = (-b - sqrtdesc) * r2a;
  414.                 float lamba1 = (-b + sqrtdesc) * r2a;
  415.  
  416.                 if (lamba0 > 0.0f)
  417.                 {
  418.                         i0 = ls.start + ((ls.end - ls.start) * lamba0);
  419.                         //skip, if 1st cutting-point is "in front" of ls.end
  420.                         if (((i0 - ls.end) | dir) > 0) return 0;
  421.                         intersection = 0x01;
  422.                 }
  423.  
  424.                 if (lamba1 > 0.0f)
  425.                 {
  426.                         i1 = ls.start + ((ls.end - ls.start) * lamba1);
  427.                         //skip, if 2nd cutting-point is "in front" of ls.end (=ls.end is inside sphere)
  428.                         if (((i1 - ls.end) | dir) > 0) return intersection;
  429.                         intersection |= 0x02;
  430.                 }
  431.         }
  432.         return intersection;
  433. #else
  434.  
  435.         Vec3 dir = (ls.end - ls.start);
  436.         float a = dir | dir;
  437.         float b = (dir | (ls.start - s.center)) * 2.0f;
  438.         float c = ((ls.start - s.center) | (ls.start - s.center)) - (s.radius * s.radius);
  439.         float desc = (b * b) - (4 * a * c);
  440.  
  441.         unsigned char intersection = 0;
  442.         if (desc >= 0.0f)
  443.         {
  444.                 float lamba0 = (-b - sqrt_tpl(desc)) / (2.0f * a);
  445.                 if (lamba0 > 0.0f)
  446.                 {
  447.                         i0 = ls.start + ((ls.end - ls.start) * lamba0);
  448.                         //skip, if 1st cutting-point is "in front" of ls.end
  449.                         if (((i0 - ls.end) | dir) > 0) return 0;
  450.                         intersection = 0x01;
  451.                 }
  452.  
  453.                 float lamba1 = (-b + sqrt_tpl(desc)) / (2.0f * a);
  454.                 if (lamba1 > 0.0f)
  455.                 {
  456.                         i1 = ls.start + ((ls.end - ls.start) * lamba1);
  457.                         //skip, if 2nd cutting-point is "in front" of ls.end (=ls.end is inside sphere)
  458.                         if (((i1 - ls.end) | dir) > 0) return intersection;
  459.                         intersection |= 0x02;
  460.                 }
  461.         }
  462.         return intersection;
  463. #endif
  464. }
  465.  
  466. //----------------------------------------------------------------------------------
  467. /// Returns squared distance from a point to a line segment
  468. //----------------------------------------------------------------------------------
  469. static ILINE float _Point_LinesegSq(const Vec3& p, const Lineseg& lineseg)
  470. {
  471. #if MMRM_USE_OPTIMIZED_POINT_LENSEG
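        // Branchless variant: __fsel(x, a, b) returns a when x >= 0 and b otherwise,
        // so the projection parameter is clamped onto the segment without branching.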
  472.         Vec3 diff = p - lineseg.start;
  473.         Vec3 dir = lineseg.end - lineseg.start;
  474.         float fT = diff.Dot(dir);
  475.         fT = __fsel(fT, fT, 0.f);
  476.  
  477.         float fSqrLen = dir.len2();
  478.         float mask = fT - fSqrLen;
  479.         float mul0 = __fsel(mask, 1.f, 0.f);
  480.         float mul1 = __fsel(mask, 0.f, 1.f);
  481.  
  482.         mul1 /= __fsel(mask, 1.f, fSqrLen);
  483.         diff -= dir * (mul0 + mul1);
  484.  
  485.         return diff.len2();
  486. #else
  487.         Vec3 diff = p - lineseg.start;
  488.         Vec3 dir = lineseg.end - lineseg.start;
  489.         float fT = diff.Dot(dir);
  490.  
  491.         if (fT <= 0.0f)
  492.         {
  493.                 fT = 0.0f;
  494.         }
  495.         else
  496.         {
  497.                 float fSqrLen = dir.len2();
  498.                 if (fT >= fSqrLen)
  499.                 {
  500.                         fT = 1.0f;
  501.                         diff -= dir;
  502.                 }
  503.                 else
  504.                 {
  505.                         fT *= 1.f / (fSqrLen);
  506.                         diff -= fT * dir;
  507.                 }
  508.         }
  509.  
  510.         return diff.GetLengthSquared();
  511. #endif
  512. }
  513.  
  514. #undef MMRM_USE_OPTIMIZED_LINESEG_SPHERE
  515. #undef MMRM_USE_OPTIMIZED_POINT_LENSEG
  516.  
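// Bundled per-job context handed to the instance culling/update routines below.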
  517. struct SMMRMInstanceContext
  518. {
  519.         SMMRMInstance*      samples;
  520.         SMMRMSpineVtxBase*  spines;
  521.         SMMRMDeformVertex*  deform;
  522.         SMMRMGeometry*      geom;
  523.         const CCamera*      cam;
  524.         SMMRMUpdateContext* update;
  525.         SMergedRMChunk*     updateChunks;
  526.         primitives::sphere* colliders;
  527.         SMMRMProjectile*    projectiles;
  528.         Vec3*               wind;
  529.         int                 flags;
  530.         int                 ncolliders;
  531.         int                 nprojectiles;
  532.         float               maxViewDistSq;
  533.         float               lodRatioSq;
  534.         float               diameterSq;
  535.         float               dt, dtscale, abstime;
  536.         float               zRotation;
  537.         Vec3                rotationOrigin;
  538.         Vec3                min, max, centre;
  539.         size_t              amount;
  540.         int                 max_iter;
  541.         int                 use_spines;
  542.         int                 frame_count;
  543. };
  544.  
  545. ////////////////////////////////////////////////////////////////////////////////
  546. // Cull a set of instances against the frustum
  547. static inline size_t CullInstanceList(
  548.   SMMRMInstanceContext& context, SMMRMVisibleChunk* visChunks,
  549.   const Vec3& origin, const Vec3& rotationOrigin, float zRotation)
  550. {
  551.         MEMORY_SCOPE_CHECK_HEAP();
  552.         size_t numSamplesVisible = 0u;
  553.         const float maxViewDistSq = context.maxViewDistSq;
  554.         const float lodRatioSq = context.lodRatioSq;
  555.         const float diameterSq = context.diameterSq;
  556.         const float diameter = sqrt_tpl(diameterSq);
  557.         const Vec3 camPos = context.cam->GetPosition();
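        // context.flags packs the cull toggles as individual bits, a 2-bit forced
        // LOD at MMRM_LOD_SHIFT and a sample-reduction factor above
        // MMRM_SAMPLE_REDUCTION_SHIFT (applied as an index mask further down).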
  558.         const int cullFrustum = context.flags & MMRM_CULL_FRUSTUM;
  559.         const int cullDistance = context.flags & MMRM_CULL_DISTANCE;
  560.         const int cullLod = context.flags & MMRM_CULL_LOD;
  561.         const int forceLod = (context.flags >> MMRM_LOD_SHIFT) & 3;
  562.         const int sampleReduction = context.flags >> MMRM_SAMPLE_REDUCTION_SHIFT;
  563.         const float sampleOffset = 1.0f + (float) sampleReduction;
  564.         const float fExtents = c_MergedMeshesExtent;
  565.         SMMRMChunk* chunks = NULL;
  566.         mmrm_printf("culling %d samples \n", context.amount);
  567.         for (size_t j = 0; j < context.amount; ++j)
  568.         {
  569.                 SMMRMInstance& sample = context.samples[j];
  570.                 Vec3 sample_pos = ConvertInstanceAbsolute(sample, origin, rotationOrigin, zRotation, fExtents);
  571.                 // Note: use the diameter, not the radius for culling as it better describes
  572.                 // the maximum movement radius a simulated instance can make.
  573.                 const Sphere sampleSphere(sample_pos, diameter);
  574.                 const float distanceSq = (camPos - sample_pos).len2() - diameterSq;
  575.                 if (j & sampleReduction)
  576.                 {
  577.                         sample.lastLod = -1;
  578.                         continue;
  579.                 }
  580.                 if (cullDistance && distanceSq > maxViewDistSq * sampleOffset)
  581.                 {
  582.                         sample.lastLod = -1;
  583.                         continue;
  584.                 }
  585.                 if (cullFrustum && !context.cam->IsSphereVisible_F(sampleSphere))
  586.                 {
  587.                         sample.lastLod = -1;
  588.                         continue;
  589.                 }
  590.                 int nLodTmp = (int)(distanceSq * lodRatioSq / (max(20.0f * 20.f * diameterSq, 0.001f)));
  591.                 size_t nLod = (size_t) (cullLod ? forceLod : nLodTmp);
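                // Walk down from the requested LOD until one with authored chunks is found.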
  592.                 PREFAST_SUPPRESS_WARNING(6385)
  593.                 for (nLod = std::min(nLod, size_t(MAX_STATOBJ_LODS_NUM - 1)); !(chunks = context.geom->pChunks[nLod]); --nLod)
  594.                         ;
  595.                 if (!chunks)
  596.                 {
  597.                         sample.lastLod = -1;
  598.                         continue;
  599.                 }
  600.                 sample.lastLod = static_cast<int8>(nLod);
  601.                 PREFAST_SUPPRESS_WARNING(6385) for (size_t i = 0; i < context.geom->numChunks[nLod]; ++i)
  602.                 {
  603.                         mmrm_assert(chunks[i].nvertices);
  604.                         mmrm_assert(chunks[i].nindices);
  605.                         visChunks[i].vertices += chunks[i].nvertices;
  606.                         visChunks[i].indices += chunks[i].nindices;
  607.                         visChunks[i].matId = chunks[i].matId;
  608.                         mmrm_assert(visChunks[i].vertices);
  609.                         mmrm_assert(visChunks[i].indices);
  610.                 }
  611.                 ++numSamplesVisible;
  612.         }
  613.         mmrm_printf("%d samples visible\n", numSamplesVisible);
  614.         return numSamplesVisible;
  615. }
  616.  
  617. ////////////////////////////////////////////////////////////////////////////////
  618. // Update a set of vertices to be worldspace and merge a deformation stream into
  619. // them
  620. static inline void UpdateGeneral(
  621.   SVF_P3S_C4B_T2S* out
  622.   , SVF_P3F_C4B_T2F* in
  623.   , SMMRMDeformVertex* deform
  624.   , uint16* mapping
  625.   , Matrix34 tmat
  626.   , size_t count)
  627. {
  628.         MMRM_PROFILE_FUNCTION(gEnv->pSystem, PROFILE_3DENGINE);
  629.         for (size_t i = 0; i < count; ++i)
  630.         {
  631.                 out[i].xyz = tmat * deform[mapping[i]].pos[0];
  632.                 out[i].color = in[i].color;
  633.                 out[i].st = in[i].st;
  634.         }
  635. }
  636.  
  637. ////////////////////////////////////////////////////////////////////////////////
  638. // Update a set of vertices to be worldspace
  639. static inline void UpdateGeneral(
  640.   SVF_P3S_C4B_T2S* out
  641.   , SVF_P3F_C4B_T2F* in
  642.   , const Matrix34& wmat)
  643. {
  644.         MMRM_PROFILE_FUNCTION(gEnv->pSystem, PROFILE_3DENGINE);
  645.         out->xyz = wmat * in->xyz;
  646.         out->color = in->color;
  647.         out->st = in->st;
  648. }
  649. static inline void UpdateGeneral(
  650.   SVF_P3S_C4B_T2S* out
  651.   , SVF_P3F_C4B_T2F* in
  652.   , const Matrix34& wmat
  653.   , size_t count)
  654. {
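        // Transforms four vertices per iteration; the tail loop below handles the
        // remaining count % 4 vertices.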
  655.         for (size_t i = 0; i < (count & ~3); i += 4)
  656.         {
  657.                 out[i + 0].xyz = wmat * in[i + 0].xyz;
  658.                 out[i + 0].color = in[i + 0].color;
  659.                 out[i + 0].st = in[i + 0].st;
  660.                 out[i + 1].xyz = wmat * in[i + 1].xyz;
  661.                 out[i + 1].color = in[i + 1].color;
  662.                 out[i + 1].st = in[i + 1].st;
  663.                 out[i + 2].xyz = wmat * in[i + 2].xyz;
  664.                 out[i + 2].color = in[i + 2].color;
  665.                 out[i + 2].st = in[i + 2].st;
  666.                 out[i + 3].xyz = wmat * in[i + 3].xyz;
  667.                 out[i + 3].color = in[i + 3].color;
  668.                 out[i + 3].st = in[i + 3].st;
  669.         }
  670.         for (size_t i = (count & ~3); i < count; ++i)
  671.         {
  672.                 out[i].xyz = wmat * in[i].xyz;
  673.                 out[i].color = in[i].color;
  674.                 out[i].st = in[i].st;
  675.         }
  676. }
  677.  
  678. ////////////////////////////////////////////////////////////////////////////////
  679. // Skin a set of normals against a set of bones
  680. static inline void UpdateNormals(
  681.   Vec3f16* out
  682.   , Vec3* in
  683.   , size_t count)
  684. {
  685.         MMRM_PROFILE_FUNCTION(gEnv->pSystem, PROFILE_3DENGINE);
  686.         for (size_t i = 0; i < (count & ~3); i += 4)
  687.         {
  688.                 out[i + 0] = in[i + 0];
  689.                 out[i + 1] = in[i + 1];
  690.                 out[i + 2] = in[i + 2];
  691.                 out[i + 3] = in[i + 3];
  692.         }
  693.         for (size_t i = (count & ~3); i < count; ++i)
  694.         {
  695.                 out[i] = in[i];
  696.         }
  697. }
  698.  
  699. static inline void UpdateNormals(
  700.   Vec3f16* out
  701.   , Vec3* in
  702.   , DualQuatA* bones
  703.   , SMMRMBoneMapping* weights
  704.   , size_t maxSpines
  705.   , size_t count)
  706. {
  707.         MMRM_PROFILE_FUNCTION(gEnv->pSystem, PROFILE_3DENGINE);
  708. #if MMRM_USE_VECTORIZED_SSE_INSTRUCTIONS
  709.         using namespace NVMath;
  710.         DualQuatA wq[4];
  711.         __m128 _wq[4 * 2];
  712.         __m128 nq_len[4];
  713.         __m128 vweights[4];
  714.         __m128 vw[4];
  715.         for (size_t i = 0; i < (count & ~3); i += 4)
  716.         {
  717.                 _wq[0] = _wq[1] = _wq[2] = _wq[3] = _wq[4] = _wq[5] = _wq[6] = _wq[7] = _mm_xor_ps(_wq[0], _wq[0]);
  718.                 vweights[0] = _mm_load_ps(&weights[i + 0].weights[0]);
  719.                 vweights[1] = _mm_load_ps(&weights[i + 1].weights[0]);
  720.                 vweights[2] = _mm_load_ps(&weights[i + 2].weights[0]);
  721.                 vweights[3] = _mm_load_ps(&weights[i + 3].weights[0]);
  722.                 switch (maxSpines)
  723.                 {
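                // Intentional fall-through: starting at the highest used spine, each case
                // accumulates one more weighted bone dual quaternion into _wq.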
  724.                 case 4:
  725.                         vw[0] = Swizzle<wwww>(vweights[0]);
  726.                         vw[1] = Swizzle<wwww>(vweights[1]);
  727.                         vw[2] = Swizzle<wwww>(vweights[2]);
  728.                         vw[3] = Swizzle<wwww>(vweights[3]);
  729.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[3]].nq), vw[0]));
  730.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[3]].dq), vw[0]));
  731.                         _wq[1 * 2 + 0] = _mm_add_ps(_wq[1 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[3]].nq), vw[1]));
  732.                         _wq[1 * 2 + 1] = _mm_add_ps(_wq[1 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[3]].dq), vw[1]));
  733.                         _wq[2 * 2 + 0] = _mm_add_ps(_wq[2 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[3]].nq), vw[2]));
  734.                         _wq[2 * 2 + 1] = _mm_add_ps(_wq[2 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[3]].dq), vw[2]));
  735.                         _wq[3 * 2 + 0] = _mm_add_ps(_wq[3 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[3]].nq), vw[3]));
  736.                         _wq[3 * 2 + 1] = _mm_add_ps(_wq[3 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[3]].dq), vw[3]));
  737.                 case 3:
  738.                         vw[0] = Swizzle<zzzz>(vweights[0]);
  739.                         vw[1] = Swizzle<zzzz>(vweights[1]);
  740.                         vw[2] = Swizzle<zzzz>(vweights[2]);
  741.                         vw[3] = Swizzle<zzzz>(vweights[3]);
  742.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[2]].nq), vw[0]));
  743.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[2]].dq), vw[0]));
  744.                         _wq[1 * 2 + 0] = _mm_add_ps(_wq[1 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[2]].nq), vw[1]));
  745.                         _wq[1 * 2 + 1] = _mm_add_ps(_wq[1 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[2]].dq), vw[1]));
  746.                         _wq[2 * 2 + 0] = _mm_add_ps(_wq[2 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[2]].nq), vw[2]));
  747.                         _wq[2 * 2 + 1] = _mm_add_ps(_wq[2 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[2]].dq), vw[2]));
  748.                         _wq[3 * 2 + 0] = _mm_add_ps(_wq[3 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[2]].nq), vw[3]));
  749.                         _wq[3 * 2 + 1] = _mm_add_ps(_wq[3 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[2]].dq), vw[3]));
  750.                 case 2:
  751.                         vw[0] = Swizzle<yyyy>(vweights[0]);
  752.                         vw[1] = Swizzle<yyyy>(vweights[1]);
  753.                         vw[2] = Swizzle<yyyy>(vweights[2]);
  754.                         vw[3] = Swizzle<yyyy>(vweights[3]);
  755.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[1]].nq), vw[0]));
  756.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[1]].dq), vw[0]));
  757.                         _wq[1 * 2 + 0] = _mm_add_ps(_wq[1 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[1]].nq), vw[1]));
  758.                         _wq[1 * 2 + 1] = _mm_add_ps(_wq[1 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[1]].dq), vw[1]));
  759.                         _wq[2 * 2 + 0] = _mm_add_ps(_wq[2 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[1]].nq), vw[2]));
  760.                         _wq[2 * 2 + 1] = _mm_add_ps(_wq[2 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[1]].dq), vw[2]));
  761.                         _wq[3 * 2 + 0] = _mm_add_ps(_wq[3 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[1]].nq), vw[3]));
  762.                         _wq[3 * 2 + 1] = _mm_add_ps(_wq[3 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[1]].dq), vw[3]));
  763.                 case 1:
  764.                         vw[0] = Swizzle<xxxx>(vweights[0]);
  765.                         vw[1] = Swizzle<xxxx>(vweights[1]);
  766.                         vw[2] = Swizzle<xxxx>(vweights[2]);
  767.                         vw[3] = Swizzle<xxxx>(vweights[3]);
  768.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[0]].nq), vw[0]));
  769.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[0]].dq), vw[0]));
  770.                         _wq[1 * 2 + 0] = _mm_add_ps(_wq[1 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[0]].nq), vw[1]));
  771.                         _wq[1 * 2 + 1] = _mm_add_ps(_wq[1 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[0]].dq), vw[1]));
  772.                         _wq[2 * 2 + 0] = _mm_add_ps(_wq[2 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[0]].nq), vw[2]));
  773.                         _wq[2 * 2 + 1] = _mm_add_ps(_wq[2 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[0]].dq), vw[2]));
  774.                         _wq[3 * 2 + 0] = _mm_add_ps(_wq[3 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[0]].nq), vw[3]));
  775.                         _wq[3 * 2 + 1] = _mm_add_ps(_wq[3 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[0]].dq), vw[3]));
  776.                 }
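                // Renormalize each blended dual quaternion by the approximate inverse
                // length of its real part, then rotate the input normals by it.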
  777.                 nq_len[0] = _mm_rsqrt_ps(DotProduct4(_wq[0 * 2 + 0], _wq[0 * 2 + 0]));
  778.                 nq_len[1] = _mm_rsqrt_ps(DotProduct4(_wq[1 * 2 + 0], _wq[1 * 2 + 0]));
  779.                 nq_len[2] = _mm_rsqrt_ps(DotProduct4(_wq[2 * 2 + 0], _wq[2 * 2 + 0]));
  780.                 nq_len[3] = _mm_rsqrt_ps(DotProduct4(_wq[3 * 2 + 0], _wq[3 * 2 + 0]));
  781.  
  782.                 _mm_store_ps((float*)&wq[0].nq, _mm_mul_ps(_wq[0 * 2 + 0], nq_len[0]));
  783.                 _mm_store_ps((float*)&wq[0].dq, _mm_mul_ps(_wq[0 * 2 + 1], nq_len[0]));
  784.                 _mm_store_ps((float*)&wq[1].nq, _mm_mul_ps(_wq[1 * 2 + 0], nq_len[1]));
  785.                 _mm_store_ps((float*)&wq[1].dq, _mm_mul_ps(_wq[1 * 2 + 1], nq_len[1]));
  786.                 _mm_store_ps((float*)&wq[2].nq, _mm_mul_ps(_wq[2 * 2 + 0], nq_len[2]));
  787.                 _mm_store_ps((float*)&wq[2].dq, _mm_mul_ps(_wq[2 * 2 + 1], nq_len[2]));
  788.                 _mm_store_ps((float*)&wq[3].nq, _mm_mul_ps(_wq[3 * 2 + 0], nq_len[3]));
  789.                 _mm_store_ps((float*)&wq[3].dq, _mm_mul_ps(_wq[3 * 2 + 1], nq_len[3]));
  790.  
  791.                 out[i + 0] = wq[0].nq * in[i + 0];
  792.                 out[i + 1] = wq[1].nq * in[i + 1];
  793.                 out[i + 2] = wq[2].nq * in[i + 2];
  794.                 out[i + 3] = wq[3].nq * in[i + 3];
  795.         }
  796.         for (size_t i = (count & ~3); i < count; ++i)
  797.         {
  798.                 _wq[0] = _wq[1] = _mm_xor_ps(_wq[0], _wq[0]);
  799.                 vweights[0] = _mm_load_ps(&weights[i + 0].weights[0]);
  800.                 switch (maxSpines)
  801.                 {
  802.                 case 4:
  803.                         vw[0] = Swizzle<wwww>(vweights[0]);
  804.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[3]].nq), vw[0]));
  805.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[3]].dq), vw[0]));
  806.                 case 3:
  807.                         vw[0] = Swizzle<zzzz>(vweights[0]);
  808.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[2]].nq), vw[0]));
  809.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[2]].dq), vw[0]));
  810.                 case 2:
  811.                         vw[0] = Swizzle<yyyy>(vweights[0]);
  812.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[1]].nq), vw[0]));
  813.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[1]].dq), vw[0]));
  814.                 case 1:
  815.                         vw[0] = Swizzle<xxxx>(vweights[0]);
  816.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[0]].nq), vw[0]));
  817.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[0]].dq), vw[0]));
  818.                 }
  819.                 nq_len[0] = _mm_rsqrt_ps(DotProduct4(_wq[0 * 2 + 0], _wq[0 * 2 + 0]));
  820.                 _mm_store_ps((float*)&wq[0].nq, _mm_mul_ps(_wq[0 * 2 + 0], nq_len[0]));
  821.                 _mm_store_ps((float*)&wq[0].dq, _mm_mul_ps(_wq[0 * 2 + 1], nq_len[0]));
  822.                 out[i + 0] = wq[0].nq * in[i + 0];
  823.         }
  824. #else
  825.         DualQuatA wq[4];
  826.         float l[4];
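        // Scalar fallback: byte weights (0..255) are normalized here and the blended
        // dual quaternion's real part is renormalized before rotating the normal.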
  827.         for (size_t i = 0; i < (count & ~3); i += 4)
  828.         {
  829.                 wq[0] = wq[1] = wq[2] = wq[3] = DualQuatA(ZERO);
  830.                 switch (maxSpines)
  831.                 {
  832.                 case 4:
  833.                         wq[0] += bones[weights[i + 0].boneIds[3]] * (weights[i + 0].weights[3] / 255.f);
  834.                         wq[1] += bones[weights[i + 1].boneIds[3]] * (weights[i + 1].weights[3] / 255.f);
  835.                         wq[2] += bones[weights[i + 2].boneIds[3]] * (weights[i + 2].weights[3] / 255.f);
  836.                         wq[3] += bones[weights[i + 3].boneIds[3]] * (weights[i + 3].weights[3] / 255.f);
  837.                 case 3:
  838.                         wq[0] += bones[weights[i + 0].boneIds[2]] * (weights[i + 0].weights[2] / 255.f);
  839.                         wq[1] += bones[weights[i + 1].boneIds[2]] * (weights[i + 1].weights[2] / 255.f);
  840.                         wq[2] += bones[weights[i + 2].boneIds[2]] * (weights[i + 2].weights[2] / 255.f);
  841.                         wq[3] += bones[weights[i + 3].boneIds[2]] * (weights[i + 3].weights[2] / 255.f);
  842.                 case 2:
  843.                         wq[0] += bones[weights[i + 0].boneIds[1]] * (weights[i + 0].weights[1] / 255.f);
  844.                         wq[1] += bones[weights[i + 1].boneIds[1]] * (weights[i + 1].weights[1] / 255.f);
  845.                         wq[2] += bones[weights[i + 2].boneIds[1]] * (weights[i + 2].weights[1] / 255.f);
  846.                         wq[3] += bones[weights[i + 3].boneIds[1]] * (weights[i + 3].weights[1] / 255.f);
  847.                 case 1:
  848.                         wq[0] += bones[weights[i + 0].boneIds[0]] * (weights[i + 0].weights[0] / 255.f);
  849.                         wq[1] += bones[weights[i + 1].boneIds[0]] * (weights[i + 1].weights[0] / 255.f);
  850.                         wq[2] += bones[weights[i + 2].boneIds[0]] * (weights[i + 2].weights[0] / 255.f);
  851.                         wq[3] += bones[weights[i + 3].boneIds[0]] * (weights[i + 3].weights[0] / 255.f);
  852.                 }
  853.  
  854.                 l[0] = wq[0].nq.GetLength();
  855.                 l[1] = wq[1].nq.GetLength();
  856.                 l[2] = wq[2].nq.GetLength();
  857.                 l[3] = wq[3].nq.GetLength();
  858.  
  859.                 l[0] = l[0] > 0.f ? 1.f / (l[0]) : l[0];
  860.                 l[1] = l[1] > 0.f ? 1.f / (l[1]) : l[1];
  861.                 l[2] = l[2] > 0.f ? 1.f / (l[2]) : l[2];
  862.                 l[3] = l[3] > 0.f ? 1.f / (l[3]) : l[3];
  863.  
  864.                 wq[0].nq *= l[0];
  865.                 wq[1].nq *= l[1];
  866.                 wq[2].nq *= l[2];
  867.                 wq[3].nq *= l[3];
  868.  
  869.                 out[i + 0] = wq[0].nq * in[i + 0];
  870.                 out[i + 1] = wq[1].nq * in[i + 1];
  871.                 out[i + 2] = wq[2].nq * in[i + 2];
  872.                 out[i + 3] = wq[3].nq * in[i + 3];
  873.         }
  874.         for (size_t i = (count & ~3); i < count; ++i)
  875.         {
  876.                 wq[0] = DualQuatA(ZERO);
  877.                 switch (maxSpines)
  878.                 {
  879.                 case 4:
  880.                         wq[0] += bones[weights[i + 0].boneIds[3]] * (weights[i + 0].weights[3] / 255.f);
  881.                 case 3:
  882.                         wq[0] += bones[weights[i + 0].boneIds[2]] * (weights[i + 0].weights[2] / 255.f);
  883.                 case 2:
  884.                         wq[0] += bones[weights[i + 0].boneIds[1]] * (weights[i + 0].weights[1] / 255.f);
  885.                 case 1:
  886.                         wq[0] += bones[weights[i + 0].boneIds[0]] * (weights[i + 0].weights[0] / 255.f);
  887.                 }
  888.                 l[0] = wq[0].nq.GetLength();
  889.                 l[0] = l[0] > 0.f ? 1.f / (l[0]) : l[0];
  890.                 wq[0].nq *= l[0];
  891.                 out[i + 0] = wq[0].nq * in[i + 0];
  892.         }
  893. #endif
  894. }
  895.  
  896. ////////////////////////////////////////////////////////////////////////////////
  897. // Skin a set of vertices against a set of bones
  898. static inline void UpdateGeneral(
  899.   SVF_P3S_C4B_T2S* out
  900.   , SVF_P3F_C4B_T2F* in
  901.   , DualQuatA* bones
  902.   , SMMRMBoneMapping* weights
  903.   , const float fScale
  904.   , size_t maxSpines)
  905. {
  906.         MMRM_PROFILE_FUNCTION(gEnv->pSystem, PROFILE_3DENGINE);
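        // Blends the scaled input position by up to four weighted bone transforms
        // (intentional switch fall-through), then copies colour and UVs unchanged.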
  907.         Vec3 vpos = Vec3(0, 0, 0);
  908.         const Vec3 xyz = in->xyz * fScale;
  909.         switch (maxSpines)
  910.         {
  911.         case 4:
  912.                 vpos += bones[weights->boneIds[3]] * xyz * (weights->weights[3] / 255.f);
  913.         case 3:
  914.                 vpos += bones[weights->boneIds[2]] * xyz * (weights->weights[2] / 255.f);
  915.         case 2:
  916.                 vpos += bones[weights->boneIds[1]] * xyz * (weights->weights[1] / 255.f);
  917.         case 1:
  918.                 vpos += bones[weights->boneIds[0]] * xyz * (weights->weights[0] / 255.f);
  919.         }
  920.         out->xyz = vpos;
  921.         out->color = in->color;
  922.         out->st = in->st;
  923. }
  924. static inline void UpdateGeneral(
  925.   SVF_P3S_C4B_T2S* out
  926.   , SVF_P3F_C4B_T2F* in
  927.   , DualQuatA* bones
  928.   , SMMRMBoneMapping* weights
  929.   , const float fScale
  930.   , size_t maxSpines
  931.   , size_t count)
  932. {
  933.         MMRM_PROFILE_FUNCTION(gEnv->pSystem, PROFILE_3DENGINE);
  934. #if MMRM_USE_VECTORIZED_SSE_INSTRUCTIONS
  935.         using namespace NVMath;
  936.         DualQuatA wq[4];
  937.         __m128 _wq[4 * 2];
  938.         __m128 nq_len[4];
  939.         __m128 vweights[4];
  940.         __m128 vw[4];
  941.         #if MMRM_UNROLL_GEOMETRY_BAKING_LOOPS
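        // With MMRM_UNROLL_GEOMETRY_BAKING_LOOPS the loop below bakes four vertices
        // per iteration and a tail loop covers the remainder; otherwise a single
        // per-vertex loop is compiled in (see the #else further down).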
  942.         for (size_t i = 0; i < (count & ~3); i += 4)
  943.         {
  944.                 _wq[0] = _wq[1] = _wq[2] = _wq[3] = _wq[4] = _wq[5] = _wq[6] = _wq[7] = _mm_xor_ps(_wq[0], _wq[0]);
  945.                 vweights[0] = _mm_load_ps(&weights[i + 0].weights[0]);
  946.                 vweights[1] = _mm_load_ps(&weights[i + 1].weights[0]);
  947.                 vweights[2] = _mm_load_ps(&weights[i + 2].weights[0]);
  948.                 vweights[3] = _mm_load_ps(&weights[i + 3].weights[0]);
  949.                 switch (maxSpines)
  950.                 {
  951.                 case 4:
  952.                         vw[0] = Swizzle<wwww>(vweights[0]);
  953.                         vw[1] = Swizzle<wwww>(vweights[1]);
  954.                         vw[2] = Swizzle<wwww>(vweights[2]);
  955.                         vw[3] = Swizzle<wwww>(vweights[3]);
  956.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[3]].nq), vw[0]));
  957.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[3]].dq), vw[0]));
  958.                         _wq[1 * 2 + 0] = _mm_add_ps(_wq[1 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[3]].nq), vw[1]));
  959.                         _wq[1 * 2 + 1] = _mm_add_ps(_wq[1 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[3]].dq), vw[1]));
  960.                         _wq[2 * 2 + 0] = _mm_add_ps(_wq[2 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[3]].nq), vw[2]));
  961.                         _wq[2 * 2 + 1] = _mm_add_ps(_wq[2 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[3]].dq), vw[2]));
  962.                         _wq[3 * 2 + 0] = _mm_add_ps(_wq[3 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[3]].nq), vw[3]));
  963.                         _wq[3 * 2 + 1] = _mm_add_ps(_wq[3 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[3]].dq), vw[3]));
  964.                 case 3:
  965.                         vw[0] = Swizzle<zzzz>(vweights[0]);
  966.                         vw[1] = Swizzle<zzzz>(vweights[1]);
  967.                         vw[2] = Swizzle<zzzz>(vweights[2]);
  968.                         vw[3] = Swizzle<zzzz>(vweights[3]);
  969.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[2]].nq), vw[0]));
  970.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[2]].dq), vw[0]));
  971.                         _wq[1 * 2 + 0] = _mm_add_ps(_wq[1 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[2]].nq), vw[1]));
  972.                         _wq[1 * 2 + 1] = _mm_add_ps(_wq[1 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[2]].dq), vw[1]));
  973.                         _wq[2 * 2 + 0] = _mm_add_ps(_wq[2 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[2]].nq), vw[2]));
  974.                         _wq[2 * 2 + 1] = _mm_add_ps(_wq[2 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[2]].dq), vw[2]));
  975.                         _wq[3 * 2 + 0] = _mm_add_ps(_wq[3 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[2]].nq), vw[3]));
  976.                         _wq[3 * 2 + 1] = _mm_add_ps(_wq[3 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[2]].dq), vw[3]));
  977.                 case 2:
  978.                         vw[0] = Swizzle<yyyy>(vweights[0]);
  979.                         vw[1] = Swizzle<yyyy>(vweights[1]);
  980.                         vw[2] = Swizzle<yyyy>(vweights[2]);
  981.                         vw[3] = Swizzle<yyyy>(vweights[3]);
  982.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[1]].nq), vw[0]));
  983.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[1]].dq), vw[0]));
  984.                         _wq[1 * 2 + 0] = _mm_add_ps(_wq[1 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[1]].nq), vw[1]));
  985.                         _wq[1 * 2 + 1] = _mm_add_ps(_wq[1 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[1]].dq), vw[1]));
  986.                         _wq[2 * 2 + 0] = _mm_add_ps(_wq[2 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[1]].nq), vw[2]));
  987.                         _wq[2 * 2 + 1] = _mm_add_ps(_wq[2 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[1]].dq), vw[2]));
  988.                         _wq[3 * 2 + 0] = _mm_add_ps(_wq[3 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[1]].nq), vw[3]));
  989.                         _wq[3 * 2 + 1] = _mm_add_ps(_wq[3 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[1]].dq), vw[3]));
  990.                 case 1:
  991.                         vw[0] = Swizzle<xxxx>(vweights[0]);
  992.                         vw[1] = Swizzle<xxxx>(vweights[1]);
  993.                         vw[2] = Swizzle<xxxx>(vweights[2]);
  994.                         vw[3] = Swizzle<xxxx>(vweights[3]);
  995.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[0]].nq), vw[0]));
  996.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[0]].dq), vw[0]));
  997.                         _wq[1 * 2 + 0] = _mm_add_ps(_wq[1 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[0]].nq), vw[1]));
  998.                         _wq[1 * 2 + 1] = _mm_add_ps(_wq[1 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[0]].dq), vw[1]));
  999.                         _wq[2 * 2 + 0] = _mm_add_ps(_wq[2 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[0]].nq), vw[2]));
  1000.                         _wq[2 * 2 + 1] = _mm_add_ps(_wq[2 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[0]].dq), vw[2]));
  1001.                         _wq[3 * 2 + 0] = _mm_add_ps(_wq[3 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[0]].nq), vw[3]));
  1002.                         _wq[3 * 2 + 1] = _mm_add_ps(_wq[3 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[0]].dq), vw[3]));
  1003.                 }
  1004.                 nq_len[0] = _mm_rsqrt_ps(DotProduct4(_wq[0 * 2 + 0], _wq[0 * 2 + 0]));
  1005.                 nq_len[1] = _mm_rsqrt_ps(DotProduct4(_wq[1 * 2 + 0], _wq[1 * 2 + 0]));
  1006.                 nq_len[2] = _mm_rsqrt_ps(DotProduct4(_wq[2 * 2 + 0], _wq[2 * 2 + 0]));
  1007.                 nq_len[3] = _mm_rsqrt_ps(DotProduct4(_wq[3 * 2 + 0], _wq[3 * 2 + 0]));
  1008.  
  1009.                 _mm_store_ps((float*)&wq[0].nq, _mm_mul_ps(_wq[0 * 2 + 0], nq_len[0]));
  1010.                 _mm_store_ps((float*)&wq[0].dq, _mm_mul_ps(_wq[0 * 2 + 1], nq_len[0]));
  1011.                 _mm_store_ps((float*)&wq[1].nq, _mm_mul_ps(_wq[1 * 2 + 0], nq_len[1]));
  1012.                 _mm_store_ps((float*)&wq[1].dq, _mm_mul_ps(_wq[1 * 2 + 1], nq_len[1]));
  1013.                 _mm_store_ps((float*)&wq[2].nq, _mm_mul_ps(_wq[2 * 2 + 0], nq_len[2]));
  1014.                 _mm_store_ps((float*)&wq[2].dq, _mm_mul_ps(_wq[2 * 2 + 1], nq_len[2]));
  1015.                 _mm_store_ps((float*)&wq[3].nq, _mm_mul_ps(_wq[3 * 2 + 0], nq_len[3]));
  1016.                 _mm_store_ps((float*)&wq[3].dq, _mm_mul_ps(_wq[3 * 2 + 1], nq_len[3]));
  1017.  
  1018.                 out[i + 0].xyz = wq[0] * (in[i + 0].xyz * fScale);
  1019.                 out[i + 1].xyz = wq[1] * (in[i + 1].xyz * fScale);
  1020.                 out[i + 2].xyz = wq[2] * (in[i + 2].xyz * fScale);
  1021.                 out[i + 3].xyz = wq[3] * (in[i + 3].xyz * fScale);
  1022.  
  1023.                 out[i + 0].color = in[i + 0].color;
  1024.                 out[i + 0].st = in[i + 0].st;
  1025.  
  1026.                 out[i + 1].color = in[i + 1].color;
  1027.                 out[i + 1].st = in[i + 1].st;
  1028.  
  1029.                 out[i + 2].color = in[i + 2].color;
  1030.                 out[i + 2].st = in[i + 2].st;
  1031.  
  1032.                 out[i + 3].color = in[i + 3].color;
  1033.                 out[i + 3].st = in[i + 3].st;
  1034.         }
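        // Tail loop: skin the remaining (count & 3) vertices one at a time.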
  1035.         for (size_t i = (count & ~3); i < count; ++i)
  1036.         #else // MMRM_UNROLL_GEOMETRY_BAKING_LOOPS
  1037.         for (size_t i = 0; i < count; ++i)
  1038.         #endif
  1039.         {
  1040.                 _wq[0] = _wq[1] = _mm_setzero_ps();
  1041.                 vweights[0] = _mm_load_ps(&weights[i + 0].weights[0]);
  1042.                 switch (maxSpines)
  1043.                 {
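                // Note: the cases below fall through intentionally, accumulating the weighted
                // contribution of every bone from index maxSpines-1 down to 0.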
  1044.                 case 4:
  1045.                         vw[0] = Swizzle<wwww>(vweights[0]);
  1046.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[3]].nq), vw[0]));
  1047.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[3]].dq), vw[0]));
  1048.                 case 3:
  1049.                         vw[0] = Swizzle<zzzz>(vweights[0]);
  1050.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[2]].nq), vw[0]));
  1051.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[2]].dq), vw[0]));
  1052.                 case 2:
  1053.                         vw[0] = Swizzle<yyyy>(vweights[0]);
  1054.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[1]].nq), vw[0]));
  1055.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[1]].dq), vw[0]));
  1056.                 case 1:
  1057.                         vw[0] = Swizzle<xxxx>(vweights[0]);
  1058.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[0]].nq), vw[0]));
  1059.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[0]].dq), vw[0]));
  1060.                 }
  1061.                 nq_len[0] = _mm_rsqrt_ps(DotProduct4(_wq[0 * 2 + 0], _wq[0 * 2 + 0]));
  1062.                 _mm_store_ps((float*)&wq[0].nq, _mm_mul_ps(_wq[0 * 2 + 0], nq_len[0]));
  1063.                 _mm_store_ps((float*)&wq[0].dq, _mm_mul_ps(_wq[0 * 2 + 1], nq_len[0]));
  1064.                 out[i].xyz = wq[0] * (in[i].xyz * fScale);
  1065.                 out[i].color = in[i].color;
  1066.                 out[i].st = in[i].st;
  1067.         }
  1068. #else
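        // Scalar fallback (no SSE): the same blend -> renormalize -> transform sequence,
        // written with the CryMath DualQuat operators.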
  1069.         DualQuatA wq[4];
  1070.         float l[4];
  1071.         for (size_t i = 0; i < (count & ~3); i += 4)
  1072.         {
  1073.                 wq[0] = wq[1] = wq[2] = wq[3] = DualQuatA(ZERO);
  1074.                 switch (maxSpines)
  1075.                 {
  1076.                 case 4:
  1077.                         wq[0] += bones[weights[i + 0].boneIds[3]] * (weights[i + 0].weights[3] / 255.f);
  1078.                         wq[1] += bones[weights[i + 1].boneIds[3]] * (weights[i + 1].weights[3] / 255.f);
  1079.                         wq[2] += bones[weights[i + 2].boneIds[3]] * (weights[i + 2].weights[3] / 255.f);
  1080.                         wq[3] += bones[weights[i + 3].boneIds[3]] * (weights[i + 3].weights[3] / 255.f);
  1081.                 case 3:
  1082.                         wq[0] += bones[weights[i + 0].boneIds[2]] * (weights[i + 0].weights[2] / 255.f);
  1083.                         wq[1] += bones[weights[i + 1].boneIds[2]] * (weights[i + 1].weights[2] / 255.f);
  1084.                         wq[2] += bones[weights[i + 2].boneIds[2]] * (weights[i + 2].weights[2] / 255.f);
  1085.                         wq[3] += bones[weights[i + 3].boneIds[2]] * (weights[i + 3].weights[2] / 255.f);
  1086.                 case 2:
  1087.                         wq[0] += bones[weights[i + 0].boneIds[1]] * (weights[i + 0].weights[1] / 255.f);
  1088.                         wq[1] += bones[weights[i + 1].boneIds[1]] * (weights[i + 1].weights[1] / 255.f);
  1089.                         wq[2] += bones[weights[i + 2].boneIds[1]] * (weights[i + 2].weights[1] / 255.f);
  1090.                         wq[3] += bones[weights[i + 3].boneIds[1]] * (weights[i + 3].weights[1] / 255.f);
  1091.                 case 1:
  1092.                         wq[0] += bones[weights[i + 0].boneIds[0]] * (weights[i + 0].weights[0] / 255.f);
  1093.                         wq[1] += bones[weights[i + 1].boneIds[0]] * (weights[i + 1].weights[0] / 255.f);
  1094.                         wq[2] += bones[weights[i + 2].boneIds[0]] * (weights[i + 2].weights[0] / 255.f);
  1095.                         wq[3] += bones[weights[i + 3].boneIds[0]] * (weights[i + 3].weights[0] / 255.f);
  1096.                 }
  1097.  
  1098.                 l[0] = wq[0].nq.GetLength();
  1099.                 l[1] = wq[1].nq.GetLength();
  1100.                 l[2] = wq[2].nq.GetLength();
  1101.                 l[3] = wq[3].nq.GetLength();
  1102.  
  1103.                 l[0] = l[0] > 0.f ? 1.f / (l[0]) : l[0];
  1104.                 l[1] = l[1] > 0.f ? 1.f / (l[1]) : l[1];
  1105.                 l[2] = l[2] > 0.f ? 1.f / (l[2]) : l[2];
  1106.                 l[3] = l[3] > 0.f ? 1.f / (l[3]) : l[3];
  1107.  
  1108.                 wq[0].nq *= l[0];
  1109.                 wq[0].dq *= l[0];
  1110.                 wq[1].nq *= l[1];
  1111.                 wq[1].dq *= l[1];
  1112.                 wq[2].nq *= l[2];
  1113.                 wq[2].dq *= l[2];
  1114.                 wq[3].nq *= l[3];
  1115.                 wq[3].dq *= l[3];
  1116.  
  1117.                 out[i + 0].xyz = wq[0] * (in[i + 0].xyz * fScale);
  1118.                 out[i + 1].xyz = wq[1] * (in[i + 1].xyz * fScale);
  1119.                 out[i + 2].xyz = wq[2] * (in[i + 2].xyz * fScale);
  1120.                 out[i + 3].xyz = wq[3] * (in[i + 3].xyz * fScale);
  1121.  
  1122.                 out[i + 0].color = in[i + 0].color;
  1123.                 out[i + 0].st = in[i + 0].st;
  1124.  
  1125.                 out[i + 1].color = in[i + 1].color;
  1126.                 out[i + 1].st = in[i + 1].st;
  1127.  
  1128.                 out[i + 2].color = in[i + 2].color;
  1129.                 out[i + 2].st = in[i + 2].st;
  1130.  
  1131.                 out[i + 3].color = in[i + 3].color;
  1132.                 out[i + 3].st = in[i + 3].st;
  1133.  
  1134.         }
  1135.         for (size_t i = (count & ~3); i < count; ++i)
  1136.         {
  1137.                 wq[0] = DualQuatA(ZERO);
  1138.                 switch (maxSpines)
  1139.                 {
  1140.                 case 4:
  1141.                         wq[0] += bones[weights[i].boneIds[3]] * (weights[i].weights[3] / 255.f);
  1142.                 case 3:
  1143.                         wq[0] += bones[weights[i].boneIds[2]] * (weights[i].weights[2] / 255.f);
  1144.                 case 2:
  1145.                         wq[0] += bones[weights[i].boneIds[1]] * (weights[i].weights[1] / 255.f);
  1146.                 case 1:
  1147.                         wq[0] += bones[weights[i].boneIds[0]] * (weights[i].weights[0] / 255.f);
  1148.                 }
  1149.                 l[0] = wq[0].nq.GetLength();
  1150.                 l[0] = l[0] > 0.f ? 1.f / (l[0]) : l[0];
  1151.                 wq[0].nq *= l[0];
  1152.                 wq[0].dq *= l[0];
  1153.                 out[i].xyz = wq[0] * (in[i].xyz * fScale);
  1154.                 out[i].color = in[i].color;
  1155.                 out[i].st = in[i].st;
  1156.         }
  1157. #endif
  1158. }
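
// ----------------------------------------------------------------------------
// Editorial sketch, not part of the original file: the scalar mathematics that
// the SSE and unrolled loops above implement for a single vertex. It mirrors
// the scalar (#else) path, where the per-vertex weights are stored as bytes in
// 0..255, and assumes CryMath's DualQuat/Vec3 operators and the SMMRMBoneMapping
// layout used throughout this file; the function name is illustrative only.
static inline Vec3 SkinPositionDQ_Reference(
  const Vec3& pos
  , const float fScale
  , const DualQuatA* bones
  , const SMMRMBoneMapping& mapping
  , size_t maxSpines)
{
        // Linearly blend the weighted bone dual quaternions.
        DualQuatA wq(ZERO);
        for (size_t s = 0; s < maxSpines; ++s)
                wq += bones[mapping.boneIds[s]] * (mapping.weights[s] / 255.f);
        // Renormalize so the real (rotation) part is a unit quaternion again;
        // the dual (translation) part must be scaled by the same factor.
        float l = wq.nq.GetLength();
        l = l > 0.f ? 1.f / l : l;
        wq.nq *= l;
        wq.dq *= l;
        // Apply the blended rigid transform to the scaled model-space position.
        return wq * (pos * fScale);
}
// ----------------------------------------------------------------------------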
  1159.  
  1160. ////////////////////////////////////////////////////////////////////////////////
  1161. // Update a set of tangent frames via dual-quaternion skinning (a rigid Matrix34 overload follows)
  1162. static inline void UpdateTangents(
  1163.   SPipTangents* out
  1164.   , SPipQTangents* in
  1165.   , DualQuatA* bones
  1166.   , SMMRMBoneMapping* weights
  1167.   , size_t maxSpines
  1168.   , size_t count)
  1169. {
  1170.         MMRM_PROFILE_FUNCTION(gEnv->pSystem, PROFILE_3DENGINE);
  1171. #if MMRM_USE_VECTORIZED_SSE_INSTRUCTIONS
  1172.         using namespace NVMath;
  1173.         Quat out_tangents[4];
  1174.         DualQuatA wq[4];
  1175.         int16 flip[4];
  1176.         __m128 _wq[4 * 2];
  1177.         __m128 nq_len[4];
  1178.         __m128 vweights[4];
  1179.         __m128 vw[4];
  1180.         Quat in_tangents[4];
  1181.         #if MMRM_UNROLL_GEOMETRY_BAKING_LOOPS
  1182.         for (size_t i = 0; i < (count & ~3); i += 4)
  1183.         {
  1184.                 in_tangents[0] = in[i + 0].GetQ();
  1185.                 in_tangents[1] = in[i + 1].GetQ();
  1186.                 in_tangents[2] = in[i + 2].GetQ();
  1187.                 in_tangents[3] = in[i + 3].GetQ();
  1188.                 _wq[0] = _wq[1] = _wq[2] = _wq[3] = _wq[4] = _wq[5] = _wq[6] = _wq[7] = _mm_setzero_ps();
  1189.                 vweights[0] = _mm_load_ps(&weights[i + 0].weights[0]);
  1190.                 vweights[1] = _mm_load_ps(&weights[i + 1].weights[0]);
  1191.                 vweights[2] = _mm_load_ps(&weights[i + 2].weights[0]);
  1192.                 vweights[3] = _mm_load_ps(&weights[i + 3].weights[0]);
  1193.                 switch (maxSpines)
  1194.                 {
  1195.                 case 4:
  1196.                         vw[0] = Swizzle<wwww>(vweights[0]);
  1197.                         vw[1] = Swizzle<wwww>(vweights[1]);
  1198.                         vw[2] = Swizzle<wwww>(vweights[2]);
  1199.                         vw[3] = Swizzle<wwww>(vweights[3]);
  1200.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[3]].nq), vw[0]));
  1201.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[3]].dq), vw[0]));
  1202.                         _wq[1 * 2 + 0] = _mm_add_ps(_wq[1 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[3]].nq), vw[1]));
  1203.                         _wq[1 * 2 + 1] = _mm_add_ps(_wq[1 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[3]].dq), vw[1]));
  1204.                         _wq[2 * 2 + 0] = _mm_add_ps(_wq[2 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[3]].nq), vw[2]));
  1205.                         _wq[2 * 2 + 1] = _mm_add_ps(_wq[2 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[3]].dq), vw[2]));
  1206.                         _wq[3 * 2 + 0] = _mm_add_ps(_wq[3 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[3]].nq), vw[3]));
  1207.                         _wq[3 * 2 + 1] = _mm_add_ps(_wq[3 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[3]].dq), vw[3]));
  1208.                 case 3:
  1209.                         vw[0] = Swizzle<zzzz>(vweights[0]);
  1210.                         vw[1] = Swizzle<zzzz>(vweights[1]);
  1211.                         vw[2] = Swizzle<zzzz>(vweights[2]);
  1212.                         vw[3] = Swizzle<zzzz>(vweights[3]);
  1213.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[2]].nq), vw[0]));
  1214.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[2]].dq), vw[0]));
  1215.                         _wq[1 * 2 + 0] = _mm_add_ps(_wq[1 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[2]].nq), vw[1]));
  1216.                         _wq[1 * 2 + 1] = _mm_add_ps(_wq[1 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[2]].dq), vw[1]));
  1217.                         _wq[2 * 2 + 0] = _mm_add_ps(_wq[2 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[2]].nq), vw[2]));
  1218.                         _wq[2 * 2 + 1] = _mm_add_ps(_wq[2 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[2]].dq), vw[2]));
  1219.                         _wq[3 * 2 + 0] = _mm_add_ps(_wq[3 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[2]].nq), vw[3]));
  1220.                         _wq[3 * 2 + 1] = _mm_add_ps(_wq[3 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[2]].dq), vw[3]));
  1221.                 case 2:
  1222.                         vw[0] = Swizzle<yyyy>(vweights[0]);
  1223.                         vw[1] = Swizzle<yyyy>(vweights[1]);
  1224.                         vw[2] = Swizzle<yyyy>(vweights[2]);
  1225.                         vw[3] = Swizzle<yyyy>(vweights[3]);
  1226.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[1]].nq), vw[0]));
  1227.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[1]].dq), vw[0]));
  1228.                         _wq[1 * 2 + 0] = _mm_add_ps(_wq[1 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[1]].nq), vw[1]));
  1229.                         _wq[1 * 2 + 1] = _mm_add_ps(_wq[1 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[1]].dq), vw[1]));
  1230.                         _wq[2 * 2 + 0] = _mm_add_ps(_wq[2 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[1]].nq), vw[2]));
  1231.                         _wq[2 * 2 + 1] = _mm_add_ps(_wq[2 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[1]].dq), vw[2]));
  1232.                         _wq[3 * 2 + 0] = _mm_add_ps(_wq[3 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[1]].nq), vw[3]));
  1233.                         _wq[3 * 2 + 1] = _mm_add_ps(_wq[3 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[1]].dq), vw[3]));
  1234.                 case 1:
  1235.                         vw[0] = Swizzle<xxxx>(vweights[0]);
  1236.                         vw[1] = Swizzle<xxxx>(vweights[1]);
  1237.                         vw[2] = Swizzle<xxxx>(vweights[2]);
  1238.                         vw[3] = Swizzle<xxxx>(vweights[3]);
  1239.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[0]].nq), vw[0]));
  1240.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[0]].dq), vw[0]));
  1241.                         _wq[1 * 2 + 0] = _mm_add_ps(_wq[1 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[0]].nq), vw[1]));
  1242.                         _wq[1 * 2 + 1] = _mm_add_ps(_wq[1 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[0]].dq), vw[1]));
  1243.                         _wq[2 * 2 + 0] = _mm_add_ps(_wq[2 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[0]].nq), vw[2]));
  1244.                         _wq[2 * 2 + 1] = _mm_add_ps(_wq[2 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[0]].dq), vw[2]));
  1245.                         _wq[3 * 2 + 0] = _mm_add_ps(_wq[3 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[0]].nq), vw[3]));
  1246.                         _wq[3 * 2 + 1] = _mm_add_ps(_wq[3 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[0]].dq), vw[3]));
  1247.                 }
  1248.                 nq_len[0] = _mm_rsqrt_ps(DotProduct4(_wq[0 * 2 + 0], _wq[0 * 2 + 0]));
  1249.                 nq_len[1] = _mm_rsqrt_ps(DotProduct4(_wq[1 * 2 + 0], _wq[1 * 2 + 0]));
  1250.                 nq_len[2] = _mm_rsqrt_ps(DotProduct4(_wq[2 * 2 + 0], _wq[2 * 2 + 0]));
  1251.                 nq_len[3] = _mm_rsqrt_ps(DotProduct4(_wq[3 * 2 + 0], _wq[3 * 2 + 0]));
  1252.  
  1253.                 _mm_store_ps((float*)&wq[0].nq, _mm_mul_ps(_wq[0 * 2 + 0], nq_len[0]));
  1254.                 _mm_store_ps((float*)&wq[0].dq, _mm_mul_ps(_wq[0 * 2 + 1], nq_len[0]));
  1255.                 _mm_store_ps((float*)&wq[1].nq, _mm_mul_ps(_wq[1 * 2 + 0], nq_len[1]));
  1256.                 _mm_store_ps((float*)&wq[1].dq, _mm_mul_ps(_wq[1 * 2 + 1], nq_len[1]));
  1257.                 _mm_store_ps((float*)&wq[2].nq, _mm_mul_ps(_wq[2 * 2 + 0], nq_len[2]));
  1258.                 _mm_store_ps((float*)&wq[2].dq, _mm_mul_ps(_wq[2 * 2 + 1], nq_len[2]));
  1259.                 _mm_store_ps((float*)&wq[3].nq, _mm_mul_ps(_wq[3 * 2 + 0], nq_len[3]));
  1260.                 _mm_store_ps((float*)&wq[3].dq, _mm_mul_ps(_wq[3 * 2 + 1], nq_len[3]));
  1261.  
  1262.                 out_tangents[0] = (wq[0].nq * in_tangents[0]);
  1263.                 out_tangents[1] = (wq[1].nq * in_tangents[1]);
  1264.                 out_tangents[2] = (wq[2].nq * in_tangents[2]);
  1265.                 out_tangents[3] = (wq[3].nq * in_tangents[3]);
  1266.  
  1267.                 if (out_tangents[0].w < 0.f) out_tangents[0] = -out_tangents[0];
  1268.                 if (out_tangents[1].w < 0.f) out_tangents[1] = -out_tangents[1];
  1269.                 if (out_tangents[2].w < 0.f) out_tangents[2] = -out_tangents[2];
  1270.                 if (out_tangents[3].w < 0.f) out_tangents[3] = -out_tangents[3];
  1271.  
  1272.                 flip[0] = in_tangents[0].w < 0.f ? -1 : 1;
  1273.                 flip[1] = in_tangents[1].w < 0.f ? -1 : 1;
  1274.                 flip[2] = in_tangents[2].w < 0.f ? -1 : 1;
  1275.                 flip[3] = in_tangents[3].w < 0.f ? -1 : 1;
  1276.  
  1277.                 if (flip[0] < 0) out_tangents[0] = -out_tangents[0];
  1278.                 if (flip[1] < 0) out_tangents[1] = -out_tangents[1];
  1279.                 if (flip[2] < 0) out_tangents[2] = -out_tangents[2];
  1280.                 if (flip[3] < 0) out_tangents[3] = -out_tangents[3];
  1281.  
  1282.                 out[i + 0] = SPipTangents(out_tangents[0], flip[0]);
  1283.                 out[i + 1] = SPipTangents(out_tangents[1], flip[1]);
  1284.                 out[i + 2] = SPipTangents(out_tangents[2], flip[2]);
  1285.                 out[i + 3] = SPipTangents(out_tangents[3], flip[3]);
  1286.         }
  1287.         for (size_t i = (count & ~3); i < count; ++i)
  1288.         #else // MMRM_UNROLL_GEOMETRY_BAKING_LOOPS
  1289.         for (size_t i = 0; i < count; ++i)
  1290.         #endif
  1291.         {
  1292.                 in_tangents[0] = in[i + 0].GetQ();
  1293.                 _wq[0] = _wq[1] = _mm_setzero_ps();
  1294.                 vweights[0] = _mm_load_ps(&weights[i + 0].weights[0]);
  1295.                 switch (maxSpines)
  1296.                 {
  1297.                 case 4:
  1298.                         vw[0] = Swizzle<wwww>(vweights[0]);
  1299.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[3]].nq), vw[0]));
  1300.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[3]].dq), vw[0]));
  1301.                 case 3:
  1302.                         vw[0] = Swizzle<zzzz>(vweights[0]);
  1303.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[2]].nq), vw[0]));
  1304.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[2]].dq), vw[0]));
  1305.                 case 2:
  1306.                         vw[0] = Swizzle<yyyy>(vweights[0]);
  1307.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[1]].nq), vw[0]));
  1308.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[1]].dq), vw[0]));
  1309.                 case 1:
  1310.                         vw[0] = Swizzle<xxxx>(vweights[0]);
  1311.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[0]].nq), vw[0]));
  1312.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[0]].dq), vw[0]));
  1313.                 }
  1314.                 nq_len[0] = _mm_rsqrt_ps(DotProduct4(_wq[0 * 2 + 0], _wq[0 * 2 + 0]));
  1315.                 _mm_store_ps((float*)&wq[0].nq, _mm_mul_ps(_wq[0 * 2 + 0], nq_len[0]));
  1316.                 _mm_store_ps((float*)&wq[0].dq, _mm_mul_ps(_wq[0 * 2 + 1], nq_len[0]));
  1317.                 out_tangents[0] = (wq[0].nq * in_tangents[0]);
  1318.                 if (out_tangents[0].w < 0.f) out_tangents[0] = -out_tangents[0];
  1319.                 flip[0] = in_tangents[0].w < 0.f ? -1 : 1;
  1320.                 if (flip[0] < 0) out_tangents[0] = -out_tangents[0];
  1321.                 out[i + 0] = SPipTangents(out_tangents[0], flip[0]);
  1322.         }
  1323. #else
  1324.         Quat out_tangents[4];
  1325.         Quat in_tangents[4];
  1326.         DualQuatA wq[4];
  1327.         int16 flip[4];
  1328.         f32 l[4];
  1329.         for (size_t i = 0; i < (count & ~3); i += 4)
  1330.         {
  1331.                 in_tangents[0] = in[i + 0].GetQ();
  1332.                 in_tangents[1] = in[i + 1].GetQ();
  1333.                 in_tangents[2] = in[i + 2].GetQ();
  1334.                 in_tangents[3] = in[i + 3].GetQ();
  1335.                 wq[0] = wq[1] = wq[2] = wq[3] = DualQuatA(ZERO);
  1336.                 switch (maxSpines)
  1337.                 {
  1338.                 case 4:
  1339.                         wq[0] += bones[weights[i + 0].boneIds[3]] * (weights[i + 0].weights[3] / 255.f);
  1340.                         wq[1] += bones[weights[i + 1].boneIds[3]] * (weights[i + 1].weights[3] / 255.f);
  1341.                         wq[2] += bones[weights[i + 2].boneIds[3]] * (weights[i + 2].weights[3] / 255.f);
  1342.                         wq[3] += bones[weights[i + 3].boneIds[3]] * (weights[i + 3].weights[3] / 255.f);
  1343.                 case 3:
  1344.                         wq[0] += bones[weights[i + 0].boneIds[2]] * (weights[i + 0].weights[2] / 255.f);
  1345.                         wq[1] += bones[weights[i + 1].boneIds[2]] * (weights[i + 1].weights[2] / 255.f);
  1346.                         wq[2] += bones[weights[i + 2].boneIds[2]] * (weights[i + 2].weights[2] / 255.f);
  1347.                         wq[3] += bones[weights[i + 3].boneIds[2]] * (weights[i + 3].weights[2] / 255.f);
  1348.                 case 2:
  1349.                         wq[0] += bones[weights[i + 0].boneIds[1]] * (weights[i + 0].weights[1] / 255.f);
  1350.                         wq[1] += bones[weights[i + 1].boneIds[1]] * (weights[i + 1].weights[1] / 255.f);
  1351.                         wq[2] += bones[weights[i + 2].boneIds[1]] * (weights[i + 2].weights[1] / 255.f);
  1352.                         wq[3] += bones[weights[i + 3].boneIds[1]] * (weights[i + 3].weights[1] / 255.f);
  1353.                 case 1:
  1354.                         wq[0] += bones[weights[i + 0].boneIds[0]] * (weights[i + 0].weights[0] / 255.f);
  1355.                         wq[1] += bones[weights[i + 1].boneIds[0]] * (weights[i + 1].weights[0] / 255.f);
  1356.                         wq[2] += bones[weights[i + 2].boneIds[0]] * (weights[i + 2].weights[0] / 255.f);
  1357.                         wq[3] += bones[weights[i + 3].boneIds[0]] * (weights[i + 3].weights[0] / 255.f);
  1358.                 }
  1359.  
  1360.                 l[0] = wq[0].nq.GetLength();
  1361.                 l[1] = wq[1].nq.GetLength();
  1362.                 l[2] = wq[2].nq.GetLength();
  1363.                 l[3] = wq[3].nq.GetLength();
  1364.  
  1365.                 l[0] = l[0] > 0.f ? 1.f / (l[0]) : l[0];
  1366.                 l[1] = l[1] > 0.f ? 1.f / (l[1]) : l[1];
  1367.                 l[2] = l[2] > 0.f ? 1.f / (l[2]) : l[2];
  1368.                 l[3] = l[3] > 0.f ? 1.f / (l[3]) : l[3];
  1369.  
  1370.                 wq[0].nq *= l[0];
  1371.                 wq[1].nq *= l[1];
  1372.                 wq[2].nq *= l[2];
  1373.                 wq[3].nq *= l[3];
  1374.  
  1375.                 out_tangents[0] = (wq[0].nq * in_tangents[0]);
  1376.                 out_tangents[1] = (wq[1].nq * in_tangents[1]);
  1377.                 out_tangents[2] = (wq[2].nq * in_tangents[2]);
  1378.                 out_tangents[3] = (wq[3].nq * in_tangents[3]);
  1379.  
  1380.                 if (out_tangents[0].w < 0.f) out_tangents[0] = -out_tangents[0];
  1381.                 if (out_tangents[1].w < 0.f) out_tangents[1] = -out_tangents[1];
  1382.                 if (out_tangents[2].w < 0.f) out_tangents[2] = -out_tangents[2];
  1383.                 if (out_tangents[3].w < 0.f) out_tangents[3] = -out_tangents[3];
  1384.  
  1385.                 flip[0] = in_tangents[0].w < 0.f ? -1 : 1;
  1386.                 flip[1] = in_tangents[1].w < 0.f ? -1 : 1;
  1387.                 flip[2] = in_tangents[2].w < 0.f ? -1 : 1;
  1388.                 flip[3] = in_tangents[3].w < 0.f ? -1 : 1;
  1389.  
  1390.                 if (flip[0] < 0) out_tangents[0] = -out_tangents[0];
  1391.                 if (flip[1] < 0) out_tangents[1] = -out_tangents[1];
  1392.                 if (flip[2] < 0) out_tangents[2] = -out_tangents[2];
  1393.                 if (flip[3] < 0) out_tangents[3] = -out_tangents[3];
  1394.  
  1395.                 out[i + 0] = SPipTangents(out_tangents[0], flip[0]);
  1396.                 out[i + 1] = SPipTangents(out_tangents[1], flip[1]);
  1397.                 out[i + 2] = SPipTangents(out_tangents[2], flip[2]);
  1398.                 out[i + 3] = SPipTangents(out_tangents[3], flip[3]);
  1399.         }
  1400.         for (size_t i = (count & ~3); i < count; ++i)
  1401.         {
  1402.                 in_tangents[0] = in[i + 0].GetQ();
  1403.                 wq[0] = DualQuatA(ZERO);
  1404.                 switch (maxSpines)
  1405.                 {
  1406.                 case 4:
  1407.                         wq[0] += bones[weights[i + 0].boneIds[3]] * (weights[i + 0].weights[3] / 255.f);
  1408.                 case 3:
  1409.                         wq[0] += bones[weights[i + 0].boneIds[2]] * (weights[i + 0].weights[2] / 255.f);
  1410.                 case 2:
  1411.                         wq[0] += bones[weights[i + 0].boneIds[1]] * (weights[i + 0].weights[1] / 255.f);
  1412.                 case 1:
  1413.                         wq[0] += bones[weights[i + 0].boneIds[0]] * (weights[i + 0].weights[0] / 255.f);
  1414.                 }
  1415.                 l[0] = wq[0].nq.GetLength();
  1416.                 l[0] = l[0] > 0.f ? 1.f / (l[0]) : l[0];
  1417.                 wq[0].nq *= l[0];
  1418.                 out_tangents[0] = (wq[0].nq * in_tangents[0]);
  1419.                 if (out_tangents[0].w < 0.f) out_tangents[0] = -out_tangents[0];
  1420.                 flip[0] = in_tangents[0].w < 0.f ? -1 : 1;
  1421.                 if (flip[0] < 0) out_tangents[0] = -out_tangents[0];
  1422.                 out[i + 0] = SPipTangents(out_tangents[0], flip[0]);
  1423.         }
  1424. #endif
  1425. }
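
// Editorial note (not in the original source): tangent and bitangent are
// directions, so the overload above applies only the rotation part (nq) of the
// blended dual quaternion; the dual part encodes translation and is never
// needed here. The sign of w in the packed QTangent stores the handedness
// (reflection) of the frame, hence the canonicalization to w >= 0 followed by
// the conditional flip.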
  1426. static inline void UpdateTangents(SPipTangents* out, SPipQTangents* in, const Matrix34& mat, size_t count)
  1427. {
  1428.         MMRM_PROFILE_FUNCTION(gEnv->pSystem, PROFILE_3DENGINE);
  1429.         Quat out_tangents[4];
  1430.         Quat in_tangents[4];
  1431.         int16 flip[4];
  1432.         Quat q = mat33_to_quat(Matrix33(mat));
  1433.         q.NormalizeFast();
  1434.  
  1435.         for (size_t i = 0; i < (count & ~3); i += 4)
  1436.         {
  1437.                 in_tangents[0] = in[i + 0].GetQ();
  1438.                 in_tangents[1] = in[i + 1].GetQ();
  1439.                 in_tangents[2] = in[i + 2].GetQ();
  1440.                 in_tangents[3] = in[i + 3].GetQ();
  1441.  
  1442.                 out_tangents[0] = q * in_tangents[0];
  1443.                 out_tangents[1] = q * in_tangents[1];
  1444.                 out_tangents[2] = q * in_tangents[2];
  1445.                 out_tangents[3] = q * in_tangents[3];
  1446.  
  1447.                 flip[0] = in_tangents[0].w < 0.f ? -1 : 1;
  1448.                 flip[1] = in_tangents[1].w < 0.f ? -1 : 1;
  1449.                 flip[2] = in_tangents[2].w < 0.f ? -1 : 1;
  1450.                 flip[3] = in_tangents[3].w < 0.f ? -1 : 1;
  1451.  
  1452.                 if (flip[0] < 0) out_tangents[0] = -out_tangents[0];
  1453.                 if (flip[1] < 0) out_tangents[1] = -out_tangents[1];
  1454.                 if (flip[2] < 0) out_tangents[2] = -out_tangents[2];
  1455.                 if (flip[3] < 0) out_tangents[3] = -out_tangents[3];
  1456.  
  1457.                 out[i + 0] = SPipTangents(out_tangents[0], flip[0]);
  1458.                 out[i + 1] = SPipTangents(out_tangents[1], flip[1]);
  1459.                 out[i + 2] = SPipTangents(out_tangents[2], flip[2]);
  1460.                 out[i + 3] = SPipTangents(out_tangents[3], flip[3]);
  1461.         }
  1462.  
  1463.         for (size_t i = (count & ~3); i < count; ++i)
  1464.         {
  1465.                 in_tangents[0] = in[i + 0].GetQ();
  1466.                 out_tangents[0] = q * in_tangents[0];
  1467.  
  1468.                 flip[0] = in_tangents[0].w < 0.f ? -1 : 1;
  1469.                 if (flip[0] < 0) out_tangents[0] = -out_tangents[0];
  1470.                 out[i + 0] = SPipTangents(out_tangents[0], flip[0]);
  1471.         }
  1472. }
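
// Editorial sketch, not part of the original file: how a caller might pair the
// two UpdateTangents overloads above when baking one instance. The buffer and
// flag names below are hypothetical placeholders, not identifiers from this file.
//
//   if (bDeformed)   // per-vertex bone skinning
//           UpdateTangents(dstTangents, srcQTangents, boneDualQuats, boneMapping, maxSpines, nVertices);
//   else             // rigid instance: rotate every tangent frame by the instance matrix
//           UpdateTangents(dstTangents, srcQTangents, worldTM, nVertices);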
  1473.  
  1474. ////////////////////////////////////////////////////////////////////////////////
  1475. // Update a set of general vertices (position/colour/UV) together with their tangent frames
  1476. static inline void UpdateGeneralTangents(
  1477.   SVF_P3S_C4B_T2S* out_general
  1478.   , SPipTangents* out_packed_tangents
  1479.   , SVF_P3F_C4B_T2F* in_general
  1480.   , SPipQTangents* in_packed_tangents
  1481.   , DualQuatA* bones
  1482.   , SMMRMBoneMapping* weights
  1483.   , const float fScale
  1484.   , size_t maxSpines
  1485.   , size_t count)
  1486. {
  1487.         MMRM_PROFILE_FUNCTION(gEnv->pSystem, PROFILE_3DENGINE);
  1488. #if MMRM_USE_VECTORIZED_SSE_INSTRUCTIONS
  1489.         using namespace NVMath;
  1490.         DualQuatA wq[4];
  1491.         __m128 _wq[4 * 2];
  1492.         __m128 nq_len[4];
  1493.         __m128 ot[4];
  1494.         __m128 vOne = _mm_set1_ps(1.f);
  1495.         int16 flip[4];
  1496.         __m128 vweights[4];
  1497.         __m128 vw[4];
  1498.         CRY_ALIGN(16) Quat out_tangents[4];
  1499.         CRY_ALIGN(16) Quat in_tangents[4];
  1500.         #if MMRM_UNROLL_GEOMETRY_BAKING_LOOPS
  1501.         for (size_t i = 0; i < (count & ~3); i += 4)
  1502.         {
  1503.                 _wq[0] = _wq[1] = _wq[2] = _wq[3] = _wq[4] = _wq[5] = _wq[6] = _wq[7] = _mm_setzero_ps();
  1504.                 in_tangents[0] = in_packed_tangents[i + 0].GetQ();
  1505.                 in_tangents[1] = in_packed_tangents[i + 1].GetQ();
  1506.                 in_tangents[2] = in_packed_tangents[i + 2].GetQ();
  1507.                 in_tangents[3] = in_packed_tangents[i + 3].GetQ();
  1508.                 vweights[0] = _mm_load_ps(&weights[i + 0].weights[0]);
  1509.                 vweights[1] = _mm_load_ps(&weights[i + 1].weights[0]);
  1510.                 vweights[2] = _mm_load_ps(&weights[i + 2].weights[0]);
  1511.                 vweights[3] = _mm_load_ps(&weights[i + 3].weights[0]);
  1512.                 switch (maxSpines)
  1513.                 {
  1514.                 case 4:
  1515.                         vw[0] = Swizzle<wwww>(vweights[0]);
  1516.                         vw[1] = Swizzle<wwww>(vweights[1]);
  1517.                         vw[2] = Swizzle<wwww>(vweights[2]);
  1518.                         vw[3] = Swizzle<wwww>(vweights[3]);
  1519.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[3]].nq), vw[0]));
  1520.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[3]].dq), vw[0]));
  1521.                         _wq[1 * 2 + 0] = _mm_add_ps(_wq[1 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[3]].nq), vw[1]));
  1522.                         _wq[1 * 2 + 1] = _mm_add_ps(_wq[1 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[3]].dq), vw[1]));
  1523.                         _wq[2 * 2 + 0] = _mm_add_ps(_wq[2 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[3]].nq), vw[2]));
  1524.                         _wq[2 * 2 + 1] = _mm_add_ps(_wq[2 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[3]].dq), vw[2]));
  1525.                         _wq[3 * 2 + 0] = _mm_add_ps(_wq[3 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[3]].nq), vw[3]));
  1526.                         _wq[3 * 2 + 1] = _mm_add_ps(_wq[3 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[3]].dq), vw[3]));
  1527.                 case 3:
  1528.                         vw[0] = Swizzle<zzzz>(vweights[0]);
  1529.                         vw[1] = Swizzle<zzzz>(vweights[1]);
  1530.                         vw[2] = Swizzle<zzzz>(vweights[2]);
  1531.                         vw[3] = Swizzle<zzzz>(vweights[3]);
  1532.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[2]].nq), vw[0]));
  1533.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[2]].dq), vw[0]));
  1534.                         _wq[1 * 2 + 0] = _mm_add_ps(_wq[1 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[2]].nq), vw[1]));
  1535.                         _wq[1 * 2 + 1] = _mm_add_ps(_wq[1 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[2]].dq), vw[1]));
  1536.                         _wq[2 * 2 + 0] = _mm_add_ps(_wq[2 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[2]].nq), vw[2]));
  1537.                         _wq[2 * 2 + 1] = _mm_add_ps(_wq[2 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[2]].dq), vw[2]));
  1538.                         _wq[3 * 2 + 0] = _mm_add_ps(_wq[3 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[2]].nq), vw[3]));
  1539.                         _wq[3 * 2 + 1] = _mm_add_ps(_wq[3 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 3].boneIds[2]].dq), vw[3]));
  1540.                 case 2:
  1541.                         vw[0] = Swizzle<yyyy>(vweights[0]);
  1542.                         vw[1] = Swizzle<yyyy>(vweights[1]);
  1543.                         vw[2] = Swizzle<yyyy>(vweights[2]);
  1544.                         vw[3] = Swizzle<yyyy>(vweights[3]);
  1545.                         _wq[0 * 2 + 0] = _mm_add_ps(_wq[0 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[1]].nq), vw[0]));
  1546.                         _wq[0 * 2 + 1] = _mm_add_ps(_wq[0 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 0].boneIds[1]].dq), vw[0]));
  1547.                         _wq[1 * 2 + 0] = _mm_add_ps(_wq[1 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[1]].nq), vw[1]));
  1548.                         _wq[1 * 2 + 1] = _mm_add_ps(_wq[1 * 2 + 1], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 1].boneIds[1]].dq), vw[1]));
  1549.                         _wq[2 * 2 + 0] = _mm_add_ps(_wq[2 * 2 + 0], _mm_mul_ps(_mm_load_ps((float*)&bones[weights[i + 2].boneIds[1]].nq), vw[2]));
  1550.                         _wq[2