Previous 199869 Revisions Next

r24006 Saturday 29th June, 2013 at 19:04:11 UTC by Ryan Holtz
01234567890123456789012345678901234567890123456789012345678901234567890123456789

-RSP SSE optimizations/changes: [MooglyGuy]
* Fixed load/store opcodes
* Added SSE versions of the following opcodes: VMADH, VADD, VSUB, VADDC, VSUBC,
  VCH, VCR, VMRG, VAND, VNAND, VOR, VNOR, VXOR, and VNXOR
[src/emu/cpu/rsp]rsp.h rspdrc.c

trunk/src/emu/cpu/rsp/rspdrc.c
r24005r24006
208208#define VS2REG                      ((op >> 16) & 0x1f)
209209#define EL                          ((op >> 21) & 0xf)
210210
211#define VREG_B(reg, offset)     rsp->v[(reg)].b[(offset)^1]
212#define W_VREG_S(reg, offset)       rsp->v[(reg)].s[(offset)]
213#define VREG_S(reg, offset)     (INT16)rsp->v[(reg)].s[(offset)]
211#define SIMD_EXTRACT16(reg, value, element) \
212   if (element < 0) printf("extract element <0 %d\n", element); \
213   switch((element) & 7) \
214   { \
215      case 0: value = _mm_extract_epi16(reg, 0); break; \
216      case 1: value = _mm_extract_epi16(reg, 1); break; \
217      case 2: value = _mm_extract_epi16(reg, 2); break; \
218      case 3: value = _mm_extract_epi16(reg, 3); break; \
219      case 4: value = _mm_extract_epi16(reg, 4); break; \
220      case 5: value = _mm_extract_epi16(reg, 5); break; \
221      case 6: value = _mm_extract_epi16(reg, 6); break; \
222      case 7: value = _mm_extract_epi16(reg, 7); break; \
223   }
214224
225
226#define SIMD_INSERT16(reg, value, element) \
227   if (element < 0) printf("insert element <0 %d\n", element); \
228   switch((element) & 7) \
229   { \
230      case 0: reg = _mm_insert_epi16(reg, value, 0); break; \
231      case 1: reg = _mm_insert_epi16(reg, value, 1); break; \
232      case 2: reg = _mm_insert_epi16(reg, value, 2); break; \
233      case 3: reg = _mm_insert_epi16(reg, value, 3); break; \
234      case 4: reg = _mm_insert_epi16(reg, value, 4); break; \
235      case 5: reg = _mm_insert_epi16(reg, value, 5); break; \
236      case 6: reg = _mm_insert_epi16(reg, value, 6); break; \
237      case 7: reg = _mm_insert_epi16(reg, value, 7); break; \
238   }
239
240
241#define VREG_B(reg, offset)        rsp->v[(reg)].b[(offset)^1]
242#define W_VREG_S(reg, offset)      rsp->v[(reg)].s[(offset)]
243#define VREG_S(reg, offset)        (INT16)rsp->v[(reg)].s[(offset)]
244
215245#define VEC_EL_2(x,z)               (vector_elements_2[(x)][(z)])
216246
217247#define ACCUM(x)        rsp->accum[x].q
r24005r24006
224254#define SET_CARRY_FLAG(x)           { rsp->flag[0] |= (1 << (x)); }
225255#define CLEAR_CARRY_FLAG(x)         { rsp->flag[0] &= ~(1 << (x)); }
226256
227#define COMPARE_FLAG(x)             ((rsp->flag[1] & (1 << (x))) ? 1 : 0)
257#define COMPARE_FLAG(x)             ((rsp->flag[1] >> (x)) & 1)
228258#define CLEAR_COMPARE_FLAGS()       { rsp->flag[1] &= ~0xff; }
229259#define SET_COMPARE_FLAG(x)         { rsp->flag[1] |= (1 << (x)); }
230260#define CLEAR_COMPARE_FLAG(x)       { rsp->flag[1] &= ~(1 << (x)); }
r24005r24006
537567/*****************************************************************************/
538568
539569/* Legacy.  Going forward, this will be transitioned into unrolled opcode decodes. */
540static const int vector_elements_1[16][8] =
541{
542   { 0, 1, 2, 3, 4, 5, 6, 7 },     // none
543   { 0, 1, 2, 3, 4, 5, 6 ,7 },     // ???
544   { 1, 3, 5, 7, 0, 2, 4, 6 },     // 0q
545   { 0, 2, 4, 6, 1, 3, 5, 7 },     // 1q
546   { 1, 2, 3, 5, 6, 7, 0, 4 },     // 0h
547   { 0, 2, 3, 4, 6, 7, 1, 5 },     // 1h
548   { 0, 1, 3, 4, 5, 7, 2, 6 },     // 2h
549   { 0, 1, 2, 4, 5, 6, 3, 7 },     // 3h
550   { 1, 2, 3, 4, 5, 6, 7, 0 },     // 0
551   { 0, 2, 3, 4, 5, 6, 7, 1 },     // 1
552   { 0, 1, 3, 4, 5, 6, 7, 2 },     // 2
553   { 0, 1, 2, 4, 5, 6, 7, 3 },     // 3
554   { 0, 1, 2, 3, 5, 6, 7, 4 },     // 4
555   { 0, 1, 2, 3, 4, 6, 7, 5 },     // 5
556   { 0, 1, 2, 3, 4, 5, 7, 6 },     // 6
557   { 0, 1, 2, 3, 4, 5, 6, 7 },     // 7
558};
559
560/* Legacy.  Going forward, this will be transitioned into unrolled opcode decodes. */
561570static const int vector_elements_2[16][8] =
562571{
563572   { 0, 1, 2, 3, 4, 5, 6, 7 },     // none
r24005r24006
578587   { 7, 7, 7, 7, 7, 7, 7, 7 },     // 7
579588};
580589
590#if USE_SIMD
591static __m128i vec_himask;
592static __m128i vec_lomask;
593static __m128i vec_overmask;
594static __m128i vec_zerobits;
595static __m128i vec_flagmask;
596static __m128i vec_shiftmask2;
597static __m128i vec_shiftmask4;
598static __m128i vec_zero;
599static __m128i vec_neg1;
600static __m128i vec_shuf[16];
601static __m128i vec_shuf_inverse[16];
602#endif
603
581604static void rspcom_init(rsp_state *rsp, legacy_cpu_device *device, device_irq_acknowledge_callback irqcallback)
582605{
583606   int regIdx = 0;
r24005r24006
610633   rsp->flag[1] = 0;
611634   rsp->flag[2] = 0;
612635   rsp->flag[3] = 0;
613   rsp->square_root_res = 0;
614   rsp->square_root_high = 0;
615636   rsp->reciprocal_res = 0;
616637   rsp->reciprocal_high = 0;
617638#endif
r24005r24006
624645
625646   rsp->sr = RSP_STATUS_HALT;
626647   rsp->step_count = 0;
648
649#if USE_SIMD
650   vec_shuf_inverse[ 0] = _mm_set_epi16(0x0f0e, 0x0d0c, 0x0b0a, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100); // none
651   vec_shuf_inverse[ 1] = _mm_set_epi16(0x0f0e, 0x0d0c, 0x0b0a, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100); // ???
652   vec_shuf_inverse[ 2] = _mm_set_epi16(0x0d0c, 0x0d0c, 0x0908, 0x0908, 0x0504, 0x0504, 0x0100, 0x0100); // 0q
653   vec_shuf_inverse[ 3] = _mm_set_epi16(0x0f0e, 0x0f0e, 0x0b0a, 0x0b0a, 0x0706, 0x0706, 0x0302, 0x0302); // 1q
654   vec_shuf_inverse[ 4] = _mm_set_epi16(0x0908, 0x0908, 0x0908, 0x0908, 0x0100, 0x0100, 0x0100, 0x0100); // 0h
655   vec_shuf_inverse[ 5] = _mm_set_epi16(0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0302, 0x0302, 0x0302, 0x0302); // 1h
656   vec_shuf_inverse[ 6] = _mm_set_epi16(0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0504, 0x0504, 0x0504, 0x0504); // 2h
657   vec_shuf_inverse[ 7] = _mm_set_epi16(0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0706, 0x0706, 0x0706, 0x0706); // 3h
658   vec_shuf_inverse[ 8] = _mm_set_epi16(0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100); // 0
659   vec_shuf_inverse[ 9] = _mm_set_epi16(0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302); // 1
660   vec_shuf_inverse[10] = _mm_set_epi16(0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504); // 2
661   vec_shuf_inverse[11] = _mm_set_epi16(0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706); // 3
662   vec_shuf_inverse[12] = _mm_set_epi16(0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908); // 4
663   vec_shuf_inverse[13] = _mm_set_epi16(0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a); // 5
664   vec_shuf_inverse[14] = _mm_set_epi16(0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c); // 6
665   vec_shuf_inverse[15] = _mm_set_epi16(0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e); // 7
666
667   vec_shuf[ 0] = _mm_set_epi16(0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0b0a, 0x0d0c, 0x0f0e); // none
668   vec_shuf[ 1] = _mm_set_epi16(0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0b0a, 0x0d0c, 0x0f0e); // ???
669   vec_shuf[ 2] = _mm_set_epi16(0x0302, 0x0302, 0x0706, 0x0706, 0x0b0a, 0x0b0a, 0x0f0e, 0x0f0e); // 0q
670   vec_shuf[ 3] = _mm_set_epi16(0x0100, 0x0100, 0x0504, 0x0706, 0x0908, 0x0908, 0x0d0c, 0x0d0c); // 1q
671   vec_shuf[ 4] = _mm_set_epi16(0x0706, 0x0706, 0x0706, 0x0706, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e); // 0h
672   vec_shuf[ 5] = _mm_set_epi16(0x0504, 0x0504, 0x0504, 0x0504, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c); // 1h
673   vec_shuf[ 6] = _mm_set_epi16(0x0302, 0x0302, 0x0302, 0x0302, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a); // 2h
674   vec_shuf[ 7] = _mm_set_epi16(0x0100, 0x0100, 0x0100, 0x0100, 0x0908, 0x0908, 0x0908, 0x0908); // 3h
675   vec_shuf[ 8] = _mm_set_epi16(0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e); // 0
676   vec_shuf[ 9] = _mm_set_epi16(0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c); // 1
677   vec_shuf[10] = _mm_set_epi16(0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a); // 2
678   vec_shuf[11] = _mm_set_epi16(0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908); // 3
679   vec_shuf[12] = _mm_set_epi16(0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706); // 4
680   vec_shuf[13] = _mm_set_epi16(0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504); // 5
681   vec_shuf[14] = _mm_set_epi16(0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302); // 6
682   vec_shuf[15] = _mm_set_epi16(0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100); // 7
683   rsp->accum_h = _mm_setzero_si128();
684   rsp->accum_m = _mm_setzero_si128();
685   rsp->accum_l = _mm_setzero_si128();
686   vec_zero = _mm_setzero_si128();
687   vec_neg1 = _mm_set_epi64x(0xffffffffffffffffL, 0xffffffffffffffffL);
688   vec_himask = _mm_set_epi64x(0xffff0000ffff0000L, 0xffff0000ffff0000L);
689   vec_lomask = _mm_set_epi64x(0x0000ffff0000ffffL, 0x0000ffff0000ffffL);
690   vec_overmask = _mm_set_epi64x(0x0001000000010000L, 0x0001000000010000L);
691   vec_zerobits = _mm_set_epi64x(0x0000000100000001L, 0x0000000100000001L);
692   vec_flagmask = _mm_set_epi64x(0x0001000100010001L, 0x0001000100010001L);
693   vec_shiftmask2 = _mm_set_epi64x(0x0000000300000003L, 0x0000000300000003L);
694   vec_shiftmask4 = _mm_set_epi64x(0x000000000000000fL, 0x000000000000000fL);
695#endif
627696}
628697
629698static CPU_INIT( rsp )
r24005r24006
752821   // Load 1 byte to vector byte index
753822
754823   ea = (base) ? rsp->r[base] + offset : offset;
755   VREG_B(dest, index) = READ8(rsp, ea);
756824
757   // SSE
758825#if USE_SIMD
759   // Better solutions for this situation welcome. Need to be able to insert a byte at an arbitrary
760   // byte index in the __m128. Current method amounts to:
761   //     final_vec = (in_vec &~ discard_mask) | insert_value
762   // Naturally, SSE4.1 adds the highly-useful PINSRB opcode. As the name implies, it's an
763   // arbitrary byte-insert-into-m128, but do we want to require SSE4.1? Maybe just have an ifdef
764   // and use the more optimal one if available.
765   const __m128i neg1 = _mm_set_epi16(0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff);
766
767   __m128i insert_vec = _mm_setzero_si128();
768   INT16 insert_value = READ8(rsp, ea) << ((1 - (index & 1)) << 2);
769   _mm_insert_epi16 (insert_vec, insert_value, index >> 1);
770
771   __m128i discard_mask = _mm_setzero_si128();
772   INT16 discard_element = 0x00ff << ((1 - (index & 1)) << 2);
773   _mm_insert_epi16 (discard_mask, discard_element, index >> 1);
774   _mm_xor_si128 (discard_mask, neg1);
775   _mm_and_si128 (rsp->xv[dest], discard_mask);
776   _mm_or_si128 (rsp->xv[dest], insert_vec);
826   UINT16 element;
827   SIMD_EXTRACT16(rsp->xv[dest], element, (index >> 1));
828   element &= 0xff00 >> ((1-(index & 1)) * 8);
829   element |= READ8(rsp, ea) << ((1-(index & 1)) * 8);
830   SIMD_INSERT16(rsp->xv[dest], element, (index >> 1));
831#else
832   VREG_B(dest, index) = READ8(rsp, ea);
777833#endif
778834}
779835
r24005r24006
781837{
782838   rsp_state *rsp = (rsp_state*)param;
783839   UINT32 op = rsp->impstate->arg0;
784   UINT32 ea = 0;
785840   int dest = (op >> 16) & 0x1f;
786841   int base = (op >> 21) & 0x1f;
787842   int index = (op >> 7) & 0xe;
r24005r24006
797852   //
798853   // Loads 2 bytes starting from vector byte index
799854
800   ea = (base) ? rsp->r[base] + (offset * 2) : (offset * 2);
801
855   UINT32 ea = (base) ? rsp->r[base] + (offset * 2) : (offset * 2);
802856   int end = index + 2;
803
804857   for (int i = index; i < end; i++)
805858   {
859#if USE_SIMD
860      UINT16 element;
861      SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1));
862      element &= 0xff00 >> ((1 - (i & 1)) * 8);
863      element |= READ8(rsp, ea) << ((1 - (i & 1)) * 8);
864      SIMD_INSERT16(rsp->xv[dest], element, (i >> 1));
865#else
806866      VREG_B(dest, i) = READ8(rsp, ea);
867#endif
807868      ea++;
808869   }
809
810   // SSE
811#if USE_SIMD
812   INT16 insert_value = READ8(rsp, ea) << 8 | READ8(rsp, ea + 1);
813   _mm_insert_epi16 (rsp->xv[dest], insert_value, index >> 1);
814#endif
815870}
816871
817872static void cfunc_rsp_llv(void *param)
r24005r24006
840895
841896   for (int i = index; i < end; i++)
842897   {
898#if USE_SIMD
899      UINT16 element;
900      SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1));
901      element &= 0xff00 >> ((1 - (i & 1)) * 8);
902      element |= READ8(rsp, ea) << ((1 - (i & 1)) * 8);
903      SIMD_INSERT16(rsp->xv[dest], element, (i >> 1));
904#else
843905      VREG_B(dest, i) = READ8(rsp, ea);
906#endif
844907      ea++;
845908   }
846
847   // SSE
848#if USE_SIMD
849   INT16 insert_value0 = READ8(rsp, ea) << 8 | READ8(rsp, ea + 1);
850   INT16 insert_value1 = READ8(rsp, ea + 2) << 8 | READ8(rsp, ea + 3);
851   _mm_insert_epi16 (rsp->xv[dest], insert_value0, (index >> 1));
852   _mm_insert_epi16 (rsp->xv[dest], insert_value1, (index >> 1) + 1);
853#endif
854909}
855910
856911static void cfunc_rsp_ldv(void *param)
r24005r24006
879934
880935   for (int i = index; i < end; i++)
881936   {
937#if USE_SIMD
938      UINT16 element;
939      SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1));
940      element &= 0xff00 >> ((1 - (i & 1)) * 8);
941      element |= READ8(rsp, ea) << ((1 - (i & 1)) * 8);
942      SIMD_INSERT16(rsp->xv[dest], element, (i >> 1));
943#else
882944      VREG_B(dest, i) = READ8(rsp, ea);
945#endif
883946      ea++;
884947   }
885
886#if USE_SIMD
887   INT16 insert_value0 = READ8(rsp, ea) << 8 | READ8(rsp, ea + 1);
888   INT16 insert_value1 = READ8(rsp, ea + 2) << 8 | READ8(rsp, ea + 3);
889   INT16 insert_value2 = READ8(rsp, ea + 4) << 8 | READ8(rsp, ea + 5);
890   INT16 insert_value3 = READ8(rsp, ea + 6) << 8 | READ8(rsp, ea + 7);
891   _mm_insert_epi16 (rsp->xv[dest], insert_value0, (index >> 1));
892   _mm_insert_epi16 (rsp->xv[dest], insert_value1, (index >> 1) + 1);
893   _mm_insert_epi16 (rsp->xv[dest], insert_value2, (index >> 1) + 2);
894   _mm_insert_epi16 (rsp->xv[dest], insert_value3, (index >> 1) + 3);
895#endif
896948}
897949
898950static void cfunc_rsp_lqv(void *param)
899951{
900952   rsp_state *rsp = (rsp_state*)param;
901953   UINT32 op = rsp->impstate->arg0;
902   int i = 0;
903   int end = 0;
904   UINT32 ea = 0;
905954   int dest = (op >> 16) & 0x1f;
906955   int base = (op >> 21) & 0x1f;
907   int index = 0; // Just a test, it goes right back the way it was if something breaks //(op >> 7) & 0xf;
956   //int index = 0; // Just a test, it goes right back the way it was if something breaks //(op >> 7) & 0xf;
908957   int offset = (op & 0x7f);
909958   if (offset & 0x40)
910959   {
r24005r24006
917966   //
918967   // Loads up to 16 bytes starting from vector byte index
919968
920   ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
969   UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
921970
922   end = index + (16 - (ea & 0xf));
971   int end = 16 - (ea & 0xf);
923972   if (end > 16) end = 16;
924973
925   for (i=index; i < end; i++)
974   for (int i = 0; i < end; i++)
926975   {
976#if USE_SIMD
977      UINT16 element;
978      SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1));
979      element &= 0xff00 >> ((1 - (i & 1)) * 8);
980      element |= READ8(rsp, ea) << ((1 - (i & 1)) * 8);
981      SIMD_INSERT16(rsp->xv[dest], element, (i >> 1));
982#else
927983      VREG_B(dest, i) = READ8(rsp, ea);
984#endif
928985      ea++;
929986   }
930
931   // SSE
932#if USE_SIMD
933   INT16 val0 = READ8(rsp, ea) << 8 | READ8(rsp, ea + 1);
934   INT16 val1 = READ8(rsp, ea + 2) << 8 | READ8(rsp, ea + 3);
935   INT16 val2 = READ8(rsp, ea + 4) << 8 | READ8(rsp, ea + 5);
936   INT16 val3 = READ8(rsp, ea + 6) << 8 | READ8(rsp, ea + 7);
937   INT16 val4 = READ8(rsp, ea + 8) << 8 | READ8(rsp, ea + 9);
938   INT16 val5 = READ8(rsp, ea + 10) << 8 | READ8(rsp, ea + 11);
939   INT16 val6 = READ8(rsp, ea + 12) << 8 | READ8(rsp, ea + 13);
940   INT16 val7 = READ8(rsp, ea + 14) << 8 | READ8(rsp, ea + 15);
941
942   rsp->xv[dest] = _mm_set_epi16(val0, val1, val2, val3, val4, val5, val6, val7);
943#endif
944987}
945988
946989static void cfunc_rsp_lrv(void *param)
947990{
948991   rsp_state *rsp = (rsp_state*)param;
949992   UINT32 op = rsp->impstate->arg0;
950   int i = 0;
951   int end = 0;
952   UINT32 ea = 0;
953993   int dest = (op >> 16) & 0x1f;
954994   int base = (op >> 21) & 0x1f;
955995   int index = (op >> 7) & 0xf;
r24005r24006
9651005   //
9661006   // Loads up to 16 bytes starting from right side until 16-byte boundary
9671007
968   ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
1008   UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
9691009
9701010   index = 16 - ((ea & 0xf) - index);
971   end = 16;
9721011   ea &= ~0xf;
9731012
974#if USE_SIMD
975   INT16 mask[8] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
976   INT16 val[8] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
977#endif
978   for (i=index; i < end; i++)
1013   for (int i = index; i < 16; i++)
9791014   {
9801015#if USE_SIMD
981      mask[i >> 1] |= 0x00ff << ((i & 1) * 8);
982      val[i >> 1] |= READ8(rsp, ea) << ((i & 1) * 8);
1016      UINT16 element;
1017      SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1));
1018      element &= 0xff00 >> ((1-(i & 1)) * 8);
1019      element |= READ8(rsp, ea) << ((1-(i & 1)) * 8);
1020      SIMD_INSERT16(rsp->xv[dest], element, (i >> 1));
1021#else
1022      VREG_B(dest, i) = READ8(rsp, ea);
9831023#endif
984      VREG_B(dest, i) = READ8(rsp, ea);
9851024      ea++;
9861025   }
987
988#if USE_SIMD
989   __m128i neg1 = _mm_set_epi16(0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff);
990   __m128i keep_mask = _mm_set_epi16(mask[0], mask[1], mask[2], mask[3], mask[4], mask[5], mask[6], mask[7]);
991   __m128i load_val = _mm_set_epi16(val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
992   keep_mask = _mm_xor_si128(keep_mask, neg1);
993   rsp->xv[dest] = _mm_and_si128(rsp->xv[dest], keep_mask);
994   rsp->xv[dest] = _mm_or_si128(rsp->xv[dest], load_val);
995#endif
9961026}
9971027
9981028static void cfunc_rsp_lpv(void *param)
9991029{
10001030   rsp_state *rsp = (rsp_state*)param;
10011031   UINT32 op = rsp->impstate->arg0;
1002   int i = 0;
1003   UINT32 ea = 0;
10041032   int dest = (op >> 16) & 0x1f;
10051033   int base = (op >> 21) & 0x1f;
10061034   int index = (op >> 7) & 0xf;
r24005r24006
10161044   //
10171045   // Loads a byte as the upper 8 bits of each element
10181046
1019   ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8);
1047   UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8);
10201048
1021#if USE_SIMD
1022   INT16 val[8] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
1023#endif
1024   for (i=0; i < 8; i++)
1049   for (int i = 0; i < 8; i++)
10251050   {
10261051#if USE_SIMD
1027      val[i] = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 8;
1052      SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea + (((16-index) + i) & 0xf)) << 8, i);
1053#else
1054      W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 8;
10281055#endif
1029      W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 8;
10301056   }
1031
1032#if USE_SIMD
1033   rsp->xv[dest] = _mm_set_epi16(val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
1034#endif
10351057}
10361058
10371059static void cfunc_rsp_luv(void *param)
10381060{
10391061   rsp_state *rsp = (rsp_state*)param;
10401062   UINT32 op = rsp->impstate->arg0;
1041   int i = 0;
1042   UINT32 ea = 0;
10431063   int dest = (op >> 16) & 0x1f;
10441064   int base = (op >> 21) & 0x1f;
10451065   int index = (op >> 7) & 0xf;
r24005r24006
10551075   //
10561076   // Loads a byte as the bits 14-7 of each element
10571077
1058   ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8);
1078   UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8);
10591079
1060#if USE_SIMD
1061   INT16 val[8] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
1062#endif
1063   for (i=0; i < 8; i++)
1080   for (int i = 0; i < 8; i++)
10641081   {
10651082#if USE_SIMD
1066      val[i] = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 7;
1083      SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea + (((16-index) + i) & 0xf)) << 7, i);
1084#else
1085      W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 7;
10671086#endif
1068      W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 7;
10691087   }
1070
1071#if USE_SIMD
1072   rsp->xv[dest] = _mm_set_epi16(val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
1073#endif
10741088}
10751089
10761090static void cfunc_rsp_lhv(void *param)
10771091{
10781092   rsp_state *rsp = (rsp_state*)param;
10791093   UINT32 op = rsp->impstate->arg0;
1080   int i = 0;
1081   UINT32 ea = 0;
10821094   int dest = (op >> 16) & 0x1f;
10831095   int base = (op >> 21) & 0x1f;
10841096   int index = (op >> 7) & 0xf;
r24005r24006
10941106   //
10951107   // Loads a byte as the bits 14-7 of each element, with 2-byte stride
10961108
1097   ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
1109   UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
10981110
1099#if USE_SIMD
1100   INT16 val[8] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
1101#endif
1102   for (i=0; i < 8; i++)
1111   for (int i = 0; i < 8; i++)
11031112   {
11041113#if USE_SIMD
1105      val[i] = READ8(rsp, ea + (((16-index) + (i<<1)) & 0xf)) << 7;
1114      SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea + (((16-index) + (i<<1)) & 0xf)) << 7, i);
1115#else
1116      W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + (i<<1)) & 0xf)) << 7;
11061117#endif
1107      W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + (i<<1)) & 0xf)) << 7;
11081118   }
1109
1110#if USE_SIMD
1111   rsp->xv[dest] = _mm_set_epi16(val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
1112#endif
11131119}
11141120
11151121static void cfunc_rsp_lfv(void *param)
11161122{
11171123   rsp_state *rsp = (rsp_state*)param;
11181124   UINT32 op = rsp->impstate->arg0;
1119   int i = 0;
1120   int end = 0;
1121   UINT32 ea = 0;
11221125   int dest = (op >> 16) & 0x1f;
11231126   int base = (op >> 21) & 0x1f;
11241127   int index = (op >> 7) & 0xf;
r24005r24006
11341137   //
11351138   // Loads a byte as the bits 14-7 of upper or lower quad, with 4-byte stride
11361139
1140   UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
11371141
1138   ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
1139
11401142   // not sure what happens if 16-byte boundary is crossed...
11411143
1142   end = (index >> 1) + 4;
1144   int end = (index >> 1) + 4;
11431145
1144#if USE_SIMD
1145   INT16 mask[8] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
1146   INT16 val[8] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
1147#endif
1148   for (i=index >> 1; i < end; i++)
1146   for (int i = index >> 1; i < end; i++)
11491147   {
11501148#if USE_SIMD
1151      mask[i] = 0xffff;
1152      val[i] = READ8(rsp, ea) << 7;
1149      SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea) << 7, i);
1150#else
1151      W_VREG_S(dest, i) = READ8(rsp, ea) << 7;
11531152#endif
1154      W_VREG_S(dest, i) = READ8(rsp, ea) << 7;
11551153      ea += 4;
11561154   }
1157
1158#if USE_SIMD
1159   __m128i neg1 = _mm_set_epi16(0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff);
1160   __m128i keep_mask = _mm_set_epi16(mask[0], mask[1], mask[2], mask[3], mask[4], mask[5], mask[6], mask[7]);
1161   __m128i load_val = _mm_set_epi16(val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
1162   keep_mask = _mm_xor_si128(keep_mask, neg1);
1163   rsp->xv[dest] = _mm_and_si128(rsp->xv[dest], keep_mask);
1164   rsp->xv[dest] = _mm_or_si128(rsp->xv[dest], load_val);
1165#endif
11661155}
11671156
11681157static void cfunc_rsp_lwv(void *param)
11691158{
11701159   rsp_state *rsp = (rsp_state*)param;
11711160   UINT32 op = rsp->impstate->arg0;
1172   int i = 0;
1173   int end = 0;
1174   UINT32 ea = 0;
11751161   int dest = (op >> 16) & 0x1f;
11761162   int base = (op >> 21) & 0x1f;
11771163   int index = (op >> 7) & 0xf;
r24005r24006
11881174   // Loads the full 128-bit vector starting from vector byte index and wrapping to index 0
11891175   // after byte index 15
11901176
1191   ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
1177   UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
1178   int end = (16 - index) + 16;
11921179
1193   end = (16 - index) + 16;
1194
11951180#if USE_SIMD
1196   INT16 val[8] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
1181   UINT8 val[16];
11971182#endif
1198   for (i=(16 - index); i < end; i++)
1183   for (int i = (16 - index); i < end; i++)
11991184   {
12001185#if USE_SIMD
1201      val[i >> 1] |= READ8(rsp, ea) << ((i & 1) * 8);
1186      val[i & 0xf] = READ8(rsp, ea);
1187#else
1188      VREG_B(dest, i & 0xf) = READ8(rsp, ea);
12021189#endif
1203      VREG_B(dest, i & 0xf) = READ8(rsp, ea);
12041190      ea += 4;
12051191   }
12061192
12071193#if USE_SIMD
1208   rsp->xv[dest] = _mm_set_epi16(val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
1194   rsp->xv[dest] = _mm_set_epi8(val[15], val[14], val[13], val[12], val[11], val[10], val[ 9], val[ 8],
1195                         val[ 7], val[ 6], val[ 5], val[ 4], val[ 3], val[ 2], val[ 1], val[ 0]);
12091196#endif
12101197}
12111198
r24005r24006
12131200{
12141201   rsp_state *rsp = (rsp_state*)param;
12151202   UINT32 op = rsp->impstate->arg0;
1216   int i = 0;
1217   UINT32 ea = 0;
12181203   int dest = (op >> 16) & 0x1f;
12191204   int base = (op >> 21) & 0x1f;
12201205   int index = (op >> 7) & 0xf;
r24005r24006
12291214
12301215   // FIXME: has a small problem with odd indices
12311216
1232   int element;
12331217   int vs = dest;
12341218   int ve = dest + 8;
12351219   if (ve > 32)
r24005r24006
12371221      ve = 32;
12381222   }
12391223
1240   element = 7 - (index >> 1);
1224   int element = 7 - (index >> 1);
12411225
1242   ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
1226   UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
12431227
12441228   ea = ((ea + 8) & ~0xf) + (index & 1);
1245   for (i = vs; i < ve; i++)
1229   for (int i = vs; i < ve; i++)
12461230   {
12471231      element = ((8 - (index >> 1) + (i - vs)) << 1);
12481232#if USE_SIMD
1249      UINT16 value = (READ8(rsp, ea + 1) << 8) | READ8(rsp, ea);
1250      _mm_insert_epi16 (rsp->xv[i], value, element);
1251#endif
1233      UINT16 value = (READ8(rsp, ea) << 8) | READ8(rsp, ea + 1);
1234      SIMD_INSERT16(rsp->xv[i], value, (element >> 1));
1235#else
12521236      VREG_B(i, (element & 0xf)) = READ8(rsp, ea);
12531237      VREG_B(i, ((element + 1) & 0xf)) = READ8(rsp, ea + 1);
1238#endif
12541239
12551240      ea += 2;
12561241   }
r24005r24006
13311316{
13321317   rsp_state *rsp = (rsp_state*)param;
13331318   UINT32 op = rsp->impstate->arg0;
1334   UINT32 ea = 0;
13351319   int dest = (op >> 16) & 0x1f;
13361320   int base = (op >> 21) & 0x1f;
13371321   int index = (op >> 7) & 0xf;
r24005r24006
13481332   //
13491333   // Stores 1 byte from vector byte index
13501334
1351   ea = (base) ? rsp->r[base] + offset : offset;
1335   UINT32 ea = (base) ? rsp->r[base] + offset : offset;
1336#if USE_SIMD
1337   UINT16 value;
1338   SIMD_EXTRACT16(rsp->xv[dest], value, (index >> 1));
1339   value >>= (1-(index & 1)) * 8;
1340   WRITE8(rsp, ea, (UINT8)value);
1341#else
13521342   WRITE8(rsp, ea, VREG_B(dest, index));
1343#endif
13531344}
13541345
13551346static void cfunc_rsp_ssv(void *param)
13561347{
13571348   rsp_state *rsp = (rsp_state*)param;
13581349   UINT32 op = rsp->impstate->arg0;
1359   UINT32 ea = 0;
13601350   int dest = (op >> 16) & 0x1f;
13611351   int base = (op >> 21) & 0x1f;
13621352   int index = (op >> 7) & 0xf;
r24005r24006
13731363   //
13741364   // Stores 2 bytes starting from vector byte index
13751365
1376   ea = (base) ? rsp->r[base] + (offset * 2) : (offset * 2);
1366   UINT32 ea = (base) ? rsp->r[base] + (offset * 2) : (offset * 2);
13771367
1368#if USE_SIMD
1369   UINT16 value;
1370   SIMD_EXTRACT16(rsp->xv[dest], value, (index >> 1));
1371   WRITE8(rsp, ea, (UINT8)(value >> 8));
1372   WRITE8(rsp, ea+1, (UINT8)(value & 0x00ff));
1373#else
13781374   int end = index + 2;
1379
13801375   for (int i = index; i < end; i++)
13811376   {
13821377      WRITE8(rsp, ea, VREG_B(dest, i));
13831378      ea++;
13841379   }
1380#endif
13851381}
13861382
13871383static void cfunc_rsp_slv(void *param)
13881384{
13891385   rsp_state *rsp = (rsp_state*)param;
13901386   UINT32 op = rsp->impstate->arg0;
1391   UINT32 ea = 0;
13921387   int dest = (op >> 16) & 0x1f;
13931388   int base = (op >> 21) & 0x1f;
13941389   int index = (op >> 7) & 0xf;
r24005r24006
14041399   //
14051400   // Stores 4 bytes starting from vector byte index
14061401
1407   ea = (base) ? rsp->r[base] + (offset * 4) : (offset * 4);
1402   UINT32 ea = (base) ? rsp->r[base] + (offset * 4) : (offset * 4);
14081403
1404#if USE_SIMD
1405   UINT16 value0, value1;
1406   index >>= 1;
1407   SIMD_EXTRACT16(rsp->xv[dest], value0, index);
1408   SIMD_EXTRACT16(rsp->xv[dest], value1, index+1);
1409   WRITE8(rsp, ea, (UINT8)(value0 >> 8));
1410   WRITE8(rsp, ea+1, (UINT8)(value0 & 0x00ff));
1411   WRITE8(rsp, ea+2, (UINT8)(value1 >> 8));
1412   WRITE8(rsp, ea+3, (UINT8)(value1 & 0x00ff));
1413#else
14091414   int end = index + 4;
1410
14111415   for (int i = index; i < end; i++)
14121416   {
14131417      WRITE8(rsp, ea, VREG_B(dest, i));
14141418      ea++;
14151419   }
1420#endif
14161421}
14171422
14181423static void cfunc_rsp_sdv(void *param)
14191424{
14201425   rsp_state *rsp = (rsp_state*)param;
14211426   UINT32 op = rsp->impstate->arg0;
1422   UINT32 ea = 0;
1423   int end = 0;
14241427   int dest = (op >> 16) & 0x1f;
14251428   int base = (op >> 21) & 0x1f;
1426   int index = (op >> 7) & 0xf;
1429   int index = (op >> 7) & 0x8;
14271430   int offset = (op & 0x7f);
14281431   if (offset & 0x40)
14291432   {
r24005r24006
14351438   // --------------------------------------------------
14361439   //
14371440   // Stores 8 bytes starting from vector byte index
1438   ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8);
1441   UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8);
14391442
1440   end = index + 8;
1441
1443#if USE_SIMD
1444   UINT16 value0, value1, value2, value3;
1445   index >>= 1;
1446   SIMD_EXTRACT16(rsp->xv[dest], value0, index);
1447   SIMD_EXTRACT16(rsp->xv[dest], value1, index+1);
1448   SIMD_EXTRACT16(rsp->xv[dest], value2, index+2);
1449   SIMD_EXTRACT16(rsp->xv[dest], value3, index+3);
1450   WRITE8(rsp, ea, (UINT8)(value0 >> 8));
1451   WRITE8(rsp, ea+1, (UINT8)(value0 & 0x00ff));
1452   WRITE8(rsp, ea+2, (UINT8)(value1 >> 8));
1453   WRITE8(rsp, ea+3, (UINT8)(value1 & 0x00ff));
1454   WRITE8(rsp, ea+4, (UINT8)(value2 >> 8));
1455   WRITE8(rsp, ea+5, (UINT8)(value2 & 0x00ff));
1456   WRITE8(rsp, ea+6, (UINT8)(value3 >> 8));
1457   WRITE8(rsp, ea+7, (UINT8)(value3 & 0x00ff));
1458#else
1459   int end = index + 8;
14421460   for (int i = index; i < end; i++)
14431461   {
14441462      WRITE8(rsp, ea, VREG_B(dest, i));
14451463      ea++;
14461464   }
1465#endif
14471466}
14481467
14491468static void cfunc_rsp_sqv(void *param)
14501469{
14511470   rsp_state *rsp = (rsp_state*)param;
14521471   UINT32 op = rsp->impstate->arg0;
1453   UINT32 ea = 0;
1454   int i = 0;
1455   int end = 0;
14561472   int dest = (op >> 16) & 0x1f;
14571473   int base = (op >> 21) & 0x1f;
14581474   int index = (op >> 7) & 0xf;
r24005r24006
14681484   //
14691485   // Stores up to 16 bytes starting from vector byte index until 16-byte boundary
14701486
1471   ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
1472
1473   end = index + (16 - (ea & 0xf));
1474
1475   for (i=index; i < end; i++)
1487   UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
1488   int end = index + (16 - (ea & 0xf));
1489   for (int i=index; i < end; i++)
14761490   {
1491#if USE_SIMD
1492      UINT16 value;
1493      SIMD_EXTRACT16(rsp->xv[dest], value, (i >> 1));
1494      value >>= (1-(i & 1)) * 8;
1495      WRITE8(rsp, ea, (UINT8)value);
1496#else
14771497      WRITE8(rsp, ea, VREG_B(dest, i & 0xf));
1498#endif
14781499      ea++;
14791500   }
14801501}
r24005r24006
15061527
15071528   for (int i = index; i < end; i++)
15081529   {
1530#if USE_SIMD
1531      UINT32 bi = (i + o) & 0xf;
1532      UINT16 value;
1533      SIMD_EXTRACT16(rsp->xv[dest], value, (bi >> 1));
1534      value >>= (1-(bi & 1)) * 8;
1535      WRITE8(rsp, ea, (UINT8)value);
1536#else
15091537      WRITE8(rsp, ea, VREG_B(dest, ((i + o) & 0xf)));
1538#endif
15101539      ea++;
15111540   }
15121541}
r24005r24006
15151544{
15161545   rsp_state *rsp = (rsp_state*)param;
15171546   UINT32 op = rsp->impstate->arg0;
1518   UINT32 ea = 0;
1519   int i = 0;
1520   int end = 0;
15211547   int dest = (op >> 16) & 0x1f;
15221548   int base = (op >> 21) & 0x1f;
15231549   int index = (op >> 7) & 0xf;
r24005r24006
15331559   //
15341560   // Stores upper 8 bits of each element
15351561
1536   ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8);
1537   end = index + 8;
1538
1539   for (i=index; i < end; i++)
1562   UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8);
1563   int end = index + 8;
1564   for (int i=index; i < end; i++)
15401565   {
15411566      if ((i & 0xf) < 8)
15421567      {
1543         WRITE8(rsp, ea, VREG_B(dest, ((i & 0xf) << 1)));
1568#if USE_SIMD
1569         UINT16 value;
1570         SIMD_EXTRACT16(rsp->xv[dest], value, i);
1571         WRITE8(rsp, ea, (UINT8)(value >> 8));
1572#else
1573         WRITE8(rsp, ea, VREG_B(dest, (i & 0xf) << 1));
1574#endif
15441575      }
15451576      else
15461577      {
1578#if USE_SIMD
1579         UINT16 value;
1580         SIMD_EXTRACT16(rsp->xv[dest], value, i);
1581         value >>= 7;
1582         WRITE8(rsp, ea, (UINT8)value);
1583#else
15471584         WRITE8(rsp, ea, VREG_S(dest, (i & 0x7)) >> 7);
1585#endif
15481586      }
15491587      ea++;
15501588   }
r24005r24006
15541592{
15551593   rsp_state *rsp = (rsp_state*)param;
15561594   UINT32 op = rsp->impstate->arg0;
1557   UINT32 ea = 0;
1558   int i = 0;
1559   int end = 0;
15601595   int dest = (op >> 16) & 0x1f;
15611596   int base = (op >> 21) & 0x1f;
15621597   int index = (op >> 7) & 0xf;
r24005r24006
15721607   //
15731608   // Stores bits 14-7 of each element
15741609
1575   ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8);
1576   end = index + 8;
1577
1578   for (i=index; i < end; i++)
1610   UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8);
1611   int end = index + 8;
1612   for (int i=index; i < end; i++)
15791613   {
15801614      if ((i & 0xf) < 8)
15811615      {
1616#if USE_SIMD
1617         UINT16 value;
1618         SIMD_EXTRACT16(rsp->xv[dest], value, i);
1619         value >>= 7;
1620         WRITE8(rsp, ea, (UINT8)value);
1621#else
15821622         WRITE8(rsp, ea, VREG_S(dest, (i & 0x7)) >> 7);
1623#endif
15831624      }
15841625      else
15851626      {
1627#if USE_SIMD
1628         UINT16 value;
1629         SIMD_EXTRACT16(rsp->xv[dest], value, i);
1630         WRITE8(rsp, ea, (UINT8)value >> 8);
1631#else
15861632         WRITE8(rsp, ea, VREG_B(dest, ((i & 0x7) << 1)));
1633#endif
15871634      }
15881635      ea++;
15891636   }
r24005r24006
15931640{
15941641   rsp_state *rsp = (rsp_state*)param;
15951642   UINT32 op = rsp->impstate->arg0;
1596   UINT32 ea = 0;
1597   int i = 0;
15981643   int dest = (op >> 16) & 0x1f;
15991644   int base = (op >> 21) & 0x1f;
16001645   int index = (op >> 7) & 0xf;
r24005r24006
16101655   //
16111656   // Stores bits 14-7 of each element, with 2-byte stride
16121657
1613   ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
1614
1615   for (i=0; i < 8; i++)
1658   UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
1659   for (int i=0; i < 8; i++)
16161660   {
1617      UINT8 d = ((VREG_B(dest, ((index + (i << 1) + 0) & 0xf))) << 1) |
1618               ((VREG_B(dest, ((index + (i << 1) + 1) & 0xf))) >> 7);
1619
1661      int element = index + (i << 1);
1662#if USE_SIMD
1663      UINT16 value;
1664      SIMD_EXTRACT16(rsp->xv[dest], value, element >> 1);
1665      WRITE8(rsp, ea, (value >> 7) & 0x00ff);
1666#else
1667      UINT8 d = (VREG_B(dest, (element & 0xf)) << 1) |
1668               (VREG_B(dest, ((element + 1) & 0xf)) >> 7);
16201669      WRITE8(rsp, ea, d);
1670#endif
16211671      ea += 2;
16221672   }
16231673}
r24005r24006
16261676{
16271677   rsp_state *rsp = (rsp_state*)param;
16281678   UINT32 op = rsp->impstate->arg0;
1629   UINT32 ea = 0;
1630   int i = 0;
1631   int end = 0;
1632   int eaoffset = 0;
16331679   int dest = (op >> 16) & 0x1f;
16341680   int base = (op >> 21) & 0x1f;
16351681   int index = (op >> 7) & 0xf;
r24005r24006
16471693
16481694   if (index & 0x7)    printf("RSP: SFV: index = %d at %08X\n", index, rsp->ppc);
16491695
1650   ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
1651
1652   eaoffset = ea & 0xf;
1696   UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
1697   int eaoffset = ea & 0xf;
16531698   ea &= ~0xf;
16541699
1655   end = (index >> 1) + 4;
1700   int end = (index >> 1) + 4;
16561701
1657   for (i=index >> 1; i < end; i++)
1702   for (int i = index>>1; i < end; i++)
16581703   {
1704#if USE_SIMD
1705      UINT16 value;
1706      SIMD_EXTRACT16(rsp->xv[dest], value, i);
1707      WRITE8(rsp, ea + (eaoffset & 0xf), (value >> 7) & 0x00ff);
1708#else
16591709      WRITE8(rsp, ea + (eaoffset & 0xf), VREG_S(dest, i) >> 7);
1710#endif
16601711      eaoffset += 4;
16611712   }
16621713}
r24005r24006
16651716{
16661717   rsp_state *rsp = (rsp_state*)param;
16671718   UINT32 op = rsp->impstate->arg0;
1668   UINT32 ea = 0;
1669   int i = 0;
1670   int end = 0;
1671   int eaoffset = 0;
16721719   int dest = (op >> 16) & 0x1f;
16731720   int base = (op >> 21) & 0x1f;
16741721   int index = (op >> 7) & 0xf;
r24005r24006
16851732   // Stores the full 128-bit vector starting from vector byte index and wrapping to index 0
16861733   // after byte index 15
16871734
1688   ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
1689
1690   eaoffset = ea & 0xf;
1735   UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
1736   int eaoffset = ea & 0xf;
16911737   ea &= ~0xf;
16921738
1693   end = index + 16;
1694
1695   for (i=index; i < end; i++)
1739   int end = index + 16;
1740   for (int i = index; i < end; i++)
16961741   {
1742#if USE_SIMD
1743      UINT16 value;
1744      SIMD_EXTRACT16(rsp->xv[dest], value, i >> 1);
1745      WRITE8(rsp, ea + (eaoffset & 0xf), (value >> ((1-(i & 1)) * 8)) & 0xff);
1746#else
16971747      WRITE8(rsp, ea + (eaoffset & 0xf), VREG_B(dest, i & 0xf));
1748#endif
16981749      eaoffset++;
16991750   }
17001751}
r24005r24006
17031754{
17041755   rsp_state *rsp = (rsp_state*)param;
17051756   UINT32 op = rsp->impstate->arg0;
1706   UINT32 ea = 0;
1707   int i = 0;
17081757   int dest = (op >> 16) & 0x1f;
17091758   int base = (op >> 21) & 0x1f;
17101759   int index = (op >> 7) & 0xf;
r24005r24006
17301779
17311780   int element = 8 - (index >> 1);
17321781
1733   ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
1734
1782   UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16);
17351783   int eaoffset = (ea & 0xf) + (element * 2);
17361784   ea &= ~0xf;
17371785
1738   for (i=vs; i < ve; i++)
1786   for (int i = vs; i < ve; i++)
17391787   {
1788#if USE_SIMD
1789      UINT16 value;
1790      SIMD_EXTRACT16(rsp->xv[dest], value, element);
1791      WRITE16(rsp, ea + (eaoffset & 0xf), value);
1792#else
17401793      WRITE16(rsp, ea + (eaoffset & 0xf), VREG_S(i, element & 0x7));
1794#endif
17411795      eaoffset += 2;
17421796      element++;
17431797   }
r24005r24006
18341888         {
18351889            if (slice == 0)
18361890            {
1891#if USE_SIMD
1892               UINT16 ret;
1893               SIMD_EXTRACT16(rsp->accum_l, ret, accum);
1894               return ret;
1895#else
18371896               return ACCUM_L(accum);
1897#endif
18381898            }
18391899            else if (slice == 1)
18401900            {
r24005r24006
18591919         {
18601920            if (slice == 0)
18611921            {
1922#if USE_SIMD
1923               UINT16 ret;
1924               SIMD_EXTRACT16(rsp->accum_l, ret, accum);
1925               return ret;
1926#else
18621927               return ACCUM_L(accum);
1928#endif
18631929            }
18641930            else
18651931            {
r24005r24006
18721938   return 0;
18731939}
18741940
#if USE_SIMD
// Eight-lane counterpart of the scalar SATURATE_ACCUM1 clamp.
//
// accum_h / accum_m hold the high and middle accumulator slices, compared here
// as signed 16-bit lanes.  'negative' and 'positive' are the clamp values that
// get broadcast into every lane that saturates.  For each lane:
//   H <  0 and (H != 0xffff or M >= 0)  -> negative
//   H >  0 or  (H == 0 and M < 0)       -> positive
//   H == 0xffff and M < 0               -> M (middle slice passes through)
//   H == 0 and M >= 0                   -> M (middle slice passes through)
//
// NOTE(review): depends on the file-scope vectors vec_zero / vec_neg1, which
// are presumably all-zero and all-ones lanes -- confirm at their definition.
__m128i SATURATE_ACCUM1(__m128i accum_h, __m128i accum_m, UINT16 negative, UINT16 positive)
{
   // Classify the high slice of every lane.
   const __m128i hi_neg    = _mm_cmplt_epi16(accum_h, vec_zero);
   const __m128i hi_pos    = _mm_cmpgt_epi16(accum_h, vec_zero);
   const __m128i hi_zero   = _mm_cmpeq_epi16(accum_h, vec_zero);
   const __m128i hi_ones   = _mm_cmpeq_epi16(accum_h, vec_neg1);
   const __m128i hi_unones = _mm_xor_si128(hi_ones, vec_neg1);

   // Classify the middle slice of every lane.
   const __m128i mid_neg    = _mm_cmplt_epi16(accum_m, vec_zero);
   const __m128i mid_nonneg = _mm_or_si128(_mm_cmpeq_epi16(accum_m, vec_zero),
                                           _mm_cmpgt_epi16(accum_m, vec_zero));

   // The three selectors below are mutually exclusive and together cover
   // every lane, so the final OR simply merges the chosen sources.
   const __m128i take_negative = _mm_and_si128(hi_neg, _mm_or_si128(hi_unones, mid_nonneg));
   const __m128i take_positive = _mm_or_si128(hi_pos, _mm_and_si128(hi_zero, mid_neg));
   const __m128i take_middle   = _mm_or_si128(_mm_and_si128(hi_zero, mid_nonneg),
                                              _mm_and_si128(hi_ones, mid_neg));

   const __m128i from_middle   = _mm_and_si128(accum_m, take_middle);
   const __m128i from_negative = _mm_and_si128(_mm_set1_epi16(negative), take_negative);
   const __m128i from_positive = _mm_and_si128(_mm_set1_epi16(positive), take_positive);

   return _mm_or_si128(_mm_or_si128(from_middle, from_negative), from_positive);
}
#endif
1974
18751975INLINE UINT16 SATURATE_ACCUM1(rsp_state *rsp, int accum, UINT16 negative, UINT16 positive)
18761976{
1977   // Return negative if H<0 && (H!=0xffff || M >= 0)
1978   // Return positive if H>0 || (H==0 && M<0)
1979   // Return medium slice if H==0xffff && M<0
1980   // Return medium slice if H==0 && M>=0
18771981   if ((INT16)ACCUM_H(accum) < 0)
18781982   {
18791983      if ((UINT16)(ACCUM_H(accum)) != 0xffff)
r24005r24006
19142018   return 0;
19152019}
19162020
2021INLINE UINT16 C_SATURATE_ACCUM1(UINT16 *h, UINT16 *m, int accum, UINT16 negative, UINT16 positive)
2022{
2023   // Return negative if H<0 && (H!=0xffff || M >= 0)
2024   // Return positive if H>0 || (H==0 && M<0)
2025   // Return medium slice if H==0xffff && M<0
2026   // Return medium slice if H==0 && M>=0
2027   if ((INT16)h[accum] < 0)
2028   {
2029      if ((UINT16)h[accum] != 0xffff)
2030      {
2031         return negative;
2032      }
2033      else
2034      {
2035         if ((INT16)m[accum] >= 0)
2036         {
2037            return negative;
2038         }
2039         else
2040         {
2041            return m[accum];
2042         }
2043      }
2044   }
2045   else
2046   {
2047      if ((UINT16)h[accum] != 0)
2048      {
2049         return positive;
2050      }
2051      else
2052      {
2053         if ((INT16)m[accum] < 0)
2054         {
2055            return positive;
2056         }
2057         else
2058         {
2059            return m[accum];
2060         }
2061      }
2062   }
2063
2064   return 0;
2065}
2066
2067#if USE_SIMD
// Copies the eight 16-bit results from the caller's local vres[] into the
// destination vector register rsp->xv[VDREG], lane by lane.
// SIMD_INSERT16 expands to more than one statement, so its use inside the
// loop must stay wrapped in braces.
#define WRITEBACK_RESULT() { \
      for (int wb_lane = 0; wb_lane < 8; wb_lane++) \
      { \
         SIMD_INSERT16(rsp->xv[VDREG], vres[wb_lane], wb_lane); \
      } \
}
2078#else
2079#define WRITEBACK_RESULT() { \
19182080      W_VREG_S(VDREG, 0) = vres[0];   \
19192081      W_VREG_S(VDREG, 1) = vres[1];   \
19202082      W_VREG_S(VDREG, 2) = vres[2];   \
r24005r24006
19242086      W_VREG_S(VDREG, 6) = vres[6];   \
19252087      W_VREG_S(VDREG, 7) = vres[7];   \
19262088}
2089#endif
19272090
19282091INLINE void cfunc_rsp_vmulf(void *param)
19292092{
19302093   rsp_state *rsp = (rsp_state*)param;
19312094   int op = rsp->impstate->arg0;
1932   INT16 vres[8] = { 0 };
19332095   //int i;
19342096   // 31       25  24     20      15      10      5        0
19352097   // ------------------------------------------------------
r24005r24006
19382100   //
19392101   // Multiplies signed integer by signed integer * 2
19402102
1941   int sel;
1942   INT32 s1, s2;
1943   INT64 r;
2103   INT16 vres[8] = { 0 };
19442104   for (int i = 0; i < 8; i++)
19452105   {
1946      sel = VEC_EL_2(EL, i);
1947      s1 = (INT32)(INT16)VREG_S(VS1REG, i);
1948      s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
2106#if USE_SIMD
2107      UINT16 w1, w2;
2108      SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i);
2109      SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i));
2110      INT32 s1 = (INT32)(INT16)w1;
2111      INT32 s2 = (INT32)(INT16)w2;
2112#else
2113      INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
2114      INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
2115#endif
19492116      if (s1 == -32768 && s2 == -32768)
19502117      {
19512118         // overflow
19522119         ACCUM_H(i) = 0;
19532120         ACCUM_M(i) = -32768;
2121#if USE_SIMD
2122         SIMD_INSERT16(rsp->accum_l, -32768, i);
2123#else
19542124         ACCUM_L(i) = -32768;
2125#endif
19552126         vres[i] = 0x7fff;
19562127      }
19572128      else
19582129      {
1959         r =  s1 * s2 * 2;
2130         INT64 r =  s1 * s2 * 2;
19602131         r += 0x8000;    // rounding ?
19612132         ACCUM_H(i) = (r < 0) ? 0xffff : 0;      // sign-extend to 48-bit
19622133         ACCUM_M(i) = (INT16)(r >> 16);
1963         ACCUM_L(i) = (UINT16)(r);
2134#if USE_SIMD
2135         SIMD_INSERT16(rsp->accum_l, (UINT16)(r), i);
2136#else
2137         ACCUM_L(i) = (UINT16)r;
2138#endif
19642139         vres[i] = ACCUM_M(i);
19652140      }
19662141   }
r24005r24006
19712146{
19722147   rsp_state *rsp = (rsp_state*)param;
19732148   int op = rsp->impstate->arg0;
1974   INT16 vres[8];
1975   int i;
19762149   // 31       25  24     20      15      10      5        0
19772150   // ------------------------------------------------------
19782151   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000001 |
19792152   // ------------------------------------------------------
19802153   //
19812154
1982   int sel;
1983   INT32 s1, s2;
1984   INT64 r;
1985   for (i=0; i < 8; i++)
2155   INT16 vres[8];
2156   for (int i = 0; i < 8; i++)
19862157   {
1987      sel = VEC_EL_2(EL, i);
1988      s1 = (INT32)(INT16)VREG_S(VS1REG, i);
1989      s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
1990      r = s1 * s2 * 2;
2158#if USE_SIMD
2159      UINT16 w1, w2;
2160      SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i);
2161      SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i));
2162      INT32 s1 = (INT32)(INT16)w1;
2163      INT32 s2 = (INT32)(INT16)w2;
2164#else
2165      INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
2166      INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
2167#endif
2168      INT64 r = s1 * s2 * 2;
19912169      r += 0x8000;    // rounding ?
19922170
19932171      ACCUM_H(i) = (UINT16)(r >> 32);
19942172      ACCUM_M(i) = (UINT16)(r >> 16);
2173#if USE_SIMD
2174      SIMD_INSERT16(rsp->accum_l, (UINT16)(r), i);
2175#else
19952176      ACCUM_L(i) = (UINT16)(r);
2177#endif
19962178
19972179      if (r < 0)
19982180      {
r24005r24006
20242206   // The result is added into accumulator
20252207   // The middle slice of accumulator is stored into destination element
20262208
2027   int sel;
2028   UINT32 s1, s2;
2029   UINT32 r;
20302209   for (int i = 0; i < 8; i++)
20312210   {
2032      sel = VEC_EL_2(EL, i);
2033      s1 = (UINT32)(UINT16)VREG_S(VS1REG, i);
2034      s2 = (UINT32)(UINT16)VREG_S(VS2REG, sel);
2035      r = s1 * s2;
2211#if USE_SIMD
2212      UINT16 w1, w2;
2213      SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i);
2214      SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i));
2215      UINT32 s1 = (UINT32)w1;
2216      UINT32 s2 = (UINT32)w2;
2217#else
2218      UINT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, i);
2219      UINT32 s2 = (UINT32)(UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
2220#endif
2221      UINT32 r = s1 * s2;
20362222
20372223      ACCUM_H(i) = 0;
20382224      ACCUM_M(i) = 0;
2225#if USE_SIMD
2226      SIMD_INSERT16(rsp->accum_l, (UINT16)(r >> 16), i);
2227#else
20392228      ACCUM_L(i) = (UINT16)(r >> 16);
2229#endif
20402230
2041      vres[i] = ACCUM_L(i);
2231      vres[i] = (UINT16)(r >> 16);
20422232   }
20432233   WRITEBACK_RESULT();
20442234}
r24005r24006
20582248   // The result is stored into accumulator
20592249   // The middle slice of accumulator is stored into destination element
20602250
2061   int sel;
2062   INT32 s1, s2;
2063   INT32 r;
20642251   for (int i = 0; i < 8; i++)
20652252   {
2066      sel = VEC_EL_2(EL, i);
2067      s1 = (INT32)(INT16)VREG_S(VS1REG, i);
2068      s2 = (UINT16)VREG_S(VS2REG, sel);   // not sign-extended
2069      r =  s1 * s2;
2253#if USE_SIMD
2254      UINT16 w1, w2;
2255      SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i);
2256      SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i));
2257      INT32 s1 = (INT32)(INT16)w1;
2258      INT32 s2 = w2;
2259#else
2260      INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
2261      INT32 s2 = (UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i));   // not sign-extended
2262#endif
2263      INT32 r =  s1 * s2;
20702264
20712265      ACCUM_H(i) = (r < 0) ? 0xffff : 0;      // sign-extend to 48-bit
20722266      ACCUM_M(i) = (INT16)(r >> 16);
2267#if USE_SIMD
2268      SIMD_INSERT16(rsp->accum_l, (UINT16)(r), i);
2269#else
20732270      ACCUM_L(i) = (UINT16)(r);
2271#endif
20742272
20752273      vres[i] = ACCUM_M(i);
20762274   }
r24005r24006
20812279{
20822280   rsp_state *rsp = (rsp_state*)param;
20832281   int op = rsp->impstate->arg0;
2084   INT16 vres[8] = { 0 };
20852282
20862283   // 31       25  24     20      15      10      5        0
20872284   // ------------------------------------------------------
r24005r24006
20922289   // The result is stored into accumulator
20932290   // The low slice of accumulator is stored into destination element
20942291
2095   int sel;
2096   INT32 s1, s2;
2097   INT32 r;
2292   INT16 vres[8] = { 0 };
20982293   for (int i = 0; i < 8; i++)
20992294   {
2100      sel = VEC_EL_2(EL, i);
2101      s1 = (UINT16)VREG_S(VS1REG, i);     // not sign-extended
2102      s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
2103      r = s1 * s2;
2295#if USE_SIMD
2296      UINT16 w1, w2;
2297      SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i);
2298      SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i));
2299      INT32 s1 = w1;
2300      INT32 s2 = (INT32)(INT16)w2;
2301#else
2302      INT32 s1 = (UINT16)VREG_S(VS1REG, i);     // not sign-extended
2303      INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
2304#endif
2305      INT32 r = s1 * s2;
21042306
21052307      ACCUM_H(i) = (r < 0) ? 0xffff : 0;      // sign-extend to 48-bit
21062308      ACCUM_M(i) = (INT16)(r >> 16);
2309#if USE_SIMD
2310      SIMD_INSERT16(rsp->accum_l, (UINT16)(r), i);
2311#else
21072312      ACCUM_L(i) = (UINT16)(r);
2313#endif
21082314
2109      vres[i] = ACCUM_L(i);
2315      vres[i] = (UINT16)(r);
21102316   }
21112317   WRITEBACK_RESULT();
21122318}
r24005r24006
21152321{
21162322   rsp_state *rsp = (rsp_state*)param;
21172323   int op = rsp->impstate->arg0;
2118   INT16 vres[8];
2119   int i;
21202324   // 31       25  24     20      15      10      5        0
21212325   // ------------------------------------------------------
21222326   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000111 |
r24005r24006
21262330   // The result is stored into highest 32 bits of accumulator, the low slice is zero
21272331   // The highest 32 bits of accumulator is saturated into destination element
21282332
2129   int sel;
2130   INT32 s1, s2;
2131   INT32 r;
2132   for (i=0; i < 8; i++)
2333   INT16 vres[8];
2334   for (int i = 0; i < 8; i++)
21332335   {
2134      sel = VEC_EL_2(EL, i);
2135      s1 = (INT32)(INT16)VREG_S(VS1REG, i);
2136      s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
2137      r = s1 * s2;
2336#if USE_SIMD
2337      UINT16 w1, w2;
2338      SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i);
2339      SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i));
2340      INT32 s1 = (INT32)(INT16)w1;
2341      INT32 s2 = (INT32)(INT16)w2;
2342#else
2343      INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
2344      INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
2345#endif
2346      INT32 r = s1 * s2;
21382347
21392348      ACCUM_H(i) = (INT16)(r >> 16);
21402349      ACCUM_M(i) = (UINT16)(r);
2350#if USE_SIMD
2351      SIMD_INSERT16(rsp->accum_l, 0, i);
2352#else
21412353      ACCUM_L(i) = 0;
2354#endif
21422355
21432356      if (r < -32768) r = -32768;
21442357      if (r >  32767) r = 32767;
r24005r24006
21512364{
21522365   rsp_state *rsp = (rsp_state*)param;
21532366   int op = rsp->impstate->arg0;
2367
21542368   INT16 vres[8];
2155
2156   int sel;
2157   INT32 s1, s2;
2158   INT32 r;
2159   UINT16 res;
21602369   for (int i = 0; i < 8; i++)
21612370   {
2162      sel = VEC_EL_2(EL, i);
2163      s1 = (INT32)(INT16)VREG_S(VS1REG, i);
2164      s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
2165      r = s1 * s2;
2371#if USE_SIMD
2372      UINT16 w1, w2;
2373      SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i);
2374      SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i));
2375      INT32 s1 = (INT32)(INT16)w1;
2376      INT32 s2 = (INT32)(INT16)w2;
2377#else
2378      INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
2379      INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
2380#endif
2381      INT32 r = s1 * s2;
21662382
2383#if USE_SIMD
2384      UINT64 q = (UINT64)ACCUM(i) & 0xffffffff0000ffffL;
2385      UINT16 accl;
2386      SIMD_EXTRACT16(rsp->accum_l, accl, i);
2387      q |= (UINT64)((UINT32)accl << 16);
2388      q += (INT64)(r) << 17;
2389      ACCUM(i) = q;
2390      SIMD_INSERT16(rsp->accum_l, (UINT16)(q >> 16), i);
2391#else
21672392      ACCUM(i) += (INT64)(r) << 17;
2168      res = SATURATE_ACCUM(rsp, i, 1, 0x8000, 0x7fff);
2393#endif
21692394
2170      vres[i] = res;
2395      vres[i] = SATURATE_ACCUM(rsp, i, 1, 0x8000, 0x7fff);
21712396   }
21722397   WRITEBACK_RESULT();
21732398}
r24005r24006
21762401{
21772402   rsp_state *rsp = (rsp_state*)param;
21782403   int op = rsp->impstate->arg0;
2179   INT16 vres[8];
2180   int i;
2404
21812405   // 31       25  24     20      15      10      5        0
21822406   // ------------------------------------------------------
21832407   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001001 |
21842408   // ------------------------------------------------------
21852409   //
21862410
2187   UINT16 res;
2188   int sel;
2189   INT32 s1, s2, r1;
2190   UINT32 r2, r3;
2191   for (i = 0; i < 8; i++)
2411   INT16 vres[8];
2412   for (int i = 0; i < 8; i++)
21922413   {
2193      sel = VEC_EL_2(EL, i);
2194      s1 = (INT32)(INT16)VREG_S(VS1REG, i);
2195      s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
2196      r1 = s1 * s2;
2197      r2 = (UINT16)ACCUM_L(i) + ((UINT16)(r1) * 2);
2198      r3 = (UINT16)ACCUM_M(i) + (UINT16)((r1 >> 16) * 2) + (UINT16)(r2 >> 16);
2414#if USE_SIMD
2415      UINT16 w1, w2;
2416      SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i);
2417      SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i));
2418      INT32 s1 = (INT32)(INT16)w1;
2419      INT32 s2 = (INT32)(INT16)w2;
2420#else
2421      INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
2422      INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
2423#endif
2424      INT32 r1 = s1 * s2;
2425#if USE_SIMD
2426      UINT16 accl;
2427      SIMD_EXTRACT16(rsp->accum_l, accl, i);
2428      UINT32 r2 = accl + ((UINT16)(r1) * 2);
2429#else
2430      UINT32 r2 = (UINT16)ACCUM_L(i) + ((UINT16)(r1) * 2);
2431#endif
2432      UINT32 r3 = (UINT16)ACCUM_M(i) + (UINT16)((r1 >> 16) * 2) + (UINT16)(r2 >> 16);
21992433
2434#if USE_SIMD
2435      SIMD_INSERT16(rsp->accum_l, (UINT16)(r2), i);
2436#else
22002437      ACCUM_L(i) = (UINT16)(r2);
2438#endif
22012439      ACCUM_M(i) = (UINT16)(r3);
22022440      ACCUM_H(i) += (UINT16)(r3 >> 16) + (UINT16)(r1 >> 31);
22032441
22042442      //res = SATURATE_ACCUM(i, 1, 0x0000, 0xffff);
22052443      if ((INT16)ACCUM_H(i) < 0)
22062444      {
2207         res = 0;
2445         vres[i] = 0;
22082446      }
22092447      else
22102448      {
22112449         if (ACCUM_H(i) != 0)
22122450         {
2213            res = 0xffff;
2451            vres[i] = 0xffff;
22142452         }
22152453         else
22162454         {
22172455            if ((INT16)ACCUM_M(i) < 0)
22182456            {
2219               res = 0xffff;
2457               vres[i] = 0xffff;
22202458            }
22212459            else
22222460            {
2223               res = ACCUM_M(i);
2461               vres[i] = ACCUM_M(i);
22242462            }
22252463         }
22262464      }
2227
2228      vres[i] = res;
22292465   }
22302466   WRITEBACK_RESULT();
22312467}
r24005r24006
22342470{
22352471   rsp_state *rsp = (rsp_state*)param;
22362472   int op = rsp->impstate->arg0;
2237   INT16 vres[8];
2238   int i;
2473
22392474   // 31       25  24     20      15      10      5        0
22402475   // ------------------------------------------------------
22412476   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001100 |
r24005r24006
22452480   // Adds the higher 16 bits of the 32-bit result to accumulator
22462481   // The low slice of accumulator is stored into destination element
22472482
2248   UINT16 res;
2249   int sel;
2250   UINT32 s1, s2, r1;
2251   UINT32 r2, r3;
2252   for (i = 0; i < 8; i++)
2483   INT16 vres[8];
2484   for (int i = 0; i < 8; i++)
22532485   {
2254      sel = VEC_EL_2(EL, i);
2255      s1 = (UINT32)(UINT16)VREG_S(VS1REG, i);
2256      s2 = (UINT32)(UINT16)VREG_S(VS2REG, sel);
2257      r1 = s1 * s2;
2258      r2 = (UINT16)ACCUM_L(i) + (r1 >> 16);
2259      r3 = (UINT16)ACCUM_M(i) + (r2 >> 16);
2486#if USE_SIMD
2487      UINT16 w1, w2;
2488      SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i);
2489      SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i));
2490      UINT32 s1 = w1;
2491      UINT32 s2 = w2;
2492#else
2493      UINT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, i);
2494      UINT32 s2 = (UINT32)(UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
2495#endif
2496      UINT32 r1 = s1 * s2;
2497#if USE_SIMD
2498      UINT16 accl;
2499      SIMD_EXTRACT16(rsp->accum_l, accl, i);
2500      UINT32 r2 = accl + (r1 >> 16);
2501#else
2502      UINT32 r2 = (UINT16)ACCUM_L(i) + (r1 >> 16);
2503#endif
2504      UINT32 r3 = (UINT16)ACCUM_M(i) + (r2 >> 16);
22602505
2506#if USE_SIMD
2507      SIMD_INSERT16(rsp->accum_l, (UINT16)(r2), i);
2508#else
22612509      ACCUM_L(i) = (UINT16)(r2);
2510#endif
22622511      ACCUM_M(i) = (UINT16)(r3);
22632512      ACCUM_H(i) += (INT16)(r3 >> 16);
22642513
2265      res = SATURATE_ACCUM(rsp, i, 0, 0x0000, 0xffff);
2266
2267      vres[i] = res;
2514      vres[i] = SATURATE_ACCUM(rsp, i, 0, 0x0000, 0xffff);
22682515   }
22692516   WRITEBACK_RESULT();
22702517}
r24005r24006
22732520{
22742521   rsp_state *rsp = (rsp_state*)param;
22752522   int op = rsp->impstate->arg0;
2523
22762524   INT16 vres[8];
2277
2278   UINT16 res;
2279   int sel;
2280   UINT32 s1, s2, r1;
2281   UINT32 r2, r3;
22822525   for (int i = 0; i < 8; i++)
22832526   {
2284      sel = VEC_EL_2(EL, i);
2285      s1 = (INT32)(INT16)VREG_S(VS1REG, i);
2286      s2 = (UINT16)VREG_S(VS2REG, sel);   // not sign-extended
2287      r1 = s1 * s2;
2288      r2 = (UINT16)ACCUM_L(i) + (UINT16)(r1);
2289      r3 = (UINT16)ACCUM_M(i) + (r1 >> 16) + (r2 >> 16);
2527#if USE_SIMD
2528      UINT16 w1, w2;
2529      SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i);
2530      SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i));
2531      UINT32 s1 = (INT32)(INT16)w1;
2532      UINT32 s2 = w2;
2533#else
2534      UINT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
2535      UINT32 s2 = (UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i));   // not sign-extended
2536#endif
2537      UINT32 r1 = s1 * s2;
2538#if USE_SIMD
2539      UINT16 accl;
2540      SIMD_EXTRACT16(rsp->accum_l, accl, i);
2541      UINT32 r2 = accl + (UINT16)(r1);
2542#else
2543      UINT32 r2 = (UINT16)ACCUM_L(i) + (UINT16)(r1);
2544#endif
2545      UINT32 r3 = (UINT16)ACCUM_M(i) + (r1 >> 16) + (r2 >> 16);
22902546
2547#if USE_SIMD
2548      SIMD_INSERT16(rsp->accum_l, (UINT16)(r2), i);
2549#else
22912550      ACCUM_L(i) = (UINT16)(r2);
2551#endif
22922552      ACCUM_M(i) = (UINT16)(r3);
22932553      ACCUM_H(i) += (UINT16)(r3 >> 16);
22942554      if ((INT32)(r1) < 0)
22952555         ACCUM_H(i) -= 1;
22962556
2297      res = SATURATE_ACCUM(rsp, i, 1, 0x8000, 0x7fff);
2298
2299      vres[i] = res;
2557      vres[i] = SATURATE_ACCUM(rsp, i, 1, 0x8000, 0x7fff);
23002558   }
23012559   WRITEBACK_RESULT();
23022560}
r24005r24006
23052563{
23062564   rsp_state *rsp = (rsp_state*)param;
23072565   int op = rsp->impstate->arg0;
2566
23082567   INT16 vres[8];
2309
2310   INT32 s1, s2;
2311   UINT16 res;
2312   int sel;
23132568   for (int i = 0; i < 8; i++)
23142569   {
2315      sel = VEC_EL_2(EL, i);
2316      s1 = (UINT16)VREG_S(VS1REG, i);     // not sign-extended
2317      s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
2570#if USE_SIMD
2571      UINT16 w1, w2;
2572      SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i);
2573      SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i));
2574      INT32 s1 = w1;
2575      INT32 s2 = (INT32)(INT16)w2;
2576#else
2577      INT32 s1 = (UINT16)VREG_S(VS1REG, i);     // not sign-extended
2578      INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
2579#endif
23182580
2319      ACCUM(i) += (INT64)(s1*s2)<<16;
2581#if USE_SIMD
2582      UINT64 q = (UINT64)ACCUM(i) & 0xffffffff0000ffffL;
2583      UINT16 accl;
2584      SIMD_EXTRACT16(rsp->accum_l, accl, i);
2585      q |= (UINT64)((UINT32)accl << 16);
2586      q += (INT64)(s1*s2) << 16;
2587      ACCUM(i) = q;
2588      SIMD_INSERT16(rsp->accum_l, (UINT16)(q >> 16), i);
2589#else
2590      ACCUM(i) += (INT64)(s1*s2) << 16;
2591#endif
23202592
2321      res = SATURATE_ACCUM(rsp, i, 0, 0x0000, 0xffff);
2322      vres[i] = res;
2593      vres[i] = SATURATE_ACCUM(rsp, i, 0, 0x0000, 0xffff);
23232594   }
23242595   WRITEBACK_RESULT();
23252596}
r24005r24006
23282599{
23292600   rsp_state *rsp = (rsp_state*)param;
23302601   int op = rsp->impstate->arg0;
2331   INT16 vres[8];
23322602   // 31       25  24     20      15      10      5        0
23332603   // ------------------------------------------------------
23342604   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001111 |
r24005r24006
23382608   // The result is added into highest 32 bits of accumulator, the low slice is zero
23392609   // The highest 32 bits of accumulator is saturated into destination element
23402610
2341   UINT16 res;
2342   int sel;
2343   INT32 s1, s2;
2611#if 0
2612   UINT16 caccumh[8], caccumm[8], vs1[8], vs2[8];
23442613   for (int i = 0; i < 8; i++)
23452614   {
2346      sel = VEC_EL_2(EL, i);
2347      s1 = (INT32)(INT16)VREG_S(VS1REG, i);
2348      s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
2615      caccumh[i] = ACCUM_H(i);
2616      caccumm[i] = ACCUM_M(i);
2617      SIMD_EXTRACT16(rsp->xv[VS1REG], vs1[i], i);
2618      SIMD_EXTRACT16(rsp->xv[VS2REG], vs2[i], i);
2619      printf("%04x%04x\n", (UINT16)caccumh[i], (UINT16)caccumm[i]);
2620   }
2621#endif
23492622
2623#if USE_SIMD
2624   __m128i vec7531 = _mm_and_si128(rsp->xv[VS1REG], vec_himask);
2625   __m128i vec6420 = _mm_slli_epi32(rsp->xv[VS1REG], 16);
2626   __m128i shuf2 = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
2627
2628   __m128i shuf7531 = _mm_and_si128(shuf2, vec_himask);
2629   __m128i shuf6420 = _mm_slli_epi32(shuf2, 16);
2630
2631   __m128i upper7531 = _mm_mulhi_epi16(vec7531, shuf7531);
2632   __m128i lower7531 = _mm_srli_epi32(_mm_mullo_epi16(vec7531, shuf7531), 16);
2633   __m128i prod7531 = _mm_or_si128(upper7531, lower7531);
2634
2635   __m128i upper6420 = _mm_mulhi_epi16(vec6420, shuf6420);
2636   __m128i lower6420 = _mm_srli_epi32(_mm_mullo_epi16(vec6420, shuf6420), 16);
2637   __m128i prod6420 = _mm_or_si128(upper6420, lower6420);
2638
2639#if 0
2640   UINT16 svs1[8], svs2[8];
2641   svs1[0] = _mm_extract_epi16(rsp->xv[VS1REG], 7);
2642   svs1[1] = _mm_extract_epi16(rsp->xv[VS1REG], 6);
2643   svs1[2] = _mm_extract_epi16(rsp->xv[VS1REG], 5);
2644   svs1[3] = _mm_extract_epi16(rsp->xv[VS1REG], 4);
2645   svs1[4] = _mm_extract_epi16(rsp->xv[VS1REG], 3);
2646   svs1[5] = _mm_extract_epi16(rsp->xv[VS1REG], 2);
2647   svs1[6] = _mm_extract_epi16(rsp->xv[VS1REG], 1);
2648   svs1[7] = _mm_extract_epi16(rsp->xv[VS1REG], 0);
2649   svs2[0] = _mm_extract_epi16(rsp->xv[VS2REG], 7);
2650   svs2[1] = _mm_extract_epi16(rsp->xv[VS2REG], 6);
2651   svs2[2] = _mm_extract_epi16(rsp->xv[VS2REG], 5);
2652   svs2[3] = _mm_extract_epi16(rsp->xv[VS2REG], 4);
2653   svs2[4] = _mm_extract_epi16(rsp->xv[VS2REG], 3);
2654   svs2[5] = _mm_extract_epi16(rsp->xv[VS2REG], 2);
2655   svs2[6] = _mm_extract_epi16(rsp->xv[VS2REG], 1);
2656   svs2[7] = _mm_extract_epi16(rsp->xv[VS2REG], 0);
2657
2658   printf("%d\n", EL);
2659
2660   UINT16 vecs[16];
2661   vecs[0] = _mm_extract_epi16(vec7531, 0);
2662   vecs[1] = _mm_extract_epi16(vec7531, 1);
2663   vecs[2] = _mm_extract_epi16(vec7531, 2);
2664   vecs[3] = _mm_extract_epi16(vec7531, 3);
2665   vecs[4] = _mm_extract_epi16(vec7531, 4);
2666   vecs[5] = _mm_extract_epi16(vec7531, 5);
2667   vecs[6] = _mm_extract_epi16(vec7531, 6);
2668   vecs[7] = _mm_extract_epi16(vec7531, 7);
2669   vecs[8] = _mm_extract_epi16(vec6420, 0);
2670   vecs[9] = _mm_extract_epi16(vec6420, 1);
2671   vecs[10] = _mm_extract_epi16(vec6420, 2);
2672   vecs[11] = _mm_extract_epi16(vec6420, 3);
2673   vecs[12] = _mm_extract_epi16(vec6420, 4);
2674   vecs[13] = _mm_extract_epi16(vec6420, 5);
2675   vecs[14] = _mm_extract_epi16(vec6420, 6);
2676   vecs[15] = _mm_extract_epi16(vec6420, 7);
2677   printf("VS1 %04x%04x %04x%04x %04x%04x %04x%04x\n", vs1[0], vs1[1], vs1[2], vs1[3], vs1[4], vs1[5], vs1[6], vs1[7]);
2678   printf("VS2 %04x%04x %04x%04x %04x%04x %04x%04x\n", vs2[0], vs2[1], vs2[2], vs2[3], vs2[4], vs2[5], vs2[6], vs2[7]);
2679   printf("Vec %04x%04x %04x%04x %04x%04x %04x%04x\n", vecs[0], vecs[1], vecs[2], vecs[3], vecs[4], vecs[5], vecs[6], vecs[7]);
2680   printf("Vec %04x%04x %04x%04x %04x%04x %04x%04x\n", vecs[8], vecs[9], vecs[10], vecs[11], vecs[12], vecs[13], vecs[14], vecs[15]);
2681
2682   UINT16 shufs[16];
2683   shufs[0] = _mm_extract_epi16(shuf7531, 0);
2684   shufs[1] = _mm_extract_epi16(shuf7531, 1);
2685   shufs[2] = _mm_extract_epi16(shuf7531, 2);
2686   shufs[3] = _mm_extract_epi16(shuf7531, 3);
2687   shufs[4] = _mm_extract_epi16(shuf7531, 4);
2688   shufs[5] = _mm_extract_epi16(shuf7531, 5);
2689   shufs[6] = _mm_extract_epi16(shuf7531, 6);
2690   shufs[7] = _mm_extract_epi16(shuf7531, 7);
2691   shufs[8] = _mm_extract_epi16(shuf6420, 0);
2692   shufs[9] = _mm_extract_epi16(shuf6420, 1);
2693   shufs[10] = _mm_extract_epi16(shuf6420, 2);
2694   shufs[11] = _mm_extract_epi16(shuf6420, 3);
2695   shufs[12] = _mm_extract_epi16(shuf6420, 4);
2696   shufs[13] = _mm_extract_epi16(shuf6420, 5);
2697   shufs[14] = _mm_extract_epi16(shuf6420, 6);
2698   shufs[15] = _mm_extract_epi16(shuf6420, 7);
2699   printf("Shf %04x%04x %04x%04x %04x%04x %04x%04x\n", shufs[0], shufs[1], shufs[2], shufs[3], shufs[4], shufs[5], shufs[6], shufs[7]);
2700   printf("Shf %04x%04x %04x%04x %04x%04x %04x%04x\n", shufs[8], shufs[9], shufs[10], shufs[11], shufs[12], shufs[13], shufs[14], shufs[15]);
2701
2702   UINT16 uppers[16];
2703   uppers[0] = _mm_extract_epi16(upper7531, 0);
2704   uppers[1] = _mm_extract_epi16(upper7531, 1);
2705   uppers[2] = _mm_extract_epi16(upper7531, 2);
2706   uppers[3] = _mm_extract_epi16(upper7531, 3);
2707   uppers[4] = _mm_extract_epi16(upper7531, 4);
2708   uppers[5] = _mm_extract_epi16(upper7531, 5);
2709   uppers[6] = _mm_extract_epi16(upper7531, 6);
2710   uppers[7] = _mm_extract_epi16(upper7531, 7);
2711   uppers[8] = _mm_extract_epi16(upper6420, 0);
2712   uppers[9] = _mm_extract_epi16(upper6420, 1);
2713   uppers[10] = _mm_extract_epi16(upper6420, 2);
2714   uppers[11] = _mm_extract_epi16(upper6420, 3);
2715   uppers[12] = _mm_extract_epi16(upper6420, 4);
2716   uppers[13] = _mm_extract_epi16(upper6420, 5);
2717   uppers[14] = _mm_extract_epi16(upper6420, 6);
2718   uppers[15] = _mm_extract_epi16(upper6420, 7);
2719   printf("Upr %04x%04x %04x%04x %04x%04x %04x%04x\n", uppers[0], uppers[1], uppers[2], uppers[3], uppers[4], uppers[5], uppers[6], uppers[7]);
2720   printf("Upr %04x%04x %04x%04x %04x%04x %04x%04x\n", uppers[8], uppers[9], uppers[10], uppers[11], uppers[12], uppers[13], uppers[14], uppers[15]);
2721
2722   UINT16 lowers[16];
2723   lowers[0] = _mm_extract_epi16(lower7531, 0);
2724   lowers[1] = _mm_extract_epi16(lower7531, 1);
2725   lowers[2] = _mm_extract_epi16(lower7531, 2);
2726   lowers[3] = _mm_extract_epi16(lower7531, 3);
2727   lowers[4] = _mm_extract_epi16(lower7531, 4);
2728   lowers[5] = _mm_extract_epi16(lower7531, 5);
2729   lowers[6] = _mm_extract_epi16(lower7531, 6);
2730   lowers[7] = _mm_extract_epi16(lower7531, 7);
2731   lowers[8] = _mm_extract_epi16(lower6420, 0);
2732   lowers[9] = _mm_extract_epi16(lower6420, 1);
2733   lowers[10] = _mm_extract_epi16(lower6420, 2);
2734   lowers[11] = _mm_extract_epi16(lower6420, 3);
2735   lowers[12] = _mm_extract_epi16(lower6420, 4);
2736   lowers[13] = _mm_extract_epi16(lower6420, 5);
2737   lowers[14] = _mm_extract_epi16(lower6420, 6);
2738   lowers[15] = _mm_extract_epi16(lower6420, 7);
2739   printf("Lwr %04x%04x %04x%04x %04x%04x %04x%04x\n", lowers[0], lowers[1], lowers[2], lowers[3], lowers[4], lowers[5], lowers[6], lowers[7]);
2740   printf("Lwr %04x%04x %04x%04x %04x%04x %04x%04x\n", lowers[8], lowers[9], lowers[10], lowers[11], lowers[12], lowers[13], lowers[14], lowers[15]);
2741
2742   UINT16 prods[16];
2743   prods[0] = _mm_extract_epi16(prod7531, 0);
2744   prods[1] = _mm_extract_epi16(prod7531, 1);
2745   prods[2] = _mm_extract_epi16(prod7531, 2);
2746   prods[3] = _mm_extract_epi16(prod7531, 3);
2747   prods[4] = _mm_extract_epi16(prod7531, 4);
2748   prods[5] = _mm_extract_epi16(prod7531, 5);
2749   prods[6] = _mm_extract_epi16(prod7531, 6);
2750   prods[7] = _mm_extract_epi16(prod7531, 7);
2751   prods[8] = _mm_extract_epi16(prod6420, 0);
2752   prods[9] = _mm_extract_epi16(prod6420, 1);
2753   prods[10] = _mm_extract_epi16(prod6420, 2);
2754   prods[11] = _mm_extract_epi16(prod6420, 3);
2755   prods[12] = _mm_extract_epi16(prod6420, 4);
2756   prods[13] = _mm_extract_epi16(prod6420, 5);
2757   prods[14] = _mm_extract_epi16(prod6420, 6);
2758   prods[15] = _mm_extract_epi16(prod6420, 7);
2759   printf("Prd %04x%04x %04x%04x %04x%04x %04x%04x\n", prods[0], prods[1], prods[2], prods[3], prods[4], prods[5], prods[6], prods[7]);
2760   printf("Prd %04x%04x %04x%04x %04x%04x %04x%04x\n", prods[8], prods[9], prods[10], prods[11], prods[12], prods[13], prods[14], prods[15]);
2761#endif
2762
2763   __m128i accum7531 = _mm_set_epi16(ACCUM_H(7), ACCUM_M(7), ACCUM_H(5), ACCUM_M(5), ACCUM_H(3), ACCUM_M(3), ACCUM_H(1), ACCUM_M(1));
2764   __m128i accum6420 = _mm_set_epi16(ACCUM_H(6), ACCUM_M(6), ACCUM_H(4), ACCUM_M(4), ACCUM_H(2), ACCUM_M(2), ACCUM_H(0), ACCUM_M(0));
2765   accum7531 = _mm_add_epi32(accum7531, prod7531);
2766   accum6420 = _mm_add_epi32(accum6420, prod6420);
2767   __m128i accum7531_m = _mm_slli_epi32(_mm_and_si128(accum7531, vec_lomask), 16);
2768   __m128i accum7531_h = _mm_and_si128(accum7531, vec_himask);
2769   __m128i accum6420_m = _mm_and_si128(accum6420, vec_lomask);
2770   __m128i accum6420_h = _mm_srli_epi32(_mm_and_si128(accum6420, vec_himask), 16);
2771   __m128i newaccum_h = _mm_or_si128(accum7531_h, accum6420_h);
2772   __m128i newaccum_m = _mm_or_si128(accum7531_m, accum6420_m);
2773#if 0
2774   UINT16 accums[16];
2775   accums[0] = _mm_extract_epi16(newaccum_h, 0);
2776   accums[1] = _mm_extract_epi16(newaccum_h, 1);
2777   accums[2] = _mm_extract_epi16(newaccum_h, 2);
2778   accums[3] = _mm_extract_epi16(newaccum_h, 3);
2779   accums[4] = _mm_extract_epi16(newaccum_h, 4);
2780   accums[5] = _mm_extract_epi16(newaccum_h, 5);
2781   accums[6] = _mm_extract_epi16(newaccum_h, 6);
2782   accums[7] = _mm_extract_epi16(newaccum_h, 7);
2783   accums[8] = _mm_extract_epi16(newaccum_m, 0);
2784   accums[9] = _mm_extract_epi16(newaccum_m, 1);
2785   accums[10] = _mm_extract_epi16(newaccum_m, 2);
2786   accums[11] = _mm_extract_epi16(newaccum_m, 3);
2787   accums[12] = _mm_extract_epi16(newaccum_m, 4);
2788   accums[13] = _mm_extract_epi16(newaccum_m, 5);
2789   accums[14] = _mm_extract_epi16(newaccum_m, 6);
2790   accums[15] = _mm_extract_epi16(newaccum_m, 7);
2791   printf("AcH %04x%04x %04x%04x %04x%04x %04x%04x\n", accums[0], accums[1], accums[2], accums[3], accums[4], accums[5], accums[6], accums[7]);
2792   printf("AcM %04x%04x %04x%04x %04x%04x %04x%04x\n", accums[8], accums[9], accums[10], accums[11], accums[12], accums[13], accums[14], accums[15]);
2793#endif
2794
2795   __m128i result = SATURATE_ACCUM1(newaccum_h, newaccum_m, 0x8000, 0x7fff);
2796   rsp->xv[VDREG] = result;//_mm_shuffle_epi8(result, vec_shuf_inverse[0]);//SATURATE_ACCUM1(newaccum_h, newaccum_m, 0x8000, 0x7fff);
2797#if 0
2798   UINT16 vresult[8];
2799   vresult[0] = _mm_extract_epi16(result, 0);
2800   vresult[1] = _mm_extract_epi16(result, 1);
2801   vresult[2] = _mm_extract_epi16(result, 2);
2802   vresult[3] = _mm_extract_epi16(result, 3);
2803   vresult[4] = _mm_extract_epi16(result, 4);
2804   vresult[5] = _mm_extract_epi16(result, 5);
2805   vresult[6] = _mm_extract_epi16(result, 6);
2806   vresult[7] = _mm_extract_epi16(result, 7);
2807   printf("%04x %04x %04x %04x %04x %04x %04x %04x\n\n", vresult[0], vresult[1], vresult[2], vresult[3], vresult[4], vresult[5], vresult[6], vresult[7]);
2808#endif
2809   ACCUM_H(0) = _mm_extract_epi16(newaccum_h, 0);
2810   ACCUM_H(1) = _mm_extract_epi16(newaccum_h, 1);
2811   ACCUM_H(2) = _mm_extract_epi16(newaccum_h, 2);
2812   ACCUM_H(3) = _mm_extract_epi16(newaccum_h, 3);
2813   ACCUM_H(4) = _mm_extract_epi16(newaccum_h, 4);
2814   ACCUM_H(5) = _mm_extract_epi16(newaccum_h, 5);
2815   ACCUM_H(6) = _mm_extract_epi16(newaccum_h, 6);
2816   ACCUM_H(7) = _mm_extract_epi16(newaccum_h, 7);
2817   ACCUM_M(0) = _mm_extract_epi16(newaccum_m, 0);
2818   ACCUM_M(1) = _mm_extract_epi16(newaccum_m, 1);
2819   ACCUM_M(2) = _mm_extract_epi16(newaccum_m, 2);
2820   ACCUM_M(3) = _mm_extract_epi16(newaccum_m, 3);
2821   ACCUM_M(4) = _mm_extract_epi16(newaccum_m, 4);
2822   ACCUM_M(5) = _mm_extract_epi16(newaccum_m, 5);
2823   ACCUM_M(6) = _mm_extract_epi16(newaccum_m, 6);
2824   ACCUM_M(7) = _mm_extract_epi16(newaccum_m, 7);
2825#else
2826   INT16 vres[8];
2827   for (int i = 0; i < 8; i++)
2828   {
2829#if USE_SIMD
2830      UINT16 w1, w2;
2831      SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i);
2832      SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i));
2833      INT32 s1 = (INT32)(INT16)w1;
2834      INT32 s2 = (INT32)(INT16)w2;
2835#else
2836      INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
2837      INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
2838#endif
2839      //INT32 s1 = (INT32)(INT16)vs1[i];
2840      //INT32 s2 = (INT32)(INT16)vs2[VEC_EL_2(EL, i)];
2841
23502842      rsp->accum[i].l[1] += s1*s2;
23512843
2352      res = SATURATE_ACCUM1(rsp, i, 0x8000, 0x7fff);
2844      vres[i] = SATURATE_ACCUM1(rsp, i, 0x8000, 0x7fff);
23532845
2354      vres[i] = res;
2846      /*INT32 accum = (INT32)((caccumh[i] << 16) | caccumm[i]);
2847      accum += (INT32)s1*s2;
2848      caccumh[i] = (accum >> 16) & 0x0000ffff;
2849      caccumm[i] = accum & 0x0000ffff;
2850
2851      vres[i] = C_SATURATE_ACCUM1(caccumh, caccumm, i, 0x8000, 0x7fff);*/
23552852   }
2853/*   printf("%08x\n", rsp->pc);
2854   for (int i = 0; i < 8; i++)
2855   {
2856      if ((UINT16)vres[i] != vresult[i])
2857      {
2858         printf("Result mismatch:\n");
2859         printf("   C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vres[0], vres[1], vres[2], vres[3], vres[4], vres[5], vres[6], vres[7]);
2860         printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", vresult[0], vresult[1], vresult[2], vresult[3], vresult[4], vresult[5], vresult[6], vresult[7]);
2861         printf("High accumulator:\n");
2862         printf("   C: %04x %04x %04x %04x %04x %04x %04x %04x\n", caccumh[0], caccumh[1], caccumh[2], caccumh[3], caccumh[4], caccumh[5], caccumh[6], caccumh[7]);
2863         printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", ACCUM_H(0), ACCUM_H(1), ACCUM_H(2), ACCUM_H(3), ACCUM_H(4), ACCUM_H(5), ACCUM_H(6), ACCUM_H(7));
2864         printf("Mid accumulator:\n");
2865         printf("   C: %04x %04x %04x %04x %04x %04x %04x %04x\n", caccumm[0], caccumm[1], caccumm[2], caccumm[3], caccumm[4], caccumm[5], caccumm[6], caccumm[7]);
2866         printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", ACCUM_M(0), ACCUM_M(1), ACCUM_M(2), ACCUM_M(3), ACCUM_M(4), ACCUM_M(5), ACCUM_M(6), ACCUM_M(7));
2867         printf("VS1:\n");
2868         printf("   C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vs1[0], vs1[1], vs1[2], vs1[3], vs1[4], vs1[5], vs1[6], vs1[7]);
2869         printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", svs1[0], svs1[1], svs1[2], svs1[3], svs1[4], svs1[5], svs1[6], svs1[7]);
2870         printf("VS2:\n");
2871         printf("   C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vs2[0], vs2[1], vs2[2], vs2[3], vs2[4], vs2[5], vs2[6], vs2[7]);
2872         printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", svs2[0], svs2[1], svs2[2], svs2[3], svs2[4], svs2[5], svs2[6], svs2[7]);
2873         fatalerror("asdf");
2874      }
2875      if (caccumh[i] != (UINT16)ACCUM_H(i))
2876      {
2877         printf("Result:\n");
2878         printf("   C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vres[0], vres[1], vres[2], vres[3], vres[4], vres[5], vres[6], vres[7]);
2879         printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", vresult[0], vresult[1], vresult[2], vresult[3], vresult[4], vresult[5], vresult[6], vresult[7]);
2880         printf("High accumulator mismatch:\n");
2881         printf("   C: %04x %04x %04x %04x %04x %04x %04x %04x\n", caccumh[0], caccumh[1], caccumh[2], caccumh[3], caccumh[4], caccumh[5], caccumh[6], caccumh[7]);
2882         printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", ACCUM_H(0), ACCUM_H(1), ACCUM_H(2), ACCUM_H(3), ACCUM_H(4), ACCUM_H(5), ACCUM_H(6), ACCUM_H(7));
2883         printf("Mid accumulator:\n");
2884         printf("   C: %04x %04x %04x %04x %04x %04x %04x %04x\n", caccumm[0], caccumm[1], caccumm[2], caccumm[3], caccumm[4], caccumm[5], caccumm[6], caccumm[7]);
2885         printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", ACCUM_M(0), ACCUM_M(1), ACCUM_M(2), ACCUM_M(3), ACCUM_M(4), ACCUM_M(5), ACCUM_M(6), ACCUM_M(7));
2886         printf("VS1:\n");
2887         printf("   C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vs1[0], vs1[1], vs1[2], vs1[3], vs1[4], vs1[5], vs1[6], vs1[7]);
2888         printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", svs1[0], svs1[1], svs1[2], svs1[3], svs1[4], svs1[5], svs1[6], svs1[7]);
2889         printf("VS2:\n");
2890         printf("   C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vs2[0], vs2[1], vs2[2], vs2[3], vs2[4], vs2[5], vs2[6], vs2[7]);
2891         printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", svs2[0], svs2[1], svs2[2], svs2[3], svs2[4], svs2[5], svs2[6], svs2[7]);
2892         fatalerror("asdf");
2893      }
2894      if (caccumm[i] != (UINT16)ACCUM_M(i))
2895      {
2896         printf("Result:\n");
2897         printf("   C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vres[0], vres[1], vres[2], vres[3], vres[4], vres[5], vres[6], vres[7]);
2898         printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", vresult[0], vresult[1], vresult[2], vresult[3], vresult[4], vresult[5], vresult[6], vresult[7]);
2899         printf("High accumulator:\n");
2900         printf("   C: %04x %04x %04x %04x %04x %04x %04x %04x\n", caccumh[0], caccumh[1], caccumh[2], caccumh[3], caccumh[4], caccumh[5], caccumh[6], caccumh[7]);
2901         printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", ACCUM_H(0), ACCUM_H(1), ACCUM_H(2), ACCUM_H(3), ACCUM_H(4), ACCUM_H(5), ACCUM_H(6), ACCUM_H(7));
2902         printf("Mid accumulator mismatch:\n");
2903         printf("   C: %04x %04x %04x %04x %04x %04x %04x %04x\n", caccumm[0], caccumm[1], caccumm[2], caccumm[3], caccumm[4], caccumm[5], caccumm[6], caccumm[7]);
2904         printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", ACCUM_M(0), ACCUM_M(1), ACCUM_M(2), ACCUM_M(3), ACCUM_M(4), ACCUM_M(5), ACCUM_M(6), ACCUM_M(7));
2905         printf("VS1:\n");
2906         printf("   C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vs1[0], vs1[1], vs1[2], vs1[3], vs1[4], vs1[5], vs1[6], vs1[7]);
2907         printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", svs1[0], svs1[1], svs1[2], svs1[3], svs1[4], svs1[5], svs1[6], svs1[7]);
2908         printf("VS2:\n");
2909         printf("   C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vs2[0], vs2[1], vs2[2], vs2[3], vs2[4], vs2[5], vs2[6], vs2[7]);
2910         printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", svs2[0], svs2[1], svs2[2], svs2[3], svs2[4], svs2[5], svs2[6], svs2[7]);
2911         fatalerror("asdf");
2912      }
2913   }*/
23562914   WRITEBACK_RESULT();
2915#endif
23572916}
23582917
23592918INLINE void cfunc_rsp_vadd(void *param)
23602919{
23612920   rsp_state *rsp = (rsp_state*)param;
23622921   int op = rsp->impstate->arg0;
2363   INT16 vres[8] = { 0 };
2364   //int i;
23652922   // 31       25  24     20      15      10      5        0
23662923   // ------------------------------------------------------
23672924   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010000 |
r24005r24006
23692926   //
23712928   // Adds two vector registers and the carry flag; the result is saturated to the signed 16-bit range [-32768, 32767]
23712928
2372   int sel;
2373   INT32 s1, s2, r;
2929#if USE_SIMD
2930   __m128i shuffled = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
2931   __m128i unsat = rsp->xv[VS1REG];
2932   __m128i carry = _mm_set_epi16(CARRY_FLAG(7), CARRY_FLAG(6), CARRY_FLAG(5), CARRY_FLAG(4),
2933                          CARRY_FLAG(3), CARRY_FLAG(2), CARRY_FLAG(1), CARRY_FLAG(0));
2934
2935   unsat = _mm_add_epi16(unsat, shuffled);
2936   unsat = _mm_add_epi16(unsat, carry);
2937
2938   __m128i maxval = _mm_set_epi64x(0x7fff7fff7fff7fffL, 0x7fff7fff7fff7fffL);
2939   __m128i minval = _mm_set_epi64x(0x8000800080008000L, 0x8000800080008000L);
2940
2941   __m128i addvec = _mm_adds_epi16(rsp->xv[VS1REG], shuffled);
2942
2943   __m128i carrymask = _mm_cmpeq_epi16(addvec, maxval);
2944   carrymask = _mm_xor_si128(carrymask, vec_neg1);
2945   carry = _mm_and_si128(carry, carrymask);
2946
2947   carrymask = _mm_cmpeq_epi16(addvec, minval);
2948   carrymask = _mm_xor_si128(carrymask, vec_neg1);
2949   carry = _mm_and_si128(carry, carrymask);
2950
2951   rsp->xv[VDREG] = _mm_add_epi16(addvec, carry);
2952
2953   rsp->accum_l = unsat;
2954   ACCUM_L(0) = _mm_extract_epi16(unsat, 0);
2955   ACCUM_L(1) = _mm_extract_epi16(unsat, 1);
2956   ACCUM_L(2) = _mm_extract_epi16(unsat, 2);
2957   ACCUM_L(3) = _mm_extract_epi16(unsat, 3);
2958   ACCUM_L(4) = _mm_extract_epi16(unsat, 4);
2959   ACCUM_L(5) = _mm_extract_epi16(unsat, 5);
2960   ACCUM_L(6) = _mm_extract_epi16(unsat, 6);
2961   ACCUM_L(7) = _mm_extract_epi16(unsat, 7);
2962
2963   CLEAR_ZERO_FLAGS();
2964   CLEAR_CARRY_FLAGS();
2965#else
2966   INT16 vres[8] = { 0 };
23742967   for (int i = 0; i < 8; i++)
23752968   {
2376      sel = VEC_EL_2(EL, i);
2377      s1 = (INT32)(INT16)VREG_S(VS1REG, i);
2378      s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
2379      r = s1 + s2 + CARRY_FLAG(i);
2969#if USE_SIMD
2970      UINT16 w1, w2;
2971      SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i);
2972      SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i));
2973      INT32 s1 = (INT32)(INT16)w1;
2974      INT32 s2 = (INT32)(INT16)w2;
2975#else
2976      INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
2977      INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
2978#endif
2979      INT32 r = s1 + s2 + CARRY_FLAG(i);
23802980
2981#if USE_SIMD
2982      SIMD_INSERT16(rsp->accum_l, (INT16)(r), i);
2983#else
23812984      ACCUM_L(i) = (INT16)(r);
2985#endif
23822986
23832987      if (r > 32767) r = 32767;
23842988      if (r < -32768) r = -32768;
r24005r24006
23872991   CLEAR_ZERO_FLAGS();
23882992   CLEAR_CARRY_FLAGS();
23892993   WRITEBACK_RESULT();
2994#endif
23902995}
23912996
23922997INLINE void cfunc_rsp_vsub(void *param)
23932998{
23942999   rsp_state *rsp = (rsp_state*)param;
23953000   int op = rsp->impstate->arg0;
2396   INT16 vres[8];
2397   int i;
23983001   // 31       25  24     20      15      10      5        0
23993002   // ------------------------------------------------------
24003003   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010001 |
r24005r24006
24043007
24053008   // TODO: check VS2REG == VDREG
24063009
2407   int sel;
2408   INT32 s1, s2, r;
2409   for (i = 0; i < 8; i++)
3010#if USE_SIMD
3011   __m128i shuffled = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
3012   __m128i unsat = rsp->xv[VS1REG];
3013   __m128i carry = _mm_set_epi16(CARRY_FLAG(7), CARRY_FLAG(6), CARRY_FLAG(5), CARRY_FLAG(4),
3014                          CARRY_FLAG(3), CARRY_FLAG(2), CARRY_FLAG(1), CARRY_FLAG(0));
3015
3016   unsat = _mm_sub_epi16(unsat, shuffled);
3017   unsat = _mm_sub_epi16(unsat, carry);
3018
3019   __m128i minval = _mm_set_epi64x(0x8000800080008000L, 0x8000800080008000L);
3020
3021   __m128i subvec = _mm_subs_epi16(rsp->xv[VS1REG], shuffled);
3022
3023   __m128i carrymask = _mm_cmpeq_epi16(subvec, minval);
3024   carrymask = _mm_xor_si128(carrymask, vec_neg1);
3025   carry = _mm_and_si128(carry, carrymask);
3026
3027   rsp->xv[VDREG] = _mm_sub_epi16(subvec, carry);
3028
3029   rsp->accum_l = unsat;
3030   ACCUM_L(0) = _mm_extract_epi16(unsat, 0);
3031   ACCUM_L(1) = _mm_extract_epi16(unsat, 1);
3032   ACCUM_L(2) = _mm_extract_epi16(unsat, 2);
3033   ACCUM_L(3) = _mm_extract_epi16(unsat, 3);
3034   ACCUM_L(4) = _mm_extract_epi16(unsat, 4);
3035   ACCUM_L(5) = _mm_extract_epi16(unsat, 5);
3036   ACCUM_L(6) = _mm_extract_epi16(unsat, 6);
3037   ACCUM_L(7) = _mm_extract_epi16(unsat, 7);
3038
3039   CLEAR_ZERO_FLAGS();
3040   CLEAR_CARRY_FLAGS();
3041#else
3042   INT16 vres[8];
3043   for (int i = 0; i < 8; i++)
24103044   {
2411      sel = VEC_EL_2(EL, i);
2412      s1 = (INT32)(INT16)VREG_S(VS1REG, i);
2413      s2 = (INT32)(INT16)VREG_S(VS2REG, sel);
2414      r = s1 - s2 - CARRY_FLAG(i);
3045#if USE_SIMD
3046      UINT16 w1, w2;
3047      SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i);
3048      SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i));
3049      INT32 s1 = (INT32)(INT16)w1;
3050      INT32 s2 = (INT32)(INT16)w2;
3051#else
3052      INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
3053      INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
3054#endif
3055      INT32 r = s1 - s2 - CARRY_FLAG(i);
24153056
3057#if USE_SIMD
3058      SIMD_INSERT16(rsp->accum_l, (INT16)(r), i);
3059#else
24163060      ACCUM_L(i) = (INT16)(r);
3061#endif
24173062
24183063      if (r > 32767) r = 32767;
24193064      if (r < -32768) r = -32768;
r24005r24006
24233068   CLEAR_ZERO_FLAGS();
24243069   CLEAR_CARRY_FLAGS();
24253070   WRITEBACK_RESULT();
3071#endif
24263072}
24273073
24283074INLINE void cfunc_rsp_vabs(void *param)
r24005r24006
24303076   rsp_state *rsp = (rsp_state*)param;
24313077   int op = rsp->impstate->arg0;
24323078   INT16 vres[8];
2433   int i;
24343079   // 31       25  24     20      15      10      5        0
24353080   // ------------------------------------------------------
24363081   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010011 |
r24005r24006
24393084   // Changes the sign of source register 2 if source register 1 is negative, and stores
24403085   // the result in the destination register
24413086
2442   int sel;
2443   INT16 s1, s2;
2444   for (i=0; i < 8; i++)
3087   for (int i = 0; i < 8; i++)
24453088   {
2446      sel = VEC_EL_2(EL, i);
2447      s1 = (INT16)VREG_S(VS1REG, i);
2448      s2 = (INT16)VREG_S(VS2REG, sel);
3089#if USE_SIMD
3090      INT16 s1, s2;
3091      SIMD_EXTRACT16(rsp->xv[VS1REG], s1, i);
3092      SIMD_EXTRACT16(rsp->xv[VS2REG], s2, VEC_EL_2(EL, i));
3093#else
3094      INT16 s1 = (INT16)VREG_S(VS1REG, i);
3095      INT16 s2 = (INT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
3096#endif
24493097
24503098      if (s1 < 0)
24513099      {
r24005r24006
24673115         vres[i] = 0;
24683116      }
24693117
3118#if USE_SIMD
3119      SIMD_INSERT16(rsp->accum_l, vres[i], i);
3120#else
24703121      ACCUM_L(i) = vres[i];
3122#endif
24713123   }
24723124   WRITEBACK_RESULT();
24733125}
r24005r24006
24763128{
24773129   rsp_state *rsp = (rsp_state*)param;
24783130   int op = rsp->impstate->arg0;
2479   INT16 vres[8];
2480   int i;
24813131   // 31       25  24     20      15      10      5        0
24823132   // ------------------------------------------------------
24833133   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010100 |
r24005r24006
24873137
24883138   // TODO: check VS2REG = VDREG
24893139
2490   int sel;
2491   INT32 s1, s2, r;
24923140   CLEAR_ZERO_FLAGS();
24933141   CLEAR_CARRY_FLAGS();
24943142
2495   for (i=0; i < 8; i++)
3143#if USE_SIMD
3144   __m128i shuf2 = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
3145                                              __m128i vec7531 = _mm_and_si128(rsp->xv[VS1REG], vec_lomask);
3146   __m128i vec6420 = _mm_srli_epi32(rsp->xv[VS1REG], 16);
3147   __m128i shuf7531 = _mm_and_si128(shuf2, vec_lomask);
3148   __m128i shuf6420 = _mm_srli_epi32(shuf2, 16);
3149   __m128i sum7531 = _mm_add_epi32(vec7531, shuf7531);
3150   __m128i sum6420 = _mm_add_epi32(vec6420, shuf6420);
3151
3152   __m128i over7531 = _mm_and_si128(_mm_xor_si128(_mm_cmpeq_epi16(sum7531, vec_zero), vec_neg1), vec_overmask);
3153   __m128i over6420 = _mm_and_si128(_mm_xor_si128(_mm_cmpeq_epi16(sum6420, vec_zero), vec_neg1), vec_overmask);
3154
3155   rsp->flag[0] |= _mm_extract_epi16(over7531, 7) << 6;
3156   rsp->flag[0] |= _mm_extract_epi16(over7531, 5) << 4;
3157   rsp->flag[0] |= _mm_extract_epi16(over7531, 3) << 2;
3158   rsp->flag[0] |= _mm_extract_epi16(over7531, 1) << 0;
3159   rsp->flag[0] |= _mm_extract_epi16(over6420, 7) << 7;
3160   rsp->flag[0] |= _mm_extract_epi16(over6420, 5) << 5;
3161   rsp->flag[0] |= _mm_extract_epi16(over6420, 3) << 3;
3162   rsp->flag[0] |= _mm_extract_epi16(over6420, 1) << 1;
3163   rsp->xv[VDREG] = _mm_or_si128(_mm_slli_epi32(sum6420, 16), sum7531);
3164   rsp->accum_l = rsp->xv[VDREG];
3165
3166#else
3167   INT16 vres[8] = { 0 };
3168   for (int i = 0; i < 8; i++)
24963169   {
2497      sel = VEC_EL_2(EL, i);
2498      s1 = (UINT32)(UINT16)VREG_S(VS1REG, i);
2499      s2 = (UINT32)(UINT16)VREG_S(VS2REG, sel);
2500      r = s1 + s2;
3170      INT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, i);
3171      INT32 s2 = (UINT32)(UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
3172      INT32 r = s1 + s2;
25013173
2502      vres[i] = (INT16)(r);
2503      ACCUM_L(i) = (INT16)(r);
3174      vres[i] = (INT16)r;
3175      ACCUM_L(i) = (INT16)r;
25043176
25053177      if (r & 0xffff0000)
25063178      {
r24005r24006
25083180      }
25093181   }
25103182   WRITEBACK_RESULT();
3183#endif
25113184}
25123185
25133186INLINE void cfunc_rsp_vsubc(void *param)
25143187{
25153188   rsp_state *rsp = (rsp_state*)param;
25163189   int op = rsp->impstate->arg0;
2517   INT16 vres[8];
2518   int i;
25193190   // 31       25  24     20      15      10      5        0
25203191   // ------------------------------------------------------
25213192   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010101 |
r24005r24006
25253196
25263197   // TODO: check VS2REG = VDREG
25273198
2528   int sel;
2529   INT32 s1, s2, r;
25303199   CLEAR_ZERO_FLAGS();
25313200   CLEAR_CARRY_FLAGS();
25323201
2533   for (i=0; i < 8; i++)
3202#if USE_SIMD
3203   __m128i shuf2 = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
3204   __m128i vec7531 = _mm_and_si128(rsp->xv[VS1REG], vec_lomask);
3205   __m128i vec6420 = _mm_srli_epi32(rsp->xv[VS1REG], 16);
3206   __m128i shuf7531 = _mm_and_si128(shuf2, vec_lomask);
3207   __m128i shuf6420 = _mm_srli_epi32(shuf2, 16);
3208   __m128i sum7531 = _mm_sub_epi32(vec7531, shuf7531);
3209   __m128i sum6420 = _mm_sub_epi32(vec6420, shuf6420);
3210
3211   __m128i over7531 = _mm_and_si128(_mm_xor_si128(_mm_cmpeq_epi16(sum7531, vec_zero), vec_neg1), vec_overmask);
3212   __m128i over6420 = _mm_and_si128(_mm_xor_si128(_mm_cmpeq_epi16(sum6420, vec_zero), vec_neg1), vec_overmask);
3213   sum7531 = _mm_and_si128(sum7531, vec_lomask);
3214   sum6420 = _mm_and_si128(sum6420, vec_lomask);
3215   __m128i zero7531 = _mm_and_si128(_mm_xor_si128(_mm_cmpeq_epi16(sum7531, vec_zero), vec_neg1), vec_zerobits);
3216   __m128i zero6420 = _mm_and_si128(_mm_xor_si128(_mm_cmpeq_epi16(sum6420, vec_zero), vec_neg1), vec_zerobits);
3217
3218   rsp->flag[0] |= _mm_extract_epi16(over7531, 7) << 6;
3219   rsp->flag[0] |= _mm_extract_epi16(over7531, 5) << 4;
3220   rsp->flag[0] |= _mm_extract_epi16(over7531, 3) << 2;
3221   rsp->flag[0] |= _mm_extract_epi16(over7531, 1) << 0;
3222   rsp->flag[0] |= _mm_extract_epi16(over6420, 7) << 7;
3223   rsp->flag[0] |= _mm_extract_epi16(over6420, 5) << 5;
3224   rsp->flag[0] |= _mm_extract_epi16(over6420, 3) << 3;
3225   rsp->flag[0] |= _mm_extract_epi16(over6420, 1) << 1;
3226
3227   rsp->flag[0] |= _mm_extract_epi16(zero7531, 6) << 14;
3228   rsp->flag[0] |= _mm_extract_epi16(zero7531, 4) << 12;
3229   rsp->flag[0] |= _mm_extract_epi16(zero7531, 2) << 10;
3230   rsp->flag[0] |= _mm_extract_epi16(zero7531, 0) << 8;
3231   rsp->flag[0] |= _mm_extract_epi16(zero6420, 6) << 15;
3232   rsp->flag[0] |= _mm_extract_epi16(zero6420, 4) << 13;
3233   rsp->flag[0] |= _mm_extract_epi16(zero6420, 2) << 11;
3234   rsp->flag[0] |= _mm_extract_epi16(zero6420, 0) << 9;
3235
3236   rsp->xv[VDREG] = _mm_or_si128(_mm_slli_epi32(sum6420, 16), sum7531);
3237   rsp->accum_l = rsp->xv[VDREG];
3238
3239#else
3240   INT16 vres[8];
3241   for (int i = 0; i < 8; i++)
25343242   {
2535      sel = VEC_EL_2(EL, i);
2536      s1 = (UINT32)(UINT16)VREG_S(VS1REG, i);
2537      s2 = (UINT32)(UINT16)VREG_S(VS2REG, sel);
2538      r = s1 - s2;
3243      INT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, i);
3244      INT32 s2 = (UINT32)(UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
3245      INT32 r = s1 - s2;
25393246
25403247      vres[i] = (INT16)(r);
25413248      ACCUM_L(i) = (UINT16)(r);
r24005r24006
25503257      }
25513258   }
25523259   WRITEBACK_RESULT();
3260#endif
25533261}
25543262
25553263INLINE void cfunc_rsp_vsaw(void *param)
r24005r24006
25693277      {
25703278         for (int i = 0; i < 8; i++)
25713279         {
3280#if USE_SIMD
3281            rsp->xv[VDREG] = _mm_insert_epi16(rsp->xv[VDREG], ACCUM_H(i), i);
3282#else
25723283            W_VREG_S(VDREG, i) = ACCUM_H(i);
3284#endif
25733285         }
25743286         break;
25753287      }
r24005r24006
25773289      {
25783290         for (int i = 0; i < 8; i++)
25793291         {
3292#if USE_SIMD
3293            rsp->xv[VDREG] = _mm_insert_epi16(rsp->xv[VDREG], ACCUM_M(i), i);
3294#else
25803295            W_VREG_S(VDREG, i) = ACCUM_M(i);
3296#endif
25813297         }
25823298         break;
25833299      }
25843300      case 0x0a:      // VSAWL
25853301      {
3302#if USE_SIMD
3303         rsp->xv[VDREG] = rsp->accum_l;
3304#else
25863305         for (int i = 0; i < 8; i++)
25873306         {
25883307            W_VREG_S(VDREG, i) = ACCUM_L(i);
25893308         }
3309#endif
25903310         break;
25913311      }
25923312      default:    fatalerror("RSP: VSAW: el = %d\n", EL);
r24005r24006
26073327   // Sets compare flags if elements in VS1 are less than VS2
26083328   // Moves the element in VS2 to destination vector
26093329
2610   int sel;
26113330   rsp->flag[1] = 0;
26123331
26133332   for (int i = 0; i < 8; i++)
26143333   {
2615      sel = VEC_EL_2(EL, i);
2616
2617      if (VREG_S(VS1REG, i) < VREG_S(VS2REG, sel))
3334#if USE_SIMD
3335      INT16 s1, s2;
3336      SIMD_EXTRACT16(rsp->xv[VS1REG], s1, i);
3337      SIMD_EXTRACT16(rsp->xv[VS2REG], s2, VEC_EL_2(EL, i));
3338#else
3339      INT16 s1 = (INT16)VREG_S(VS1REG, i);
3340      INT16 s2 = (INT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
3341#endif
3342      if (s1 < s2)
26183343      {
26193344         SET_COMPARE_FLAG(i);
26203345      }
2621      else if (VREG_S(VS1REG, i) == VREG_S(VS2REG, sel))
3346      else if (s1 == s2)
26223347      {
26233348         if (ZERO_FLAG(i) == 1 && CARRY_FLAG(i) != 0)
26243349         {
r24005r24006
26283353
26293354      if (COMPARE_FLAG(i))
26303355      {
2631         vres[i] = VREG_S(VS1REG, i);
3356         vres[i] = s1;
26323357      }
26333358      else
26343359      {
2635         vres[i] = VREG_S(VS2REG, sel);
3360         vres[i] = s2;
26363361      }
26373362
3363#if USE_SIMD
3364      SIMD_INSERT16(rsp->accum_l, vres[i], i);
3365#else
26383366      ACCUM_L(i) = vres[i];
3367#endif
26393368   }
26403369
26413370   rsp->flag[0] = 0;
r24005r24006
26473376   rsp_state *rsp = (rsp_state*)param;
26483377   int op = rsp->impstate->arg0;
26493378   INT16 vres[8];
2650   int i;
3379
26513380   // 31       25  24     20      15      10      5        0
26523381   // ------------------------------------------------------
26533382   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100001 |
r24005r24006
26563385   // Sets compare flags if elements in VS1 are equal with VS2
26573386   // Moves the element in VS2 to destination vector
26583387
2659   int sel;
26603388   rsp->flag[1] = 0;
26613389
2662   for (i = 0; i < 8; i++)
3390   for (int i = 0; i < 8; i++)
26633391   {
2664      sel = VEC_EL_2(EL, i);
2665
2666      if ((VREG_S(VS1REG, i) == VREG_S(VS2REG, sel)) && ZERO_FLAG(i) == 0)
3392#if USE_SIMD
3393      INT16 s1, s2;
3394      SIMD_EXTRACT16(rsp->xv[VS1REG], s1, i);
3395      SIMD_EXTRACT16(rsp->xv[VS2REG], s2, VEC_EL_2(EL, i));
3396#else
3397      INT16 s1 = (INT16)VREG_S(VS1REG, i);
3398      INT16 s2 = (INT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
3399#endif
3400      if ((s1 == s2) && ZERO_FLAG(i) == 0)
26673401      {
26683402         SET_COMPARE_FLAG(i);
2669         vres[i] = VREG_S(VS1REG, i);
3403         vres[i] = s1;
26703404      }
26713405      else
26723406      {
2673         vres[i] = VREG_S(VS2REG, sel);
3407         vres[i] = s2;
26743408      }
3409#if USE_SIMD
3410      SIMD_INSERT16(rsp->accum_l, vres[i], i);
3411#else
26753412      ACCUM_L(i) = vres[i];
3413#endif
26763414   }
26773415
26783416   rsp->flag[0] = 0;
r24005r24006
26843422   rsp_state *rsp = (rsp_state*)param;
26853423   int op = rsp->impstate->arg0;
26863424   INT16 vres[8];
2687   int i;
3425
26883426   // 31       25  24     20      15      10      5        0
26893427   // ------------------------------------------------------
26903428   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100010 |
r24005r24006
26933431   // Sets compare flags if elements in VS1 are not equal with VS2
26943432   // Moves the element in VS2 to destination vector
26953433
2696   int sel;
26973434   rsp->flag[1] = 0;
26983435
2699   for (i=0; i < 8; i++)//?????????? ????
3436   for (int i = 0; i < 8; i++)
27003437   {
2701      sel = VEC_EL_2(EL, i);
2702
2703      if (VREG_S(VS1REG, i) != VREG_S(VS2REG, sel))
3438#if USE_SIMD
3439      INT16 s1, s2;
3440      SIMD_EXTRACT16(rsp->xv[VS1REG], s1, i);
3441      SIMD_EXTRACT16(rsp->xv[VS2REG], s2, VEC_EL_2(EL, i));
3442#else
3443      INT16 s1 = (INT16)VREG_S(VS1REG, i);
3444      INT16 s2 = (INT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
3445#endif
3446      if (s1 != s2)
27043447      {
27053448         SET_COMPARE_FLAG(i);
27063449      }
r24005r24006
27133456      }
27143457      if (COMPARE_FLAG(i))
27153458      {
2716         vres[i] = VREG_S(VS1REG, i);
3459         vres[i] = s1;
27173460      }
27183461      else
27193462      {
2720         vres[i] = VREG_S(VS2REG, sel);
3463         vres[i] = s2;
27213464      }
3465#if USE_SIMD
3466      SIMD_INSERT16(rsp->accum_l, vres[i], i);
3467#else
27223468      ACCUM_L(i) = vres[i];
3469#endif
27233470   }
27243471
27253472   rsp->flag[0] = 0;
r24005r24006
27403487   // Sets compare flags if elements in VS1 are greater or equal with VS2
27413488   // Moves the element in VS2 to destination vector
27423489
2743   int sel;
27443490   rsp->flag[1] = 0;
27453491
27463492   for (int i = 0; i < 8; i++)
27473493   {
2748      sel = VEC_EL_2(EL, i);
2749
2750      if (VREG_S(VS1REG, i) == VREG_S(VS2REG, sel))
3494#if USE_SIMD
3495      INT16 s1, s2;
3496      SIMD_EXTRACT16(rsp->xv[VS1REG], s1, i);
3497      SIMD_EXTRACT16(rsp->xv[VS2REG], s2, VEC_EL_2(EL, i));
3498#else
3499      INT16 s1 = (INT16)VREG_S(VS1REG, i);
3500      INT16 s2 = (INT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
3501#endif
3502      if (s1 == s2)
27513503      {
27523504         if (ZERO_FLAG(i) == 0 || CARRY_FLAG(i) == 0)
27533505         {
27543506            SET_COMPARE_FLAG(i);
27553507         }
27563508      }
2757      else if (VREG_S(VS1REG, i) > VREG_S(VS2REG, sel))
3509      else if (s1 > s2)
27583510      {
27593511         SET_COMPARE_FLAG(i);
27603512      }
27613513
27623514      if (COMPARE_FLAG(i) != 0)
27633515      {
2764         vres[i] = VREG_S(VS1REG, i);
3516         vres[i] = s1;
27653517      }
27663518      else
27673519      {
2768         vres[i] = VREG_S(VS2REG, sel);
3520         vres[i] = s2;
27693521      }
27703522
3523#if USE_SIMD
3524      SIMD_INSERT16(rsp->accum_l, vres[i], i);
3525#else
27713526      ACCUM_L(i) = vres[i];
3527#endif
27723528   }
27733529
27743530   rsp->flag[0] = 0;
r24005r24006
27803536   rsp_state *rsp = (rsp_state*)param;
27813537   int op = rsp->impstate->arg0;
27823538   INT16 vres[8];
2783   int i;
3539
27843540   // 31       25  24     20      15      10      5        0
27853541   // ------------------------------------------------------
27863542   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100100 |
r24005r24006
27883544   //
27893545   // Vector clip low
27903546
2791   int sel;
2792   INT16 s1, s2;
2793   for (i = 0; i < 8; i++)
3547   for (int i = 0; i < 8; i++)
27943548   {
2795      sel = VEC_EL_2(EL, i);
2796      s1 = VREG_S(VS1REG, i);
2797      s2 = VREG_S(VS2REG, sel);
3549#if USE_SIMD
3550      INT16 s1, s2;
3551      SIMD_EXTRACT16(rsp->xv[VS1REG], s1, i);
3552      SIMD_EXTRACT16(rsp->xv[VS2REG], s2, VEC_EL_2(EL, i));
3553#else
3554      INT16 s1 = VREG_S(VS1REG, i);
3555      INT16 s2 = VREG_S(VS2REG, VEC_EL_2(EL, i));
3556#endif
27983557
27993558      if (CARRY_FLAG(i) != 0)
28003559      {
r24005r24006
28023561         {
28033562            if (COMPARE_FLAG(i) != 0)
28043563            {
3564#if USE_SIMD
3565               SIMD_INSERT16(rsp->accum_l, -(UINT16)s2, i);
3566#else
28053567               ACCUM_L(i) = -(UINT16)s2;
3568#endif
28063569            }
28073570            else
28083571            {
3572#if USE_SIMD
3573               SIMD_INSERT16(rsp->accum_l, s1, i);
3574#else
28093575               ACCUM_L(i) = s1;
3576#endif
28103577            }
28113578         }
28123579         else//ZERO_FLAG(i)==0
r24005r24006
28153582            {
28163583               if (((UINT32)(UINT16)(s1) + (UINT32)(UINT16)(s2)) > 0x10000)
28173584               {//proper fix for Harvest Moon 64, r4
2818
3585#if USE_SIMD
3586                  SIMD_INSERT16(rsp->accum_l, s1, i);
3587#else
28193588                  ACCUM_L(i) = s1;
3589#endif
28203590                  CLEAR_COMPARE_FLAG(i);
28213591               }
28223592               else
28233593               {
3594#if USE_SIMD
3595                  SIMD_INSERT16(rsp->accum_l, -((UINT16)s2), i);
3596#else
28243597                  ACCUM_L(i) = -((UINT16)s2);
3598#endif
28253599                  SET_COMPARE_FLAG(i);
28263600               }
28273601            }
r24005r24006
28293603            {
28303604               if (((UINT32)(UINT16)(s1) + (UINT32)(UINT16)(s2)) != 0)
28313605               {
3606#if USE_SIMD
3607                  SIMD_INSERT16(rsp->accum_l, s1, i);
3608#else
28323609                  ACCUM_L(i) = s1;
3610#endif
28333611                  CLEAR_COMPARE_FLAG(i);
28343612               }
28353613               else
28363614               {
3615#if USE_SIMD
3616                  SIMD_INSERT16(rsp->accum_l, -((UINT16)s2), i);
3617#else
28373618                  ACCUM_L(i) = -((UINT16)s2);
3619#endif
28383620                  SET_COMPARE_FLAG(i);
28393621               }
28403622            }
28413623         }
2842      }//
3624      }
28433625      else//CARRY_FLAG(i)==0
28443626      {
28453627         if (ZERO_FLAG(i) != 0)
28463628         {
28473629            if (rsp->flag[1] & (1 << (8+i)))
28483630            {
3631#if USE_SIMD
3632               SIMD_INSERT16(rsp->accum_l, s2, i);
3633#else
28493634               ACCUM_L(i) = s2;
3635#endif
28503636            }
28513637            else
28523638            {
3639#if USE_SIMD
3640               SIMD_INSERT16(rsp->accum_l, s1, i);
3641#else
28533642               ACCUM_L(i) = s1;
3643#endif
28543644            }
28553645         }
28563646         else
28573647         {
28583648            if (((INT32)(UINT16)s1 - (INT32)(UINT16)s2) >= 0)
28593649            {
3650#if USE_SIMD
3651               SIMD_INSERT16(rsp->accum_l, s2, i);
3652#else
28603653               ACCUM_L(i) = s2;
3654#endif
28613655               rsp->flag[1] |= (1 << (8+i));
28623656            }
28633657            else
28643658            {
3659#if USE_SIMD
3660               SIMD_INSERT16(rsp->accum_l, s1, i);
3661#else
28653662               ACCUM_L(i) = s1;
3663#endif
28663664               rsp->flag[1] &= ~(1 << (8+i));
28673665            }
28683666         }
28693667      }
28703668
3669#if USE_SIMD
3670      SIMD_EXTRACT16(rsp->accum_l, vres[i], i);
3671#else
28713672      vres[i] = ACCUM_L(i);
3673#endif
28723674   }
28733675   rsp->flag[0] = 0;
28743676   rsp->flag[2] = 0;
r24005r24006
28793681{
28803682   rsp_state *rsp = (rsp_state*)param;
28813683   int op = rsp->impstate->arg0;
2882   INT16 vres[8];
2883   int i;
3684
28843685   // 31       25  24     20      15      10      5        0
28853686   // ------------------------------------------------------
28863687   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100101 |
r24005r24006
28883689   //
28893690   // Vector clip high
28903691
2891   int sel;
2892   INT16 s1, s2;
28933692   rsp->flag[0] = 0;
28943693   rsp->flag[1] = 0;
28953694   rsp->flag[2] = 0;
3695
3696#if USE_SIMD
3697   // Compare flag
3698   // flag[1] bit [0- 7] set if (s1 ^ s2) < 0 && (s1 + s2) <= 0)
3699   // flag[1] bit [0- 7] set if (s1 ^ s2) >= 0 && (s2 < 0)
3700
3701   // flag[1] bit [8-15] set if (s1 ^ s2) < 0 && (s2 < 0)
3702   // flag[1] bit [8-15] set if (s1 ^ s2) >= 0 && (s1 - s2) >= 0
3703
3704   // Carry flag
3705   // flag[0] bit [0- 7] set if (s1 ^ s2) < 0
3706
3707   // Zero flag
3708   // flag[0] bit [8-15] set if (s1 ^ s2) < 0  && (s1 + s2) != 0 && (s1 != ~s2)
3709   // flag[0] bit [8-15] set if (s1 ^ s2) >= 0 && (s1 - s2) != 0 && (s1 != ~s2)
3710
3711   // flag[2] bit [0- 7] set if (s1 ^ s2) < 0 && (s1 + s2) == -1
3712
3713   // accum set to -s2 if (s1 ^ s2) < 0 && (s1 + s2) <= 0)
3714   // accum set to -s2 if (s1 ^ s2) >= 0 && (s1 - s2) >= 0
3715
3716   // accum set to s1 if (s1 ^ s2) < 0 && (s1 + s2) > 0)
3717   // accum set to s1 if (s1 ^ s2) >= 0 && (s1 - s2) < 0
3718
3719   __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
3720   __m128i s1_xor_s2 = _mm_xor_si128(rsp->xv[VS1REG], shuf);
3721   __m128i s1_plus_s2 = _mm_add_epi16(rsp->xv[VS1REG], shuf);
3722   __m128i s1_sub_s2 = _mm_sub_epi16(rsp->xv[VS1REG], shuf);
3723   __m128i s2_neg = _mm_xor_si128(shuf, vec_neg1);
3724
3725   __m128i s2_lz = _mm_cmplt_epi16(shuf, vec_zero);
3726   __m128i s1s2_xor_lz = _mm_cmplt_epi16(s1_xor_s2, vec_zero);
3727   __m128i s1s2_xor_gez = _mm_xor_si128(s1s2_xor_lz, vec_neg1);
3728   __m128i s1s2_plus_nz = _mm_xor_si128(_mm_cmpeq_epi16(s1_plus_s2, vec_zero), vec_neg1);
3729   __m128i s1s2_plus_gz = _mm_cmpgt_epi16(s1_plus_s2, vec_zero);
3730   __m128i s1s2_plus_lez = _mm_xor_si128(s1s2_plus_gz, vec_neg1);
3731   __m128i s1s2_plus_n1 = _mm_cmpeq_epi16(s1_plus_s2, vec_neg1);
3732   __m128i s1s2_sub_nz = _mm_xor_si128(_mm_cmpeq_epi16(s1_sub_s2, vec_zero), vec_neg1);
3733   __m128i s1s2_sub_lz = _mm_cmplt_epi16(s1_sub_s2, vec_zero);
3734   __m128i s1s2_sub_gez = _mm_xor_si128(s1s2_sub_lz, vec_neg1);
3735   __m128i s1_nens2 = _mm_xor_si128(_mm_cmpeq_epi16(rsp->xv[VS1REG], s2_neg), vec_neg1);
3736
3737   __m128i ext_mask = _mm_and_si128(_mm_and_si128(s1s2_xor_lz, s1s2_plus_n1), vec_flagmask);
3738   rsp->flag[2] |= _mm_extract_epi16(ext_mask, 0) << 0;
3739   rsp->flag[2] |= _mm_extract_epi16(ext_mask, 1) << 1;
3740   rsp->flag[2] |= _mm_extract_epi16(ext_mask, 2) << 2;
3741   rsp->flag[2] |= _mm_extract_epi16(ext_mask, 3) << 3;
3742   rsp->flag[2] |= _mm_extract_epi16(ext_mask, 4) << 4;
3743   rsp->flag[2] |= _mm_extract_epi16(ext_mask, 5) << 5;
3744   rsp->flag[2] |= _mm_extract_epi16(ext_mask, 6) << 6;
3745   rsp->flag[2] |= _mm_extract_epi16(ext_mask, 7) << 7;
3746
3747   __m128i carry_mask = _mm_and_si128(s1s2_xor_lz, vec_flagmask);
3748   rsp->flag[0] |= _mm_extract_epi16(carry_mask, 0) << 0;
3749   rsp->flag[0] |= _mm_extract_epi16(carry_mask, 1) << 1;
3750   rsp->flag[0] |= _mm_extract_epi16(carry_mask, 2) << 2;
3751   rsp->flag[0] |= _mm_extract_epi16(carry_mask, 3) << 3;
3752   rsp->flag[0] |= _mm_extract_epi16(carry_mask, 4) << 4;
3753   rsp->flag[0] |= _mm_extract_epi16(carry_mask, 5) << 5;
3754   rsp->flag[0] |= _mm_extract_epi16(carry_mask, 6) << 6;
3755   rsp->flag[0] |= _mm_extract_epi16(carry_mask, 7) << 7;
3756
3757   __m128i z0_mask = _mm_and_si128(_mm_and_si128(s1s2_xor_gez, s1s2_sub_nz), s1_nens2);
3758   __m128i z1_mask = _mm_and_si128(_mm_and_si128(s1s2_xor_lz, s1s2_plus_nz), s1_nens2);
3759   __m128i z_mask = _mm_and_si128(_mm_or_si128(z0_mask, z1_mask), vec_flagmask);
3760   z_mask = _mm_and_si128(_mm_or_si128(z_mask, _mm_srli_epi32(z_mask, 15)), vec_shiftmask2);
3761   z_mask = _mm_and_si128(_mm_or_si128(z_mask, _mm_srli_epi64(z_mask, 30)), vec_shiftmask4);
3762   z_mask = _mm_or_si128(z_mask, _mm_srli_si128(z_mask, 7));
3763   z_mask = _mm_or_si128(z_mask, _mm_srli_epi16(z_mask, 4));
3764   rsp->flag[0] |= (_mm_extract_epi16(z_mask, 0) << 8) & 0x00ff00;
3765
3766   __m128i f0_mask = _mm_and_si128(_mm_or_si128(_mm_and_si128(s1s2_xor_gez, s2_lz),         _mm_and_si128(s1s2_xor_lz, s1s2_plus_lez)), vec_flagmask);
3767   __m128i f8_mask = _mm_and_si128(_mm_or_si128(_mm_and_si128(s1s2_xor_gez, s1s2_sub_gez),  _mm_and_si128(s1s2_xor_lz, s2_lz)), vec_flagmask);
3768   f0_mask = _mm_and_si128(f0_mask, vec_flagmask);
3769   f8_mask = _mm_and_si128(f8_mask, vec_flagmask);
3770   rsp->flag[1] |= _mm_extract_epi16(f0_mask, 0) << 0;
3771   rsp->flag[1] |= _mm_extract_epi16(f0_mask, 1) << 1;
3772   rsp->flag[1] |= _mm_extract_epi16(f0_mask, 2) << 2;
3773   rsp->flag[1] |= _mm_extract_epi16(f0_mask, 3) << 3;
3774   rsp->flag[1] |= _mm_extract_epi16(f0_mask, 4) << 4;
3775   rsp->flag[1] |= _mm_extract_epi16(f0_mask, 5) << 5;
3776   rsp->flag[1] |= _mm_extract_epi16(f0_mask, 6) << 6;
3777   rsp->flag[1] |= _mm_extract_epi16(f0_mask, 7) << 7;
3778
3779   rsp->flag[1] |= _mm_extract_epi16(f8_mask, 0) << 8;
3780   rsp->flag[1] |= _mm_extract_epi16(f8_mask, 1) << 9;
3781   rsp->flag[1] |= _mm_extract_epi16(f8_mask, 2) << 10;
3782   rsp->flag[1] |= _mm_extract_epi16(f8_mask, 3) << 11;
3783   rsp->flag[1] |= _mm_extract_epi16(f8_mask, 4) << 12;
3784   rsp->flag[1] |= _mm_extract_epi16(f8_mask, 5) << 13;
3785   rsp->flag[1] |= _mm_extract_epi16(f8_mask, 6) << 14;
3786   rsp->flag[1] |= _mm_extract_epi16(f8_mask, 7) << 15;
3787#else
3788
3789   INT16 vres[8];
28963790   UINT32 vce = 0;
2897
2898   for (i=0; i < 8; i++)
3791   for (int i = 0; i < 8; i++)
28993792   {
2900      sel = VEC_EL_2(EL, i);
2901      s1 = VREG_S(VS1REG, i);
2902      s2 = VREG_S(VS2REG, sel);
3793#if USE_SIMD
3794      INT16 s1, s2;
3795      SIMD_EXTRACT16(rsp->xv[VS1REG], s1, i);
3796      SIMD_EXTRACT16(rsp->xv[VS2REG], s2, VEC_EL_2(EL, i));
3797#else
3798      INT16 s1 = VREG_S(VS1REG, i);
3799      INT16 s2 = VREG_S(VS2REG, VEC_EL_2(EL, i));
3800#endif
29033801
29043802      if ((s1 ^ s2) < 0)
29053803      {
r24005r24006
29203818            vres[i] = s1;
29213819         }
29223820
2923         if (s1 + s2 != 0)
3821         if (s1 + s2 != 0 && s1 != ~s2)
29243822         {
2925            if (s1 != ~s2)
2926            {
2927               SET_ZERO_FLAG(i);
2928            }
3823            SET_ZERO_FLAG(i);
29293824         }
29303825      }//sign
29313826      else
r24005r24006
29453840            vres[i] = s1;
29463841         }
29473842
2948         if ((s1 - s2) != 0)
3843         if ((s1 - s2) != 0 && s1 != ~s2)
29493844         {
2950            if (s1 != ~s2)
2951            {
2952               SET_ZERO_FLAG(i);
2953            }
3845            SET_ZERO_FLAG(i);
29543846         }
29553847      }
29563848      rsp->flag[2] |= (vce << (i));
3849#if USE_SIMD
3850      SIMD_INSERT16(rsp->accum_l, vres[i], i);
3851#else
29573852      ACCUM_L(i) = vres[i];
3853#endif
29583854   }
29593855   WRITEBACK_RESULT();
3856#endif
29603857}
29613858
29623859INLINE void cfunc_rsp_vcr(void *param)
29633860{
29643861   rsp_state *rsp = (rsp_state*)param;
29653862   int op = rsp->impstate->arg0;
2966   INT16 vres[8];
2967   int i;
3863
29683864   // 31       25  24     20      15      10      5        0
29693865   // ------------------------------------------------------
29703866   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100110 |
r24005r24006
29723868   //
29733869   // Vector clip reverse
29743870
2975   int sel;
2976   INT16 s1, s2;
29773871   rsp->flag[0] = 0;
29783872   rsp->flag[1] = 0;
29793873   rsp->flag[2] = 0;
29803874
2981   for (i=0; i < 8; i++)
3875#if USE_SIMD
3876   // flag[1] bit [0- 7] set if (s1 ^ s2) < 0 && (s1 + s2) <= 0)
3877   // flag[1] bit [0- 7] set if (s1 ^ s2) >= 0 && (s2 < 0)
3878
3879   // flag[1] bit [8-15] set if (s1 ^ s2) < 0 && (s2 < 0)
3880   // flag[1] bit [8-15] set if (s1 ^ s2) >= 0 && (s1 - s2) >= 0
3881
3882   // accum set to ~s2 if (s1 ^ s2) < 0 && (s1 + s2) <= 0)
3883   // accum set to ~s2 if (s1 ^ s2) >= 0 && (s1 - s2) >= 0
3884
3885   // accum set to s1 if (s1 ^ s2) < 0 && (s1 + s2) > 0)
3886   // accum set to s1 if (s1 ^ s2) >= 0 && (s1 - s2) < 0
3887   __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
3888   __m128i s1_xor_s2 = _mm_xor_si128(rsp->xv[VS1REG], shuf);
3889   __m128i s1_plus_s2 = _mm_add_epi16(rsp->xv[VS1REG], shuf);
3890   __m128i s1_sub_s2 = _mm_sub_epi16(rsp->xv[VS1REG], shuf);
3891   __m128i s2_neg = _mm_xor_si128(shuf, vec_neg1);
3892
3893   __m128i s2_lz = _mm_cmplt_epi16(shuf, vec_zero);
3894   __m128i s1s2_xor_lz = _mm_cmplt_epi16(s1_xor_s2, vec_zero);
3895   __m128i s1s2_xor_gez = _mm_xor_si128(s1s2_xor_lz, vec_neg1);
3896   __m128i s1s2_plus_gz = _mm_cmpgt_epi16(s1_plus_s2, vec_zero);
3897   __m128i s1s2_plus_lez = _mm_xor_si128(s1s2_plus_gz, vec_neg1);
3898   __m128i s1s2_sub_lz = _mm_cmplt_epi16(s1_sub_s2, vec_zero);
3899   __m128i s1s2_sub_gez = _mm_xor_si128(s1s2_sub_lz, vec_neg1);
3900
3901   __m128i s1_mask = _mm_or_si128(_mm_and_si128(s1s2_xor_gez, s1s2_sub_lz),   _mm_and_si128(s1s2_xor_lz, s1s2_plus_gz));
3902   __m128i s2_mask = _mm_or_si128(_mm_and_si128(s1s2_xor_gez, s1s2_sub_gez),  _mm_and_si128(s1s2_xor_lz, s1s2_plus_lez));
3903   rsp->accum_l = _mm_or_si128(_mm_and_si128(rsp->xv[VS1REG], s1_mask), _mm_and_si128(s2_neg, s2_mask));
3904   rsp->xv[VDREG] = rsp->accum_l;
3905
3906   __m128i f0_mask = _mm_or_si128(_mm_and_si128(s1s2_xor_gez, s2_lz),         _mm_and_si128(s1s2_xor_lz, s1s2_plus_lez));
3907   __m128i f8_mask = _mm_or_si128(_mm_and_si128(s1s2_xor_gez, s1s2_sub_gez),  _mm_and_si128(s1s2_xor_lz, s2_lz));
3908   f0_mask = _mm_and_si128(f0_mask, vec_flagmask);
3909   f8_mask = _mm_and_si128(f8_mask, vec_flagmask);
3910   rsp->flag[1] |= _mm_extract_epi16(f0_mask, 0) << 0;
3911   rsp->flag[1] |= _mm_extract_epi16(f0_mask, 1) << 1;
3912   rsp->flag[1] |= _mm_extract_epi16(f0_mask, 2) << 2;
3913   rsp->flag[1] |= _mm_extract_epi16(f0_mask, 3) << 3;
3914   rsp->flag[1] |= _mm_extract_epi16(f0_mask, 4) << 4;
3915   rsp->flag[1] |= _mm_extract_epi16(f0_mask, 5) << 5;
3916   rsp->flag[1] |= _mm_extract_epi16(f0_mask, 6) << 6;
3917   rsp->flag[1] |= _mm_extract_epi16(f0_mask, 7) << 7;
3918
3919   rsp->flag[1] |= _mm_extract_epi16(f8_mask, 0) << 8;
3920   rsp->flag[1] |= _mm_extract_epi16(f8_mask, 1) << 9;
3921   rsp->flag[1] |= _mm_extract_epi16(f8_mask, 2) << 10;
3922   rsp->flag[1] |= _mm_extract_epi16(f8_mask, 3) << 11;
3923   rsp->flag[1] |= _mm_extract_epi16(f8_mask, 4) << 12;
3924   rsp->flag[1] |= _mm_extract_epi16(f8_mask, 5) << 13;
3925   rsp->flag[1] |= _mm_extract_epi16(f8_mask, 6) << 14;
3926   rsp->flag[1] |= _mm_extract_epi16(f8_mask, 7) << 15;
3927#else
3928   INT16 vres[8];
3929   for (int i = 0; i < 8; i++)
29823930   {
2983      sel = VEC_EL_2(EL, i);
2984      s1 = VREG_S(VS1REG, i);
2985      s2 = VREG_S(VS2REG, sel);
3931      INT16 s1 = VREG_S(VS1REG, i);
3932      INT16 s2 = VREG_S(VS2REG, VEC_EL_2(EL, i));
29863933
29873934      if ((INT16)(s1 ^ s2) < 0)
29883935      {
r24005r24006
30203967      vres[i] = ACCUM_L(i);
30213968   }
30223969   WRITEBACK_RESULT();
3970#endif
30233971}
30243972
30253973INLINE void cfunc_rsp_vmrg(void *param)
30263974{
30273975   rsp_state *rsp = (rsp_state*)param;
30283976   int op = rsp->impstate->arg0;
3029   INT16 vres[8] = { 0 };
30303977   // 31       25  24     20      15      10      5        0
30313978   // ------------------------------------------------------
30323979   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100111 |
r24005r24006
30343981   //
30353982   // Merges two vectors according to compare flags
30363983
3037   int sel;
3984#if USE_SIMD
3985   __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
3986   __m128i compare = _mm_set_epi16(COMPARE_FLAG(7), COMPARE_FLAG(6), COMPARE_FLAG(5), COMPARE_FLAG(4),
3987                           COMPARE_FLAG(3), COMPARE_FLAG(2), COMPARE_FLAG(1), COMPARE_FLAG(0));
3988   __m128i s2mask = _mm_cmpeq_epi16(compare, vec_zero);
3989   __m128i s1mask = _mm_xor_si128(s2mask, vec_neg1);
3990   __m128i result = _mm_and_si128(rsp->xv[VS1REG], s1mask);
3991   rsp->xv[VDREG] = _mm_or_si128(result, _mm_and_si128(shuf, s2mask));
3992   rsp->accum_l = rsp->xv[VDREG];
3993#else
3994   INT16 vres[8];
30383995   for (int i = 0; i < 8; i++)
30393996   {
3040      sel = VEC_EL_2(EL, i);
3997      INT16 s1 = (INT16)VREG_S(VS1REG, i);
3998      INT16 s2 = (INT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
30413999      if (COMPARE_FLAG(i) != 0)
30424000      {
3043         vres[i] = VREG_S(VS1REG, i);
4001         vres[i] = s1;
30444002      }
30454003      else
30464004      {
3047         vres[i] = VREG_S(VS2REG, sel);//??? ???????????
4005         vres[i] = s2;
30484006      }
30494007
30504008      ACCUM_L(i) = vres[i];
30514009   }
30524010   WRITEBACK_RESULT();
4011#endif
30534012}
30544013
30554014INLINE void cfunc_rsp_vand(void *param)
30564015{
30574016   rsp_state *rsp = (rsp_state*)param;
30584017   int op = rsp->impstate->arg0;
3059   INT16 vres[8] = { 0 };
4018
30604019   // 31       25  24     20      15      10      5        0
30614020   // ------------------------------------------------------
30624021   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101000 |
r24005r24006
30644023   //
30654024   // Bitwise AND of two vector registers
30664025
3067   int sel;
4026#if USE_SIMD
4027   __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
4028   rsp->xv[VDREG] = _mm_and_si128(rsp->xv[VS1REG], shuf);
4029   rsp->accum_l = rsp->xv[VDREG];
4030#else
4031   INT16 vres[8];
30684032   for (int i = 0; i < 8; i++)
30694033   {
3070      sel = VEC_EL_2(EL, i);
3071      vres[i] = VREG_S(VS1REG, i) & VREG_S(VS2REG, sel);
4034      UINT16 s1 = (UINT16)VREG_S(VS1REG, i);
4035      UINT16 s2 = (UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
4036      vres[i] = s1 & s2;
30724037      ACCUM_L(i) = vres[i];
30734038   }
30744039   WRITEBACK_RESULT();
4040#endif
30754041}
30764042
30774043INLINE void cfunc_rsp_vnand(void *param)
30784044{
30794045   rsp_state *rsp = (rsp_state*)param;
30804046   int op = rsp->impstate->arg0;
3081   INT16 vres[8] = { 0 };
4047
30824048   // 31       25  24     20      15      10      5        0
30834049   // ------------------------------------------------------
30844050   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101001 |
r24005r24006
30864052   //
30874053   // Bitwise NOT AND of two vector registers
30884054
3089   int sel;
4055#if USE_SIMD
4056   __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
4057   rsp->xv[VDREG] = _mm_xor_si128(_mm_and_si128(rsp->xv[VS1REG], shuf), vec_neg1);
4058   rsp->accum_l = rsp->xv[VDREG];
4059#else
4060   INT16 vres[8];
30904061   for (int i = 0; i < 8; i++)
30914062   {
3092      sel = VEC_EL_2(EL, i);
3093      vres[i] = ~((VREG_S(VS1REG, i) & VREG_S(VS2REG, sel)));
4063      UINT16 s1 = (UINT16)VREG_S(VS1REG, i);
4064      UINT16 s2 = (UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
4065      vres[i] = ~((s1 & s2));
30944066      ACCUM_L(i) = vres[i];
30954067   }
30964068   WRITEBACK_RESULT();
4069#endif
30974070}
30984071
30994072INLINE void cfunc_rsp_vor(void *param)
31004073{
31014074   rsp_state *rsp = (rsp_state*)param;
31024075   int op = rsp->impstate->arg0;
3103   INT16 vres[8] = { 0 };;
4076
31044077   // 31       25  24     20      15      10      5        0
31054078   // ------------------------------------------------------
31064079   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101010 |
r24005r24006
31084081   //
31094082   // Bitwise OR of two vector registers
31104083
3111   int sel;
4084#if USE_SIMD
4085   __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
4086   rsp->xv[VDREG] = _mm_or_si128(rsp->xv[VS1REG], shuf);
4087   rsp->accum_l = rsp->xv[VDREG];
4088#else
4089   INT16 vres[8];
31124090   for (int i = 0; i < 8; i++)
31134091   {
3114      sel = VEC_EL_2(EL, i);
3115      vres[i] = VREG_S(VS1REG, i) | VREG_S(VS2REG, sel);
4092      UINT16 s1 = (UINT16)VREG_S(VS1REG, i);
4093      UINT16 s2 = (UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
4094      vres[i] = s1 | s2;
31164095      ACCUM_L(i) = vres[i];
31174096   }
31184097   WRITEBACK_RESULT();
4098#endif
31194099}
31204100
31214101INLINE void cfunc_rsp_vnor(void *param)
31224102{
31234103   rsp_state *rsp = (rsp_state*)param;
31244104   int op = rsp->impstate->arg0;
3125   INT16 vres[8] = { 0 };;
4105
31264106   // 31       25  24     20      15      10      5        0
31274107   // ------------------------------------------------------
31284108   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101011 |
r24005r24006
31304110   //
31314111   // Bitwise NOT OR of two vector registers
31324112
3133   int sel;
4113#if USE_SIMD
4114   __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
4115   rsp->xv[VDREG] = _mm_xor_si128(_mm_or_si128(rsp->xv[VS1REG], shuf), vec_neg1);
4116   rsp->accum_l = rsp->xv[VDREG];
4117#else
4118   INT16 vres[8];
31344119   for (int i = 0; i < 8; i++)
31354120   {
3136      sel = VEC_EL_2(EL, i);
3137      vres[i] = ~((VREG_S(VS1REG, i) | VREG_S(VS2REG, sel)));
4121      UINT16 s1 = (UINT16)VREG_S(VS1REG, i);
4122      UINT16 s2 = (UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
4123      vres[i] = ~((s1 | s2));
31384124      ACCUM_L(i) = vres[i];
31394125   }
31404126   WRITEBACK_RESULT();
4127#endif
31414128}
31424129
31434130INLINE void cfunc_rsp_vxor(void *param)
31444131{
31454132   rsp_state *rsp = (rsp_state*)param;
31464133   int op = rsp->impstate->arg0;
3147   INT16 vres[8] = { 0 };;
4134
31484135   // 31       25  24     20      15      10      5        0
31494136   // ------------------------------------------------------
31504137   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101100 |
r24005r24006
31524139   //
31534140   // Bitwise XOR of two vector registers
31544141
3155   int sel;
4142#if USE_SIMD
4143   __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
4144   rsp->xv[VDREG] = _mm_xor_si128(rsp->xv[VS1REG], shuf);
4145   rsp->accum_l = rsp->xv[VDREG];
4146#else
4147   INT16 vres[8];
31564148   for (int i = 0; i < 8; i++)
31574149   {
3158      sel = VEC_EL_2(EL, i);
3159      vres[i] = VREG_S(VS1REG, i) ^ VREG_S(VS2REG, sel);
4150      UINT16 s1 = (UINT16)VREG_S(VS1REG, i);
4151      UINT16 s2 = (UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
4152      vres[i] = s1 ^ s2;
31604153      ACCUM_L(i) = vres[i];
31614154   }
31624155   WRITEBACK_RESULT();
4156#endif
31634157}
31644158
31654159INLINE void cfunc_rsp_vnxor(void *param)
31664160{
31674161   rsp_state *rsp = (rsp_state*)param;
31684162   int op = rsp->impstate->arg0;
3169   INT16 vres[8] = { 0 };;
4163
31704164   // 31       25  24     20      15      10      5        0
31714165   // ------------------------------------------------------
31724166   // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101101 |
r24005r24006
31744168   //
31754169   // Bitwise NOT XOR of two vector registers
31764170
3177   int sel;
4171#if USE_SIMD
4172   __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
4173   rsp->xv[VDREG] = _mm_xor_si128(_mm_xor_si128(rsp->xv[VS1REG], shuf), vec_neg1);
4174   rsp->accum_l = rsp->xv[VDREG];
4175#else
4176   INT16 vres[8];
31784177   for (int i = 0; i < 8; i++)
31794178   {
3180      sel = VEC_EL_2(EL, i);
3181      vres[i] = ~((VREG_S(VS1REG, i) ^ VREG_S(VS2REG, sel)));
4179      UINT16 s1 = (UINT16)VREG_S(VS1REG, i);
4180      UINT16 s2 = (UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i));
4181      vres[i] = ~((s1 ^ s2));
31824182      ACCUM_L(i) = vres[i];
31834183   }
31844184   WRITEBACK_RESULT();
4185#endif
31854186}
31864187
31874188INLINE void cfunc_rsp_vrcp(void *param)
31884189{
31894190   rsp_state *rsp = (rsp_state*)param;
31904191   int op = rsp->impstate->arg0;
3191   int i;
4192
31924193   // 31       25  24     20      15      10      5        0
31934194   // ------------------------------------------------------
31944195   // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110000 |
r24005r24006
31964197   //
31974198   // Calculates reciprocal
31984199
3199   int del = VS1REG & 7;
3200   int sel = EL & 7;
32014200   INT32 shifter = 0;
3202
3203   INT32 rec = (INT16)(VREG_S(VS2REG, sel));
4201#if USE_SIMD
4202   UINT16 urec;
4203   INT32 rec;
4204   SIMD_EXTRACT16(rsp->xv[VS2REG], urec, EL);
4205   rec = (INT16)urec;
4206#else
4207   INT32 rec = (INT16)(VREG_S(VS2REG, EL & 7));
4208#endif
32044209   INT32 datainput = (rec < 0) ? (-rec) : rec;
32054210   if (datainput)
32064211   {
3207      for (i = 0; i < 32; i++)
4212      for (int i = 0; i < 32; i++)
32084213      {
3209         if (datainput & (1 << ((~i) & 0x1f)))//?.?.??? 31 - i
4214         if (datainput & (1 << ((~i) & 0x1f)))
32104215         {
32114216            shifter = i;
32124217            break;
r24005r24006
32384243   rsp->reciprocal_res = rec;
32394244   rsp->dp_allowed = 0;
32404245
3241   W_VREG_S(VDREG, del) = (UINT16)(rec & 0xffff);
4246#if USE_SIMD
4247   SIMD_INSERT16(rsp->xv[VDREG], (UINT16)rec, VS1REG);
4248#else
4249   W_VREG_S(VDREG, VS1REG & 7) = (UINT16)rec;
4250#endif
32424251
3243   for (i = 0; i < 8; i++)
4252   for (int i = 0; i < 8; i++)
32444253   {
3245      sel = VEC_EL_2(EL, i);
3246      ACCUM_L(i) = VREG_S(VS2REG, sel);
4254#if USE_SIMD
4255      INT16 val;
4256      SIMD_EXTRACT16(rsp->xv[VS2REG], val, VEC_EL_2(EL, i));
4257      SIMD_INSERT16(rsp->accum_l, val, i);
4258#else
4259      ACCUM_L(i) = VREG_S(VS2REG, VEC_EL_2(EL, i));
4260#endif
32474261   }
32484262}
32494263
r24005r24006
32514265{
32524266   rsp_state *rsp = (rsp_state*)param;
32534267   int op = rsp->impstate->arg0;
3254   int i;
4268
32554269   // 31       25  24     20      15      10      5        0
32564270   // ------------------------------------------------------
32574271   // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110001 |
r24005r24006
32594273   //
32604274   // Calculates reciprocal low part
32614275
3262   int del = VS1REG & 7;
3263   int sel = EL & 7;
32644276   INT32 shifter = 0;
32654277
3266   INT32 rec = ((UINT16)(VREG_S(VS2REG, sel)) | ((UINT32)(rsp->reciprocal_high) & 0xffff0000));
4278#if USE_SIMD
4279   UINT16 urec;
4280   INT32 rec;
4281   SIMD_EXTRACT16(rsp->xv[VS2REG], urec, EL);
4282   rec = (INT32)(rsp->reciprocal_high | urec);
4283#else
4284   INT32 rec = ((UINT16)(VREG_S(VS2REG, EL & 7)) | rsp->reciprocal_high);
4285#endif
32674286
32684287   INT32 datainput = rec;
32694288
r24005r24006
32894308
32904309   if (datainput)
32914310   {
3292      for (i = 0; i < 32; i++)
4311      for (int i = 0; i < 32; i++)
32934312      {
3294         if (datainput & (1 << ((~i) & 0x1f)))//?.?.??? 31 - i
4313         if (datainput & (1 << ((~i) & 0x1f)))
32954314         {
32964315            shifter = i;
32974316            break;
r24005r24006
33304349   rsp->reciprocal_res = rec;
33314350   rsp->dp_allowed = 0;
33324351
3333   W_VREG_S(VDREG, del) = (UINT16)(rec & 0xffff);
4352#if USE_SIMD
4353   SIMD_INSERT16(rsp->xv[VDREG], (UINT16)rec, VS1REG);
4354#else
4355   W_VREG_S(VDREG, VS1REG & 7) = (UINT16)rec;
4356#endif
33344357
3335   for (i = 0; i < 8; i++)
4358   for (int i = 0; i < 8; i++)
33364359   {
3337      sel = VEC_EL_2(EL, i);
3338      ACCUM_L(i) = VREG_S(VS2REG, sel);
4360#if USE_SIMD
4361      INT16 val;
4362      SIMD_EXTRACT16(rsp->xv[VS2REG], val, VEC_EL_2(EL, i));
4363      SIMD_INSERT16(rsp->accum_l, val, i);
4364#else
4365      ACCUM_L(i) = VREG_S(VS2REG, VEC_EL_2(EL, i));
4366#endif
33394367   }
33404368}
33414369
r24005r24006
33504378   //
33514379   // Calculates reciprocal high part
33524380
3353   int del = VS1REG & 7;
3354   int sel = EL & 7;
4381#if USE_SIMD
4382   UINT16 rcph;
4383   SIMD_EXTRACT16(rsp->xv[VS2REG], rcph, EL);
4384   rsp->reciprocal_high = rcph << 16;
4385   rsp->dp_allowed = 1;
33554386
3356   rsp->reciprocal_high = (VREG_S(VS2REG, sel)) << 16;
4387   //rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
4388   INT16 val;
4389   for (int i = 0; i < 8; i++)
4390   {
4391      SIMD_EXTRACT16(rsp->xv[VS2REG], val, VEC_EL_2(EL, i));
4392      SIMD_INSERT16(rsp->accum_l, val, i);
4393   }
4394
4395   SIMD_INSERT16(rsp->xv[VDREG], (INT16)(rsp->reciprocal_res >> 16), VS1REG);
4396#else
4397   rsp->reciprocal_high = (VREG_S(VS2REG, EL & 7)) << 16;
33574398   rsp->dp_allowed = 1;
33584399
33594400   for (int i = 0; i < 8; i++)
33604401   {
3361      sel = VEC_EL_2(EL, i);
3362      ACCUM_L(i) = VREG_S(VS2REG, sel);
4402      ACCUM_L(i) = VREG_S(VS2REG, VEC_EL_2(EL, i));
33634403   }
33644404
3365   W_VREG_S(VDREG, del) = (INT16)(rsp->reciprocal_res >> 16);
4405   W_VREG_S(VDREG, VS1REG & 7) = (INT16)(rsp->reciprocal_res >> 16);
4406#endif
33664407}
33674408
33684409INLINE void cfunc_rsp_vmov(void *param)
r24005r24006
33764417   //
33774418   // Moves element from vector to destination vector
33784419
3379   int del = VS1REG & 7;
3380   int sel = EL & 7;
3381
3382   W_VREG_S(VDREG, del) = VREG_S(VS2REG, sel);
4420#if USE_SIMD
4421   INT16 val;
4422   SIMD_EXTRACT16(rsp->xv[VS2REG], val, EL);
4423   SIMD_INSERT16(rsp->xv[VDREG], val, VS1REG);
4424   //rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
33834425   for (int i = 0; i < 8; i++)
33844426   {
3385      sel = VEC_EL_2(EL, i);
3386      ACCUM_L(i) = VREG_S(VS2REG, sel);
4427      SIMD_EXTRACT16(rsp->xv[VS2REG], val, VEC_EL_2(EL, i));
4428      SIMD_INSERT16(rsp->accum_l, val, i);
33874429   }
4430#else
4431   W_VREG_S(VDREG, VS1REG & 7) = VREG_S(VS2REG, EL & 7);
4432   for (int i = 0; i < 8; i++)
4433   {
4434      ACCUM_L(i) = VREG_S(VS2REG, VEC_EL_2(EL, i));
4435   }
4436#endif
33884437}
33894438
33904439INLINE void cfunc_rsp_vrsql(void *param)
33914440{
33924441   rsp_state *rsp = (rsp_state*)param;
33934442   int op = rsp->impstate->arg0;
3394   int i;
4443
33954444   // 31       25  24     20      15      10      5        0
33964445   // ------------------------------------------------------
33974446   // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110101 |
r24005r24006
33994448   //
34004449   // Calculates reciprocal square-root low part
34014450
3402   int del = VS1REG & 7;
3403   int sel = EL & 7;
34044451   INT32 shifter = 0;
3405
3406   INT32 rec = ((UINT16)(VREG_S(VS2REG, sel)) | ((UINT32)(rsp->reciprocal_high) & 0xffff0000));
3407
4452#if USE_SIMD
4453   UINT16 val;
4454   SIMD_EXTRACT16(rsp->xv[VS2REG], val, EL);
4455   INT32 rec = (INT32)(rsp->reciprocal_high | val);
4456#else
4457   INT32 rec = rsp->reciprocal_high | (UINT16)VREG_S(VS2REG, EL & 7);
4458#endif
34084459   INT32 datainput = rec;
34094460
34104461   if (rec < 0)
34114462   {
34124463      if (rsp->dp_allowed)
34134464      {
3414         if (rec < -32768)//VDIV.C,208
4465         if (rec < -32768)
34154466         {
34164467            datainput = ~datainput;
34174468         }
r24005r24006
34284479
34294480   if (datainput)
34304481   {
3431      for (i = 0; i < 32; i++)
4482      for (int i = 0; i < 32; i++)
34324483      {
34334484         if (datainput & (1 << ((~i) & 0x1f)))
34344485         {
r24005r24006
34714522   rsp->reciprocal_res = rec;
34724523   rsp->dp_allowed = 0;
34734524
3474   W_VREG_S(VDREG, del) = (UINT16)(rec & 0xffff);
3475
3476   for (i = 0; i < 8; i++)
4525#if USE_SIMD
4526   SIMD_INSERT16(rsp->xv[VDREG], (UINT16)rec, VS1REG);
4527   //rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
4528   for (int i = 0; i < 8; i++)
34774529   {
3478      sel = VEC_EL_2(EL, i);
3479      ACCUM_L(i) = VREG_S(VS2REG, sel);
4530      SIMD_EXTRACT16(rsp->xv[VS2REG], val, VEC_EL_2(EL, i));
4531      SIMD_INSERT16(rsp->accum_l, val, i);
34804532   }
4533#else
4534   W_VREG_S(VDREG, VS1REG & 7) = (UINT16)(rec & 0xffff);
4535   for (int i = 0; i < 8; i++)
4536   {
4537      ACCUM_L(i) = VREG_S(VS2REG, VEC_EL_2(EL, i));
4538   }
4539#endif
34814540}
34824541
34834542INLINE void cfunc_rsp_vrsqh(void *param)
34844543{
34854544   rsp_state *rsp = (rsp_state*)param;
34864545   int op = rsp->impstate->arg0;
3487   int i;
4546
34884547   // 31       25  24     20      15      10      5        0
34894548   // ------------------------------------------------------
34904549   // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110110 |
r24005r24006
34924551   //
34934552   // Calculates reciprocal square-root high part
34944553
3495   int del = VS1REG & 7;
3496   int sel = EL & 7;
4554#if USE_SIMD
4555   UINT16 val;
4556   SIMD_EXTRACT16(rsp->xv[VS2REG], val, EL);
4557   rsp->reciprocal_high = val << 16;
4558   rsp->dp_allowed = 1;
34974559
3498   rsp->reciprocal_high = (VREG_S(VS2REG, sel)) << 16;
4560   //rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);
4561   for (int i = 0; i < 8; i++)
4562   {
4563      SIMD_EXTRACT16(rsp->xv[VS2REG], val, VEC_EL_2(EL, i));
4564      SIMD_INSERT16(rsp->accum_l, val, i);
4565   }
4566
4567   SIMD_INSERT16(rsp->xv[VDREG], (UINT16)(rsp->reciprocal_res >> 16), VS1REG); // store high part
4568#else
4569   rsp->reciprocal_high = (VREG_S(VS2REG, EL & 7)) << 16;
34994570   rsp->dp_allowed = 1;
35004571
3501   for (i=0; i < 8; i++)
4572   for (int i = 0; i < 8; i++)
35024573   {
3503      sel = VEC_EL_2(EL, i);
3504      ACCUM_L(i) = VREG_S(VS2REG, sel);
4574      ACCUM_L(i) = VREG_S(VS2REG, VEC_EL_2(EL, i));
35054575   }
35064576
3507   W_VREG_S(VDREG, del) = (INT16)(rsp->reciprocal_res >> 16);  // store high part
4577   W_VREG_S(VDREG, VS1REG & 7) = (INT16)(rsp->reciprocal_res >> 16);  // store high part
4578#endif
35084579}
35094580
35104581static void cfunc_sp_set_status_cb(void *param)
r24005r24006
47815852   rsp_state *rsp = (rsp_state*)param;
47825853   UINT32 op = rsp->impstate->arg0;
47835854   int el = (op >> 7) & 0xf;
5855#if USE_SIMD
5856   UINT16 w;
5857   SIMD_EXTRACT16(rsp->xv[VS1REG], w, el >> 1);
5858   rsp->r[RTREG] = (INT32)(INT16)w;
5859#else
47845860   UINT16 b1 = VREG_B(VS1REG, (el+0) & 0xf);
47855861   UINT16 b2 = VREG_B(VS1REG, (el+1) & 0xf);
47865862   if (RTREG) RTVAL = (INT32)(INT16)((b1 << 8) | (b2));
5863#endif
47875864}
47885865
47895866static void cfunc_cfc2(void *param)
r24005r24006
48105887   rsp_state *rsp = (rsp_state*)param;
48115888   UINT32 op = rsp->impstate->arg0;
48125889   int el = (op >> 7) & 0xf;
5890#if USE_SIMD
5891   SIMD_INSERT16(rsp->xv[VS1REG], RTVAL, el >> 1);
5892#else
48135893   VREG_B(VS1REG, (el+0) & 0xf) = (RTVAL >> 8) & 0xff;
48145894   VREG_B(VS1REG, (el+1) & 0xf) = (RTVAL >> 0) & 0xff;
5895#endif
48155896}
48165897
48175898static void cfunc_ctc2(void *param)
r24005r24006
50136094      case CPUINFO_STR_REGISTER + RSP_R29:            sprintf(info->s, "R29: %08X", rsp->r[29]); break;
50146095      case CPUINFO_STR_REGISTER + RSP_R30:            sprintf(info->s, "R30: %08X", rsp->r[30]); break;
50156096      case CPUINFO_STR_REGISTER + RSP_R31:            sprintf(info->s, "R31: %08X", rsp->r[31]); break;
6097
6098#if USE_SIMD
6099      case CPUINFO_STR_REGISTER + RSP_V0:             sprintf(info->s, "V0: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)_mm_extract_epi16(rsp->xv[ 0], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 0], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 0], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 0], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 0], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 0], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 0], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 0], 0)); break;
6100      case CPUINFO_STR_REGISTER + RSP_V1:             sprintf(info->s, "V1: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)_mm_extract_epi16(rsp->xv[ 1], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 1], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 1], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 1], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 1], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 1], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 1], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 1], 0)); break;
6101      case CPUINFO_STR_REGISTER + RSP_V2:             sprintf(info->s, "V2: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)_mm_extract_epi16(rsp->xv[ 2], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 2], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 2], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 2], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 2], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 2], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 2], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 2], 0)); break;
6102      case CPUINFO_STR_REGISTER + RSP_V3:             sprintf(info->s, "V3: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)_mm_extract_epi16(rsp->xv[ 3], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 3], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 3], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 3], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 3], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 3], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 3], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 3], 0)); break;
6103      case CPUINFO_STR_REGISTER + RSP_V4:             sprintf(info->s, "V4: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)_mm_extract_epi16(rsp->xv[ 4], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 4], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 4], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 4], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 4], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 4], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 4], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 4], 0)); break;
6104      case CPUINFO_STR_REGISTER + RSP_V5:             sprintf(info->s, "V5: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)_mm_extract_epi16(rsp->xv[ 5], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 5], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 5], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 5], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 5], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 5], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 5], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 5], 0)); break;
6105      case CPUINFO_STR_REGISTER + RSP_V6:             sprintf(info->s, "V6: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)_mm_extract_epi16(rsp->xv[ 6], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 6], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 6], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 6], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 6], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 6], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 6], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 6], 0)); break;
6106      case CPUINFO_STR_REGISTER + RSP_V7:             sprintf(info->s, "V7: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)_mm_extract_epi16(rsp->xv[ 7], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 7], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 7], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 7], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 7], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 7], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 7], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 7], 0)); break;
6107      case CPUINFO_STR_REGISTER + RSP_V8:             sprintf(info->s, "V8: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)_mm_extract_epi16(rsp->xv[ 8], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 8], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 8], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 8], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 8], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 8], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 8], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 8], 0)); break;
6108      case CPUINFO_STR_REGISTER + RSP_V9:             sprintf(info->s, "V9: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)_mm_extract_epi16(rsp->xv[ 9], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 9], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 9], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 9], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 9], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 9], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 9], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 9], 0)); break;
6109      case CPUINFO_STR_REGISTER + RSP_V10:            sprintf(info->s, "V10: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[10], 7), (UINT16)_mm_extract_epi16(rsp->xv[10], 6), (UINT16)_mm_extract_epi16(rsp->xv[10], 5), (UINT16)_mm_extract_epi16(rsp->xv[10], 4), (UINT16)_mm_extract_epi16(rsp->xv[10], 3), (UINT16)_mm_extract_epi16(rsp->xv[10], 2), (UINT16)_mm_extract_epi16(rsp->xv[10], 1), (UINT16)_mm_extract_epi16(rsp->xv[10], 0)); break;
6110      case CPUINFO_STR_REGISTER + RSP_V11:            sprintf(info->s, "V11: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[11], 7), (UINT16)_mm_extract_epi16(rsp->xv[11], 6), (UINT16)_mm_extract_epi16(rsp->xv[11], 5), (UINT16)_mm_extract_epi16(rsp->xv[11], 4), (UINT16)_mm_extract_epi16(rsp->xv[11], 3), (UINT16)_mm_extract_epi16(rsp->xv[11], 2), (UINT16)_mm_extract_epi16(rsp->xv[11], 1), (UINT16)_mm_extract_epi16(rsp->xv[11], 0)); break;
6111      case CPUINFO_STR_REGISTER + RSP_V12:            sprintf(info->s, "V12: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[12], 7), (UINT16)_mm_extract_epi16(rsp->xv[12], 6), (UINT16)_mm_extract_epi16(rsp->xv[12], 5), (UINT16)_mm_extract_epi16(rsp->xv[12], 4), (UINT16)_mm_extract_epi16(rsp->xv[12], 3), (UINT16)_mm_extract_epi16(rsp->xv[12], 2), (UINT16)_mm_extract_epi16(rsp->xv[12], 1), (UINT16)_mm_extract_epi16(rsp->xv[12], 0)); break;
6112      case CPUINFO_STR_REGISTER + RSP_V13:            sprintf(info->s, "V13: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[13], 7), (UINT16)_mm_extract_epi16(rsp->xv[13], 6), (UINT16)_mm_extract_epi16(rsp->xv[13], 5), (UINT16)_mm_extract_epi16(rsp->xv[13], 4), (UINT16)_mm_extract_epi16(rsp->xv[13], 3), (UINT16)_mm_extract_epi16(rsp->xv[13], 2), (UINT16)_mm_extract_epi16(rsp->xv[13], 1), (UINT16)_mm_extract_epi16(rsp->xv[13], 0)); break;
6113      case CPUINFO_STR_REGISTER + RSP_V14:            sprintf(info->s, "V14: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[14], 7), (UINT16)_mm_extract_epi16(rsp->xv[14], 6), (UINT16)_mm_extract_epi16(rsp->xv[14], 5), (UINT16)_mm_extract_epi16(rsp->xv[14], 4), (UINT16)_mm_extract_epi16(rsp->xv[14], 3), (UINT16)_mm_extract_epi16(rsp->xv[14], 2), (UINT16)_mm_extract_epi16(rsp->xv[14], 1), (UINT16)_mm_extract_epi16(rsp->xv[14], 0)); break;
6114      case CPUINFO_STR_REGISTER + RSP_V15:            sprintf(info->s, "V15: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[15], 7), (UINT16)_mm_extract_epi16(rsp->xv[15], 6), (UINT16)_mm_extract_epi16(rsp->xv[15], 5), (UINT16)_mm_extract_epi16(rsp->xv[15], 4), (UINT16)_mm_extract_epi16(rsp->xv[15], 3), (UINT16)_mm_extract_epi16(rsp->xv[15], 2), (UINT16)_mm_extract_epi16(rsp->xv[15], 1), (UINT16)_mm_extract_epi16(rsp->xv[15], 0)); break;
6115      case CPUINFO_STR_REGISTER + RSP_V16:            sprintf(info->s, "V16: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[16], 7), (UINT16)_mm_extract_epi16(rsp->xv[16], 6), (UINT16)_mm_extract_epi16(rsp->xv[16], 5), (UINT16)_mm_extract_epi16(rsp->xv[16], 4), (UINT16)_mm_extract_epi16(rsp->xv[16], 3), (UINT16)_mm_extract_epi16(rsp->xv[16], 2), (UINT16)_mm_extract_epi16(rsp->xv[16], 1), (UINT16)_mm_extract_epi16(rsp->xv[16], 0)); break;
6116      case CPUINFO_STR_REGISTER + RSP_V17:            sprintf(info->s, "V17: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[17], 7), (UINT16)_mm_extract_epi16(rsp->xv[17], 6), (UINT16)_mm_extract_epi16(rsp->xv[17], 5), (UINT16)_mm_extract_epi16(rsp->xv[17], 4), (UINT16)_mm_extract_epi16(rsp->xv[17], 3), (UINT16)_mm_extract_epi16(rsp->xv[17], 2), (UINT16)_mm_extract_epi16(rsp->xv[17], 1), (UINT16)_mm_extract_epi16(rsp->xv[17], 0)); break;
6117      case CPUINFO_STR_REGISTER + RSP_V18:            sprintf(info->s, "V18: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[18], 7), (UINT16)_mm_extract_epi16(rsp->xv[18], 6), (UINT16)_mm_extract_epi16(rsp->xv[18], 5), (UINT16)_mm_extract_epi16(rsp->xv[18], 4), (UINT16)_mm_extract_epi16(rsp->xv[18], 3), (UINT16)_mm_extract_epi16(rsp->xv[18], 2), (UINT16)_mm_extract_epi16(rsp->xv[18], 1), (UINT16)_mm_extract_epi16(rsp->xv[18], 0)); break;
6118      case CPUINFO_STR_REGISTER + RSP_V19:            sprintf(info->s, "V19: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[19], 7), (UINT16)_mm_extract_epi16(rsp->xv[19], 6), (UINT16)_mm_extract_epi16(rsp->xv[19], 5), (UINT16)_mm_extract_epi16(rsp->xv[19], 4), (UINT16)_mm_extract_epi16(rsp->xv[19], 3), (UINT16)_mm_extract_epi16(rsp->xv[19], 2), (UINT16)_mm_extract_epi16(rsp->xv[19], 1), (UINT16)_mm_extract_epi16(rsp->xv[19], 0)); break;
6119      case CPUINFO_STR_REGISTER + RSP_V20:            sprintf(info->s, "V20: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[20], 7), (UINT16)_mm_extract_epi16(rsp->xv[20], 6), (UINT16)_mm_extract_epi16(rsp->xv[20], 5), (UINT16)_mm_extract_epi16(rsp->xv[20], 4), (UINT16)_mm_extract_epi16(rsp->xv[20], 3), (UINT16)_mm_extract_epi16(rsp->xv[20], 2), (UINT16)_mm_extract_epi16(rsp->xv[20], 1), (UINT16)_mm_extract_epi16(rsp->xv[20], 0)); break;
6120      case CPUINFO_STR_REGISTER + RSP_V21:            sprintf(info->s, "V21: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[21], 7), (UINT16)_mm_extract_epi16(rsp->xv[21], 6), (UINT16)_mm_extract_epi16(rsp->xv[21], 5), (UINT16)_mm_extract_epi16(rsp->xv[21], 4), (UINT16)_mm_extract_epi16(rsp->xv[21], 3), (UINT16)_mm_extract_epi16(rsp->xv[21], 2), (UINT16)_mm_extract_epi16(rsp->xv[21], 1), (UINT16)_mm_extract_epi16(rsp->xv[21], 0)); break;
6121      case CPUINFO_STR_REGISTER + RSP_V22:            sprintf(info->s, "V22: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[22], 7), (UINT16)_mm_extract_epi16(rsp->xv[22], 6), (UINT16)_mm_extract_epi16(rsp->xv[22], 5), (UINT16)_mm_extract_epi16(rsp->xv[22], 4), (UINT16)_mm_extract_epi16(rsp->xv[22], 3), (UINT16)_mm_extract_epi16(rsp->xv[22], 2), (UINT16)_mm_extract_epi16(rsp->xv[22], 1), (UINT16)_mm_extract_epi16(rsp->xv[22], 0)); break;
6122      case CPUINFO_STR_REGISTER + RSP_V23:            sprintf(info->s, "V23: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[23], 7), (UINT16)_mm_extract_epi16(rsp->xv[23], 6), (UINT16)_mm_extract_epi16(rsp->xv[23], 5), (UINT16)_mm_extract_epi16(rsp->xv[23], 4), (UINT16)_mm_extract_epi16(rsp->xv[23], 3), (UINT16)_mm_extract_epi16(rsp->xv[23], 2), (UINT16)_mm_extract_epi16(rsp->xv[23], 1), (UINT16)_mm_extract_epi16(rsp->xv[23], 0)); break;
6123      case CPUINFO_STR_REGISTER + RSP_V24:            sprintf(info->s, "V24: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[24], 7), (UINT16)_mm_extract_epi16(rsp->xv[24], 6), (UINT16)_mm_extract_epi16(rsp->xv[24], 5), (UINT16)_mm_extract_epi16(rsp->xv[24], 4), (UINT16)_mm_extract_epi16(rsp->xv[24], 3), (UINT16)_mm_extract_epi16(rsp->xv[24], 2), (UINT16)_mm_extract_epi16(rsp->xv[24], 1), (UINT16)_mm_extract_epi16(rsp->xv[24], 0)); break;
6124      case CPUINFO_STR_REGISTER + RSP_V25:            sprintf(info->s, "V25: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[25], 7), (UINT16)_mm_extract_epi16(rsp->xv[25], 6), (UINT16)_mm_extract_epi16(rsp->xv[25], 5), (UINT16)_mm_extract_epi16(rsp->xv[25], 4), (UINT16)_mm_extract_epi16(rsp->xv[25], 3), (UINT16)_mm_extract_epi16(rsp->xv[25], 2), (UINT16)_mm_extract_epi16(rsp->xv[25], 1), (UINT16)_mm_extract_epi16(rsp->xv[25], 0)); break;
6125      case CPUINFO_STR_REGISTER + RSP_V26:            sprintf(info->s, "V26: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[26], 7), (UINT16)_mm_extract_epi16(rsp->xv[26], 6), (UINT16)_mm_extract_epi16(rsp->xv[26], 5), (UINT16)_mm_extract_epi16(rsp->xv[26], 4), (UINT16)_mm_extract_epi16(rsp->xv[26], 3), (UINT16)_mm_extract_epi16(rsp->xv[26], 2), (UINT16)_mm_extract_epi16(rsp->xv[26], 1), (UINT16)_mm_extract_epi16(rsp->xv[26], 0)); break;
6126      case CPUINFO_STR_REGISTER + RSP_V27:            sprintf(info->s, "V27: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[27], 7), (UINT16)_mm_extract_epi16(rsp->xv[27], 6), (UINT16)_mm_extract_epi16(rsp->xv[27], 5), (UINT16)_mm_extract_epi16(rsp->xv[27], 4), (UINT16)_mm_extract_epi16(rsp->xv[27], 3), (UINT16)_mm_extract_epi16(rsp->xv[27], 2), (UINT16)_mm_extract_epi16(rsp->xv[27], 1), (UINT16)_mm_extract_epi16(rsp->xv[27], 0)); break;
6127      case CPUINFO_STR_REGISTER + RSP_V28:            sprintf(info->s, "V28: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[28], 7), (UINT16)_mm_extract_epi16(rsp->xv[28], 6), (UINT16)_mm_extract_epi16(rsp->xv[28], 5), (UINT16)_mm_extract_epi16(rsp->xv[28], 4), (UINT16)_mm_extract_epi16(rsp->xv[28], 3), (UINT16)_mm_extract_epi16(rsp->xv[28], 2), (UINT16)_mm_extract_epi16(rsp->xv[28], 1), (UINT16)_mm_extract_epi16(rsp->xv[28], 0)); break;
6128      case CPUINFO_STR_REGISTER + RSP_V29:            sprintf(info->s, "V29: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[29], 7), (UINT16)_mm_extract_epi16(rsp->xv[29], 6), (UINT16)_mm_extract_epi16(rsp->xv[29], 5), (UINT16)_mm_extract_epi16(rsp->xv[29], 4), (UINT16)_mm_extract_epi16(rsp->xv[29], 3), (UINT16)_mm_extract_epi16(rsp->xv[29], 2), (UINT16)_mm_extract_epi16(rsp->xv[29], 1), (UINT16)_mm_extract_epi16(rsp->xv[29], 0)); break;
6129      case CPUINFO_STR_REGISTER + RSP_V30:            sprintf(info->s, "V30: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[30], 7), (UINT16)_mm_extract_epi16(rsp->xv[30], 6), (UINT16)_mm_extract_epi16(rsp->xv[30], 5), (UINT16)_mm_extract_epi16(rsp->xv[30], 4), (UINT16)_mm_extract_epi16(rsp->xv[30], 3), (UINT16)_mm_extract_epi16(rsp->xv[30], 2), (UINT16)_mm_extract_epi16(rsp->xv[30], 1), (UINT16)_mm_extract_epi16(rsp->xv[30], 0)); break;
6130      case CPUINFO_STR_REGISTER + RSP_V31:            sprintf(info->s, "V31: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[31], 7), (UINT16)_mm_extract_epi16(rsp->xv[31], 6), (UINT16)_mm_extract_epi16(rsp->xv[31], 5), (UINT16)_mm_extract_epi16(rsp->xv[31], 4), (UINT16)_mm_extract_epi16(rsp->xv[31], 3), (UINT16)_mm_extract_epi16(rsp->xv[31], 2), (UINT16)_mm_extract_epi16(rsp->xv[31], 1), (UINT16)_mm_extract_epi16(rsp->xv[31], 0)); break;
6131#else
6132      case CPUINFO_STR_REGISTER + RSP_V0:             sprintf(info->s, "V0: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)VREG_S( 0, 0), (UINT16)VREG_S( 0, 1), (UINT16)VREG_S( 0, 2), (UINT16)VREG_S( 0, 3), (UINT16)VREG_S( 0, 4), (UINT16)VREG_S( 0, 5), (UINT16)VREG_S( 0, 6), (UINT16)VREG_S( 0, 7)); break;
6133      case CPUINFO_STR_REGISTER + RSP_V1:             sprintf(info->s, "V1: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)VREG_S( 1, 0), (UINT16)VREG_S( 1, 1), (UINT16)VREG_S( 1, 2), (UINT16)VREG_S( 1, 3), (UINT16)VREG_S( 1, 4), (UINT16)VREG_S( 1, 5), (UINT16)VREG_S( 1, 6), (UINT16)VREG_S( 1, 7)); break;
6134      case CPUINFO_STR_REGISTER + RSP_V2:             sprintf(info->s, "V2: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)VREG_S( 2, 0), (UINT16)VREG_S( 2, 1), (UINT16)VREG_S( 2, 2), (UINT16)VREG_S( 2, 3), (UINT16)VREG_S( 2, 4), (UINT16)VREG_S( 2, 5), (UINT16)VREG_S( 2, 6), (UINT16)VREG_S( 2, 7)); break;
6135      case CPUINFO_STR_REGISTER + RSP_V3:             sprintf(info->s, "V3: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)VREG_S( 3, 0), (UINT16)VREG_S( 3, 1), (UINT16)VREG_S( 3, 2), (UINT16)VREG_S( 3, 3), (UINT16)VREG_S( 3, 4), (UINT16)VREG_S( 3, 5), (UINT16)VREG_S( 3, 6), (UINT16)VREG_S( 3, 7)); break;
6136      case CPUINFO_STR_REGISTER + RSP_V4:             sprintf(info->s, "V4: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)VREG_S( 4, 0), (UINT16)VREG_S( 4, 1), (UINT16)VREG_S( 4, 2), (UINT16)VREG_S( 4, 3), (UINT16)VREG_S( 4, 4), (UINT16)VREG_S( 4, 5), (UINT16)VREG_S( 4, 6), (UINT16)VREG_S( 4, 7)); break;
6137      case CPUINFO_STR_REGISTER + RSP_V5:             sprintf(info->s, "V5: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)VREG_S( 5, 0), (UINT16)VREG_S( 5, 1), (UINT16)VREG_S( 5, 2), (UINT16)VREG_S( 5, 3), (UINT16)VREG_S( 5, 4), (UINT16)VREG_S( 5, 5), (UINT16)VREG_S( 5, 6), (UINT16)VREG_S( 5, 7)); break;
6138      case CPUINFO_STR_REGISTER + RSP_V6:             sprintf(info->s, "V6: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)VREG_S( 6, 0), (UINT16)VREG_S( 6, 1), (UINT16)VREG_S( 6, 2), (UINT16)VREG_S( 6, 3), (UINT16)VREG_S( 6, 4), (UINT16)VREG_S( 6, 5), (UINT16)VREG_S( 6, 6), (UINT16)VREG_S( 6, 7)); break;
6139      case CPUINFO_STR_REGISTER + RSP_V7:             sprintf(info->s, "V7: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)VREG_S( 7, 0), (UINT16)VREG_S( 7, 1), (UINT16)VREG_S( 7, 2), (UINT16)VREG_S( 7, 3), (UINT16)VREG_S( 7, 4), (UINT16)VREG_S( 7, 5), (UINT16)VREG_S( 7, 6), (UINT16)VREG_S( 7, 7)); break;
6140      case CPUINFO_STR_REGISTER + RSP_V8:             sprintf(info->s, "V8: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)VREG_S( 8, 0), (UINT16)VREG_S( 8, 1), (UINT16)VREG_S( 8, 2), (UINT16)VREG_S( 8, 3), (UINT16)VREG_S( 8, 4), (UINT16)VREG_S( 8, 5), (UINT16)VREG_S( 8, 6), (UINT16)VREG_S( 8, 7)); break;
6141      case CPUINFO_STR_REGISTER + RSP_V9:             sprintf(info->s, "V9: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X",  (UINT16)VREG_S( 9, 0), (UINT16)VREG_S( 9, 1), (UINT16)VREG_S( 9, 2), (UINT16)VREG_S( 9, 3), (UINT16)VREG_S( 9, 4), (UINT16)VREG_S( 9, 5), (UINT16)VREG_S( 9, 6), (UINT16)VREG_S( 9, 7)); break;
6142      case CPUINFO_STR_REGISTER + RSP_V10:            sprintf(info->s, "V10: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(10, 0), (UINT16)VREG_S(10, 1), (UINT16)VREG_S(10, 2), (UINT16)VREG_S(10, 3), (UINT16)VREG_S(10, 4), (UINT16)VREG_S(10, 5), (UINT16)VREG_S(10, 6), (UINT16)VREG_S(10, 7)); break;
6143      case CPUINFO_STR_REGISTER + RSP_V11:            sprintf(info->s, "V11: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(11, 0), (UINT16)VREG_S(11, 1), (UINT16)VREG_S(11, 2), (UINT16)VREG_S(11, 3), (UINT16)VREG_S(11, 4), (UINT16)VREG_S(11, 5), (UINT16)VREG_S(11, 6), (UINT16)VREG_S(11, 7)); break;
6144      case CPUINFO_STR_REGISTER + RSP_V12:            sprintf(info->s, "V12: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(12, 0), (UINT16)VREG_S(12, 1), (UINT16)VREG_S(12, 2), (UINT16)VREG_S(12, 3), (UINT16)VREG_S(12, 4), (UINT16)VREG_S(12, 5), (UINT16)VREG_S(12, 6), (UINT16)VREG_S(12, 7)); break;
6145      case CPUINFO_STR_REGISTER + RSP_V13:            sprintf(info->s, "V13: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(13, 0), (UINT16)VREG_S(13, 1), (UINT16)VREG_S(13, 2), (UINT16)VREG_S(13, 3), (UINT16)VREG_S(13, 4), (UINT16)VREG_S(13, 5), (UINT16)VREG_S(13, 6), (UINT16)VREG_S(13, 7)); break;
6146      case CPUINFO_STR_REGISTER + RSP_V14:            sprintf(info->s, "V14: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(14, 0), (UINT16)VREG_S(14, 1), (UINT16)VREG_S(14, 2), (UINT16)VREG_S(14, 3), (UINT16)VREG_S(14, 4), (UINT16)VREG_S(14, 5), (UINT16)VREG_S(14, 6), (UINT16)VREG_S(14, 7)); break;
6147      case CPUINFO_STR_REGISTER + RSP_V15:            sprintf(info->s, "V15: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(15, 0), (UINT16)VREG_S(15, 1), (UINT16)VREG_S(15, 2), (UINT16)VREG_S(15, 3), (UINT16)VREG_S(15, 4), (UINT16)VREG_S(15, 5), (UINT16)VREG_S(15, 6), (UINT16)VREG_S(15, 7)); break;
6148      case CPUINFO_STR_REGISTER + RSP_V16:            sprintf(info->s, "V16: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(16, 0), (UINT16)VREG_S(16, 1), (UINT16)VREG_S(16, 2), (UINT16)VREG_S(16, 3), (UINT16)VREG_S(16, 4), (UINT16)VREG_S(16, 5), (UINT16)VREG_S(16, 6), (UINT16)VREG_S(16, 7)); break;
6149      case CPUINFO_STR_REGISTER + RSP_V17:            sprintf(info->s, "V17: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(17, 0), (UINT16)VREG_S(17, 1), (UINT16)VREG_S(17, 2), (UINT16)VREG_S(17, 3), (UINT16)VREG_S(17, 4), (UINT16)VREG_S(17, 5), (UINT16)VREG_S(17, 6), (UINT16)VREG_S(17, 7)); break;
6150      case CPUINFO_STR_REGISTER + RSP_V18:            sprintf(info->s, "V18: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(18, 0), (UINT16)VREG_S(18, 1), (UINT16)VREG_S(18, 2), (UINT16)VREG_S(18, 3), (UINT16)VREG_S(18, 4), (UINT16)VREG_S(18, 5), (UINT16)VREG_S(18, 6), (UINT16)VREG_S(18, 7)); break;
6151      case CPUINFO_STR_REGISTER + RSP_V19:            sprintf(info->s, "V19: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(19, 0), (UINT16)VREG_S(19, 1), (UINT16)VREG_S(19, 2), (UINT16)VREG_S(19, 3), (UINT16)VREG_S(19, 4), (UINT16)VREG_S(19, 5), (UINT16)VREG_S(19, 6), (UINT16)VREG_S(19, 7)); break;
6152      case CPUINFO_STR_REGISTER + RSP_V20:            sprintf(info->s, "V20: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(20, 0), (UINT16)VREG_S(20, 1), (UINT16)VREG_S(20, 2), (UINT16)VREG_S(20, 3), (UINT16)VREG_S(20, 4), (UINT16)VREG_S(20, 5), (UINT16)VREG_S(20, 6), (UINT16)VREG_S(20, 7)); break;
6153      case CPUINFO_STR_REGISTER + RSP_V21:            sprintf(info->s, "V21: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(21, 0), (UINT16)VREG_S(21, 1), (UINT16)VREG_S(21, 2), (UINT16)VREG_S(21, 3), (UINT16)VREG_S(21, 4), (UINT16)VREG_S(21, 5), (UINT16)VREG_S(21, 6), (UINT16)VREG_S(21, 7)); break;
6154      case CPUINFO_STR_REGISTER + RSP_V22:            sprintf(info->s, "V22: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(22, 0), (UINT16)VREG_S(22, 1), (UINT16)VREG_S(22, 2), (UINT16)VREG_S(22, 3), (UINT16)VREG_S(22, 4), (UINT16)VREG_S(22, 5), (UINT16)VREG_S(22, 6), (UINT16)VREG_S(22, 7)); break;
6155      case CPUINFO_STR_REGISTER + RSP_V23:            sprintf(info->s, "V23: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(23, 0), (UINT16)VREG_S(23, 1), (UINT16)VREG_S(23, 2), (UINT16)VREG_S(23, 3), (UINT16)VREG_S(23, 4), (UINT16)VREG_S(23, 5), (UINT16)VREG_S(23, 6), (UINT16)VREG_S(23, 7)); break;
6156      case CPUINFO_STR_REGISTER + RSP_V24:            sprintf(info->s, "V24: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(24, 0), (UINT16)VREG_S(24, 1), (UINT16)VREG_S(24, 2), (UINT16)VREG_S(24, 3), (UINT16)VREG_S(24, 4), (UINT16)VREG_S(24, 5), (UINT16)VREG_S(24, 6), (UINT16)VREG_S(24, 7)); break;
6157      case CPUINFO_STR_REGISTER + RSP_V25:            sprintf(info->s, "V25: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(25, 0), (UINT16)VREG_S(25, 1), (UINT16)VREG_S(25, 2), (UINT16)VREG_S(25, 3), (UINT16)VREG_S(25, 4), (UINT16)VREG_S(25, 5), (UINT16)VREG_S(25, 6), (UINT16)VREG_S(25, 7)); break;
6158      case CPUINFO_STR_REGISTER + RSP_V26:            sprintf(info->s, "V26: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(26, 0), (UINT16)VREG_S(26, 1), (UINT16)VREG_S(26, 2), (UINT16)VREG_S(26, 3), (UINT16)VREG_S(26, 4), (UINT16)VREG_S(26, 5), (UINT16)VREG_S(26, 6), (UINT16)VREG_S(26, 7)); break;
6159      case CPUINFO_STR_REGISTER + RSP_V27:            sprintf(info->s, "V27: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(27, 0), (UINT16)VREG_S(27, 1), (UINT16)VREG_S(27, 2), (UINT16)VREG_S(27, 3), (UINT16)VREG_S(27, 4), (UINT16)VREG_S(27, 5), (UINT16)VREG_S(27, 6), (UINT16)VREG_S(27, 7)); break;
6160      case CPUINFO_STR_REGISTER + RSP_V28:            sprintf(info->s, "V28: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(28, 0), (UINT16)VREG_S(28, 1), (UINT16)VREG_S(28, 2), (UINT16)VREG_S(28, 3), (UINT16)VREG_S(28, 4), (UINT16)VREG_S(28, 5), (UINT16)VREG_S(28, 6), (UINT16)VREG_S(28, 7)); break;
6161      case CPUINFO_STR_REGISTER + RSP_V29:            sprintf(info->s, "V29: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(29, 0), (UINT16)VREG_S(29, 1), (UINT16)VREG_S(29, 2), (UINT16)VREG_S(29, 3), (UINT16)VREG_S(29, 4), (UINT16)VREG_S(29, 5), (UINT16)VREG_S(29, 6), (UINT16)VREG_S(29, 7)); break;
6162      case CPUINFO_STR_REGISTER + RSP_V30:            sprintf(info->s, "V30: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(30, 0), (UINT16)VREG_S(30, 1), (UINT16)VREG_S(30, 2), (UINT16)VREG_S(30, 3), (UINT16)VREG_S(30, 4), (UINT16)VREG_S(30, 5), (UINT16)VREG_S(30, 6), (UINT16)VREG_S(30, 7)); break;
6163      case CPUINFO_STR_REGISTER + RSP_V31:            sprintf(info->s, "V31: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(31, 0), (UINT16)VREG_S(31, 1), (UINT16)VREG_S(31, 2), (UINT16)VREG_S(31, 3), (UINT16)VREG_S(31, 4), (UINT16)VREG_S(31, 5), (UINT16)VREG_S(31, 6), (UINT16)VREG_S(31, 7)); break;
6164#endif
50166165      case CPUINFO_STR_REGISTER + RSP_SR:             sprintf(info->s, "SR: %08X",  rsp->sr);    break;
50176166      case CPUINFO_STR_REGISTER + RSP_NEXTPC:         sprintf(info->s, "NPC: %08X", rsp->nextpc);break;
50186167      case CPUINFO_STR_REGISTER + RSP_STEPCNT:        sprintf(info->s, "STEP: %d",  rsp->step_count);  break;
trunk/src/emu/cpu/rsp/rsp.h
r24005r24006
6666   RSP_SR,
6767   RSP_NEXTPC,
6868   RSP_STEPCNT,
69   RSP_V0,  RSP_V1,  RSP_V2,  RSP_V3,  RSP_V4,  RSP_V5,  RSP_V6,  RSP_V7,
70   RSP_V8,  RSP_V9,  RSP_V10, RSP_V11, RSP_V12, RSP_V13, RSP_V14, RSP_V15,
71   RSP_V16, RSP_V17, RSP_V18, RSP_V19, RSP_V20, RSP_V21, RSP_V22, RSP_V23,
72   RSP_V24, RSP_V25, RSP_V26, RSP_V27, RSP_V28, RSP_V29, RSP_V30, RSP_V31
6973};
7074
7175
r24005r24006
175179   UINT32 step_count;
176180
177181   ACCUMULATOR_REG accum[8];
178   INT32 square_root_res;
179   INT32 square_root_high;
182#if USE_SIMD
183   __m128i accum_h;
184   __m128i accum_m;
185   __m128i accum_l;
186#endif
180187   INT32 reciprocal_res;
181   INT32 reciprocal_high;
188   UINT32 reciprocal_high;
182189   INT32 dp_allowed;
183190
184191   UINT32 ppc;

Previous 199869 Revisions Next


© 1997-2024 The MAME Team