trunk/src/emu/cpu/rsp/rspdrc.c
| r24005 | r24006 | |
| 208 | 208 | #define VS2REG ((op >> 16) & 0x1f) |
| 209 | 209 | #define EL ((op >> 21) & 0xf) |
| 210 | 210 | |
| 211 | | #define VREG_B(reg, offset) rsp->v[(reg)].b[(offset)^1] |
| 212 | | #define W_VREG_S(reg, offset) rsp->v[(reg)].s[(offset)] |
| 213 | | #define VREG_S(reg, offset) (INT16)rsp->v[(reg)].s[(offset)] |
| 211 | #define SIMD_EXTRACT16(reg, value, element) \ |
| 212 | if (element < 0) printf("extract element <0 %d\n", element); \ |
| 213 | switch((element) & 7) \ |
| 214 | { \ |
| 215 | case 0: value = _mm_extract_epi16(reg, 0); break; \ |
| 216 | case 1: value = _mm_extract_epi16(reg, 1); break; \ |
| 217 | case 2: value = _mm_extract_epi16(reg, 2); break; \ |
| 218 | case 3: value = _mm_extract_epi16(reg, 3); break; \ |
| 219 | case 4: value = _mm_extract_epi16(reg, 4); break; \ |
| 220 | case 5: value = _mm_extract_epi16(reg, 5); break; \ |
| 221 | case 6: value = _mm_extract_epi16(reg, 6); break; \ |
| 222 | case 7: value = _mm_extract_epi16(reg, 7); break; \ |
| 223 | } |
| 214 | 224 | |
| 225 | |
| 226 | #define SIMD_INSERT16(reg, value, element) \ |
| 227 | if (element < 0) printf("insert element <0 %d\n", element); \ |
| 228 | switch((element) & 7) \ |
| 229 | { \ |
| 230 | case 0: reg = _mm_insert_epi16(reg, value, 0); break; \ |
| 231 | case 1: reg = _mm_insert_epi16(reg, value, 1); break; \ |
| 232 | case 2: reg = _mm_insert_epi16(reg, value, 2); break; \ |
| 233 | case 3: reg = _mm_insert_epi16(reg, value, 3); break; \ |
| 234 | case 4: reg = _mm_insert_epi16(reg, value, 4); break; \ |
| 235 | case 5: reg = _mm_insert_epi16(reg, value, 5); break; \ |
| 236 | case 6: reg = _mm_insert_epi16(reg, value, 6); break; \ |
| 237 | case 7: reg = _mm_insert_epi16(reg, value, 7); break; \ |
| 238 | } |
| 239 | |
| 240 | |
| 241 | #define VREG_B(reg, offset) rsp->v[(reg)].b[(offset)^1] |
| 242 | #define W_VREG_S(reg, offset) rsp->v[(reg)].s[(offset)] |
| 243 | #define VREG_S(reg, offset) (INT16)rsp->v[(reg)].s[(offset)] |
| 244 | |
| 215 | 245 | #define VEC_EL_2(x,z) (vector_elements_2[(x)][(z)]) |
| 216 | 246 | |
| 217 | 247 | #define ACCUM(x) rsp->accum[x].q |
| r24005 | r24006 | |
| 224 | 254 | #define SET_CARRY_FLAG(x) { rsp->flag[0] |= (1 << (x)); } |
| 225 | 255 | #define CLEAR_CARRY_FLAG(x) { rsp->flag[0] &= ~(1 << (x)); } |
| 226 | 256 | |
| 227 | | #define COMPARE_FLAG(x) ((rsp->flag[1] & (1 << (x))) ? 1 : 0) |
| 257 | #define COMPARE_FLAG(x) ((rsp->flag[1] >> (x)) & 1) |
| 228 | 258 | #define CLEAR_COMPARE_FLAGS() { rsp->flag[1] &= ~0xff; } |
| 229 | 259 | #define SET_COMPARE_FLAG(x) { rsp->flag[1] |= (1 << (x)); } |
| 230 | 260 | #define CLEAR_COMPARE_FLAG(x) { rsp->flag[1] &= ~(1 << (x)); } |
| r24005 | r24006 | |
| 537 | 567 | /*****************************************************************************/ |
| 538 | 568 | |
| 539 | 569 | /* Legacy. Going forward, this will be transitioned into unrolled opcode decodes. */ |
| 540 | | static const int vector_elements_1[16][8] = |
| 541 | | { |
| 542 | | { 0, 1, 2, 3, 4, 5, 6, 7 }, // none |
| 543 | | { 0, 1, 2, 3, 4, 5, 6 ,7 }, // ??? |
| 544 | | { 1, 3, 5, 7, 0, 2, 4, 6 }, // 0q |
| 545 | | { 0, 2, 4, 6, 1, 3, 5, 7 }, // 1q |
| 546 | | { 1, 2, 3, 5, 6, 7, 0, 4 }, // 0h |
| 547 | | { 0, 2, 3, 4, 6, 7, 1, 5 }, // 1h |
| 548 | | { 0, 1, 3, 4, 5, 7, 2, 6 }, // 2h |
| 549 | | { 0, 1, 2, 4, 5, 6, 3, 7 }, // 3h |
| 550 | | { 1, 2, 3, 4, 5, 6, 7, 0 }, // 0 |
| 551 | | { 0, 2, 3, 4, 5, 6, 7, 1 }, // 1 |
| 552 | | { 0, 1, 3, 4, 5, 6, 7, 2 }, // 2 |
| 553 | | { 0, 1, 2, 4, 5, 6, 7, 3 }, // 3 |
| 554 | | { 0, 1, 2, 3, 5, 6, 7, 4 }, // 4 |
| 555 | | { 0, 1, 2, 3, 4, 6, 7, 5 }, // 5 |
| 556 | | { 0, 1, 2, 3, 4, 5, 7, 6 }, // 6 |
| 557 | | { 0, 1, 2, 3, 4, 5, 6, 7 }, // 7 |
| 558 | | }; |
| 559 | | |
| 560 | | /* Legacy. Going forward, this will be transitioned into unrolled opcode decodes. */ |
| 561 | 570 | static const int vector_elements_2[16][8] = |
| 562 | 571 | { |
| 563 | 572 | { 0, 1, 2, 3, 4, 5, 6, 7 }, // none |
| r24005 | r24006 | |
| 578 | 587 | { 7, 7, 7, 7, 7, 7, 7, 7 }, // 7 |
| 579 | 588 | }; |
| 580 | 589 | |
| 590 | #if USE_SIMD |
| 591 | static __m128i vec_himask; |
| 592 | static __m128i vec_lomask; |
| 593 | static __m128i vec_overmask; |
| 594 | static __m128i vec_zerobits; |
| 595 | static __m128i vec_flagmask; |
| 596 | static __m128i vec_shiftmask2; |
| 597 | static __m128i vec_shiftmask4; |
| 598 | static __m128i vec_zero; |
| 599 | static __m128i vec_neg1; |
| 600 | static __m128i vec_shuf[16]; |
| 601 | static __m128i vec_shuf_inverse[16]; |
| 602 | #endif |
| 603 | |
| 581 | 604 | static void rspcom_init(rsp_state *rsp, legacy_cpu_device *device, device_irq_acknowledge_callback irqcallback) |
| 582 | 605 | { |
| 583 | 606 | int regIdx = 0; |
| r24005 | r24006 | |
| 610 | 633 | rsp->flag[1] = 0; |
| 611 | 634 | rsp->flag[2] = 0; |
| 612 | 635 | rsp->flag[3] = 0; |
| 613 | | rsp->square_root_res = 0; |
| 614 | | rsp->square_root_high = 0; |
| 615 | 636 | rsp->reciprocal_res = 0; |
| 616 | 637 | rsp->reciprocal_high = 0; |
| 617 | 638 | #endif |
| r24005 | r24006 | |
| 624 | 645 | |
| 625 | 646 | rsp->sr = RSP_STATUS_HALT; |
| 626 | 647 | rsp->step_count = 0; |
| 648 | |
| 649 | #if USE_SIMD |
| 650 | vec_shuf_inverse[ 0] = _mm_set_epi16(0x0f0e, 0x0d0c, 0x0b0a, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100); // none |
| 651 | vec_shuf_inverse[ 1] = _mm_set_epi16(0x0f0e, 0x0d0c, 0x0b0a, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100); // ??? |
| 652 | vec_shuf_inverse[ 2] = _mm_set_epi16(0x0d0c, 0x0d0c, 0x0908, 0x0908, 0x0504, 0x0504, 0x0100, 0x0100); // 0q |
| 653 | vec_shuf_inverse[ 3] = _mm_set_epi16(0x0f0e, 0x0f0e, 0x0b0a, 0x0b0a, 0x0706, 0x0706, 0x0302, 0x0302); // 1q |
| 654 | vec_shuf_inverse[ 4] = _mm_set_epi16(0x0908, 0x0908, 0x0908, 0x0908, 0x0100, 0x0100, 0x0100, 0x0100); // 0h |
| 655 | vec_shuf_inverse[ 5] = _mm_set_epi16(0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0302, 0x0302, 0x0302, 0x0302); // 1h |
| 656 | vec_shuf_inverse[ 6] = _mm_set_epi16(0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0504, 0x0504, 0x0504, 0x0504); // 2h |
| 657 | vec_shuf_inverse[ 7] = _mm_set_epi16(0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0706, 0x0706, 0x0706, 0x0706); // 3h |
| 658 | vec_shuf_inverse[ 8] = _mm_set_epi16(0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100); // 0 |
| 659 | vec_shuf_inverse[ 9] = _mm_set_epi16(0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302); // 1 |
| 660 | vec_shuf_inverse[10] = _mm_set_epi16(0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504); // 2 |
| 661 | vec_shuf_inverse[11] = _mm_set_epi16(0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706); // 3 |
| 662 | vec_shuf_inverse[12] = _mm_set_epi16(0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908); // 4 |
| 663 | vec_shuf_inverse[13] = _mm_set_epi16(0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a); // 5 |
| 664 | vec_shuf_inverse[14] = _mm_set_epi16(0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c); // 6 |
| 665 | vec_shuf_inverse[15] = _mm_set_epi16(0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e); // 7 |
| 666 | |
| 667 | vec_shuf[ 0] = _mm_set_epi16(0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0b0a, 0x0d0c, 0x0f0e); // none |
| 668 | vec_shuf[ 1] = _mm_set_epi16(0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0b0a, 0x0d0c, 0x0f0e); // ??? |
| 669 | vec_shuf[ 2] = _mm_set_epi16(0x0302, 0x0302, 0x0706, 0x0706, 0x0b0a, 0x0b0a, 0x0f0e, 0x0f0e); // 0q |
| 670 | vec_shuf[ 3] = _mm_set_epi16(0x0100, 0x0100, 0x0504, 0x0706, 0x0908, 0x0908, 0x0d0c, 0x0d0c); // 1q |
| 671 | vec_shuf[ 4] = _mm_set_epi16(0x0706, 0x0706, 0x0706, 0x0706, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e); // 0h |
| 672 | vec_shuf[ 5] = _mm_set_epi16(0x0504, 0x0504, 0x0504, 0x0504, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c); // 1h |
| 673 | vec_shuf[ 6] = _mm_set_epi16(0x0302, 0x0302, 0x0302, 0x0302, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a); // 2h |
| 674 | vec_shuf[ 7] = _mm_set_epi16(0x0100, 0x0100, 0x0100, 0x0100, 0x0908, 0x0908, 0x0908, 0x0908); // 3h |
| 675 | vec_shuf[ 8] = _mm_set_epi16(0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e); // 0 |
| 676 | vec_shuf[ 9] = _mm_set_epi16(0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c); // 1 |
| 677 | vec_shuf[10] = _mm_set_epi16(0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a); // 2 |
| 678 | vec_shuf[11] = _mm_set_epi16(0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908); // 3 |
| 679 | vec_shuf[12] = _mm_set_epi16(0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706); // 4 |
| 680 | vec_shuf[13] = _mm_set_epi16(0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504); // 5 |
| 681 | vec_shuf[14] = _mm_set_epi16(0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302); // 6 |
| 682 | vec_shuf[15] = _mm_set_epi16(0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100); // 7 |
| 683 | rsp->accum_h = _mm_setzero_si128(); |
| 684 | rsp->accum_m = _mm_setzero_si128(); |
| 685 | rsp->accum_l = _mm_setzero_si128(); |
| 686 | vec_zero = _mm_setzero_si128(); |
| 687 | vec_neg1 = _mm_set_epi64x(0xffffffffffffffffL, 0xffffffffffffffffL); |
| 688 | vec_himask = _mm_set_epi64x(0xffff0000ffff0000L, 0xffff0000ffff0000L); |
| 689 | vec_lomask = _mm_set_epi64x(0x0000ffff0000ffffL, 0x0000ffff0000ffffL); |
| 690 | vec_overmask = _mm_set_epi64x(0x0001000000010000L, 0x0001000000010000L); |
| 691 | vec_zerobits = _mm_set_epi64x(0x0000000100000001L, 0x0000000100000001L); |
| 692 | vec_flagmask = _mm_set_epi64x(0x0001000100010001L, 0x0001000100010001L); |
| 693 | vec_shiftmask2 = _mm_set_epi64x(0x0000000300000003L, 0x0000000300000003L); |
| 694 | vec_shiftmask4 = _mm_set_epi64x(0x000000000000000fL, 0x000000000000000fL); |
| 695 | #endif |
| 627 | 696 | } |
| 628 | 697 | |
| 629 | 698 | static CPU_INIT( rsp ) |
| r24005 | r24006 | |
| 752 | 821 | // Load 1 byte to vector byte index |
| 753 | 822 | |
| 754 | 823 | ea = (base) ? rsp->r[base] + offset : offset; |
| 755 | | VREG_B(dest, index) = READ8(rsp, ea); |
| 756 | 824 | |
| 757 | | // SSE |
| 758 | 825 | #if USE_SIMD |
| 759 | | // Better solutions for this situation welcome. Need to be able to insert a byte at an arbitrary |
| 760 | | // byte index in the __m128. Current method amounts to: |
| 761 | | // final_vec = (in_vec &~ discard_mask) | insert_value |
| 762 | | // Naturally, SSE4.1 adds the highly-useful PINSRB opcode. As the name implies, it's an |
| 763 | | // arbitrary byte-insert-into-m128, but do we want to require SSE4.1? Maybe just have an ifdef |
| 764 | | // and use the more optimal one if available. |
| 765 | | const __m128i neg1 = _mm_set_epi16(0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff); |
| 766 | | |
| 767 | | __m128i insert_vec = _mm_setzero_si128(); |
| 768 | | INT16 insert_value = READ8(rsp, ea) << ((1 - (index & 1)) << 2); |
| 769 | | _mm_insert_epi16 (insert_vec, insert_value, index >> 1); |
| 770 | | |
| 771 | | __m128i discard_mask = _mm_setzero_si128(); |
| 772 | | INT16 discard_element = 0x00ff << ((1 - (index & 1)) << 2); |
| 773 | | _mm_insert_epi16 (discard_mask, discard_element, index >> 1); |
| 774 | | _mm_xor_si128 (discard_mask, neg1); |
| 775 | | _mm_and_si128 (rsp->xv[dest], discard_mask); |
| 776 | | _mm_or_si128 (rsp->xv[dest], insert_vec); |
| 826 | UINT16 element; |
| 827 | SIMD_EXTRACT16(rsp->xv[dest], element, (index >> 1)); |
| 828 | element &= 0xff00 >> ((1-(index & 1)) * 8); |
| 829 | element |= READ8(rsp, ea) << ((1-(index & 1)) * 8); |
| 830 | SIMD_INSERT16(rsp->xv[dest], element, (index >> 1)); |
| 831 | #else |
| 832 | VREG_B(dest, index) = READ8(rsp, ea); |
| 777 | 833 | #endif |
| 778 | 834 | } |
| 779 | 835 | |
| r24005 | r24006 | |
| 781 | 837 | { |
| 782 | 838 | rsp_state *rsp = (rsp_state*)param; |
| 783 | 839 | UINT32 op = rsp->impstate->arg0; |
| 784 | | UINT32 ea = 0; |
| 785 | 840 | int dest = (op >> 16) & 0x1f; |
| 786 | 841 | int base = (op >> 21) & 0x1f; |
| 787 | 842 | int index = (op >> 7) & 0xe; |
| r24005 | r24006 | |
| 797 | 852 | // |
| 798 | 853 | // Loads 2 bytes starting from vector byte index |
| 799 | 854 | |
| 800 | | ea = (base) ? rsp->r[base] + (offset * 2) : (offset * 2); |
| 801 | | |
| 855 | UINT32 ea = (base) ? rsp->r[base] + (offset * 2) : (offset * 2); |
| 802 | 856 | int end = index + 2; |
| 803 | | |
| 804 | 857 | for (int i = index; i < end; i++) |
| 805 | 858 | { |
| 859 | #if USE_SIMD |
| 860 | UINT16 element; |
| 861 | SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1)); |
| 862 | element &= 0xff00 >> ((1 - (i & 1)) * 8); |
| 863 | element |= READ8(rsp, ea) << ((1 - (i & 1)) * 8); |
| 864 | SIMD_INSERT16(rsp->xv[dest], element, (i >> 1)); |
| 865 | #else |
| 806 | 866 | VREG_B(dest, i) = READ8(rsp, ea); |
| 867 | #endif |
| 807 | 868 | ea++; |
| 808 | 869 | } |
| 809 | | |
| 810 | | // SSE |
| 811 | | #if USE_SIMD |
| 812 | | INT16 insert_value = READ8(rsp, ea) << 8 | READ8(rsp, ea + 1); |
| 813 | | _mm_insert_epi16 (rsp->xv[dest], insert_value, index >> 1); |
| 814 | | #endif |
| 815 | 870 | } |
| 816 | 871 | |
| 817 | 872 | static void cfunc_rsp_llv(void *param) |
| r24005 | r24006 | |
| 840 | 895 | |
| 841 | 896 | for (int i = index; i < end; i++) |
| 842 | 897 | { |
| 898 | #if USE_SIMD |
| 899 | UINT16 element; |
| 900 | SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1)); |
| 901 | element &= 0xff00 >> ((1 - (i & 1)) * 8); |
| 902 | element |= READ8(rsp, ea) << ((1 - (i & 1)) * 8); |
| 903 | SIMD_INSERT16(rsp->xv[dest], element, (i >> 1)); |
| 904 | #else |
| 843 | 905 | VREG_B(dest, i) = READ8(rsp, ea); |
| 906 | #endif |
| 844 | 907 | ea++; |
| 845 | 908 | } |
| 846 | | |
| 847 | | // SSE |
| 848 | | #if USE_SIMD |
| 849 | | INT16 insert_value0 = READ8(rsp, ea) << 8 | READ8(rsp, ea + 1); |
| 850 | | INT16 insert_value1 = READ8(rsp, ea + 2) << 8 | READ8(rsp, ea + 3); |
| 851 | | _mm_insert_epi16 (rsp->xv[dest], insert_value0, (index >> 1)); |
| 852 | | _mm_insert_epi16 (rsp->xv[dest], insert_value1, (index >> 1) + 1); |
| 853 | | #endif |
| 854 | 909 | } |
| 855 | 910 | |
| 856 | 911 | static void cfunc_rsp_ldv(void *param) |
| r24005 | r24006 | |
| 879 | 934 | |
| 880 | 935 | for (int i = index; i < end; i++) |
| 881 | 936 | { |
| 937 | #if USE_SIMD |
| 938 | UINT16 element; |
| 939 | SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1)); |
| 940 | element &= 0xff00 >> ((1 - (i & 1)) * 8); |
| 941 | element |= READ8(rsp, ea) << ((1 - (i & 1)) * 8); |
| 942 | SIMD_INSERT16(rsp->xv[dest], element, (i >> 1)); |
| 943 | #else |
| 882 | 944 | VREG_B(dest, i) = READ8(rsp, ea); |
| 945 | #endif |
| 883 | 946 | ea++; |
| 884 | 947 | } |
| 885 | | |
| 886 | | #if USE_SIMD |
| 887 | | INT16 insert_value0 = READ8(rsp, ea) << 8 | READ8(rsp, ea + 1); |
| 888 | | INT16 insert_value1 = READ8(rsp, ea + 2) << 8 | READ8(rsp, ea + 3); |
| 889 | | INT16 insert_value2 = READ8(rsp, ea + 4) << 8 | READ8(rsp, ea + 5); |
| 890 | | INT16 insert_value3 = READ8(rsp, ea + 6) << 8 | READ8(rsp, ea + 7); |
| 891 | | _mm_insert_epi16 (rsp->xv[dest], insert_value0, (index >> 1)); |
| 892 | | _mm_insert_epi16 (rsp->xv[dest], insert_value1, (index >> 1) + 1); |
| 893 | | _mm_insert_epi16 (rsp->xv[dest], insert_value2, (index >> 1) + 2); |
| 894 | | _mm_insert_epi16 (rsp->xv[dest], insert_value3, (index >> 1) + 3); |
| 895 | | #endif |
| 896 | 948 | } |
| 897 | 949 | |
| 898 | 950 | static void cfunc_rsp_lqv(void *param) |
| 899 | 951 | { |
| 900 | 952 | rsp_state *rsp = (rsp_state*)param; |
| 901 | 953 | UINT32 op = rsp->impstate->arg0; |
| 902 | | int i = 0; |
| 903 | | int end = 0; |
| 904 | | UINT32 ea = 0; |
| 905 | 954 | int dest = (op >> 16) & 0x1f; |
| 906 | 955 | int base = (op >> 21) & 0x1f; |
| 907 | | int index = 0; // Just a test, it goes right back the way it was if something breaks //(op >> 7) & 0xf; |
| 956 | //int index = 0; // Just a test, it goes right back the way it was if something breaks //(op >> 7) & 0xf; |
| 908 | 957 | int offset = (op & 0x7f); |
| 909 | 958 | if (offset & 0x40) |
| 910 | 959 | { |
| r24005 | r24006 | |
| 917 | 966 | // |
| 918 | 967 | // Loads up to 16 bytes starting from vector byte index |
| 919 | 968 | |
| 920 | | ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 969 | UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 921 | 970 | |
| 922 | | end = index + (16 - (ea & 0xf)); |
| 971 | int end = 16 - (ea & 0xf); |
| 923 | 972 | if (end > 16) end = 16; |
| 924 | 973 | |
| 925 | | for (i=index; i < end; i++) |
| 974 | for (int i = 0; i < end; i++) |
| 926 | 975 | { |
| 976 | #if USE_SIMD |
| 977 | UINT16 element; |
| 978 | SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1)); |
| 979 | element &= 0xff00 >> ((1 - (i & 1)) * 8); |
| 980 | element |= READ8(rsp, ea) << ((1 - (i & 1)) * 8); |
| 981 | SIMD_INSERT16(rsp->xv[dest], element, (i >> 1)); |
| 982 | #else |
| 927 | 983 | VREG_B(dest, i) = READ8(rsp, ea); |
| 984 | #endif |
| 928 | 985 | ea++; |
| 929 | 986 | } |
| 930 | | |
| 931 | | // SSE |
| 932 | | #if USE_SIMD |
| 933 | | INT16 val0 = READ8(rsp, ea) << 8 | READ8(rsp, ea + 1); |
| 934 | | INT16 val1 = READ8(rsp, ea + 2) << 8 | READ8(rsp, ea + 3); |
| 935 | | INT16 val2 = READ8(rsp, ea + 4) << 8 | READ8(rsp, ea + 5); |
| 936 | | INT16 val3 = READ8(rsp, ea + 6) << 8 | READ8(rsp, ea + 7); |
| 937 | | INT16 val4 = READ8(rsp, ea + 8) << 8 | READ8(rsp, ea + 9); |
| 938 | | INT16 val5 = READ8(rsp, ea + 10) << 8 | READ8(rsp, ea + 11); |
| 939 | | INT16 val6 = READ8(rsp, ea + 12) << 8 | READ8(rsp, ea + 13); |
| 940 | | INT16 val7 = READ8(rsp, ea + 14) << 8 | READ8(rsp, ea + 15); |
| 941 | | |
| 942 | | rsp->xv[dest] = _mm_set_epi16(val0, val1, val2, val3, val4, val5, val6, val7); |
| 943 | | #endif |
| 944 | 987 | } |
| 945 | 988 | |
| 946 | 989 | static void cfunc_rsp_lrv(void *param) |
| 947 | 990 | { |
| 948 | 991 | rsp_state *rsp = (rsp_state*)param; |
| 949 | 992 | UINT32 op = rsp->impstate->arg0; |
| 950 | | int i = 0; |
| 951 | | int end = 0; |
| 952 | | UINT32 ea = 0; |
| 953 | 993 | int dest = (op >> 16) & 0x1f; |
| 954 | 994 | int base = (op >> 21) & 0x1f; |
| 955 | 995 | int index = (op >> 7) & 0xf; |
| r24005 | r24006 | |
| 965 | 1005 | // |
| 966 | 1006 | // Loads up to 16 bytes starting from right side until 16-byte boundary |
| 967 | 1007 | |
| 968 | | ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1008 | UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 969 | 1009 | |
| 970 | 1010 | index = 16 - ((ea & 0xf) - index); |
| 971 | | end = 16; |
| 972 | 1011 | ea &= ~0xf; |
| 973 | 1012 | |
| 974 | | #if USE_SIMD |
| 975 | | INT16 mask[8] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; |
| 976 | | INT16 val[8] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; |
| 977 | | #endif |
| 978 | | for (i=index; i < end; i++) |
| 1013 | for (int i = index; i < 16; i++) |
| 979 | 1014 | { |
| 980 | 1015 | #if USE_SIMD |
| 981 | | mask[i >> 1] |= 0x00ff << ((i & 1) * 8); |
| 982 | | val[i >> 1] |= READ8(rsp, ea) << ((i & 1) * 8); |
| 1016 | UINT16 element; |
| 1017 | SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1)); |
| 1018 | element &= 0xff00 >> ((1-(i & 1)) * 8); |
| 1019 | element |= READ8(rsp, ea) << ((1-(i & 1)) * 8); |
| 1020 | SIMD_INSERT16(rsp->xv[dest], element, (i >> 1)); |
| 1021 | #else |
| 1022 | VREG_B(dest, i) = READ8(rsp, ea); |
| 983 | 1023 | #endif |
| 984 | | VREG_B(dest, i) = READ8(rsp, ea); |
| 985 | 1024 | ea++; |
| 986 | 1025 | } |
| 987 | | |
| 988 | | #if USE_SIMD |
| 989 | | __m128i neg1 = _mm_set_epi16(0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff); |
| 990 | | __m128i keep_mask = _mm_set_epi16(mask[0], mask[1], mask[2], mask[3], mask[4], mask[5], mask[6], mask[7]); |
| 991 | | __m128i load_val = _mm_set_epi16(val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]); |
| 992 | | keep_mask = _mm_xor_si128(keep_mask, neg1); |
| 993 | | rsp->xv[dest] = _mm_and_si128(rsp->xv[dest], keep_mask); |
| 994 | | rsp->xv[dest] = _mm_or_si128(rsp->xv[dest], load_val); |
| 995 | | #endif |
| 996 | 1026 | } |
| 997 | 1027 | |
| 998 | 1028 | static void cfunc_rsp_lpv(void *param) |
| 999 | 1029 | { |
| 1000 | 1030 | rsp_state *rsp = (rsp_state*)param; |
| 1001 | 1031 | UINT32 op = rsp->impstate->arg0; |
| 1002 | | int i = 0; |
| 1003 | | UINT32 ea = 0; |
| 1004 | 1032 | int dest = (op >> 16) & 0x1f; |
| 1005 | 1033 | int base = (op >> 21) & 0x1f; |
| 1006 | 1034 | int index = (op >> 7) & 0xf; |
| r24005 | r24006 | |
| 1016 | 1044 | // |
| 1017 | 1045 | // Loads a byte as the upper 8 bits of each element |
| 1018 | 1046 | |
| 1019 | | ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); |
| 1047 | UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); |
| 1020 | 1048 | |
| 1021 | | #if USE_SIMD |
| 1022 | | INT16 val[8] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; |
| 1023 | | #endif |
| 1024 | | for (i=0; i < 8; i++) |
| 1049 | for (int i = 0; i < 8; i++) |
| 1025 | 1050 | { |
| 1026 | 1051 | #if USE_SIMD |
| 1027 | | val[i] = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 8; |
| 1052 | SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea + (((16-index) + i) & 0xf)) << 8, i); |
| 1053 | #else |
| 1054 | W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 8; |
| 1028 | 1055 | #endif |
| 1029 | | W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 8; |
| 1030 | 1056 | } |
| 1031 | | |
| 1032 | | #if USE_SIMD |
| 1033 | | rsp->xv[dest] = _mm_set_epi16(val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]); |
| 1034 | | #endif |
| 1035 | 1057 | } |
| 1036 | 1058 | |
| 1037 | 1059 | static void cfunc_rsp_luv(void *param) |
| 1038 | 1060 | { |
| 1039 | 1061 | rsp_state *rsp = (rsp_state*)param; |
| 1040 | 1062 | UINT32 op = rsp->impstate->arg0; |
| 1041 | | int i = 0; |
| 1042 | | UINT32 ea = 0; |
| 1043 | 1063 | int dest = (op >> 16) & 0x1f; |
| 1044 | 1064 | int base = (op >> 21) & 0x1f; |
| 1045 | 1065 | int index = (op >> 7) & 0xf; |
| r24005 | r24006 | |
| 1055 | 1075 | // |
| 1056 | 1076 | // Loads a byte as the bits 14-7 of each element |
| 1057 | 1077 | |
| 1058 | | ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); |
| 1078 | UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); |
| 1059 | 1079 | |
| 1060 | | #if USE_SIMD |
| 1061 | | INT16 val[8] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; |
| 1062 | | #endif |
| 1063 | | for (i=0; i < 8; i++) |
| 1080 | for (int i = 0; i < 8; i++) |
| 1064 | 1081 | { |
| 1065 | 1082 | #if USE_SIMD |
| 1066 | | val[i] = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 7; |
| 1083 | SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea + (((16-index) + i) & 0xf)) << 7, i); |
| 1084 | #else |
| 1085 | W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 7; |
| 1067 | 1086 | #endif |
| 1068 | | W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 7; |
| 1069 | 1087 | } |
| 1070 | | |
| 1071 | | #if USE_SIMD |
| 1072 | | rsp->xv[dest] = _mm_set_epi16(val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]); |
| 1073 | | #endif |
| 1074 | 1088 | } |
| 1075 | 1089 | |
| 1076 | 1090 | static void cfunc_rsp_lhv(void *param) |
| 1077 | 1091 | { |
| 1078 | 1092 | rsp_state *rsp = (rsp_state*)param; |
| 1079 | 1093 | UINT32 op = rsp->impstate->arg0; |
| 1080 | | int i = 0; |
| 1081 | | UINT32 ea = 0; |
| 1082 | 1094 | int dest = (op >> 16) & 0x1f; |
| 1083 | 1095 | int base = (op >> 21) & 0x1f; |
| 1084 | 1096 | int index = (op >> 7) & 0xf; |
| r24005 | r24006 | |
| 1094 | 1106 | // |
| 1095 | 1107 | // Loads a byte as the bits 14-7 of each element, with 2-byte stride |
| 1096 | 1108 | |
| 1097 | | ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1109 | UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1098 | 1110 | |
| 1099 | | #if USE_SIMD |
| 1100 | | INT16 val[8] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; |
| 1101 | | #endif |
| 1102 | | for (i=0; i < 8; i++) |
| 1111 | for (int i = 0; i < 8; i++) |
| 1103 | 1112 | { |
| 1104 | 1113 | #if USE_SIMD |
| 1105 | | val[i] = READ8(rsp, ea + (((16-index) + (i<<1)) & 0xf)) << 7; |
| 1114 | SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea + (((16-index) + (i<<1)) & 0xf)) << 7, i); |
| 1115 | #else |
| 1116 | W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + (i<<1)) & 0xf)) << 7; |
| 1106 | 1117 | #endif |
| 1107 | | W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + (i<<1)) & 0xf)) << 7; |
| 1108 | 1118 | } |
| 1109 | | |
| 1110 | | #if USE_SIMD |
| 1111 | | rsp->xv[dest] = _mm_set_epi16(val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]); |
| 1112 | | #endif |
| 1113 | 1119 | } |
| 1114 | 1120 | |
| 1115 | 1121 | static void cfunc_rsp_lfv(void *param) |
| 1116 | 1122 | { |
| 1117 | 1123 | rsp_state *rsp = (rsp_state*)param; |
| 1118 | 1124 | UINT32 op = rsp->impstate->arg0; |
| 1119 | | int i = 0; |
| 1120 | | int end = 0; |
| 1121 | | UINT32 ea = 0; |
| 1122 | 1125 | int dest = (op >> 16) & 0x1f; |
| 1123 | 1126 | int base = (op >> 21) & 0x1f; |
| 1124 | 1127 | int index = (op >> 7) & 0xf; |
| r24005 | r24006 | |
| 1134 | 1137 | // |
| 1135 | 1138 | // Loads a byte as the bits 14-7 of upper or lower quad, with 4-byte stride |
| 1136 | 1139 | |
| 1140 | UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1137 | 1141 | |
| 1138 | | ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1139 | | |
| 1140 | 1142 | // not sure what happens if 16-byte boundary is crossed... |
| 1141 | 1143 | |
| 1142 | | end = (index >> 1) + 4; |
| 1144 | int end = (index >> 1) + 4; |
| 1143 | 1145 | |
| 1144 | | #if USE_SIMD |
| 1145 | | INT16 mask[8] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; |
| 1146 | | INT16 val[8] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; |
| 1147 | | #endif |
| 1148 | | for (i=index >> 1; i < end; i++) |
| 1146 | for (int i = index >> 1; i < end; i++) |
| 1149 | 1147 | { |
| 1150 | 1148 | #if USE_SIMD |
| 1151 | | mask[i] = 0xffff; |
| 1152 | | val[i] = READ8(rsp, ea) << 7; |
| 1149 | SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea) << 7, i); |
| 1150 | #else |
| 1151 | W_VREG_S(dest, i) = READ8(rsp, ea) << 7; |
| 1153 | 1152 | #endif |
| 1154 | | W_VREG_S(dest, i) = READ8(rsp, ea) << 7; |
| 1155 | 1153 | ea += 4; |
| 1156 | 1154 | } |
| 1157 | | |
| 1158 | | #if USE_SIMD |
| 1159 | | __m128i neg1 = _mm_set_epi16(0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff); |
| 1160 | | __m128i keep_mask = _mm_set_epi16(mask[0], mask[1], mask[2], mask[3], mask[4], mask[5], mask[6], mask[7]); |
| 1161 | | __m128i load_val = _mm_set_epi16(val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]); |
| 1162 | | keep_mask = _mm_xor_si128(keep_mask, neg1); |
| 1163 | | rsp->xv[dest] = _mm_and_si128(rsp->xv[dest], keep_mask); |
| 1164 | | rsp->xv[dest] = _mm_or_si128(rsp->xv[dest], load_val); |
| 1165 | | #endif |
| 1166 | 1155 | } |
| 1167 | 1156 | |
| 1168 | 1157 | static void cfunc_rsp_lwv(void *param) |
| 1169 | 1158 | { |
| 1170 | 1159 | rsp_state *rsp = (rsp_state*)param; |
| 1171 | 1160 | UINT32 op = rsp->impstate->arg0; |
| 1172 | | int i = 0; |
| 1173 | | int end = 0; |
| 1174 | | UINT32 ea = 0; |
| 1175 | 1161 | int dest = (op >> 16) & 0x1f; |
| 1176 | 1162 | int base = (op >> 21) & 0x1f; |
| 1177 | 1163 | int index = (op >> 7) & 0xf; |
| r24005 | r24006 | |
| 1188 | 1174 | // Loads the full 128-bit vector starting from vector byte index and wrapping to index 0 |
| 1189 | 1175 | // after byte index 15 |
| 1190 | 1176 | |
| 1191 | | ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1177 | UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1178 | int end = (16 - index) + 16; |
| 1192 | 1179 | |
| 1193 | | end = (16 - index) + 16; |
| 1194 | | |
| 1195 | 1180 | #if USE_SIMD |
| 1196 | | INT16 val[8] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; |
| 1181 | UINT8 val[16]; |
| 1197 | 1182 | #endif |
| 1198 | | for (i=(16 - index); i < end; i++) |
| 1183 | for (int i = (16 - index); i < end; i++) |
| 1199 | 1184 | { |
| 1200 | 1185 | #if USE_SIMD |
| 1201 | | val[i >> 1] |= READ8(rsp, ea) << ((i & 1) * 8); |
| 1186 | val[i & 0xf] = READ8(rsp, ea); |
| 1187 | #else |
| 1188 | VREG_B(dest, i & 0xf) = READ8(rsp, ea); |
| 1202 | 1189 | #endif |
| 1203 | | VREG_B(dest, i & 0xf) = READ8(rsp, ea); |
| 1204 | 1190 | ea += 4; |
| 1205 | 1191 | } |
| 1206 | 1192 | |
| 1207 | 1193 | #if USE_SIMD |
| 1208 | | rsp->xv[dest] = _mm_set_epi16(val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]); |
| 1194 | rsp->xv[dest] = _mm_set_epi8(val[15], val[14], val[13], val[12], val[11], val[10], val[ 9], val[ 8], |
| 1195 | val[ 7], val[ 6], val[ 5], val[ 4], val[ 3], val[ 2], val[ 1], val[ 0]); |
| 1209 | 1196 | #endif |
| 1210 | 1197 | } |
| 1211 | 1198 | |
| r24005 | r24006 | |
| 1213 | 1200 | { |
| 1214 | 1201 | rsp_state *rsp = (rsp_state*)param; |
| 1215 | 1202 | UINT32 op = rsp->impstate->arg0; |
| 1216 | | int i = 0; |
| 1217 | | UINT32 ea = 0; |
| 1218 | 1203 | int dest = (op >> 16) & 0x1f; |
| 1219 | 1204 | int base = (op >> 21) & 0x1f; |
| 1220 | 1205 | int index = (op >> 7) & 0xf; |
| r24005 | r24006 | |
| 1229 | 1214 | |
| 1230 | 1215 | // FIXME: has a small problem with odd indices |
| 1231 | 1216 | |
| 1232 | | int element; |
| 1233 | 1217 | int vs = dest; |
| 1234 | 1218 | int ve = dest + 8; |
| 1235 | 1219 | if (ve > 32) |
| r24005 | r24006 | |
| 1237 | 1221 | ve = 32; |
| 1238 | 1222 | } |
| 1239 | 1223 | |
| 1240 | | element = 7 - (index >> 1); |
| 1224 | int element = 7 - (index >> 1); |
| 1241 | 1225 | |
| 1242 | | ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1226 | UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1243 | 1227 | |
| 1244 | 1228 | ea = ((ea + 8) & ~0xf) + (index & 1); |
| 1245 | | for (i = vs; i < ve; i++) |
| 1229 | for (int i = vs; i < ve; i++) |
| 1246 | 1230 | { |
| 1247 | 1231 | element = ((8 - (index >> 1) + (i - vs)) << 1); |
| 1248 | 1232 | #if USE_SIMD |
| 1249 | | UINT16 value = (READ8(rsp, ea + 1) << 8) | READ8(rsp, ea); |
| 1250 | | _mm_insert_epi16 (rsp->xv[i], value, element); |
| 1251 | | #endif |
| 1233 | UINT16 value = (READ8(rsp, ea) << 8) | READ8(rsp, ea + 1); |
| 1234 | SIMD_INSERT16(rsp->xv[i], value, (element >> 1)); |
| 1235 | #else |
| 1252 | 1236 | VREG_B(i, (element & 0xf)) = READ8(rsp, ea); |
| 1253 | 1237 | VREG_B(i, ((element + 1) & 0xf)) = READ8(rsp, ea + 1); |
| 1238 | #endif |
| 1254 | 1239 | |
| 1255 | 1240 | ea += 2; |
| 1256 | 1241 | } |
| r24005 | r24006 | |
| 1331 | 1316 | { |
| 1332 | 1317 | rsp_state *rsp = (rsp_state*)param; |
| 1333 | 1318 | UINT32 op = rsp->impstate->arg0; |
| 1334 | | UINT32 ea = 0; |
| 1335 | 1319 | int dest = (op >> 16) & 0x1f; |
| 1336 | 1320 | int base = (op >> 21) & 0x1f; |
| 1337 | 1321 | int index = (op >> 7) & 0xf; |
| r24005 | r24006 | |
| 1348 | 1332 | // |
| 1349 | 1333 | // Stores 1 byte from vector byte index |
| 1350 | 1334 | |
| 1351 | | ea = (base) ? rsp->r[base] + offset : offset; |
| 1335 | UINT32 ea = (base) ? rsp->r[base] + offset : offset; |
| 1336 | #if USE_SIMD |
| 1337 | UINT16 value; |
| 1338 | SIMD_EXTRACT16(rsp->xv[dest], value, (index >> 1)); |
| 1339 | value >>= (1-(index & 1)) * 8; |
| 1340 | WRITE8(rsp, ea, (UINT8)value); |
| 1341 | #else |
| 1352 | 1342 | WRITE8(rsp, ea, VREG_B(dest, index)); |
| 1343 | #endif |
| 1353 | 1344 | } |
| 1354 | 1345 | |
| 1355 | 1346 | static void cfunc_rsp_ssv(void *param) |
| 1356 | 1347 | { |
| 1357 | 1348 | rsp_state *rsp = (rsp_state*)param; |
| 1358 | 1349 | UINT32 op = rsp->impstate->arg0; |
| 1359 | | UINT32 ea = 0; |
| 1360 | 1350 | int dest = (op >> 16) & 0x1f; |
| 1361 | 1351 | int base = (op >> 21) & 0x1f; |
| 1362 | 1352 | int index = (op >> 7) & 0xf; |
| r24005 | r24006 | |
| 1373 | 1363 | // |
| 1374 | 1364 | // Stores 2 bytes starting from vector byte index |
| 1375 | 1365 | |
| 1376 | | ea = (base) ? rsp->r[base] + (offset * 2) : (offset * 2); |
| 1366 | UINT32 ea = (base) ? rsp->r[base] + (offset * 2) : (offset * 2); |
| 1377 | 1367 | |
| 1368 | #if USE_SIMD |
| 1369 | UINT16 value; |
| 1370 | SIMD_EXTRACT16(rsp->xv[dest], value, (index >> 1)); |
| 1371 | WRITE8(rsp, ea, (UINT8)(value >> 8)); |
| 1372 | WRITE8(rsp, ea+1, (UINT8)(value & 0x00ff)); |
| 1373 | #else |
| 1378 | 1374 | int end = index + 2; |
| 1379 | | |
| 1380 | 1375 | for (int i = index; i < end; i++) |
| 1381 | 1376 | { |
| 1382 | 1377 | WRITE8(rsp, ea, VREG_B(dest, i)); |
| 1383 | 1378 | ea++; |
| 1384 | 1379 | } |
| 1380 | #endif |
| 1385 | 1381 | } |
| 1386 | 1382 | |
| 1387 | 1383 | static void cfunc_rsp_slv(void *param) |
| 1388 | 1384 | { |
| 1389 | 1385 | rsp_state *rsp = (rsp_state*)param; |
| 1390 | 1386 | UINT32 op = rsp->impstate->arg0; |
| 1391 | | UINT32 ea = 0; |
| 1392 | 1387 | int dest = (op >> 16) & 0x1f; |
| 1393 | 1388 | int base = (op >> 21) & 0x1f; |
| 1394 | 1389 | int index = (op >> 7) & 0xf; |
| r24005 | r24006 | |
| 1404 | 1399 | // |
| 1405 | 1400 | // Stores 4 bytes starting from vector byte index |
| 1406 | 1401 | |
| 1407 | | ea = (base) ? rsp->r[base] + (offset * 4) : (offset * 4); |
| 1402 | UINT32 ea = (base) ? rsp->r[base] + (offset * 4) : (offset * 4); |
| 1408 | 1403 | |
| 1404 | #if USE_SIMD |
| 1405 | UINT16 value0, value1; |
| 1406 | index >>= 1; |
| 1407 | SIMD_EXTRACT16(rsp->xv[dest], value0, index); |
| 1408 | SIMD_EXTRACT16(rsp->xv[dest], value1, index+1); |
| 1409 | WRITE8(rsp, ea, (UINT8)(value0 >> 8)); |
| 1410 | WRITE8(rsp, ea+1, (UINT8)(value0 & 0x00ff)); |
| 1411 | WRITE8(rsp, ea+2, (UINT8)(value1 >> 8)); |
| 1412 | WRITE8(rsp, ea+3, (UINT8)(value1 & 0x00ff)); |
| 1413 | #else |
| 1409 | 1414 | int end = index + 4; |
| 1410 | | |
| 1411 | 1415 | for (int i = index; i < end; i++) |
| 1412 | 1416 | { |
| 1413 | 1417 | WRITE8(rsp, ea, VREG_B(dest, i)); |
| 1414 | 1418 | ea++; |
| 1415 | 1419 | } |
| 1420 | #endif |
| 1416 | 1421 | } |
| 1417 | 1422 | |
| 1418 | 1423 | static void cfunc_rsp_sdv(void *param) |
| 1419 | 1424 | { |
| 1420 | 1425 | rsp_state *rsp = (rsp_state*)param; |
| 1421 | 1426 | UINT32 op = rsp->impstate->arg0; |
| 1422 | | UINT32 ea = 0; |
| 1423 | | int end = 0; |
| 1424 | 1427 | int dest = (op >> 16) & 0x1f; |
| 1425 | 1428 | int base = (op >> 21) & 0x1f; |
| 1426 | | int index = (op >> 7) & 0xf; |
| 1429 | int index = (op >> 7) & 0x8; |
| 1427 | 1430 | int offset = (op & 0x7f); |
| 1428 | 1431 | if (offset & 0x40) |
| 1429 | 1432 | { |
| r24005 | r24006 | |
| 1435 | 1438 | // -------------------------------------------------- |
| 1436 | 1439 | // |
| 1437 | 1440 | // Stores 8 bytes starting from vector byte index |
| 1438 | | ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); |
| 1441 | UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); |
| 1439 | 1442 | |
| 1440 | | end = index + 8; |
| 1441 | | |
| 1443 | #if USE_SIMD |
| 1444 | UINT16 value0, value1, value2, value3; |
| 1445 | index >>= 1; |
| 1446 | SIMD_EXTRACT16(rsp->xv[dest], value0, index); |
| 1447 | SIMD_EXTRACT16(rsp->xv[dest], value1, index+1); |
| 1448 | SIMD_EXTRACT16(rsp->xv[dest], value2, index+2); |
| 1449 | SIMD_EXTRACT16(rsp->xv[dest], value3, index+3); |
| 1450 | WRITE8(rsp, ea, (UINT8)(value0 >> 8)); |
| 1451 | WRITE8(rsp, ea+1, (UINT8)(value0 & 0x00ff)); |
| 1452 | WRITE8(rsp, ea+2, (UINT8)(value1 >> 8)); |
| 1453 | WRITE8(rsp, ea+3, (UINT8)(value1 & 0x00ff)); |
| 1454 | WRITE8(rsp, ea+4, (UINT8)(value2 >> 8)); |
| 1455 | WRITE8(rsp, ea+5, (UINT8)(value2 & 0x00ff)); |
| 1456 | WRITE8(rsp, ea+6, (UINT8)(value3 >> 8)); |
| 1457 | WRITE8(rsp, ea+7, (UINT8)(value3 & 0x00ff)); |
| 1458 | #else |
| 1459 | int end = index + 8; |
| 1442 | 1460 | for (int i = index; i < end; i++) |
| 1443 | 1461 | { |
| 1444 | 1462 | WRITE8(rsp, ea, VREG_B(dest, i)); |
| 1445 | 1463 | ea++; |
| 1446 | 1464 | } |
| 1465 | #endif |
| 1447 | 1466 | } |
| 1448 | 1467 | |
| 1449 | 1468 | static void cfunc_rsp_sqv(void *param) |
| 1450 | 1469 | { |
| 1451 | 1470 | rsp_state *rsp = (rsp_state*)param; |
| 1452 | 1471 | UINT32 op = rsp->impstate->arg0; |
| 1453 | | UINT32 ea = 0; |
| 1454 | | int i = 0; |
| 1455 | | int end = 0; |
| 1456 | 1472 | int dest = (op >> 16) & 0x1f; |
| 1457 | 1473 | int base = (op >> 21) & 0x1f; |
| 1458 | 1474 | int index = (op >> 7) & 0xf; |
| r24005 | r24006 | |
| 1468 | 1484 | // |
| 1469 | 1485 | // Stores up to 16 bytes starting from vector byte index until 16-byte boundary |
| 1470 | 1486 | |
| 1471 | | ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1472 | | |
| 1473 | | end = index + (16 - (ea & 0xf)); |
| 1474 | | |
| 1475 | | for (i=index; i < end; i++) |
| 1487 | UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1488 | int end = index + (16 - (ea & 0xf)); |
| 1489 | for (int i=index; i < end; i++) |
| 1476 | 1490 | { |
| 1491 | #if USE_SIMD |
| 1492 | UINT16 value; |
| 1493 | SIMD_EXTRACT16(rsp->xv[dest], value, (i >> 1)); |
| 1494 | value >>= (1-(i & 1)) * 8; |
| 1495 | WRITE8(rsp, ea, (UINT8)value); |
| 1496 | #else |
| 1477 | 1497 | WRITE8(rsp, ea, VREG_B(dest, i & 0xf)); |
| 1498 | #endif |
| 1478 | 1499 | ea++; |
| 1479 | 1500 | } |
| 1480 | 1501 | } |
| r24005 | r24006 | |
| 1506 | 1527 | |
| 1507 | 1528 | for (int i = index; i < end; i++) |
| 1508 | 1529 | { |
| 1530 | #if USE_SIMD |
| 1531 | UINT32 bi = (i + o) & 0xf; |
| 1532 | UINT16 value; |
| 1533 | SIMD_EXTRACT16(rsp->xv[dest], value, (bi >> 1)); |
| 1534 | value >>= (1-(bi & 1)) * 8; |
| 1535 | WRITE8(rsp, ea, (UINT8)value); |
| 1536 | #else |
| 1509 | 1537 | WRITE8(rsp, ea, VREG_B(dest, ((i + o) & 0xf))); |
| 1538 | #endif |
| 1510 | 1539 | ea++; |
| 1511 | 1540 | } |
| 1512 | 1541 | } |
| r24005 | r24006 | |
| 1515 | 1544 | { |
| 1516 | 1545 | rsp_state *rsp = (rsp_state*)param; |
| 1517 | 1546 | UINT32 op = rsp->impstate->arg0; |
| 1518 | | UINT32 ea = 0; |
| 1519 | | int i = 0; |
| 1520 | | int end = 0; |
| 1521 | 1547 | int dest = (op >> 16) & 0x1f; |
| 1522 | 1548 | int base = (op >> 21) & 0x1f; |
| 1523 | 1549 | int index = (op >> 7) & 0xf; |
| r24005 | r24006 | |
| 1533 | 1559 | // |
| 1534 | 1560 | // Stores upper 8 bits of each element |
| 1535 | 1561 | |
| 1536 | | ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); |
| 1537 | | end = index + 8; |
| 1538 | | |
| 1539 | | for (i=index; i < end; i++) |
| 1562 | UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); |
| 1563 | int end = index + 8; |
| 1564 | for (int i=index; i < end; i++) |
| 1540 | 1565 | { |
| 1541 | 1566 | if ((i & 0xf) < 8) |
| 1542 | 1567 | { |
| 1543 | | WRITE8(rsp, ea, VREG_B(dest, ((i & 0xf) << 1))); |
| 1568 | #if USE_SIMD |
| 1569 | UINT16 value; |
| 1570 | SIMD_EXTRACT16(rsp->xv[dest], value, i); |
| 1571 | WRITE8(rsp, ea, (UINT8)(value >> 8)); |
| 1572 | #else |
| 1573 | WRITE8(rsp, ea, VREG_B(dest, (i & 0xf) << 1)); |
| 1574 | #endif |
| 1544 | 1575 | } |
| 1545 | 1576 | else |
| 1546 | 1577 | { |
| 1578 | #if USE_SIMD |
| 1579 | UINT16 value; |
| 1580 | SIMD_EXTRACT16(rsp->xv[dest], value, i); |
| 1581 | value >>= 7; |
| 1582 | WRITE8(rsp, ea, (UINT8)value); |
| 1583 | #else |
| 1547 | 1584 | WRITE8(rsp, ea, VREG_S(dest, (i & 0x7)) >> 7); |
| 1585 | #endif |
| 1548 | 1586 | } |
| 1549 | 1587 | ea++; |
| 1550 | 1588 | } |
| r24005 | r24006 | |
| 1554 | 1592 | { |
| 1555 | 1593 | rsp_state *rsp = (rsp_state*)param; |
| 1556 | 1594 | UINT32 op = rsp->impstate->arg0; |
| 1557 | | UINT32 ea = 0; |
| 1558 | | int i = 0; |
| 1559 | | int end = 0; |
| 1560 | 1595 | int dest = (op >> 16) & 0x1f; |
| 1561 | 1596 | int base = (op >> 21) & 0x1f; |
| 1562 | 1597 | int index = (op >> 7) & 0xf; |
| r24005 | r24006 | |
| 1572 | 1607 | // |
| 1573 | 1608 | // Stores bits 14-7 of each element |
| 1574 | 1609 | // Byte slots with (i & 0xf) >= 8 store the element's upper 8 bits instead |
| 1575 | | ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); |
| 1576 | | end = index + 8; |
| 1577 | | |
| 1578 | | for (i=index; i < end; i++) |
| 1610 | UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); |
| 1611 | int end = index + 8; |
| 1612 | for (int i=index; i < end; i++) |
| 1579 | 1613 | { |
| 1580 | 1614 | if ((i & 0xf) < 8) |
| 1581 | 1615 | { |
| 1616 | #if USE_SIMD |
| 1617 | UINT16 value; |
| 1618 | SIMD_EXTRACT16(rsp->xv[dest], value, i); |
| 1619 | value >>= 7; |
| 1620 | WRITE8(rsp, ea, (UINT8)value); |
| 1621 | #else |
| 1582 | 1622 | WRITE8(rsp, ea, VREG_S(dest, (i & 0x7)) >> 7); |
| 1623 | #endif |
| 1583 | 1624 | } |
| 1584 | 1625 | else |
| 1585 | 1626 | { |
| 1627 | #if USE_SIMD |
| 1628 | UINT16 value; |
| 1629 | SIMD_EXTRACT16(rsp->xv[dest], value, i); |
| 1630 | WRITE8(rsp, ea, (UINT8)(value >> 8)); /* shift first: a cast binds tighter than >>, so (UINT8)value >> 8 is always 0 */ |
| 1631 | #else |
| 1586 | 1632 | WRITE8(rsp, ea, VREG_B(dest, ((i & 0x7) << 1))); |
| 1633 | #endif |
| 1587 | 1634 | } |
| 1588 | 1635 | ea++; |
| 1589 | 1636 | } |
| r24005 | r24006 | |
| 1593 | 1640 | { |
| 1594 | 1641 | rsp_state *rsp = (rsp_state*)param; |
| 1595 | 1642 | UINT32 op = rsp->impstate->arg0; |
| 1596 | | UINT32 ea = 0; |
| 1597 | | int i = 0; |
| 1598 | 1643 | int dest = (op >> 16) & 0x1f; |
| 1599 | 1644 | int base = (op >> 21) & 0x1f; |
| 1600 | 1645 | int index = (op >> 7) & 0xf; |
| r24005 | r24006 | |
| 1610 | 1655 | // |
| 1611 | 1656 | // Stores bits 14-7 of each element, with 2-byte stride |
| 1612 | 1657 | |
| 1613 | | ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1614 | | |
| 1615 | | for (i=0; i < 8; i++) |
| 1658 | UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1659 | for (int i=0; i < 8; i++) |
| 1616 | 1660 | { |
| 1617 | | UINT8 d = ((VREG_B(dest, ((index + (i << 1) + 0) & 0xf))) << 1) | |
| 1618 | | ((VREG_B(dest, ((index + (i << 1) + 1) & 0xf))) >> 7); |
| 1619 | | |
| 1661 | int element = index + (i << 1); |
| 1662 | #if USE_SIMD |
| 1663 | UINT16 value; |
| 1664 | SIMD_EXTRACT16(rsp->xv[dest], value, element >> 1); |
| 1665 | WRITE8(rsp, ea, (value >> 7) & 0x00ff); |
| 1666 | #else |
| 1667 | UINT8 d = (VREG_B(dest, (element & 0xf)) << 1) | |
| 1668 | (VREG_B(dest, ((element + 1) & 0xf)) >> 7); |
| 1620 | 1669 | WRITE8(rsp, ea, d); |
| 1670 | #endif |
| 1621 | 1671 | ea += 2; |
| 1622 | 1672 | } |
| 1623 | 1673 | } |
| r24005 | r24006 | |
| 1626 | 1676 | { |
| 1627 | 1677 | rsp_state *rsp = (rsp_state*)param; |
| 1628 | 1678 | UINT32 op = rsp->impstate->arg0; |
| 1629 | | UINT32 ea = 0; |
| 1630 | | int i = 0; |
| 1631 | | int end = 0; |
| 1632 | | int eaoffset = 0; |
| 1633 | 1679 | int dest = (op >> 16) & 0x1f; |
| 1634 | 1680 | int base = (op >> 21) & 0x1f; |
| 1635 | 1681 | int index = (op >> 7) & 0xf; |
| r24005 | r24006 | |
| 1647 | 1693 | |
| 1648 | 1694 | if (index & 0x7) printf("RSP: SFV: index = %d at %08X\n", index, rsp->ppc); |
| 1649 | 1695 | |
| 1650 | | ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1651 | | |
| 1652 | | eaoffset = ea & 0xf; |
| 1696 | UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1697 | int eaoffset = ea & 0xf; |
| 1653 | 1698 | ea &= ~0xf; |
| 1654 | 1699 | |
| 1655 | | end = (index >> 1) + 4; |
| 1700 | int end = (index >> 1) + 4; |
| 1656 | 1701 | |
| 1657 | | for (i=index >> 1; i < end; i++) |
| 1702 | for (int i = index>>1; i < end; i++) |
| 1658 | 1703 | { |
| 1704 | #if USE_SIMD |
| 1705 | UINT16 value; |
| 1706 | SIMD_EXTRACT16(rsp->xv[dest], value, i); |
| 1707 | WRITE8(rsp, ea + (eaoffset & 0xf), (value >> 7) & 0x00ff); |
| 1708 | #else |
| 1659 | 1709 | WRITE8(rsp, ea + (eaoffset & 0xf), VREG_S(dest, i) >> 7); |
| 1710 | #endif |
| 1660 | 1711 | eaoffset += 4; |
| 1661 | 1712 | } |
| 1662 | 1713 | } |
| r24005 | r24006 | |
| 1665 | 1716 | { |
| 1666 | 1717 | rsp_state *rsp = (rsp_state*)param; |
| 1667 | 1718 | UINT32 op = rsp->impstate->arg0; |
| 1668 | | UINT32 ea = 0; |
| 1669 | | int i = 0; |
| 1670 | | int end = 0; |
| 1671 | | int eaoffset = 0; |
| 1672 | 1719 | int dest = (op >> 16) & 0x1f; |
| 1673 | 1720 | int base = (op >> 21) & 0x1f; |
| 1674 | 1721 | int index = (op >> 7) & 0xf; |
| r24005 | r24006 | |
| 1685 | 1732 | // Stores the full 128-bit vector starting from vector byte index and wrapping to index 0 |
| 1686 | 1733 | // after byte index 15 |
| 1687 | 1734 | |
| 1688 | | ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1689 | | |
| 1690 | | eaoffset = ea & 0xf; |
| 1735 | UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1736 | int eaoffset = ea & 0xf; |
| 1691 | 1737 | ea &= ~0xf; |
| 1692 | 1738 | |
| 1693 | | end = index + 16; |
| 1694 | | |
| 1695 | | for (i=index; i < end; i++) |
| 1739 | int end = index + 16; |
| 1740 | for (int i = index; i < end; i++) |
| 1696 | 1741 | { |
| 1742 | #if USE_SIMD |
| 1743 | UINT16 value; |
| 1744 | SIMD_EXTRACT16(rsp->xv[dest], value, i >> 1); |
| 1745 | WRITE8(rsp, ea + (eaoffset & 0xf), (value >> ((1-(i & 1)) * 8)) & 0xff); |
| 1746 | #else |
| 1697 | 1747 | WRITE8(rsp, ea + (eaoffset & 0xf), VREG_B(dest, i & 0xf)); |
| 1748 | #endif |
| 1698 | 1749 | eaoffset++; |
| 1699 | 1750 | } |
| 1700 | 1751 | } |
| r24005 | r24006 | |
| 1703 | 1754 | { |
| 1704 | 1755 | rsp_state *rsp = (rsp_state*)param; |
| 1705 | 1756 | UINT32 op = rsp->impstate->arg0; |
| 1706 | | UINT32 ea = 0; |
| 1707 | | int i = 0; |
| 1708 | 1757 | int dest = (op >> 16) & 0x1f; |
| 1709 | 1758 | int base = (op >> 21) & 0x1f; |
| 1710 | 1759 | int index = (op >> 7) & 0xf; |
| r24005 | r24006 | |
| 1730 | 1779 | |
| 1731 | 1780 | int element = 8 - (index >> 1); |
| 1732 | 1781 | |
| 1733 | | ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1734 | | |
| 1782 | UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); |
| 1735 | 1783 | int eaoffset = (ea & 0xf) + (element * 2); |
| 1736 | 1784 | ea &= ~0xf; |
| 1737 | 1785 | // One 16-bit word is written per register i in [vs, ve); element and the byte offset (wrapped within the aligned 16-byte line) advance together |
| 1738 | | for (i=vs; i < ve; i++) |
| 1786 | for (int i = vs; i < ve; i++) |
| 1739 | 1787 | { |
| 1788 | #if USE_SIMD |
| 1789 | UINT16 value; |
| 1790 | SIMD_EXTRACT16(rsp->xv[i], value, element); /* source is register i, as in the scalar path; xv[dest] re-read one register every pass */ |
| 1791 | WRITE16(rsp, ea + (eaoffset & 0xf), value); |
| 1792 | #else |
| 1740 | 1793 | WRITE16(rsp, ea + (eaoffset & 0xf), VREG_S(i, element & 0x7)); |
| 1794 | #endif |
| 1741 | 1795 | eaoffset += 2; |
| 1742 | 1796 | element++; |
| 1743 | 1797 | } |
| r24005 | r24006 | |
| 1834 | 1888 | { |
| 1835 | 1889 | if (slice == 0) |
| 1836 | 1890 | { |
| 1891 | #if USE_SIMD |
| 1892 | UINT16 ret; |
| 1893 | SIMD_EXTRACT16(rsp->accum_l, ret, accum); |
| 1894 | return ret; |
| 1895 | #else |
| 1837 | 1896 | return ACCUM_L(accum); |
| 1897 | #endif |
| 1838 | 1898 | } |
| 1839 | 1899 | else if (slice == 1) |
| 1840 | 1900 | { |
| r24005 | r24006 | |
| 1859 | 1919 | { |
| 1860 | 1920 | if (slice == 0) |
| 1861 | 1921 | { |
| 1922 | #if USE_SIMD |
| 1923 | UINT16 ret; |
| 1924 | SIMD_EXTRACT16(rsp->accum_l, ret, accum); |
| 1925 | return ret; |
| 1926 | #else |
| 1862 | 1927 | return ACCUM_L(accum); |
| 1928 | #endif |
| 1863 | 1929 | } |
| 1864 | 1930 | else |
| 1865 | 1931 | { |
| r24005 | r24006 | |
| 1872 | 1938 | return 0; |
| 1873 | 1939 | } |
| 1874 | 1940 | |
| 1941 | #if USE_SIMD |
| 1942 | __m128i SATURATE_ACCUM1(__m128i accum_h, __m128i accum_m, UINT16 negative, UINT16 positive) |
| 1943 | { /* Lane-wise saturation: each 16-bit lane of the result is accum_m, negative or positive, chosen by the rules listed below */ |
| 1944 | __m128i vnegative = _mm_set1_epi16(negative); |
| 1945 | __m128i vpositive = _mm_set1_epi16(positive); |
| 1946 | // vnegative / vpositive hold the two saturation constants broadcast to all eight lanes |
| 1947 | // conditional masks: a lane is all ones where the signed 16-bit test holds, zero otherwise |
| 1948 | __m128i accum_hlz = _mm_cmplt_epi16(accum_h, vec_zero); |
| 1949 | __m128i accum_hgz = _mm_cmpgt_epi16(accum_h, vec_zero); |
| 1950 | __m128i accum_hz = _mm_cmpeq_epi16(accum_h, vec_zero); |
| 1951 | __m128i accum_hn1 = _mm_cmpeq_epi16(accum_h, vec_neg1); |
| 1952 | __m128i accum_hnn1 = _mm_xor_si128(accum_hn1, vec_neg1); |
| 1953 | |
| 1954 | __m128i accum_mlz = _mm_cmplt_epi16(accum_m, vec_zero); |
| 1955 | __m128i accum_mgz = _mm_cmpgt_epi16(accum_m, vec_zero); |
| 1956 | __m128i accum_mz = _mm_cmpeq_epi16(accum_m, vec_zero); |
| 1957 | __m128i accum_mgez = _mm_or_si128(accum_mz, accum_mgz); |
| 1958 | |
| 1959 | // Return negative if H<0 && (H!=0xffff || M >= 0) |
| 1960 | // Return positive if H>0 || (H==0 && M<0) |
| 1961 | // Return medium slice if H==0xffff && M<0 |
| 1962 | // Return medium slice if H==0 && M>=0 |
| 1963 | // The three masks below are mutually exclusive per lane and cover every case, so OR-ing the masked sources picks exactly one |
| 1964 | __m128i negative_mask = _mm_and_si128(accum_hlz, _mm_or_si128(accum_hnn1, accum_mgez)); |
| 1965 | __m128i positive_mask = _mm_or_si128(accum_hgz, _mm_and_si128(accum_hz, accum_mlz)); |
| 1966 | __m128i accumm_mask = _mm_or_si128(_mm_and_si128(accum_hz, accum_mgez), _mm_and_si128(accum_hn1, accum_mlz)); |
| 1967 | |
| 1968 | __m128i output = _mm_and_si128(accum_m, accumm_mask); |
| 1969 | output = _mm_or_si128(output, _mm_and_si128(vnegative, negative_mask)); |
| 1970 | output = _mm_or_si128(output, _mm_and_si128(vpositive, positive_mask)); |
| 1971 | return output; |
| 1972 | } |
| 1973 | #endif |
| 1974 | |
| 1875 | 1975 | INLINE UINT16 SATURATE_ACCUM1(rsp_state *rsp, int accum, UINT16 negative, UINT16 positive) |
| 1876 | 1976 | { |
| 1977 | // Return negative if H<0 && (H!=0xffff || M >= 0) |
| 1978 | // Return positive if H>0 || (H==0 && M<0) |
| 1979 | // Return medium slice if H==0xffff && M<0 |
| 1980 | // Return medium slice if H==0 && M>=0 |
| 1877 | 1981 | if ((INT16)ACCUM_H(accum) < 0) |
| 1878 | 1982 | { |
| 1879 | 1983 | if ((UINT16)(ACCUM_H(accum)) != 0xffff) |
| r24005 | r24006 | |
| 1914 | 2018 | return 0; |
| 1915 | 2019 | } |
| 1916 | 2020 | |
| 2021 | INLINE UINT16 C_SATURATE_ACCUM1(UINT16 *h, UINT16 *m, int accum, UINT16 negative, UINT16 positive) |
| 2022 | { |
| 2023 | // Saturating read of m[accum], steered by h[accum] (H = h[accum], M = m[accum], both tested as signed 16-bit; presumably the high and middle accumulator slices) |
| 2024 | // Return negative if H<0 && (H!=0xffff || M >= 0); return positive if H>0 || (H==0 && M<0) |
| 2025 | // Return medium slice if H==0xffff && M<0 |
| 2026 | // Return medium slice if H==0 && M>=0 |
| 2027 | if ((INT16)h[accum] < 0) |
| 2028 | { |
| 2029 | if ((UINT16)h[accum] != 0xffff) |
| 2030 | { |
| 2031 | return negative; |
| 2032 | } |
| 2033 | else |
| 2034 | { |
| 2035 | if ((INT16)m[accum] >= 0) |
| 2036 | { |
| 2037 | return negative; |
| 2038 | } |
| 2039 | else |
| 2040 | { |
| 2041 | return m[accum]; |
| 2042 | } |
| 2043 | } |
| 2044 | } |
| 2045 | else |
| 2046 | { |
| 2047 | if ((UINT16)h[accum] != 0) |
| 2048 | { |
| 2049 | return positive; |
| 2050 | } |
| 2051 | else |
| 2052 | { |
| 2053 | if ((INT16)m[accum] < 0) |
| 2054 | { |
| 2055 | return positive; |
| 2056 | } |
| 2057 | else |
| 2058 | { |
| 2059 | return m[accum]; |
| 2060 | } |
| 2061 | } |
| 2062 | } |
| 2063 | |
| 2064 | return 0; /* not reached: every branch above returns */ |
| 2065 | } |
| 2066 | |
| 2067 | #if USE_SIMD |
| 1917 | 2068 | #define WRITEBACK_RESULT() { \ |
| 2069 | SIMD_INSERT16(rsp->xv[VDREG], vres[0], 0); \ |
| 2070 | SIMD_INSERT16(rsp->xv[VDREG], vres[1], 1); \ |
| 2071 | SIMD_INSERT16(rsp->xv[VDREG], vres[2], 2); \ |
| 2072 | SIMD_INSERT16(rsp->xv[VDREG], vres[3], 3); \ |
| 2073 | SIMD_INSERT16(rsp->xv[VDREG], vres[4], 4); \ |
| 2074 | SIMD_INSERT16(rsp->xv[VDREG], vres[5], 5); \ |
| 2075 | SIMD_INSERT16(rsp->xv[VDREG], vres[6], 6); \ |
| 2076 | SIMD_INSERT16(rsp->xv[VDREG], vres[7], 7); \ |
| 2077 | } |
| 2078 | #else |
| 2079 | #define WRITEBACK_RESULT() { \ |
| 1918 | 2080 | W_VREG_S(VDREG, 0) = vres[0]; \ |
| 1919 | 2081 | W_VREG_S(VDREG, 1) = vres[1]; \ |
| 1920 | 2082 | W_VREG_S(VDREG, 2) = vres[2]; \ |
| r24005 | r24006 | |
| 1924 | 2086 | W_VREG_S(VDREG, 6) = vres[6]; \ |
| 1925 | 2087 | W_VREG_S(VDREG, 7) = vres[7]; \ |
| 1926 | 2088 | } |
| 2089 | #endif |
| 1927 | 2090 | |
| 1928 | 2091 | INLINE void cfunc_rsp_vmulf(void *param) |
| 1929 | 2092 | { |
| 1930 | 2093 | rsp_state *rsp = (rsp_state*)param; |
| 1931 | 2094 | int op = rsp->impstate->arg0; |
| 1932 | | INT16 vres[8] = { 0 }; |
| 1933 | 2095 | //int i; |
| 1934 | 2096 | // 31 25 24 20 15 10 5 0 |
| 1935 | 2097 | // ------------------------------------------------------ |
| r24005 | r24006 | |
| 1938 | 2100 | // |
| 1939 | 2101 | // Multiplies signed integer by signed integer * 2 |
| 1940 | 2102 | |
| 1941 | | int sel; |
| 1942 | | INT32 s1, s2; |
| 1943 | | INT64 r; |
| 2103 | INT16 vres[8] = { 0 }; |
| 1944 | 2104 | for (int i = 0; i < 8; i++) |
| 1945 | 2105 | { |
| 1946 | | sel = VEC_EL_2(EL, i); |
| 1947 | | s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 1948 | | s2 = (INT32)(INT16)VREG_S(VS2REG, sel); |
| 2106 | #if USE_SIMD |
| 2107 | UINT16 w1, w2; |
| 2108 | SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i); |
| 2109 | SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i)); |
| 2110 | INT32 s1 = (INT32)(INT16)w1; |
| 2111 | INT32 s2 = (INT32)(INT16)w2; |
| 2112 | #else |
| 2113 | INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 2114 | INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 2115 | #endif |
| 1949 | 2116 | if (s1 == -32768 && s2 == -32768) |
| 1950 | 2117 | { |
| 1951 | 2118 | // overflow |
| 1952 | 2119 | ACCUM_H(i) = 0; |
| 1953 | 2120 | ACCUM_M(i) = -32768; |
| 2121 | #if USE_SIMD |
| 2122 | SIMD_INSERT16(rsp->accum_l, -32768, i); |
| 2123 | #else |
| 1954 | 2124 | ACCUM_L(i) = -32768; |
| 2125 | #endif |
| 1955 | 2126 | vres[i] = 0x7fff; |
| 1956 | 2127 | } |
| 1957 | 2128 | else |
| 1958 | 2129 | { |
| 1959 | | r = s1 * s2 * 2; |
| 2130 | INT64 r = s1 * s2 * 2; |
| 1960 | 2131 | r += 0x8000; // rounding ? |
| 1961 | 2132 | ACCUM_H(i) = (r < 0) ? 0xffff : 0; // sign-extend to 48-bit |
| 1962 | 2133 | ACCUM_M(i) = (INT16)(r >> 16); |
| 1963 | | ACCUM_L(i) = (UINT16)(r); |
| 2134 | #if USE_SIMD |
| 2135 | SIMD_INSERT16(rsp->accum_l, (UINT16)(r), i); |
| 2136 | #else |
| 2137 | ACCUM_L(i) = (UINT16)r; |
| 2138 | #endif |
| 1964 | 2139 | vres[i] = ACCUM_M(i); |
| 1965 | 2140 | } |
| 1966 | 2141 | } |
| r24005 | r24006 | |
| 1971 | 2146 | { |
| 1972 | 2147 | rsp_state *rsp = (rsp_state*)param; |
| 1973 | 2148 | int op = rsp->impstate->arg0; |
| 1974 | | INT16 vres[8]; |
| 1975 | | int i; |
| 1976 | 2149 | // 31 25 24 20 15 10 5 0 |
| 1977 | 2150 | // ------------------------------------------------------ |
| 1978 | 2151 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000001 | |
| 1979 | 2152 | // ------------------------------------------------------ |
| 1980 | 2153 | // |
| 1981 | 2154 | |
| 1982 | | int sel; |
| 1983 | | INT32 s1, s2; |
| 1984 | | INT64 r; |
| 1985 | | for (i=0; i < 8; i++) |
| 2155 | INT16 vres[8]; |
| 2156 | for (int i = 0; i < 8; i++) |
| 1986 | 2157 | { |
| 1987 | | sel = VEC_EL_2(EL, i); |
| 1988 | | s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 1989 | | s2 = (INT32)(INT16)VREG_S(VS2REG, sel); |
| 1990 | | r = s1 * s2 * 2; |
| 2158 | #if USE_SIMD |
| 2159 | UINT16 w1, w2; |
| 2160 | SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i); |
| 2161 | SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i)); |
| 2162 | INT32 s1 = (INT32)(INT16)w1; |
| 2163 | INT32 s2 = (INT32)(INT16)w2; |
| 2164 | #else |
| 2165 | INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 2166 | INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 2167 | #endif |
| 2168 | INT64 r = s1 * s2 * 2; |
| 1991 | 2169 | r += 0x8000; // rounding ? |
| 1992 | 2170 | |
| 1993 | 2171 | ACCUM_H(i) = (UINT16)(r >> 32); |
| 1994 | 2172 | ACCUM_M(i) = (UINT16)(r >> 16); |
| 2173 | #if USE_SIMD |
| 2174 | SIMD_INSERT16(rsp->accum_l, (UINT16)(r), i); |
| 2175 | #else |
| 1995 | 2176 | ACCUM_L(i) = (UINT16)(r); |
| 2177 | #endif |
| 1996 | 2178 | |
| 1997 | 2179 | if (r < 0) |
| 1998 | 2180 | { |
| r24005 | r24006 | |
| 2024 | 2206 | // The result is added into accumulator |
| 2025 | 2207 | // The middle slice of accumulator is stored into destination element |
| 2026 | 2208 | |
| 2027 | | int sel; |
| 2028 | | UINT32 s1, s2; |
| 2029 | | UINT32 r; |
| 2030 | 2209 | for (int i = 0; i < 8; i++) |
| 2031 | 2210 | { |
| 2032 | | sel = VEC_EL_2(EL, i); |
| 2033 | | s1 = (UINT32)(UINT16)VREG_S(VS1REG, i); |
| 2034 | | s2 = (UINT32)(UINT16)VREG_S(VS2REG, sel); |
| 2035 | | r = s1 * s2; |
| 2211 | #if USE_SIMD |
| 2212 | UINT16 w1, w2; |
| 2213 | SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i); |
| 2214 | SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i)); |
| 2215 | UINT32 s1 = (UINT32)w1; |
| 2216 | UINT32 s2 = (UINT32)w2; |
| 2217 | #else |
| 2218 | UINT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, i); |
| 2219 | UINT32 s2 = (UINT32)(UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 2220 | #endif |
| 2221 | UINT32 r = s1 * s2; |
| 2036 | 2222 | |
| 2037 | 2223 | ACCUM_H(i) = 0; |
| 2038 | 2224 | ACCUM_M(i) = 0; |
| 2225 | #if USE_SIMD |
| 2226 | SIMD_INSERT16(rsp->accum_l, (UINT16)(r >> 16), i); |
| 2227 | #else |
| 2039 | 2228 | ACCUM_L(i) = (UINT16)(r >> 16); |
| 2229 | #endif |
| 2040 | 2230 | |
| 2041 | | vres[i] = ACCUM_L(i); |
| 2231 | vres[i] = (UINT16)(r >> 16); |
| 2042 | 2232 | } |
| 2043 | 2233 | WRITEBACK_RESULT(); |
| 2044 | 2234 | } |
| r24005 | r24006 | |
| 2058 | 2248 | // The result is stored into accumulator |
| 2059 | 2249 | // The middle slice of accumulator is stored into destination element |
| 2060 | 2250 | |
| 2061 | | int sel; |
| 2062 | | INT32 s1, s2; |
| 2063 | | INT32 r; |
| 2064 | 2251 | for (int i = 0; i < 8; i++) |
| 2065 | 2252 | { |
| 2066 | | sel = VEC_EL_2(EL, i); |
| 2067 | | s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 2068 | | s2 = (UINT16)VREG_S(VS2REG, sel); // not sign-extended |
| 2069 | | r = s1 * s2; |
| 2253 | #if USE_SIMD |
| 2254 | UINT16 w1, w2; |
| 2255 | SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i); |
| 2256 | SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i)); |
| 2257 | INT32 s1 = (INT32)(INT16)w1; |
| 2258 | INT32 s2 = w2; |
| 2259 | #else |
| 2260 | INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 2261 | INT32 s2 = (UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); // not sign-extended |
| 2262 | #endif |
| 2263 | INT32 r = s1 * s2; |
| 2070 | 2264 | |
| 2071 | 2265 | ACCUM_H(i) = (r < 0) ? 0xffff : 0; // sign-extend to 48-bit |
| 2072 | 2266 | ACCUM_M(i) = (INT16)(r >> 16); |
| 2267 | #if USE_SIMD |
| 2268 | SIMD_INSERT16(rsp->accum_l, (UINT16)(r), i); |
| 2269 | #else |
| 2073 | 2270 | ACCUM_L(i) = (UINT16)(r); |
| 2271 | #endif |
| 2074 | 2272 | |
| 2075 | 2273 | vres[i] = ACCUM_M(i); |
| 2076 | 2274 | } |
| r24005 | r24006 | |
| 2081 | 2279 | { |
| 2082 | 2280 | rsp_state *rsp = (rsp_state*)param; |
| 2083 | 2281 | int op = rsp->impstate->arg0; |
| 2084 | | INT16 vres[8] = { 0 }; |
| 2085 | 2282 | |
| 2086 | 2283 | // 31 25 24 20 15 10 5 0 |
| 2087 | 2284 | // ------------------------------------------------------ |
| r24005 | r24006 | |
| 2092 | 2289 | // The result is stored into accumulator |
| 2093 | 2290 | // The low slice of accumulator is stored into destination element |
| 2094 | 2291 | |
| 2095 | | int sel; |
| 2096 | | INT32 s1, s2; |
| 2097 | | INT32 r; |
| 2292 | INT16 vres[8] = { 0 }; |
| 2098 | 2293 | for (int i = 0; i < 8; i++) |
| 2099 | 2294 | { |
| 2100 | | sel = VEC_EL_2(EL, i); |
| 2101 | | s1 = (UINT16)VREG_S(VS1REG, i); // not sign-extended |
| 2102 | | s2 = (INT32)(INT16)VREG_S(VS2REG, sel); |
| 2103 | | r = s1 * s2; |
| 2295 | #if USE_SIMD |
| 2296 | UINT16 w1, w2; |
| 2297 | SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i); |
| 2298 | SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i)); |
| 2299 | INT32 s1 = w1; |
| 2300 | INT32 s2 = (INT32)(INT16)w2; |
| 2301 | #else |
| 2302 | INT32 s1 = (UINT16)VREG_S(VS1REG, i); // not sign-extended |
| 2303 | INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 2304 | #endif |
| 2305 | INT32 r = s1 * s2; |
| 2104 | 2306 | |
| 2105 | 2307 | ACCUM_H(i) = (r < 0) ? 0xffff : 0; // sign-extend to 48-bit |
| 2106 | 2308 | ACCUM_M(i) = (INT16)(r >> 16); |
| 2309 | #if USE_SIMD |
| 2310 | SIMD_INSERT16(rsp->accum_l, (UINT16)(r), i); |
| 2311 | #else |
| 2107 | 2312 | ACCUM_L(i) = (UINT16)(r); |
| 2313 | #endif |
| 2108 | 2314 | |
| 2109 | | vres[i] = ACCUM_L(i); |
| 2315 | vres[i] = (UINT16)(r); |
| 2110 | 2316 | } |
| 2111 | 2317 | WRITEBACK_RESULT(); |
| 2112 | 2318 | } |
| r24005 | r24006 | |
| 2115 | 2321 | { |
| 2116 | 2322 | rsp_state *rsp = (rsp_state*)param; |
| 2117 | 2323 | int op = rsp->impstate->arg0; |
| 2118 | | INT16 vres[8]; |
| 2119 | | int i; |
| 2120 | 2324 | // 31 25 24 20 15 10 5 0 |
| 2121 | 2325 | // ------------------------------------------------------ |
| 2122 | 2326 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000111 | |
| r24005 | r24006 | |
| 2126 | 2330 | // The result is stored into highest 32 bits of accumulator, the low slice is zero |
| 2127 | 2331 | // The highest 32 bits of accumulator is saturated into destination element |
| 2128 | 2332 | |
| 2129 | | int sel; |
| 2130 | | INT32 s1, s2; |
| 2131 | | INT32 r; |
| 2132 | | for (i=0; i < 8; i++) |
| 2333 | INT16 vres[8]; |
| 2334 | for (int i = 0; i < 8; i++) |
| 2133 | 2335 | { |
| 2134 | | sel = VEC_EL_2(EL, i); |
| 2135 | | s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 2136 | | s2 = (INT32)(INT16)VREG_S(VS2REG, sel); |
| 2137 | | r = s1 * s2; |
| 2336 | #if USE_SIMD |
| 2337 | UINT16 w1, w2; |
| 2338 | SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i); |
| 2339 | SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i)); |
| 2340 | INT32 s1 = (INT32)(INT16)w1; |
| 2341 | INT32 s2 = (INT32)(INT16)w2; |
| 2342 | #else |
| 2343 | INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 2344 | INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 2345 | #endif |
| 2346 | INT32 r = s1 * s2; |
| 2138 | 2347 | |
| 2139 | 2348 | ACCUM_H(i) = (INT16)(r >> 16); |
| 2140 | 2349 | ACCUM_M(i) = (UINT16)(r); |
| 2350 | #if USE_SIMD |
| 2351 | SIMD_INSERT16(rsp->accum_l, 0, i); |
| 2352 | #else |
| 2141 | 2353 | ACCUM_L(i) = 0; |
| 2354 | #endif |
| 2142 | 2355 | |
| 2143 | 2356 | if (r < -32768) r = -32768; |
| 2144 | 2357 | if (r > 32767) r = 32767; |
| r24005 | r24006 | |
| 2151 | 2364 | { |
| 2152 | 2365 | rsp_state *rsp = (rsp_state*)param; |
| 2153 | 2366 | int op = rsp->impstate->arg0; |
| 2367 | |
| 2154 | 2368 | INT16 vres[8]; |
| 2155 | | |
| 2156 | | int sel; |
| 2157 | | INT32 s1, s2; |
| 2158 | | INT32 r; |
| 2159 | | UINT16 res; |
| 2160 | 2369 | for (int i = 0; i < 8; i++) |
| 2161 | 2370 | { |
| 2162 | | sel = VEC_EL_2(EL, i); |
| 2163 | | s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 2164 | | s2 = (INT32)(INT16)VREG_S(VS2REG, sel); |
| 2165 | | r = s1 * s2; |
| 2371 | #if USE_SIMD |
| 2372 | UINT16 w1, w2; |
| 2373 | SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i); |
| 2374 | SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i)); |
| 2375 | INT32 s1 = (INT32)(INT16)w1; |
| 2376 | INT32 s2 = (INT32)(INT16)w2; |
| 2377 | #else |
| 2378 | INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 2379 | INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 2380 | #endif |
| 2381 | INT32 r = s1 * s2; |
| 2166 | 2382 | |
| 2383 | #if USE_SIMD |
| 2384 | UINT64 q = (UINT64)ACCUM(i) & 0xffffffff0000ffffL; |
| 2385 | UINT16 accl; |
| 2386 | SIMD_EXTRACT16(rsp->accum_l, accl, i); |
| 2387 | q |= (UINT64)((UINT32)accl << 16); |
| 2388 | q += (INT64)(r) << 17; |
| 2389 | ACCUM(i) = q; |
| 2390 | SIMD_INSERT16(rsp->accum_l, (UINT16)(q >> 16), i); |
| 2391 | #else |
| 2167 | 2392 | ACCUM(i) += (INT64)(r) << 17; |
| 2168 | | res = SATURATE_ACCUM(rsp, i, 1, 0x8000, 0x7fff); |
| 2393 | #endif |
| 2169 | 2394 | |
| 2170 | | vres[i] = res; |
| 2395 | vres[i] = SATURATE_ACCUM(rsp, i, 1, 0x8000, 0x7fff); |
| 2171 | 2396 | } |
| 2172 | 2397 | WRITEBACK_RESULT(); |
| 2173 | 2398 | } |
| r24005 | r24006 | |
| 2176 | 2401 | { |
| 2177 | 2402 | rsp_state *rsp = (rsp_state*)param; |
| 2178 | 2403 | int op = rsp->impstate->arg0; |
| 2179 | | INT16 vres[8]; |
| 2180 | | int i; |
| 2404 | |
| 2181 | 2405 | // 31 25 24 20 15 10 5 0 |
| 2182 | 2406 | // ------------------------------------------------------ |
| 2183 | 2407 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001001 | |
| 2184 | 2408 | // ------------------------------------------------------ |
| 2185 | 2409 | // |
| 2186 | 2410 | |
| 2187 | | UINT16 res; |
| 2188 | | int sel; |
| 2189 | | INT32 s1, s2, r1; |
| 2190 | | UINT32 r2, r3; |
| 2191 | | for (i = 0; i < 8; i++) |
| 2411 | INT16 vres[8]; |
| 2412 | for (int i = 0; i < 8; i++) |
| 2192 | 2413 | { |
| 2193 | | sel = VEC_EL_2(EL, i); |
| 2194 | | s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 2195 | | s2 = (INT32)(INT16)VREG_S(VS2REG, sel); |
| 2196 | | r1 = s1 * s2; |
| 2197 | | r2 = (UINT16)ACCUM_L(i) + ((UINT16)(r1) * 2); |
| 2198 | | r3 = (UINT16)ACCUM_M(i) + (UINT16)((r1 >> 16) * 2) + (UINT16)(r2 >> 16); |
| 2414 | #if USE_SIMD |
| 2415 | UINT16 w1, w2; |
| 2416 | SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i); |
| 2417 | SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i)); |
| 2418 | INT32 s1 = (INT32)(INT16)w1; |
| 2419 | INT32 s2 = (INT32)(INT16)w2; |
| 2420 | #else |
| 2421 | INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 2422 | INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 2423 | #endif |
| 2424 | INT32 r1 = s1 * s2; |
| 2425 | #if USE_SIMD |
| 2426 | UINT16 accl; |
| 2427 | SIMD_EXTRACT16(rsp->accum_l, accl, i); |
| 2428 | UINT32 r2 = accl + ((UINT16)(r1) * 2); |
| 2429 | #else |
| 2430 | UINT32 r2 = (UINT16)ACCUM_L(i) + ((UINT16)(r1) * 2); |
| 2431 | #endif |
| 2432 | UINT32 r3 = (UINT16)ACCUM_M(i) + (UINT16)((r1 >> 16) * 2) + (UINT16)(r2 >> 16); |
| 2199 | 2433 | |
| 2434 | #if USE_SIMD |
| 2435 | SIMD_INSERT16(rsp->accum_l, (UINT16)(r2), i); |
| 2436 | #else |
| 2200 | 2437 | ACCUM_L(i) = (UINT16)(r2); |
| 2438 | #endif |
| 2201 | 2439 | ACCUM_M(i) = (UINT16)(r3); |
| 2202 | 2440 | ACCUM_H(i) += (UINT16)(r3 >> 16) + (UINT16)(r1 >> 31); |
| 2203 | 2441 | |
| 2204 | 2442 | //res = SATURATE_ACCUM(i, 1, 0x0000, 0xffff); |
| 2205 | 2443 | if ((INT16)ACCUM_H(i) < 0) |
| 2206 | 2444 | { |
| 2207 | | res = 0; |
| 2445 | vres[i] = 0; |
| 2208 | 2446 | } |
| 2209 | 2447 | else |
| 2210 | 2448 | { |
| 2211 | 2449 | if (ACCUM_H(i) != 0) |
| 2212 | 2450 | { |
| 2213 | | res = 0xffff; |
| 2451 | vres[i] = 0xffff; |
| 2214 | 2452 | } |
| 2215 | 2453 | else |
| 2216 | 2454 | { |
| 2217 | 2455 | if ((INT16)ACCUM_M(i) < 0) |
| 2218 | 2456 | { |
| 2219 | | res = 0xffff; |
| 2457 | vres[i] = 0xffff; |
| 2220 | 2458 | } |
| 2221 | 2459 | else |
| 2222 | 2460 | { |
| 2223 | | res = ACCUM_M(i); |
| 2461 | vres[i] = ACCUM_M(i); |
| 2224 | 2462 | } |
| 2225 | 2463 | } |
| 2226 | 2464 | } |
| 2227 | | |
| 2228 | | vres[i] = res; |
| 2229 | 2465 | } |
| 2230 | 2466 | WRITEBACK_RESULT(); |
| 2231 | 2467 | } |
| r24005 | r24006 | |
| 2234 | 2470 | { |
| 2235 | 2471 | rsp_state *rsp = (rsp_state*)param; |
| 2236 | 2472 | int op = rsp->impstate->arg0; |
| 2237 | | INT16 vres[8]; |
| 2238 | | int i; |
| 2473 | |
| 2239 | 2474 | // 31 25 24 20 15 10 5 0 |
| 2240 | 2475 | // ------------------------------------------------------ |
| 2241 | 2476 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001100 | |
| r24005 | r24006 | |
| 2245 | 2480 | // Adds the higher 16 bits of the 32-bit result to accumulator |
| 2246 | 2481 | // The low slice of accumulator is stored into destination element |
| 2247 | 2482 | |
| 2248 | | UINT16 res; |
| 2249 | | int sel; |
| 2250 | | UINT32 s1, s2, r1; |
| 2251 | | UINT32 r2, r3; |
| 2252 | | for (i = 0; i < 8; i++) |
| 2483 | INT16 vres[8]; |
| 2484 | for (int i = 0; i < 8; i++) |
| 2253 | 2485 | { |
| 2254 | | sel = VEC_EL_2(EL, i); |
| 2255 | | s1 = (UINT32)(UINT16)VREG_S(VS1REG, i); |
| 2256 | | s2 = (UINT32)(UINT16)VREG_S(VS2REG, sel); |
| 2257 | | r1 = s1 * s2; |
| 2258 | | r2 = (UINT16)ACCUM_L(i) + (r1 >> 16); |
| 2259 | | r3 = (UINT16)ACCUM_M(i) + (r2 >> 16); |
| 2486 | #if USE_SIMD |
| 2487 | UINT16 w1, w2; |
| 2488 | SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i); |
| 2489 | SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i)); |
| 2490 | UINT32 s1 = w1; |
| 2491 | UINT32 s2 = w2; |
| 2492 | #else |
| 2493 | UINT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, i); |
| 2494 | UINT32 s2 = (UINT32)(UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 2495 | #endif |
| 2496 | UINT32 r1 = s1 * s2; |
| 2497 | #if USE_SIMD |
| 2498 | UINT16 accl; |
| 2499 | SIMD_EXTRACT16(rsp->accum_l, accl, i); |
| 2500 | UINT32 r2 = accl + (r1 >> 16); |
| 2501 | #else |
| 2502 | UINT32 r2 = (UINT16)ACCUM_L(i) + (r1 >> 16); |
| 2503 | #endif |
| 2504 | UINT32 r3 = (UINT16)ACCUM_M(i) + (r2 >> 16); |
| 2260 | 2505 | |
| 2506 | #if USE_SIMD |
| 2507 | SIMD_INSERT16(rsp->accum_l, (UINT16)(r2), i); |
| 2508 | #else |
| 2261 | 2509 | ACCUM_L(i) = (UINT16)(r2); |
| 2510 | #endif |
| 2262 | 2511 | ACCUM_M(i) = (UINT16)(r3); |
| 2263 | 2512 | ACCUM_H(i) += (INT16)(r3 >> 16); |
| 2264 | 2513 | |
| 2265 | | res = SATURATE_ACCUM(rsp, i, 0, 0x0000, 0xffff); |
| 2266 | | |
| 2267 | | vres[i] = res; |
| 2514 | vres[i] = SATURATE_ACCUM(rsp, i, 0, 0x0000, 0xffff); |
| 2268 | 2515 | } |
| 2269 | 2516 | WRITEBACK_RESULT(); |
| 2270 | 2517 | } |
| r24005 | r24006 | |
| 2273 | 2520 | { |
| 2274 | 2521 | rsp_state *rsp = (rsp_state*)param; |
| 2275 | 2522 | int op = rsp->impstate->arg0; |
| 2523 | |
| 2276 | 2524 | INT16 vres[8]; |
| 2277 | | |
| 2278 | | UINT16 res; |
| 2279 | | int sel; |
| 2280 | | UINT32 s1, s2, r1; |
| 2281 | | UINT32 r2, r3; |
| 2282 | 2525 | for (int i = 0; i < 8; i++) |
| 2283 | 2526 | { |
| 2284 | | sel = VEC_EL_2(EL, i); |
| 2285 | | s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 2286 | | s2 = (UINT16)VREG_S(VS2REG, sel); // not sign-extended |
| 2287 | | r1 = s1 * s2; |
| 2288 | | r2 = (UINT16)ACCUM_L(i) + (UINT16)(r1); |
| 2289 | | r3 = (UINT16)ACCUM_M(i) + (r1 >> 16) + (r2 >> 16); |
| 2527 | #if USE_SIMD |
| 2528 | UINT16 w1, w2; |
| 2529 | SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i); |
| 2530 | SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i)); |
| 2531 | UINT32 s1 = (INT32)(INT16)w1; |
| 2532 | UINT32 s2 = w2; |
| 2533 | #else |
| 2534 | UINT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 2535 | UINT32 s2 = (UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); // not sign-extended |
| 2536 | #endif |
| 2537 | UINT32 r1 = s1 * s2; |
| 2538 | #if USE_SIMD |
| 2539 | UINT16 accl; |
| 2540 | SIMD_EXTRACT16(rsp->accum_l, accl, i); |
| 2541 | UINT32 r2 = accl + (UINT16)(r1); |
| 2542 | #else |
| 2543 | UINT32 r2 = (UINT16)ACCUM_L(i) + (UINT16)(r1); |
| 2544 | #endif |
| 2545 | UINT32 r3 = (UINT16)ACCUM_M(i) + (r1 >> 16) + (r2 >> 16); |
| 2290 | 2546 | |
| 2547 | #if USE_SIMD |
| 2548 | SIMD_INSERT16(rsp->accum_l, (UINT16)(r2), i); |
| 2549 | #else |
| 2291 | 2550 | ACCUM_L(i) = (UINT16)(r2); |
| 2551 | #endif |
| 2292 | 2552 | ACCUM_M(i) = (UINT16)(r3); |
| 2293 | 2553 | ACCUM_H(i) += (UINT16)(r3 >> 16); |
| 2294 | 2554 | if ((INT32)(r1) < 0) |
| 2295 | 2555 | ACCUM_H(i) -= 1; |
| 2296 | 2556 | |
| 2297 | | res = SATURATE_ACCUM(rsp, i, 1, 0x8000, 0x7fff); |
| 2298 | | |
| 2299 | | vres[i] = res; |
| 2557 | vres[i] = SATURATE_ACCUM(rsp, i, 1, 0x8000, 0x7fff); |
| 2300 | 2558 | } |
| 2301 | 2559 | WRITEBACK_RESULT(); |
| 2302 | 2560 | } |
| r24005 | r24006 | |
| 2305 | 2563 | { |
| 2306 | 2564 | rsp_state *rsp = (rsp_state*)param; |
| 2307 | 2565 | int op = rsp->impstate->arg0; |
| 2566 | |
| 2308 | 2567 | INT16 vres[8]; |
| 2309 | | |
| 2310 | | INT32 s1, s2; |
| 2311 | | UINT16 res; |
| 2312 | | int sel; |
| 2313 | 2568 | for (int i = 0; i < 8; i++) |
| 2314 | 2569 | { |
| 2315 | | sel = VEC_EL_2(EL, i); |
| 2316 | | s1 = (UINT16)VREG_S(VS1REG, i); // not sign-extended |
| 2317 | | s2 = (INT32)(INT16)VREG_S(VS2REG, sel); |
| 2570 | #if USE_SIMD |
| 2571 | UINT16 w1, w2; |
| 2572 | SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i); |
| 2573 | SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i)); |
| 2574 | INT32 s1 = w1; |
| 2575 | INT32 s2 = (INT32)(INT16)w2; |
| 2576 | #else |
| 2577 | INT32 s1 = (UINT16)VREG_S(VS1REG, i); // not sign-extended |
| 2578 | INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 2579 | #endif |
| 2318 | 2580 | |
| 2319 | | ACCUM(i) += (INT64)(s1*s2)<<16; |
| 2581 | #if USE_SIMD |
| 2582 | UINT64 q = (UINT64)ACCUM(i) & 0xffffffff0000ffffL; |
| 2583 | UINT16 accl; |
| 2584 | SIMD_EXTRACT16(rsp->accum_l, accl, i); |
| 2585 | q |= (UINT64)((UINT32)accl << 16); |
| 2586 | q += (INT64)(s1*s2) << 16; |
| 2587 | ACCUM(i) = q; |
| 2588 | SIMD_INSERT16(rsp->accum_l, (UINT16)(q >> 16), i); |
| 2589 | #else |
| 2590 | ACCUM(i) += (INT64)(s1*s2) << 16; |
| 2591 | #endif |
| 2320 | 2592 | |
| 2321 | | res = SATURATE_ACCUM(rsp, i, 0, 0x0000, 0xffff); |
| 2322 | | vres[i] = res; |
| 2593 | vres[i] = SATURATE_ACCUM(rsp, i, 0, 0x0000, 0xffff); |
| 2323 | 2594 | } |
| 2324 | 2595 | WRITEBACK_RESULT(); |
| 2325 | 2596 | } |
| r24005 | r24006 | |
| 2328 | 2599 | { |
| 2329 | 2600 | rsp_state *rsp = (rsp_state*)param; |
| 2330 | 2601 | int op = rsp->impstate->arg0; |
| 2331 | | INT16 vres[8]; |
| 2332 | 2602 | // 31 25 24 20 15 10 5 0 |
| 2333 | 2603 | // ------------------------------------------------------ |
| 2334 | 2604 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001111 | |
| r24005 | r24006 | |
| 2338 | 2608 | // The result is added into highest 32 bits of accumulator, the low slice is zero |
| 2339 | 2609 | // The highest 32 bits of accumulator is saturated into destination element |
| 2340 | 2610 | |
| 2341 | | UINT16 res; |
| 2342 | | int sel; |
| 2343 | | INT32 s1, s2; |
| 2611 | #if 0 |
| 2612 | UINT16 caccumh[8], caccumm[8], vs1[8], vs2[8]; |
| 2344 | 2613 | for (int i = 0; i < 8; i++) |
| 2345 | 2614 | { |
| 2346 | | sel = VEC_EL_2(EL, i); |
| 2347 | | s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 2348 | | s2 = (INT32)(INT16)VREG_S(VS2REG, sel); |
| 2615 | caccumh[i] = ACCUM_H(i); |
| 2616 | caccumm[i] = ACCUM_M(i); |
| 2617 | SIMD_EXTRACT16(rsp->xv[VS1REG], vs1[i], i); |
| 2618 | SIMD_EXTRACT16(rsp->xv[VS2REG], vs2[i], i); |
| 2619 | printf("%04x%04x\n", (UINT16)caccumh[i], (UINT16)caccumm[i]); |
| 2620 | } |
| 2621 | #endif |
| 2349 | 2622 | |
| 2623 | #if USE_SIMD |
| 2624 | __m128i vec7531 = _mm_and_si128(rsp->xv[VS1REG], vec_himask); |
| 2625 | __m128i vec6420 = _mm_slli_epi32(rsp->xv[VS1REG], 16); |
| 2626 | __m128i shuf2 = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 2627 | |
| 2628 | __m128i shuf7531 = _mm_and_si128(shuf2, vec_himask); |
| 2629 | __m128i shuf6420 = _mm_slli_epi32(shuf2, 16); |
| 2630 | |
| 2631 | __m128i upper7531 = _mm_mulhi_epi16(vec7531, shuf7531); |
| 2632 | __m128i lower7531 = _mm_srli_epi32(_mm_mullo_epi16(vec7531, shuf7531), 16); |
| 2633 | __m128i prod7531 = _mm_or_si128(upper7531, lower7531); |
| 2634 | |
| 2635 | __m128i upper6420 = _mm_mulhi_epi16(vec6420, shuf6420); |
| 2636 | __m128i lower6420 = _mm_srli_epi32(_mm_mullo_epi16(vec6420, shuf6420), 16); |
| 2637 | __m128i prod6420 = _mm_or_si128(upper6420, lower6420); |
| 2638 | |
| 2639 | #if 0 |
| 2640 | UINT16 svs1[8], svs2[8]; |
| 2641 | svs1[0] = _mm_extract_epi16(rsp->xv[VS1REG], 7); |
| 2642 | svs1[1] = _mm_extract_epi16(rsp->xv[VS1REG], 6); |
| 2643 | svs1[2] = _mm_extract_epi16(rsp->xv[VS1REG], 5); |
| 2644 | svs1[3] = _mm_extract_epi16(rsp->xv[VS1REG], 4); |
| 2645 | svs1[4] = _mm_extract_epi16(rsp->xv[VS1REG], 3); |
| 2646 | svs1[5] = _mm_extract_epi16(rsp->xv[VS1REG], 2); |
| 2647 | svs1[6] = _mm_extract_epi16(rsp->xv[VS1REG], 1); |
| 2648 | svs1[7] = _mm_extract_epi16(rsp->xv[VS1REG], 0); |
| 2649 | svs2[0] = _mm_extract_epi16(rsp->xv[VS2REG], 7); |
| 2650 | svs2[1] = _mm_extract_epi16(rsp->xv[VS2REG], 6); |
| 2651 | svs2[2] = _mm_extract_epi16(rsp->xv[VS2REG], 5); |
| 2652 | svs2[3] = _mm_extract_epi16(rsp->xv[VS2REG], 4); |
| 2653 | svs2[4] = _mm_extract_epi16(rsp->xv[VS2REG], 3); |
| 2654 | svs2[5] = _mm_extract_epi16(rsp->xv[VS2REG], 2); |
| 2655 | svs2[6] = _mm_extract_epi16(rsp->xv[VS2REG], 1); |
| 2656 | svs2[7] = _mm_extract_epi16(rsp->xv[VS2REG], 0); |
| 2657 | |
| 2658 | printf("%d\n", EL); |
| 2659 | |
| 2660 | UINT16 vecs[16]; |
| 2661 | vecs[0] = _mm_extract_epi16(vec7531, 0); |
| 2662 | vecs[1] = _mm_extract_epi16(vec7531, 1); |
| 2663 | vecs[2] = _mm_extract_epi16(vec7531, 2); |
| 2664 | vecs[3] = _mm_extract_epi16(vec7531, 3); |
| 2665 | vecs[4] = _mm_extract_epi16(vec7531, 4); |
| 2666 | vecs[5] = _mm_extract_epi16(vec7531, 5); |
| 2667 | vecs[6] = _mm_extract_epi16(vec7531, 6); |
| 2668 | vecs[7] = _mm_extract_epi16(vec7531, 7); |
| 2669 | vecs[8] = _mm_extract_epi16(vec6420, 0); |
| 2670 | vecs[9] = _mm_extract_epi16(vec6420, 1); |
| 2671 | vecs[10] = _mm_extract_epi16(vec6420, 2); |
| 2672 | vecs[11] = _mm_extract_epi16(vec6420, 3); |
| 2673 | vecs[12] = _mm_extract_epi16(vec6420, 4); |
| 2674 | vecs[13] = _mm_extract_epi16(vec6420, 5); |
| 2675 | vecs[14] = _mm_extract_epi16(vec6420, 6); |
| 2676 | vecs[15] = _mm_extract_epi16(vec6420, 7); |
| 2677 | printf("VS1 %04x%04x %04x%04x %04x%04x %04x%04x\n", vs1[0], vs1[1], vs1[2], vs1[3], vs1[4], vs1[5], vs1[6], vs1[7]); |
| 2678 | printf("VS2 %04x%04x %04x%04x %04x%04x %04x%04x\n", vs2[0], vs2[1], vs2[2], vs2[3], vs2[4], vs2[5], vs2[6], vs2[7]); |
| 2679 | printf("Vec %04x%04x %04x%04x %04x%04x %04x%04x\n", vecs[0], vecs[1], vecs[2], vecs[3], vecs[4], vecs[5], vecs[6], vecs[7]); |
| 2680 | printf("Vec %04x%04x %04x%04x %04x%04x %04x%04x\n", vecs[8], vecs[9], vecs[10], vecs[11], vecs[12], vecs[13], vecs[14], vecs[15]); |
| 2681 | |
| 2682 | UINT16 shufs[16]; |
| 2683 | shufs[0] = _mm_extract_epi16(shuf7531, 0); |
| 2684 | shufs[1] = _mm_extract_epi16(shuf7531, 1); |
| 2685 | shufs[2] = _mm_extract_epi16(shuf7531, 2); |
| 2686 | shufs[3] = _mm_extract_epi16(shuf7531, 3); |
| 2687 | shufs[4] = _mm_extract_epi16(shuf7531, 4); |
| 2688 | shufs[5] = _mm_extract_epi16(shuf7531, 5); |
| 2689 | shufs[6] = _mm_extract_epi16(shuf7531, 6); |
| 2690 | shufs[7] = _mm_extract_epi16(shuf7531, 7); |
| 2691 | shufs[8] = _mm_extract_epi16(shuf6420, 0); |
| 2692 | shufs[9] = _mm_extract_epi16(shuf6420, 1); |
| 2693 | shufs[10] = _mm_extract_epi16(shuf6420, 2); |
| 2694 | shufs[11] = _mm_extract_epi16(shuf6420, 3); |
| 2695 | shufs[12] = _mm_extract_epi16(shuf6420, 4); |
| 2696 | shufs[13] = _mm_extract_epi16(shuf6420, 5); |
| 2697 | shufs[14] = _mm_extract_epi16(shuf6420, 6); |
| 2698 | shufs[15] = _mm_extract_epi16(shuf6420, 7); |
| 2699 | printf("Shf %04x%04x %04x%04x %04x%04x %04x%04x\n", shufs[0], shufs[1], shufs[2], shufs[3], shufs[4], shufs[5], shufs[6], shufs[7]); |
| 2700 | printf("Shf %04x%04x %04x%04x %04x%04x %04x%04x\n", shufs[8], shufs[9], shufs[10], shufs[11], shufs[12], shufs[13], shufs[14], shufs[15]); |
| 2701 | |
| 2702 | UINT16 uppers[16]; |
| 2703 | uppers[0] = _mm_extract_epi16(upper7531, 0); |
| 2704 | uppers[1] = _mm_extract_epi16(upper7531, 1); |
| 2705 | uppers[2] = _mm_extract_epi16(upper7531, 2); |
| 2706 | uppers[3] = _mm_extract_epi16(upper7531, 3); |
| 2707 | uppers[4] = _mm_extract_epi16(upper7531, 4); |
| 2708 | uppers[5] = _mm_extract_epi16(upper7531, 5); |
| 2709 | uppers[6] = _mm_extract_epi16(upper7531, 6); |
| 2710 | uppers[7] = _mm_extract_epi16(upper7531, 7); |
| 2711 | uppers[8] = _mm_extract_epi16(upper6420, 0); |
| 2712 | uppers[9] = _mm_extract_epi16(upper6420, 1); |
| 2713 | uppers[10] = _mm_extract_epi16(upper6420, 2); |
| 2714 | uppers[11] = _mm_extract_epi16(upper6420, 3); |
| 2715 | uppers[12] = _mm_extract_epi16(upper6420, 4); |
| 2716 | uppers[13] = _mm_extract_epi16(upper6420, 5); |
| 2717 | uppers[14] = _mm_extract_epi16(upper6420, 6); |
| 2718 | uppers[15] = _mm_extract_epi16(upper6420, 7); |
| 2719 | printf("Upr %04x%04x %04x%04x %04x%04x %04x%04x\n", uppers[0], uppers[1], uppers[2], uppers[3], uppers[4], uppers[5], uppers[6], uppers[7]); |
| 2720 | printf("Upr %04x%04x %04x%04x %04x%04x %04x%04x\n", uppers[8], uppers[9], uppers[10], uppers[11], uppers[12], uppers[13], uppers[14], uppers[15]); |
| 2721 | |
| 2722 | UINT16 lowers[16]; |
| 2723 | lowers[0] = _mm_extract_epi16(lower7531, 0); |
| 2724 | lowers[1] = _mm_extract_epi16(lower7531, 1); |
| 2725 | lowers[2] = _mm_extract_epi16(lower7531, 2); |
| 2726 | lowers[3] = _mm_extract_epi16(lower7531, 3); |
| 2727 | lowers[4] = _mm_extract_epi16(lower7531, 4); |
| 2728 | lowers[5] = _mm_extract_epi16(lower7531, 5); |
| 2729 | lowers[6] = _mm_extract_epi16(lower7531, 6); |
| 2730 | lowers[7] = _mm_extract_epi16(lower7531, 7); |
| 2731 | lowers[8] = _mm_extract_epi16(lower6420, 0); |
| 2732 | lowers[9] = _mm_extract_epi16(lower6420, 1); |
| 2733 | lowers[10] = _mm_extract_epi16(lower6420, 2); |
| 2734 | lowers[11] = _mm_extract_epi16(lower6420, 3); |
| 2735 | lowers[12] = _mm_extract_epi16(lower6420, 4); |
| 2736 | lowers[13] = _mm_extract_epi16(lower6420, 5); |
| 2737 | lowers[14] = _mm_extract_epi16(lower6420, 6); |
| 2738 | lowers[15] = _mm_extract_epi16(lower6420, 7); |
| 2739 | printf("Lwr %04x%04x %04x%04x %04x%04x %04x%04x\n", lowers[0], lowers[1], lowers[2], lowers[3], lowers[4], lowers[5], lowers[6], lowers[7]); |
| 2740 | printf("Lwr %04x%04x %04x%04x %04x%04x %04x%04x\n", lowers[8], lowers[9], lowers[10], lowers[11], lowers[12], lowers[13], lowers[14], lowers[15]); |
| 2741 | |
| 2742 | UINT16 prods[16]; |
| 2743 | prods[0] = _mm_extract_epi16(prod7531, 0); |
| 2744 | prods[1] = _mm_extract_epi16(prod7531, 1); |
| 2745 | prods[2] = _mm_extract_epi16(prod7531, 2); |
| 2746 | prods[3] = _mm_extract_epi16(prod7531, 3); |
| 2747 | prods[4] = _mm_extract_epi16(prod7531, 4); |
| 2748 | prods[5] = _mm_extract_epi16(prod7531, 5); |
| 2749 | prods[6] = _mm_extract_epi16(prod7531, 6); |
| 2750 | prods[7] = _mm_extract_epi16(prod7531, 7); |
| 2751 | prods[8] = _mm_extract_epi16(prod6420, 0); |
| 2752 | prods[9] = _mm_extract_epi16(prod6420, 1); |
| 2753 | prods[10] = _mm_extract_epi16(prod6420, 2); |
| 2754 | prods[11] = _mm_extract_epi16(prod6420, 3); |
| 2755 | prods[12] = _mm_extract_epi16(prod6420, 4); |
| 2756 | prods[13] = _mm_extract_epi16(prod6420, 5); |
| 2757 | prods[14] = _mm_extract_epi16(prod6420, 6); |
| 2758 | prods[15] = _mm_extract_epi16(prod6420, 7); |
| 2759 | printf("Prd %04x%04x %04x%04x %04x%04x %04x%04x\n", prods[0], prods[1], prods[2], prods[3], prods[4], prods[5], prods[6], prods[7]); |
| 2760 | printf("Prd %04x%04x %04x%04x %04x%04x %04x%04x\n", prods[8], prods[9], prods[10], prods[11], prods[12], prods[13], prods[14], prods[15]); |
| 2761 | #endif |
| 2762 | |
| 2763 | __m128i accum7531 = _mm_set_epi16(ACCUM_H(7), ACCUM_M(7), ACCUM_H(5), ACCUM_M(5), ACCUM_H(3), ACCUM_M(3), ACCUM_H(1), ACCUM_M(1)); |
| 2764 | __m128i accum6420 = _mm_set_epi16(ACCUM_H(6), ACCUM_M(6), ACCUM_H(4), ACCUM_M(4), ACCUM_H(2), ACCUM_M(2), ACCUM_H(0), ACCUM_M(0)); |
| 2765 | accum7531 = _mm_add_epi32(accum7531, prod7531); |
| 2766 | accum6420 = _mm_add_epi32(accum6420, prod6420); |
| 2767 | __m128i accum7531_m = _mm_slli_epi32(_mm_and_si128(accum7531, vec_lomask), 16); |
| 2768 | __m128i accum7531_h = _mm_and_si128(accum7531, vec_himask); |
| 2769 | __m128i accum6420_m = _mm_and_si128(accum6420, vec_lomask); |
| 2770 | __m128i accum6420_h = _mm_srli_epi32(_mm_and_si128(accum6420, vec_himask), 16); |
| 2771 | __m128i newaccum_h = _mm_or_si128(accum7531_h, accum6420_h); |
| 2772 | __m128i newaccum_m = _mm_or_si128(accum7531_m, accum6420_m); |
| 2773 | #if 0 |
| 2774 | UINT16 accums[16]; |
| 2775 | accums[0] = _mm_extract_epi16(newaccum_h, 0); |
| 2776 | accums[1] = _mm_extract_epi16(newaccum_h, 1); |
| 2777 | accums[2] = _mm_extract_epi16(newaccum_h, 2); |
| 2778 | accums[3] = _mm_extract_epi16(newaccum_h, 3); |
| 2779 | accums[4] = _mm_extract_epi16(newaccum_h, 4); |
| 2780 | accums[5] = _mm_extract_epi16(newaccum_h, 5); |
| 2781 | accums[6] = _mm_extract_epi16(newaccum_h, 6); |
| 2782 | accums[7] = _mm_extract_epi16(newaccum_h, 7); |
| 2783 | accums[8] = _mm_extract_epi16(newaccum_m, 0); |
| 2784 | accums[9] = _mm_extract_epi16(newaccum_m, 1); |
| 2785 | accums[10] = _mm_extract_epi16(newaccum_m, 2); |
| 2786 | accums[11] = _mm_extract_epi16(newaccum_m, 3); |
| 2787 | accums[12] = _mm_extract_epi16(newaccum_m, 4); |
| 2788 | accums[13] = _mm_extract_epi16(newaccum_m, 5); |
| 2789 | accums[14] = _mm_extract_epi16(newaccum_m, 6); |
| 2790 | accums[15] = _mm_extract_epi16(newaccum_m, 7); |
| 2791 | printf("AcH %04x%04x %04x%04x %04x%04x %04x%04x\n", accums[0], accums[1], accums[2], accums[3], accums[4], accums[5], accums[6], accums[7]); |
| 2792 | printf("AcM %04x%04x %04x%04x %04x%04x %04x%04x\n", accums[8], accums[9], accums[10], accums[11], accums[12], accums[13], accums[14], accums[15]); |
| 2793 | #endif |
| 2794 | |
| 2795 | __m128i result = SATURATE_ACCUM1(newaccum_h, newaccum_m, 0x8000, 0x7fff); |
| 2796 | rsp->xv[VDREG] = result;//_mm_shuffle_epi8(result, vec_shuf_inverse[0]);//SATURATE_ACCUM1(newaccum_h, newaccum_m, 0x8000, 0x7fff); |
| 2797 | #if 0 |
| 2798 | UINT16 vresult[8]; |
| 2799 | vresult[0] = _mm_extract_epi16(result, 0); |
| 2800 | vresult[1] = _mm_extract_epi16(result, 1); |
| 2801 | vresult[2] = _mm_extract_epi16(result, 2); |
| 2802 | vresult[3] = _mm_extract_epi16(result, 3); |
| 2803 | vresult[4] = _mm_extract_epi16(result, 4); |
| 2804 | vresult[5] = _mm_extract_epi16(result, 5); |
| 2805 | vresult[6] = _mm_extract_epi16(result, 6); |
| 2806 | vresult[7] = _mm_extract_epi16(result, 7); |
| 2807 | printf("%04x %04x %04x %04x %04x %04x %04x %04x\n\n", vresult[0], vresult[1], vresult[2], vresult[3], vresult[4], vresult[5], vresult[6], vresult[7]); |
| 2808 | #endif |
| 2809 | ACCUM_H(0) = _mm_extract_epi16(newaccum_h, 0); |
| 2810 | ACCUM_H(1) = _mm_extract_epi16(newaccum_h, 1); |
| 2811 | ACCUM_H(2) = _mm_extract_epi16(newaccum_h, 2); |
| 2812 | ACCUM_H(3) = _mm_extract_epi16(newaccum_h, 3); |
| 2813 | ACCUM_H(4) = _mm_extract_epi16(newaccum_h, 4); |
| 2814 | ACCUM_H(5) = _mm_extract_epi16(newaccum_h, 5); |
| 2815 | ACCUM_H(6) = _mm_extract_epi16(newaccum_h, 6); |
| 2816 | ACCUM_H(7) = _mm_extract_epi16(newaccum_h, 7); |
| 2817 | ACCUM_M(0) = _mm_extract_epi16(newaccum_m, 0); |
| 2818 | ACCUM_M(1) = _mm_extract_epi16(newaccum_m, 1); |
| 2819 | ACCUM_M(2) = _mm_extract_epi16(newaccum_m, 2); |
| 2820 | ACCUM_M(3) = _mm_extract_epi16(newaccum_m, 3); |
| 2821 | ACCUM_M(4) = _mm_extract_epi16(newaccum_m, 4); |
| 2822 | ACCUM_M(5) = _mm_extract_epi16(newaccum_m, 5); |
| 2823 | ACCUM_M(6) = _mm_extract_epi16(newaccum_m, 6); |
| 2824 | ACCUM_M(7) = _mm_extract_epi16(newaccum_m, 7); |
| 2825 | #else |
| 2826 | INT16 vres[8]; |
| 2827 | for (int i = 0; i < 8; i++) |
| 2828 | { |
| 2829 | #if USE_SIMD |
| 2830 | UINT16 w1, w2; |
| 2831 | SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i); |
| 2832 | SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i)); |
| 2833 | INT32 s1 = (INT32)(INT16)w1; |
| 2834 | INT32 s2 = (INT32)(INT16)w2; |
| 2835 | #else |
| 2836 | INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 2837 | INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 2838 | #endif |
| 2839 | //INT32 s1 = (INT32)(INT16)vs1[i]; |
| 2840 | //INT32 s2 = (INT32)(INT16)vs2[VEC_EL_2(EL, i)]; |
| 2841 | |
| 2350 | 2842 | rsp->accum[i].l[1] += s1*s2; |
| 2351 | 2843 | |
| 2352 | | res = SATURATE_ACCUM1(rsp, i, 0x8000, 0x7fff); |
| 2844 | vres[i] = SATURATE_ACCUM1(rsp, i, 0x8000, 0x7fff); |
| 2353 | 2845 | |
| 2354 | | vres[i] = res; |
| 2846 | /*INT32 accum = (INT32)((caccumh[i] << 16) | caccumm[i]); |
| 2847 | accum += (INT32)s1*s2; |
| 2848 | caccumh[i] = (accum >> 16) & 0x0000ffff; |
| 2849 | caccumm[i] = accum & 0x0000ffff; |
| 2850 | |
| 2851 | vres[i] = C_SATURATE_ACCUM1(caccumh, caccumm, i, 0x8000, 0x7fff);*/ |
| 2355 | 2852 | } |
| 2853 | /* printf("%08x\n", rsp->pc); |
| 2854 | for (int i = 0; i < 8; i++) |
| 2855 | { |
| 2856 | if ((UINT16)vres[i] != vresult[i]) |
| 2857 | { |
| 2858 | printf("Result mismatch:\n"); |
| 2859 | printf(" C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vres[0], vres[1], vres[2], vres[3], vres[4], vres[5], vres[6], vres[7]); |
| 2860 | printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", vresult[0], vresult[1], vresult[2], vresult[3], vresult[4], vresult[5], vresult[6], vresult[7]); |
| 2861 | printf("High accumulator:\n"); |
| 2862 | printf(" C: %04x %04x %04x %04x %04x %04x %04x %04x\n", caccumh[0], caccumh[1], caccumh[2], caccumh[3], caccumh[4], caccumh[5], caccumh[6], caccumh[7]); |
| 2863 | printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", ACCUM_H(0), ACCUM_H(1), ACCUM_H(2), ACCUM_H(3), ACCUM_H(4), ACCUM_H(5), ACCUM_H(6), ACCUM_H(7)); |
| 2864 | printf("Mid accumulator:\n"); |
| 2865 | printf(" C: %04x %04x %04x %04x %04x %04x %04x %04x\n", caccumm[0], caccumm[1], caccumm[2], caccumm[3], caccumm[4], caccumm[5], caccumm[6], caccumm[7]); |
| 2866 | printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", ACCUM_M(0), ACCUM_M(1), ACCUM_M(2), ACCUM_M(3), ACCUM_M(4), ACCUM_M(5), ACCUM_M(6), ACCUM_M(7)); |
| 2867 | printf("VS1:\n"); |
| 2868 | printf(" C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vs1[0], vs1[1], vs1[2], vs1[3], vs1[4], vs1[5], vs1[6], vs1[7]); |
| 2869 | printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", svs1[0], svs1[1], svs1[2], svs1[3], svs1[4], svs1[5], svs1[6], svs1[7]); |
| 2870 | printf("VS2:\n"); |
| 2871 | printf(" C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vs2[0], vs2[1], vs2[2], vs2[3], vs2[4], vs2[5], vs2[6], vs2[7]); |
| 2872 | printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", svs2[0], svs2[1], svs2[2], svs2[3], svs2[4], svs2[5], svs2[6], svs2[7]); |
| 2873 | fatalerror("asdf"); |
| 2874 | } |
| 2875 | if (caccumh[i] != (UINT16)ACCUM_H(i)) |
| 2876 | { |
| 2877 | printf("Result:\n"); |
| 2878 | printf(" C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vres[0], vres[1], vres[2], vres[3], vres[4], vres[5], vres[6], vres[7]); |
| 2879 | printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", vresult[0], vresult[1], vresult[2], vresult[3], vresult[4], vresult[5], vresult[6], vresult[7]); |
| 2880 | printf("High accumulator mismatch:\n"); |
| 2881 | printf(" C: %04x %04x %04x %04x %04x %04x %04x %04x\n", caccumh[0], caccumh[1], caccumh[2], caccumh[3], caccumh[4], caccumh[5], caccumh[6], caccumh[7]); |
| 2882 | printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", ACCUM_H(0), ACCUM_H(1), ACCUM_H(2), ACCUM_H(3), ACCUM_H(4), ACCUM_H(5), ACCUM_H(6), ACCUM_H(7)); |
| 2883 | printf("Mid accumulator:\n"); |
| 2884 | printf(" C: %04x %04x %04x %04x %04x %04x %04x %04x\n", caccumm[0], caccumm[1], caccumm[2], caccumm[3], caccumm[4], caccumm[5], caccumm[6], caccumm[7]); |
| 2885 | printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", ACCUM_M(0), ACCUM_M(1), ACCUM_M(2), ACCUM_M(3), ACCUM_M(4), ACCUM_M(5), ACCUM_M(6), ACCUM_M(7)); |
| 2886 | printf("VS1:\n"); |
| 2887 | printf(" C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vs1[0], vs1[1], vs1[2], vs1[3], vs1[4], vs1[5], vs1[6], vs1[7]); |
| 2888 | printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", svs1[0], svs1[1], svs1[2], svs1[3], svs1[4], svs1[5], svs1[6], svs1[7]); |
| 2889 | printf("VS2:\n"); |
| 2890 | printf(" C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vs2[0], vs2[1], vs2[2], vs2[3], vs2[4], vs2[5], vs2[6], vs2[7]); |
| 2891 | printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", svs2[0], svs2[1], svs2[2], svs2[3], svs2[4], svs2[5], svs2[6], svs2[7]); |
| 2892 | fatalerror("asdf"); |
| 2893 | } |
| 2894 | if (caccumm[i] != (UINT16)ACCUM_M(i)) |
| 2895 | { |
| 2896 | printf("Result:\n"); |
| 2897 | printf(" C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vres[0], vres[1], vres[2], vres[3], vres[4], vres[5], vres[6], vres[7]); |
| 2898 | printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", vresult[0], vresult[1], vresult[2], vresult[3], vresult[4], vresult[5], vresult[6], vresult[7]); |
| 2899 | printf("High accumulator:\n"); |
| 2900 | printf(" C: %04x %04x %04x %04x %04x %04x %04x %04x\n", caccumh[0], caccumh[1], caccumh[2], caccumh[3], caccumh[4], caccumh[5], caccumh[6], caccumh[7]); |
| 2901 | printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", ACCUM_H(0), ACCUM_H(1), ACCUM_H(2), ACCUM_H(3), ACCUM_H(4), ACCUM_H(5), ACCUM_H(6), ACCUM_H(7)); |
| 2902 | printf("Mid accumulator mismatch:\n"); |
| 2903 | printf(" C: %04x %04x %04x %04x %04x %04x %04x %04x\n", caccumm[0], caccumm[1], caccumm[2], caccumm[3], caccumm[4], caccumm[5], caccumm[6], caccumm[7]); |
| 2904 | printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", ACCUM_M(0), ACCUM_M(1), ACCUM_M(2), ACCUM_M(3), ACCUM_M(4), ACCUM_M(5), ACCUM_M(6), ACCUM_M(7)); |
| 2905 | printf("VS1:\n"); |
| 2906 | printf(" C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vs1[0], vs1[1], vs1[2], vs1[3], vs1[4], vs1[5], vs1[6], vs1[7]); |
| 2907 | printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", svs1[0], svs1[1], svs1[2], svs1[3], svs1[4], svs1[5], svs1[6], svs1[7]); |
| 2908 | printf("VS2:\n"); |
| 2909 | printf(" C: %04x %04x %04x %04x %04x %04x %04x %04x\n", vs2[0], vs2[1], vs2[2], vs2[3], vs2[4], vs2[5], vs2[6], vs2[7]); |
| 2910 | printf("SIMD: %04x %04x %04x %04x %04x %04x %04x %04x\n", svs2[0], svs2[1], svs2[2], svs2[3], svs2[4], svs2[5], svs2[6], svs2[7]); |
| 2911 | fatalerror("asdf"); |
| 2912 | } |
| 2913 | }*/ |
| 2356 | 2914 | WRITEBACK_RESULT(); |
| 2915 | #endif |
| 2357 | 2916 | } |
| 2358 | 2917 | |
| 2359 | 2918 | INLINE void cfunc_rsp_vadd(void *param) |
| 2360 | 2919 | { |
| 2361 | 2920 | rsp_state *rsp = (rsp_state*)param; |
| 2362 | 2921 | int op = rsp->impstate->arg0; |
| 2363 | | INT16 vres[8] = { 0 }; |
| 2364 | | //int i; |
| 2365 | 2922 | // 31 25 24 20 15 10 5 0 |
| 2366 | 2923 | // ------------------------------------------------------ |
| 2367 | 2924 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010000 | |
| r24005 | r24006 | |
| 2369 | 2926 | // |
| 2370 | 2927 | // Adds two vector registers and carry flag, the result is saturated to 32767 |
| 2371 | 2928 | |
| 2372 | | int sel; |
| 2373 | | INT32 s1, s2, r; |
| 2929 | #if USE_SIMD |
| 2930 | __m128i shuffled = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 2931 | __m128i unsat = rsp->xv[VS1REG]; |
| 2932 | __m128i carry = _mm_set_epi16(CARRY_FLAG(7), CARRY_FLAG(6), CARRY_FLAG(5), CARRY_FLAG(4), |
| 2933 | CARRY_FLAG(3), CARRY_FLAG(2), CARRY_FLAG(1), CARRY_FLAG(0)); |
| 2934 | |
| 2935 | unsat = _mm_add_epi16(unsat, shuffled); |
| 2936 | unsat = _mm_add_epi16(unsat, carry); |
| 2937 | |
| 2938 | __m128i maxval = _mm_set_epi64x(0x7fff7fff7fff7fffL, 0x7fff7fff7fff7fffL); |
| 2939 | __m128i minval = _mm_set_epi64x(0x8000800080008000L, 0x8000800080008000L); |
| 2940 | |
| 2941 | __m128i addvec = _mm_adds_epi16(rsp->xv[VS1REG], shuffled); |
| 2942 | |
| 2943 | __m128i carrymask = _mm_cmpeq_epi16(addvec, maxval); |
| 2944 | carrymask = _mm_xor_si128(carrymask, vec_neg1); |
| 2945 | carry = _mm_and_si128(carry, carrymask); |
| 2946 | |
| 2947 | carrymask = _mm_cmpeq_epi16(addvec, minval); |
| 2948 | carrymask = _mm_xor_si128(carrymask, vec_neg1); |
| 2949 | carry = _mm_and_si128(carry, carrymask); |
| 2950 | |
| 2951 | rsp->xv[VDREG] = _mm_add_epi16(addvec, carry); |
| 2952 | |
| 2953 | rsp->accum_l = unsat; |
| 2954 | ACCUM_L(0) = _mm_extract_epi16(unsat, 0); |
| 2955 | ACCUM_L(1) = _mm_extract_epi16(unsat, 1); |
| 2956 | ACCUM_L(2) = _mm_extract_epi16(unsat, 2); |
| 2957 | ACCUM_L(3) = _mm_extract_epi16(unsat, 3); |
| 2958 | ACCUM_L(4) = _mm_extract_epi16(unsat, 4); |
| 2959 | ACCUM_L(5) = _mm_extract_epi16(unsat, 5); |
| 2960 | ACCUM_L(6) = _mm_extract_epi16(unsat, 6); |
| 2961 | ACCUM_L(7) = _mm_extract_epi16(unsat, 7); |
| 2962 | |
| 2963 | CLEAR_ZERO_FLAGS(); |
| 2964 | CLEAR_CARRY_FLAGS(); |
| 2965 | #else |
| 2966 | INT16 vres[8] = { 0 }; |
| 2374 | 2967 | for (int i = 0; i < 8; i++) |
| 2375 | 2968 | { |
| 2376 | | sel = VEC_EL_2(EL, i); |
| 2377 | | s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 2378 | | s2 = (INT32)(INT16)VREG_S(VS2REG, sel); |
| 2379 | | r = s1 + s2 + CARRY_FLAG(i); |
| 2969 | #if USE_SIMD |
| 2970 | UINT16 w1, w2; |
| 2971 | SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i); |
| 2972 | SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i)); |
| 2973 | INT32 s1 = (INT32)(INT16)w1; |
| 2974 | INT32 s2 = (INT32)(INT16)w2; |
| 2975 | #else |
| 2976 | INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 2977 | INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 2978 | #endif |
| 2979 | INT32 r = s1 + s2 + CARRY_FLAG(i); |
| 2380 | 2980 | |
| 2981 | #if USE_SIMD |
| 2982 | SIMD_INSERT16(rsp->accum_l, (INT16)(r), i); |
| 2983 | #else |
| 2381 | 2984 | ACCUM_L(i) = (INT16)(r); |
| 2985 | #endif |
| 2382 | 2986 | |
| 2383 | 2987 | if (r > 32767) r = 32767; |
| 2384 | 2988 | if (r < -32768) r = -32768; |
| r24005 | r24006 | |
| 2387 | 2991 | CLEAR_ZERO_FLAGS(); |
| 2388 | 2992 | CLEAR_CARRY_FLAGS(); |
| 2389 | 2993 | WRITEBACK_RESULT(); |
| 2994 | #endif |
| 2390 | 2995 | } |
| 2391 | 2996 | |
| 2392 | 2997 | INLINE void cfunc_rsp_vsub(void *param) |
| 2393 | 2998 | { |
| 2394 | 2999 | rsp_state *rsp = (rsp_state*)param; |
| 2395 | 3000 | int op = rsp->impstate->arg0; |
| 2396 | | INT16 vres[8]; |
| 2397 | | int i; |
| 2398 | 3001 | // 31 25 24 20 15 10 5 0 |
| 2399 | 3002 | // ------------------------------------------------------ |
| 2400 | 3003 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010001 | |
| r24005 | r24006 | |
| 2404 | 3007 | |
| 2405 | 3008 | // TODO: check VS2REG == VDREG |
| 2406 | 3009 | |
| 2407 | | int sel; |
| 2408 | | INT32 s1, s2, r; |
| 2409 | | for (i = 0; i < 8; i++) |
| 3010 | #if USE_SIMD |
| 3011 | __m128i shuffled = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 3012 | __m128i unsat = rsp->xv[VS1REG]; |
| 3013 | __m128i carry = _mm_set_epi16(CARRY_FLAG(7), CARRY_FLAG(6), CARRY_FLAG(5), CARRY_FLAG(4), |
| 3014 | CARRY_FLAG(3), CARRY_FLAG(2), CARRY_FLAG(1), CARRY_FLAG(0)); |
| 3015 | |
| 3016 | unsat = _mm_sub_epi16(unsat, shuffled); |
| 3017 | unsat = _mm_sub_epi16(unsat, carry); |
| 3018 | |
| 3019 | __m128i minval = _mm_set_epi64x(0x8000800080008000L, 0x8000800080008000L); |
| 3020 | |
| 3021 | __m128i subvec = _mm_subs_epi16(rsp->xv[VS1REG], shuffled); |
| 3022 | |
| 3023 | __m128i carrymask = _mm_cmpeq_epi16(subvec, minval); |
| 3024 | carrymask = _mm_xor_si128(carrymask, vec_neg1); |
| 3025 | carry = _mm_and_si128(carry, carrymask); |
| 3026 | |
| 3027 | rsp->xv[VDREG] = _mm_sub_epi16(subvec, carry); |
| 3028 | |
| 3029 | rsp->accum_l = unsat; |
| 3030 | ACCUM_L(0) = _mm_extract_epi16(unsat, 0); |
| 3031 | ACCUM_L(1) = _mm_extract_epi16(unsat, 1); |
| 3032 | ACCUM_L(2) = _mm_extract_epi16(unsat, 2); |
| 3033 | ACCUM_L(3) = _mm_extract_epi16(unsat, 3); |
| 3034 | ACCUM_L(4) = _mm_extract_epi16(unsat, 4); |
| 3035 | ACCUM_L(5) = _mm_extract_epi16(unsat, 5); |
| 3036 | ACCUM_L(6) = _mm_extract_epi16(unsat, 6); |
| 3037 | ACCUM_L(7) = _mm_extract_epi16(unsat, 7); |
| 3038 | |
| 3039 | CLEAR_ZERO_FLAGS(); |
| 3040 | CLEAR_CARRY_FLAGS(); |
| 3041 | #else |
| 3042 | INT16 vres[8]; |
| 3043 | for (int i = 0; i < 8; i++) |
| 2410 | 3044 | { |
| 2411 | | sel = VEC_EL_2(EL, i); |
| 2412 | | s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 2413 | | s2 = (INT32)(INT16)VREG_S(VS2REG, sel); |
| 2414 | | r = s1 - s2 - CARRY_FLAG(i); |
| 3045 | #if USE_SIMD |
| 3046 | UINT16 w1, w2; |
| 3047 | SIMD_EXTRACT16(rsp->xv[VS1REG], w1, i); |
| 3048 | SIMD_EXTRACT16(rsp->xv[VS2REG], w2, VEC_EL_2(EL, i)); |
| 3049 | INT32 s1 = (INT32)(INT16)w1; |
| 3050 | INT32 s2 = (INT32)(INT16)w2; |
| 3051 | #else |
| 3052 | INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); |
| 3053 | INT32 s2 = (INT32)(INT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 3054 | #endif |
| 3055 | INT32 r = s1 - s2 - CARRY_FLAG(i); |
| 2415 | 3056 | |
| 3057 | #if USE_SIMD |
| 3058 | SIMD_INSERT16(rsp->accum_l, (INT16)(r), i); |
| 3059 | #else |
| 2416 | 3060 | ACCUM_L(i) = (INT16)(r); |
| 3061 | #endif |
| 2417 | 3062 | |
| 2418 | 3063 | if (r > 32767) r = 32767; |
| 2419 | 3064 | if (r < -32768) r = -32768; |
| r24005 | r24006 | |
| 2423 | 3068 | CLEAR_ZERO_FLAGS(); |
| 2424 | 3069 | CLEAR_CARRY_FLAGS(); |
| 2425 | 3070 | WRITEBACK_RESULT(); |
| 3071 | #endif |
| 2426 | 3072 | } |
| 2427 | 3073 | |
| 2428 | 3074 | INLINE void cfunc_rsp_vabs(void *param) |
| r24005 | r24006 | |
| 2430 | 3076 | rsp_state *rsp = (rsp_state*)param; |
| 2431 | 3077 | int op = rsp->impstate->arg0; |
| 2432 | 3078 | INT16 vres[8]; |
| 2433 | | int i; |
| 2434 | 3079 | // 31 25 24 20 15 10 5 0 |
| 2435 | 3080 | // ------------------------------------------------------ |
| 2436 | 3081 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010011 | |
| r24005 | r24006 | |
| 2439 | 3084 | // Changes the sign of source register 2 if source register 1 is negative and stores |
| 2440 | 3085 | // the result to destination register |
| 2441 | 3086 | |
| 2442 | | int sel; |
| 2443 | | INT16 s1, s2; |
| 2444 | | for (i=0; i < 8; i++) |
| 3087 | for (int i = 0; i < 8; i++) |
| 2445 | 3088 | { |
| 2446 | | sel = VEC_EL_2(EL, i); |
| 2447 | | s1 = (INT16)VREG_S(VS1REG, i); |
| 2448 | | s2 = (INT16)VREG_S(VS2REG, sel); |
| 3089 | #if USE_SIMD |
| 3090 | INT16 s1, s2; |
| 3091 | SIMD_EXTRACT16(rsp->xv[VS1REG], s1, i); |
| 3092 | SIMD_EXTRACT16(rsp->xv[VS2REG], s2, VEC_EL_2(EL, i)); |
| 3093 | #else |
| 3094 | INT16 s1 = (INT16)VREG_S(VS1REG, i); |
| 3095 | INT16 s2 = (INT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 3096 | #endif |
| 2449 | 3097 | |
| 2450 | 3098 | if (s1 < 0) |
| 2451 | 3099 | { |
| r24005 | r24006 | |
| 2467 | 3115 | vres[i] = 0; |
| 2468 | 3116 | } |
| 2469 | 3117 | |
| 3118 | #if USE_SIMD |
| 3119 | SIMD_INSERT16(rsp->accum_l, vres[i], i); |
| 3120 | #else |
| 2470 | 3121 | ACCUM_L(i) = vres[i]; |
| 3122 | #endif |
| 2471 | 3123 | } |
| 2472 | 3124 | WRITEBACK_RESULT(); |
| 2473 | 3125 | } |
| r24005 | r24006 | |
| 2476 | 3128 | { |
| 2477 | 3129 | rsp_state *rsp = (rsp_state*)param; |
| 2478 | 3130 | int op = rsp->impstate->arg0; |
| 2479 | | INT16 vres[8]; |
| 2480 | | int i; |
| 2481 | 3131 | // 31 25 24 20 15 10 5 0 |
| 2482 | 3132 | // ------------------------------------------------------ |
| 2483 | 3133 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010100 | |
| r24005 | r24006 | |
| 2487 | 3137 | |
| 2488 | 3138 | // TODO: check VS2REG = VDREG |
| 2489 | 3139 | |
| 2490 | | int sel; |
| 2491 | | INT32 s1, s2, r; |
| 2492 | 3140 | CLEAR_ZERO_FLAGS(); |
| 2493 | 3141 | CLEAR_CARRY_FLAGS(); |
| 2494 | 3142 | |
| 2495 | | for (i=0; i < 8; i++) |
| 3143 | #if USE_SIMD |
| 3144 | __m128i shuf2 = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 3145 | __m128i vec7531 = _mm_and_si128(rsp->xv[VS1REG], vec_lomask); |
| 3146 | __m128i vec6420 = _mm_srli_epi32(rsp->xv[VS1REG], 16); |
| 3147 | __m128i shuf7531 = _mm_and_si128(shuf2, vec_lomask); |
| 3148 | __m128i shuf6420 = _mm_srli_epi32(shuf2, 16); |
| 3149 | __m128i sum7531 = _mm_add_epi32(vec7531, shuf7531); |
| 3150 | __m128i sum6420 = _mm_add_epi32(vec6420, shuf6420); |
| 3151 | |
| 3152 | __m128i over7531 = _mm_and_si128(_mm_xor_si128(_mm_cmpeq_epi16(sum7531, vec_zero), vec_neg1), vec_overmask); |
| 3153 | __m128i over6420 = _mm_and_si128(_mm_xor_si128(_mm_cmpeq_epi16(sum6420, vec_zero), vec_neg1), vec_overmask); |
| 3154 | |
| 3155 | rsp->flag[0] |= _mm_extract_epi16(over7531, 7) << 6; |
| 3156 | rsp->flag[0] |= _mm_extract_epi16(over7531, 5) << 4; |
| 3157 | rsp->flag[0] |= _mm_extract_epi16(over7531, 3) << 2; |
| 3158 | rsp->flag[0] |= _mm_extract_epi16(over7531, 1) << 0; |
| 3159 | rsp->flag[0] |= _mm_extract_epi16(over6420, 7) << 7; |
| 3160 | rsp->flag[0] |= _mm_extract_epi16(over6420, 5) << 5; |
| 3161 | rsp->flag[0] |= _mm_extract_epi16(over6420, 3) << 3; |
| 3162 | rsp->flag[0] |= _mm_extract_epi16(over6420, 1) << 1; |
| 3163 | rsp->xv[VDREG] = _mm_or_si128(_mm_slli_epi32(sum6420, 16), sum7531); |
| 3164 | rsp->accum_l = rsp->xv[VDREG]; |
| 3165 | |
| 3166 | #else |
| 3167 | INT16 vres[8] = { 0 }; |
| 3168 | for (int i = 0; i < 8; i++) |
| 2496 | 3169 | { |
| 2497 | | sel = VEC_EL_2(EL, i); |
| 2498 | | s1 = (UINT32)(UINT16)VREG_S(VS1REG, i); |
| 2499 | | s2 = (UINT32)(UINT16)VREG_S(VS2REG, sel); |
| 2500 | | r = s1 + s2; |
| 3170 | INT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, i); |
| 3171 | INT32 s2 = (UINT32)(UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 3172 | INT32 r = s1 + s2; |
| 2501 | 3173 | |
| 2502 | | vres[i] = (INT16)(r); |
| 2503 | | ACCUM_L(i) = (INT16)(r); |
| 3174 | vres[i] = (INT16)r; |
| 3175 | ACCUM_L(i) = (INT16)r; |
| 2504 | 3176 | |
| 2505 | 3177 | if (r & 0xffff0000) |
| 2506 | 3178 | { |
| r24005 | r24006 | |
| 2508 | 3180 | } |
| 2509 | 3181 | } |
| 2510 | 3182 | WRITEBACK_RESULT(); |
| 3183 | #endif |
| 2511 | 3184 | } |
| 2512 | 3185 | |
| 2513 | 3186 | INLINE void cfunc_rsp_vsubc(void *param) |
| 2514 | 3187 | { |
| 2515 | 3188 | rsp_state *rsp = (rsp_state*)param; |
| 2516 | 3189 | int op = rsp->impstate->arg0; |
| 2517 | | INT16 vres[8]; |
| 2518 | | int i; |
| 2519 | 3190 | // 31 25 24 20 15 10 5 0 |
| 2520 | 3191 | // ------------------------------------------------------ |
| 2521 | 3192 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010101 | |
| r24005 | r24006 | |
| 2525 | 3196 | |
| 2526 | 3197 | // TODO: check VS2REG = VDREG |
| 2527 | 3198 | |
| 2528 | | int sel; |
| 2529 | | INT32 s1, s2, r; |
| 2530 | 3199 | CLEAR_ZERO_FLAGS(); |
| 2531 | 3200 | CLEAR_CARRY_FLAGS(); |
| 2532 | 3201 | |
| 2533 | | for (i=0; i < 8; i++) |
| 3202 | #if USE_SIMD |
| 3203 | __m128i shuf2 = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 3204 | __m128i vec7531 = _mm_and_si128(rsp->xv[VS1REG], vec_lomask); |
| 3205 | __m128i vec6420 = _mm_srli_epi32(rsp->xv[VS1REG], 16); |
| 3206 | __m128i shuf7531 = _mm_and_si128(shuf2, vec_lomask); |
| 3207 | __m128i shuf6420 = _mm_srli_epi32(shuf2, 16); |
| 3208 | __m128i sum7531 = _mm_sub_epi32(vec7531, shuf7531); |
| 3209 | __m128i sum6420 = _mm_sub_epi32(vec6420, shuf6420); |
| 3210 | |
| 3211 | __m128i over7531 = _mm_and_si128(_mm_xor_si128(_mm_cmpeq_epi16(sum7531, vec_zero), vec_neg1), vec_overmask); |
| 3212 | __m128i over6420 = _mm_and_si128(_mm_xor_si128(_mm_cmpeq_epi16(sum6420, vec_zero), vec_neg1), vec_overmask); |
| 3213 | sum7531 = _mm_and_si128(sum7531, vec_lomask); |
| 3214 | sum6420 = _mm_and_si128(sum6420, vec_lomask); |
| 3215 | __m128i zero7531 = _mm_and_si128(_mm_xor_si128(_mm_cmpeq_epi16(sum7531, vec_zero), vec_neg1), vec_zerobits); |
| 3216 | __m128i zero6420 = _mm_and_si128(_mm_xor_si128(_mm_cmpeq_epi16(sum6420, vec_zero), vec_neg1), vec_zerobits); |
| 3217 | |
| 3218 | rsp->flag[0] |= _mm_extract_epi16(over7531, 7) << 6; |
| 3219 | rsp->flag[0] |= _mm_extract_epi16(over7531, 5) << 4; |
| 3220 | rsp->flag[0] |= _mm_extract_epi16(over7531, 3) << 2; |
| 3221 | rsp->flag[0] |= _mm_extract_epi16(over7531, 1) << 0; |
| 3222 | rsp->flag[0] |= _mm_extract_epi16(over6420, 7) << 7; |
| 3223 | rsp->flag[0] |= _mm_extract_epi16(over6420, 5) << 5; |
| 3224 | rsp->flag[0] |= _mm_extract_epi16(over6420, 3) << 3; |
| 3225 | rsp->flag[0] |= _mm_extract_epi16(over6420, 1) << 1; |
| 3226 | |
| 3227 | rsp->flag[0] |= _mm_extract_epi16(zero7531, 6) << 14; |
| 3228 | rsp->flag[0] |= _mm_extract_epi16(zero7531, 4) << 12; |
| 3229 | rsp->flag[0] |= _mm_extract_epi16(zero7531, 2) << 10; |
| 3230 | rsp->flag[0] |= _mm_extract_epi16(zero7531, 0) << 8; |
| 3231 | rsp->flag[0] |= _mm_extract_epi16(zero6420, 6) << 15; |
| 3232 | rsp->flag[0] |= _mm_extract_epi16(zero6420, 4) << 13; |
| 3233 | rsp->flag[0] |= _mm_extract_epi16(zero6420, 2) << 11; |
| 3234 | rsp->flag[0] |= _mm_extract_epi16(zero6420, 0) << 9; |
| 3235 | |
| 3236 | rsp->xv[VDREG] = _mm_or_si128(_mm_slli_epi32(sum6420, 16), sum7531); |
| 3237 | rsp->accum_l = rsp->xv[VDREG]; |
| 3238 | |
| 3239 | #else |
| 3240 | INT16 vres[8]; |
| 3241 | for (int i = 0; i < 8; i++) |
| 2534 | 3242 | { |
| 2535 | | sel = VEC_EL_2(EL, i); |
| 2536 | | s1 = (UINT32)(UINT16)VREG_S(VS1REG, i); |
| 2537 | | s2 = (UINT32)(UINT16)VREG_S(VS2REG, sel); |
| 2538 | | r = s1 - s2; |
| 3243 | INT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, i); |
| 3244 | INT32 s2 = (UINT32)(UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 3245 | INT32 r = s1 - s2; |
| 2539 | 3246 | |
| 2540 | 3247 | vres[i] = (INT16)(r); |
| 2541 | 3248 | ACCUM_L(i) = (UINT16)(r); |
| r24005 | r24006 | |
| 2550 | 3257 | } |
| 2551 | 3258 | } |
| 2552 | 3259 | WRITEBACK_RESULT(); |
| 3260 | #endif |
| 2553 | 3261 | } |
| 2554 | 3262 | |
| 2555 | 3263 | INLINE void cfunc_rsp_vsaw(void *param) |
| r24005 | r24006 | |
| 2569 | 3277 | { |
| 2570 | 3278 | for (int i = 0; i < 8; i++) |
| 2571 | 3279 | { |
| 3280 | #if USE_SIMD |
| 3281 | rsp->xv[VDREG] = _mm_insert_epi16(rsp->xv[VDREG], ACCUM_H(i), i); |
| 3282 | #else |
| 2572 | 3283 | W_VREG_S(VDREG, i) = ACCUM_H(i); |
| 3284 | #endif |
| 2573 | 3285 | } |
| 2574 | 3286 | break; |
| 2575 | 3287 | } |
| r24005 | r24006 | |
| 2577 | 3289 | { |
| 2578 | 3290 | for (int i = 0; i < 8; i++) |
| 2579 | 3291 | { |
| 3292 | #if USE_SIMD |
| 3293 | rsp->xv[VDREG] = _mm_insert_epi16(rsp->xv[VDREG], ACCUM_M(i), i); |
| 3294 | #else |
| 2580 | 3295 | W_VREG_S(VDREG, i) = ACCUM_M(i); |
| 3296 | #endif |
| 2581 | 3297 | } |
| 2582 | 3298 | break; |
| 2583 | 3299 | } |
| 2584 | 3300 | case 0x0a: // VSAWL |
| 2585 | 3301 | { |
| 3302 | #if USE_SIMD |
| 3303 | rsp->xv[VDREG] = rsp->accum_l; |
| 3304 | #else |
| 2586 | 3305 | for (int i = 0; i < 8; i++) |
| 2587 | 3306 | { |
| 2588 | 3307 | W_VREG_S(VDREG, i) = ACCUM_L(i); |
| 2589 | 3308 | } |
| 3309 | #endif |
| 2590 | 3310 | break; |
| 2591 | 3311 | } |
| 2592 | 3312 | default: fatalerror("RSP: VSAW: el = %d\n", EL); |
| r24005 | r24006 | |
| 2607 | 3327 | // Sets compare flags if elements in VS1 are less than VS2 |
| 2608 | 3328 | // Moves the element in VS2 to destination vector |
| 2609 | 3329 | |
| 2610 | | int sel; |
| 2611 | 3330 | rsp->flag[1] = 0; |
| 2612 | 3331 | |
| 2613 | 3332 | for (int i = 0; i < 8; i++) |
| 2614 | 3333 | { |
| 2615 | | sel = VEC_EL_2(EL, i); |
| 2616 | | |
| 2617 | | if (VREG_S(VS1REG, i) < VREG_S(VS2REG, sel)) |
| 3334 | #if USE_SIMD |
| 3335 | INT16 s1, s2; |
| 3336 | SIMD_EXTRACT16(rsp->xv[VS1REG], s1, i); |
| 3337 | SIMD_EXTRACT16(rsp->xv[VS2REG], s2, VEC_EL_2(EL, i)); |
| 3338 | #else |
| 3339 | INT16 s1 = (INT16)VREG_S(VS1REG, i); |
| 3340 | INT16 s2 = (INT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 3341 | #endif |
| 3342 | if (s1 < s2) |
| 2618 | 3343 | { |
| 2619 | 3344 | SET_COMPARE_FLAG(i); |
| 2620 | 3345 | } |
| 2621 | | else if (VREG_S(VS1REG, i) == VREG_S(VS2REG, sel)) |
| 3346 | else if (s1 == s2) |
| 2622 | 3347 | { |
| 2623 | 3348 | if (ZERO_FLAG(i) == 1 && CARRY_FLAG(i) != 0) |
| 2624 | 3349 | { |
| r24005 | r24006 | |
| 2628 | 3353 | |
| 2629 | 3354 | if (COMPARE_FLAG(i)) |
| 2630 | 3355 | { |
| 2631 | | vres[i] = VREG_S(VS1REG, i); |
| 3356 | vres[i] = s1; |
| 2632 | 3357 | } |
| 2633 | 3358 | else |
| 2634 | 3359 | { |
| 2635 | | vres[i] = VREG_S(VS2REG, sel); |
| 3360 | vres[i] = s2; |
| 2636 | 3361 | } |
| 2637 | 3362 | |
| 3363 | #if USE_SIMD |
| 3364 | SIMD_INSERT16(rsp->accum_l, vres[i], i); |
| 3365 | #else |
| 2638 | 3366 | ACCUM_L(i) = vres[i]; |
| 3367 | #endif |
| 2639 | 3368 | } |
| 2640 | 3369 | |
| 2641 | 3370 | rsp->flag[0] = 0; |
| r24005 | r24006 | |
| 2647 | 3376 | rsp_state *rsp = (rsp_state*)param; |
| 2648 | 3377 | int op = rsp->impstate->arg0; |
| 2649 | 3378 | INT16 vres[8]; |
| 2650 | | int i; |
| 3379 | |
| 2651 | 3380 | // 31 25 24 20 15 10 5 0 |
| 2652 | 3381 | // ------------------------------------------------------ |
| 2653 | 3382 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100001 | |
| r24005 | r24006 | |
| 2656 | 3385 | // Sets compare flags if elements in VS1 are equal with VS2 |
| 2657 | 3386 | // Moves the element in VS2 to destination vector |
| 2658 | 3387 | |
| 2659 | | int sel; |
| 2660 | 3388 | rsp->flag[1] = 0; |
| 2661 | 3389 | |
| 2662 | | for (i = 0; i < 8; i++) |
| 3390 | for (int i = 0; i < 8; i++) |
| 2663 | 3391 | { |
| 2664 | | sel = VEC_EL_2(EL, i); |
| 2665 | | |
| 2666 | | if ((VREG_S(VS1REG, i) == VREG_S(VS2REG, sel)) && ZERO_FLAG(i) == 0) |
| 3392 | #if USE_SIMD |
| 3393 | INT16 s1, s2; |
| 3394 | SIMD_EXTRACT16(rsp->xv[VS1REG], s1, i); |
| 3395 | SIMD_EXTRACT16(rsp->xv[VS2REG], s2, VEC_EL_2(EL, i)); |
| 3396 | #else |
| 3397 | INT16 s1 = (INT16)VREG_S(VS1REG, i); |
| 3398 | INT16 s2 = (INT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 3399 | #endif |
| 3400 | if ((s1 == s2) && ZERO_FLAG(i) == 0) |
| 2667 | 3401 | { |
| 2668 | 3402 | SET_COMPARE_FLAG(i); |
| 2669 | | vres[i] = VREG_S(VS1REG, i); |
| 3403 | vres[i] = s1; |
| 2670 | 3404 | } |
| 2671 | 3405 | else |
| 2672 | 3406 | { |
| 2673 | | vres[i] = VREG_S(VS2REG, sel); |
| 3407 | vres[i] = s2; |
| 2674 | 3408 | } |
| 3409 | #if USE_SIMD |
| 3410 | SIMD_INSERT16(rsp->accum_l, vres[i], i); |
| 3411 | #else |
| 2675 | 3412 | ACCUM_L(i) = vres[i]; |
| 3413 | #endif |
| 2676 | 3414 | } |
| 2677 | 3415 | |
| 2678 | 3416 | rsp->flag[0] = 0; |
| r24005 | r24006 | |
| 2684 | 3422 | rsp_state *rsp = (rsp_state*)param; |
| 2685 | 3423 | int op = rsp->impstate->arg0; |
| 2686 | 3424 | INT16 vres[8]; |
| 2687 | | int i; |
| 3425 | |
| 2688 | 3426 | // 31 25 24 20 15 10 5 0 |
| 2689 | 3427 | // ------------------------------------------------------ |
| 2690 | 3428 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100010 | |
| r24005 | r24006 | |
| 2693 | 3431 | // Sets compare flags if elements in VS1 are not equal with VS2 |
| 2694 | 3432 | // Moves the element in VS2 to destination vector |
| 2695 | 3433 | |
| 2696 | | int sel; |
| 2697 | 3434 | rsp->flag[1] = 0; |
| 2698 | 3435 | |
| 2699 | | for (i=0; i < 8; i++)//?????????? ???? |
| 3436 | for (int i = 0; i < 8; i++) |
| 2700 | 3437 | { |
| 2701 | | sel = VEC_EL_2(EL, i); |
| 2702 | | |
| 2703 | | if (VREG_S(VS1REG, i) != VREG_S(VS2REG, sel)) |
| 3438 | #if USE_SIMD |
| 3439 | INT16 s1, s2; |
| 3440 | SIMD_EXTRACT16(rsp->xv[VS1REG], s1, i); |
| 3441 | SIMD_EXTRACT16(rsp->xv[VS2REG], s2, VEC_EL_2(EL, i)); |
| 3442 | #else |
| 3443 | INT16 s1 = (INT16)VREG_S(VS1REG, i); |
| 3444 | INT16 s2 = (INT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 3445 | #endif |
| 3446 | if (s1 != s2) |
| 2704 | 3447 | { |
| 2705 | 3448 | SET_COMPARE_FLAG(i); |
| 2706 | 3449 | } |
| r24005 | r24006 | |
| 2713 | 3456 | } |
| 2714 | 3457 | if (COMPARE_FLAG(i)) |
| 2715 | 3458 | { |
| 2716 | | vres[i] = VREG_S(VS1REG, i); |
| 3459 | vres[i] = s1; |
| 2717 | 3460 | } |
| 2718 | 3461 | else |
| 2719 | 3462 | { |
| 2720 | | vres[i] = VREG_S(VS2REG, sel); |
| 3463 | vres[i] = s2; |
| 2721 | 3464 | } |
| 3465 | #if USE_SIMD |
| 3466 | SIMD_INSERT16(rsp->accum_l, vres[i], i); |
| 3467 | #else |
| 2722 | 3468 | ACCUM_L(i) = vres[i]; |
| 3469 | #endif |
| 2723 | 3470 | } |
| 2724 | 3471 | |
| 2725 | 3472 | rsp->flag[0] = 0; |
| r24005 | r24006 | |
| 2740 | 3487 | // Sets compare flags if elements in VS1 are greater or equal with VS2 |
| 2741 | 3488 | // Moves the element in VS2 to destination vector |
| 2742 | 3489 | |
| 2743 | | int sel; |
| 2744 | 3490 | rsp->flag[1] = 0; |
| 2745 | 3491 | |
| 2746 | 3492 | for (int i = 0; i < 8; i++) |
| 2747 | 3493 | { |
| 2748 | | sel = VEC_EL_2(EL, i); |
| 2749 | | |
| 2750 | | if (VREG_S(VS1REG, i) == VREG_S(VS2REG, sel)) |
| 3494 | #if USE_SIMD |
| 3495 | INT16 s1, s2; |
| 3496 | SIMD_EXTRACT16(rsp->xv[VS1REG], s1, i); |
| 3497 | SIMD_EXTRACT16(rsp->xv[VS2REG], s2, VEC_EL_2(EL, i)); |
| 3498 | #else |
| 3499 | INT16 s1 = (INT16)VREG_S(VS1REG, i); |
| 3500 | INT16 s2 = (INT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 3501 | #endif |
| 3502 | if (s1 == s2) |
| 2751 | 3503 | { |
| 2752 | 3504 | if (ZERO_FLAG(i) == 0 || CARRY_FLAG(i) == 0) |
| 2753 | 3505 | { |
| 2754 | 3506 | SET_COMPARE_FLAG(i); |
| 2755 | 3507 | } |
| 2756 | 3508 | } |
| 2757 | | else if (VREG_S(VS1REG, i) > VREG_S(VS2REG, sel)) |
| 3509 | else if (s1 > s2) |
| 2758 | 3510 | { |
| 2759 | 3511 | SET_COMPARE_FLAG(i); |
| 2760 | 3512 | } |
| 2761 | 3513 | |
| 2762 | 3514 | if (COMPARE_FLAG(i) != 0) |
| 2763 | 3515 | { |
| 2764 | | vres[i] = VREG_S(VS1REG, i); |
| 3516 | vres[i] = s1; |
| 2765 | 3517 | } |
| 2766 | 3518 | else |
| 2767 | 3519 | { |
| 2768 | | vres[i] = VREG_S(VS2REG, sel); |
| 3520 | vres[i] = s2; |
| 2769 | 3521 | } |
| 2770 | 3522 | |
| 3523 | #if USE_SIMD |
| 3524 | SIMD_INSERT16(rsp->accum_l, vres[i], i); |
| 3525 | #else |
| 2771 | 3526 | ACCUM_L(i) = vres[i]; |
| 3527 | #endif |
| 2772 | 3528 | } |
| 2773 | 3529 | |
| 2774 | 3530 | rsp->flag[0] = 0; |
| r24005 | r24006 | |
| 2780 | 3536 | rsp_state *rsp = (rsp_state*)param; |
| 2781 | 3537 | int op = rsp->impstate->arg0; |
| 2782 | 3538 | INT16 vres[8]; |
| 2783 | | int i; |
| 3539 | |
| 2784 | 3540 | // 31 25 24 20 15 10 5 0 |
| 2785 | 3541 | // ------------------------------------------------------ |
| 2786 | 3542 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100100 | |
| r24005 | r24006 | |
| 2788 | 3544 | // |
| 2789 | 3545 | // Vector clip low |
| 2790 | 3546 | |
| 2791 | | int sel; |
| 2792 | | INT16 s1, s2; |
| 2793 | | for (i = 0; i < 8; i++) |
| 3547 | for (int i = 0; i < 8; i++) |
| 2794 | 3548 | { |
| 2795 | | sel = VEC_EL_2(EL, i); |
| 2796 | | s1 = VREG_S(VS1REG, i); |
| 2797 | | s2 = VREG_S(VS2REG, sel); |
| 3549 | #if USE_SIMD |
| 3550 | INT16 s1, s2; |
| 3551 | SIMD_EXTRACT16(rsp->xv[VS1REG], s1, i); |
| 3552 | SIMD_EXTRACT16(rsp->xv[VS2REG], s2, VEC_EL_2(EL, i)); |
| 3553 | #else |
| 3554 | INT16 s1 = VREG_S(VS1REG, i); |
| 3555 | INT16 s2 = VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 3556 | #endif |
| 2798 | 3557 | |
| 2799 | 3558 | if (CARRY_FLAG(i) != 0) |
| 2800 | 3559 | { |
| r24005 | r24006 | |
| 2802 | 3561 | { |
| 2803 | 3562 | if (COMPARE_FLAG(i) != 0) |
| 2804 | 3563 | { |
| 3564 | #if USE_SIMD |
| 3565 | SIMD_INSERT16(rsp->accum_l, -(UINT16)s2, i); |
| 3566 | #else |
| 2805 | 3567 | ACCUM_L(i) = -(UINT16)s2; |
| 3568 | #endif |
| 2806 | 3569 | } |
| 2807 | 3570 | else |
| 2808 | 3571 | { |
| 3572 | #if USE_SIMD |
| 3573 | SIMD_INSERT16(rsp->accum_l, s1, i); |
| 3574 | #else |
| 2809 | 3575 | ACCUM_L(i) = s1; |
| 3576 | #endif |
| 2810 | 3577 | } |
| 2811 | 3578 | } |
| 2812 | 3579 | else//ZERO_FLAG(i)==0 |
| r24005 | r24006 | |
| 2815 | 3582 | { |
| 2816 | 3583 | if (((UINT32)(UINT16)(s1) + (UINT32)(UINT16)(s2)) > 0x10000) |
| 2817 | 3584 | {//proper fix for Harvest Moon 64, r4 |
| 2818 | | |
| 3585 | #if USE_SIMD |
| 3586 | SIMD_INSERT16(rsp->accum_l, s1, i); |
| 3587 | #else |
| 2819 | 3588 | ACCUM_L(i) = s1; |
| 3589 | #endif |
| 2820 | 3590 | CLEAR_COMPARE_FLAG(i); |
| 2821 | 3591 | } |
| 2822 | 3592 | else |
| 2823 | 3593 | { |
| 3594 | #if USE_SIMD |
| 3595 | SIMD_INSERT16(rsp->accum_l, -((UINT16)s2), i); |
| 3596 | #else |
| 2824 | 3597 | ACCUM_L(i) = -((UINT16)s2); |
| 3598 | #endif |
| 2825 | 3599 | SET_COMPARE_FLAG(i); |
| 2826 | 3600 | } |
| 2827 | 3601 | } |
| r24005 | r24006 | |
| 2829 | 3603 | { |
| 2830 | 3604 | if (((UINT32)(UINT16)(s1) + (UINT32)(UINT16)(s2)) != 0) |
| 2831 | 3605 | { |
| 3606 | #if USE_SIMD |
| 3607 | SIMD_INSERT16(rsp->accum_l, s1, i); |
| 3608 | #else |
| 2832 | 3609 | ACCUM_L(i) = s1; |
| 3610 | #endif |
| 2833 | 3611 | CLEAR_COMPARE_FLAG(i); |
| 2834 | 3612 | } |
| 2835 | 3613 | else |
| 2836 | 3614 | { |
| 3615 | #if USE_SIMD |
| 3616 | SIMD_INSERT16(rsp->accum_l, -((UINT16)s2), i); |
| 3617 | #else |
| 2837 | 3618 | ACCUM_L(i) = -((UINT16)s2); |
| 3619 | #endif |
| 2838 | 3620 | SET_COMPARE_FLAG(i); |
| 2839 | 3621 | } |
| 2840 | 3622 | } |
| 2841 | 3623 | } |
| 2842 | | }// |
| 3624 | } |
| 2843 | 3625 | else//CARRY_FLAG(i)==0 |
| 2844 | 3626 | { |
| 2845 | 3627 | if (ZERO_FLAG(i) != 0) |
| 2846 | 3628 | { |
| 2847 | 3629 | if (rsp->flag[1] & (1 << (8+i))) |
| 2848 | 3630 | { |
| 3631 | #if USE_SIMD |
| 3632 | SIMD_INSERT16(rsp->accum_l, s2, i); |
| 3633 | #else |
| 2849 | 3634 | ACCUM_L(i) = s2; |
| 3635 | #endif |
| 2850 | 3636 | } |
| 2851 | 3637 | else |
| 2852 | 3638 | { |
| 3639 | #if USE_SIMD |
| 3640 | SIMD_INSERT16(rsp->accum_l, s1, i); |
| 3641 | #else |
| 2853 | 3642 | ACCUM_L(i) = s1; |
| 3643 | #endif |
| 2854 | 3644 | } |
| 2855 | 3645 | } |
| 2856 | 3646 | else |
| 2857 | 3647 | { |
| 2858 | 3648 | if (((INT32)(UINT16)s1 - (INT32)(UINT16)s2) >= 0) |
| 2859 | 3649 | { |
| 3650 | #if USE_SIMD |
| 3651 | SIMD_INSERT16(rsp->accum_l, s2, i); |
| 3652 | #else |
| 2860 | 3653 | ACCUM_L(i) = s2; |
| 3654 | #endif |
| 2861 | 3655 | rsp->flag[1] |= (1 << (8+i)); |
| 2862 | 3656 | } |
| 2863 | 3657 | else |
| 2864 | 3658 | { |
| 3659 | #if USE_SIMD |
| 3660 | SIMD_INSERT16(rsp->accum_l, s1, i); |
| 3661 | #else |
| 2865 | 3662 | ACCUM_L(i) = s1; |
| 3663 | #endif |
| 2866 | 3664 | rsp->flag[1] &= ~(1 << (8+i)); |
| 2867 | 3665 | } |
| 2868 | 3666 | } |
| 2869 | 3667 | } |
| 2870 | 3668 | |
| 3669 | #if USE_SIMD |
| 3670 | SIMD_EXTRACT16(rsp->accum_l, vres[i], i); |
| 3671 | #else |
| 2871 | 3672 | vres[i] = ACCUM_L(i); |
| 3673 | #endif |
| 2872 | 3674 | } |
| 2873 | 3675 | rsp->flag[0] = 0; |
| 2874 | 3676 | rsp->flag[2] = 0; |
| r24005 | r24006 | |
| 2879 | 3681 | { |
| 2880 | 3682 | rsp_state *rsp = (rsp_state*)param; |
| 2881 | 3683 | int op = rsp->impstate->arg0; |
| 2882 | | INT16 vres[8]; |
| 2883 | | int i; |
| 3684 | |
| 2884 | 3685 | // 31 25 24 20 15 10 5 0 |
| 2885 | 3686 | // ------------------------------------------------------ |
| 2886 | 3687 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100101 | |
| r24005 | r24006 | |
| 2888 | 3689 | // |
| 2889 | 3690 | // Vector clip high |
| 2890 | 3691 | |
| 2891 | | int sel; |
| 2892 | | INT16 s1, s2; |
| 2893 | 3692 | rsp->flag[0] = 0; |
| 2894 | 3693 | rsp->flag[1] = 0; |
| 2895 | 3694 | rsp->flag[2] = 0; |
| 3695 | |
| 3696 | #if USE_SIMD |
| 3697 | // Compare flag |
| 3698 | // flag[1] bit [0- 7] set if (s1 ^ s2) < 0 && (s1 + s2) <= 0) |
| 3699 | // flag[1] bit [0- 7] set if (s1 ^ s2) >= 0 && (s2 < 0) |
| 3700 | |
| 3701 | // flag[1] bit [8-15] set if (s1 ^ s2) < 0 && (s2 < 0) |
| 3702 | // flag[1] bit [8-15] set if (s1 ^ s2) >= 0 && (s1 - s2) >= 0 |
| 3703 | |
| 3704 | // Carry flag |
| 3705 | // flag[0] bit [0- 7] set if (s1 ^ s2) < 0 |
| 3706 | |
| 3707 | // Zero flag |
| 3708 | // flag[0] bit [8-15] set if (s1 ^ s2) < 0 && (s1 + s2) != 0 && (s1 != ~s2) |
| 3709 | // flag[0] bit [8-15] set if (s1 ^ s2) >= 0 && (s1 - s2) != 0 && (s1 != ~s2) |
| 3710 | |
| 3711 | // flag[2] bit [0- 7] set if (s1 ^ s2) < 0 && (s1 + s2) == -1 |
| 3712 | |
| 3713 | // accum set to -s2 if (s1 ^ s2) < 0 && (s1 + s2) <= 0) |
| 3714 | // accum set to -s2 if (s1 ^ s2) >= 0 && (s1 - s2) >= 0 |
| 3715 | |
| 3716 | // accum set to s1 if (s1 ^ s2) < 0 && (s1 + s2) > 0) |
| 3717 | // accum set to s1 if (s1 ^ s2) >= 0 && (s1 - s2) < 0 |
| 3718 | |
| 3719 | __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 3720 | __m128i s1_xor_s2 = _mm_xor_si128(rsp->xv[VS1REG], shuf); |
| 3721 | __m128i s1_plus_s2 = _mm_add_epi16(rsp->xv[VS1REG], shuf); |
| 3722 | __m128i s1_sub_s2 = _mm_sub_epi16(rsp->xv[VS1REG], shuf); |
| 3723 | __m128i s2_neg = _mm_xor_si128(shuf, vec_neg1); |
| 3724 | |
| 3725 | __m128i s2_lz = _mm_cmplt_epi16(shuf, vec_zero); |
| 3726 | __m128i s1s2_xor_lz = _mm_cmplt_epi16(s1_xor_s2, vec_zero); |
| 3727 | __m128i s1s2_xor_gez = _mm_xor_si128(s1s2_xor_lz, vec_neg1); |
| 3728 | __m128i s1s2_plus_nz = _mm_xor_si128(_mm_cmpeq_epi16(s1_plus_s2, vec_zero), vec_neg1); |
| 3729 | __m128i s1s2_plus_gz = _mm_cmpgt_epi16(s1_plus_s2, vec_zero); |
| 3730 | __m128i s1s2_plus_lez = _mm_xor_si128(s1s2_plus_gz, vec_neg1); |
| 3731 | __m128i s1s2_plus_n1 = _mm_cmpeq_epi16(s1_plus_s2, vec_neg1); |
| 3732 | __m128i s1s2_sub_nz = _mm_xor_si128(_mm_cmpeq_epi16(s1_sub_s2, vec_zero), vec_neg1); |
| 3733 | __m128i s1s2_sub_lz = _mm_cmplt_epi16(s1_sub_s2, vec_zero); |
| 3734 | __m128i s1s2_sub_gez = _mm_xor_si128(s1s2_sub_lz, vec_neg1); |
| 3735 | __m128i s1_nens2 = _mm_xor_si128(_mm_cmpeq_epi16(rsp->xv[VS1REG], s2_neg), vec_neg1); |
| 3736 | |
| 3737 | __m128i ext_mask = _mm_and_si128(_mm_and_si128(s1s2_xor_lz, s1s2_plus_n1), vec_flagmask); |
| 3738 | rsp->flag[2] |= _mm_extract_epi16(ext_mask, 0) << 0; |
| 3739 | rsp->flag[2] |= _mm_extract_epi16(ext_mask, 1) << 1; |
| 3740 | rsp->flag[2] |= _mm_extract_epi16(ext_mask, 2) << 2; |
| 3741 | rsp->flag[2] |= _mm_extract_epi16(ext_mask, 3) << 3; |
| 3742 | rsp->flag[2] |= _mm_extract_epi16(ext_mask, 4) << 4; |
| 3743 | rsp->flag[2] |= _mm_extract_epi16(ext_mask, 5) << 5; |
| 3744 | rsp->flag[2] |= _mm_extract_epi16(ext_mask, 6) << 6; |
| 3745 | rsp->flag[2] |= _mm_extract_epi16(ext_mask, 7) << 7; |
| 3746 | |
| 3747 | __m128i carry_mask = _mm_and_si128(s1s2_xor_lz, vec_flagmask); |
| 3748 | rsp->flag[0] |= _mm_extract_epi16(carry_mask, 0) << 0; |
| 3749 | rsp->flag[0] |= _mm_extract_epi16(carry_mask, 1) << 1; |
| 3750 | rsp->flag[0] |= _mm_extract_epi16(carry_mask, 2) << 2; |
| 3751 | rsp->flag[0] |= _mm_extract_epi16(carry_mask, 3) << 3; |
| 3752 | rsp->flag[0] |= _mm_extract_epi16(carry_mask, 4) << 4; |
| 3753 | rsp->flag[0] |= _mm_extract_epi16(carry_mask, 5) << 5; |
| 3754 | rsp->flag[0] |= _mm_extract_epi16(carry_mask, 6) << 6; |
| 3755 | rsp->flag[0] |= _mm_extract_epi16(carry_mask, 7) << 7; |
| 3756 | |
| 3757 | __m128i z0_mask = _mm_and_si128(_mm_and_si128(s1s2_xor_gez, s1s2_sub_nz), s1_nens2); |
| 3758 | __m128i z1_mask = _mm_and_si128(_mm_and_si128(s1s2_xor_lz, s1s2_plus_nz), s1_nens2); |
| 3759 | __m128i z_mask = _mm_and_si128(_mm_or_si128(z0_mask, z1_mask), vec_flagmask); |
| 3760 | z_mask = _mm_and_si128(_mm_or_si128(z_mask, _mm_srli_epi32(z_mask, 15)), vec_shiftmask2); |
| 3761 | z_mask = _mm_and_si128(_mm_or_si128(z_mask, _mm_srli_epi64(z_mask, 30)), vec_shiftmask4); |
| 3762 | z_mask = _mm_or_si128(z_mask, _mm_srli_si128(z_mask, 7)); |
| 3763 | z_mask = _mm_or_si128(z_mask, _mm_srli_epi16(z_mask, 4)); |
| 3764 | rsp->flag[0] |= (_mm_extract_epi16(z_mask, 0) << 8) & 0x00ff00; |
| 3765 | |
| 3766 | __m128i f0_mask = _mm_and_si128(_mm_or_si128(_mm_and_si128(s1s2_xor_gez, s2_lz), _mm_and_si128(s1s2_xor_lz, s1s2_plus_lez)), vec_flagmask); |
| 3767 | __m128i f8_mask = _mm_and_si128(_mm_or_si128(_mm_and_si128(s1s2_xor_gez, s1s2_sub_gez), _mm_and_si128(s1s2_xor_lz, s2_lz)), vec_flagmask); |
| 3768 | f0_mask = _mm_and_si128(f0_mask, vec_flagmask); |
| 3769 | f8_mask = _mm_and_si128(f8_mask, vec_flagmask); |
| 3770 | rsp->flag[1] |= _mm_extract_epi16(f0_mask, 0) << 0; |
| 3771 | rsp->flag[1] |= _mm_extract_epi16(f0_mask, 1) << 1; |
| 3772 | rsp->flag[1] |= _mm_extract_epi16(f0_mask, 2) << 2; |
| 3773 | rsp->flag[1] |= _mm_extract_epi16(f0_mask, 3) << 3; |
| 3774 | rsp->flag[1] |= _mm_extract_epi16(f0_mask, 4) << 4; |
| 3775 | rsp->flag[1] |= _mm_extract_epi16(f0_mask, 5) << 5; |
| 3776 | rsp->flag[1] |= _mm_extract_epi16(f0_mask, 6) << 6; |
| 3777 | rsp->flag[1] |= _mm_extract_epi16(f0_mask, 7) << 7; |
| 3778 | |
| 3779 | rsp->flag[1] |= _mm_extract_epi16(f8_mask, 0) << 8; |
| 3780 | rsp->flag[1] |= _mm_extract_epi16(f8_mask, 1) << 9; |
| 3781 | rsp->flag[1] |= _mm_extract_epi16(f8_mask, 2) << 10; |
| 3782 | rsp->flag[1] |= _mm_extract_epi16(f8_mask, 3) << 11; |
| 3783 | rsp->flag[1] |= _mm_extract_epi16(f8_mask, 4) << 12; |
| 3784 | rsp->flag[1] |= _mm_extract_epi16(f8_mask, 5) << 13; |
| 3785 | rsp->flag[1] |= _mm_extract_epi16(f8_mask, 6) << 14; |
| 3786 | rsp->flag[1] |= _mm_extract_epi16(f8_mask, 7) << 15; |
| 3787 | #else |
| 3788 | |
| 3789 | INT16 vres[8]; |
| 2896 | 3790 | UINT32 vce = 0; |
| 2897 | | |
| 2898 | | for (i=0; i < 8; i++) |
| 3791 | for (int i = 0; i < 8; i++) |
| 2899 | 3792 | { |
| 2900 | | sel = VEC_EL_2(EL, i); |
| 2901 | | s1 = VREG_S(VS1REG, i); |
| 2902 | | s2 = VREG_S(VS2REG, sel); |
| 3793 | #if USE_SIMD |
| 3794 | INT16 s1, s2; |
| 3795 | SIMD_EXTRACT16(rsp->xv[VS1REG], s1, i); |
| 3796 | SIMD_EXTRACT16(rsp->xv[VS2REG], s2, VEC_EL_2(EL, i)); |
| 3797 | #else |
| 3798 | INT16 s1 = VREG_S(VS1REG, i); |
| 3799 | INT16 s2 = VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 3800 | #endif |
| 2903 | 3801 | |
| 2904 | 3802 | if ((s1 ^ s2) < 0) |
| 2905 | 3803 | { |
| r24005 | r24006 | |
| 2920 | 3818 | vres[i] = s1; |
| 2921 | 3819 | } |
| 2922 | 3820 | |
| 2923 | | if (s1 + s2 != 0) |
| 3821 | if (s1 + s2 != 0 && s1 != ~s2) |
| 2924 | 3822 | { |
| 2925 | | if (s1 != ~s2) |
| 2926 | | { |
| 2927 | | SET_ZERO_FLAG(i); |
| 2928 | | } |
| 3823 | SET_ZERO_FLAG(i); |
| 2929 | 3824 | } |
| 2930 | 3825 | }//sign |
| 2931 | 3826 | else |
| r24005 | r24006 | |
| 2945 | 3840 | vres[i] = s1; |
| 2946 | 3841 | } |
| 2947 | 3842 | |
| 2948 | | if ((s1 - s2) != 0) |
| 3843 | if ((s1 - s2) != 0 && s1 != ~s2) |
| 2949 | 3844 | { |
| 2950 | | if (s1 != ~s2) |
| 2951 | | { |
| 2952 | | SET_ZERO_FLAG(i); |
| 2953 | | } |
| 3845 | SET_ZERO_FLAG(i); |
| 2954 | 3846 | } |
| 2955 | 3847 | } |
| 2956 | 3848 | rsp->flag[2] |= (vce << (i)); |
| 3849 | #if USE_SIMD |
| 3850 | SIMD_INSERT16(rsp->accum_l, vres[i], i); |
| 3851 | #else |
| 2957 | 3852 | ACCUM_L(i) = vres[i]; |
| 3853 | #endif |
| 2958 | 3854 | } |
| 2959 | 3855 | WRITEBACK_RESULT(); |
| 3856 | #endif |
| 2960 | 3857 | } |
| 2961 | 3858 | |
| 2962 | 3859 | INLINE void cfunc_rsp_vcr(void *param) |
| 2963 | 3860 | { |
| 2964 | 3861 | rsp_state *rsp = (rsp_state*)param; |
| 2965 | 3862 | int op = rsp->impstate->arg0; |
| 2966 | | INT16 vres[8]; |
| 2967 | | int i; |
| 3863 | |
| 2968 | 3864 | // 31 25 24 20 15 10 5 0 |
| 2969 | 3865 | // ------------------------------------------------------ |
| 2970 | 3866 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100110 | |
| r24005 | r24006 | |
| 2972 | 3868 | // |
| 2973 | 3869 | // Vector clip reverse |
| 2974 | 3870 | |
| 2975 | | int sel; |
| 2976 | | INT16 s1, s2; |
| 2977 | 3871 | rsp->flag[0] = 0; |
| 2978 | 3872 | rsp->flag[1] = 0; |
| 2979 | 3873 | rsp->flag[2] = 0; |
| 2980 | 3874 | |
| 2981 | | for (i=0; i < 8; i++) |
| 3875 | #if USE_SIMD |
| 3876 | // flag[1] bit [0- 7] set if (s1 ^ s2) < 0 && (s1 + s2) <= 0) |
| 3877 | // flag[1] bit [0- 7] set if (s1 ^ s2) >= 0 && (s2 < 0) |
| 3878 | |
| 3879 | // flag[1] bit [8-15] set if (s1 ^ s2) < 0 && (s2 < 0) |
| 3880 | // flag[1] bit [8-15] set if (s1 ^ s2) >= 0 && (s1 - s2) >= 0 |
| 3881 | |
| 3882 | // accum set to ~s2 if (s1 ^ s2) < 0 && (s1 + s2) <= 0) |
| 3883 | // accum set to ~s2 if (s1 ^ s2) >= 0 && (s1 - s2) >= 0 |
| 3884 | |
| 3885 | // accum set to s1 if (s1 ^ s2) < 0 && (s1 + s2) > 0) |
| 3886 | // accum set to s1 if (s1 ^ s2) >= 0 && (s1 - s2) < 0 |
| 3887 | __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 3888 | __m128i s1_xor_s2 = _mm_xor_si128(rsp->xv[VS1REG], shuf); |
| 3889 | __m128i s1_plus_s2 = _mm_add_epi16(rsp->xv[VS1REG], shuf); |
| 3890 | __m128i s1_sub_s2 = _mm_sub_epi16(rsp->xv[VS1REG], shuf); |
| 3891 | __m128i s2_neg = _mm_xor_si128(shuf, vec_neg1); |
| 3892 | |
| 3893 | __m128i s2_lz = _mm_cmplt_epi16(shuf, vec_zero); |
| 3894 | __m128i s1s2_xor_lz = _mm_cmplt_epi16(s1_xor_s2, vec_zero); |
| 3895 | __m128i s1s2_xor_gez = _mm_xor_si128(s1s2_xor_lz, vec_neg1); |
| 3896 | __m128i s1s2_plus_gz = _mm_cmpgt_epi16(s1_plus_s2, vec_zero); |
| 3897 | __m128i s1s2_plus_lez = _mm_xor_si128(s1s2_plus_gz, vec_neg1); |
| 3898 | __m128i s1s2_sub_lz = _mm_cmplt_epi16(s1_sub_s2, vec_zero); |
| 3899 | __m128i s1s2_sub_gez = _mm_xor_si128(s1s2_sub_lz, vec_neg1); |
| 3900 | |
| 3901 | __m128i s1_mask = _mm_or_si128(_mm_and_si128(s1s2_xor_gez, s1s2_sub_lz), _mm_and_si128(s1s2_xor_lz, s1s2_plus_gz)); |
| 3902 | __m128i s2_mask = _mm_or_si128(_mm_and_si128(s1s2_xor_gez, s1s2_sub_gez), _mm_and_si128(s1s2_xor_lz, s1s2_plus_lez)); |
| 3903 | rsp->accum_l = _mm_or_si128(_mm_and_si128(rsp->xv[VS1REG], s1_mask), _mm_and_si128(s2_neg, s2_mask)); |
| 3904 | rsp->xv[VDREG] = rsp->accum_l; |
| 3905 | |
| 3906 | __m128i f0_mask = _mm_or_si128(_mm_and_si128(s1s2_xor_gez, s2_lz), _mm_and_si128(s1s2_xor_lz, s1s2_plus_lez)); |
| 3907 | __m128i f8_mask = _mm_or_si128(_mm_and_si128(s1s2_xor_gez, s1s2_sub_gez), _mm_and_si128(s1s2_xor_lz, s2_lz)); |
| 3908 | f0_mask = _mm_and_si128(f0_mask, vec_flagmask); |
| 3909 | f8_mask = _mm_and_si128(f8_mask, vec_flagmask); |
| 3910 | rsp->flag[1] |= _mm_extract_epi16(f0_mask, 0) << 0; |
| 3911 | rsp->flag[1] |= _mm_extract_epi16(f0_mask, 1) << 1; |
| 3912 | rsp->flag[1] |= _mm_extract_epi16(f0_mask, 2) << 2; |
| 3913 | rsp->flag[1] |= _mm_extract_epi16(f0_mask, 3) << 3; |
| 3914 | rsp->flag[1] |= _mm_extract_epi16(f0_mask, 4) << 4; |
| 3915 | rsp->flag[1] |= _mm_extract_epi16(f0_mask, 5) << 5; |
| 3916 | rsp->flag[1] |= _mm_extract_epi16(f0_mask, 6) << 6; |
| 3917 | rsp->flag[1] |= _mm_extract_epi16(f0_mask, 7) << 7; |
| 3918 | |
| 3919 | rsp->flag[1] |= _mm_extract_epi16(f8_mask, 0) << 8; |
| 3920 | rsp->flag[1] |= _mm_extract_epi16(f8_mask, 1) << 9; |
| 3921 | rsp->flag[1] |= _mm_extract_epi16(f8_mask, 2) << 10; |
| 3922 | rsp->flag[1] |= _mm_extract_epi16(f8_mask, 3) << 11; |
| 3923 | rsp->flag[1] |= _mm_extract_epi16(f8_mask, 4) << 12; |
| 3924 | rsp->flag[1] |= _mm_extract_epi16(f8_mask, 5) << 13; |
| 3925 | rsp->flag[1] |= _mm_extract_epi16(f8_mask, 6) << 14; |
| 3926 | rsp->flag[1] |= _mm_extract_epi16(f8_mask, 7) << 15; |
| 3927 | #else |
| 3928 | INT16 vres[8]; |
| 3929 | for (int i = 0; i < 8; i++) |
| 2982 | 3930 | { |
| 2983 | | sel = VEC_EL_2(EL, i); |
| 2984 | | s1 = VREG_S(VS1REG, i); |
| 2985 | | s2 = VREG_S(VS2REG, sel); |
| 3931 | INT16 s1 = VREG_S(VS1REG, i); |
| 3932 | INT16 s2 = VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 2986 | 3933 | |
| 2987 | 3934 | if ((INT16)(s1 ^ s2) < 0) |
| 2988 | 3935 | { |
| r24005 | r24006 | |
| 3020 | 3967 | vres[i] = ACCUM_L(i); |
| 3021 | 3968 | } |
| 3022 | 3969 | WRITEBACK_RESULT(); |
| 3970 | #endif |
| 3023 | 3971 | } |
| 3024 | 3972 | |
| 3025 | 3973 | INLINE void cfunc_rsp_vmrg(void *param) |
| 3026 | 3974 | { |
| 3027 | 3975 | rsp_state *rsp = (rsp_state*)param; |
| 3028 | 3976 | int op = rsp->impstate->arg0; |
| 3029 | | INT16 vres[8] = { 0 }; |
| 3030 | 3977 | // 31 25 24 20 15 10 5 0 |
| 3031 | 3978 | // ------------------------------------------------------ |
| 3032 | 3979 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100111 | |
| r24005 | r24006 | |
| 3034 | 3981 | // |
| 3035 | 3982 | // Merges two vectors according to compare flags |
| 3036 | 3983 | |
| 3037 | | int sel; |
| 3984 | #if USE_SIMD |
| 3985 | __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 3986 | __m128i compare = _mm_set_epi16(COMPARE_FLAG(7), COMPARE_FLAG(6), COMPARE_FLAG(5), COMPARE_FLAG(4), |
| 3987 | COMPARE_FLAG(3), COMPARE_FLAG(2), COMPARE_FLAG(1), COMPARE_FLAG(0)); |
| 3988 | __m128i s2mask = _mm_cmpeq_epi16(compare, vec_zero); |
| 3989 | __m128i s1mask = _mm_xor_si128(s2mask, vec_neg1); |
| 3990 | __m128i result = _mm_and_si128(rsp->xv[VS1REG], s1mask); |
| 3991 | rsp->xv[VDREG] = _mm_or_si128(result, _mm_and_si128(shuf, s2mask)); |
| 3992 | rsp->accum_l = rsp->xv[VDREG]; |
| 3993 | #else |
| 3994 | INT16 vres[8]; |
| 3038 | 3995 | for (int i = 0; i < 8; i++) |
| 3039 | 3996 | { |
| 3040 | | sel = VEC_EL_2(EL, i); |
| 3997 | INT16 s1 = (INT16)VREG_S(VS1REG, i); |
| 3998 | INT16 s2 = (INT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 3041 | 3999 | if (COMPARE_FLAG(i) != 0) |
| 3042 | 4000 | { |
| 3043 | | vres[i] = VREG_S(VS1REG, i); |
| 4001 | vres[i] = s1; |
| 3044 | 4002 | } |
| 3045 | 4003 | else |
| 3046 | 4004 | { |
| 3047 | | vres[i] = VREG_S(VS2REG, sel);//??? ??????????? |
| 4005 | vres[i] = s2; |
| 3048 | 4006 | } |
| 3049 | 4007 | |
| 3050 | 4008 | ACCUM_L(i) = vres[i]; |
| 3051 | 4009 | } |
| 3052 | 4010 | WRITEBACK_RESULT(); |
| 4011 | #endif |
| 3053 | 4012 | } |
| 3054 | 4013 | |
| 3055 | 4014 | INLINE void cfunc_rsp_vand(void *param) |
| 3056 | 4015 | { |
| 3057 | 4016 | rsp_state *rsp = (rsp_state*)param; |
| 3058 | 4017 | int op = rsp->impstate->arg0; |
| 3059 | | INT16 vres[8] = { 0 }; |
| 4018 | |
| 3060 | 4019 | // 31 25 24 20 15 10 5 0 |
| 3061 | 4020 | // ------------------------------------------------------ |
| 3062 | 4021 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101000 | |
| r24005 | r24006 | |
| 3064 | 4023 | // |
| 3065 | 4024 | // Bitwise AND of two vector registers |
| 3066 | 4025 | |
| 3067 | | int sel; |
| 4026 | #if USE_SIMD |
| 4027 | __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 4028 | rsp->xv[VDREG] = _mm_and_si128(rsp->xv[VS1REG], shuf); |
| 4029 | rsp->accum_l = rsp->xv[VDREG]; |
| 4030 | #else |
| 4031 | INT16 vres[8]; |
| 3068 | 4032 | for (int i = 0; i < 8; i++) |
| 3069 | 4033 | { |
| 3070 | | sel = VEC_EL_2(EL, i); |
| 3071 | | vres[i] = VREG_S(VS1REG, i) & VREG_S(VS2REG, sel); |
| 4034 | UINT16 s1 = (UINT16)VREG_S(VS1REG, i); |
| 4035 | UINT16 s2 = (UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 4036 | vres[i] = s1 & s2; |
| 3072 | 4037 | ACCUM_L(i) = vres[i]; |
| 3073 | 4038 | } |
| 3074 | 4039 | WRITEBACK_RESULT(); |
| 4040 | #endif |
| 3075 | 4041 | } |
| 3076 | 4042 | |
| 3077 | 4043 | INLINE void cfunc_rsp_vnand(void *param) |
| 3078 | 4044 | { |
| 3079 | 4045 | rsp_state *rsp = (rsp_state*)param; |
| 3080 | 4046 | int op = rsp->impstate->arg0; |
| 3081 | | INT16 vres[8] = { 0 }; |
| 4047 | |
| 3082 | 4048 | // 31 25 24 20 15 10 5 0 |
| 3083 | 4049 | // ------------------------------------------------------ |
| 3084 | 4050 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101001 | |
| r24005 | r24006 | |
| 3086 | 4052 | // |
| 3087 | 4053 | // Bitwise NOT AND of two vector registers |
| 3088 | 4054 | |
| 3089 | | int sel; |
| 4055 | #if USE_SIMD |
| 4056 | __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 4057 | rsp->xv[VDREG] = _mm_xor_si128(_mm_and_si128(rsp->xv[VS1REG], shuf), vec_neg1); |
| 4058 | rsp->accum_l = rsp->xv[VDREG]; |
| 4059 | #else |
| 4060 | INT16 vres[8]; |
| 3090 | 4061 | for (int i = 0; i < 8; i++) |
| 3091 | 4062 | { |
| 3092 | | sel = VEC_EL_2(EL, i); |
| 3093 | | vres[i] = ~((VREG_S(VS1REG, i) & VREG_S(VS2REG, sel))); |
| 4063 | UINT16 s1 = (UINT16)VREG_S(VS1REG, i); |
| 4064 | UINT16 s2 = (UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 4065 | vres[i] = ~((s1 & s2)); |
| 3094 | 4066 | ACCUM_L(i) = vres[i]; |
| 3095 | 4067 | } |
| 3096 | 4068 | WRITEBACK_RESULT(); |
| 4069 | #endif |
| 3097 | 4070 | } |
| 3098 | 4071 | |
| 3099 | 4072 | INLINE void cfunc_rsp_vor(void *param) |
| 3100 | 4073 | { |
| 3101 | 4074 | rsp_state *rsp = (rsp_state*)param; |
| 3102 | 4075 | int op = rsp->impstate->arg0; |
| 3103 | | INT16 vres[8] = { 0 };; |
| 4076 | |
| 3104 | 4077 | // 31 25 24 20 15 10 5 0 |
| 3105 | 4078 | // ------------------------------------------------------ |
| 3106 | 4079 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101010 | |
| r24005 | r24006 | |
| 3108 | 4081 | // |
| 3109 | 4082 | // Bitwise OR of two vector registers |
| 3110 | 4083 | |
| 3111 | | int sel; |
| 4084 | #if USE_SIMD |
| 4085 | __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 4086 | rsp->xv[VDREG] = _mm_or_si128(rsp->xv[VS1REG], shuf); |
| 4087 | rsp->accum_l = rsp->xv[VDREG]; |
| 4088 | #else |
| 4089 | INT16 vres[8]; |
| 3112 | 4090 | for (int i = 0; i < 8; i++) |
| 3113 | 4091 | { |
| 3114 | | sel = VEC_EL_2(EL, i); |
| 3115 | | vres[i] = VREG_S(VS1REG, i) | VREG_S(VS2REG, sel); |
| 4092 | UINT16 s1 = (UINT16)VREG_S(VS1REG, i); |
| 4093 | UINT16 s2 = (UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 4094 | vres[i] = s1 | s2; |
| 3116 | 4095 | ACCUM_L(i) = vres[i]; |
| 3117 | 4096 | } |
| 3118 | 4097 | WRITEBACK_RESULT(); |
| 4098 | #endif |
| 3119 | 4099 | } |
| 3120 | 4100 | |
| 3121 | 4101 | INLINE void cfunc_rsp_vnor(void *param) |
| 3122 | 4102 | { |
| 3123 | 4103 | rsp_state *rsp = (rsp_state*)param; |
| 3124 | 4104 | int op = rsp->impstate->arg0; |
| 3125 | | INT16 vres[8] = { 0 };; |
| 4105 | |
| 3126 | 4106 | // 31 25 24 20 15 10 5 0 |
| 3127 | 4107 | // ------------------------------------------------------ |
| 3128 | 4108 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101011 | |
| r24005 | r24006 | |
| 3130 | 4110 | // |
| 3131 | 4111 | // Bitwise NOT OR of two vector registers |
| 3132 | 4112 | |
| 3133 | | int sel; |
| 4113 | #if USE_SIMD |
| 4114 | __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 4115 | rsp->xv[VDREG] = _mm_xor_si128(_mm_or_si128(rsp->xv[VS1REG], shuf), vec_neg1); |
| 4116 | rsp->accum_l = rsp->xv[VDREG]; |
| 4117 | #else |
| 4118 | INT16 vres[8]; |
| 3134 | 4119 | for (int i = 0; i < 8; i++) |
| 3135 | 4120 | { |
| 3136 | | sel = VEC_EL_2(EL, i); |
| 3137 | | vres[i] = ~((VREG_S(VS1REG, i) | VREG_S(VS2REG, sel))); |
| 4121 | UINT16 s1 = (UINT16)VREG_S(VS1REG, i); |
| 4122 | UINT16 s2 = (UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 4123 | vres[i] = ~((s1 | s2)); |
| 3138 | 4124 | ACCUM_L(i) = vres[i]; |
| 3139 | 4125 | } |
| 3140 | 4126 | WRITEBACK_RESULT(); |
| 4127 | #endif |
| 3141 | 4128 | } |
| 3142 | 4129 | |
| 3143 | 4130 | INLINE void cfunc_rsp_vxor(void *param) |
| 3144 | 4131 | { |
| 3145 | 4132 | rsp_state *rsp = (rsp_state*)param; |
| 3146 | 4133 | int op = rsp->impstate->arg0; |
| 3147 | | INT16 vres[8] = { 0 };; |
| 4134 | |
| 3148 | 4135 | // 31 25 24 20 15 10 5 0 |
| 3149 | 4136 | // ------------------------------------------------------ |
| 3150 | 4137 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101100 | |
| r24005 | r24006 | |
| 3152 | 4139 | // |
| 3153 | 4140 | // Bitwise XOR of two vector registers |
| 3154 | 4141 | |
| 3155 | | int sel; |
| 4142 | #if USE_SIMD |
| 4143 | __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 4144 | rsp->xv[VDREG] = _mm_xor_si128(rsp->xv[VS1REG], shuf); |
| 4145 | rsp->accum_l = rsp->xv[VDREG]; |
| 4146 | #else |
| 4147 | INT16 vres[8]; |
| 3156 | 4148 | for (int i = 0; i < 8; i++) |
| 3157 | 4149 | { |
| 3158 | | sel = VEC_EL_2(EL, i); |
| 3159 | | vres[i] = VREG_S(VS1REG, i) ^ VREG_S(VS2REG, sel); |
| 4150 | UINT16 s1 = (UINT16)VREG_S(VS1REG, i); |
| 4151 | UINT16 s2 = (UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 4152 | vres[i] = s1 ^ s2; |
| 3160 | 4153 | ACCUM_L(i) = vres[i]; |
| 3161 | 4154 | } |
| 3162 | 4155 | WRITEBACK_RESULT(); |
| 4156 | #endif |
| 3163 | 4157 | } |
| 3164 | 4158 | |
| 3165 | 4159 | INLINE void cfunc_rsp_vnxor(void *param) |
| 3166 | 4160 | { |
| 3167 | 4161 | rsp_state *rsp = (rsp_state*)param; |
| 3168 | 4162 | int op = rsp->impstate->arg0; |
| 3169 | | INT16 vres[8] = { 0 };; |
| 4163 | |
| 3170 | 4164 | // 31 25 24 20 15 10 5 0 |
| 3171 | 4165 | // ------------------------------------------------------ |
| 3172 | 4166 | // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101101 | |
| r24005 | r24006 | |
| 3174 | 4168 | // |
| 3175 | 4169 | // Bitwise NOT XOR of two vector registers |
| 3176 | 4170 | |
| 3177 | | int sel; |
| 4171 | #if USE_SIMD |
| 4172 | __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 4173 | rsp->xv[VDREG] = _mm_xor_si128(_mm_xor_si128(rsp->xv[VS1REG], shuf), vec_neg1); |
| 4174 | rsp->accum_l = rsp->xv[VDREG]; |
| 4175 | #else |
| 4176 | INT16 vres[8]; |
| 3178 | 4177 | for (int i = 0; i < 8; i++) |
| 3179 | 4178 | { |
| 3180 | | sel = VEC_EL_2(EL, i); |
| 3181 | | vres[i] = ~((VREG_S(VS1REG, i) ^ VREG_S(VS2REG, sel))); |
| 4179 | UINT16 s1 = (UINT16)VREG_S(VS1REG, i); |
| 4180 | UINT16 s2 = (UINT16)VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 4181 | vres[i] = ~((s1 ^ s2)); |
| 3182 | 4182 | ACCUM_L(i) = vres[i]; |
| 3183 | 4183 | } |
| 3184 | 4184 | WRITEBACK_RESULT(); |
| 4185 | #endif |
| 3185 | 4186 | } |
| 3186 | 4187 | |
| 3187 | 4188 | INLINE void cfunc_rsp_vrcp(void *param) |
| 3188 | 4189 | { |
| 3189 | 4190 | rsp_state *rsp = (rsp_state*)param; |
| 3190 | 4191 | int op = rsp->impstate->arg0; |
| 3191 | | int i; |
| 4192 | |
| 3192 | 4193 | // 31 25 24 20 15 10 5 0 |
| 3193 | 4194 | // ------------------------------------------------------ |
| 3194 | 4195 | // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110000 | |
| r24005 | r24006 | |
| 3196 | 4197 | // |
| 3197 | 4198 | // Calculates reciprocal |
| 3198 | 4199 | |
| 3199 | | int del = VS1REG & 7; |
| 3200 | | int sel = EL & 7; |
| 3201 | 4200 | INT32 shifter = 0; |
| 3202 | | |
| 3203 | | INT32 rec = (INT16)(VREG_S(VS2REG, sel)); |
| 4201 | #if USE_SIMD |
| 4202 | UINT16 urec; |
| 4203 | INT32 rec; |
| 4204 | SIMD_EXTRACT16(rsp->xv[VS2REG], urec, EL); |
| 4205 | rec = (INT16)urec; |
| 4206 | #else |
| 4207 | INT32 rec = (INT16)(VREG_S(VS2REG, EL & 7)); |
| 4208 | #endif |
| 3204 | 4209 | INT32 datainput = (rec < 0) ? (-rec) : rec; |
| 3205 | 4210 | if (datainput) |
| 3206 | 4211 | { |
| 3207 | | for (i = 0; i < 32; i++) |
| 4212 | for (int i = 0; i < 32; i++) |
| 3208 | 4213 | { |
| 3209 | | if (datainput & (1 << ((~i) & 0x1f)))//?.?.??? 31 - i |
| 4214 | if (datainput & (1 << ((~i) & 0x1f))) |
| 3210 | 4215 | { |
| 3211 | 4216 | shifter = i; |
| 3212 | 4217 | break; |
| r24005 | r24006 | |
| 3238 | 4243 | rsp->reciprocal_res = rec; |
| 3239 | 4244 | rsp->dp_allowed = 0; |
| 3240 | 4245 | |
| 3241 | | W_VREG_S(VDREG, del) = (UINT16)(rec & 0xffff); |
| 4246 | #if USE_SIMD |
| 4247 | SIMD_INSERT16(rsp->xv[VDREG], (UINT16)rec, VS1REG); |
| 4248 | #else |
| 4249 | W_VREG_S(VDREG, VS1REG & 7) = (UINT16)rec; |
| 4250 | #endif |
| 3242 | 4251 | |
| 3243 | | for (i = 0; i < 8; i++) |
| 4252 | for (int i = 0; i < 8; i++) |
| 3244 | 4253 | { |
| 3245 | | sel = VEC_EL_2(EL, i); |
| 3246 | | ACCUM_L(i) = VREG_S(VS2REG, sel); |
| 4254 | #if USE_SIMD |
| 4255 | INT16 val; |
| 4256 | SIMD_EXTRACT16(rsp->xv[VS2REG], val, VEC_EL_2(EL, i)); |
| 4257 | SIMD_INSERT16(rsp->accum_l, val, i); |
| 4258 | #else |
| 4259 | ACCUM_L(i) = VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 4260 | #endif |
| 3247 | 4261 | } |
| 3248 | 4262 | } |
| 3249 | 4263 | |
| r24005 | r24006 | |
| 3251 | 4265 | { |
| 3252 | 4266 | rsp_state *rsp = (rsp_state*)param; |
| 3253 | 4267 | int op = rsp->impstate->arg0; |
| 3254 | | int i; |
| 4268 | |
| 3255 | 4269 | // 31 25 24 20 15 10 5 0 |
| 3256 | 4270 | // ------------------------------------------------------ |
| 3257 | 4271 | // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110001 | |
| r24005 | r24006 | |
| 3259 | 4273 | // |
| 3260 | 4274 | // Calculates reciprocal low part |
| 3261 | 4275 | |
| 3262 | | int del = VS1REG & 7; |
| 3263 | | int sel = EL & 7; |
| 3264 | 4276 | INT32 shifter = 0; |
| 3265 | 4277 | |
| 3266 | | INT32 rec = ((UINT16)(VREG_S(VS2REG, sel)) | ((UINT32)(rsp->reciprocal_high) & 0xffff0000)); |
| 4278 | #if USE_SIMD |
| 4279 | UINT16 urec; |
| 4280 | INT32 rec; |
| 4281 | SIMD_EXTRACT16(rsp->xv[VS2REG], urec, EL); |
| 4282 | rec = (INT32)(rsp->reciprocal_high | urec); |
| 4283 | #else |
| 4284 | INT32 rec = ((UINT16)(VREG_S(VS2REG, EL & 7)) | rsp->reciprocal_high); |
| 4285 | #endif |
| 3267 | 4286 | |
| 3268 | 4287 | INT32 datainput = rec; |
| 3269 | 4288 | |
| r24005 | r24006 | |
| 3289 | 4308 | |
| 3290 | 4309 | if (datainput) |
| 3291 | 4310 | { |
| 3292 | | for (i = 0; i < 32; i++) |
| 4311 | for (int i = 0; i < 32; i++) |
| 3293 | 4312 | { |
| 3294 | | if (datainput & (1 << ((~i) & 0x1f)))//?.?.??? 31 - i |
| 4313 | if (datainput & (1 << ((~i) & 0x1f))) |
| 3295 | 4314 | { |
| 3296 | 4315 | shifter = i; |
| 3297 | 4316 | break; |
| r24005 | r24006 | |
| 3330 | 4349 | rsp->reciprocal_res = rec; |
| 3331 | 4350 | rsp->dp_allowed = 0; |
| 3332 | 4351 | |
| 3333 | | W_VREG_S(VDREG, del) = (UINT16)(rec & 0xffff); |
| 4352 | #if USE_SIMD |
| 4353 | SIMD_INSERT16(rsp->xv[VDREG], (UINT16)rec, VS1REG); |
| 4354 | #else |
| 4355 | W_VREG_S(VDREG, VS1REG & 7) = (UINT16)rec; |
| 4356 | #endif |
| 3334 | 4357 | |
| 3335 | | for (i = 0; i < 8; i++) |
| 4358 | for (int i = 0; i < 8; i++) |
| 3336 | 4359 | { |
| 3337 | | sel = VEC_EL_2(EL, i); |
| 3338 | | ACCUM_L(i) = VREG_S(VS2REG, sel); |
| 4360 | #if USE_SIMD |
| 4361 | INT16 val; |
| 4362 | SIMD_EXTRACT16(rsp->xv[VS2REG], val, VEC_EL_2(EL, i)); |
| 4363 | SIMD_INSERT16(rsp->accum_l, val, i); |
| 4364 | #else |
| 4365 | ACCUM_L(i) = VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 4366 | #endif |
| 3339 | 4367 | } |
| 3340 | 4368 | } |
| 3341 | 4369 | |
| r24005 | r24006 | |
| 3350 | 4378 | // |
| 3351 | 4379 | // Calculates reciprocal high part |
| 3352 | 4380 | |
| 3353 | | int del = VS1REG & 7; |
| 3354 | | int sel = EL & 7; |
| 4381 | #if USE_SIMD |
| 4382 | UINT16 rcph; |
| 4383 | SIMD_EXTRACT16(rsp->xv[VS2REG], rcph, EL); |
| 4384 | rsp->reciprocal_high = rcph << 16; |
| 4385 | rsp->dp_allowed = 1; |
| 3355 | 4386 | |
| 3356 | | rsp->reciprocal_high = (VREG_S(VS2REG, sel)) << 16; |
| 4387 | //rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 4388 | INT16 val; |
| 4389 | for (int i = 0; i < 8; i++) |
| 4390 | { |
| 4391 | SIMD_EXTRACT16(rsp->xv[VS2REG], val, VEC_EL_2(EL, i)); |
| 4392 | SIMD_INSERT16(rsp->accum_l, val, i); |
| 4393 | } |
| 4394 | |
| 4395 | SIMD_INSERT16(rsp->xv[VDREG], (INT16)(rsp->reciprocal_res >> 16), VS1REG); |
| 4396 | #else |
| 4397 | rsp->reciprocal_high = (VREG_S(VS2REG, EL & 7)) << 16; |
| 3357 | 4398 | rsp->dp_allowed = 1; |
| 3358 | 4399 | |
| 3359 | 4400 | for (int i = 0; i < 8; i++) |
| 3360 | 4401 | { |
| 3361 | | sel = VEC_EL_2(EL, i); |
| 3362 | | ACCUM_L(i) = VREG_S(VS2REG, sel); |
| 4402 | ACCUM_L(i) = VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 3363 | 4403 | } |
| 3364 | 4404 | |
| 3365 | | W_VREG_S(VDREG, del) = (INT16)(rsp->reciprocal_res >> 16); |
| 4405 | W_VREG_S(VDREG, VS1REG & 7) = (INT16)(rsp->reciprocal_res >> 16); |
| 4406 | #endif |
| 3366 | 4407 | } |
| 3367 | 4408 | |
| 3368 | 4409 | INLINE void cfunc_rsp_vmov(void *param) |
| r24005 | r24006 | |
| 3376 | 4417 | // |
| 3377 | 4418 | // Moves element from vector to destination vector |
| 3378 | 4419 | |
| 3379 | | int del = VS1REG & 7; |
| 3380 | | int sel = EL & 7; |
| 3381 | | |
| 3382 | | W_VREG_S(VDREG, del) = VREG_S(VS2REG, sel); |
| 4420 | #if USE_SIMD |
| 4421 | INT16 val; |
| 4422 | SIMD_EXTRACT16(rsp->xv[VS2REG], val, EL); |
| 4423 | SIMD_INSERT16(rsp->xv[VDREG], val, VS1REG); |
| 4424 | //rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 3383 | 4425 | for (int i = 0; i < 8; i++) |
| 3384 | 4426 | { |
| 3385 | | sel = VEC_EL_2(EL, i); |
| 3386 | | ACCUM_L(i) = VREG_S(VS2REG, sel); |
| 4427 | SIMD_EXTRACT16(rsp->xv[VS2REG], val, VEC_EL_2(EL, i)); |
| 4428 | SIMD_INSERT16(rsp->accum_l, val, i); |
| 3387 | 4429 | } |
| 4430 | #else |
| 4431 | W_VREG_S(VDREG, VS1REG & 7) = VREG_S(VS2REG, EL & 7); |
| 4432 | for (int i = 0; i < 8; i++) |
| 4433 | { |
| 4434 | ACCUM_L(i) = VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 4435 | } |
| 4436 | #endif |
| 3388 | 4437 | } |
| 3389 | 4438 | |
| 3390 | 4439 | INLINE void cfunc_rsp_vrsql(void *param) |
| 3391 | 4440 | { |
| 3392 | 4441 | rsp_state *rsp = (rsp_state*)param; |
| 3393 | 4442 | int op = rsp->impstate->arg0; |
| 3394 | | int i; |
| 4443 | |
| 3395 | 4444 | // 31 25 24 20 15 10 5 0 |
| 3396 | 4445 | // ------------------------------------------------------ |
| 3397 | 4446 | // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110101 | |
| r24005 | r24006 | |
| 3399 | 4448 | // |
| 3400 | 4449 | // Calculates reciprocal square-root low part |
| 3401 | 4450 | |
| 3402 | | int del = VS1REG & 7; |
| 3403 | | int sel = EL & 7; |
| 3404 | 4451 | INT32 shifter = 0; |
| 3405 | | |
| 3406 | | INT32 rec = ((UINT16)(VREG_S(VS2REG, sel)) | ((UINT32)(rsp->reciprocal_high) & 0xffff0000)); |
| 3407 | | |
| 4452 | #if USE_SIMD |
| 4453 | UINT16 val; |
| 4454 | SIMD_EXTRACT16(rsp->xv[VS2REG], val, EL); |
| 4455 | INT32 rec = (INT32)(rsp->reciprocal_high | val); |
| 4456 | #else |
| 4457 | INT32 rec = rsp->reciprocal_high | (UINT16)VREG_S(VS2REG, EL & 7); |
| 4458 | #endif |
| 3408 | 4459 | INT32 datainput = rec; |
| 3409 | 4460 | |
| 3410 | 4461 | if (rec < 0) |
| 3411 | 4462 | { |
| 3412 | 4463 | if (rsp->dp_allowed) |
| 3413 | 4464 | { |
| 3414 | | if (rec < -32768)//VDIV.C,208 |
| 4465 | if (rec < -32768) |
| 3415 | 4466 | { |
| 3416 | 4467 | datainput = ~datainput; |
| 3417 | 4468 | } |
| r24005 | r24006 | |
| 3428 | 4479 | |
| 3429 | 4480 | if (datainput) |
| 3430 | 4481 | { |
| 3431 | | for (i = 0; i < 32; i++) |
| 4482 | for (int i = 0; i < 32; i++) |
| 3432 | 4483 | { |
| 3433 | 4484 | if (datainput & (1 << ((~i) & 0x1f))) |
| 3434 | 4485 | { |
| r24005 | r24006 | |
| 3471 | 4522 | rsp->reciprocal_res = rec; |
| 3472 | 4523 | rsp->dp_allowed = 0; |
| 3473 | 4524 | |
| 3474 | | W_VREG_S(VDREG, del) = (UINT16)(rec & 0xffff); |
| 3475 | | |
| 3476 | | for (i = 0; i < 8; i++) |
| 4525 | #if USE_SIMD |
| 4526 | SIMD_INSERT16(rsp->xv[VDREG], (UINT16)rec, VS1REG); |
| 4527 | //rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 4528 | for (int i = 0; i < 8; i++) |
| 3477 | 4529 | { |
| 3478 | | sel = VEC_EL_2(EL, i); |
| 3479 | | ACCUM_L(i) = VREG_S(VS2REG, sel); |
| 4530 | SIMD_EXTRACT16(rsp->xv[VS2REG], val, VEC_EL_2(EL, i)); |
| 4531 | SIMD_INSERT16(rsp->accum_l, val, i); |
| 3480 | 4532 | } |
| 4533 | #else |
| 4534 | W_VREG_S(VDREG, VS1REG & 7) = (UINT16)(rec & 0xffff); |
| 4535 | for (int i = 0; i < 8; i++) |
| 4536 | { |
| 4537 | ACCUM_L(i) = VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 4538 | } |
| 4539 | #endif |
| 3481 | 4540 | } |
| 3482 | 4541 | |
| 3483 | 4542 | INLINE void cfunc_rsp_vrsqh(void *param) |
| 3484 | 4543 | { |
| 3485 | 4544 | rsp_state *rsp = (rsp_state*)param; |
| 3486 | 4545 | int op = rsp->impstate->arg0; |
| 3487 | | int i; |
| 4546 | |
| 3488 | 4547 | // 31 25 24 20 15 10 5 0 |
| 3489 | 4548 | // ------------------------------------------------------ |
| 3490 | 4549 | // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110110 | |
| r24005 | r24006 | |
| 3492 | 4551 | // |
| 3493 | 4552 | // Calculates reciprocal square-root high part |
| 3494 | 4553 | |
| 3495 | | int del = VS1REG & 7; |
| 3496 | | int sel = EL & 7; |
| 4554 | #if USE_SIMD |
| 4555 | UINT16 val; |
| 4556 | SIMD_EXTRACT16(rsp->xv[VS2REG], val, EL); |
| 4557 | rsp->reciprocal_high = val << 16; |
| 4558 | rsp->dp_allowed = 1; |
| 3497 | 4559 | |
| 3498 | | rsp->reciprocal_high = (VREG_S(VS2REG, sel)) << 16; |
| 4560 | //rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); |
| 4561 | for (int i = 0; i < 8; i++) |
| 4562 | { |
| 4563 | SIMD_EXTRACT16(rsp->xv[VS2REG], val, VEC_EL_2(EL, i)); |
| 4564 | SIMD_INSERT16(rsp->accum_l, val, i); |
| 4565 | } |
| 4566 | |
| 4567 | SIMD_INSERT16(rsp->xv[VDREG], (UINT16)(rsp->reciprocal_res >> 16), VS1REG); // store high part |
| 4568 | #else |
| 4569 | rsp->reciprocal_high = (VREG_S(VS2REG, EL & 7)) << 16; |
| 3499 | 4570 | rsp->dp_allowed = 1; |
| 3500 | 4571 | |
| 3501 | | for (i=0; i < 8; i++) |
| 4572 | for (int i = 0; i < 8; i++) |
| 3502 | 4573 | { |
| 3503 | | sel = VEC_EL_2(EL, i); |
| 3504 | | ACCUM_L(i) = VREG_S(VS2REG, sel); |
| 4574 | ACCUM_L(i) = VREG_S(VS2REG, VEC_EL_2(EL, i)); |
| 3505 | 4575 | } |
| 3506 | 4576 | |
| 3507 | | W_VREG_S(VDREG, del) = (INT16)(rsp->reciprocal_res >> 16); // store high part |
| 4577 | W_VREG_S(VDREG, VS1REG & 7) = (INT16)(rsp->reciprocal_res >> 16); // store high part |
| 4578 | #endif |
| 3508 | 4579 | } |
| 3509 | 4580 | |
| 3510 | 4581 | static void cfunc_sp_set_status_cb(void *param) |
| r24005 | r24006 | |
| 4781 | 5852 | rsp_state *rsp = (rsp_state*)param; |
| 4782 | 5853 | UINT32 op = rsp->impstate->arg0; |
| 4783 | 5854 | int el = (op >> 7) & 0xf; |
| 5855 | #if USE_SIMD |
| 5856 | UINT16 w; |
| 5857 | SIMD_EXTRACT16(rsp->xv[VS1REG], w, el >> 1); |
| 5858 | rsp->r[RTREG] = (INT32)(INT16)w; |
| 5859 | #else |
| 4784 | 5860 | UINT16 b1 = VREG_B(VS1REG, (el+0) & 0xf); |
| 4785 | 5861 | UINT16 b2 = VREG_B(VS1REG, (el+1) & 0xf); |
| 4786 | 5862 | if (RTREG) RTVAL = (INT32)(INT16)((b1 << 8) | (b2)); |
| 5863 | #endif |
| 4787 | 5864 | } |
| 4788 | 5865 | |
| 4789 | 5866 | static void cfunc_cfc2(void *param) |
| r24005 | r24006 | |
| 4810 | 5887 | rsp_state *rsp = (rsp_state*)param; |
| 4811 | 5888 | UINT32 op = rsp->impstate->arg0; |
| 4812 | 5889 | int el = (op >> 7) & 0xf; |
| 5890 | #if USE_SIMD |
| 5891 | SIMD_INSERT16(rsp->xv[VS1REG], RTVAL, el >> 1); |
| 5892 | #else |
| 4813 | 5893 | VREG_B(VS1REG, (el+0) & 0xf) = (RTVAL >> 8) & 0xff; |
| 4814 | 5894 | VREG_B(VS1REG, (el+1) & 0xf) = (RTVAL >> 0) & 0xff; |
| 5895 | #endif |
| 4815 | 5896 | } |
| 4816 | 5897 | |
| 4817 | 5898 | static void cfunc_ctc2(void *param) |
| r24005 | r24006 | |
| 5013 | 6094 | case CPUINFO_STR_REGISTER + RSP_R29: sprintf(info->s, "R29: %08X", rsp->r[29]); break; |
| 5014 | 6095 | case CPUINFO_STR_REGISTER + RSP_R30: sprintf(info->s, "R30: %08X", rsp->r[30]); break; |
| 5015 | 6096 | case CPUINFO_STR_REGISTER + RSP_R31: sprintf(info->s, "R31: %08X", rsp->r[31]); break; |
| 6097 | |
| 6098 | #if USE_SIMD |
| 6099 | case CPUINFO_STR_REGISTER + RSP_V0: sprintf(info->s, "V0: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[ 0], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 0], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 0], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 0], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 0], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 0], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 0], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 0], 0)); break; |
| 6100 | case CPUINFO_STR_REGISTER + RSP_V1: sprintf(info->s, "V1: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[ 1], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 1], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 1], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 1], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 1], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 1], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 1], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 1], 0)); break; |
| 6101 | case CPUINFO_STR_REGISTER + RSP_V2: sprintf(info->s, "V2: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[ 2], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 2], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 2], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 2], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 2], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 2], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 2], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 2], 0)); break; |
| 6102 | case CPUINFO_STR_REGISTER + RSP_V3: sprintf(info->s, "V3: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[ 3], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 3], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 3], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 3], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 3], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 3], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 3], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 3], 0)); break; |
| 6103 | case CPUINFO_STR_REGISTER + RSP_V4: sprintf(info->s, "V4: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[ 4], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 4], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 4], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 4], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 4], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 4], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 4], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 4], 0)); break; |
| 6104 | case CPUINFO_STR_REGISTER + RSP_V5: sprintf(info->s, "V5: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[ 5], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 5], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 5], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 5], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 5], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 5], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 5], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 5], 0)); break; |
| 6105 | case CPUINFO_STR_REGISTER + RSP_V6: sprintf(info->s, "V6: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[ 6], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 6], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 6], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 6], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 6], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 6], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 6], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 6], 0)); break; |
| 6106 | case CPUINFO_STR_REGISTER + RSP_V7: sprintf(info->s, "V7: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[ 7], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 7], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 7], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 7], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 7], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 7], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 7], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 7], 0)); break; |
| 6107 | case CPUINFO_STR_REGISTER + RSP_V8: sprintf(info->s, "V8: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[ 8], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 8], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 8], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 8], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 8], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 8], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 8], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 8], 0)); break; |
| 6108 | case CPUINFO_STR_REGISTER + RSP_V9: sprintf(info->s, "V9: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[ 9], 7), (UINT16)_mm_extract_epi16(rsp->xv[ 9], 6), (UINT16)_mm_extract_epi16(rsp->xv[ 9], 5), (UINT16)_mm_extract_epi16(rsp->xv[ 9], 4), (UINT16)_mm_extract_epi16(rsp->xv[ 9], 3), (UINT16)_mm_extract_epi16(rsp->xv[ 9], 2), (UINT16)_mm_extract_epi16(rsp->xv[ 9], 1), (UINT16)_mm_extract_epi16(rsp->xv[ 9], 0)); break; |
| 6109 | case CPUINFO_STR_REGISTER + RSP_V10: sprintf(info->s, "V10: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[10], 7), (UINT16)_mm_extract_epi16(rsp->xv[10], 6), (UINT16)_mm_extract_epi16(rsp->xv[10], 5), (UINT16)_mm_extract_epi16(rsp->xv[10], 4), (UINT16)_mm_extract_epi16(rsp->xv[10], 3), (UINT16)_mm_extract_epi16(rsp->xv[10], 2), (UINT16)_mm_extract_epi16(rsp->xv[10], 1), (UINT16)_mm_extract_epi16(rsp->xv[10], 0)); break; |
| 6110 | case CPUINFO_STR_REGISTER + RSP_V11: sprintf(info->s, "V11: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[11], 7), (UINT16)_mm_extract_epi16(rsp->xv[11], 6), (UINT16)_mm_extract_epi16(rsp->xv[11], 5), (UINT16)_mm_extract_epi16(rsp->xv[11], 4), (UINT16)_mm_extract_epi16(rsp->xv[11], 3), (UINT16)_mm_extract_epi16(rsp->xv[11], 2), (UINT16)_mm_extract_epi16(rsp->xv[11], 1), (UINT16)_mm_extract_epi16(rsp->xv[11], 0)); break; |
| 6111 | case CPUINFO_STR_REGISTER + RSP_V12: sprintf(info->s, "V12: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[12], 7), (UINT16)_mm_extract_epi16(rsp->xv[12], 6), (UINT16)_mm_extract_epi16(rsp->xv[12], 5), (UINT16)_mm_extract_epi16(rsp->xv[12], 4), (UINT16)_mm_extract_epi16(rsp->xv[12], 3), (UINT16)_mm_extract_epi16(rsp->xv[12], 2), (UINT16)_mm_extract_epi16(rsp->xv[12], 1), (UINT16)_mm_extract_epi16(rsp->xv[12], 0)); break; |
| 6112 | case CPUINFO_STR_REGISTER + RSP_V13: sprintf(info->s, "V13: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[13], 7), (UINT16)_mm_extract_epi16(rsp->xv[13], 6), (UINT16)_mm_extract_epi16(rsp->xv[13], 5), (UINT16)_mm_extract_epi16(rsp->xv[13], 4), (UINT16)_mm_extract_epi16(rsp->xv[13], 3), (UINT16)_mm_extract_epi16(rsp->xv[13], 2), (UINT16)_mm_extract_epi16(rsp->xv[13], 1), (UINT16)_mm_extract_epi16(rsp->xv[13], 0)); break; |
| 6113 | case CPUINFO_STR_REGISTER + RSP_V14: sprintf(info->s, "V14: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[14], 7), (UINT16)_mm_extract_epi16(rsp->xv[14], 6), (UINT16)_mm_extract_epi16(rsp->xv[14], 5), (UINT16)_mm_extract_epi16(rsp->xv[14], 4), (UINT16)_mm_extract_epi16(rsp->xv[14], 3), (UINT16)_mm_extract_epi16(rsp->xv[14], 2), (UINT16)_mm_extract_epi16(rsp->xv[14], 1), (UINT16)_mm_extract_epi16(rsp->xv[14], 0)); break; |
| 6114 | case CPUINFO_STR_REGISTER + RSP_V15: sprintf(info->s, "V15: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[15], 7), (UINT16)_mm_extract_epi16(rsp->xv[15], 6), (UINT16)_mm_extract_epi16(rsp->xv[15], 5), (UINT16)_mm_extract_epi16(rsp->xv[15], 4), (UINT16)_mm_extract_epi16(rsp->xv[15], 3), (UINT16)_mm_extract_epi16(rsp->xv[15], 2), (UINT16)_mm_extract_epi16(rsp->xv[15], 1), (UINT16)_mm_extract_epi16(rsp->xv[15], 0)); break; |
| 6115 | case CPUINFO_STR_REGISTER + RSP_V16: sprintf(info->s, "V16: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[16], 7), (UINT16)_mm_extract_epi16(rsp->xv[16], 6), (UINT16)_mm_extract_epi16(rsp->xv[16], 5), (UINT16)_mm_extract_epi16(rsp->xv[16], 4), (UINT16)_mm_extract_epi16(rsp->xv[16], 3), (UINT16)_mm_extract_epi16(rsp->xv[16], 2), (UINT16)_mm_extract_epi16(rsp->xv[16], 1), (UINT16)_mm_extract_epi16(rsp->xv[16], 0)); break; |
| 6116 | case CPUINFO_STR_REGISTER + RSP_V17: sprintf(info->s, "V17: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[17], 7), (UINT16)_mm_extract_epi16(rsp->xv[17], 6), (UINT16)_mm_extract_epi16(rsp->xv[17], 5), (UINT16)_mm_extract_epi16(rsp->xv[17], 4), (UINT16)_mm_extract_epi16(rsp->xv[17], 3), (UINT16)_mm_extract_epi16(rsp->xv[17], 2), (UINT16)_mm_extract_epi16(rsp->xv[17], 1), (UINT16)_mm_extract_epi16(rsp->xv[17], 0)); break; |
| 6117 | case CPUINFO_STR_REGISTER + RSP_V18: sprintf(info->s, "V18: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[18], 7), (UINT16)_mm_extract_epi16(rsp->xv[18], 6), (UINT16)_mm_extract_epi16(rsp->xv[18], 5), (UINT16)_mm_extract_epi16(rsp->xv[18], 4), (UINT16)_mm_extract_epi16(rsp->xv[18], 3), (UINT16)_mm_extract_epi16(rsp->xv[18], 2), (UINT16)_mm_extract_epi16(rsp->xv[18], 1), (UINT16)_mm_extract_epi16(rsp->xv[18], 0)); break; |
| 6118 | case CPUINFO_STR_REGISTER + RSP_V19: sprintf(info->s, "V19: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[19], 7), (UINT16)_mm_extract_epi16(rsp->xv[19], 6), (UINT16)_mm_extract_epi16(rsp->xv[19], 5), (UINT16)_mm_extract_epi16(rsp->xv[19], 4), (UINT16)_mm_extract_epi16(rsp->xv[19], 3), (UINT16)_mm_extract_epi16(rsp->xv[19], 2), (UINT16)_mm_extract_epi16(rsp->xv[19], 1), (UINT16)_mm_extract_epi16(rsp->xv[19], 0)); break; |
| 6119 | case CPUINFO_STR_REGISTER + RSP_V20: sprintf(info->s, "V20: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[20], 7), (UINT16)_mm_extract_epi16(rsp->xv[20], 6), (UINT16)_mm_extract_epi16(rsp->xv[20], 5), (UINT16)_mm_extract_epi16(rsp->xv[20], 4), (UINT16)_mm_extract_epi16(rsp->xv[20], 3), (UINT16)_mm_extract_epi16(rsp->xv[20], 2), (UINT16)_mm_extract_epi16(rsp->xv[20], 1), (UINT16)_mm_extract_epi16(rsp->xv[20], 0)); break; |
| 6120 | case CPUINFO_STR_REGISTER + RSP_V21: sprintf(info->s, "V21: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[21], 7), (UINT16)_mm_extract_epi16(rsp->xv[21], 6), (UINT16)_mm_extract_epi16(rsp->xv[21], 5), (UINT16)_mm_extract_epi16(rsp->xv[21], 4), (UINT16)_mm_extract_epi16(rsp->xv[21], 3), (UINT16)_mm_extract_epi16(rsp->xv[21], 2), (UINT16)_mm_extract_epi16(rsp->xv[21], 1), (UINT16)_mm_extract_epi16(rsp->xv[21], 0)); break; |
| 6121 | case CPUINFO_STR_REGISTER + RSP_V22: sprintf(info->s, "V22: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[22], 7), (UINT16)_mm_extract_epi16(rsp->xv[22], 6), (UINT16)_mm_extract_epi16(rsp->xv[22], 5), (UINT16)_mm_extract_epi16(rsp->xv[22], 4), (UINT16)_mm_extract_epi16(rsp->xv[22], 3), (UINT16)_mm_extract_epi16(rsp->xv[22], 2), (UINT16)_mm_extract_epi16(rsp->xv[22], 1), (UINT16)_mm_extract_epi16(rsp->xv[22], 0)); break; |
| 6122 | case CPUINFO_STR_REGISTER + RSP_V23: sprintf(info->s, "V23: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[23], 7), (UINT16)_mm_extract_epi16(rsp->xv[23], 6), (UINT16)_mm_extract_epi16(rsp->xv[23], 5), (UINT16)_mm_extract_epi16(rsp->xv[23], 4), (UINT16)_mm_extract_epi16(rsp->xv[23], 3), (UINT16)_mm_extract_epi16(rsp->xv[23], 2), (UINT16)_mm_extract_epi16(rsp->xv[23], 1), (UINT16)_mm_extract_epi16(rsp->xv[23], 0)); break; |
| 6123 | case CPUINFO_STR_REGISTER + RSP_V24: sprintf(info->s, "V24: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[24], 7), (UINT16)_mm_extract_epi16(rsp->xv[24], 6), (UINT16)_mm_extract_epi16(rsp->xv[24], 5), (UINT16)_mm_extract_epi16(rsp->xv[24], 4), (UINT16)_mm_extract_epi16(rsp->xv[24], 3), (UINT16)_mm_extract_epi16(rsp->xv[24], 2), (UINT16)_mm_extract_epi16(rsp->xv[24], 1), (UINT16)_mm_extract_epi16(rsp->xv[24], 0)); break; |
| 6124 | case CPUINFO_STR_REGISTER + RSP_V25: sprintf(info->s, "V25: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[25], 7), (UINT16)_mm_extract_epi16(rsp->xv[25], 6), (UINT16)_mm_extract_epi16(rsp->xv[25], 5), (UINT16)_mm_extract_epi16(rsp->xv[25], 4), (UINT16)_mm_extract_epi16(rsp->xv[25], 3), (UINT16)_mm_extract_epi16(rsp->xv[25], 2), (UINT16)_mm_extract_epi16(rsp->xv[25], 1), (UINT16)_mm_extract_epi16(rsp->xv[25], 0)); break; |
| 6125 | case CPUINFO_STR_REGISTER + RSP_V26: sprintf(info->s, "V26: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[26], 7), (UINT16)_mm_extract_epi16(rsp->xv[26], 6), (UINT16)_mm_extract_epi16(rsp->xv[26], 5), (UINT16)_mm_extract_epi16(rsp->xv[26], 4), (UINT16)_mm_extract_epi16(rsp->xv[26], 3), (UINT16)_mm_extract_epi16(rsp->xv[26], 2), (UINT16)_mm_extract_epi16(rsp->xv[26], 1), (UINT16)_mm_extract_epi16(rsp->xv[26], 0)); break; |
| 6126 | case CPUINFO_STR_REGISTER + RSP_V27: sprintf(info->s, "V27: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[27], 7), (UINT16)_mm_extract_epi16(rsp->xv[27], 6), (UINT16)_mm_extract_epi16(rsp->xv[27], 5), (UINT16)_mm_extract_epi16(rsp->xv[27], 4), (UINT16)_mm_extract_epi16(rsp->xv[27], 3), (UINT16)_mm_extract_epi16(rsp->xv[27], 2), (UINT16)_mm_extract_epi16(rsp->xv[27], 1), (UINT16)_mm_extract_epi16(rsp->xv[27], 0)); break; |
| 6127 | case CPUINFO_STR_REGISTER + RSP_V28: sprintf(info->s, "V28: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[28], 7), (UINT16)_mm_extract_epi16(rsp->xv[28], 6), (UINT16)_mm_extract_epi16(rsp->xv[28], 5), (UINT16)_mm_extract_epi16(rsp->xv[28], 4), (UINT16)_mm_extract_epi16(rsp->xv[28], 3), (UINT16)_mm_extract_epi16(rsp->xv[28], 2), (UINT16)_mm_extract_epi16(rsp->xv[28], 1), (UINT16)_mm_extract_epi16(rsp->xv[28], 0)); break; |
| 6128 | case CPUINFO_STR_REGISTER + RSP_V29: sprintf(info->s, "V29: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[29], 7), (UINT16)_mm_extract_epi16(rsp->xv[29], 6), (UINT16)_mm_extract_epi16(rsp->xv[29], 5), (UINT16)_mm_extract_epi16(rsp->xv[29], 4), (UINT16)_mm_extract_epi16(rsp->xv[29], 3), (UINT16)_mm_extract_epi16(rsp->xv[29], 2), (UINT16)_mm_extract_epi16(rsp->xv[29], 1), (UINT16)_mm_extract_epi16(rsp->xv[29], 0)); break; |
| 6129 | case CPUINFO_STR_REGISTER + RSP_V30: sprintf(info->s, "V30: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[30], 7), (UINT16)_mm_extract_epi16(rsp->xv[30], 6), (UINT16)_mm_extract_epi16(rsp->xv[30], 5), (UINT16)_mm_extract_epi16(rsp->xv[30], 4), (UINT16)_mm_extract_epi16(rsp->xv[30], 3), (UINT16)_mm_extract_epi16(rsp->xv[30], 2), (UINT16)_mm_extract_epi16(rsp->xv[30], 1), (UINT16)_mm_extract_epi16(rsp->xv[30], 0)); break; |
| 6130 | case CPUINFO_STR_REGISTER + RSP_V31: sprintf(info->s, "V31: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)_mm_extract_epi16(rsp->xv[31], 7), (UINT16)_mm_extract_epi16(rsp->xv[31], 6), (UINT16)_mm_extract_epi16(rsp->xv[31], 5), (UINT16)_mm_extract_epi16(rsp->xv[31], 4), (UINT16)_mm_extract_epi16(rsp->xv[31], 3), (UINT16)_mm_extract_epi16(rsp->xv[31], 2), (UINT16)_mm_extract_epi16(rsp->xv[31], 1), (UINT16)_mm_extract_epi16(rsp->xv[31], 0)); break; |
| 6131 | #else |
| 6132 | case CPUINFO_STR_REGISTER + RSP_V0: sprintf(info->s, "V0: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S( 0, 0), (UINT16)VREG_S( 0, 1), (UINT16)VREG_S( 0, 2), (UINT16)VREG_S( 0, 3), (UINT16)VREG_S( 0, 4), (UINT16)VREG_S( 0, 5), (UINT16)VREG_S( 0, 6), (UINT16)VREG_S( 0, 7)); break; |
| 6133 | case CPUINFO_STR_REGISTER + RSP_V1: sprintf(info->s, "V1: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S( 1, 0), (UINT16)VREG_S( 1, 1), (UINT16)VREG_S( 1, 2), (UINT16)VREG_S( 1, 3), (UINT16)VREG_S( 1, 4), (UINT16)VREG_S( 1, 5), (UINT16)VREG_S( 1, 6), (UINT16)VREG_S( 1, 7)); break; |
| 6134 | case CPUINFO_STR_REGISTER + RSP_V2: sprintf(info->s, "V2: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S( 2, 0), (UINT16)VREG_S( 2, 1), (UINT16)VREG_S( 2, 2), (UINT16)VREG_S( 2, 3), (UINT16)VREG_S( 2, 4), (UINT16)VREG_S( 2, 5), (UINT16)VREG_S( 2, 6), (UINT16)VREG_S( 2, 7)); break; |
| 6135 | case CPUINFO_STR_REGISTER + RSP_V3: sprintf(info->s, "V3: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S( 3, 0), (UINT16)VREG_S( 3, 1), (UINT16)VREG_S( 3, 2), (UINT16)VREG_S( 3, 3), (UINT16)VREG_S( 3, 4), (UINT16)VREG_S( 3, 5), (UINT16)VREG_S( 3, 6), (UINT16)VREG_S( 3, 7)); break; |
| 6136 | case CPUINFO_STR_REGISTER + RSP_V4: sprintf(info->s, "V4: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S( 4, 0), (UINT16)VREG_S( 4, 1), (UINT16)VREG_S( 4, 2), (UINT16)VREG_S( 4, 3), (UINT16)VREG_S( 4, 4), (UINT16)VREG_S( 4, 5), (UINT16)VREG_S( 4, 6), (UINT16)VREG_S( 4, 7)); break; |
| 6137 | case CPUINFO_STR_REGISTER + RSP_V5: sprintf(info->s, "V5: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S( 5, 0), (UINT16)VREG_S( 5, 1), (UINT16)VREG_S( 5, 2), (UINT16)VREG_S( 5, 3), (UINT16)VREG_S( 5, 4), (UINT16)VREG_S( 5, 5), (UINT16)VREG_S( 5, 6), (UINT16)VREG_S( 5, 7)); break; |
| 6138 | case CPUINFO_STR_REGISTER + RSP_V6: sprintf(info->s, "V6: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S( 6, 0), (UINT16)VREG_S( 6, 1), (UINT16)VREG_S( 6, 2), (UINT16)VREG_S( 6, 3), (UINT16)VREG_S( 6, 4), (UINT16)VREG_S( 6, 5), (UINT16)VREG_S( 6, 6), (UINT16)VREG_S( 6, 7)); break; |
| 6139 | case CPUINFO_STR_REGISTER + RSP_V7: sprintf(info->s, "V7: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S( 7, 0), (UINT16)VREG_S( 7, 1), (UINT16)VREG_S( 7, 2), (UINT16)VREG_S( 7, 3), (UINT16)VREG_S( 7, 4), (UINT16)VREG_S( 7, 5), (UINT16)VREG_S( 7, 6), (UINT16)VREG_S( 7, 7)); break; |
| 6140 | case CPUINFO_STR_REGISTER + RSP_V8: sprintf(info->s, "V8: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S( 8, 0), (UINT16)VREG_S( 8, 1), (UINT16)VREG_S( 8, 2), (UINT16)VREG_S( 8, 3), (UINT16)VREG_S( 8, 4), (UINT16)VREG_S( 8, 5), (UINT16)VREG_S( 8, 6), (UINT16)VREG_S( 8, 7)); break; |
| 6141 | case CPUINFO_STR_REGISTER + RSP_V9: sprintf(info->s, "V9: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S( 9, 0), (UINT16)VREG_S( 9, 1), (UINT16)VREG_S( 9, 2), (UINT16)VREG_S( 9, 3), (UINT16)VREG_S( 9, 4), (UINT16)VREG_S( 9, 5), (UINT16)VREG_S( 9, 6), (UINT16)VREG_S( 9, 7)); break; |
| 6142 | case CPUINFO_STR_REGISTER + RSP_V10: sprintf(info->s, "V10: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(10, 0), (UINT16)VREG_S(10, 1), (UINT16)VREG_S(10, 2), (UINT16)VREG_S(10, 3), (UINT16)VREG_S(10, 4), (UINT16)VREG_S(10, 5), (UINT16)VREG_S(10, 6), (UINT16)VREG_S(10, 7)); break; |
| 6143 | case CPUINFO_STR_REGISTER + RSP_V11: sprintf(info->s, "V11: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(11, 0), (UINT16)VREG_S(11, 1), (UINT16)VREG_S(11, 2), (UINT16)VREG_S(11, 3), (UINT16)VREG_S(11, 4), (UINT16)VREG_S(11, 5), (UINT16)VREG_S(11, 6), (UINT16)VREG_S(11, 7)); break; |
| 6144 | case CPUINFO_STR_REGISTER + RSP_V12: sprintf(info->s, "V12: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(12, 0), (UINT16)VREG_S(12, 1), (UINT16)VREG_S(12, 2), (UINT16)VREG_S(12, 3), (UINT16)VREG_S(12, 4), (UINT16)VREG_S(12, 5), (UINT16)VREG_S(12, 6), (UINT16)VREG_S(12, 7)); break; |
| 6145 | case CPUINFO_STR_REGISTER + RSP_V13: sprintf(info->s, "V13: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(13, 0), (UINT16)VREG_S(13, 1), (UINT16)VREG_S(13, 2), (UINT16)VREG_S(13, 3), (UINT16)VREG_S(13, 4), (UINT16)VREG_S(13, 5), (UINT16)VREG_S(13, 6), (UINT16)VREG_S(13, 7)); break; |
| 6146 | case CPUINFO_STR_REGISTER + RSP_V14: sprintf(info->s, "V14: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(14, 0), (UINT16)VREG_S(14, 1), (UINT16)VREG_S(14, 2), (UINT16)VREG_S(14, 3), (UINT16)VREG_S(14, 4), (UINT16)VREG_S(14, 5), (UINT16)VREG_S(14, 6), (UINT16)VREG_S(14, 7)); break; |
| 6147 | case CPUINFO_STR_REGISTER + RSP_V15: sprintf(info->s, "V15: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(15, 0), (UINT16)VREG_S(15, 1), (UINT16)VREG_S(15, 2), (UINT16)VREG_S(15, 3), (UINT16)VREG_S(15, 4), (UINT16)VREG_S(15, 5), (UINT16)VREG_S(15, 6), (UINT16)VREG_S(15, 7)); break; |
| 6148 | case CPUINFO_STR_REGISTER + RSP_V16: sprintf(info->s, "V16: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(16, 0), (UINT16)VREG_S(16, 1), (UINT16)VREG_S(16, 2), (UINT16)VREG_S(16, 3), (UINT16)VREG_S(16, 4), (UINT16)VREG_S(16, 5), (UINT16)VREG_S(16, 6), (UINT16)VREG_S(16, 7)); break; |
| 6149 | case CPUINFO_STR_REGISTER + RSP_V17: sprintf(info->s, "V17: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(17, 0), (UINT16)VREG_S(17, 1), (UINT16)VREG_S(17, 2), (UINT16)VREG_S(17, 3), (UINT16)VREG_S(17, 4), (UINT16)VREG_S(17, 5), (UINT16)VREG_S(17, 6), (UINT16)VREG_S(17, 7)); break; |
| 6150 | case CPUINFO_STR_REGISTER + RSP_V18: sprintf(info->s, "V18: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(18, 0), (UINT16)VREG_S(18, 1), (UINT16)VREG_S(18, 2), (UINT16)VREG_S(18, 3), (UINT16)VREG_S(18, 4), (UINT16)VREG_S(18, 5), (UINT16)VREG_S(18, 6), (UINT16)VREG_S(18, 7)); break; |
| 6151 | case CPUINFO_STR_REGISTER + RSP_V19: sprintf(info->s, "V19: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(19, 0), (UINT16)VREG_S(19, 1), (UINT16)VREG_S(19, 2), (UINT16)VREG_S(19, 3), (UINT16)VREG_S(19, 4), (UINT16)VREG_S(19, 5), (UINT16)VREG_S(19, 6), (UINT16)VREG_S(19, 7)); break; |
| 6152 | case CPUINFO_STR_REGISTER + RSP_V20: sprintf(info->s, "V20: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(20, 0), (UINT16)VREG_S(20, 1), (UINT16)VREG_S(20, 2), (UINT16)VREG_S(20, 3), (UINT16)VREG_S(20, 4), (UINT16)VREG_S(20, 5), (UINT16)VREG_S(20, 6), (UINT16)VREG_S(20, 7)); break; |
| 6153 | case CPUINFO_STR_REGISTER + RSP_V21: sprintf(info->s, "V21: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(21, 0), (UINT16)VREG_S(21, 1), (UINT16)VREG_S(21, 2), (UINT16)VREG_S(21, 3), (UINT16)VREG_S(21, 4), (UINT16)VREG_S(21, 5), (UINT16)VREG_S(21, 6), (UINT16)VREG_S(21, 7)); break; |
| 6154 | case CPUINFO_STR_REGISTER + RSP_V22: sprintf(info->s, "V22: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(22, 0), (UINT16)VREG_S(22, 1), (UINT16)VREG_S(22, 2), (UINT16)VREG_S(22, 3), (UINT16)VREG_S(22, 4), (UINT16)VREG_S(22, 5), (UINT16)VREG_S(22, 6), (UINT16)VREG_S(22, 7)); break; |
| 6155 | case CPUINFO_STR_REGISTER + RSP_V23: sprintf(info->s, "V23: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(23, 0), (UINT16)VREG_S(23, 1), (UINT16)VREG_S(23, 2), (UINT16)VREG_S(23, 3), (UINT16)VREG_S(23, 4), (UINT16)VREG_S(23, 5), (UINT16)VREG_S(23, 6), (UINT16)VREG_S(23, 7)); break; |
| 6156 | case CPUINFO_STR_REGISTER + RSP_V24: sprintf(info->s, "V24: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(24, 0), (UINT16)VREG_S(24, 1), (UINT16)VREG_S(24, 2), (UINT16)VREG_S(24, 3), (UINT16)VREG_S(24, 4), (UINT16)VREG_S(24, 5), (UINT16)VREG_S(24, 6), (UINT16)VREG_S(24, 7)); break; |
| 6157 | case CPUINFO_STR_REGISTER + RSP_V25: sprintf(info->s, "V25: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(25, 0), (UINT16)VREG_S(25, 1), (UINT16)VREG_S(25, 2), (UINT16)VREG_S(25, 3), (UINT16)VREG_S(25, 4), (UINT16)VREG_S(25, 5), (UINT16)VREG_S(25, 6), (UINT16)VREG_S(25, 7)); break; |
| 6158 | case CPUINFO_STR_REGISTER + RSP_V26: sprintf(info->s, "V26: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(26, 0), (UINT16)VREG_S(26, 1), (UINT16)VREG_S(26, 2), (UINT16)VREG_S(26, 3), (UINT16)VREG_S(26, 4), (UINT16)VREG_S(26, 5), (UINT16)VREG_S(26, 6), (UINT16)VREG_S(26, 7)); break; |
| 6159 | case CPUINFO_STR_REGISTER + RSP_V27: sprintf(info->s, "V27: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(27, 0), (UINT16)VREG_S(27, 1), (UINT16)VREG_S(27, 2), (UINT16)VREG_S(27, 3), (UINT16)VREG_S(27, 4), (UINT16)VREG_S(27, 5), (UINT16)VREG_S(27, 6), (UINT16)VREG_S(27, 7)); break; |
| 6160 | case CPUINFO_STR_REGISTER + RSP_V28: sprintf(info->s, "V28: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(28, 0), (UINT16)VREG_S(28, 1), (UINT16)VREG_S(28, 2), (UINT16)VREG_S(28, 3), (UINT16)VREG_S(28, 4), (UINT16)VREG_S(28, 5), (UINT16)VREG_S(28, 6), (UINT16)VREG_S(28, 7)); break; |
| 6161 | case CPUINFO_STR_REGISTER + RSP_V29: sprintf(info->s, "V29: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(29, 0), (UINT16)VREG_S(29, 1), (UINT16)VREG_S(29, 2), (UINT16)VREG_S(29, 3), (UINT16)VREG_S(29, 4), (UINT16)VREG_S(29, 5), (UINT16)VREG_S(29, 6), (UINT16)VREG_S(29, 7)); break; |
| 6162 | case CPUINFO_STR_REGISTER + RSP_V30: sprintf(info->s, "V30: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(30, 0), (UINT16)VREG_S(30, 1), (UINT16)VREG_S(30, 2), (UINT16)VREG_S(30, 3), (UINT16)VREG_S(30, 4), (UINT16)VREG_S(30, 5), (UINT16)VREG_S(30, 6), (UINT16)VREG_S(30, 7)); break; |
| 6163 | case CPUINFO_STR_REGISTER + RSP_V31: sprintf(info->s, "V31: %04X|%04X|%04X|%04X|%04X|%04X|%04X|%04X", (UINT16)VREG_S(31, 0), (UINT16)VREG_S(31, 1), (UINT16)VREG_S(31, 2), (UINT16)VREG_S(31, 3), (UINT16)VREG_S(31, 4), (UINT16)VREG_S(31, 5), (UINT16)VREG_S(31, 6), (UINT16)VREG_S(31, 7)); break; |
| 6164 | #endif |
| 5016 | 6165 | case CPUINFO_STR_REGISTER + RSP_SR: sprintf(info->s, "SR: %08X", rsp->sr); break; |
| 5017 | 6166 | case CPUINFO_STR_REGISTER + RSP_NEXTPC: sprintf(info->s, "NPC: %08X", rsp->nextpc);break; |
| 5018 | 6167 | case CPUINFO_STR_REGISTER + RSP_STEPCNT: sprintf(info->s, "STEP: %d", rsp->step_count); break; |