/*
 * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
| 8 | |
#include <linux/linkage.h>

/*
 * Endianness helpers for the unaligned-source paths.
 *
 * SHIFT_1/SHIFT_2 split a freshly loaded source word: SHIFT_1 yields the
 * part that completes the current destination word, SHIFT_2 yields the
 * part that is carried over to the next one.
 * MERGE_1/MERGE_2 build the initial 3-byte carry for the "off by 1" case
 * and EXTRACT_1/EXTRACT_2 take it apart again for the final stores.
 */
#ifdef __LITTLE_ENDIAN__
# define SHIFT_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define SHIFT_2(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM
# define MERGE_2(RX,RY,IMM)
# define EXTRACT_1(RX,RY,IMM)	and	RX, RY, 0xFFFF
# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, IMM
#else
# define SHIFT_1(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
# define SHIFT_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define MERGE_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define EXTRACT_1(RX,RY,IMM)	lsr	RX, RY, IMM
# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, 0x08
#endif

/*
 * Aligned-copy building blocks. The main loop issues four LOADX/STOREX
 * pairs per iteration, i.e. (1 << ZOLSHFT) bytes; ZOLAND masks out the
 * bytes left over for the byte-wise tail loop.
 *
 * No PREFETCH/PREFETCHW is issued anywhere in this file: a write prefetch
 * ahead of the store pointer is not bounded by the buffer length, so it
 * could claim and dirty a cache line that lies past the end of the
 * destination (harmful if that memory is used for DMA).
 */
#ifdef CONFIG_ARC_HAS_LL64
# define LOADX(DST,RX)		ldd.ab	DST, [RX, 8]
# define STOREX(SRC,RX)		std.ab	SRC, [RX, 8]
# define ZOLSHFT		5
# define ZOLAND			0x1F
#else
# define LOADX(DST,RX)		ld.ab	DST, [RX, 4]
# define STOREX(SRC,RX)		st.ab	SRC, [RX, 4]
# define ZOLSHFT		4
# define ZOLAND			0xF
#endif

/*
 * memcpy - copy r2 bytes from [r1] to [r0]
 *
 * In:    r0 = destination, r1 = source, r2 = byte count
 * Out:   r0 = destination (never written; r3 is the running store pointer)
 * Uses:  r1, r2, r3, r4-r10 (with CONFIG_ARC_HAS_LL64 the 64-bit
 *        loads/stores presumably operate on register pairs, so r11 is
 *        touched as well), lp_count and the status flags.
 *
 * Strategy:
 *   - size == 0: return immediately.
 *   - size <= 8: plain byte loop.
 *   - otherwise byte-copy until the destination is 32-bit aligned, then
 *     pick one of four paths based on the source alignment (0..3).
 *     The unaligned paths keep the not-yet-stored source bytes in r5
 *     and merge them with each newly loaded word.
 *
 * Instruction order matters: several instructions sit in branch delay
 * slots (".d") and flag-setting results are consumed later by "lpnz" and
 * by conditionally executed shifts.
 */
ENTRY(memcpy)
	mov.f	0, r2			; only sets the flags from size
;;; if size is zero
	jz.d	[blink]
	mov	r3, r0			; delay slot: don't clobber ret val

;;; if size <= 8
	cmp	r2, 8
	bls.d	@.Lsmallchunk
	mov.f	lp_count, r2		; delay slot: count + flags for the
					; byte loop at .Lsmallchunk

;;; Byte-copy until the destination is 32bit aligned.
;;; Z is set (loop skipped) when it is aligned already.
	and.f	r4, r0, 0x03
	rsub	lp_count, r4, 4		; lp_count = 4 - (dst & 3)
	lpnz	@.Laligndestination
	;; LOOP BEGIN
	ldb.ab	r5, [r1,1]
	sub	r2, r2, 1
	stb.ab	r5, [r3,1]
.Laligndestination:

;;; Check the alignment of the source
	and.f	r4, r1, 0x03
	bnz.d	@.Lsourceunaligned

;;; CASE 0: Both source and destination are 32bit aligned
;;; Convert len to Dwords, unfold x4
	;; This lsr.f also sits in the delay slot of the bnz.d above; the
	;; unaligned paths reload lp_count before they use it.
	lsr.f	lp_count, r2, ZOLSHFT
	lpnz	@.Lcopy32_64bytes
	;; LOOP START
	LOADX (r6, r1)
	LOADX (r8, r1)
	LOADX (r10, r1)
	LOADX (r4, r1)
	STOREX (r6, r3)
	STOREX (r8, r3)
	STOREX (r10, r3)
	STOREX (r4, r3)
.Lcopy32_64bytes:

	and.f	lp_count, r2, ZOLAND	; remaining (len & ZOLAND) bytes
.Lsmallchunk:
	lpnz	@.Lcopyremainingbytes
	;; LOOP START
	ldb.ab	r5, [r1,1]
	stb.ab	r5, [r3,1]
.Lcopyremainingbytes:

	j	[blink]
;;; END CASE 0

.Lsourceunaligned:
	;; r4 = source & 3 (1, 2 or 3). The sub and the ldb below are in
	;; delay slots, so they also execute when the branch is taken.
	cmp	r4, 2
	beq.d	@.LunalignedOffby2
	sub	r2, r2, 1		; does not touch the flags from cmp

	bhi.d	@.LunalignedOffby3
	ldb.ab	r5, [r1, 1]		; first pending byte (cases 1 and 3)

	;;; CASE 1: The source is unaligned, off by 1
	;; Hence I need to read 1 byte for a 16bit alignment
	;; and 2bytes to reach 32bit alignment
	ldh.ab	r6, [r1, 2]
	sub	r2, r2, 2		; 3 bytes are now pending in r5/r6
	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
	MERGE_1 (r6, r6, 8)
	MERGE_2 (r5, r5, 24)
	or	r5, r5, r6		; r5 = the 3 pending bytes

	;; Both src and dst are aligned
	lpnz	@.Lcopy8bytes_1
	;; LOOP START
	ld.ab	r6, [r1, 4]
	ld.ab	r8, [r1,4]

	SHIFT_1	(r7, r6, 24)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 8)

	SHIFT_1	(r9, r8, 24)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 8)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_1:

	;; Write back the remaining 16bits
	EXTRACT_1 (r6, r5, 16)
	sth.ab	r6, [r3, 2]
	;; Write back the remaining 8bits
	EXTRACT_2 (r5, r5, 16)
	stb.ab	r5, [r3, 1]

	and.f	lp_count, r2, 0x07	; Last (len & 7) bytes
	lpnz	@.Lcopybytewise_1
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_1:
	j	[blink]

.LunalignedOffby2:
	;;; CASE 2: The source is unaligned, off by 2
	ldh.ab	r5, [r1, 2]
	sub	r2, r2, 1		; 2 bytes are now pending in r5

	;; Both src and dst are aligned
	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
	;; Only reposition the pending halfword if the loop will run
	;; (flags from the lsr.f above).
	asl.nz	r5, r5, 16
#endif
	lpnz	@.Lcopy8bytes_2
	;; LOOP START
	ld.ab	r6, [r1, 4]
	ld.ab	r8, [r1,4]

	SHIFT_1	(r7, r6, 16)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 16)

	SHIFT_1	(r9, r8, 16)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 16)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_2:

#ifdef __BIG_ENDIAN__
	;; Flags are still those of the lsr.f: nothing in the loop sets them.
	lsr.nz	r5, r5, 16
#endif
	sth.ab	r5, [r3, 2]		; store the 2 pending bytes

	and.f	lp_count, r2, 0x07	; Last (len & 7) bytes
	lpnz	@.Lcopybytewise_2
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_2:
	j	[blink]

.LunalignedOffby3:
	;;; CASE 3: The source is unaligned, off by 3
	;;; The single byte needed to reach 32bit alignment was already
	;;; loaded into r5 (and counted in r2) before branching here.

	;; Both src and dst are aligned
	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
	;; Only reposition the pending byte if the loop will run.
	asl.ne	r5, r5, 24
#endif
	lpnz	@.Lcopy8bytes_3
	;; LOOP START
	ld.ab	r6, [r1, 4]
	ld.ab	r8, [r1,4]

	SHIFT_1	(r7, r6, 8)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 24)

	SHIFT_1	(r9, r8, 8)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 24)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_3:

#ifdef __BIG_ENDIAN__
	;; Flags are still those of the lsr.f: nothing in the loop sets them.
	lsr.nz	r5, r5, 24
#endif
	stb.ab	r5, [r3, 1]		; store the pending byte

	and.f	lp_count, r2, 0x07	; Last (len & 7) bytes
	lpnz	@.Lcopybytewise_3
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_3:
	j	[blink]

END(memcpy)