File: | home/bhubbard/working/src/ceph/build/src/dpdk/include/rte_ether.h |
Warning: | line 821, column 21 Assigned value is garbage or undefined |
[?] Use j/k keys for keyboard navigation
1 | /* SPDX-License-Identifier: BSD-3-Clause | |||
2 | * Copyright(c) 2010-2014 Intel Corporation | |||
3 | */ | |||
4 | ||||
5 | #ifndef _RTE_ETHER_H_ | |||
6 | #define _RTE_ETHER_H_ | |||
7 | ||||
8 | /** | |||
9 | * @file | |||
10 | * | |||
11 | * Ethernet Helpers in RTE | |||
12 | */ | |||
13 | ||||
14 | #ifdef __cplusplus | |||
15 | extern "C" { | |||
16 | #endif | |||
17 | ||||
18 | #include <stdint.h> | |||
19 | #include <stdio.h> | |||
20 | ||||
21 | #include <rte_memcpy.h> | |||
22 | #include <rte_random.h> | |||
23 | #include <rte_mbuf.h> | |||
24 | #include <rte_byteorder.h> | |||
25 | ||||
26 | #define ETHER_ADDR_LEN6 6 /**< Length of Ethernet address. */ | |||
27 | #define ETHER_TYPE_LEN2 2 /**< Length of Ethernet type field. */ | |||
28 | #define ETHER_CRC_LEN4 4 /**< Length of Ethernet CRC. */ | |||
29 | #define ETHER_HDR_LEN(6 * 2 + 2) \ | |||
30 | (ETHER_ADDR_LEN6 * 2 + ETHER_TYPE_LEN2) /**< Length of Ethernet header. */ | |||
31 | #define ETHER_MIN_LEN64 64 /**< Minimum frame len, including CRC. */ | |||
32 | #define ETHER_MAX_LEN1518 1518 /**< Maximum frame len, including CRC. */ | |||
33 | #define ETHER_MTU(1518 - (6 * 2 + 2) - 4) \ | |||
34 | (ETHER_MAX_LEN1518 - ETHER_HDR_LEN(6 * 2 + 2) - ETHER_CRC_LEN4) /**< Ethernet MTU. */ | |||
35 | ||||
36 | #define ETHER_MAX_VLAN_FRAME_LEN(1518 + 4) \ | |||
37 | (ETHER_MAX_LEN1518 + 4) /**< Maximum VLAN frame length, including CRC. */ | |||
38 | ||||
39 | #define ETHER_MAX_JUMBO_FRAME_LEN0x3F00 \ | |||
40 | 0x3F00 /**< Maximum Jumbo frame length, including CRC. */ | |||
41 | ||||
42 | #define ETHER_MAX_VLAN_ID4095 4095 /**< Maximum VLAN ID. */ | |||
43 | ||||
44 | #define ETHER_MIN_MTU68 68 /**< Minimum MTU for IPv4 packets, see RFC 791. */ | |||
45 | ||||
46 | /** | |||
47 | * Ethernet address: | |||
48 | * A universally administered address is uniquely assigned to a device by its | |||
49 | * manufacturer. The first three octets (in transmission order) contain the | |||
50 | * Organizationally Unique Identifier (OUI). The following three (MAC-48 and | |||
51 | * EUI-48) octets are assigned by that organization with the only constraint | |||
52 | * of uniqueness. | |||
53 | * A locally administered address is assigned to a device by a network | |||
54 | * administrator and does not contain OUIs. | |||
55 | * See http://standards.ieee.org/regauth/groupmac/tutorial.html | |||
56 | */ | |||
57 | struct ether_addr { | |||
58 | uint8_t addr_bytes[ETHER_ADDR_LEN6]; /**< Addr bytes in tx order */ | |||
59 | } __attribute__((__packed__)); | |||
60 | ||||
61 | #define ETHER_LOCAL_ADMIN_ADDR0x02 0x02 /**< Locally assigned Eth. address. */ | |||
62 | #define ETHER_GROUP_ADDR0x01 0x01 /**< Multicast or broadcast Eth. address. */ | |||
63 | ||||
64 | /** | |||
65 | * Check if two Ethernet addresses are the same. | |||
66 | * | |||
67 | * @param ea1 | |||
68 | * A pointer to the first ether_addr structure containing | |||
69 | * the ethernet address. | |||
70 | * @param ea2 | |||
71 | * A pointer to the second ether_addr structure containing | |||
72 | * the ethernet address. | |||
73 | * | |||
74 | * @return | |||
75 | * True (1) if the given two ethernet address are the same; | |||
76 | * False (0) otherwise. | |||
77 | */ | |||
78 | static inline int is_same_ether_addr(const struct ether_addr *ea1, | |||
79 | const struct ether_addr *ea2) | |||
80 | { | |||
81 | int i; | |||
82 | for (i = 0; i < ETHER_ADDR_LEN6; i++) | |||
83 | if (ea1->addr_bytes[i] != ea2->addr_bytes[i]) | |||
84 | return 0; | |||
85 | return 1; | |||
86 | } | |||
87 | ||||
88 | /** | |||
89 | * Check if an Ethernet address is filled with zeros. | |||
90 | * | |||
91 | * @param ea | |||
92 | * A pointer to a ether_addr structure containing the ethernet address | |||
93 | * to check. | |||
94 | * @return | |||
95 | * True (1) if the given ethernet address is filled with zeros; | |||
96 | * false (0) otherwise. | |||
97 | */ | |||
98 | static inline int is_zero_ether_addr(const struct ether_addr *ea) | |||
99 | { | |||
100 | int i; | |||
101 | for (i = 0; i < ETHER_ADDR_LEN6; i++) | |||
102 | if (ea->addr_bytes[i] != 0x00) | |||
103 | return 0; | |||
104 | return 1; | |||
105 | } | |||
106 | ||||
107 | /** | |||
108 | * Check if an Ethernet address is a unicast address. | |||
109 | * | |||
110 | * @param ea | |||
111 | * A pointer to a ether_addr structure containing the ethernet address | |||
112 | * to check. | |||
113 | * @return | |||
114 | * True (1) if the given ethernet address is a unicast address; | |||
115 | * false (0) otherwise. | |||
116 | */ | |||
117 | static inline int is_unicast_ether_addr(const struct ether_addr *ea) | |||
118 | { | |||
119 | return (ea->addr_bytes[0] & ETHER_GROUP_ADDR0x01) == 0; | |||
120 | } | |||
121 | ||||
122 | /** | |||
123 | * Check if an Ethernet address is a multicast address. | |||
124 | * | |||
125 | * @param ea | |||
126 | * A pointer to a ether_addr structure containing the ethernet address | |||
127 | * to check. | |||
128 | * @return | |||
129 | * True (1) if the given ethernet address is a multicast address; | |||
130 | * false (0) otherwise. | |||
131 | */ | |||
132 | static inline int is_multicast_ether_addr(const struct ether_addr *ea) | |||
133 | { | |||
134 | return ea->addr_bytes[0] & ETHER_GROUP_ADDR0x01; | |||
135 | } | |||
136 | ||||
137 | /** | |||
138 | * Check if an Ethernet address is a broadcast address. | |||
139 | * | |||
140 | * @param ea | |||
141 | * A pointer to a ether_addr structure containing the ethernet address | |||
142 | * to check. | |||
143 | * @return | |||
144 | * True (1) if the given ethernet address is a broadcast address; | |||
145 | * false (0) otherwise. | |||
146 | */ | |||
147 | static inline int is_broadcast_ether_addr(const struct ether_addr *ea) | |||
148 | { | |||
149 | const unaligned_uint16_t *ea_words = (const unaligned_uint16_t *)ea; | |||
150 | ||||
151 | return (ea_words[0] == 0xFFFF && ea_words[1] == 0xFFFF && | |||
152 | ea_words[2] == 0xFFFF); | |||
153 | } | |||
154 | ||||
155 | /** | |||
156 | * Check if an Ethernet address is a universally assigned address. | |||
157 | * | |||
158 | * @param ea | |||
159 | * A pointer to a ether_addr structure containing the ethernet address | |||
160 | * to check. | |||
161 | * @return | |||
162 | * True (1) if the given ethernet address is a universally assigned address; | |||
163 | * false (0) otherwise. | |||
164 | */ | |||
165 | static inline int is_universal_ether_addr(const struct ether_addr *ea) | |||
166 | { | |||
167 | return (ea->addr_bytes[0] & ETHER_LOCAL_ADMIN_ADDR0x02) == 0; | |||
168 | } | |||
169 | ||||
170 | /** | |||
171 | * Check if an Ethernet address is a locally assigned address. | |||
172 | * | |||
173 | * @param ea | |||
174 | * A pointer to a ether_addr structure containing the ethernet address | |||
175 | * to check. | |||
176 | * @return | |||
177 | * True (1) if the given ethernet address is a locally assigned address; | |||
178 | * false (0) otherwise. | |||
179 | */ | |||
180 | static inline int is_local_admin_ether_addr(const struct ether_addr *ea) | |||
181 | { | |||
182 | return (ea->addr_bytes[0] & ETHER_LOCAL_ADMIN_ADDR0x02) != 0; | |||
183 | } | |||
184 | ||||
185 | /** | |||
186 | * Check if an Ethernet address is a valid address. Checks that the address is a | |||
187 | * unicast address and is not filled with zeros. | |||
188 | * | |||
189 | * @param ea | |||
190 | * A pointer to a ether_addr structure containing the ethernet address | |||
191 | * to check. | |||
192 | * @return | |||
193 | * True (1) if the given ethernet address is valid; | |||
194 | * false (0) otherwise. | |||
195 | */ | |||
196 | static inline int is_valid_assigned_ether_addr(const struct ether_addr *ea) | |||
197 | { | |||
198 | return is_unicast_ether_addr(ea) && (!is_zero_ether_addr(ea)); | |||
199 | } | |||
200 | ||||
201 | /** | |||
202 | * Generate a random Ethernet address that is locally administered | |||
203 | * and not multicast. | |||
204 | * @param addr | |||
205 | * A pointer to Ethernet address. | |||
206 | */ | |||
207 | static inline void eth_random_addr(uint8_t *addr) | |||
208 | { | |||
209 | uint64_t rand = rte_rand(); | |||
210 | uint8_t *p = (uint8_t *)&rand; | |||
211 | ||||
212 | rte_memcpy(addr, p, ETHER_ADDR_LEN6); | |||
| ||||
213 | addr[0] &= (uint8_t)~ETHER_GROUP_ADDR0x01; /* clear multicast bit */ | |||
214 | addr[0] |= ETHER_LOCAL_ADMIN_ADDR0x02; /* set local assignment bit */ | |||
215 | } | |||
216 | ||||
217 | /** | |||
218 | * Fast copy an Ethernet address. | |||
219 | * | |||
220 | * @param ea_from | |||
221 | * A pointer to a ether_addr structure holding the Ethernet address to copy. | |||
222 | * @param ea_to | |||
223 | * A pointer to a ether_addr structure where to copy the Ethernet address. | |||
224 | */ | |||
225 | static inline void ether_addr_copy(const struct ether_addr *ea_from, | |||
226 | struct ether_addr *ea_to) | |||
227 | { | |||
228 | #ifdef __INTEL_COMPILER | |||
229 | uint16_t *from_words = (uint16_t *)(ea_from->addr_bytes); | |||
230 | uint16_t *to_words = (uint16_t *)(ea_to->addr_bytes); | |||
231 | ||||
232 | to_words[0] = from_words[0]; | |||
233 | to_words[1] = from_words[1]; | |||
234 | to_words[2] = from_words[2]; | |||
235 | #else | |||
236 | /* | |||
237 | * Use the common way, because of a strange gcc warning. | |||
238 | */ | |||
239 | *ea_to = *ea_from; | |||
240 | #endif | |||
241 | } | |||
242 | ||||
243 | #define ETHER_ADDR_FMT_SIZE18 18 | |||
244 | /** | |||
245 | * Format 48bits Ethernet address in pattern xx:xx:xx:xx:xx:xx. | |||
246 | * | |||
247 | * @param buf | |||
248 | * A pointer to buffer contains the formatted MAC address. | |||
249 | * @param size | |||
250 | * The format buffer size. | |||
251 | * @param eth_addr | |||
252 | * A pointer to a ether_addr structure. | |||
253 | */ | |||
254 | static inline void | |||
255 | ether_format_addr(char *buf, uint16_t size, | |||
256 | const struct ether_addr *eth_addr) | |||
257 | { | |||
258 | snprintf(buf, size, "%02X:%02X:%02X:%02X:%02X:%02X", | |||
259 | eth_addr->addr_bytes[0], | |||
260 | eth_addr->addr_bytes[1], | |||
261 | eth_addr->addr_bytes[2], | |||
262 | eth_addr->addr_bytes[3], | |||
263 | eth_addr->addr_bytes[4], | |||
264 | eth_addr->addr_bytes[5]); | |||
265 | } | |||
266 | ||||
267 | /** | |||
268 | * Ethernet header: Contains the destination address, source address | |||
269 | * and frame type. | |||
270 | */ | |||
271 | struct ether_hdr { | |||
272 | struct ether_addr d_addr; /**< Destination address. */ | |||
273 | struct ether_addr s_addr; /**< Source address. */ | |||
274 | uint16_t ether_type; /**< Frame type. */ | |||
275 | } __attribute__((__packed__)); | |||
276 | ||||
277 | /** | |||
278 | * Ethernet VLAN Header. | |||
279 | * Contains the 16-bit VLAN Tag Control Identifier and the Ethernet type | |||
280 | * of the encapsulated frame. | |||
281 | */ | |||
282 | struct vlan_hdr { | |||
283 | uint16_t vlan_tci; /**< Priority (3) + CFI (1) + Identifier Code (12) */ | |||
284 | uint16_t eth_proto;/**< Ethernet type of encapsulated frame. */ | |||
285 | } __attribute__((__packed__)); | |||
286 | ||||
287 | /** | |||
288 | * VXLAN protocol header. | |||
289 | * Contains the 8-bit flag, 24-bit VXLAN Network Identifier and | |||
290 | * Reserved fields (24 bits and 8 bits) | |||
291 | */ | |||
292 | struct vxlan_hdr { | |||
293 | uint32_t vx_flags; /**< flag (8) + Reserved (24). */ | |||
294 | uint32_t vx_vni; /**< VNI (24) + Reserved (8). */ | |||
295 | } __attribute__((__packed__)); | |||
296 | ||||
297 | /* Ethernet frame types */ | |||
298 | #define ETHER_TYPE_IPv40x0800 0x0800 /**< IPv4 Protocol. */ | |||
299 | #define ETHER_TYPE_IPv60x86DD 0x86DD /**< IPv6 Protocol. */ | |||
300 | #define ETHER_TYPE_ARP0x0806 0x0806 /**< Arp Protocol. */ | |||
301 | #define ETHER_TYPE_RARP0x8035 0x8035 /**< Reverse Arp Protocol. */ | |||
302 | #define ETHER_TYPE_VLAN0x8100 0x8100 /**< IEEE 802.1Q VLAN tagging. */ | |||
303 | #define ETHER_TYPE_QINQ0x88A8 0x88A8 /**< IEEE 802.1ad QinQ tagging. */ | |||
304 | #define ETHER_TYPE_PPPOE_DISCOVERY0x8863 0x8863 /**< PPPoE Discovery Stage. */ | |||
305 | #define ETHER_TYPE_PPPOE_SESSION0x8864 0x8864 /**< PPPoE Session Stage. */ | |||
306 | #define ETHER_TYPE_ETAG0x893F 0x893F /**< IEEE 802.1BR E-Tag. */ | |||
307 | #define ETHER_TYPE_15880x88F7 0x88F7 /**< IEEE 802.1AS 1588 Precise Time Protocol. */ | |||
308 | #define ETHER_TYPE_SLOW0x8809 0x8809 /**< Slow protocols (LACP and Marker). */ | |||
309 | #define ETHER_TYPE_TEB0x6558 0x6558 /**< Transparent Ethernet Bridging. */ | |||
310 | #define ETHER_TYPE_LLDP0x88CC 0x88CC /**< LLDP Protocol. */ | |||
311 | #define ETHER_TYPE_MPLS0x8847 0x8847 /**< MPLS ethertype. */ | |||
312 | #define ETHER_TYPE_MPLSM0x8848 0x8848 /**< MPLS multicast ethertype. */ | |||
313 | ||||
314 | #define ETHER_VXLAN_HLEN(sizeof(struct udp_hdr) + sizeof(struct vxlan_hdr)) (sizeof(struct udp_hdr) + sizeof(struct vxlan_hdr)) | |||
315 | /**< VXLAN tunnel header length. */ | |||
316 | ||||
317 | /** | |||
318 | * VXLAN-GPE protocol header (draft-ietf-nvo3-vxlan-gpe-05). | |||
319 | * Contains the 8-bit flag, 8-bit next-protocol, 24-bit VXLAN Network | |||
320 | * Identifier and Reserved fields (16 bits and 8 bits). | |||
321 | */ | |||
322 | struct vxlan_gpe_hdr { | |||
323 | uint8_t vx_flags; /**< flag (8). */ | |||
324 | uint8_t reserved[2]; /**< Reserved (16). */ | |||
325 | uint8_t proto; /**< next-protocol (8). */ | |||
326 | uint32_t vx_vni; /**< VNI (24) + Reserved (8). */ | |||
327 | } __attribute__((__packed__)); | |||
328 | ||||
329 | /* VXLAN-GPE next protocol types */ | |||
330 | #define VXLAN_GPE_TYPE_IPV41 1 /**< IPv4 Protocol. */ | |||
331 | #define VXLAN_GPE_TYPE_IPV62 2 /**< IPv6 Protocol. */ | |||
332 | #define VXLAN_GPE_TYPE_ETH3 3 /**< Ethernet Protocol. */ | |||
333 | #define VXLAN_GPE_TYPE_NSH4 4 /**< NSH Protocol. */ | |||
334 | #define VXLAN_GPE_TYPE_MPLS5 5 /**< MPLS Protocol. */ | |||
335 | #define VXLAN_GPE_TYPE_GBP6 6 /**< GBP Protocol. */ | |||
336 | #define VXLAN_GPE_TYPE_VBNG7 7 /**< vBNG Protocol. */ | |||
337 | ||||
338 | #define ETHER_VXLAN_GPE_HLEN(sizeof(struct udp_hdr) + sizeof(struct vxlan_gpe_hdr)) (sizeof(struct udp_hdr) + \ | |||
339 | sizeof(struct vxlan_gpe_hdr)) | |||
340 | /**< VXLAN-GPE tunnel header length. */ | |||
341 | ||||
342 | /** | |||
343 | * Extract VLAN tag information into mbuf | |||
344 | * | |||
345 | * Software version of VLAN stripping | |||
346 | * | |||
347 | * @param m | |||
348 | * The packet mbuf. | |||
349 | * @return | |||
350 | * - 0: Success | |||
351 | * - 1: not a vlan packet | |||
352 | */ | |||
353 | static inline int rte_vlan_strip(struct rte_mbuf *m) | |||
354 | { | |||
355 | struct ether_hdr *eh | |||
356 | = rte_pktmbuf_mtod(m, struct ether_hdr *)((struct ether_hdr *)((char *)(m)->buf_addr + (m)->data_off + (0))); | |||
357 | struct vlan_hdr *vh; | |||
358 | ||||
359 | if (eh->ether_type != rte_cpu_to_be_16(ETHER_TYPE_VLAN)((uint16_t)(__builtin_constant_p(0x8100) ? rte_constant_bswap16 (0x8100) : rte_arch_bswap16(0x8100)))) | |||
360 | return -1; | |||
361 | ||||
362 | vh = (struct vlan_hdr *)(eh + 1); | |||
363 | m->ol_flags |= PKT_RX_VLAN(1ULL << 0) | PKT_RX_VLAN_STRIPPED(1ULL << 6); | |||
364 | m->vlan_tci = rte_be_to_cpu_16(vh->vlan_tci)((uint16_t)(__builtin_constant_p(vh->vlan_tci) ? rte_constant_bswap16 (vh->vlan_tci) : rte_arch_bswap16(vh->vlan_tci))); | |||
365 | ||||
366 | /* Copy ether header over rather than moving whole packet */ | |||
367 | memmove(rte_pktmbuf_adj(m, sizeof(struct vlan_hdr)), | |||
368 | eh, 2 * ETHER_ADDR_LEN6); | |||
369 | ||||
370 | return 0; | |||
371 | } | |||
372 | ||||
373 | /** | |||
374 | * Insert VLAN tag into mbuf. | |||
375 | * | |||
376 | * Software version of VLAN unstripping | |||
377 | * | |||
378 | * @param m | |||
379 | * The packet mbuf. | |||
380 | * @return | |||
381 | * - 0: On success | |||
382 | * -EPERM: mbuf is is shared overwriting would be unsafe | |||
383 | * -ENOSPC: not enough headroom in mbuf | |||
384 | */ | |||
385 | static inline int rte_vlan_insert(struct rte_mbuf **m) | |||
386 | { | |||
387 | struct ether_hdr *oh, *nh; | |||
388 | struct vlan_hdr *vh; | |||
389 | ||||
390 | /* Can't insert header if mbuf is shared */ | |||
391 | if (rte_mbuf_refcnt_read(*m) > 1) { | |||
392 | struct rte_mbuf *copy; | |||
393 | ||||
394 | copy = rte_pktmbuf_clone(*m, (*m)->pool); | |||
395 | if (unlikely(copy == NULL)__builtin_expect(!!(copy == ((void*)0)), 0)) | |||
396 | return -ENOMEM12; | |||
397 | rte_pktmbuf_free(*m); | |||
398 | *m = copy; | |||
399 | } | |||
400 | ||||
401 | oh = rte_pktmbuf_mtod(*m, struct ether_hdr *)((struct ether_hdr *)((char *)(*m)->buf_addr + (*m)->data_off + (0))); | |||
402 | nh = (struct ether_hdr *) | |||
403 | rte_pktmbuf_prepend(*m, sizeof(struct vlan_hdr)); | |||
404 | if (nh == NULL((void*)0)) | |||
405 | return -ENOSPC28; | |||
406 | ||||
407 | memmove(nh, oh, 2 * ETHER_ADDR_LEN6); | |||
408 | nh->ether_type = rte_cpu_to_be_16(ETHER_TYPE_VLAN)((uint16_t)(__builtin_constant_p(0x8100) ? rte_constant_bswap16 (0x8100) : rte_arch_bswap16(0x8100))); | |||
409 | ||||
410 | vh = (struct vlan_hdr *) (nh + 1); | |||
411 | vh->vlan_tci = rte_cpu_to_be_16((*m)->vlan_tci)((uint16_t)(__builtin_constant_p((*m)->vlan_tci) ? rte_constant_bswap16 ((*m)->vlan_tci) : rte_arch_bswap16((*m)->vlan_tci))); | |||
412 | ||||
413 | (*m)->ol_flags &= ~(PKT_RX_VLAN_STRIPPED(1ULL << 6) | PKT_TX_VLAN(1ULL << 57)); | |||
414 | ||||
415 | return 0; | |||
416 | } | |||
417 | ||||
418 | #ifdef __cplusplus | |||
419 | } | |||
420 | #endif | |||
421 | ||||
422 | #endif /* _RTE_ETHER_H_ */ |
1 | /* SPDX-License-Identifier: BSD-3-Clause | |||
2 | * Copyright(c) 2010-2014 Intel Corporation | |||
3 | */ | |||
4 | ||||
5 | #ifndef _RTE_MEMCPY_X86_64_H_ | |||
6 | #define _RTE_MEMCPY_X86_64_H_ | |||
7 | ||||
8 | /** | |||
9 | * @file | |||
10 | * | |||
11 | * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy(). | |||
12 | */ | |||
13 | ||||
14 | #include <stdio.h> | |||
15 | #include <stdint.h> | |||
16 | #include <string.h> | |||
17 | #include <rte_vect.h> | |||
18 | #include <rte_common.h> | |||
19 | #include <rte_config.h> | |||
20 | ||||
21 | #ifdef __cplusplus | |||
22 | extern "C" { | |||
23 | #endif | |||
24 | ||||
25 | /** | |||
26 | * Copy bytes from one location to another. The locations must not overlap. | |||
27 | * | |||
28 | * @note This is implemented as a macro, so it's address should not be taken | |||
29 | * and care is needed as parameter expressions may be evaluated multiple times. | |||
30 | * | |||
31 | * @param dst | |||
32 | * Pointer to the destination of the data. | |||
33 | * @param src | |||
34 | * Pointer to the source data. | |||
35 | * @param n | |||
36 | * Number of bytes to copy. | |||
37 | * @return | |||
38 | * Pointer to the destination data. | |||
39 | */ | |||
40 | static __rte_always_inlineinline __attribute__((always_inline)) void * | |||
41 | rte_memcpy(void *dst, const void *src, size_t n); | |||
42 | ||||
43 | #ifdef RTE_MACHINE_CPUFLAG_AVX512F | |||
44 | ||||
45 | #define ALIGNMENT_MASK0x0F 0x3F | |||
46 | ||||
47 | /** | |||
48 | * AVX512 implementation below | |||
49 | */ | |||
50 | ||||
51 | /** | |||
52 | * Copy 16 bytes from one location to another, | |||
53 | * locations should not overlap. | |||
54 | */ | |||
55 | static __rte_always_inlineinline __attribute__((always_inline)) void | |||
56 | rte_mov16(uint8_t *dst, const uint8_t *src) | |||
57 | { | |||
58 | __m128i xmm0; | |||
59 | ||||
60 | xmm0 = _mm_loadu_si128((const __m128i *)src); | |||
61 | _mm_storeu_si128((__m128i *)dst, xmm0); | |||
62 | } | |||
63 | ||||
64 | /** | |||
65 | * Copy 32 bytes from one location to another, | |||
66 | * locations should not overlap. | |||
67 | */ | |||
68 | static __rte_always_inlineinline __attribute__((always_inline)) void | |||
69 | rte_mov32(uint8_t *dst, const uint8_t *src) | |||
70 | { | |||
71 | __m256i ymm0; | |||
72 | ||||
73 | ymm0 = _mm256_loadu_si256((const __m256i *)src); | |||
74 | _mm256_storeu_si256((__m256i *)dst, ymm0); | |||
75 | } | |||
76 | ||||
77 | /** | |||
78 | * Copy 64 bytes from one location to another, | |||
79 | * locations should not overlap. | |||
80 | */ | |||
81 | static __rte_always_inlineinline __attribute__((always_inline)) void | |||
82 | rte_mov64(uint8_t *dst, const uint8_t *src) | |||
83 | { | |||
84 | __m512i zmm0; | |||
85 | ||||
86 | zmm0 = _mm512_loadu_si512((const void *)src); | |||
87 | _mm512_storeu_si512((void *)dst, zmm0); | |||
88 | } | |||
89 | ||||
90 | /** | |||
91 | * Copy 128 bytes from one location to another, | |||
92 | * locations should not overlap. | |||
93 | */ | |||
94 | static __rte_always_inlineinline __attribute__((always_inline)) void | |||
95 | rte_mov128(uint8_t *dst, const uint8_t *src) | |||
96 | { | |||
97 | rte_mov64(dst + 0 * 64, src + 0 * 64); | |||
98 | rte_mov64(dst + 1 * 64, src + 1 * 64); | |||
99 | } | |||
100 | ||||
101 | /** | |||
102 | * Copy 256 bytes from one location to another, | |||
103 | * locations should not overlap. | |||
104 | */ | |||
105 | static __rte_always_inlineinline __attribute__((always_inline)) void | |||
106 | rte_mov256(uint8_t *dst, const uint8_t *src) | |||
107 | { | |||
108 | rte_mov64(dst + 0 * 64, src + 0 * 64); | |||
109 | rte_mov64(dst + 1 * 64, src + 1 * 64); | |||
110 | rte_mov64(dst + 2 * 64, src + 2 * 64); | |||
111 | rte_mov64(dst + 3 * 64, src + 3 * 64); | |||
112 | } | |||
113 | ||||
114 | /** | |||
115 | * Copy 128-byte blocks from one location to another, | |||
116 | * locations should not overlap. | |||
117 | */ | |||
118 | static inline void | |||
119 | rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) | |||
120 | { | |||
121 | __m512i zmm0, zmm1; | |||
122 | ||||
123 | while (n >= 128) { | |||
124 | zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64)); | |||
125 | n -= 128; | |||
126 | zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64)); | |||
127 | src = src + 128; | |||
128 | _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0); | |||
129 | _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1); | |||
130 | dst = dst + 128; | |||
131 | } | |||
132 | } | |||
133 | ||||
134 | /** | |||
135 | * Copy 512-byte blocks from one location to another, | |||
136 | * locations should not overlap. | |||
137 | */ | |||
138 | static inline void | |||
139 | rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n) | |||
140 | { | |||
141 | __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7; | |||
142 | ||||
143 | while (n >= 512) { | |||
144 | zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64)); | |||
145 | n -= 512; | |||
146 | zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64)); | |||
147 | zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64)); | |||
148 | zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64)); | |||
149 | zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64)); | |||
150 | zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64)); | |||
151 | zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64)); | |||
152 | zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64)); | |||
153 | src = src + 512; | |||
154 | _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0); | |||
155 | _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1); | |||
156 | _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2); | |||
157 | _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3); | |||
158 | _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4); | |||
159 | _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5); | |||
160 | _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6); | |||
161 | _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7); | |||
162 | dst = dst + 512; | |||
163 | } | |||
164 | } | |||
165 | ||||
166 | static inline void * | |||
167 | rte_memcpy_generic(void *dst, const void *src, size_t n) | |||
168 | { | |||
169 | uintptr_t dstu = (uintptr_t)dst; | |||
170 | uintptr_t srcu = (uintptr_t)src; | |||
171 | void *ret = dst; | |||
172 | size_t dstofss; | |||
173 | size_t bits; | |||
174 | ||||
175 | /** | |||
176 | * Copy less than 16 bytes | |||
177 | */ | |||
178 | if (n < 16) { | |||
179 | if (n & 0x01) { | |||
180 | *(uint8_t *)dstu = *(const uint8_t *)srcu; | |||
181 | srcu = (uintptr_t)((const uint8_t *)srcu + 1); | |||
182 | dstu = (uintptr_t)((uint8_t *)dstu + 1); | |||
183 | } | |||
184 | if (n & 0x02) { | |||
185 | *(uint16_t *)dstu = *(const uint16_t *)srcu; | |||
186 | srcu = (uintptr_t)((const uint16_t *)srcu + 1); | |||
187 | dstu = (uintptr_t)((uint16_t *)dstu + 1); | |||
188 | } | |||
189 | if (n & 0x04) { | |||
190 | *(uint32_t *)dstu = *(const uint32_t *)srcu; | |||
191 | srcu = (uintptr_t)((const uint32_t *)srcu + 1); | |||
192 | dstu = (uintptr_t)((uint32_t *)dstu + 1); | |||
193 | } | |||
194 | if (n & 0x08) | |||
195 | *(uint64_t *)dstu = *(const uint64_t *)srcu; | |||
196 | return ret; | |||
197 | } | |||
198 | ||||
199 | /** | |||
200 | * Fast way when copy size doesn't exceed 512 bytes | |||
201 | */ | |||
202 | if (n <= 32) { | |||
203 | rte_mov16((uint8_t *)dst, (const uint8_t *)src); | |||
204 | rte_mov16((uint8_t *)dst - 16 + n, | |||
205 | (const uint8_t *)src - 16 + n); | |||
206 | return ret; | |||
207 | } | |||
208 | if (n <= 64) { | |||
209 | rte_mov32((uint8_t *)dst, (const uint8_t *)src); | |||
210 | rte_mov32((uint8_t *)dst - 32 + n, | |||
211 | (const uint8_t *)src - 32 + n); | |||
212 | return ret; | |||
213 | } | |||
214 | if (n <= 512) { | |||
215 | if (n >= 256) { | |||
216 | n -= 256; | |||
217 | rte_mov256((uint8_t *)dst, (const uint8_t *)src); | |||
218 | src = (const uint8_t *)src + 256; | |||
219 | dst = (uint8_t *)dst + 256; | |||
220 | } | |||
221 | if (n >= 128) { | |||
222 | n -= 128; | |||
223 | rte_mov128((uint8_t *)dst, (const uint8_t *)src); | |||
224 | src = (const uint8_t *)src + 128; | |||
225 | dst = (uint8_t *)dst + 128; | |||
226 | } | |||
227 | COPY_BLOCK_128_BACK63: | |||
228 | if (n > 64) { | |||
229 | rte_mov64((uint8_t *)dst, (const uint8_t *)src); | |||
230 | rte_mov64((uint8_t *)dst - 64 + n, | |||
231 | (const uint8_t *)src - 64 + n); | |||
232 | return ret; | |||
233 | } | |||
234 | if (n > 0) | |||
235 | rte_mov64((uint8_t *)dst - 64 + n, | |||
236 | (const uint8_t *)src - 64 + n); | |||
237 | return ret; | |||
238 | } | |||
239 | ||||
240 | /** | |||
241 | * Make store aligned when copy size exceeds 512 bytes | |||
242 | */ | |||
243 | dstofss = ((uintptr_t)dst & 0x3F); | |||
244 | if (dstofss > 0) { | |||
245 | dstofss = 64 - dstofss; | |||
246 | n -= dstofss; | |||
247 | rte_mov64((uint8_t *)dst, (const uint8_t *)src); | |||
248 | src = (const uint8_t *)src + dstofss; | |||
249 | dst = (uint8_t *)dst + dstofss; | |||
250 | } | |||
251 | ||||
252 | /** | |||
253 | * Copy 512-byte blocks. | |||
254 | * Use copy block function for better instruction order control, | |||
255 | * which is important when load is unaligned. | |||
256 | */ | |||
257 | rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n); | |||
258 | bits = n; | |||
259 | n = n & 511; | |||
260 | bits -= n; | |||
261 | src = (const uint8_t *)src + bits; | |||
262 | dst = (uint8_t *)dst + bits; | |||
263 | ||||
264 | /** | |||
265 | * Copy 128-byte blocks. | |||
266 | * Use copy block function for better instruction order control, | |||
267 | * which is important when load is unaligned. | |||
268 | */ | |||
269 | if (n >= 128) { | |||
270 | rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n); | |||
271 | bits = n; | |||
272 | n = n & 127; | |||
273 | bits -= n; | |||
274 | src = (const uint8_t *)src + bits; | |||
275 | dst = (uint8_t *)dst + bits; | |||
276 | } | |||
277 | ||||
278 | /** | |||
279 | * Copy whatever left | |||
280 | */ | |||
281 | goto COPY_BLOCK_128_BACK63; | |||
282 | } | |||
283 | ||||
284 | #elif defined RTE_MACHINE_CPUFLAG_AVX2 | |||
285 | ||||
286 | #define ALIGNMENT_MASK0x0F 0x1F | |||
287 | ||||
288 | /** | |||
289 | * AVX2 implementation below | |||
290 | */ | |||
291 | ||||
292 | /** | |||
293 | * Copy 16 bytes from one location to another, | |||
294 | * locations should not overlap. | |||
295 | */ | |||
296 | static __rte_always_inlineinline __attribute__((always_inline)) void | |||
297 | rte_mov16(uint8_t *dst, const uint8_t *src) | |||
298 | { | |||
299 | __m128i xmm0; | |||
300 | ||||
301 | xmm0 = _mm_loadu_si128((const __m128i *)src); | |||
302 | _mm_storeu_si128((__m128i *)dst, xmm0); | |||
303 | } | |||
304 | ||||
305 | /** | |||
306 | * Copy 32 bytes from one location to another, | |||
307 | * locations should not overlap. | |||
308 | */ | |||
309 | static __rte_always_inlineinline __attribute__((always_inline)) void | |||
310 | rte_mov32(uint8_t *dst, const uint8_t *src) | |||
311 | { | |||
312 | __m256i ymm0; | |||
313 | ||||
314 | ymm0 = _mm256_loadu_si256((const __m256i *)src); | |||
315 | _mm256_storeu_si256((__m256i *)dst, ymm0); | |||
316 | } | |||
317 | ||||
318 | /** | |||
319 | * Copy 64 bytes from one location to another, | |||
320 | * locations should not overlap. | |||
321 | */ | |||
322 | static __rte_always_inlineinline __attribute__((always_inline)) void | |||
323 | rte_mov64(uint8_t *dst, const uint8_t *src) | |||
324 | { | |||
325 | rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); | |||
326 | rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); | |||
327 | } | |||
328 | ||||
329 | /** | |||
330 | * Copy 128 bytes from one location to another, | |||
331 | * locations should not overlap. | |||
332 | */ | |||
333 | static inline void | |||
334 | rte_mov128(uint8_t *dst, const uint8_t *src) | |||
335 | { | |||
336 | rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); | |||
337 | rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); | |||
338 | rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); | |||
339 | rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); | |||
340 | } | |||
341 | ||||
342 | /** | |||
343 | * Copy 128-byte blocks from one location to another, | |||
344 | * locations should not overlap. | |||
345 | */ | |||
346 | static inline void | |||
347 | rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) | |||
348 | { | |||
349 | __m256i ymm0, ymm1, ymm2, ymm3; | |||
350 | ||||
351 | while (n >= 128) { | |||
352 | ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32)); | |||
353 | n -= 128; | |||
354 | ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32)); | |||
355 | ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32)); | |||
356 | ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32)); | |||
357 | src = (const uint8_t *)src + 128; | |||
358 | _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0); | |||
359 | _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1); | |||
360 | _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2); | |||
361 | _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3); | |||
362 | dst = (uint8_t *)dst + 128; | |||
363 | } | |||
364 | } | |||
365 | ||||
366 | static inline void * | |||
367 | rte_memcpy_generic(void *dst, const void *src, size_t n) | |||
368 | { | |||
369 | uintptr_t dstu = (uintptr_t)dst; | |||
370 | uintptr_t srcu = (uintptr_t)src; | |||
371 | void *ret = dst; | |||
372 | size_t dstofss; | |||
373 | size_t bits; | |||
374 | ||||
375 | /** | |||
376 | * Copy less than 16 bytes | |||
377 | */ | |||
378 | if (n < 16) { | |||
379 | if (n & 0x01) { | |||
380 | *(uint8_t *)dstu = *(const uint8_t *)srcu; | |||
381 | srcu = (uintptr_t)((const uint8_t *)srcu + 1); | |||
382 | dstu = (uintptr_t)((uint8_t *)dstu + 1); | |||
383 | } | |||
384 | if (n & 0x02) { | |||
385 | *(uint16_t *)dstu = *(const uint16_t *)srcu; | |||
386 | srcu = (uintptr_t)((const uint16_t *)srcu + 1); | |||
387 | dstu = (uintptr_t)((uint16_t *)dstu + 1); | |||
388 | } | |||
389 | if (n & 0x04) { | |||
390 | *(uint32_t *)dstu = *(const uint32_t *)srcu; | |||
391 | srcu = (uintptr_t)((const uint32_t *)srcu + 1); | |||
392 | dstu = (uintptr_t)((uint32_t *)dstu + 1); | |||
393 | } | |||
394 | if (n & 0x08) { | |||
395 | *(uint64_t *)dstu = *(const uint64_t *)srcu; | |||
396 | } | |||
397 | return ret; | |||
398 | } | |||
399 | ||||
400 | /** | |||
401 | * Fast way when copy size doesn't exceed 256 bytes | |||
402 | */ | |||
403 | if (n <= 32) { | |||
404 | rte_mov16((uint8_t *)dst, (const uint8_t *)src); | |||
405 | rte_mov16((uint8_t *)dst - 16 + n, | |||
406 | (const uint8_t *)src - 16 + n); | |||
407 | return ret; | |||
408 | } | |||
409 | if (n <= 48) { | |||
410 | rte_mov16((uint8_t *)dst, (const uint8_t *)src); | |||
411 | rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); | |||
412 | rte_mov16((uint8_t *)dst - 16 + n, | |||
413 | (const uint8_t *)src - 16 + n); | |||
414 | return ret; | |||
415 | } | |||
416 | if (n <= 64) { | |||
417 | rte_mov32((uint8_t *)dst, (const uint8_t *)src); | |||
418 | rte_mov32((uint8_t *)dst - 32 + n, | |||
419 | (const uint8_t *)src - 32 + n); | |||
420 | return ret; | |||
421 | } | |||
422 | if (n <= 256) { | |||
423 | if (n >= 128) { | |||
424 | n -= 128; | |||
425 | rte_mov128((uint8_t *)dst, (const uint8_t *)src); | |||
426 | src = (const uint8_t *)src + 128; | |||
427 | dst = (uint8_t *)dst + 128; | |||
428 | } | |||
429 | COPY_BLOCK_128_BACK31: | |||
430 | if (n >= 64) { | |||
431 | n -= 64; | |||
432 | rte_mov64((uint8_t *)dst, (const uint8_t *)src); | |||
433 | src = (const uint8_t *)src + 64; | |||
434 | dst = (uint8_t *)dst + 64; | |||
435 | } | |||
436 | if (n > 32) { | |||
437 | rte_mov32((uint8_t *)dst, (const uint8_t *)src); | |||
438 | rte_mov32((uint8_t *)dst - 32 + n, | |||
439 | (const uint8_t *)src - 32 + n); | |||
440 | return ret; | |||
441 | } | |||
442 | if (n > 0) { | |||
443 | rte_mov32((uint8_t *)dst - 32 + n, | |||
444 | (const uint8_t *)src - 32 + n); | |||
445 | } | |||
446 | return ret; | |||
447 | } | |||
448 | ||||
449 | /** | |||
450 | * Make store aligned when copy size exceeds 256 bytes | |||
451 | */ | |||
452 | dstofss = (uintptr_t)dst & 0x1F; | |||
453 | if (dstofss > 0) { | |||
454 | dstofss = 32 - dstofss; | |||
455 | n -= dstofss; | |||
456 | rte_mov32((uint8_t *)dst, (const uint8_t *)src); | |||
457 | src = (const uint8_t *)src + dstofss; | |||
458 | dst = (uint8_t *)dst + dstofss; | |||
459 | } | |||
460 | ||||
461 | /** | |||
462 | * Copy 128-byte blocks | |||
463 | */ | |||
464 | rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n); | |||
465 | bits = n; | |||
466 | n = n & 127; | |||
467 | bits -= n; | |||
468 | src = (const uint8_t *)src + bits; | |||
469 | dst = (uint8_t *)dst + bits; | |||
470 | ||||
471 | /** | |||
472 | * Copy whatever left | |||
473 | */ | |||
474 | goto COPY_BLOCK_128_BACK31; | |||
475 | } | |||
476 | ||||
477 | #else /* RTE_MACHINE_CPUFLAG */ | |||
478 | ||||
479 | #define ALIGNMENT_MASK0x0F 0x0F | |||
480 | ||||
481 | /** | |||
482 | * SSE & AVX implementation below | |||
483 | */ | |||
484 | ||||
485 | /** | |||
486 | * Copy 16 bytes from one location to another, | |||
487 | * locations should not overlap. | |||
488 | */ | |||
489 | static __rte_always_inlineinline __attribute__((always_inline)) void | |||
490 | rte_mov16(uint8_t *dst, const uint8_t *src) | |||
491 | { | |||
492 | __m128i xmm0; | |||
493 | ||||
494 | xmm0 = _mm_loadu_si128((const __m128i *)(const __m128i *)src); | |||
495 | _mm_storeu_si128((__m128i *)dst, xmm0); | |||
496 | } | |||
497 | ||||
498 | /** | |||
499 | * Copy 32 bytes from one location to another, | |||
500 | * locations should not overlap. | |||
501 | */ | |||
502 | static __rte_always_inlineinline __attribute__((always_inline)) void | |||
503 | rte_mov32(uint8_t *dst, const uint8_t *src) | |||
504 | { | |||
505 | rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); | |||
506 | rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); | |||
507 | } | |||
508 | ||||
509 | /** | |||
510 | * Copy 64 bytes from one location to another, | |||
511 | * locations should not overlap. | |||
512 | */ | |||
513 | static __rte_always_inlineinline __attribute__((always_inline)) void | |||
514 | rte_mov64(uint8_t *dst, const uint8_t *src) | |||
515 | { | |||
516 | rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); | |||
517 | rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); | |||
518 | rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); | |||
519 | rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); | |||
520 | } | |||
521 | ||||
522 | /** | |||
523 | * Copy 128 bytes from one location to another, | |||
524 | * locations should not overlap. | |||
525 | */ | |||
526 | static inline void | |||
527 | rte_mov128(uint8_t *dst, const uint8_t *src) | |||
528 | { | |||
529 | rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); | |||
530 | rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); | |||
531 | rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); | |||
532 | rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); | |||
533 | rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); | |||
534 | rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); | |||
535 | rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); | |||
536 | rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); | |||
537 | } | |||
538 | ||||
539 | /** | |||
540 | * Copy 256 bytes from one location to another, | |||
541 | * locations should not overlap. | |||
542 | */ | |||
543 | static inline void | |||
544 | rte_mov256(uint8_t *dst, const uint8_t *src) | |||
545 | { | |||
546 | rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); | |||
547 | rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); | |||
548 | rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); | |||
549 | rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); | |||
550 | rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); | |||
551 | rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); | |||
552 | rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); | |||
553 | rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); | |||
554 | rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16); | |||
555 | rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16); | |||
556 | rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16); | |||
557 | rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16); | |||
558 | rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16); | |||
559 | rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16); | |||
560 | rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16); | |||
561 | rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16); | |||
562 | } | |||
563 | ||||
564 | /** | |||
565 | * Macro for copying unaligned block from one location to another with constant load offset, | |||
566 | * 47 bytes leftover maximum, | |||
567 | * locations should not overlap. | |||
568 | * Requirements: | |||
569 | * - Store is aligned | |||
570 | * - Load offset is <offset>, which must be immediate value within [1, 15] | |||
571 | * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading | |||
572 | * - <dst>, <src>, <len> must be variables | |||
573 | * - __m128i <xmm0> ~ <xmm8> must be pre-defined | |||
574 | */ | |||
575 | #define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset)__extension__ ({ size_t tmp; while (len >= 128 + 16 - offset ) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - offset + 0 * 16)); len -= 128; xmm1 = _mm_loadu_si128( (const __m128i *)((const uint8_t *)src - offset + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16)); xmm5 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - offset + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128(( __v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (offset)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2 ), (__v16qi)(__m128i)(xmm1), (offset)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi )(__m128i)(xmm2), (offset)); })); _mm_storeu_si128((__m128i * )((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (offset) ); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16) , __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (offset)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi )(__m128i)(xmm5), (offset)); })); _mm_storeu_si128((__m128i * )((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (offset) ); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16) , __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (offset)); })); dst = (uint8_t *)dst + 128; } tmp = len; len = ((len - 16 + offset ) & 127) + 16 - offset; tmp -= len; src = (const uint8_t * )src + tmp; dst = (uint8_t *)dst + tmp; if (len >= 32 + 16 - offset) { while (len >= 32 + 16 - offset) { xmm0 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); len -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (offset)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (offset)); })); dst = (uint8_t *)dst + 32; } tmp = len; len = ((len - 16 + offset) & 31) + 16 - offset ; tmp -= len; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }) \ | |||
576 | __extension__ ({ \ | |||
577 | size_t tmp; \ | |||
578 | while (len >= 128 + 16 - offset) { \ | |||
579 | xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \ | |||
580 | len -= 128; \ | |||
581 | xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \ | |||
582 | xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \ | |||
583 | xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16)); \ | |||
584 | xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16)); \ | |||
585 | xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16)); \ | |||
586 | xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16)); \ | |||
587 | xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16)); \ | |||
588 | xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16)); \ | |||
589 | src = (const uint8_t *)src + 128; \ | |||
590 | _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)__extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi) (__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (offset)); })); \ | |||
591 | _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)__extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi) (__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (offset)); })); \ | |||
592 | _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)__extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi) (__m128i)(xmm3), (__v16qi)(__m128i)(xmm2), (offset)); })); \ | |||
593 | _mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)__extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi) (__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (offset)); })); \ | |||
594 | _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)__extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi) (__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (offset)); })); \ | |||
595 | _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)__extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi) (__m128i)(xmm6), (__v16qi)(__m128i)(xmm5), (offset)); })); \ | |||
596 | _mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)__extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi) (__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (offset)); })); \ | |||
597 | _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)__extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi) (__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (offset)); })); \ | |||
598 | dst = (uint8_t *)dst + 128; \ | |||
599 | } \ | |||
600 | tmp = len; \ | |||
601 | len = ((len - 16 + offset) & 127) + 16 - offset; \ | |||
602 | tmp -= len; \ | |||
603 | src = (const uint8_t *)src + tmp; \ | |||
604 | dst = (uint8_t *)dst + tmp; \ | |||
605 | if (len >= 32 + 16 - offset) { \ | |||
606 | while (len >= 32 + 16 - offset) { \ | |||
607 | xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \ | |||
608 | len -= 32; \ | |||
609 | xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \ | |||
610 | xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \ | |||
611 | src = (const uint8_t *)src + 32; \ | |||
612 | _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)__extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi) (__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (offset)); })); \ | |||
613 | _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)__extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi) (__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (offset)); })); \ | |||
614 | dst = (uint8_t *)dst + 32; \ | |||
615 | } \ | |||
616 | tmp = len; \ | |||
617 | len = ((len - 16 + offset) & 31) + 16 - offset; \ | |||
618 | tmp -= len; \ | |||
619 | src = (const uint8_t *)src + tmp; \ | |||
620 | dst = (uint8_t *)dst + tmp; \ | |||
621 | } \ | |||
622 | }) | |||
623 | ||||
624 | /** | |||
625 | * Macro for copying unaligned block from one location to another, | |||
626 | * 47 bytes leftover maximum, | |||
627 | * locations should not overlap. | |||
628 | * Use switch here because the aligning instruction requires immediate value for shift count. | |||
629 | * Requirements: | |||
630 | * - Store is aligned | |||
631 | * - Load offset is <offset>, which must be within [1, 15] | |||
632 | * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading | |||
633 | * - <dst>, <src>, <len> must be variables | |||
634 | * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined | |||
635 | */ | |||
636 | #define MOVEUNALIGNED_LEFT47(dst, src, len, offset)__extension__ ({ switch (offset) { case 0x01: __extension__ ( { size_t tmp; while (n >= 128 + 16 - 0x01) { xmm0 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x01 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 2 * 16)); xmm3 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x01 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 6 * 16)); xmm7 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x01 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x01)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x01)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x01)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x01)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x01)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x01)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x01)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x01)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x01 ) & 127) + 16 - 0x01; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x01 ) { while (n >= 32 + 16 - 0x01) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x01 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x01 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x01)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x01)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x01) & 31) + 16 - 0x01; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; case 0x02: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x02) { xmm0 = _mm_loadu_si128((const __m128i *)( (const uint8_t *)src - 0x02 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x02 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 4 * 16)); xmm5 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x02 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128(( __v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x02)); } )); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2 ), (__v16qi)(__m128i)(xmm1), (0x02)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi )(__m128i)(xmm2), (0x02)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x02)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5 ), (__v16qi)(__m128i)(xmm4), (0x02)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi )(__m128i)(xmm5), (0x02)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x02)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8 ), (__v16qi)(__m128i)(xmm7), (0x02)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x02) & 127) + 16 - 0x02 ; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t * )dst + tmp; if (n >= 32 + 16 - 0x02) { while (n >= 32 + 16 - 0x02) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x02 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x02)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x02)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x02 ) & 31) + 16 - 0x02; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x03: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x03) { xmm0 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x03 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 2 * 16)); xmm3 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x03 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 6 * 16)); xmm7 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x03 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x03)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x03)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x03)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x03)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x03)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x03)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x03)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x03)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x03 ) & 127) + 16 - 0x03; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x03 ) { while (n >= 32 + 16 - 0x03) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x03 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x03 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x03)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x03)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x03) & 31) + 16 - 0x03; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; case 0x04: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x04) { xmm0 = _mm_loadu_si128((const __m128i *)( (const uint8_t *)src - 0x04 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x04 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 4 * 16)); xmm5 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x04 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128(( __v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x04)); } )); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2 ), (__v16qi)(__m128i)(xmm1), (0x04)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi )(__m128i)(xmm2), (0x04)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x04)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5 ), (__v16qi)(__m128i)(xmm4), (0x04)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi )(__m128i)(xmm5), (0x04)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x04)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8 ), (__v16qi)(__m128i)(xmm7), (0x04)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x04) & 127) + 16 - 0x04 ; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t * )dst + tmp; if (n >= 32 + 16 - 0x04) { while (n >= 32 + 16 - 0x04) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x04 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x04)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x04)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x04 ) & 31) + 16 - 0x04; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x05: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x05) { xmm0 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x05 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 2 * 16)); xmm3 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x05 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 6 * 16)); xmm7 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x05 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x05)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x05)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x05)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x05)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x05)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x05)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x05)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x05)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x05 ) & 127) + 16 - 0x05; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x05 ) { while (n >= 32 + 16 - 0x05) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x05 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x05 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x05)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x05)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x05) & 31) + 16 - 0x05; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; case 0x06: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x06) { xmm0 = _mm_loadu_si128((const __m128i *)( (const uint8_t *)src - 0x06 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x06 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 4 * 16)); xmm5 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x06 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128(( __v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x06)); } )); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2 ), (__v16qi)(__m128i)(xmm1), (0x06)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi )(__m128i)(xmm2), (0x06)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x06)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5 ), (__v16qi)(__m128i)(xmm4), (0x06)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi )(__m128i)(xmm5), (0x06)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x06)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8 ), (__v16qi)(__m128i)(xmm7), (0x06)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x06) & 127) + 16 - 0x06 ; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t * )dst + tmp; if (n >= 32 + 16 - 0x06) { while (n >= 32 + 16 - 0x06) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x06 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x06)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x06)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x06 ) & 31) + 16 - 0x06; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x07: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x07) { xmm0 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x07 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 2 * 16)); xmm3 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x07 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 6 * 16)); xmm7 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x07 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x07)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x07)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x07)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x07)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x07)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x07)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x07)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x07)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x07 ) & 127) + 16 - 0x07; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x07 ) { while (n >= 32 + 16 - 0x07) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x07 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x07 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x07)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x07)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x07) & 31) + 16 - 0x07; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; case 0x08: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x08) { xmm0 = _mm_loadu_si128((const __m128i *)( (const uint8_t *)src - 0x08 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x08 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 4 * 16)); xmm5 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x08 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128(( __v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x08)); } )); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2 ), (__v16qi)(__m128i)(xmm1), (0x08)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi )(__m128i)(xmm2), (0x08)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x08)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5 ), (__v16qi)(__m128i)(xmm4), (0x08)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi )(__m128i)(xmm5), (0x08)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x08)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8 ), (__v16qi)(__m128i)(xmm7), (0x08)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x08) & 127) + 16 - 0x08 ; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t * )dst + tmp; if (n >= 32 + 16 - 0x08) { while (n >= 32 + 16 - 0x08) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x08 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x08)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x08)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x08 ) & 31) + 16 - 0x08; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x09: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x09) { xmm0 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x09 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 2 * 16)); xmm3 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x09 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 6 * 16)); xmm7 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x09 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x09)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x09)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x09)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x09)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x09)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x09)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x09)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x09)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x09 ) & 127) + 16 - 0x09; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x09 ) { while (n >= 32 + 16 - 0x09) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x09 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x09 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x09)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x09)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x09) & 31) + 16 - 0x09; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; case 0x0A: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0A) { xmm0 = _mm_loadu_si128((const __m128i *)( (const uint8_t *)src - 0x0A + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0A + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 4 * 16)); xmm5 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0A + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128(( __v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0A)); } )); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2 ), (__v16qi)(__m128i)(xmm1), (0x0A)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi )(__m128i)(xmm2), (0x0A)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x0A)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5 ), (__v16qi)(__m128i)(xmm4), (0x0A)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi )(__m128i)(xmm5), (0x0A)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x0A)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8 ), (__v16qi)(__m128i)(xmm7), (0x0A)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0A) & 127) + 16 - 0x0A ; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t * )dst + tmp; if (n >= 32 + 16 - 0x0A) { while (n >= 32 + 16 - 0x0A) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0A + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0A)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0A)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0A ) & 31) + 16 - 0x0A; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x0B: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0B) { xmm0 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0B + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 2 * 16)); xmm3 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0B + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 6 * 16)); xmm7 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0B + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0B)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0B)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x0B)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x0B)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0B)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x0B)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x0B)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0B)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0B ) & 127) + 16 - 0x0B; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0B ) { while (n >= 32 + 16 - 0x0B) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x0B + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x0B + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0B)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x0B)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0B) & 31) + 16 - 0x0B; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; case 0x0C: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0C) { xmm0 = _mm_loadu_si128((const __m128i *)( (const uint8_t *)src - 0x0C + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0C + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 4 * 16)); xmm5 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0C + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128(( __v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0C)); } )); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2 ), (__v16qi)(__m128i)(xmm1), (0x0C)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi )(__m128i)(xmm2), (0x0C)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x0C)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5 ), (__v16qi)(__m128i)(xmm4), (0x0C)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi )(__m128i)(xmm5), (0x0C)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x0C)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8 ), (__v16qi)(__m128i)(xmm7), (0x0C)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0C) & 127) + 16 - 0x0C ; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t * )dst + tmp; if (n >= 32 + 16 - 0x0C) { while (n >= 32 + 16 - 0x0C) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0C + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0C)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0C)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0C ) & 31) + 16 - 0x0C; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x0D: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0D) { xmm0 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0D + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 2 * 16)); xmm3 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0D + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 6 * 16)); xmm7 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0D + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0D)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0D)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x0D)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x0D)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0D)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x0D)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x0D)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0D)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0D ) & 127) + 16 - 0x0D; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0D ) { while (n >= 32 + 16 - 0x0D) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x0D + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x0D + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0D)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x0D)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0D) & 31) + 16 - 0x0D; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; case 0x0E: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0E) { xmm0 = _mm_loadu_si128((const __m128i *)( (const uint8_t *)src - 0x0E + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0E + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 4 * 16)); xmm5 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0E + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128(( __v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0E)); } )); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2 ), (__v16qi)(__m128i)(xmm1), (0x0E)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi )(__m128i)(xmm2), (0x0E)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x0E)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5 ), (__v16qi)(__m128i)(xmm4), (0x0E)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi )(__m128i)(xmm5), (0x0E)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x0E)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8 ), (__v16qi)(__m128i)(xmm7), (0x0E)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0E) & 127) + 16 - 0x0E ; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t * )dst + tmp; if (n >= 32 + 16 - 0x0E) { while (n >= 32 + 16 - 0x0E) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0E + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0E)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0E)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0E ) & 31) + 16 - 0x0E; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x0F: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0F) { xmm0 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0F + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 2 * 16)); xmm3 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0F + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 6 * 16)); xmm7 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0F + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0F)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0F)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x0F)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x0F)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0F)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x0F)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x0F)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0F)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0F ) & 127) + 16 - 0x0F; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0F ) { while (n >= 32 + 16 - 0x0F) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x0F + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x0F + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0F)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x0F)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0F) & 31) + 16 - 0x0F; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; default:; } }) \ | |||
637 | __extension__ ({ \ | |||
638 | switch (offset) { \ | |||
639 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01)__extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x01) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 1 * 16)); xmm2 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x01 + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 5 * 16)); xmm6 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x01 + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x01)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x01)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x01)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x01)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x01)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x01)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x01)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x01)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x01 ) & 127) + 16 - 0x01; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x01 ) { while (n >= 32 + 16 - 0x01) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x01 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x01 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x01)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x01)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x01) & 31) + 16 - 0x01; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; \ | |||
640 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02)__extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x02) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 1 * 16)); xmm2 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x02 + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 5 * 16)); xmm6 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x02 + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x02)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x02)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x02)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x02)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x02)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x02)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x02)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x02)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x02 ) & 127) + 16 - 0x02; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x02 ) { while (n >= 32 + 16 - 0x02) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x02 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x02 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x02)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x02)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x02) & 31) + 16 - 0x02; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; \ | |||
641 | case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03)__extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x03) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 1 * 16)); xmm2 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x03 + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 5 * 16)); xmm6 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x03 + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x03)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x03)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x03)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x03)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x03)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x03)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x03)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x03)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x03 ) & 127) + 16 - 0x03; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x03 ) { while (n >= 32 + 16 - 0x03) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x03 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x03 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x03)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x03)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x03) & 31) + 16 - 0x03; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; \ | |||
642 | case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04)__extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x04) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 1 * 16)); xmm2 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x04 + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 5 * 16)); xmm6 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x04 + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x04)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x04)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x04)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x04)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x04)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x04)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x04)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x04)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x04 ) & 127) + 16 - 0x04; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x04 ) { while (n >= 32 + 16 - 0x04) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x04 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x04 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x04)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x04)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x04) & 31) + 16 - 0x04; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; \ | |||
643 | case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05)__extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x05) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 1 * 16)); xmm2 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x05 + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 5 * 16)); xmm6 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x05 + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x05)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x05)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x05)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x05)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x05)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x05)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x05)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x05)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x05 ) & 127) + 16 - 0x05; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x05 ) { while (n >= 32 + 16 - 0x05) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x05 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x05 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x05)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x05)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x05) & 31) + 16 - 0x05; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; \ | |||
644 | case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06)__extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x06) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 1 * 16)); xmm2 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x06 + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 5 * 16)); xmm6 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x06 + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x06)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x06)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x06)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x06)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x06)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x06)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x06)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x06)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x06 ) & 127) + 16 - 0x06; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x06 ) { while (n >= 32 + 16 - 0x06) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x06 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x06 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x06)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x06)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x06) & 31) + 16 - 0x06; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; \ | |||
645 | case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07)__extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x07) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 1 * 16)); xmm2 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x07 + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 5 * 16)); xmm6 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x07 + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x07)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x07)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x07)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x07)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x07)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x07)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x07)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x07)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x07 ) & 127) + 16 - 0x07; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x07 ) { while (n >= 32 + 16 - 0x07) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x07 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x07 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x07)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x07)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x07) & 31) + 16 - 0x07; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; \ | |||
646 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08)__extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x08) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 1 * 16)); xmm2 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x08 + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 5 * 16)); xmm6 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x08 + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x08)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x08)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x08)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x08)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x08)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x08)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x08)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x08)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x08 ) & 127) + 16 - 0x08; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x08 ) { while (n >= 32 + 16 - 0x08) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x08 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x08 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x08)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x08)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x08) & 31) + 16 - 0x08; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; \ | |||
647 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09)__extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x09) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 1 * 16)); xmm2 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x09 + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 5 * 16)); xmm6 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x09 + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x09)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x09)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x09)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x09)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x09)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x09)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x09)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x09)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x09 ) & 127) + 16 - 0x09; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x09 ) { while (n >= 32 + 16 - 0x09) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x09 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x09 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x09)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x09)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x09) & 31) + 16 - 0x09; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; \ | |||
648 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A)__extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0A) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 1 * 16)); xmm2 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0A + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 5 * 16)); xmm6 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0A + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0A)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0A)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x0A)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x0A)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0A)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x0A)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x0A)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0A)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0A ) & 127) + 16 - 0x0A; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0A ) { while (n >= 32 + 16 - 0x0A) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x0A + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x0A + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0A)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x0A)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0A) & 31) + 16 - 0x0A; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; \ | |||
649 | case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B)__extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0B) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 1 * 16)); xmm2 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0B + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 5 * 16)); xmm6 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0B + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0B)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0B)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x0B)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x0B)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0B)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x0B)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x0B)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0B)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0B ) & 127) + 16 - 0x0B; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0B ) { while (n >= 32 + 16 - 0x0B) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x0B + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x0B + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0B)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x0B)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0B) & 31) + 16 - 0x0B; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; \ | |||
650 | case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C)__extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0C) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 1 * 16)); xmm2 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0C + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 5 * 16)); xmm6 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0C + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0C)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0C)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x0C)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x0C)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0C)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x0C)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x0C)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0C)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0C ) & 127) + 16 - 0x0C; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0C ) { while (n >= 32 + 16 - 0x0C) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x0C + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x0C + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0C)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x0C)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0C) & 31) + 16 - 0x0C; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; \ | |||
651 | case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D)__extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0D) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 1 * 16)); xmm2 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0D + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 5 * 16)); xmm6 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0D + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0D)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0D)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x0D)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x0D)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0D)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x0D)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x0D)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0D)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0D ) & 127) + 16 - 0x0D; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0D ) { while (n >= 32 + 16 - 0x0D) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x0D + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x0D + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0D)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x0D)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0D) & 31) + 16 - 0x0D; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; \ | |||
652 | case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E)__extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0E) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 1 * 16)); xmm2 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0E + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 5 * 16)); xmm6 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0E + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0E)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0E)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x0E)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x0E)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0E)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x0E)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x0E)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0E)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0E ) & 127) + 16 - 0x0E; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0E ) { while (n >= 32 + 16 - 0x0E) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x0E + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x0E + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0E)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x0E)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0E) & 31) + 16 - 0x0E; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; \ | |||
653 | case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F)__extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0F) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 1 * 16)); xmm2 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0F + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 5 * 16)); xmm6 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0F + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0F)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0F)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x0F)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x0F)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0F)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x0F)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x0F)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0F)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0F ) & 127) + 16 - 0x0F; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0F ) { while (n >= 32 + 16 - 0x0F) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x0F + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x0F + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0F)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x0F)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0F) & 31) + 16 - 0x0F; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; \ | |||
654 | default:; \ | |||
655 | } \ | |||
656 | }) | |||
657 | ||||
658 | static inline void * | |||
659 | rte_memcpy_generic(void *dst, const void *src, size_t n) | |||
660 | { | |||
661 | __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; | |||
662 | uintptr_t dstu = (uintptr_t)dst; | |||
663 | uintptr_t srcu = (uintptr_t)src; | |||
664 | void *ret = dst; | |||
665 | size_t dstofss; | |||
666 | size_t srcofs; | |||
667 | ||||
668 | /** | |||
669 | * Copy less than 16 bytes | |||
670 | */ | |||
671 | if (n < 16) { | |||
672 | if (n & 0x01) { | |||
673 | *(uint8_t *)dstu = *(const uint8_t *)srcu; | |||
674 | srcu = (uintptr_t)((const uint8_t *)srcu + 1); | |||
675 | dstu = (uintptr_t)((uint8_t *)dstu + 1); | |||
676 | } | |||
677 | if (n & 0x02) { | |||
678 | *(uint16_t *)dstu = *(const uint16_t *)srcu; | |||
679 | srcu = (uintptr_t)((const uint16_t *)srcu + 1); | |||
680 | dstu = (uintptr_t)((uint16_t *)dstu + 1); | |||
681 | } | |||
682 | if (n & 0x04) { | |||
683 | *(uint32_t *)dstu = *(const uint32_t *)srcu; | |||
684 | srcu = (uintptr_t)((const uint32_t *)srcu + 1); | |||
685 | dstu = (uintptr_t)((uint32_t *)dstu + 1); | |||
686 | } | |||
687 | if (n & 0x08) { | |||
688 | *(uint64_t *)dstu = *(const uint64_t *)srcu; | |||
689 | } | |||
690 | return ret; | |||
691 | } | |||
692 | ||||
693 | /** | |||
694 | * Fast way when copy size doesn't exceed 512 bytes | |||
695 | */ | |||
696 | if (n <= 32) { | |||
697 | rte_mov16((uint8_t *)dst, (const uint8_t *)src); | |||
698 | rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); | |||
699 | return ret; | |||
700 | } | |||
701 | if (n <= 48) { | |||
702 | rte_mov32((uint8_t *)dst, (const uint8_t *)src); | |||
703 | rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); | |||
704 | return ret; | |||
705 | } | |||
706 | if (n <= 64) { | |||
707 | rte_mov32((uint8_t *)dst, (const uint8_t *)src); | |||
708 | rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); | |||
709 | rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); | |||
710 | return ret; | |||
711 | } | |||
712 | if (n <= 128) { | |||
713 | goto COPY_BLOCK_128_BACK15; | |||
714 | } | |||
715 | if (n <= 512) { | |||
716 | if (n >= 256) { | |||
717 | n -= 256; | |||
718 | rte_mov128((uint8_t *)dst, (const uint8_t *)src); | |||
719 | rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128); | |||
720 | src = (const uint8_t *)src + 256; | |||
721 | dst = (uint8_t *)dst + 256; | |||
722 | } | |||
723 | COPY_BLOCK_255_BACK15: | |||
724 | if (n >= 128) { | |||
725 | n -= 128; | |||
726 | rte_mov128((uint8_t *)dst, (const uint8_t *)src); | |||
727 | src = (const uint8_t *)src + 128; | |||
728 | dst = (uint8_t *)dst + 128; | |||
729 | } | |||
730 | COPY_BLOCK_128_BACK15: | |||
731 | if (n >= 64) { | |||
732 | n -= 64; | |||
733 | rte_mov64((uint8_t *)dst, (const uint8_t *)src); | |||
734 | src = (const uint8_t *)src + 64; | |||
735 | dst = (uint8_t *)dst + 64; | |||
736 | } | |||
737 | COPY_BLOCK_64_BACK15: | |||
738 | if (n >= 32) { | |||
739 | n -= 32; | |||
740 | rte_mov32((uint8_t *)dst, (const uint8_t *)src); | |||
741 | src = (const uint8_t *)src + 32; | |||
742 | dst = (uint8_t *)dst + 32; | |||
743 | } | |||
744 | if (n > 16) { | |||
745 | rte_mov16((uint8_t *)dst, (const uint8_t *)src); | |||
746 | rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); | |||
747 | return ret; | |||
748 | } | |||
749 | if (n > 0) { | |||
750 | rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); | |||
751 | } | |||
752 | return ret; | |||
753 | } | |||
754 | ||||
755 | /** | |||
756 | * Make store aligned when copy size exceeds 512 bytes, | |||
757 | * and make sure the first 15 bytes are copied, because | |||
758 | * unaligned copy functions require up to 15 bytes | |||
759 | * backwards access. | |||
760 | */ | |||
761 | dstofss = (uintptr_t)dst & 0x0F; | |||
762 | if (dstofss > 0) { | |||
763 | dstofss = 16 - dstofss + 16; | |||
764 | n -= dstofss; | |||
765 | rte_mov32((uint8_t *)dst, (const uint8_t *)src); | |||
766 | src = (const uint8_t *)src + dstofss; | |||
767 | dst = (uint8_t *)dst + dstofss; | |||
768 | } | |||
769 | srcofs = ((uintptr_t)src & 0x0F); | |||
770 | ||||
771 | /** | |||
772 | * For aligned copy | |||
773 | */ | |||
774 | if (srcofs == 0) { | |||
775 | /** | |||
776 | * Copy 256-byte blocks | |||
777 | */ | |||
778 | for (; n >= 256; n -= 256) { | |||
779 | rte_mov256((uint8_t *)dst, (const uint8_t *)src); | |||
780 | dst = (uint8_t *)dst + 256; | |||
781 | src = (const uint8_t *)src + 256; | |||
782 | } | |||
783 | ||||
784 | /** | |||
785 | * Copy whatever left | |||
786 | */ | |||
787 | goto COPY_BLOCK_255_BACK15; | |||
788 | } | |||
789 | ||||
790 | /** | |||
791 | * For copy with unaligned load | |||
792 | */ | |||
793 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs)__extension__ ({ switch (srcofs) { case 0x01: __extension__ ( { size_t tmp; while (n >= 128 + 16 - 0x01) { xmm0 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x01 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 2 * 16)); xmm3 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x01 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 6 * 16)); xmm7 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x01 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x01)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x01)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x01)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x01)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x01)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x01)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x01)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x01)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x01 ) & 127) + 16 - 0x01; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x01 ) { while (n >= 32 + 16 - 0x01) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x01 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x01 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x01 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x01)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x01)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x01) & 31) + 16 - 0x01; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; case 0x02: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x02) { xmm0 = _mm_loadu_si128((const __m128i *)( (const uint8_t *)src - 0x02 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x02 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 4 * 16)); xmm5 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x02 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128(( __v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x02)); } )); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2 ), (__v16qi)(__m128i)(xmm1), (0x02)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi )(__m128i)(xmm2), (0x02)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x02)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5 ), (__v16qi)(__m128i)(xmm4), (0x02)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi )(__m128i)(xmm5), (0x02)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x02)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8 ), (__v16qi)(__m128i)(xmm7), (0x02)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x02) & 127) + 16 - 0x02 ; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t * )dst + tmp; if (n >= 32 + 16 - 0x02) { while (n >= 32 + 16 - 0x02) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x02 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x02 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x02)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x02)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x02 ) & 31) + 16 - 0x02; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x03: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x03) { xmm0 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x03 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 2 * 16)); xmm3 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x03 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 6 * 16)); xmm7 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x03 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x03)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x03)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x03)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x03)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x03)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x03)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x03)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x03)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x03 ) & 127) + 16 - 0x03; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x03 ) { while (n >= 32 + 16 - 0x03) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x03 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x03 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x03 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x03)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x03)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x03) & 31) + 16 - 0x03; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; case 0x04: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x04) { xmm0 = _mm_loadu_si128((const __m128i *)( (const uint8_t *)src - 0x04 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x04 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 4 * 16)); xmm5 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x04 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128(( __v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x04)); } )); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2 ), (__v16qi)(__m128i)(xmm1), (0x04)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi )(__m128i)(xmm2), (0x04)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x04)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5 ), (__v16qi)(__m128i)(xmm4), (0x04)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi )(__m128i)(xmm5), (0x04)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x04)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8 ), (__v16qi)(__m128i)(xmm7), (0x04)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x04) & 127) + 16 - 0x04 ; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t * )dst + tmp; if (n >= 32 + 16 - 0x04) { while (n >= 32 + 16 - 0x04) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x04 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x04 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x04)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x04)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x04 ) & 31) + 16 - 0x04; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x05: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x05) { xmm0 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x05 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 2 * 16)); xmm3 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x05 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 6 * 16)); xmm7 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x05 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x05)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x05)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x05)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x05)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x05)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x05)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x05)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x05)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x05 ) & 127) + 16 - 0x05; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x05 ) { while (n >= 32 + 16 - 0x05) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x05 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x05 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x05 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x05)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x05)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x05) & 31) + 16 - 0x05; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; case 0x06: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x06) { xmm0 = _mm_loadu_si128((const __m128i *)( (const uint8_t *)src - 0x06 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x06 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 4 * 16)); xmm5 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x06 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128(( __v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x06)); } )); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2 ), (__v16qi)(__m128i)(xmm1), (0x06)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi )(__m128i)(xmm2), (0x06)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x06)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5 ), (__v16qi)(__m128i)(xmm4), (0x06)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi )(__m128i)(xmm5), (0x06)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x06)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8 ), (__v16qi)(__m128i)(xmm7), (0x06)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x06) & 127) + 16 - 0x06 ; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t * )dst + tmp; if (n >= 32 + 16 - 0x06) { while (n >= 32 + 16 - 0x06) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x06 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x06 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x06)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x06)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x06 ) & 31) + 16 - 0x06; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x07: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x07) { xmm0 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x07 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 2 * 16)); xmm3 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x07 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 6 * 16)); xmm7 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x07 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x07)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x07)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x07)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x07)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x07)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x07)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x07)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x07)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x07 ) & 127) + 16 - 0x07; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x07 ) { while (n >= 32 + 16 - 0x07) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x07 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x07 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x07 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x07)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x07)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x07) & 31) + 16 - 0x07; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; case 0x08: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x08) { xmm0 = _mm_loadu_si128((const __m128i *)( (const uint8_t *)src - 0x08 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x08 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 4 * 16)); xmm5 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x08 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128(( __v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x08)); } )); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2 ), (__v16qi)(__m128i)(xmm1), (0x08)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi )(__m128i)(xmm2), (0x08)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x08)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5 ), (__v16qi)(__m128i)(xmm4), (0x08)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi )(__m128i)(xmm5), (0x08)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x08)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8 ), (__v16qi)(__m128i)(xmm7), (0x08)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x08) & 127) + 16 - 0x08 ; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t * )dst + tmp; if (n >= 32 + 16 - 0x08) { while (n >= 32 + 16 - 0x08) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x08 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x08)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x08)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x08 ) & 31) + 16 - 0x08; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x09: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x09) { xmm0 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x09 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 2 * 16)); xmm3 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x09 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 6 * 16)); xmm7 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x09 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x09)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x09)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x09)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x09)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x09)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x09)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x09)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x09)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x09 ) & 127) + 16 - 0x09; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x09 ) { while (n >= 32 + 16 - 0x09) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x09 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x09 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09 + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x09)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x09)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x09) & 31) + 16 - 0x09; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; case 0x0A: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0A) { xmm0 = _mm_loadu_si128((const __m128i *)( (const uint8_t *)src - 0x0A + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0A + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 4 * 16)); xmm5 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0A + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128(( __v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0A)); } )); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2 ), (__v16qi)(__m128i)(xmm1), (0x0A)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi )(__m128i)(xmm2), (0x0A)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x0A)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5 ), (__v16qi)(__m128i)(xmm4), (0x0A)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi )(__m128i)(xmm5), (0x0A)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x0A)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8 ), (__v16qi)(__m128i)(xmm7), (0x0A)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0A) & 127) + 16 - 0x0A ; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t * )dst + tmp; if (n >= 32 + 16 - 0x0A) { while (n >= 32 + 16 - 0x0A) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0A + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0A)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0A)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0A ) & 31) + 16 - 0x0A; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x0B: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0B) { xmm0 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0B + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 2 * 16)); xmm3 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0B + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 6 * 16)); xmm7 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0B + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0B)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0B)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x0B)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x0B)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0B)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x0B)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x0B)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0B)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0B ) & 127) + 16 - 0x0B; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0B ) { while (n >= 32 + 16 - 0x0B) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x0B + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x0B + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0B)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x0B)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0B) & 31) + 16 - 0x0B; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; case 0x0C: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0C) { xmm0 = _mm_loadu_si128((const __m128i *)( (const uint8_t *)src - 0x0C + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0C + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 4 * 16)); xmm5 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0C + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128(( __v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0C)); } )); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2 ), (__v16qi)(__m128i)(xmm1), (0x0C)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi )(__m128i)(xmm2), (0x0C)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x0C)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5 ), (__v16qi)(__m128i)(xmm4), (0x0C)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi )(__m128i)(xmm5), (0x0C)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x0C)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8 ), (__v16qi)(__m128i)(xmm7), (0x0C)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0C) & 127) + 16 - 0x0C ; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t * )dst + tmp; if (n >= 32 + 16 - 0x0C) { while (n >= 32 + 16 - 0x0C) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0C + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0C)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0C)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0C ) & 31) + 16 - 0x0C; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x0D: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0D) { xmm0 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0D + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 2 * 16)); xmm3 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0D + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 6 * 16)); xmm7 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0D + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0D)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0D)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x0D)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x0D)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0D)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x0D)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x0D)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0D)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0D ) & 127) + 16 - 0x0D; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0D ) { while (n >= 32 + 16 - 0x0D) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x0D + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x0D + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0D)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x0D)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0D) & 31) + 16 - 0x0D; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; case 0x0E: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0E) { xmm0 = _mm_loadu_si128((const __m128i *)( (const uint8_t *)src - 0x0E + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0E + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 4 * 16)); xmm5 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0E + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128(( __v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0E)); } )); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2 ), (__v16qi)(__m128i)(xmm1), (0x0E)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi )(__m128i)(xmm2), (0x0E)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x0E)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5 ), (__v16qi)(__m128i)(xmm4), (0x0E)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi )(__m128i)(xmm5), (0x0E)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x0E)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8 ), (__v16qi)(__m128i)(xmm7), (0x0E)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0E) & 127) + 16 - 0x0E ; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t * )dst + tmp; if (n >= 32 + 16 - 0x0E) { while (n >= 32 + 16 - 0x0E) { xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0E + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0E)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0E)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0E ) & 31) + 16 - 0x0E; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x0F: __extension__ ({ size_t tmp; while (n >= 128 + 16 - 0x0F) { xmm0 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0F + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 2 * 16)); xmm3 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0F + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 6 * 16)); xmm7 = _mm_loadu_si128 ((const __m128i *)((const uint8_t *)src - 0x0F + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi )(__m128i)(xmm0), (0x0F)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0F)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3 ), (__v16qi)(__m128i)(xmm2), (0x0F)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi )(__m128i)(xmm3), (0x0F)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0F)); })); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6 ), (__v16qi)(__m128i)(xmm5), (0x0F)); })); _mm_storeu_si128(( __m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi )(__m128i)(xmm6), (0x0F)); })); _mm_storeu_si128((__m128i *)( (uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128 ((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0F)); })); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0F ) & 127) + 16 - 0x0F; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0F ) { while (n >= 32 + 16 - 0x0F) { xmm0 = _mm_loadu_si128(( const __m128i *)((const uint8_t *)src - 0x0F + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t * )src - 0x0F + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F + 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi )(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0F)); })); _mm_storeu_si128 ((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i )__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi )(__m128i)(xmm1), (0x0F)); })); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0F) & 31) + 16 - 0x0F; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } } ); break; default:; } }); | |||
794 | ||||
795 | /** | |||
796 | * Copy whatever left | |||
797 | */ | |||
798 | goto COPY_BLOCK_64_BACK15; | |||
799 | } | |||
800 | ||||
801 | #endif /* RTE_MACHINE_CPUFLAG */ | |||
802 | ||||
803 | static inline void * | |||
804 | rte_memcpy_aligned(void *dst, const void *src, size_t n) | |||
805 | { | |||
806 | void *ret = dst; | |||
807 | ||||
808 | /* Copy size <= 16 bytes */ | |||
809 | if (n < 16) { | |||
810 | if (n & 0x01) { | |||
811 | *(uint8_t *)dst = *(const uint8_t *)src; | |||
812 | src = (const uint8_t *)src + 1; | |||
813 | dst = (uint8_t *)dst + 1; | |||
814 | } | |||
815 | if (n & 0x02) { | |||
816 | *(uint16_t *)dst = *(const uint16_t *)src; | |||
817 | src = (const uint16_t *)src + 1; | |||
818 | dst = (uint16_t *)dst + 1; | |||
819 | } | |||
820 | if (n & 0x04) { | |||
821 | *(uint32_t *)dst = *(const uint32_t *)src; | |||
| ||||
822 | src = (const uint32_t *)src + 1; | |||
823 | dst = (uint32_t *)dst + 1; | |||
824 | } | |||
825 | if (n & 0x08) | |||
826 | *(uint64_t *)dst = *(const uint64_t *)src; | |||
827 | ||||
828 | return ret; | |||
829 | } | |||
830 | ||||
831 | /* Copy 16 <= size <= 32 bytes */ | |||
832 | if (n <= 32) { | |||
833 | rte_mov16((uint8_t *)dst, (const uint8_t *)src); | |||
834 | rte_mov16((uint8_t *)dst - 16 + n, | |||
835 | (const uint8_t *)src - 16 + n); | |||
836 | ||||
837 | return ret; | |||
838 | } | |||
839 | ||||
840 | /* Copy 32 < size <= 64 bytes */ | |||
841 | if (n <= 64) { | |||
842 | rte_mov32((uint8_t *)dst, (const uint8_t *)src); | |||
843 | rte_mov32((uint8_t *)dst - 32 + n, | |||
844 | (const uint8_t *)src - 32 + n); | |||
845 | ||||
846 | return ret; | |||
847 | } | |||
848 | ||||
849 | /* Copy 64 bytes blocks */ | |||
850 | for (; n >= 64; n -= 64) { | |||
851 | rte_mov64((uint8_t *)dst, (const uint8_t *)src); | |||
852 | dst = (uint8_t *)dst + 64; | |||
853 | src = (const uint8_t *)src + 64; | |||
854 | } | |||
855 | ||||
856 | /* Copy whatever left */ | |||
857 | rte_mov64((uint8_t *)dst - 64 + n, | |||
858 | (const uint8_t *)src - 64 + n); | |||
859 | ||||
860 | return ret; | |||
861 | } | |||
862 | ||||
863 | static inline void * | |||
864 | rte_memcpy(void *dst, const void *src, size_t n) | |||
865 | { | |||
866 | if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK0x0F)) | |||
867 | return rte_memcpy_aligned(dst, src, n); | |||
868 | else | |||
869 | return rte_memcpy_generic(dst, src, n); | |||
870 | } | |||
871 | ||||
872 | #ifdef __cplusplus | |||
873 | } | |||
874 | #endif | |||
875 | ||||
876 | #endif /* _RTE_MEMCPY_X86_64_H_ */ |