Bug Summary

File: /home/bhubbard/working/src/ceph/build/src/dpdk/include/rte_ether.h
Warning: line 821, column 21
Assigned value is garbage or undefined
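
The path the checker reports starts in eth_random_addr() in rte_ether.h, which copies ETHER_ADDR_LEN (6) bytes of an 8-byte local through rte_memcpy(); the diagnostic is then raised inside the vectorized copy in rte_memcpy.h. The following is a minimal standalone sketch of that shape in standard C only; memcpy stands in for rte_memcpy and a fixed constant stands in for rte_rand(), so it illustrates the reported path rather than the DPDK code itself.

#include <stdint.h>
#include <string.h>

#define ETHER_ADDR_LEN 6

/* Same shape as eth_random_addr() in the listing below: take the address
 * of an 8-byte local and copy its first 6 bytes into addr[]. */
static void sketch_random_addr(uint8_t *addr)
{
    uint64_t rand = 0x0123456789abcdefULL;   /* stands in for rte_rand() */
    uint8_t *p = (uint8_t *)&rand;

    memcpy(addr, p, ETHER_ADDR_LEN);         /* rte_memcpy() in the real header */
    addr[0] &= (uint8_t)~0x01;               /* clear multicast bit */
    addr[0] |= 0x02;                         /* set locally administered bit */
}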

Annotated Source Code


/home/bhubbard/working/src/ceph/build/src/dpdk/include/rte_ether.h

1/* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2014 Intel Corporation
3 */
4
5#ifndef _RTE_ETHER_H_
6#define _RTE_ETHER_H_
7
8/**
9 * @file
10 *
11 * Ethernet Helpers in RTE
12 */
13
14#ifdef __cplusplus
15extern "C" {
16#endif
17
18#include <stdint.h>
19#include <stdio.h>
20
21#include <rte_memcpy.h>
22#include <rte_random.h>
23#include <rte_mbuf.h>
24#include <rte_byteorder.h>
25
26#define ETHER_ADDR_LEN 6 /**< Length of Ethernet address. */
27#define ETHER_TYPE_LEN 2 /**< Length of Ethernet type field. */
28#define ETHER_CRC_LEN 4 /**< Length of Ethernet CRC. */
29#define ETHER_HDR_LEN \
30 (ETHER_ADDR_LEN * 2 + ETHER_TYPE_LEN) /**< Length of Ethernet header. */
31#define ETHER_MIN_LEN 64 /**< Minimum frame len, including CRC. */
32#define ETHER_MAX_LEN 1518 /**< Maximum frame len, including CRC. */
33#define ETHER_MTU \
34 (ETHER_MAX_LEN - ETHER_HDR_LEN - ETHER_CRC_LEN) /**< Ethernet MTU. */
35
36#define ETHER_MAX_VLAN_FRAME_LEN \
37 (ETHER_MAX_LEN + 4) /**< Maximum VLAN frame length, including CRC. */
38
39#define ETHER_MAX_JUMBO_FRAME_LEN \
40 0x3F00 /**< Maximum Jumbo frame length, including CRC. */
41
42#define ETHER_MAX_VLAN_ID 4095 /**< Maximum VLAN ID. */
43
44#define ETHER_MIN_MTU 68 /**< Minimum MTU for IPv4 packets, see RFC 791. */
45
46/**
47 * Ethernet address:
48 * A universally administered address is uniquely assigned to a device by its
49 * manufacturer. The first three octets (in transmission order) contain the
50 * Organizationally Unique Identifier (OUI). The following three (MAC-48 and
51 * EUI-48) octets are assigned by that organization with the only constraint
52 * of uniqueness.
53 * A locally administered address is assigned to a device by a network
54 * administrator and does not contain OUIs.
55 * See http://standards.ieee.org/regauth/groupmac/tutorial.html
56 */
57struct ether_addr {
58 uint8_t addr_bytes[ETHER_ADDR_LEN]; /**< Addr bytes in tx order */
59} __attribute__((__packed__));
60
61#define ETHER_LOCAL_ADMIN_ADDR 0x02 /**< Locally assigned Eth. address. */
62#define ETHER_GROUP_ADDR 0x01 /**< Multicast or broadcast Eth. address. */
63
64/**
65 * Check if two Ethernet addresses are the same.
66 *
67 * @param ea1
68 * A pointer to the first ether_addr structure containing
69 * the ethernet address.
70 * @param ea2
71 * A pointer to the second ether_addr structure containing
72 * the ethernet address.
73 *
74 * @return
75 * True (1) if the given two ethernet address are the same;
76 * False (0) otherwise.
77 */
78static inline int is_same_ether_addr(const struct ether_addr *ea1,
79 const struct ether_addr *ea2)
80{
81 int i;
82 for (i = 0; i < ETHER_ADDR_LEN; i++)
83 if (ea1->addr_bytes[i] != ea2->addr_bytes[i])
84 return 0;
85 return 1;
86}
87
88/**
89 * Check if an Ethernet address is filled with zeros.
90 *
91 * @param ea
92 * A pointer to a ether_addr structure containing the ethernet address
93 * to check.
94 * @return
95 * True (1) if the given ethernet address is filled with zeros;
96 * false (0) otherwise.
97 */
98static inline int is_zero_ether_addr(const struct ether_addr *ea)
99{
100 int i;
101 for (i = 0; i < ETHER_ADDR_LEN; i++)
102 if (ea->addr_bytes[i] != 0x00)
103 return 0;
104 return 1;
105}
106
107/**
108 * Check if an Ethernet address is a unicast address.
109 *
110 * @param ea
111 * A pointer to a ether_addr structure containing the ethernet address
112 * to check.
113 * @return
114 * True (1) if the given ethernet address is a unicast address;
115 * false (0) otherwise.
116 */
117static inline int is_unicast_ether_addr(const struct ether_addr *ea)
118{
119 return (ea->addr_bytes[0] & ETHER_GROUP_ADDR) == 0;
120}
121
122/**
123 * Check if an Ethernet address is a multicast address.
124 *
125 * @param ea
126 * A pointer to a ether_addr structure containing the ethernet address
127 * to check.
128 * @return
129 * True (1) if the given ethernet address is a multicast address;
130 * false (0) otherwise.
131 */
132static inline int is_multicast_ether_addr(const struct ether_addr *ea)
133{
134 return ea->addr_bytes[0] & ETHER_GROUP_ADDR;
135}
136
137/**
138 * Check if an Ethernet address is a broadcast address.
139 *
140 * @param ea
141 * A pointer to a ether_addr structure containing the ethernet address
142 * to check.
143 * @return
144 * True (1) if the given ethernet address is a broadcast address;
145 * false (0) otherwise.
146 */
147static inline int is_broadcast_ether_addr(const struct ether_addr *ea)
148{
149 const unaligned_uint16_t *ea_words = (const unaligned_uint16_t *)ea;
150
151 return (ea_words[0] == 0xFFFF && ea_words[1] == 0xFFFF &&
152 ea_words[2] == 0xFFFF);
153}
154
155/**
156 * Check if an Ethernet address is a universally assigned address.
157 *
158 * @param ea
159 * A pointer to a ether_addr structure containing the ethernet address
160 * to check.
161 * @return
162 * True (1) if the given ethernet address is a universally assigned address;
163 * false (0) otherwise.
164 */
165static inline int is_universal_ether_addr(const struct ether_addr *ea)
166{
167 return (ea->addr_bytes[0] & ETHER_LOCAL_ADMIN_ADDR) == 0;
168}
169
170/**
171 * Check if an Ethernet address is a locally assigned address.
172 *
173 * @param ea
174 * A pointer to a ether_addr structure containing the ethernet address
175 * to check.
176 * @return
177 * True (1) if the given ethernet address is a locally assigned address;
178 * false (0) otherwise.
179 */
180static inline int is_local_admin_ether_addr(const struct ether_addr *ea)
181{
182 return (ea->addr_bytes[0] & ETHER_LOCAL_ADMIN_ADDR) != 0;
183}
184
185/**
186 * Check if an Ethernet address is a valid address. Checks that the address is a
187 * unicast address and is not filled with zeros.
188 *
189 * @param ea
190 * A pointer to a ether_addr structure containing the ethernet address
191 * to check.
192 * @return
193 * True (1) if the given ethernet address is valid;
194 * false (0) otherwise.
195 */
196static inline int is_valid_assigned_ether_addr(const struct ether_addr *ea)
197{
198 return is_unicast_ether_addr(ea) && (!is_zero_ether_addr(ea));
199}
200
201/**
202 * Generate a random Ethernet address that is locally administered
203 * and not multicast.
204 * @param addr
205 * A pointer to Ethernet address.
206 */
207static inline void eth_random_addr(uint8_t *addr)
208{
209 uint64_t rand = rte_rand();
210 uint8_t *p = (uint8_t *)&rand;
211
212 rte_memcpy(addr, p, ETHER_ADDR_LEN);
    1. Calling 'rte_memcpy'
213 addr[0] &= (uint8_t)~ETHER_GROUP_ADDR; /* clear multicast bit */
214 addr[0] |= ETHER_LOCAL_ADMIN_ADDR; /* set local assignment bit */
215}
216
217/**
218 * Fast copy an Ethernet address.
219 *
220 * @param ea_from
221 * A pointer to a ether_addr structure holding the Ethernet address to copy.
222 * @param ea_to
223 * A pointer to a ether_addr structure where to copy the Ethernet address.
224 */
225static inline void ether_addr_copy(const struct ether_addr *ea_from,
226 struct ether_addr *ea_to)
227{
228#ifdef __INTEL_COMPILER
229 uint16_t *from_words = (uint16_t *)(ea_from->addr_bytes);
230 uint16_t *to_words = (uint16_t *)(ea_to->addr_bytes);
231
232 to_words[0] = from_words[0];
233 to_words[1] = from_words[1];
234 to_words[2] = from_words[2];
235#else
236 /*
237 * Use the common way, because of a strange gcc warning.
238 */
239 *ea_to = *ea_from;
240#endif
241}
242
243#define ETHER_ADDR_FMT_SIZE 18
244/**
245 * Format 48bits Ethernet address in pattern xx:xx:xx:xx:xx:xx.
246 *
247 * @param buf
248 * A pointer to buffer contains the formatted MAC address.
249 * @param size
250 * The format buffer size.
251 * @param eth_addr
252 * A pointer to a ether_addr structure.
253 */
254static inline void
255ether_format_addr(char *buf, uint16_t size,
256 const struct ether_addr *eth_addr)
257{
258 snprintf(buf, size, "%02X:%02X:%02X:%02X:%02X:%02X",
259 eth_addr->addr_bytes[0],
260 eth_addr->addr_bytes[1],
261 eth_addr->addr_bytes[2],
262 eth_addr->addr_bytes[3],
263 eth_addr->addr_bytes[4],
264 eth_addr->addr_bytes[5]);
265}
266
267/**
268 * Ethernet header: Contains the destination address, source address
269 * and frame type.
270 */
271struct ether_hdr {
272 struct ether_addr d_addr; /**< Destination address. */
273 struct ether_addr s_addr; /**< Source address. */
274 uint16_t ether_type; /**< Frame type. */
275} __attribute__((__packed__));
276
277/**
278 * Ethernet VLAN Header.
279 * Contains the 16-bit VLAN Tag Control Identifier and the Ethernet type
280 * of the encapsulated frame.
281 */
282struct vlan_hdr {
283 uint16_t vlan_tci; /**< Priority (3) + CFI (1) + Identifier Code (12) */
284 uint16_t eth_proto;/**< Ethernet type of encapsulated frame. */
285} __attribute__((__packed__));
286
287/**
288 * VXLAN protocol header.
289 * Contains the 8-bit flag, 24-bit VXLAN Network Identifier and
290 * Reserved fields (24 bits and 8 bits)
291 */
292struct vxlan_hdr {
293 uint32_t vx_flags; /**< flag (8) + Reserved (24). */
294 uint32_t vx_vni; /**< VNI (24) + Reserved (8). */
295} __attribute__((__packed__));
296
297/* Ethernet frame types */
298#define ETHER_TYPE_IPv4 0x0800 /**< IPv4 Protocol. */
299#define ETHER_TYPE_IPv6 0x86DD /**< IPv6 Protocol. */
300#define ETHER_TYPE_ARP 0x0806 /**< Arp Protocol. */
301#define ETHER_TYPE_RARP 0x8035 /**< Reverse Arp Protocol. */
302#define ETHER_TYPE_VLAN 0x8100 /**< IEEE 802.1Q VLAN tagging. */
303#define ETHER_TYPE_QINQ 0x88A8 /**< IEEE 802.1ad QinQ tagging. */
304#define ETHER_TYPE_PPPOE_DISCOVERY 0x8863 /**< PPPoE Discovery Stage. */
305#define ETHER_TYPE_PPPOE_SESSION 0x8864 /**< PPPoE Session Stage. */
306#define ETHER_TYPE_ETAG 0x893F /**< IEEE 802.1BR E-Tag. */
307#define ETHER_TYPE_1588 0x88F7 /**< IEEE 802.1AS 1588 Precise Time Protocol. */
308#define ETHER_TYPE_SLOW 0x8809 /**< Slow protocols (LACP and Marker). */
309#define ETHER_TYPE_TEB 0x6558 /**< Transparent Ethernet Bridging. */
310#define ETHER_TYPE_LLDP 0x88CC /**< LLDP Protocol. */
311#define ETHER_TYPE_MPLS 0x8847 /**< MPLS ethertype. */
312#define ETHER_TYPE_MPLSM 0x8848 /**< MPLS multicast ethertype. */
313
314#define ETHER_VXLAN_HLEN (sizeof(struct udp_hdr) + sizeof(struct vxlan_hdr))
315/**< VXLAN tunnel header length. */
316
317/**
318 * VXLAN-GPE protocol header (draft-ietf-nvo3-vxlan-gpe-05).
319 * Contains the 8-bit flag, 8-bit next-protocol, 24-bit VXLAN Network
320 * Identifier and Reserved fields (16 bits and 8 bits).
321 */
322struct vxlan_gpe_hdr {
323 uint8_t vx_flags; /**< flag (8). */
324 uint8_t reserved[2]; /**< Reserved (16). */
325 uint8_t proto; /**< next-protocol (8). */
326 uint32_t vx_vni; /**< VNI (24) + Reserved (8). */
327} __attribute__((__packed__));
328
329/* VXLAN-GPE next protocol types */
330#define VXLAN_GPE_TYPE_IPV4 1 /**< IPv4 Protocol. */
331#define VXLAN_GPE_TYPE_IPV6 2 /**< IPv6 Protocol. */
332#define VXLAN_GPE_TYPE_ETH 3 /**< Ethernet Protocol. */
333#define VXLAN_GPE_TYPE_NSH 4 /**< NSH Protocol. */
334#define VXLAN_GPE_TYPE_MPLS 5 /**< MPLS Protocol. */
335#define VXLAN_GPE_TYPE_GBP 6 /**< GBP Protocol. */
336#define VXLAN_GPE_TYPE_VBNG 7 /**< vBNG Protocol. */
337
338#define ETHER_VXLAN_GPE_HLEN (sizeof(struct udp_hdr) + \
339 sizeof(struct vxlan_gpe_hdr))
340/**< VXLAN-GPE tunnel header length. */
341
342/**
343 * Extract VLAN tag information into mbuf
344 *
345 * Software version of VLAN stripping
346 *
347 * @param m
348 * The packet mbuf.
349 * @return
350 * - 0: Success
351 * - 1: not a vlan packet
352 */
353static inline int rte_vlan_strip(struct rte_mbuf *m)
354{
355 struct ether_hdr *eh
356 = rte_pktmbuf_mtod(m, struct ether_hdr *);
357 struct vlan_hdr *vh;
358
359 if (eh->ether_type != rte_cpu_to_be_16(ETHER_TYPE_VLAN))
360 return -1;
361
362 vh = (struct vlan_hdr *)(eh + 1);
363 m->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
364 m->vlan_tci = rte_be_to_cpu_16(vh->vlan_tci);
365
366 /* Copy ether header over rather than moving whole packet */
367 memmove(rte_pktmbuf_adj(m, sizeof(struct vlan_hdr)),
368 eh, 2 * ETHER_ADDR_LEN);
369
370 return 0;
371}
372
373/**
374 * Insert VLAN tag into mbuf.
375 *
376 * Software version of VLAN unstripping
377 *
378 * @param m
379 * The packet mbuf.
380 * @return
381 * - 0: On success
382 -EPERM: mbuf is shared, overwriting would be unsafe
383 * -ENOSPC: not enough headroom in mbuf
384 */
385static inline int rte_vlan_insert(struct rte_mbuf **m)
386{
387 struct ether_hdr *oh, *nh;
388 struct vlan_hdr *vh;
389
390 /* Can't insert header if mbuf is shared */
391 if (rte_mbuf_refcnt_read(*m) > 1) {
392 struct rte_mbuf *copy;
393
394 copy = rte_pktmbuf_clone(*m, (*m)->pool);
395 if (unlikely(copy == NULL))
396 return -ENOMEM;
397 rte_pktmbuf_free(*m);
398 *m = copy;
399 }
400
401 oh = rte_pktmbuf_mtod(*m, struct ether_hdr *);
402 nh = (struct ether_hdr *)
403 rte_pktmbuf_prepend(*m, sizeof(struct vlan_hdr));
404 if (nh == NULL)
405 return -ENOSPC;
406
407 memmove(nh, oh, 2 * ETHER_ADDR_LEN);
408 nh->ether_type = rte_cpu_to_be_16(ETHER_TYPE_VLAN);
409
410 vh = (struct vlan_hdr *) (nh + 1);
411 vh->vlan_tci = rte_cpu_to_be_16((*m)->vlan_tci);
412
413 (*m)->ol_flags &= ~(PKT_RX_VLAN_STRIPPED | PKT_TX_VLAN);
414
415 return 0;
416}
417
418#ifdef __cplusplus
419}
420#endif
421
422#endif /* _RTE_ETHER_H_ */
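
For context, here is a short hypothetical usage sketch of the helpers defined in the header above (not part of the analyzer report): generate a locally administered MAC address, validate it, and format it.

#include <stdio.h>
/* In a DPDK application this would be #include <rte_ether.h>. */

static void example_mac(void)
{
    struct ether_addr mac;
    char buf[ETHER_ADDR_FMT_SIZE];

    eth_random_addr(mac.addr_bytes);              /* random, locally administered */
    if (is_valid_assigned_ether_addr(&mac)) {     /* unicast and non-zero */
        ether_format_addr(buf, sizeof(buf), &mac);
        printf("generated MAC: %s\n", buf);       /* e.g. 02:xx:xx:xx:xx:xx */
    }
}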

/home/bhubbard/working/src/ceph/build/src/dpdk/include/rte_memcpy.h

1/* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2014 Intel Corporation
3 */
4
5#ifndef _RTE_MEMCPY_X86_64_H_
6#define _RTE_MEMCPY_X86_64_H_
7
8/**
9 * @file
10 *
11 * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
12 */
13
14#include <stdio.h>
15#include <stdint.h>
16#include <string.h>
17#include <rte_vect.h>
18#include <rte_common.h>
19#include <rte_config.h>
20
21#ifdef __cplusplus
22extern "C" {
23#endif
24
25/**
26 * Copy bytes from one location to another. The locations must not overlap.
27 *
28 * @note This is implemented as a macro, so its address should not be taken
29 * and care is needed as parameter expressions may be evaluated multiple times.
30 *
31 * @param dst
32 * Pointer to the destination of the data.
33 * @param src
34 * Pointer to the source data.
35 * @param n
36 * Number of bytes to copy.
37 * @return
38 * Pointer to the destination data.
39 */
40static __rte_always_inline void *
41rte_memcpy(void *dst, const void *src, size_t n);
42
43#ifdef RTE_MACHINE_CPUFLAG_AVX512F
44
45#define ALIGNMENT_MASK 0x3F
46
47/**
48 * AVX512 implementation below
49 */
50
51/**
52 * Copy 16 bytes from one location to another,
53 * locations should not overlap.
54 */
55static __rte_always_inline void
56rte_mov16(uint8_t *dst, const uint8_t *src)
57{
58 __m128i xmm0;
59
60 xmm0 = _mm_loadu_si128((const __m128i *)src);
61 _mm_storeu_si128((__m128i *)dst, xmm0);
62}
63
64/**
65 * Copy 32 bytes from one location to another,
66 * locations should not overlap.
67 */
68static __rte_always_inline void
69rte_mov32(uint8_t *dst, const uint8_t *src)
70{
71 __m256i ymm0;
72
73 ymm0 = _mm256_loadu_si256((const __m256i *)src);
74 _mm256_storeu_si256((__m256i *)dst, ymm0);
75}
76
77/**
78 * Copy 64 bytes from one location to another,
79 * locations should not overlap.
80 */
81static __rte_always_inline void
82rte_mov64(uint8_t *dst, const uint8_t *src)
83{
84 __m512i zmm0;
85
86 zmm0 = _mm512_loadu_si512((const void *)src);
87 _mm512_storeu_si512((void *)dst, zmm0);
88}
89
90/**
91 * Copy 128 bytes from one location to another,
92 * locations should not overlap.
93 */
94static __rte_always_inline void
95rte_mov128(uint8_t *dst, const uint8_t *src)
96{
97 rte_mov64(dst + 0 * 64, src + 0 * 64);
98 rte_mov64(dst + 1 * 64, src + 1 * 64);
99}
100
101/**
102 * Copy 256 bytes from one location to another,
103 * locations should not overlap.
104 */
105static __rte_always_inline void
106rte_mov256(uint8_t *dst, const uint8_t *src)
107{
108 rte_mov64(dst + 0 * 64, src + 0 * 64);
109 rte_mov64(dst + 1 * 64, src + 1 * 64);
110 rte_mov64(dst + 2 * 64, src + 2 * 64);
111 rte_mov64(dst + 3 * 64, src + 3 * 64);
112}
113
114/**
115 * Copy 128-byte blocks from one location to another,
116 * locations should not overlap.
117 */
118static inline void
119rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
120{
121 __m512i zmm0, zmm1;
122
123 while (n >= 128) {
124 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
125 n -= 128;
126 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
127 src = src + 128;
128 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
129 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
130 dst = dst + 128;
131 }
132}
133
134/**
135 * Copy 512-byte blocks from one location to another,
136 * locations should not overlap.
137 */
138static inline void
139rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
140{
141 __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
142
143 while (n >= 512) {
144 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
145 n -= 512;
146 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
147 zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
148 zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
149 zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
150 zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
151 zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
152 zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
153 src = src + 512;
154 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
155 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
156 _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
157 _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
158 _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
159 _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
160 _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
161 _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
162 dst = dst + 512;
163 }
164}
165
166static inline void *
167rte_memcpy_generic(void *dst, const void *src, size_t n)
168{
169 uintptr_t dstu = (uintptr_t)dst;
170 uintptr_t srcu = (uintptr_t)src;
171 void *ret = dst;
172 size_t dstofss;
173 size_t bits;
174
175 /**
176 * Copy less than 16 bytes
177 */
178 if (n < 16) {
179 if (n & 0x01) {
180 *(uint8_t *)dstu = *(const uint8_t *)srcu;
181 srcu = (uintptr_t)((const uint8_t *)srcu + 1);
182 dstu = (uintptr_t)((uint8_t *)dstu + 1);
183 }
184 if (n & 0x02) {
185 *(uint16_t *)dstu = *(const uint16_t *)srcu;
186 srcu = (uintptr_t)((const uint16_t *)srcu + 1);
187 dstu = (uintptr_t)((uint16_t *)dstu + 1);
188 }
189 if (n & 0x04) {
190 *(uint32_t *)dstu = *(const uint32_t *)srcu;
191 srcu = (uintptr_t)((const uint32_t *)srcu + 1);
192 dstu = (uintptr_t)((uint32_t *)dstu + 1);
193 }
194 if (n & 0x08)
195 *(uint64_t *)dstu = *(const uint64_t *)srcu;
196 return ret;
197 }
198
199 /**
200 * Fast way when copy size doesn't exceed 512 bytes
201 */
202 if (n <= 32) {
203 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
204 rte_mov16((uint8_t *)dst - 16 + n,
205 (const uint8_t *)src - 16 + n);
206 return ret;
207 }
208 if (n <= 64) {
209 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
210 rte_mov32((uint8_t *)dst - 32 + n,
211 (const uint8_t *)src - 32 + n);
212 return ret;
213 }
214 if (n <= 512) {
215 if (n >= 256) {
216 n -= 256;
217 rte_mov256((uint8_t *)dst, (const uint8_t *)src);
218 src = (const uint8_t *)src + 256;
219 dst = (uint8_t *)dst + 256;
220 }
221 if (n >= 128) {
222 n -= 128;
223 rte_mov128((uint8_t *)dst, (const uint8_t *)src);
224 src = (const uint8_t *)src + 128;
225 dst = (uint8_t *)dst + 128;
226 }
227COPY_BLOCK_128_BACK63:
228 if (n > 64) {
229 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
230 rte_mov64((uint8_t *)dst - 64 + n,
231 (const uint8_t *)src - 64 + n);
232 return ret;
233 }
234 if (n > 0)
235 rte_mov64((uint8_t *)dst - 64 + n,
236 (const uint8_t *)src - 64 + n);
237 return ret;
238 }
239
240 /**
241 * Make store aligned when copy size exceeds 512 bytes
242 */
243 dstofss = ((uintptr_t)dst & 0x3F);
244 if (dstofss > 0) {
245 dstofss = 64 - dstofss;
246 n -= dstofss;
247 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
248 src = (const uint8_t *)src + dstofss;
249 dst = (uint8_t *)dst + dstofss;
250 }
251
252 /**
253 * Copy 512-byte blocks.
254 * Use copy block function for better instruction order control,
255 * which is important when load is unaligned.
256 */
257 rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
258 bits = n;
259 n = n & 511;
260 bits -= n;
261 src = (const uint8_t *)src + bits;
262 dst = (uint8_t *)dst + bits;
263
264 /**
265 * Copy 128-byte blocks.
266 * Use copy block function for better instruction order control,
267 * which is important when load is unaligned.
268 */
269 if (n >= 128) {
270 rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
271 bits = n;
272 n = n & 127;
273 bits -= n;
274 src = (const uint8_t *)src + bits;
275 dst = (uint8_t *)dst + bits;
276 }
277
278 /**
279 * Copy whatever left
280 */
281 goto COPY_BLOCK_128_BACK63;
282}
283
284#elif defined RTE_MACHINE_CPUFLAG_AVX2
285
286#define ALIGNMENT_MASK 0x1F
287
288/**
289 * AVX2 implementation below
290 */
291
292/**
293 * Copy 16 bytes from one location to another,
294 * locations should not overlap.
295 */
296static __rte_always_inline void
297rte_mov16(uint8_t *dst, const uint8_t *src)
298{
299 __m128i xmm0;
300
301 xmm0 = _mm_loadu_si128((const __m128i *)src);
302 _mm_storeu_si128((__m128i *)dst, xmm0);
303}
304
305/**
306 * Copy 32 bytes from one location to another,
307 * locations should not overlap.
308 */
309static __rte_always_inline void
310rte_mov32(uint8_t *dst, const uint8_t *src)
311{
312 __m256i ymm0;
313
314 ymm0 = _mm256_loadu_si256((const __m256i *)src);
315 _mm256_storeu_si256((__m256i *)dst, ymm0);
316}
317
318/**
319 * Copy 64 bytes from one location to another,
320 * locations should not overlap.
321 */
322static __rte_always_inline void
323rte_mov64(uint8_t *dst, const uint8_t *src)
324{
325 rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
326 rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
327}
328
329/**
330 * Copy 128 bytes from one location to another,
331 * locations should not overlap.
332 */
333static inline void
334rte_mov128(uint8_t *dst, const uint8_t *src)
335{
336 rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
337 rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
338 rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
339 rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
340}
341
342/**
343 * Copy 128-byte blocks from one location to another,
344 * locations should not overlap.
345 */
346static inline void
347rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
348{
349 __m256i ymm0, ymm1, ymm2, ymm3;
350
351 while (n >= 128) {
352 ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
353 n -= 128;
354 ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
355 ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
356 ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
357 src = (const uint8_t *)src + 128;
358 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
359 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
360 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
361 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
362 dst = (uint8_t *)dst + 128;
363 }
364}
365
366static inline void *
367rte_memcpy_generic(void *dst, const void *src, size_t n)
368{
369 uintptr_t dstu = (uintptr_t)dst;
370 uintptr_t srcu = (uintptr_t)src;
371 void *ret = dst;
372 size_t dstofss;
373 size_t bits;
374
375 /**
376 * Copy less than 16 bytes
377 */
378 if (n < 16) {
379 if (n & 0x01) {
380 *(uint8_t *)dstu = *(const uint8_t *)srcu;
381 srcu = (uintptr_t)((const uint8_t *)srcu + 1);
382 dstu = (uintptr_t)((uint8_t *)dstu + 1);
383 }
384 if (n & 0x02) {
385 *(uint16_t *)dstu = *(const uint16_t *)srcu;
386 srcu = (uintptr_t)((const uint16_t *)srcu + 1);
387 dstu = (uintptr_t)((uint16_t *)dstu + 1);
388 }
389 if (n & 0x04) {
390 *(uint32_t *)dstu = *(const uint32_t *)srcu;
391 srcu = (uintptr_t)((const uint32_t *)srcu + 1);
392 dstu = (uintptr_t)((uint32_t *)dstu + 1);
393 }
394 if (n & 0x08) {
395 *(uint64_t *)dstu = *(const uint64_t *)srcu;
396 }
397 return ret;
398 }
399
400 /**
401 * Fast way when copy size doesn't exceed 256 bytes
402 */
403 if (n <= 32) {
404 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
405 rte_mov16((uint8_t *)dst - 16 + n,
406 (const uint8_t *)src - 16 + n);
407 return ret;
408 }
409 if (n <= 48) {
410 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
411 rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
412 rte_mov16((uint8_t *)dst - 16 + n,
413 (const uint8_t *)src - 16 + n);
414 return ret;
415 }
416 if (n <= 64) {
417 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
418 rte_mov32((uint8_t *)dst - 32 + n,
419 (const uint8_t *)src - 32 + n);
420 return ret;
421 }
422 if (n <= 256) {
423 if (n >= 128) {
424 n -= 128;
425 rte_mov128((uint8_t *)dst, (const uint8_t *)src);
426 src = (const uint8_t *)src + 128;
427 dst = (uint8_t *)dst + 128;
428 }
429COPY_BLOCK_128_BACK31:
430 if (n >= 64) {
431 n -= 64;
432 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
433 src = (const uint8_t *)src + 64;
434 dst = (uint8_t *)dst + 64;
435 }
436 if (n > 32) {
437 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
438 rte_mov32((uint8_t *)dst - 32 + n,
439 (const uint8_t *)src - 32 + n);
440 return ret;
441 }
442 if (n > 0) {
443 rte_mov32((uint8_t *)dst - 32 + n,
444 (const uint8_t *)src - 32 + n);
445 }
446 return ret;
447 }
448
449 /**
450 * Make store aligned when copy size exceeds 256 bytes
451 */
452 dstofss = (uintptr_t)dst & 0x1F;
453 if (dstofss > 0) {
454 dstofss = 32 - dstofss;
455 n -= dstofss;
456 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
457 src = (const uint8_t *)src + dstofss;
458 dst = (uint8_t *)dst + dstofss;
459 }
460
461 /**
462 * Copy 128-byte blocks
463 */
464 rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
465 bits = n;
466 n = n & 127;
467 bits -= n;
468 src = (const uint8_t *)src + bits;
469 dst = (uint8_t *)dst + bits;
470
471 /**
472 * Copy whatever left
473 */
474 goto COPY_BLOCK_128_BACK31;
475}
476
477#else /* RTE_MACHINE_CPUFLAG */
478
479#define ALIGNMENT_MASK 0x0F
480
481/**
482 * SSE & AVX implementation below
483 */
484
485/**
486 * Copy 16 bytes from one location to another,
487 * locations should not overlap.
488 */
489static __rte_always_inline void
490rte_mov16(uint8_t *dst, const uint8_t *src)
491{
492 __m128i xmm0;
493
494 xmm0 = _mm_loadu_si128((const __m128i *)(const __m128i *)src);
495 _mm_storeu_si128((__m128i *)dst, xmm0);
496}
497
498/**
499 * Copy 32 bytes from one location to another,
500 * locations should not overlap.
501 */
502static __rte_always_inline void
503rte_mov32(uint8_t *dst, const uint8_t *src)
504{
505 rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
506 rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
507}
508
509/**
510 * Copy 64 bytes from one location to another,
511 * locations should not overlap.
512 */
513static __rte_always_inline void
514rte_mov64(uint8_t *dst, const uint8_t *src)
515{
516 rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
517 rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
518 rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
519 rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
520}
521
522/**
523 * Copy 128 bytes from one location to another,
524 * locations should not overlap.
525 */
526static inline void
527rte_mov128(uint8_t *dst, const uint8_t *src)
528{
529 rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
530 rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
531 rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
532 rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
533 rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
534 rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
535 rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
536 rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
537}
538
539/**
540 * Copy 256 bytes from one location to another,
541 * locations should not overlap.
542 */
543static inline void
544rte_mov256(uint8_t *dst, const uint8_t *src)
545{
546 rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
547 rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
548 rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
549 rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
550 rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
551 rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
552 rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
553 rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
554 rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
555 rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
556 rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
557 rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
558 rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
559 rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
560 rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
561 rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
562}
563
564/**
565 * Macro for copying unaligned block from one location to another with constant load offset,
566 * 47 bytes leftover maximum,
567 * locations should not overlap.
568 * Requirements:
569 * - Store is aligned
570 * - Load offset is <offset>, which must be immediate value within [1, 15]
571 * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
572 * - <dst>, <src>, <len> must be variables
573 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
574 */
575#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
576__extension__ ({ \
577 size_t tmp; \
578 while (len >= 128 + 16 - offset) { \
579 xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
580 len -= 128; \
581 xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
582 xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
583 xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16)); \
584 xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16)); \
585 xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16)); \
586 xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16)); \
587 xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16)); \
588 xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16)); \
589 src = (const uint8_t *)src + 128; \
590 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
591 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
592 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
593 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
594 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
595 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
596 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
597 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
598 dst = (uint8_t *)dst + 128; \
599 } \
600 tmp = len; \
601 len = ((len - 16 + offset) & 127) + 16 - offset; \
602 tmp -= len; \
603 src = (const uint8_t *)src + tmp; \
604 dst = (uint8_t *)dst + tmp; \
605 if (len >= 32 + 16 - offset) { \
606 while (len >= 32 + 16 - offset) { \
607 xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
608 len -= 32; \
609 xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
610 xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
611 src = (const uint8_t *)src + 32; \
612 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
613 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
614 dst = (uint8_t *)dst + 32; \
615 } \
616 tmp = len; \
617 len = ((len - 16 + offset) & 31) + 16 - offset; \
618 tmp -= len; \
619 src = (const uint8_t *)src + tmp; \
620 dst = (uint8_t *)dst + tmp; \
621 } \
622})
623
624/**
625 * Macro for copying unaligned block from one location to another,
626 * 47 bytes leftover maximum,
627 * locations should not overlap.
628 * Use switch here because the aligning instruction requires immediate value for shift count.
629 * Requirements:
630 * - Store is aligned
631 * - Load offset is <offset>, which must be within [1, 15]
632 * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
633 * - <dst>, <src>, <len> must be variables
634 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
635 */
636#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \
((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x07));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3
), (__v16qi)(__m128i)(xmm2), (0x07)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi
)(__m128i)(xmm3), (0x07)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x07));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6
), (__v16qi)(__m128i)(xmm5), (0x07)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi
)(__m128i)(xmm6), (0x07)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x07));
})); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x07
) & 127) + 16 - 0x07; tmp -= n; src = (const uint8_t *)src
+ tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x07
) { while (n >= 32 + 16 - 0x07) { xmm0 = _mm_loadu_si128((
const __m128i *)((const uint8_t *)src - 0x07 + 0 * 16)); n -=
32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *
)src - 0x07 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x07 + 2 * 16)); src = (const uint8_t
*)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 *
16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi
)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x07)); })); _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi
)(__m128i)(xmm1), (0x07)); })); dst = (uint8_t *)dst + 32; } tmp
= n; n = ((n - 16 + 0x07) & 31) + 16 - 0x07; tmp -= n; src
= (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }
); break; case 0x08: __extension__ ({ size_t tmp; while (n >=
128 + 16 - 0x08) { xmm0 = _mm_loadu_si128((const __m128i *)(
(const uint8_t *)src - 0x08 + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x08 + 1 * 16)); xmm2
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08
+ 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x08 + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x08 + 4 * 16)); xmm5 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x08 + 5 * 16)); xmm6
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08
+ 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x08 + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x08 + 8 * 16)); src = (const uint8_t
*)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0
* 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((
__v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x08)); }
)); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2
), (__v16qi)(__m128i)(xmm1), (0x08)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi
)(__m128i)(xmm2), (0x08)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x08));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5
), (__v16qi)(__m128i)(xmm4), (0x08)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi
)(__m128i)(xmm5), (0x08)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x08));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8
), (__v16qi)(__m128i)(xmm7), (0x08)); })); dst = (uint8_t *)dst
+ 128; } tmp = n; n = ((n - 16 + 0x08) & 127) + 16 - 0x08
; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *
)dst + tmp; if (n >= 32 + 16 - 0x08) { while (n >= 32 +
16 - 0x08) { xmm0 = _mm_loadu_si128((const __m128i *)((const
uint8_t *)src - 0x08 + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x08 + 1 * 16)); xmm2
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x08
+ 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi
)(__m128i)(xmm0), (0x08)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x08));
})); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x08
) & 31) + 16 - 0x08; tmp -= n; src = (const uint8_t *)src
+ tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x09: __extension__
({ size_t tmp; while (n >= 128 + 16 - 0x09) { xmm0 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x09 + 0 * 16)); n -=
128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x09 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x09 + 2 * 16)); xmm3 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x09 + 3 * 16)); xmm4
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09
+ 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x09 + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x09 + 6 * 16)); xmm7 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x09 + 7 * 16)); xmm8
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x09
+ 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi
)(__m128i)(xmm0), (0x09)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x09));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3
), (__v16qi)(__m128i)(xmm2), (0x09)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi
)(__m128i)(xmm3), (0x09)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x09));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6
), (__v16qi)(__m128i)(xmm5), (0x09)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi
)(__m128i)(xmm6), (0x09)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x09));
})); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x09
) & 127) + 16 - 0x09; tmp -= n; src = (const uint8_t *)src
+ tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x09
) { while (n >= 32 + 16 - 0x09) { xmm0 = _mm_loadu_si128((
const __m128i *)((const uint8_t *)src - 0x09 + 0 * 16)); n -=
32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *
)src - 0x09 + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x09 + 2 * 16)); src = (const uint8_t
*)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 *
16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi
)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x09)); })); _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi
)(__m128i)(xmm1), (0x09)); })); dst = (uint8_t *)dst + 32; } tmp
= n; n = ((n - 16 + 0x09) & 31) + 16 - 0x09; tmp -= n; src
= (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }
); break; case 0x0A: __extension__ ({ size_t tmp; while (n >=
128 + 16 - 0x0A) { xmm0 = _mm_loadu_si128((const __m128i *)(
(const uint8_t *)src - 0x0A + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0A + 1 * 16)); xmm2
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A
+ 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0A + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0A + 4 * 16)); xmm5 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0A + 5 * 16)); xmm6
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A
+ 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0A + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0A + 8 * 16)); src = (const uint8_t
*)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0
* 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((
__v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0A)); }
)); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2
), (__v16qi)(__m128i)(xmm1), (0x0A)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi
)(__m128i)(xmm2), (0x0A)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x0A));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5
), (__v16qi)(__m128i)(xmm4), (0x0A)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi
)(__m128i)(xmm5), (0x0A)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x0A));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8
), (__v16qi)(__m128i)(xmm7), (0x0A)); })); dst = (uint8_t *)dst
+ 128; } tmp = n; n = ((n - 16 + 0x0A) & 127) + 16 - 0x0A
; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *
)dst + tmp; if (n >= 32 + 16 - 0x0A) { while (n >= 32 +
16 - 0x0A) { xmm0 = _mm_loadu_si128((const __m128i *)((const
uint8_t *)src - 0x0A + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0A + 1 * 16)); xmm2
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0A
+ 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi
)(__m128i)(xmm0), (0x0A)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0A));
})); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0A
) & 31) + 16 - 0x0A; tmp -= n; src = (const uint8_t *)src
+ tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x0B: __extension__
({ size_t tmp; while (n >= 128 + 16 - 0x0B) { xmm0 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0B + 0 * 16)); n -=
128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0B + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0B + 2 * 16)); xmm3 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0B + 3 * 16)); xmm4
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B
+ 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0B + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0B + 6 * 16)); xmm7 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0B + 7 * 16)); xmm8
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B
+ 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi
)(__m128i)(xmm0), (0x0B)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0B));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3
), (__v16qi)(__m128i)(xmm2), (0x0B)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi
)(__m128i)(xmm3), (0x0B)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0B));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6
), (__v16qi)(__m128i)(xmm5), (0x0B)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi
)(__m128i)(xmm6), (0x0B)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0B));
})); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0B
) & 127) + 16 - 0x0B; tmp -= n; src = (const uint8_t *)src
+ tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0B
) { while (n >= 32 + 16 - 0x0B) { xmm0 = _mm_loadu_si128((
const __m128i *)((const uint8_t *)src - 0x0B + 0 * 16)); n -=
32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *
)src - 0x0B + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0B + 2 * 16)); src = (const uint8_t
*)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 *
16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi
)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0B)); })); _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi
)(__m128i)(xmm1), (0x0B)); })); dst = (uint8_t *)dst + 32; } tmp
= n; n = ((n - 16 + 0x0B) & 31) + 16 - 0x0B; tmp -= n; src
= (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }
); break; case 0x0C: __extension__ ({ size_t tmp; while (n >=
128 + 16 - 0x0C) { xmm0 = _mm_loadu_si128((const __m128i *)(
(const uint8_t *)src - 0x0C + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0C + 1 * 16)); xmm2
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C
+ 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0C + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0C + 4 * 16)); xmm5 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0C + 5 * 16)); xmm6
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C
+ 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0C + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0C + 8 * 16)); src = (const uint8_t
*)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0
* 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((
__v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0C)); }
)); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2
), (__v16qi)(__m128i)(xmm1), (0x0C)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi
)(__m128i)(xmm2), (0x0C)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x0C));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5
), (__v16qi)(__m128i)(xmm4), (0x0C)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi
)(__m128i)(xmm5), (0x0C)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x0C));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8
), (__v16qi)(__m128i)(xmm7), (0x0C)); })); dst = (uint8_t *)dst
+ 128; } tmp = n; n = ((n - 16 + 0x0C) & 127) + 16 - 0x0C
; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *
)dst + tmp; if (n >= 32 + 16 - 0x0C) { while (n >= 32 +
16 - 0x0C) { xmm0 = _mm_loadu_si128((const __m128i *)((const
uint8_t *)src - 0x0C + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0C + 1 * 16)); xmm2
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C
+ 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi
)(__m128i)(xmm0), (0x0C)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0C));
})); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0C
) & 31) + 16 - 0x0C; tmp -= n; src = (const uint8_t *)src
+ tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x0D: __extension__
({ size_t tmp; while (n >= 128 + 16 - 0x0D) { xmm0 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0D + 0 * 16)); n -=
128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0D + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0D + 2 * 16)); xmm3 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0D + 3 * 16)); xmm4
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D
+ 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0D + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0D + 6 * 16)); xmm7 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0D + 7 * 16)); xmm8
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D
+ 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi
)(__m128i)(xmm0), (0x0D)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0D));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3
), (__v16qi)(__m128i)(xmm2), (0x0D)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi
)(__m128i)(xmm3), (0x0D)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0D));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6
), (__v16qi)(__m128i)(xmm5), (0x0D)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi
)(__m128i)(xmm6), (0x0D)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0D));
})); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0D
) & 127) + 16 - 0x0D; tmp -= n; src = (const uint8_t *)src
+ tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0D
) { while (n >= 32 + 16 - 0x0D) { xmm0 = _mm_loadu_si128((
const __m128i *)((const uint8_t *)src - 0x0D + 0 * 16)); n -=
32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *
)src - 0x0D + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0D + 2 * 16)); src = (const uint8_t
*)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 *
16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi
)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0D)); })); _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi
)(__m128i)(xmm1), (0x0D)); })); dst = (uint8_t *)dst + 32; } tmp
= n; n = ((n - 16 + 0x0D) & 31) + 16 - 0x0D; tmp -= n; src
= (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }
); break; case 0x0E: __extension__ ({ size_t tmp; while (n >=
128 + 16 - 0x0E) { xmm0 = _mm_loadu_si128((const __m128i *)(
(const uint8_t *)src - 0x0E + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0E + 1 * 16)); xmm2
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E
+ 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0E + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0E + 4 * 16)); xmm5 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0E + 5 * 16)); xmm6
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E
+ 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0E + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0E + 8 * 16)); src = (const uint8_t
*)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0
* 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((
__v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0E)); }
)); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2
), (__v16qi)(__m128i)(xmm1), (0x0E)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi
)(__m128i)(xmm2), (0x0E)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x0E));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5
), (__v16qi)(__m128i)(xmm4), (0x0E)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi
)(__m128i)(xmm5), (0x0E)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x0E));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8
), (__v16qi)(__m128i)(xmm7), (0x0E)); })); dst = (uint8_t *)dst
+ 128; } tmp = n; n = ((n - 16 + 0x0E) & 127) + 16 - 0x0E
; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *
)dst + tmp; if (n >= 32 + 16 - 0x0E) { while (n >= 32 +
16 - 0x0E) { xmm0 = _mm_loadu_si128((const __m128i *)((const
uint8_t *)src - 0x0E + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0E + 1 * 16)); xmm2
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E
+ 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi
)(__m128i)(xmm0), (0x0E)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0E));
})); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0E
) & 31) + 16 - 0x0E; tmp -= n; src = (const uint8_t *)src
+ tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x0F: __extension__
({ size_t tmp; while (n >= 128 + 16 - 0x0F) { xmm0 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0F + 0 * 16)); n -=
128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0F + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0F + 2 * 16)); xmm3 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0F + 3 * 16)); xmm4
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F
+ 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0F + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0F + 6 * 16)); xmm7 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0F + 7 * 16)); xmm8
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F
+ 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi
)(__m128i)(xmm0), (0x0F)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0F));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3
), (__v16qi)(__m128i)(xmm2), (0x0F)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi
)(__m128i)(xmm3), (0x0F)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0F));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6
), (__v16qi)(__m128i)(xmm5), (0x0F)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi
)(__m128i)(xmm6), (0x0F)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0F));
})); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0F
) & 127) + 16 - 0x0F; tmp -= n; src = (const uint8_t *)src
+ tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0F
) { while (n >= 32 + 16 - 0x0F) { xmm0 = _mm_loadu_si128((
const __m128i *)((const uint8_t *)src - 0x0F + 0 * 16)); n -=
32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *
)src - 0x0F + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0F + 2 * 16)); src = (const uint8_t
*)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 *
16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi
)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0F)); })); _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi
)(__m128i)(xmm1), (0x0F)); })); dst = (uint8_t *)dst + 32; } tmp
= n; n = ((n - 16 + 0x0F) & 31) + 16 - 0x0F; tmp -= n; src
= (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }
); break; default:; } })
\
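For readability, the collapsed 128-byte loop can be restated as the following
intrinsic-level sketch. It is illustrative only and not part of the annotated
source: it assumes SSSE3 and uses _mm_alignr_epi8, the intrinsic form of the
__builtin_ia32_palignr128 calls above; the offset is fixed at an example value
of 5 because palignr requires a compile-time immediate, and the helper name
copy128_offset5 is hypothetical.

/* Illustrative sketch, simplified from the expansion above (offset fixed at 5).
 * src is assumed to have been advanced 5 bytes past the last aligned block, so
 * src - 5 is 16-byte readable; the real macro chooses the offset at run time
 * via the switch below. */
#include <stdint.h>
#include <stddef.h>
#include <tmmintrin.h>          /* SSSE3: _mm_alignr_epi8 (palignr) */

void copy128_offset5(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

	while (n >= 128 + 16 - 5) {
		/* Nine unaligned loads: one extra block so that every output
		 * block can be spliced from two adjacent input blocks. */
		xmm0 = _mm_loadu_si128((const __m128i *)(src - 5 + 0 * 16));
		xmm1 = _mm_loadu_si128((const __m128i *)(src - 5 + 1 * 16));
		xmm2 = _mm_loadu_si128((const __m128i *)(src - 5 + 2 * 16));
		xmm3 = _mm_loadu_si128((const __m128i *)(src - 5 + 3 * 16));
		xmm4 = _mm_loadu_si128((const __m128i *)(src - 5 + 4 * 16));
		xmm5 = _mm_loadu_si128((const __m128i *)(src - 5 + 5 * 16));
		xmm6 = _mm_loadu_si128((const __m128i *)(src - 5 + 6 * 16));
		xmm7 = _mm_loadu_si128((const __m128i *)(src - 5 + 7 * 16));
		xmm8 = _mm_loadu_si128((const __m128i *)(src - 5 + 8 * 16));
		src += 128;
		n -= 128;
		/* palignr(hi, lo, 5) drops the 5 stale leading bytes of lo and
		 * splices in the first 5 bytes of hi, yielding 16 aligned
		 * output bytes per store. */
		_mm_storeu_si128((__m128i *)(dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, 5));
		_mm_storeu_si128((__m128i *)(dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, 5));
		_mm_storeu_si128((__m128i *)(dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, 5));
		_mm_storeu_si128((__m128i *)(dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, 5));
		_mm_storeu_si128((__m128i *)(dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, 5));
		_mm_storeu_si128((__m128i *)(dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, 5));
		_mm_storeu_si128((__m128i *)(dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, 5));
		_mm_storeu_si128((__m128i *)(dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, 5));
		dst += 128;
	}
}

In the macro itself the offset is the switch selector (0x01 through 0x0F) and a
32-byte loop of the same shape consumes part of the remainder; the sketch omits
that tail handling.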
637__extension__ ({ \
638 switch (offset) { \
639 case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \
640 case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \
641 case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \
642 case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break; \
643 case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break; \
644 case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break; \
645 case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break; \
646 case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \
647 case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \
648 case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \
649 case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break; \
650 case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break; \
651 case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break; \
652 case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break; \
653 case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \
654 default:; \
655 } \
656})
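
Each case in the switch above differs only in the immediate byte offset handed to palignr; the expansion keeps nine xmm registers live and unrolls the copy 128 bytes at a time. A minimal standalone sketch of what one 16-byte step of that expansion does, using the documented SSSE3 intrinsic instead of the compiler builtin (copy16_shifted and the fixed OFF are illustrative names, not DPDK API):

#include <stdint.h>
#include <tmmintrin.h>  /* SSSE3: _mm_alignr_epi8, requires -mssse3 */

#define OFF 0x07  /* example immediate; the real code dispatches 0x01..0x0F */

/* Copy the 16 bytes at src to dst when src sits OFF bytes past a
 * 16-byte boundary. Note the first load reaches OFF bytes *before*
 * src, which is why the caller must already have copied those bytes. */
static inline void copy16_shifted(uint8_t *dst, const uint8_t *src)
{
	__m128i lo = _mm_loadu_si128((const __m128i *)(src - OFF));
	__m128i hi = _mm_loadu_si128((const __m128i *)(src - OFF + 16));
	/* palignr concatenates hi:lo and shifts right by OFF bytes,
	 * producing exactly the 16 source bytes starting at src. */
	_mm_storeu_si128((__m128i *)dst, _mm_alignr_epi8(hi, lo, OFF));
}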
657
658static inline void *
659rte_memcpy_generic(void *dst, const void *src, size_t n)
660{
661 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
662 uintptr_t dstu = (uintptr_t)dst;
663 uintptr_t srcu = (uintptr_t)src;
664 void *ret = dst;
665 size_t dstofss;
666 size_t srcofs;
667
668 /**
669 * Copy less than 16 bytes
670 */
671 if (n < 16) {
672 if (n & 0x01) {
673 *(uint8_t *)dstu = *(const uint8_t *)srcu;
674 srcu = (uintptr_t)((const uint8_t *)srcu + 1);
675 dstu = (uintptr_t)((uint8_t *)dstu + 1);
676 }
677 if (n & 0x02) {
678 *(uint16_t *)dstu = *(const uint16_t *)srcu;
679 srcu = (uintptr_t)((const uint16_t *)srcu + 1);
680 dstu = (uintptr_t)((uint16_t *)dstu + 1);
681 }
682 if (n & 0x04) {
683 *(uint32_t *)dstu = *(const uint32_t *)srcu;
684 srcu = (uintptr_t)((const uint32_t *)srcu + 1);
685 dstu = (uintptr_t)((uint32_t *)dstu + 1);
686 }
687 if (n & 0x08) {
688 *(uint64_t *)dstu = *(const uint64_t *)srcu;
689 }
690 return ret;
691 }
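
For the sub-16-byte path just above, the low bits of n directly select which scalar moves execute; a hypothetical n = 11 traces as follows:

/* n = 11 = 0b1011:
 *   n & 0x01 -> copy 1 byte,  advance srcu/dstu by 1
 *   n & 0x02 -> copy 2 bytes, advance srcu/dstu by 2
 *   n & 0x04 -> skipped (bit is clear)
 *   n & 0x08 -> copy the final 8 bytes (no further advance needed)
 * Total: 1 + 2 + 8 = 11 bytes copied with at most four branches. */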
692
693 /**
694 * Fast way when copy size doesn't exceed 512 bytes
695 */
696 if (n <= 32) {
697 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
698 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
699 return ret;
700 }
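
The 17-32 byte case above uses an overlap trick: one 16-byte move from the start and one ending exactly at dst + n, letting the two stores overlap rather than branching on the exact size. A self-contained sketch of the same idea (copy_17_to_32 is an illustrative name, not a DPDK function):

#include <stdint.h>
#include <stddef.h>
#include <emmintrin.h>  /* SSE2 */

/* Copy n bytes, 16 < n <= 32, with two possibly overlapping 16-byte moves. */
static inline void copy_17_to_32(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m128i head = _mm_loadu_si128((const __m128i *)src);
	__m128i tail = _mm_loadu_si128((const __m128i *)(src + n - 16));
	_mm_storeu_si128((__m128i *)dst, head);
	/* overlaps the head store whenever n < 32 */
	_mm_storeu_si128((__m128i *)(dst + n - 16), tail);
}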
701 if (n <= 48) {
702 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
703 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
704 return ret;
705 }
706 if (n <= 64) {
707 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
708 rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
709 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
710 return ret;
711 }
712 if (n <= 128) {
713 goto COPY_BLOCK_128_BACK15;
714 }
715 if (n <= 512) {
716 if (n >= 256) {
717 n -= 256;
718 rte_mov128((uint8_t *)dst, (const uint8_t *)src);
719 rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
720 src = (const uint8_t *)src + 256;
721 dst = (uint8_t *)dst + 256;
722 }
723COPY_BLOCK_255_BACK15:
724 if (n >= 128) {
725 n -= 128;
726 rte_mov128((uint8_t *)dst, (const uint8_t *)src);
727 src = (const uint8_t *)src + 128;
728 dst = (uint8_t *)dst + 128;
729 }
730COPY_BLOCK_128_BACK15:
731 if (n >= 64) {
732 n -= 64;
733 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
734 src = (const uint8_t *)src + 64;
735 dst = (uint8_t *)dst + 64;
736 }
737COPY_BLOCK_64_BACK15:
738 if (n >= 32) {
739 n -= 32;
740 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
741 src = (const uint8_t *)src + 32;
742 dst = (uint8_t *)dst + 32;
743 }
744 if (n > 16) {
745 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
746 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
747 return ret;
748 }
749 if (n > 0) {
750 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
751 }
752 return ret;
753 }
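
To make the label fall-through above concrete, a hypothetical n = 300 entering the n <= 512 branch proceeds as follows:

/* n = 300: n >= 256, so 256 bytes are copied and n becomes 44.
 * COPY_BLOCK_255_BACK15: 44 < 128, nothing to do.
 * COPY_BLOCK_128_BACK15: 44 < 64,  nothing to do.
 * COPY_BLOCK_64_BACK15:  44 >= 32, copy 32 bytes, n becomes 12.
 * 12 is not > 16 but is > 0, so the final rte_mov16 writes the 16 bytes
 * ending at dst + 12, overlapping 4 bytes that were already copied. */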
754
755 /**
756 * Make store aligned when copy size exceeds 512 bytes,
757 * and make sure the first 15 bytes are copied, because
758 * unaligned copy functions require up to 15 bytes
759 * backwards access.
760 */
761 dstofss = (uintptr_t)dst & 0x0F;
762 if (dstofss > 0) {
763 dstofss = 16 - dstofss + 16;
764 n -= dstofss;
765 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
766 src = (const uint8_t *)src + dstofss;
767 dst = (uint8_t *)dst + dstofss;
768 }
769 srcofs = ((uintptr_t)src & 0x0F);
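
The fix-up above always copies 32 bytes, then advances by somewhere between 17 and 31 bytes, so dst lands on a 16-byte boundary while everything behind the new dst has already been written. A worked example with a hypothetical address:

/* Suppose ((uintptr_t)dst & 0x0F) == 5:
 *   dstofss = 16 - 5 + 16 = 27
 *   rte_mov32 writes dst[0..31]; advancing src and dst by 27 leaves dst
 *   16-byte aligned, with the 27 bytes behind it already copied. That
 *   slack matters because the unaligned (palignr) path below reads up to
 *   15 bytes before its nominal src position. */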
770
771 /**
772 * For aligned copy
773 */
774 if (srcofs == 0) {
775 /**
776 * Copy 256-byte blocks
777 */
778 for (; n >= 256; n -= 256) {
779 rte_mov256((uint8_t *)dst, (const uint8_t *)src);
780 dst = (uint8_t *)dst + 256;
781 src = (const uint8_t *)src + 256;
782 }
783
784 /**
785 * Copy whatever left
786 */
787 goto COPY_BLOCK_255_BACK15;
788 }
789
790 /**
791 * For copy with unaligned load
792 */
793 MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi
)(__m128i)(xmm0), (0x0A)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0A));
})); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0A
) & 31) + 16 - 0x0A; tmp -= n; src = (const uint8_t *)src
+ tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x0B: __extension__
({ size_t tmp; while (n >= 128 + 16 - 0x0B) { xmm0 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0B + 0 * 16)); n -=
128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0B + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0B + 2 * 16)); xmm3 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0B + 3 * 16)); xmm4
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B
+ 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0B + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0B + 6 * 16)); xmm7 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0B + 7 * 16)); xmm8
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0B
+ 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi
)(__m128i)(xmm0), (0x0B)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0B));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3
), (__v16qi)(__m128i)(xmm2), (0x0B)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi
)(__m128i)(xmm3), (0x0B)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0B));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6
), (__v16qi)(__m128i)(xmm5), (0x0B)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi
)(__m128i)(xmm6), (0x0B)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0B));
})); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0B
) & 127) + 16 - 0x0B; tmp -= n; src = (const uint8_t *)src
+ tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0B
) { while (n >= 32 + 16 - 0x0B) { xmm0 = _mm_loadu_si128((
const __m128i *)((const uint8_t *)src - 0x0B + 0 * 16)); n -=
32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *
)src - 0x0B + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0B + 2 * 16)); src = (const uint8_t
*)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 *
16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi
)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0B)); })); _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi
)(__m128i)(xmm1), (0x0B)); })); dst = (uint8_t *)dst + 32; } tmp
= n; n = ((n - 16 + 0x0B) & 31) + 16 - 0x0B; tmp -= n; src
= (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }
); break; case 0x0C: __extension__ ({ size_t tmp; while (n >=
128 + 16 - 0x0C) { xmm0 = _mm_loadu_si128((const __m128i *)(
(const uint8_t *)src - 0x0C + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0C + 1 * 16)); xmm2
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C
+ 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0C + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0C + 4 * 16)); xmm5 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0C + 5 * 16)); xmm6
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C
+ 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0C + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0C + 8 * 16)); src = (const uint8_t
*)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0
* 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((
__v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0C)); }
)); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2
), (__v16qi)(__m128i)(xmm1), (0x0C)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi
)(__m128i)(xmm2), (0x0C)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x0C));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5
), (__v16qi)(__m128i)(xmm4), (0x0C)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi
)(__m128i)(xmm5), (0x0C)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x0C));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8
), (__v16qi)(__m128i)(xmm7), (0x0C)); })); dst = (uint8_t *)dst
+ 128; } tmp = n; n = ((n - 16 + 0x0C) & 127) + 16 - 0x0C
; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *
)dst + tmp; if (n >= 32 + 16 - 0x0C) { while (n >= 32 +
16 - 0x0C) { xmm0 = _mm_loadu_si128((const __m128i *)((const
uint8_t *)src - 0x0C + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0C + 1 * 16)); xmm2
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0C
+ 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi
)(__m128i)(xmm0), (0x0C)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0C));
})); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0C
) & 31) + 16 - 0x0C; tmp -= n; src = (const uint8_t *)src
+ tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x0D: __extension__
({ size_t tmp; while (n >= 128 + 16 - 0x0D) { xmm0 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0D + 0 * 16)); n -=
128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0D + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0D + 2 * 16)); xmm3 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0D + 3 * 16)); xmm4
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D
+ 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0D + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0D + 6 * 16)); xmm7 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0D + 7 * 16)); xmm8
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0D
+ 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi
)(__m128i)(xmm0), (0x0D)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0D));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3
), (__v16qi)(__m128i)(xmm2), (0x0D)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi
)(__m128i)(xmm3), (0x0D)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0D));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6
), (__v16qi)(__m128i)(xmm5), (0x0D)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi
)(__m128i)(xmm6), (0x0D)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0D));
})); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0D
) & 127) + 16 - 0x0D; tmp -= n; src = (const uint8_t *)src
+ tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0D
) { while (n >= 32 + 16 - 0x0D) { xmm0 = _mm_loadu_si128((
const __m128i *)((const uint8_t *)src - 0x0D + 0 * 16)); n -=
32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *
)src - 0x0D + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0D + 2 * 16)); src = (const uint8_t
*)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 *
16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi
)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0D)); })); _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi
)(__m128i)(xmm1), (0x0D)); })); dst = (uint8_t *)dst + 32; } tmp
= n; n = ((n - 16 + 0x0D) & 31) + 16 - 0x0D; tmp -= n; src
= (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }
); break; case 0x0E: __extension__ ({ size_t tmp; while (n >=
128 + 16 - 0x0E) { xmm0 = _mm_loadu_si128((const __m128i *)(
(const uint8_t *)src - 0x0E + 0 * 16)); n -= 128; xmm1 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0E + 1 * 16)); xmm2
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E
+ 2 * 16)); xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0E + 3 * 16)); xmm4 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0E + 4 * 16)); xmm5 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0E + 5 * 16)); xmm6
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E
+ 6 * 16)); xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0E + 7 * 16)); xmm8 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0E + 8 * 16)); src = (const uint8_t
*)src + 128; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0
* 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((
__v16qi)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0E)); }
)); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2
), (__v16qi)(__m128i)(xmm1), (0x0E)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 2 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3), (__v16qi
)(__m128i)(xmm2), (0x0E)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm4), (__v16qi)(__m128i)(xmm3), (0x0E));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm5
), (__v16qi)(__m128i)(xmm4), (0x0E)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 5 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6), (__v16qi
)(__m128i)(xmm5), (0x0E)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm7), (__v16qi)(__m128i)(xmm6), (0x0E));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm8
), (__v16qi)(__m128i)(xmm7), (0x0E)); })); dst = (uint8_t *)dst
+ 128; } tmp = n; n = ((n - 16 + 0x0E) & 127) + 16 - 0x0E
; tmp -= n; src = (const uint8_t *)src + tmp; dst = (uint8_t *
)dst + tmp; if (n >= 32 + 16 - 0x0E) { while (n >= 32 +
16 - 0x0E) { xmm0 = _mm_loadu_si128((const __m128i *)((const
uint8_t *)src - 0x0E + 0 * 16)); n -= 32; xmm1 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0E + 1 * 16)); xmm2
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0E
+ 2 * 16)); src = (const uint8_t *)src + 32; _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi
)(__m128i)(xmm0), (0x0E)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0E));
})); dst = (uint8_t *)dst + 32; } tmp = n; n = ((n - 16 + 0x0E
) & 31) + 16 - 0x0E; tmp -= n; src = (const uint8_t *)src
+ tmp; dst = (uint8_t *)dst + tmp; } }); break; case 0x0F: __extension__
({ size_t tmp; while (n >= 128 + 16 - 0x0F) { xmm0 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0F + 0 * 16)); n -=
128; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0F + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0F + 2 * 16)); xmm3 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0F + 3 * 16)); xmm4
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F
+ 4 * 16)); xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t
*)src - 0x0F + 5 * 16)); xmm6 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0F + 6 * 16)); xmm7 = _mm_loadu_si128
((const __m128i *)((const uint8_t *)src - 0x0F + 7 * 16)); xmm8
= _mm_loadu_si128((const __m128i *)((const uint8_t *)src - 0x0F
+ 8 * 16)); src = (const uint8_t *)src + 128; _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 0 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm1), (__v16qi
)(__m128i)(xmm0), (0x0F)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm2), (__v16qi)(__m128i)(xmm1), (0x0F));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm3
), (__v16qi)(__m128i)(xmm2), (0x0F)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 3 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm4), (__v16qi
)(__m128i)(xmm3), (0x0F)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 4 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm5), (__v16qi)(__m128i)(xmm4), (0x0F));
})); _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), __extension__
({ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm6
), (__v16qi)(__m128i)(xmm5), (0x0F)); })); _mm_storeu_si128((
__m128i *)((uint8_t *)dst + 6 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm7), (__v16qi
)(__m128i)(xmm6), (0x0F)); })); _mm_storeu_si128((__m128i *)(
(uint8_t *)dst + 7 * 16), __extension__ ({ (__m128i)__builtin_ia32_palignr128
((__v16qi)(__m128i)(xmm8), (__v16qi)(__m128i)(xmm7), (0x0F));
})); dst = (uint8_t *)dst + 128; } tmp = n; n = ((n - 16 + 0x0F
) & 127) + 16 - 0x0F; tmp -= n; src = (const uint8_t *)src
+ tmp; dst = (uint8_t *)dst + tmp; if (n >= 32 + 16 - 0x0F
) { while (n >= 32 + 16 - 0x0F) { xmm0 = _mm_loadu_si128((
const __m128i *)((const uint8_t *)src - 0x0F + 0 * 16)); n -=
32; xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *
)src - 0x0F + 1 * 16)); xmm2 = _mm_loadu_si128((const __m128i
*)((const uint8_t *)src - 0x0F + 2 * 16)); src = (const uint8_t
*)src + 32; _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 *
16), __extension__ ({ (__m128i)__builtin_ia32_palignr128((__v16qi
)(__m128i)(xmm1), (__v16qi)(__m128i)(xmm0), (0x0F)); })); _mm_storeu_si128
((__m128i *)((uint8_t *)dst + 1 * 16), __extension__ ({ (__m128i
)__builtin_ia32_palignr128((__v16qi)(__m128i)(xmm2), (__v16qi
)(__m128i)(xmm1), (0x0F)); })); dst = (uint8_t *)dst + 32; } tmp
= n; n = ((n - 16 + 0x0F) & 31) + 16 - 0x0F; tmp -= n; src
= (const uint8_t *)src + tmp; dst = (uint8_t *)dst + tmp; } }
); break; default:; } })
;
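
As a minimal sketch of the shift-and-merge step every case above performs (not part of the report; the offset is fixed at 8 purely for illustration), a single 16-byte copy looks like this. Like the expanded DPDK code, it assumes the bytes immediately before src are readable:

    #include <stdint.h>
    #include <tmmintrin.h>          /* SSSE3: _mm_alignr_epi8 (PALIGNR) */

    /* Copy 16 bytes from a source that sits 8 bytes past a 16-byte boundary:
     * load the two surrounding 16-byte blocks and recombine them with PALIGNR,
     * the per-block operation used in each case of the switch above. */
    static inline void
    copy16_srcoff8(uint8_t *dst, const uint8_t *src)
    {
            __m128i lo = _mm_loadu_si128((const __m128i *)(src - 8 + 0 * 16));
            __m128i hi = _mm_loadu_si128((const __m128i *)(src - 8 + 1 * 16));

            _mm_storeu_si128((__m128i *)dst, _mm_alignr_epi8(hi, lo, 8));
    }
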
794
795 /**
796 * Copy whatever is left
797 */
798 goto COPY_BLOCK_64_BACK15;
799}
800
801#endif /* RTE_MACHINE_CPUFLAG */
802
803static inline void *
804rte_memcpy_aligned(void *dst, const void *src, size_t n)
805{
806 void *ret = dst;
807
808 /* Copy size <= 16 bytes */
809 if (n < 16) {
4
Taking true branch
810 if (n & 0x01) {
5
Taking false branch
811 *(uint8_t *)dst = *(const uint8_t *)src;
812 src = (const uint8_t *)src + 1;
813 dst = (uint8_t *)dst + 1;
814 }
815 if (n & 0x02) {
6
Taking true branch
816 *(uint16_t *)dst = *(const uint16_t *)src;
817 src = (const uint16_t *)src + 1;
818 dst = (uint16_t *)dst + 1;
819 }
820 if (n & 0x04) {
7
Taking true branch
821 *(uint32_t *)dst = *(const uint32_t *)src;
8
Assigned value is garbage or undefined
822 src = (const uint32_t *)src + 1;
823 dst = (uint32_t *)dst + 1;
824 }
825 if (n & 0x08)
826 *(uint64_t *)dst = *(const uint64_t *)src;
827
828 return ret;
829 }
830
831 /* Copy 16 <= size <= 32 bytes */
832 if (n <= 32) {
833 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
834 rte_mov16((uint8_t *)dst - 16 + n,
835 (const uint8_t *)src - 16 + n);
836
837 return ret;
838 }
839
840 /* Copy 32 < size <= 64 bytes */
841 if (n <= 64) {
842 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
843 rte_mov32((uint8_t *)dst - 32 + n,
844 (const uint8_t *)src - 32 + n);
845
846 return ret;
847 }
848
849 /* Copy 64 bytes blocks */
850 for (; n >= 64; n -= 64) {
851 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
852 dst = (uint8_t *)dst + 64;
853 src = (const uint8_t *)src + 64;
854 }
855
856 /* Copy whatever is left */
857 rte_mov64((uint8_t *)dst - 64 + n,
858 (const uint8_t *)src - 64 + n);
859
860 return ret;
861}
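
The path annotated above enters the n < 16 branch (step 4), skips the 1-byte copy (step 5), then takes both the 2-byte and the 4-byte copies (steps 6-8), which constrains n to 6 or 14; the warning at line 821 means the analyzer considers the four source bytes read there uninitialized in the calling context. A hypothetical reproducer, not taken from the report (the real caller is elsewhere in the analyzed Ceph/DPDK code), that drives this exact path:

    #include <stdint.h>
    #include <rte_memcpy.h>         /* assumes DPDK include paths are configured */

    /* Illustration only: with both buffers 16-byte aligned the call takes
     * rte_memcpy_aligned(), and with n == 6 the small-copy branch does one
     * uint16_t copy followed by one uint32_t copy.  The uint32_t load at
     * line 821 then reads buf[2..5], which were never written. */
    void
    garbage_copy_example(void)
    {
            uint8_t dst[16] __attribute__((aligned(16)));
            uint8_t buf[16] __attribute__((aligned(16)));

            buf[0] = 0x02;
            buf[1] = 0x00;          /* buf[2..5] intentionally left uninitialized */
            rte_memcpy(dst, buf, 6);
            (void)dst;
    }
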
862
863static inline void *
864rte_memcpy(void *dst, const void *src, size_t n)
865{
866 if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK0x0F))
2
Taking true branch
867 return rte_memcpy_aligned(dst, src, n);
3
Calling 'rte_memcpy_aligned'
868 else
869 return rte_memcpy_generic(dst, src, n);
870}
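
For reference, the dispatch above takes the aligned variant only when neither pointer has any of the low four address bits set (ALIGNMENT_MASK expands to 0x0F on this build, as shown). A short illustration, not part of the report:

    #include <stdint.h>
    #include <rte_memcpy.h>         /* assumes DPDK include paths are configured */

    static uint8_t a[64] __attribute__((aligned(16)));
    static uint8_t b[64] __attribute__((aligned(16)));

    /* Illustration only: which variant rte_memcpy() selects for a few inputs. */
    void
    dispatch_example(void)
    {
            rte_memcpy(a, b, 32);           /* both 16-byte aligned -> rte_memcpy_aligned() */
            rte_memcpy(a + 1, b, 32);       /* dst misaligned by 1  -> rte_memcpy_generic() */
            rte_memcpy(a, b + 8, 32);       /* src misaligned by 8  -> rte_memcpy_generic() */
    }
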
871
872#ifdef __cplusplus
873}
874#endif
875
876#endif /* _RTE_MEMCPY_X86_64_H_ */