backport-Kunpeng-patches.patch 30.83 KB
wangbin committed on 2020-04-16 21:06: backport Kunpeng patches
From 0dfa5db2106d75db595e83f064352fb89d92986e Mon Sep 17 00:00:00 2001
From: wangbin224 <wangbin224@huawei.com>
Date: Sat, 28 Mar 2020 19:14:41 +0800
Subject: [PATCH] glibc: backport Kunpeng patches
backport Kunpeng patches
Signed-off-by: wangbin224 <wangbin224@huawei.com>
---
manual/tunables.texi | 2 +-
sysdeps/aarch64/memcmp.S | 4 +-
sysdeps/aarch64/memrchr.S | 15 +-
sysdeps/aarch64/multiarch/Makefile | 2 +-
sysdeps/aarch64/multiarch/ifunc-impl-list.c | 54 +-
sysdeps/aarch64/multiarch/memcpy.c | 9 +-
sysdeps/aarch64/multiarch/memcpy_kunpeng.S | 576 ------------------
sysdeps/aarch64/multiarch/memmove.c | 11 +-
sysdeps/aarch64/multiarch/memset.c | 14 +-
sysdeps/aarch64/multiarch/memset_kunpeng.S | 58 +-
sysdeps/aarch64/strcpy.S | 6 +-
sysdeps/aarch64/strnlen.S | 4 +-
.../unix/sysv/linux/aarch64/cpu-features.c | 4 +-
.../unix/sysv/linux/aarch64/cpu-features.h | 7 +-
14 files changed, 86 insertions(+), 680 deletions(-)
delete mode 100755 sysdeps/aarch64/multiarch/memcpy_kunpeng.S
diff --git a/manual/tunables.texi b/manual/tunables.texi
index bb4819bd..124b39b6 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -333,7 +333,7 @@ This tunable is specific to powerpc, powerpc64 and powerpc64le.
The @code{glibc.tune.cpu=xxx} tunable allows the user to tell @theglibc{} to
assume that the CPU is @code{xxx} where xxx may have one of these values:
@code{generic}, @code{falkor}, @code{thunderxt88}, @code{thunderx2t99},
-@code{thunderx2t99p1}.
+@code{thunderx2t99p1}, @code{kunpeng}.
This tunable is specific to aarch64.
@end deftp
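
A hedged usage sketch, not part of the patch: the glibc.tune.cpu tunable documented above is read from the GLIBC_TUNABLES environment variable when a process starts, so a launcher could select the new kunpeng value as below. The "./myapp" binary name is a placeholder.

/* Sketch: run a program with glibc.tune.cpu=kunpeng.  "./myapp" is hypothetical.  */
#include <stdlib.h>
#include <unistd.h>

int
main (void)
{
  char *const argv[] = { (char *) "./myapp", NULL };
  /* glibc parses GLIBC_TUNABLES from the environment of the exec'ed
     process during its startup.  */
  setenv ("GLIBC_TUNABLES", "glibc.tune.cpu=kunpeng", 1);
  execv ("./myapp", argv);
  return 1;   /* only reached if execv failed */
}
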
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index 04129d83..a2138616 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -1,6 +1,6 @@
/* memcmp - compare memory
- Copyright (C) 2013-2019 Free Software Foundation, Inc.
+ Copyright (C) 2013-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,7 +16,7 @@
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
- <https://www.gnu.org/licenses/>. */
+ <http://www.gnu.org/licenses/>. */
#include <sysdep.h>
diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
index 9095304b..0565168a 100644
--- a/sysdeps/aarch64/memrchr.S
+++ b/sysdeps/aarch64/memrchr.S
@@ -16,8 +16,8 @@
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
- <http://www.gnu.org/licenses/>. */
-
+ <https://www.gnu.org/licenses/>. */
+
#include <sysdep.h>
/* Assumptions:
@@ -61,7 +61,7 @@
* things occur in the original string, counting trailing zeros allows to
* identify exactly which byte has matched.
*/
-
+
ENTRY (__memrchr)
/* Do not dereference srcin if no bytes to compare. */
cbz cntin, L(zero_length)
@@ -101,7 +101,7 @@ ENTRY (__memrchr)
addp vend.16b, vend.16b, vend.16b /* 128->64 */
mov synd, vend.2d[0]
/* Clear the (32-soff)*2 upper bits */
- lsl tmp, soff, #1
+ lsl tmp, soff, #1
lsl synd, synd, tmp
lsr synd, synd, tmp
/* The first block can also be the last */
@@ -135,16 +135,16 @@ L(end):
b.hi L(tail)
L(masklast):
- /* Clear the (32 - ((cntrem + (32-soff)) % 32)) * 2 lower bits */
+ /* Clear the (32 - ((cntrem + (32-soff)) % 32)) * 2 lower bits */
add tmp, cntrem, soff
and tmp, tmp, #31
sub tmp, tmp, #32
- neg tmp, tmp, lsl #1
+ neg tmp, tmp, lsl #1
lsr synd, synd, tmp
lsl synd, synd, tmp
L(tail):
- /* Compensate the last post-increment*/
+ /* Compensate the last post-increment*/
add seek_dst, seek_dst, #32
/* Check that we have found a character */
cmp synd, #0
@@ -163,4 +163,3 @@ L(zero_length):
END (__memrchr)
weak_alias (__memrchr, memrchr)
libc_hidden_builtin_def (memrchr)
-
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 90529d40..722ed824 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,4 +1,4 @@
ifeq ($(subdir),string)
-sysdep_routines += memcpy_kunpeng memcpy_generic memcpy_thunderx memcpy_thunderx2 \
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
memcpy_falkor memmove_falkor memset_generic memset_falkor memset_kunpeng
endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index bef9b06d..0026dbba 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -1,5 +1,5 @@
/* Enumerate available IFUNC implementations of a function. AARCH64 version.
- Copyright (C) 2017-2019 Free Software Foundation, Inc.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -25,36 +25,34 @@
#include <stdio.h>
/* Maximum number of IFUNC implementations. */
-#define MAX_IFUNC 5
+#define MAX_IFUNC 4
size_t
__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
size_t max)
{
- assert(max >= MAX_IFUNC);
-
- size_t i = 0;
-
- INIT_ARCH();
-
- /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c. */
- IFUNC_IMPL(i, name, memcpy,
- IFUNC_IMPL_ADD(array, i, memcpy, 1, __memcpy_thunderx)
- IFUNC_IMPL_ADD(array, i, memcpy, 1, __memcpy_thunderx2)
- IFUNC_IMPL_ADD(array, i, memcpy, 1, __memcpy_falkor)
- IFUNC_IMPL_ADD(array, i, memcpy, 1, __memcpy_kunpeng)
- IFUNC_IMPL_ADD(array, i, memcpy, 1, __memcpy_generic))
- IFUNC_IMPL(i, name, memmove,
- IFUNC_IMPL_ADD(array, i, memmove, 1, __memmove_thunderx)
- IFUNC_IMPL_ADD(array, i, memmove, 1, __memmove_falkor)
- IFUNC_IMPL_ADD(array, i, memmove, 1, __memmove_kunpeng)
- IFUNC_IMPL_ADD(array, i, memmove, 1, __memmove_generic))
- IFUNC_IMPL(i, name, memset,
- /* Enable this on non-falkor processors too so that other cores
- can do a comparative analysis with __memset_generic. */
- IFUNC_IMPL_ADD(array, i, memset, (zva_size == 64), __memset_falkor)
- IFUNC_IMPL_ADD(array, i, memset, 1, __memset_generic)
- IFUNC_IMPL_ADD(array, i, memset, 1, __memset_kunpeng))
-
- return i;
+ assert (max >= MAX_IFUNC);
+
+ size_t i = 0;
+
+ INIT_ARCH ();
+
+ /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c. */
+ IFUNC_IMPL (i, name, memcpy,
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
+ IFUNC_IMPL (i, name, memmove,
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+ IFUNC_IMPL (i, name, memset,
+ /* Enable this on non-falkor processors too so that other cores
+ can do a comparative analysis with __memset_generic. */
+ IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
+
+ return i;
}
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 150e1ca9..2d358a83 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -1,5 +1,5 @@
/* Multiple versions of memcpy. AARCH64 version.
- Copyright (C) 2017-2019 Free Software Foundation, Inc.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -32,14 +32,11 @@ extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
-extern __typeof (__redirect_memcpy) __memcpy_kunpeng attribute_hidden;
libc_ifunc (__libc_memcpy,
- IS_KUNPENG920(midr)
- ?__memcpy_kunpeng
- : (IS_THUNDERX (midr)
+ (IS_THUNDERX (midr)
? __memcpy_thunderx
- : (IS_FALKOR (midr) || IS_PHECDA (midr)
+ : (IS_FALKOR (midr) || IS_PHECDA (midr) || IS_KUNPENG920 (midr)
? __memcpy_falkor
: (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
? __memcpy_thunderx2
diff --git a/sysdeps/aarch64/multiarch/memcpy_kunpeng.S b/sysdeps/aarch64/multiarch/memcpy_kunpeng.S
deleted file mode 100755
index 2102478a..00000000
--- a/sysdeps/aarch64/multiarch/memcpy_kunpeng.S
+++ /dev/null
@@ -1,576 +0,0 @@
-/* A Kunpeng Optimized memcpy implementation for AARCH64.
- Copyright (C) 2018-2019 Free Software Foundation, Inc.
-
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, unaligned accesses.
- *
- */
-
-#define dstin x0
-#define src x1
-#define count x2
-#define dst x3
-#define srcend x4
-#define dstend x5
-#define tmp2 x6
-#define tmp3 x7
-#define tmp3w w7
-#define A_l x6
-#define A_lw w6
-#define A_h x7
-#define A_hw w7
-#define B_l x8
-#define B_lw w8
-#define B_h x9
-#define C_l x10
-#define C_h x11
-#define D_l x12
-#define D_h x13
-#define E_l src
-#define E_h count
-#define F_l srcend
-#define F_h dst
-#define G_l count
-#define G_h dst
-#define tmp1 x14
-
-#define A_q q0
-#define B_q q1
-#define C_q q2
-#define D_q q3
-#define E_q q4
-#define F_q q5
-#define G_q q6
-#define H_q q7
-#define I_q q16
-#define J_q q17
-
-#define A_v v0
-#define B_v v1
-#define C_v v2
-#define D_v v3
-#define E_v v4
-#define F_v v5
-#define G_v v6
-#define H_v v7
-#define I_v v16
-#define J_v v17
-
-#ifndef MEMMOVE
-# define MEMMOVE memmove
-#endif
-#ifndef MEMCPY
-# define MEMCPY memcpy
-#endif
-
-#if IS_IN (libc)
-
-#undef MEMCPY
-#define MEMCPY __memcpy_kunpeng
-#undef MEMMOVE
-#define MEMMOVE __memmove_kunpeng
-
-
-/* Overlapping large forward memmoves use a loop that copies backwards.
- Otherwise memcpy is used. Small moves branch to memcopy16 directly.
- The longer memcpy cases fall through to the memcpy head.
-*/
-
-ENTRY_ALIGN (MEMMOVE, 6)
-
- DELOUSE (0)
- DELOUSE (1)
- DELOUSE (2)
-
- sub tmp1, dstin, src
- cmp count, 512
- ccmp tmp1, count, 2, hi
- b.lo L(move_long)
- cmp count, 96
- ccmp tmp1, count, 2, hi
- b.lo L(move_middle)
-
-END (MEMMOVE)
-libc_hidden_builtin_def (MEMMOVE)
-
-
-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
- medium copies of 17..96 bytes which are fully unrolled. Large copies
- of more than 96 bytes align the destination and use load-and-merge
- approach in the case src and dst addresses are unaligned not evenly,
- so that, actual loads and stores are always aligned.
- Large copies use the loops processing 64 bytes per iteration for
- unaligned case and 128 bytes per iteration for aligned ones.
-*/
-
-#define MEMCPY_PREFETCH_LDR 640
-
- .p2align 4
-ENTRY (MEMCPY)
-
- DELOUSE (0)
- DELOUSE (1)
- DELOUSE (2)
-
- add srcend, src, count
- cmp count, 16
- b.ls L(memcopy16)
- add dstend, dstin, count
- cmp count, 96
- b.hi L(memcopy_long)
-
- /* Medium copies: 17..96 bytes. */
- ldr A_q, [src], #16
- and tmp1, src, 15
- ldr E_q, [srcend, -16]
- cmp count, 64
- b.gt L(memcpy_copy96)
- cmp count, 48
- b.le L(bytes_17_to_48)
- /* 49..64 bytes */
- ldp B_q, C_q, [src]
- str E_q, [dstend, -16]
- stp A_q, B_q, [dstin]
- str C_q, [dstin, 32]
- ret
-
-L(bytes_17_to_48):
- /* 17..48 bytes*/
- cmp count, 32
- b.gt L(bytes_32_to_48)
- /* 17..32 bytes*/
- str A_q, [dstin]
- str E_q, [dstend, -16]
- ret
-
-L(bytes_32_to_48):
- /* 32..48 */
- ldr B_q, [src]
- str A_q, [dstin]
- str E_q, [dstend, -16]
- str B_q, [dstin, 16]
- ret
-
- .p2align 4
- /* Small copies: 0..16 bytes. */
-L(memcopy16):
- cmp count, 8
- b.lo L(bytes_0_to_8)
- ldr A_l, [src]
- ldr A_h, [srcend, -8]
- add dstend, dstin, count
- str A_l, [dstin]
- str A_h, [dstend, -8]
- ret
- .p2align 4
-
-L(bytes_0_to_8):
- tbz count, 2, L(bytes_0_to_3)
- ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
- add dstend, dstin, count
- str A_lw, [dstin]
- str A_hw, [dstend, -4]
- ret
-
- /* Copy 0..3 bytes. Use a branchless sequence that copies the same
- byte 3 times if count==1, or the 2nd byte twice if count==2. */
-L(bytes_0_to_3):
- cbz count, 1f
- lsr tmp1, count, 1
- ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
- add dstend, dstin, count
- ldrb B_lw, [src, tmp1]
- strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
- strb A_lw, [dstin]
-1:
- ret
-
- .p2align 4
-
-L(memcpy_copy96):
- /* Copying 65..96 bytes. A_q (first 16 bytes) and
- E_q(last 16 bytes) are already loaded. The size
- is large enough to benefit from aligned loads */
- bic src, src, 15
- ldp B_q, C_q, [src]
- /* Loaded 64 bytes, second 16-bytes chunk can be
- overlapping with the first chunk by tmp1 bytes.
- Stored 16 bytes. */
- sub dst, dstin, tmp1
- add count, count, tmp1
- /* The range of count being [65..96] becomes [65..111]
- after tmp [0..15] gets added to it,
- count now is <bytes-left-to-load>+48 */
- cmp count, 80
- b.gt L(copy96_medium)
- ldr D_q, [src, 32]
- stp B_q, C_q, [dst, 16]
- str D_q, [dst, 48]
- str A_q, [dstin]
- str E_q, [dstend, -16]
- ret
-
- .p2align 4
-L(copy96_medium):
- ldp D_q, G_q, [src, 32]
- cmp count, 96
- b.gt L(copy96_large)
- stp B_q, C_q, [dst, 16]
- stp D_q, G_q, [dst, 48]
- str A_q, [dstin]
- str E_q, [dstend, -16]
- ret
-
-L(copy96_large):
- ldr F_q, [src, 64]
- str B_q, [dst, 16]
- stp C_q, D_q, [dst, 32]
- stp G_q, F_q, [dst, 64]
- str A_q, [dstin]
- str E_q, [dstend, -16]
- ret
-
- .p2align 4
-L(memcopy_long):
- cmp count, 2048
- b.ls L(copy2048_large)
- ldr A_q, [src], #16
- and tmp1, src, 15
- bic src, src, 15
- ldp B_q, C_q, [src], #32
- sub dst, dstin, tmp1
- add count, count, tmp1
- add dst, dst, 16
- and tmp1, dst, 15
- ldp D_q, E_q, [src], #32
- str A_q, [dstin]
-
- /* Already loaded 64+16 bytes. Check if at
- least 64 more bytes left */
- subs count, count, 64+64+16
- b.lt L(loop128_exit0)
- cmp count, MEMCPY_PREFETCH_LDR + 64 + 32
- b.lt L(loop128)
- cbnz tmp1, L(dst_unaligned)
- sub count, count, MEMCPY_PREFETCH_LDR + 64 + 32
-
- .p2align 4
-
-L(loop128_prefetch):
- prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
- ldp F_q, G_q, [src], #32
- stp B_q, C_q, [dst], #32
- ldp H_q, I_q, [src], #32
- prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
- ldp B_q, C_q, [src], #32
- stp D_q, E_q, [dst], #32
- ldp D_q, E_q, [src], #32
- stp F_q, G_q, [dst], #32
- stp H_q, I_q, [dst], #32
- subs count, count, 128
- b.ge L(loop128_prefetch)
-
- add count, count, MEMCPY_PREFETCH_LDR + 64 + 32
- .p2align 4
-L(loop128):
- ldp F_q, G_q, [src], #32
- ldp H_q, I_q, [src], #32
- stp B_q, C_q, [dst], #32
- stp D_q, E_q, [dst], #32
- subs count, count, 64
- b.lt L(loop128_exit1)
- ldp B_q, C_q, [src], #32
- ldp D_q, E_q, [src], #32
- stp F_q, G_q, [dst], #32
- stp H_q, I_q, [dst], #32
- subs count, count, 64
- b.ge L(loop128)
-L(loop128_exit0):
- ldp F_q, G_q, [srcend, -64]
- ldp H_q, I_q, [srcend, -32]
- stp B_q, C_q, [dst], #32
- stp D_q, E_q, [dst]
- stp F_q, G_q, [dstend, -64]
- stp H_q, I_q, [dstend, -32]
- ret
-L(loop128_exit1):
- ldp B_q, C_q, [srcend, -64]
- ldp D_q, E_q, [srcend, -32]
- stp F_q, G_q, [dst], #32
- stp H_q, I_q, [dst]
- stp B_q, C_q, [dstend, -64]
- stp D_q, E_q, [dstend, -32]
- ret
-
-L(copy2048_large):
- and tmp1, dstin, 15
- bic dst, dstin, 15
- ldp D_l, D_h, [src]
- sub src, src, tmp1
- add count, count, tmp1 /* Count is now 16 too large. */
- ldp A_l, A_h, [src, 16]
- stp D_l, D_h, [dstin]
- ldp B_l, B_h, [src, 32]
- ldp C_l, C_h, [src, 48]
- ldp D_l, D_h, [src, 64]!
- subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls L(last64)
-
-L(loop64):
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [src, 16]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [src, 32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [src, 48]
- stp D_l, D_h, [dst, 64]
- ldp D_l, D_h, [src, 64]
- add dst, dst, 64
- add src, src, 64
- subs count, count, 64
- b.hi L(loop64)
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the end even if
- there is just 1 byte left. */
-L(last64):
- ldp E_l, E_h, [srcend, -64]
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [srcend, -48]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [srcend, -16]
- stp D_l, D_h, [dst, 64]
- stp E_l, E_h, [dstend, -64]
- stp A_l, A_h, [dstend, -48]
- stp B_l, B_h, [dstend, -32]
- stp C_l, C_h, [dstend, -16]
- ret
-
-
-L(dst_unaligned_tail):
- ldp C_q, D_q, [srcend, -64]
- ldp E_q, F_q, [srcend, -32]
- stp A_q, B_q, [dst], #32
- stp H_q, I_q, [dst], #16
- str G_q, [dst, tmp1]
- stp C_q, D_q, [dstend, -64]
- stp E_q, F_q, [dstend, -32]
- ret
-
-L(dst_unaligned):
- /* For the unaligned store case the code loads two
- aligned chunks and then merges them using ext
- instruction. This can be up to 30% faster than
- the the simple unaligned store access.
-
- Current state: tmp1 = dst % 16; C_q, D_q, E_q
- contains data yet to be stored. src and dst points
- to next-to-be-processed data. A_q, B_q contains
- data already stored before, count = bytes left to
- be load decremented by 64.
-
- The control is passed here if at least 64 bytes left
- to be loaded. The code does two aligned loads and then
- extracts (16-tmp1) bytes from the first register and
- tmp1 bytes from the next register forming the value
- for the aligned store.
-
- As ext instruction can only have it's index encoded
- as immediate. 15 code chunks process each possible
- index value. Computed goto is used to reach the
- required code. */
-
- /* Store the 16 bytes to dst and align dst for further
- operations, several bytes will be stored at this
- address once more */
-
- ldp F_q, G_q, [src], #32
- stp B_q, C_q, [dst], #32
- bic dst, dst, 15
- sub count, count, 32
- adrp tmp2, L(ext_table)
- add tmp2, tmp2, :lo12:L(ext_table)
- add tmp2, tmp2, tmp1, LSL #2
- ldr tmp3w, [tmp2]
- add tmp2, tmp2, tmp3w, SXTW
- br tmp2
-
-.p2align 4
- /* to make the loop in each chunk 16-bytes aligned */
- nop
-#define EXT_CHUNK(shft) \
-L(ext_size_ ## shft):;\
- ext A_v.16b, C_v.16b, D_v.16b, 16-shft;\
- ext B_v.16b, D_v.16b, E_v.16b, 16-shft;\
- ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
-1:;\
- stp A_q, B_q, [dst], #32;\
- prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
- ldp C_q, D_q, [src], #32;\
- ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
- stp H_q, I_q, [dst], #32;\
- ext A_v.16b, G_v.16b, C_v.16b, 16-shft;\
- ext B_v.16b, C_v.16b, D_v.16b, 16-shft;\
- ldp F_q, G_q, [src], #32;\
- ext H_v.16b, D_v.16b, F_v.16b, 16-shft;\
- subs count, count, 64;\
- b.ge 1b;\
-2:;\
- ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
- b L(dst_unaligned_tail);
-
-EXT_CHUNK(1)
-EXT_CHUNK(2)
-EXT_CHUNK(3)
-EXT_CHUNK(4)
-EXT_CHUNK(5)
-EXT_CHUNK(6)
-EXT_CHUNK(7)
-EXT_CHUNK(8)
-EXT_CHUNK(9)
-EXT_CHUNK(10)
-EXT_CHUNK(11)
-EXT_CHUNK(12)
-EXT_CHUNK(13)
-EXT_CHUNK(14)
-EXT_CHUNK(15)
-
-.p2align 4
-L(move_long):
-1:
- add srcend, src, count
- add dstend, dstin, count
-
- and tmp1, dstend, 15
- ldr D_q, [srcend, -16]
- sub srcend, srcend, tmp1
- sub count, count, tmp1
- ldp A_q, B_q, [srcend, -32]
- str D_q, [dstend, -16]
- ldp C_q, D_q, [srcend, -64]!
- sub dstend, dstend, tmp1
- subs count, count, 128
- b.ls 2f
-
-.p2align 4
-1:
- subs count, count, 64
- stp A_q, B_q, [dstend, -32]
- ldp A_q, B_q, [srcend, -32]
- stp C_q, D_q, [dstend, -64]!
- ldp C_q, D_q, [srcend, -64]!
- b.hi 1b
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the start even if
- there is just 1 byte left. */
-2:
- ldp E_q, F_q, [src, 32]
- ldp G_q, H_q, [src]
- stp A_q, B_q, [dstend, -32]
- stp C_q, D_q, [dstend, -64]
- stp E_q, F_q, [dstin, 32]
- stp G_q, H_q, [dstin]
-3: ret
-
-
-.p2align 4
-L(move_middle):
- cbz tmp1, 3f
- add srcend, src, count
- prfm PLDL1STRM, [srcend, -64]
- add dstend, dstin, count
- and tmp1, dstend, 15
- ldr D_q, [srcend, -16]
- sub srcend, srcend, tmp1
- sub count, count, tmp1
- ldr A_q, [srcend, -16]
- str D_q, [dstend, -16]
- ldr B_q, [srcend, -32]
- ldr C_q, [srcend, -48]
- ldr D_q, [srcend, -64]!
- sub dstend, dstend, tmp1
- subs count, count, 128
- b.ls 2f
-
-1:
- str A_q, [dstend, -16]
- ldr A_q, [srcend, -16]
- str B_q, [dstend, -32]
- ldr B_q, [srcend, -32]
- str C_q, [dstend, -48]
- ldr C_q, [srcend, -48]
- str D_q, [dstend, -64]!
- ldr D_q, [srcend, -64]!
- subs count, count, 64
- b.hi 1b
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the start even if
- there is just 1 byte left. */
-2:
- ldr G_q, [src, 48]
- str A_q, [dstend, -16]
- ldr A_q, [src, 32]
- str B_q, [dstend, -32]
- ldr B_q, [src, 16]
- str C_q, [dstend, -48]
- ldr C_q, [src]
- str D_q, [dstend, -64]
- str G_q, [dstin, 48]
- str A_q, [dstin, 32]
- str B_q, [dstin, 16]
- str C_q, [dstin]
-3: ret
-
-
-END (MEMCPY)
- .section .rodata
- .p2align 4
-
-L(ext_table):
- /* The first entry is for the alignment of 0 and is never
- actually used (could be any value). */
- .word 0
- .word L(ext_size_1) -.
- .word L(ext_size_2) -.
- .word L(ext_size_3) -.
- .word L(ext_size_4) -.
- .word L(ext_size_5) -.
- .word L(ext_size_6) -.
- .word L(ext_size_7) -.
- .word L(ext_size_8) -.
- .word L(ext_size_9) -.
- .word L(ext_size_10) -.
- .word L(ext_size_11) -.
- .word L(ext_size_12) -.
- .word L(ext_size_13) -.
- .word L(ext_size_14) -.
- .word L(ext_size_15) -.
-
-libc_hidden_builtin_def (MEMCPY)
-#endif
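
A byte-level sketch of the load-and-merge idea described in the deleted L(dst_unaligned) comment above; this is illustrative C of my own, not code from the patch, and the function and parameter names are made up. Two aligned 16-byte loads are combined into one aligned 16-byte store, which is what the NEON ext instruction does in the removed loop, with n determined by the relative misalignment of src and dst.

#include <stdint.h>
#include <string.h>

/* Build one aligned 16-byte output from the last n bytes of aligned
   chunk a followed by the first 16-n bytes of aligned chunk b.  */
static void
merge16 (uint8_t *out, const uint8_t *a, const uint8_t *b, unsigned n)
{
  memcpy (out, a + 16 - n, n);
  memcpy (out + n, b, 16 - n);
}
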
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index 0d8c85b4..e69d8162 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -1,5 +1,5 @@
/* Multiple versions of memmove. AARCH64 version.
- Copyright (C) 2017-2019 Free Software Foundation, Inc.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -31,16 +31,13 @@ extern __typeof (__redirect_memmove) __libc_memmove;
extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
-extern __typeof (__redirect_memmove) __memmove_kunpeng attribute_hidden;
-
+
libc_ifunc (__libc_memmove,
- (IS_KUNPENG920(midr)
- ?__memmove_kunpeng
- :(IS_THUNDERX (midr)
+ (IS_THUNDERX (midr)
? __memmove_thunderx
: (IS_FALKOR (midr) || IS_PHECDA (midr)
? __memmove_falkor
- : __memmove_generic))));
+ : __memmove_generic)));
# undef memmove
strong_alias (__libc_memmove, memmove);
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 0f7ad0c8..f7ae291e 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -1,5 +1,5 @@
/* Multiple versions of memset. AARCH64 version.
- Copyright (C) 2017-2019 Free Software Foundation, Inc.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -29,15 +29,15 @@
extern __typeof (__redirect_memset) __libc_memset;
extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
-extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
+extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
libc_ifunc (__libc_memset,
- IS_KUNPENG920(midr)
- ?__memset_kunpeng
- :((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
- ?__memset_falkor
- :__memset_generic));
+ IS_KUNPENG920 (midr)
+ ?__memset_kunpeng
+ : ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
+ ? __memset_falkor
+ : __memset_generic));
# undef memset
strong_alias (__libc_memset, memset);
diff --git a/sysdeps/aarch64/multiarch/memset_kunpeng.S b/sysdeps/aarch64/multiarch/memset_kunpeng.S
index 22a3d4a7..a03441ae 100644
--- a/sysdeps/aarch64/multiarch/memset_kunpeng.S
+++ b/sysdeps/aarch64/multiarch/memset_kunpeng.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2019 Free Software Foundation, Inc.
+/* Optimized memset for Huawei Kunpeng processor.
+ Copyright (C) 2012-2019 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -14,7 +15,7 @@
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
- <http://www.gnu.org/licenses/>. */
+ <https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include <sysdeps/aarch64/memset-reg.h>
@@ -35,7 +36,7 @@ ENTRY_ALIGN (MEMSET, 6)
dup v0.16B, valw
add dstend, dstin, count
-
+
cmp count, 128
b.hs L(set_long)
@@ -44,7 +45,7 @@ ENTRY_ALIGN (MEMSET, 6)
/* Set 16..127 bytes. */
str q0, [dstin]
- tbnz count, 6, L(set112)
+ tbnz count, 6, L(set127)
str q0, [dstend, -16]
tbz count, 5, 1f
str q0, [dstin, 16]
@@ -53,26 +54,14 @@ ENTRY_ALIGN (MEMSET, 6)
.p2align 4
/* Set 64..127 bytes. Write 64 bytes from the start and
- 32 bytes from the end. */
-L(set112):
- ands tmp1, dstin, 15
- bne 2f
- str q0, [dstin, 16]
- stp q0, q0, [dstin, 32]//finish 64
- tbz count, 5, 1f
- stp q0, q0, [dstin, 64] //greater than 96, finish 96
-1: stp q0, q0, [dstend, -32]
+ 64 bytes from the end. */
+L(set127):
+ stp q0, q0, [dstin, 16]
+ str q0, [dstin, 48]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
ret
- .p2align 4
-2: bic dst, dstin, 15//align down to 16
- stp q0,q0, [dst, 16]
- str q0, [dst, 48]
- tbz count, 5, 3f //greater than 96
- stp q0, q0, [dst, 64]
-3: stp q0, q0, [dstend, -48]//finish 64~80
- str q0, [dstend, -16]//finish 96
- ret
-
+
.p2align 4
/* Set 0..15 bytes. */
L(less16):
@@ -90,10 +79,9 @@ L(less8):
tbz count, 1, 3f
str h0, [dstend, -2]
3: ret
-
+
.p2align 4
-L(set_long):
- and valw, valw, 255
+L(set_long):
bic dst, dstin, 15
str q0, [dstin]
sub count, dstend, dst /* Count is 16 too large. */
@@ -103,19 +91,21 @@ L(set_long):
stp q0, q0, [dst, 64]!
subs count, count, 64
b.lo 1f
- stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 32]
stp q0, q0, [dst, 64]!
subs count, count, 64
b.lo 1f
- stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 32]
stp q0, q0, [dst, 64]!
subs count, count, 64
- b.hs 1b
-
-1: tbz count, 5, 2f
- str q0, [dst, 32]
- str q0, [dst, 48]
-2: stp q0, q0, [dstend, -32]
+ b.lo 1f
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]!
+ subs count, count, 64
+ b.hs 1b
+
+1: stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
ret
END (MEMSET)
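
The rewritten L(set127) path above uses overlapping stores: 64 bytes written from the start plus 64 bytes written back from the end cover every length in 64..127 without branching on the exact size. A minimal C sketch of the same idea, illustrative only:

#include <stddef.h>
#include <string.h>

/* Set n bytes, 64 <= n <= 127, with two fixed-size overlapping writes.  */
static void
set_64_to_127 (unsigned char *p, int c, size_t n)
{
  memset (p, c, 64);           /* first 64 bytes */
  memset (p + n - 64, c, 64);  /* last 64 bytes; overlaps when n < 128 */
}
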
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index 290bcf8d..a64c5980 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -1,5 +1,5 @@
/* strcpy/stpcpy - copy a string returning pointer to start/end.
- Copyright (C) 2013-2019 Free Software Foundation, Inc.
+ Copyright (C) 2013-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -14,7 +14,7 @@
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
+ <http://www.gnu.org/licenses/>. */
/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
@@ -232,7 +232,7 @@ L(entry_no_page_cross):
#ifdef __AARCH64EB__
rev64 datav.16b, datav.16b
#endif
- /* loc */
+ /* calculate the loc value */
cmeq datav.16b, datav.16b, #0
mov data1, datav.d[0]
mov data2, datav.d[1]
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index a57753b0..0a42f404 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -1,6 +1,6 @@
/* strnlen - calculate the length of a string with limit.
- Copyright (C) 2013-2019 Free Software Foundation, Inc.
+ Copyright (C) 2013-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,7 +16,7 @@
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
- <https://www.gnu.org/licenses/>. */
+ <http://www.gnu.org/licenses/>. */
#include <sysdep.h>
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
index b152c4e3..e60485b0 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -1,6 +1,6 @@
/* Initialize CPU feature data. AArch64 version.
This file is part of the GNU C Library.
- Copyright (C) 2017-2019 Free Software Foundation, Inc.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -36,7 +36,7 @@ static struct cpu_list cpu_list[] = {
{"thunderx2t99", 0x431F0AF0},
{"thunderx2t99p1", 0x420F5160},
{"phecda", 0x680F0000},
- {"kunpeng920", 0x481FD010},
+ {"kunpeng920", 0x481FD010},
{"generic", 0x0}
};
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index 4faeed7a..ed77cde7 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -1,6 +1,6 @@
/* Initialize CPU feature data. AArch64 version.
This file is part of the GNU C Library.
- Copyright (C) 2017-2019 Free Software Foundation, Inc.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -51,8 +51,9 @@
#define IS_PHECDA(midr) (MIDR_IMPLEMENTOR(midr) == 'h' \
&& MIDR_PARTNUM(midr) == 0x000)
-#define IS_KUNPENG920(midr) (MIDR_IMPLEMENTOR(midr) == 'H' \
- && MIDR_PARTNUM(midr) == 0xd01)
+
+#define IS_KUNPENG920(midr) (MIDR_IMPLEMENTOR(midr) == 'H' \
+ && MIDR_PARTNUM(midr) == 0xd01)
struct cpu_features
{
--
2.19.1
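
As a sanity check on the values in this patch, a hedged sketch that is not part of the patch itself: the kunpeng920 MIDR value 0x481FD010 added to cpu_list decodes to implementer 'H' (bits 31..24) and part number 0xd01 (bits 15..4), exactly what the IS_KUNPENG920 macro tests. The two macros below are simplified re-statements of the field extraction, not copies of the glibc header.

#include <assert.h>
#include <stdint.h>

#define MIDR_IMPLEMENTOR(midr) (((midr) >> 24) & 0xff)
#define MIDR_PARTNUM(midr)     (((midr) >> 4) & 0xfff)

int
main (void)
{
  uint32_t midr = 0x481FD010;               /* kunpeng920 entry in cpu_list */
  assert (MIDR_IMPLEMENTOR (midr) == 'H');  /* 0x48 == 'H' */
  assert (MIDR_PARTNUM (midr) == 0xd01);
  return 0;
}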