/* Copyright 2002 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License v2.
 *
 * Functions to copy from and to user space.
 */

#define FIX_ALIGNMENT 1

#define movnti movq /* write to cache for now */
#define prefetch prefetcht2

#include <asm/current.h>	/* GET_CURRENT */
#include <asm/offset.h>		/* tsk_addr_limit */

/* Standard copy_to_user with segment limit checking */
	.globl copy_to_user
	.p2align
copy_to_user:
	GET_CURRENT(%rax)
	movq %rdi,%rcx
	addq %rdx,%rcx
	jc   bad_to_user
	cmpq tsk_addr_limit(%rax),%rcx
	jae  bad_to_user
	jmp  copy_user_generic

/* Standard copy_from_user with segment limit checking */
	.globl copy_from_user
	.p2align
copy_from_user:
	GET_CURRENT(%rax)
	movq %rsi,%rcx
	addq %rdx,%rcx
	jc   bad_from_user
	cmpq tsk_addr_limit(%rax),%rcx
	jae  bad_from_user
	/* FALL THROUGH to copy_user_generic */

	.section .fixup,"ax"
	/* must zero dest */
bad_from_user:
	movl %edx,%ecx
	xorl %eax,%eax
	rep
	stosb
bad_to_user:
	movl %edx,%eax
	ret
	.previous

/*
 * copy_user_generic - memory copy with exception handling.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
	.globl copy_user_generic
copy_user_generic:
	/* Put the first cacheline into cache. This should handle
	   the small movements in ioctls etc., but not penalize the bigger
	   filesystem data copies too much. */
	pushq %rbx
	prefetch (%rsi)
	xorl %eax,%eax		/* zero for the exception handler */

#ifdef FIX_ALIGNMENT
	/* check for bad alignment of destination */
	movl %edi,%ecx
	andl $7,%ecx
	jnz  bad_alignment
after_bad_alignment:
#endif

	movq %rdx,%rcx

	movl $64,%ebx
	shrq $6,%rdx
	decq %rdx
	js   handle_tail
	jz   loop_no_prefetch

loop:
	prefetch 64(%rsi)

loop_no_prefetch:
s1:	movq (%rsi),%r11
s2:	movq 1*8(%rsi),%r8
s3:	movq 2*8(%rsi),%r9
s4:	movq 3*8(%rsi),%r10
d1:	movnti %r11,(%rdi)
d2:	movnti %r8,1*8(%rdi)
d3:	movnti %r9,2*8(%rdi)
d4:	movnti %r10,3*8(%rdi)

s5:	movq 4*8(%rsi),%r11
s6:	movq 5*8(%rsi),%r8
s7:	movq 6*8(%rsi),%r9
s8:	movq 7*8(%rsi),%r10
d5:	movnti %r11,4*8(%rdi)
d6:	movnti %r8,5*8(%rdi)
d7:	movnti %r9,6*8(%rdi)
d8:	movnti %r10,7*8(%rdi)

	addq %rbx,%rsi
	addq %rbx,%rdi

	decq %rdx
	jz   loop_no_prefetch
	jns  loop

handle_tail:
	movl %ecx,%edx
	andl $63,%ecx
	shrl $3,%ecx
	jz   handle_7
	movl $8,%ebx
loop_8:
s9:	movq (%rsi),%r8
d9:	movq %r8,(%rdi)
	addq %rbx,%rdi
	addq %rbx,%rsi
	decl %ecx
	jnz  loop_8

handle_7:
	movl %edx,%ecx
	andl $7,%ecx
	jz   ende
loop_1:
s10:	movb (%rsi),%bl
d10:	movb %bl,(%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz  loop_1

ende:
	sfence
	popq %rbx
	ret

#ifdef FIX_ALIGNMENT
	/* align destination */
bad_alignment:
	movl $8,%r9d
	subl %ecx,%r9d
	movl %r9d,%ecx
	subq %r9,%rdx
	jz   small_align
	js   small_align
align_1:
s11:	movb (%rsi),%bl
d11:	movb %bl,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz  align_1
	jmp  after_bad_alignment
small_align:
	addq %r9,%rdx
	jmp  handle_7
#endif

	/* table sorted by exception address */
	.section __ex_table,"a"
	.align 8
	.quad s1,s1e
	.quad s2,s2e
	.quad s3,s3e
	.quad s4,s4e
	.quad d1,s1e
	.quad d2,s2e
	.quad d3,s3e
	.quad d4,s4e
	.quad s5,s5e
	.quad s6,s6e
	.quad s7,s7e
	.quad s8,s8e
	.quad d5,s5e
	.quad d6,s6e
	.quad d7,s7e
	.quad d8,s8e
	.quad s9,e_quad
	.quad d9,e_quad
	.quad s10,e_byte
	.quad d10,e_byte
#ifdef FIX_ALIGNMENT
	.quad s11,e_byte
	.quad d11,e_byte
#endif
	.quad e5,e_zero
	.previous

	/* compute 64-offset for main loop. 8 bytes accuracy with error on the
	   pessimistic side. this is gross. it would be better to fix the
	   interface. */
	/* eax: zero, ebx: 64 */
s1e:	addl $8,%eax
s2e:	addl $8,%eax
s3e:	addl $8,%eax
s4e:	addl $8,%eax
s5e:	addl $8,%eax
s6e:	addl $8,%eax
s7e:	addl $8,%eax
s8e:	addl $8,%eax
	addq %rbx,%rdi		/* +64 */
	subq %rax,%rdi		/* correct destination with computed offset */

	shlq $6,%rdx		/* loop counter * 64 (stride length) */
	addq %rax,%rdx		/* add offset to loopcnt */
	andl $63,%ecx		/* remaining bytes */
	addq %rcx,%rdx		/* add them */
	jmp  zero_rest

	/* exception on quad word loop in tail handling */
	/* ecx: loopcnt/8, %edx: length, rdi: correct */
e_quad:
	shll $3,%ecx
	andl $7,%edx
	addl %ecx,%edx
	/* edx: bytes to zero, rdi: dest, eax: zero */
zero_rest:
	movq %rdx,%rcx
e_byte:
	xorl %eax,%eax
e5:	rep
	stosb
	/* when there is another exception while zeroing the rest just return */
e_zero:
	movq %rdx,%rax
	jmp  ende
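
/*
 * Usage sketch (illustrative only, not part of the original file): C callers
 * treat a nonzero return from copy_from_user()/copy_to_user() as a partial
 * copy, matching the "eax uncopied bytes or 0" convention of
 * copy_user_generic above. The names kbuf, ubuf and len are hypothetical:
 *
 *	if (copy_from_user(kbuf, ubuf, len))
 *		return -EFAULT;		(some bytes were left uncopied)
 *
 * Callers normally just propagate -EFAULT rather than inspect the exact
 * number of uncopied bytes; the byte count mainly matters to the fixup code
 * above, which must zero whatever it could not copy from user space.
 */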