/*
 * Copyright 2002 Andi Kleen
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file COPYING in the main directory of this archive
 * for more details. No warranty for anything given at all.
 */
#include <linux/linkage.h>
#include <asm/errno.h>

// #define FIX_ALIGNMENT 1

/*
 * Checksum copy with exception handling.
 * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
 * destination is zeroed.
 *
 * Input
 * rdi  source
 * rsi  destination
 * edx  len (32bit)
 * ecx  sum (32bit)
 * r8   src_err_ptr (int)
 * r9   dst_err_ptr (int)
 *
 * Output
 * eax  64-bit sum; undefined in case of exception.
 *
 * Wrappers need to take care of the valid exception sum and zeroing.
 */

/* for now - should vary this based on direction */
#define prefetch prefetcht2
#define movnti   movq

	.macro source
10:
	.section __ex_table,"a"
	.align 8
	.quad 10b,bad_source
	.previous
	.endm

	.macro dest
20:
	.section __ex_table,"a"
	.align 8
	.quad 20b,bad_dest
	.previous
	.endm

	.globl csum_partial_copy_generic
	.p2align
csum_partial_copy_generic:
	prefetchnta (%rdi)
	pushq %rbx
	pushq %r12
	pushq %r14
	pushq %r15
	movq  %r8,%r14
	movq  %r9,%r15
	movl  %ecx,%eax
	movl  %edx,%ecx

#ifdef FIX_ALIGNMENT
	/* align source to 8 bytes */
	movl %edi,%r8d
	andl $7,%r8d
	jnz  bad_alignment
after_bad_alignment:
#endif

	movl  $64,%r10d
	xorl  %r9d,%r9d
	movq  %rcx,%r12

	shrq  $6,%r12
	/* The loop counter is maintained as one less than the block count,
	   to test efficiently for the previous-to-last iteration. This is
	   needed to stop the prefetching. */
	decq  %r12
	js    handle_tail	/* < 64 */
	jz    loop_no_prefetch	/* = 64 + X */

	/* main loop: copy and sum in 64 byte blocks */
	/* tries hard not to prefetch over the boundary */
	/* r10: 64, r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
	/* r11: temp3, rdx: temp4, r12: loopcnt */
	.p2align
loop:
	/* Could prefetch more than one loop, but then it would be even
	   trickier to avoid prefetching over the boundary. The hardware
	   prefetcher should take care of this anyway. The reason for this
	   prefetch is just the non-temporal hint to avoid cache pollution.
	   Hopefully this will be handled properly by the hardware. */
	prefetchnta 64(%rdi)

loop_no_prefetch:
	source
	movq  (%rdi),%rbx
	source
	movq  8(%rdi),%r8
	source
	movq  16(%rdi),%r11
	source
	movq  24(%rdi),%rdx

	dest
	movnti %rbx,(%rsi)
	dest
	movnti %r8,8(%rsi)
	dest
	movnti %r11,16(%rsi)
	dest
	movnti %rdx,24(%rsi)

	addq  %rbx,%rax
	adcq  %r8,%rax
	adcq  %r11,%rax
	adcq  %rdx,%rax

	source
	movq  32(%rdi),%rbx
	source
	movq  40(%rdi),%r8
	source
	movq  48(%rdi),%r11
	source
	movq  56(%rdi),%rdx

	dest
	movnti %rbx,32(%rsi)
	dest
	movnti %r8,40(%rsi)
	dest
	movnti %r11,48(%rsi)
	dest
	movnti %rdx,56(%rsi)

	adcq  %rbx,%rax
	adcq  %r8,%rax
	adcq  %r11,%rax
	adcq  %rdx,%rax

	adcq  %r9,%rax	/* add in carry */

	addq  %r10,%rdi
	addq  %r10,%rsi

	decq  %r12
	jz    loop_no_prefetch	/* previous-to-last iteration? */
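	/* Worked example of the counter logic (explanatory note): for
	   len = 192 there are three 64 byte blocks and r12 starts at 2
	   after the decq above the loop. Two iterations run through
	   `loop' with a prefetch; when decq reaches zero exactly one
	   block remains, so the jz above routes it through
	   loop_no_prefetch and no prefetchnta is issued past the end of
	   the source buffer. A negative result means all blocks are done
	   and control falls through to handle_tail below. */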
	jns   loop

	/* do last up to 56 bytes */
handle_tail:
	/* ecx: count */
	movl %ecx,%r10d
	andl $63,%ecx
	shrl $3,%ecx
	jz   fold
	clc
	movl $8,%edx
loop_8:
	source
	movq (%rdi),%rbx
	adcq %rbx,%rax
	dest
	movnti %rbx,(%rsi)
	leaq (%rsi,%rdx),%rsi	/* preserve carry */
	leaq (%rdi,%rdx),%rdi
	decl %ecx
	jnz  loop_8
	adcq %r9,%rax	/* add in carry */

fold:
	movl %eax,%ebx
	shrq $32,%rax
	addq %rbx,%rax

	/* do last up to 6 bytes */
handle_7:
	movl %r10d,%ecx
	andl $7,%ecx
	shrl $1,%ecx
	jz   handle_1
	movl $2,%edx
	xorl %ebx,%ebx
	clc
loop_1:
	source
	movw (%rdi),%bx
	adcq %rbx,%rax
	dest
	movw %bx,(%rsi)
	addq %rdx,%rdi
	addq %rdx,%rsi
	decl %ecx
	jnz  loop_1
	adcw %r9w,%ax	/* add in carry */

	/* handle last odd byte */
handle_1:
	testl $1,%r10d
	jz    ende
	xorl  %ebx,%ebx
	source
	movb (%rdi),%bl
	dest
	movb %bl,(%rsi)
	addw %bx,%ax
	adcw %r9w,%ax	/* carry */

ende:
	sfence
	popq %r15
	popq %r14
	popq %r12
	popq %rbx
	ret

#ifdef FIX_ALIGNMENT
	/* align source to 8 bytes. */
	/* r8d: unalignedness, ecx: len */
bad_alignment:
	testl $1,%edi
	jnz   odd_source

	/* compute distance to the next aligned position */
	movl  $8,%r8d
	xchgl %r8d,%ecx
	subl  %r8d,%ecx

	/* handle unaligned part */
	shrl  $1,%ecx
	xorl  %ebx,%ebx
	movl  $2,%r10d
align_loop:
	source
	movw (%rdi),%bx
	addq %rbx,%rax	/* carry cannot happen */
	dest
	movw %bx,(%rsi)
	addq %r10,%rdi
	addq %r10,%rsi
	decl %ecx
	jnz  align_loop
	jmp  after_bad_alignment

	/* Weird case: the sum needs to be byte swapped at the end because
	   the spec requires the 16 bit words of the sum to always be
	   paired. Handle it recursively because it should be rather rare. */
odd_source:
	/* copy the odd byte */
	xorl %ebx,%ebx
	source
	movb (%rdi),%bl
	addl %ebx,%eax	/* add to old checksum */
	adcl $0,%eax	/* fold the carry into the sum */
	dest
	movb %bl,(%rsi)	/* store the copied byte */

	/* fix arguments */
	movl %eax,%ecx
	incq %rsi
	incq %rdi
	decq %rdx
	call csum_partial_copy_generic
	bswap %eax	/* this should work, but check */
	jmp ende
#endif

	/* Exception handlers. Very simple, zeroing is done in the wrappers. */
bad_source:
	movl $-EFAULT,(%r14)
	jmp  ende

bad_dest:
	movl $-EFAULT,(%r15)
	jmp  ende
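
/* Illustrative sketch (not part of this file): the wrappers mentioned
   in the header comment live in C and must zero the destination and
   hand back a usable sum when a fault is reported, since eax is
   undefined after an exception. Assuming a hypothetical wrapper named
   csum_partial_copy_from_user() with the signature below, it could
   look roughly like:

	unsigned int
	csum_partial_copy_from_user(const char *src, char *dst,
				    int len, unsigned int isum, int *errp)
	{
		*errp = 0;
		isum = csum_partial_copy_generic(src, dst, len, isum,
						 errp, (int *)0);
		if (*errp)
			memset(dst, 0, len);	// fault: caller sees a zeroed buffer
		return isum;
	}

   Callers must check *errp before trusting the returned sum. */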